From 3158f4a51894e46ecb593bffbfd12824e1d6534a Mon Sep 17 00:00:00 2001 From: Yonghee Han Date: Wed, 27 Jul 2016 16:40:17 +0900 Subject: Imported Upstream version 2.4.1 Change-Id: I0b584f569cb0e0f4eac13cdb79e110c2dbc34bfc --- block/Makefile.objs | 8 +- block/backup.c | 160 ++- block/blkdebug.c | 20 +- block/blkverify.c | 4 +- block/block-backend.c | 16 + block/commit.c | 3 +- block/curl.c | 16 +- block/dmg.c | 1 + block/io.c | 2610 +++++++++++++++++++++++++++++++++++++++++++++++ block/iscsi.c | 169 ++- block/mirror.c | 117 ++- block/nfs.c | 2 +- block/null.c | 66 +- block/parallels.c | 679 ++++++++++-- block/qapi.c | 65 +- block/qcow.c | 119 ++- block/qcow2-cache.c | 174 ++-- block/qcow2-cluster.c | 137 +-- block/qcow2-refcount.c | 73 +- block/qcow2-snapshot.c | 7 +- block/qcow2.c | 129 ++- block/qcow2.h | 26 +- block/qed.c | 8 +- block/quorum.c | 102 +- block/raw-posix.c | 115 ++- block/raw-win32.c | 1 + block/rbd.c | 66 +- block/sheepdog.c | 155 ++- block/snapshot.c | 19 +- block/ssh.c | 4 +- block/stream.c | 5 +- block/throttle-groups.c | 501 +++++++++ block/vdi.c | 6 +- block/vhdx-log.c | 1 + block/vhdx.c | 10 +- block/vmdk.c | 83 +- block/vpc.c | 6 +- block/vvfat.c | 37 +- 38 files changed, 4854 insertions(+), 866 deletions(-) create mode 100644 block/io.c create mode 100644 block/throttle-groups.c (limited to 'block') diff --git a/block/Makefile.objs b/block/Makefile.objs index db2933e46..58ef2ef3f 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,15 +1,16 @@ -block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o -block-obj-$(CONFIG_QUORUM) += quorum.o +block-obj-y += quorum.o block-obj-y += parallels.o blkdebug.o blkverify.o block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o -block-obj-y += null.o mirror.o +block-obj-y += null.o mirror.o io.o +block-obj-y += throttle-groups.o block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o @@ -37,6 +38,7 @@ gluster.o-libs := $(GLUSTERFS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) archipelago.o-libs := $(ARCHIPELAGO_LIBS) +block-obj-m += dmg.o dmg.o-libs := $(BZIP2_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio diff --git a/block/backup.c b/block/backup.c index 1c535b1ab..965654d52 100644 --- a/block/backup.c +++ b/block/backup.c @@ -19,6 +19,7 @@ #include "block/block.h" #include "block/block_int.h" #include "block/blockjob.h" +#include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #define BACKUP_CLUSTER_BITS 16 @@ -37,6 +38,8 @@ typedef struct CowRequest { typedef struct BackupBlockJob { BlockJob common; BlockDriverState *target; + /* bitmap for sync=incremental */ + BdrvDirtyBitmap *sync_bitmap; MirrorSyncMode sync_mode; RateLimit limit; BlockdevOnError on_source_error; @@ -195,7 +198,7 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) BackupBlockJob *s = container_of(job, BackupBlockJob, common); if (speed < 0) { - error_set(errp, QERR_INVALID_PARAMETER, "speed"); + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); return; } ratelimit_set_speed(&s->limit, 
speed / BDRV_SECTOR_SIZE, SLICE_TIME); @@ -242,6 +245,91 @@ static void backup_complete(BlockJob *job, void *opaque) g_free(data); } +static bool coroutine_fn yield_and_check(BackupBlockJob *job) +{ + if (block_job_is_cancelled(&job->common)) { + return true; + } + + /* we need to yield so that bdrv_drain_all() returns. + * (without, VM does not reboot) + */ + if (job->common.speed) { + uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, + job->sectors_read); + job->sectors_read = 0; + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); + } else { + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); + } + + if (block_job_is_cancelled(&job->common)) { + return true; + } + + return false; +} + +static int coroutine_fn backup_run_incremental(BackupBlockJob *job) +{ + bool error_is_read; + int ret = 0; + int clusters_per_iter; + uint32_t granularity; + int64_t sector; + int64_t cluster; + int64_t end; + int64_t last_cluster = -1; + BlockDriverState *bs = job->common.bs; + HBitmapIter hbi; + + granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); + clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); + bdrv_dirty_iter_init(job->sync_bitmap, &hbi); + + /* Find the next dirty sector(s) */ + while ((sector = hbitmap_iter_next(&hbi)) != -1) { + cluster = sector / BACKUP_SECTORS_PER_CLUSTER; + + /* Fake progress updates for any clusters we skipped */ + if (cluster != last_cluster + 1) { + job->common.offset += ((cluster - last_cluster - 1) * + BACKUP_CLUSTER_SIZE); + } + + for (end = cluster + clusters_per_iter; cluster < end; cluster++) { + do { + if (yield_and_check(job)) { + return ret; + } + ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER, + BACKUP_SECTORS_PER_CLUSTER, &error_is_read); + if ((ret < 0) && + backup_error_action(job, error_is_read, -ret) == + BLOCK_ERROR_ACTION_REPORT) { + return ret; + } + } while (ret < 0); + } + + /* If the bitmap granularity is smaller than the backup granularity, + * we need to advance the iterator pointer to the next cluster. */ + if (granularity < BACKUP_CLUSTER_SIZE) { + bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER); + } + + last_cluster = cluster - 1; + } + + /* Play some final catchup with the progress meter */ + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + if (last_cluster + 1 < end) { + job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE); + } + + return ret; +} + static void coroutine_fn backup_run(void *opaque) { BackupBlockJob *job = opaque; @@ -259,8 +347,7 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_init(&job->flush_rwlock); start = 0; - end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE, - BACKUP_SECTORS_PER_CLUSTER); + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); job->bitmap = hbitmap_alloc(end, 0); @@ -278,28 +365,13 @@ static void coroutine_fn backup_run(void *opaque) qemu_coroutine_yield(); job->common.busy = true; } + } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { + ret = backup_run_incremental(job); } else { /* Both FULL and TOP SYNC_MODE's require copying.. */ for (; start < end; start++) { bool error_is_read; - - if (block_job_is_cancelled(&job->common)) { - break; - } - - /* we need to yield so that qemu_aio_flush() returns. 
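The new backup_run_incremental() converts each dirty bit reported by the sync bitmap into whole backup clusters: BACKUP_CLUSTER_BITS is 16, so a cluster is 64 KiB (128 sectors of 512 bytes), and clusters_per_iter says how many clusters one dirty bit covers when the bitmap granularity is coarser than the backup cluster size. A minimal standalone sketch of that arithmetic (the granularity value is just an example, not taken from the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SIZE                 512
#define BACKUP_CLUSTER_BITS         16                          /* as in block/backup.c */
#define BACKUP_CLUSTER_SIZE         (1 << BACKUP_CLUSTER_BITS)  /* 64 KiB */
#define BACKUP_SECTORS_PER_CLUSTER  (BACKUP_CLUSTER_SIZE / SECTOR_SIZE)  /* 128 */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
    /* One dirty bit of a 1 MiB-granularity bitmap spans 16 backup clusters. */
    uint32_t granularity = 1 << 20;
    int clusters_per_iter = MAX(granularity / BACKUP_CLUSTER_SIZE, 1);
    assert(clusters_per_iter == 16);

    /* A dirty sector index is mapped to the cluster that contains it. */
    int64_t sector = 300;
    int64_t cluster = sector / BACKUP_SECTORS_PER_CLUSTER;
    assert(cluster == 2);   /* sectors 256..383 live in cluster 2 */

    printf("granularity %u -> %d clusters per dirty bit; sector %lld -> cluster %lld\n",
           (unsigned)granularity, clusters_per_iter,
           (long long)sector, (long long)cluster);
    return 0;
}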
- * (without, VM does not reboot) - */ - if (job->common.speed) { - uint64_t delay_ns = ratelimit_calculate_delay( - &job->limit, job->sectors_read); - job->sectors_read = 0; - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); - } else { - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); - } - - if (block_job_is_cancelled(&job->common)) { + if (yield_and_check(job)) { break; } @@ -357,6 +429,18 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); + if (job->sync_bitmap) { + BdrvDirtyBitmap *bm; + if (ret < 0 || block_job_is_cancelled(&job->common)) { + /* Merge the successor back into the parent, delete nothing. */ + bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); + assert(bm); + } else { + /* Everything is fine, delete this bitmap and install the backup. */ + bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); + assert(bm); + } + } hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); @@ -369,6 +453,7 @@ static void coroutine_fn backup_run(void *opaque) void backup_start(BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, @@ -388,7 +473,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { - error_set(errp, QERR_INVALID_PARAMETER, "on-source-error"); + error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); return; } @@ -412,17 +497,36 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, return; } + if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { + if (!sync_bitmap) { + error_setg(errp, "must provide a valid bitmap name for " + "\"incremental\" sync mode"); + return; + } + + /* Create a new bitmap, and freeze/disable this one. */ + if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) { + return; + } + } else if (sync_bitmap) { + error_setg(errp, + "a sync_bitmap was provided to backup_run, " + "but received an incompatible sync_mode (%s)", + MirrorSyncMode_lookup[sync_mode]); + return; + } + len = bdrv_getlength(bs); if (len < 0) { error_setg_errno(errp, -len, "unable to get length for '%s'", bdrv_get_device_name(bs)); - return; + goto error; } BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp); if (!job) { - return; + goto error; } bdrv_op_block_all(target, job->common.blocker); @@ -431,7 +535,15 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, job->on_target_error = on_target_error; job->target = target; job->sync_mode = sync_mode; + job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ? 
+ sync_bitmap : NULL; job->common.len = len; job->common.co = qemu_coroutine_create(backup_run); qemu_coroutine_enter(job->common.co, job); + return; + + error: + if (sync_bitmap) { + bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); + } } diff --git a/block/blkdebug.c b/block/blkdebug.c index 63611e0a3..bc247f46f 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -216,10 +216,9 @@ static int get_event_by_name(const char *name, BlkDebugEvent *event) struct add_rule_data { BDRVBlkdebugState *s; int action; - Error **errp; }; -static int add_rule(QemuOpts *opts, void *opaque) +static int add_rule(void *opaque, QemuOpts *opts, Error **errp) { struct add_rule_data *d = opaque; BDRVBlkdebugState *s = d->s; @@ -230,10 +229,10 @@ static int add_rule(QemuOpts *opts, void *opaque) /* Find the right event for the rule */ event_name = qemu_opt_get(opts, "event"); if (!event_name) { - error_setg(d->errp, "Missing event name for rule"); + error_setg(errp, "Missing event name for rule"); return -1; } else if (get_event_by_name(event_name, &event) < 0) { - error_setg(d->errp, "Invalid event name \"%s\"", event_name); + error_setg(errp, "Invalid event name \"%s\"", event_name); return -1; } @@ -319,8 +318,7 @@ static int read_config(BDRVBlkdebugState *s, const char *filename, d.s = s; d.action = ACTION_INJECT_ERROR; - d.errp = &local_err; - qemu_opts_foreach(&inject_error_opts, add_rule, &d, 1); + qemu_opts_foreach(&inject_error_opts, add_rule, &d, &local_err); if (local_err) { error_propagate(errp, local_err); ret = -EINVAL; @@ -328,7 +326,7 @@ static int read_config(BDRVBlkdebugState *s, const char *filename, } d.action = ACTION_SET_STATE; - qemu_opts_foreach(&set_state_opts, add_rule, &d, 1); + qemu_opts_foreach(&set_state_opts, add_rule, &d, &local_err); if (local_err) { error_propagate(errp, local_err); ret = -EINVAL; @@ -431,7 +429,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, /* Open the backing file */ assert(bs->file == NULL); ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image", - flags | BDRV_O_PROTOCOL, false, &local_err); + bs, &child_file, false, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto out; @@ -721,6 +719,11 @@ static int64_t blkdebug_getlength(BlockDriverState *bs) return bdrv_getlength(bs->file); } +static int blkdebug_truncate(BlockDriverState *bs, int64_t offset) +{ + return bdrv_truncate(bs->file, offset); +} + static void blkdebug_refresh_filename(BlockDriverState *bs) { QDict *opts; @@ -779,6 +782,7 @@ static BlockDriver bdrv_blkdebug = { .bdrv_file_open = blkdebug_open, .bdrv_close = blkdebug_close, .bdrv_getlength = blkdebug_getlength, + .bdrv_truncate = blkdebug_truncate, .bdrv_refresh_filename = blkdebug_refresh_filename, .bdrv_aio_readv = blkdebug_aio_readv, diff --git a/block/blkverify.c b/block/blkverify.c index 438dff8bc..d277e6322 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -125,7 +125,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, /* Open the raw file */ assert(bs->file == NULL); ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options, - "raw", flags | BDRV_O_PROTOCOL, false, &local_err); + "raw", bs, &child_file, false, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto fail; @@ -134,7 +134,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, /* Open the test file */ assert(s->test_file == NULL); ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options, - 
"test", flags, false, &local_err); + "test", bs, &child_format, false, &local_err); if (ret < 0) { error_propagate(errp, local_err); s->test_file = NULL; diff --git a/block/block-backend.c b/block/block-backend.c index 48b6e4c05..aee8a1202 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -515,6 +515,17 @@ int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, return bdrv_write(blk->bs, sector_num, buf, nb_sectors); } +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags); +} + static void error_callback_bh(void *opaque) { struct BlockBackendAIOCB *acb = opaque; @@ -689,6 +700,11 @@ int blk_flush_all(void) return bdrv_flush_all(); } +void blk_drain(BlockBackend *blk) +{ + bdrv_drain(blk->bs); +} + void blk_drain_all(void) { bdrv_drain_all(); diff --git a/block/commit.c b/block/commit.c index cfa2bbebc..7312a5bdc 100644 --- a/block/commit.c +++ b/block/commit.c @@ -15,6 +15,7 @@ #include "trace.h" #include "block/block_int.h" #include "block/blockjob.h" +#include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" enum { @@ -186,7 +187,7 @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) CommitBlockJob *s = container_of(job, CommitBlockJob, common); if (speed < 0) { - error_set(errp, QERR_INVALID_PARAMETER, "speed"); + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); return; } ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); diff --git a/block/curl.c b/block/curl.c index bbee3ca17..032cc8ae2 100644 --- a/block/curl.c +++ b/block/curl.c @@ -22,8 +22,10 @@ * THE SOFTWARE. */ #include "qemu-common.h" +#include "qemu/error-report.h" #include "block/block_int.h" #include "qapi/qmp/qbool.h" +#include "qapi/qmp/qstring.h" #include // #define DEBUG_CURL @@ -297,6 +299,18 @@ static void curl_multi_check_completion(BDRVCURLState *s) /* ACBs for successful messages get completed in curl_read_cb */ if (msg->data.result != CURLE_OK) { int i; + static int errcount = 100; + + /* Don't lose the original error message from curl, since + * it contains extra data. 
+ */ + if (errcount > 0) { + error_report("curl: %s", state->errmsg); + if (--errcount == 0) { + error_report("curl: further errors suppressed"); + } + } + for (i = 0; i < CURL_NUM_ACB; i++) { CURLAIOCB *acb = state->acb[i]; @@ -304,7 +318,7 @@ static void curl_multi_check_completion(BDRVCURLState *s) continue; } - acb->common.cb(acb->common.opaque, -EIO); + acb->common.cb(acb->common.opaque, -EPROTO); qemu_aio_unref(acb); state->acb[i] = NULL; } diff --git a/block/dmg.c b/block/dmg.c index 825c49d59..9f2528169 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -24,6 +24,7 @@ #include "qemu-common.h" #include "block/block_int.h" #include "qemu/bswap.h" +#include "qemu/error-report.h" #include "qemu/module.h" #include #ifdef CONFIG_BZIP2 diff --git a/block/io.c b/block/io.c new file mode 100644 index 000000000..d4bc83b33 --- /dev/null +++ b/block/io.c @@ -0,0 +1,2610 @@ +/* + * Block layer I/O functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "trace.h" +#include "block/blockjob.h" +#include "block/block_int.h" +#include "block/throttle-groups.h" +#include "qemu/error-report.h" + +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); + +/* throttling disk I/O limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + ThrottleConfig *cfg) +{ + int i; + + throttle_group_config(bs, cfg); + + for (i = 0; i < 2; i++) { + qemu_co_enter_next(&bs->throttled_reqs[i]); + } +} + +/* this function drain all the throttled IOs */ +static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +{ + bool drained = false; + bool enabled = bs->io_limits_enabled; + int i; + + bs->io_limits_enabled = false; + + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&bs->throttled_reqs[i])) { + drained = true; + } + } + + bs->io_limits_enabled = enabled; + + return drained; +} + +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + bdrv_start_throttled_reqs(bs); + throttle_group_unregister_bs(bs); +} + +/* should be called before bdrv_set_io_limits if a limit is set */ +void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) +{ + assert(!bs->io_limits_enabled); + throttle_group_register_bs(bs, group); + bs->io_limits_enabled = true; +} + +void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) +{ + /* this bs is not part of any group */ + if (!bs->throttle_state) { + return; + } + + /* this bs is a part of the same group than the one we want */ + if (!g_strcmp0(throttle_group_get_name(bs), group)) { + return; + } + + /* need to change the group this bs belong to */ + bdrv_io_limits_disable(bs); + bdrv_io_limits_enable(bs, group); +} + +void bdrv_setup_io_funcs(BlockDriver *bdrv) +{ + /* Block drivers without coroutine functions need emulation */ + if (!bdrv->bdrv_co_readv) { + bdrv->bdrv_co_readv = bdrv_co_readv_em; + bdrv->bdrv_co_writev = bdrv_co_writev_em; + + /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if + * the block driver lacks aio we need to emulate that too. 
+ */ + if (!bdrv->bdrv_aio_readv) { + /* add AIO emulation layer */ + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; + } + } +} + +void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BlockDriver *drv = bs->drv; + Error *local_err = NULL; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (!drv) { + return; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; + bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment; + bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + } else { + bs->bl.min_mem_alignment = 512; + bs->bl.opt_mem_alignment = getpagesize(); + } + + if (bs->backing_hd) { + bdrv_refresh_limits(bs->backing_hd, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing_hd->bl.opt_transfer_length); + bs->bl.max_transfer_length = + MIN_NON_ZERO(bs->bl.max_transfer_length, + bs->backing_hd->bl.max_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing_hd->bl.opt_mem_alignment); + bs->bl.min_mem_alignment = + MAX(bs->bl.min_mem_alignment, + bs->backing_hd->bl.min_mem_alignment); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { + drv->bdrv_refresh_limits(bs, errp); + } +} + +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. + */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + +/* Check if any requests are in-flight (including throttled requests) */ +static bool bdrv_requests_pending(BlockDriverState *bs) +{ + if (!QLIST_EMPTY(&bs->tracked_requests)) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { + return true; + } + if (bs->file && bdrv_requests_pending(bs->file)) { + return true; + } + if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { + return true; + } + return false; +} + +/* + * Wait for pending requests to complete on a single BlockDriverState subtree + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + * + * Only this BlockDriverState's AioContext is run, so in-flight requests must + * not depend on events in other AioContexts. In that case, use + * bdrv_drain_all() instead. + */ +void bdrv_drain(BlockDriverState *bs) +{ + bool busy = true; + + while (busy) { + /* Keep iterating */ + bdrv_flush_io_queue(bs); + busy = bdrv_requests_pending(bs); + busy |= aio_poll(bdrv_get_aio_context(bs), busy); + } +} + +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. 
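bdrv_refresh_limits() merges limits from bs->file and bs->backing_hd: optimal transfer length and memory alignment take the larger of the two values, while max_transfer_length is combined with MIN_NON_ZERO so that 0 keeps meaning "unlimited". A small self-contained sketch of that merge rule (the MIN_NON_ZERO definition below is written out for illustration, not copied from the QEMU headers):

#include <assert.h>

#define MAX(a, b)          ((a) > (b) ? (a) : (b))
#define MIN(a, b)          ((a) < (b) ? (a) : (b))
/* 0 means "unlimited", so it must lose against any non-zero limit. */
#define MIN_NON_ZERO(a, b) ((a) == 0 ? (b) : ((b) == 0 ? (a) : MIN(a, b)))

int main(void)
{
    /* parent starts with no limits, child (e.g. bs->file) reports some */
    int parent_opt = 0, child_opt = 65536;
    int parent_max = 0, child_max = 1 << 21;

    int merged_opt = MAX(parent_opt, child_opt);           /* prefer larger hint */
    int merged_max = MIN_NON_ZERO(parent_max, child_max);  /* prefer stricter cap */

    assert(merged_opt == 65536);
    assert(merged_max == (1 << 21));

    /* two real caps: the stricter one wins */
    assert(MIN_NON_ZERO(1 << 20, 1 << 21) == (1 << 20));
    return 0;
}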
+ */ +void bdrv_drain_all(void) +{ + /* Always run first iteration so any pending completion BHs run */ + bool busy = true; + BlockDriverState *bs = NULL; + GSList *aio_ctxs = NULL, *ctx; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_pause(bs->job); + } + aio_context_release(aio_context); + + if (!g_slist_find(aio_ctxs, aio_context)) { + aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); + } + } + + /* Note that completion of an asynchronous I/O operation can trigger any + * number of other I/O operations on other devices---for example a + * coroutine can submit an I/O request to another device in response to + * request completion. Therefore we must keep looping until there was no + * more activity rather than simply draining each device independently. + */ + while (busy) { + busy = false; + + for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { + AioContext *aio_context = ctx->data; + bs = NULL; + + aio_context_acquire(aio_context); + while ((bs = bdrv_next(bs))) { + if (aio_context == bdrv_get_aio_context(bs)) { + bdrv_flush_io_queue(bs); + if (bdrv_requests_pending(bs)) { + busy = true; + aio_poll(aio_context, busy); + } + } + } + busy |= aio_poll(aio_context, false); + aio_context_release(aio_context); + } + } + + bs = NULL; + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_resume(bs->job); + } + aio_context_release(aio_context); + } + g_slist_free(aio_ctxs); +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. + */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + if (req->serialising) { + req->bs->serialising_in_flight--; + } + + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t offset, + unsigned int bytes, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static int 
bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t offset, unsigned int bytes) +{ + /* aaaa bbbb */ + if (offset >= req->overlap_offset + req->overlap_bytes) { + return false; + } + /* bbbb aaaa */ + if (req->overlap_offset >= offset + bytes) { + return false; + } + return true; +} + +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +{ + BlockDriverState *bs = self->bs; + BdrvTrackedRequest *req; + bool retry; + bool waited = false; + + if (!bs->serialising_in_flight) { + return false; + } + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). */ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } + } + } + } while (retry); + + return waited; +} + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, + size_t size) +{ + if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { + return -EIO; + } + + if (!bdrv_is_inserted(bs)) { + return -ENOMEDIUM; + } + + if (offset < 0) { + return -EIO; + } + + return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EIO; + } + + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { + BlockDriverState *bs; + int64_t offset; + QEMUIOVector *qiov; + bool is_write; + int ret; + BdrvRequestFlags flags; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } +} + +/* + * Process a vectored synchronous request using coroutines + */ +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .offset = offset, + .qiov = qiov, + .is_write = is_write, + .ret = NOT_DONE, + .flags = flags, + }; + + /** + * In sync call context, when the vcpu is blocked, this throttling timer + * will not fire; so the I/O throttling function has to be disabled here + * if it has been enabled. 
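tracked_request_overlaps() treats every request as the half-open byte range [overlap_offset, overlap_offset + overlap_bytes) and reports an overlap unless one range ends at or before the start of the other; wait_serialising_requests() then makes an overlapping request wait whenever serialisation is involved. The interval test, restated as a standalone check with a few sample values:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Same test as tracked_request_overlaps(): half-open ranges overlap unless
 * one of them ends at or before the start of the other. */
static bool ranges_overlap(int64_t a_off, uint64_t a_bytes,
                           int64_t b_off, uint64_t b_bytes)
{
    if (b_off >= a_off + (int64_t)a_bytes) {   /* aaaa bbbb */
        return false;
    }
    if (a_off >= b_off + (int64_t)b_bytes) {   /* bbbb aaaa */
        return false;
    }
    return true;
}

int main(void)
{
    assert(!ranges_overlap(0, 4096, 4096, 4096));   /* touching, no overlap */
    assert(ranges_overlap(0, 4097, 4096, 4096));    /* one byte in common */
    assert(ranges_overlap(512, 512, 0, 4096));      /* fully contained */
    return 0;
}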
+ */ + if (bs->io_limits_enabled) { + fprintf(stderr, "Disabling I/O throttling on '%s' due " + "to synchronous I/O.\n", bdrv_get_device_name(bs)); + bdrv_io_limits_disable(bs); + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write, BdrvRequestFlags flags) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, sector_num, buf, nb_sectors); + bs->io_limits_enabled = enabled; + return ret; +} + +/* Return < 0 if error. Important errors are: + -EIO generic I/O error (may happen for all errors) + -ENOMEDIUM No media inserted. + -EINVAL Invalid sector number or nb_sectors + -EACCES Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); +} + +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
+ */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +{ + int64_t target_sectors, ret, nb_sectors, sector_num = 0; + int n; + + target_sectors = bdrv_nb_sectors(bs); + if (target_sectors < 0) { + return target_sectors; + } + + for (;;) { + nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + if (nb_sectors <= 0) { + return 0; + } + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + error_report("error getting block status at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + if (ret & BDRV_BLOCK_ZERO) { + sector_num += n; + continue; + } + ret = bdrv_write_zeroes(bs, sector_num, n, flags); + if (ret < 0) { + error_report("error writing zeroes at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + sector_num += n; + } +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; + int ret; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; + } + + return bytes; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = bytes, + }; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count) +{ + int ret; + + ret = bdrv_pwrite(bs, offset, buf, count); + if (ret < 0) { + return ret; + } + + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { + bdrv_flush(bs); + } + + return 0; +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. 
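bdrv_co_do_copy_on_readv() widens the guest request to whole clusters, reads that range into a private bounce buffer, writes it back (or write-zeroes it if the buffer turns out to be all zero), and finally copies only the originally requested slice into the caller's qiov starting at skip_bytes. A standalone sketch of the widening and slice arithmetic (the cluster size is an arbitrary example):

#include <assert.h>
#include <stdint.h>

#define SECTOR_SIZE 512

/* Round [sector_num, +nb_sectors) out to whole clusters, as
 * bdrv_round_to_clusters() does, then compute where the caller's data
 * starts inside the widened bounce buffer. */
int main(void)
{
    int64_t cluster_sectors = 128;          /* 64 KiB clusters, for example */
    int64_t sector_num = 300, nb_sectors = 10;

    int64_t cluster_sector_num = (sector_num / cluster_sectors) * cluster_sectors;
    int64_t cluster_nb_sectors =
        ((sector_num - cluster_sector_num + nb_sectors + cluster_sectors - 1)
         / cluster_sectors) * cluster_sectors;

    assert(cluster_sector_num == 256);      /* widened start */
    assert(cluster_nb_sectors == 128);      /* sectors 256..383 cover 300..309 */

    /* Offset of the requested data inside the bounce buffer. */
    int64_t skip_bytes = (sector_num - cluster_sector_num) * SECTOR_SIZE;
    assert(skip_bytes == 44 * SECTOR_SIZE);
    return 0;
}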
+ */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors, 0); + } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + } + + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + +/* + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. + */ +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. 
*/ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + wait_serialising_requests(req); + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + /* Forward the request to the BlockDriver */ + if (!bs->zero_beyond_eof) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + /* Read zeros after EOF */ + int64_t total_sectors, max_nb_sectors; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; + goto out; + } + + max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), + align >> BDRV_SECTOR_BITS); + if (nb_sectors < max_nb_sectors) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else if (max_nb_sectors > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, 0, + max_nb_sectors * BDRV_SECTOR_SIZE); + + ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, + &local_qiov); + + qemu_iovec_destroy(&local_qiov); + } else { + ret = 0; + } + + /* Reading beyond end of file is supposed to produce zeroes */ + if (ret == 0 && total_sectors < sector_num + nb_sectors) { + uint64_t offset = MAX(0, total_sectors - sector_num); + uint64_t bytes = (sector_num + nb_sectors - offset) * + BDRV_SECTOR_SIZE; + qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + } + } + +out: + return ret; +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + throttle_group_co_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? 
&local_qiov : qiov, + flags); + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); +} + +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov = {0}; + int ret = 0; + + int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, + BDRV_REQUEST_MAX_SECTORS); + + while (nb_sectors > 0 && !ret) { + int num = nb_sectors; + + /* Align request. Block drivers can expect the "bulk" of the request + * to be aligned. + */ + if (bs->bl.write_zeroes_alignment + && num > bs->bl.write_zeroes_alignment) { + if (sector_num % bs->bl.write_zeroes_alignment != 0) { + /* Make a small request up to the first aligned sector. */ + num = bs->bl.write_zeroes_alignment; + num -= sector_num % bs->bl.write_zeroes_alignment; + } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { + /* Shorten the request to the last aligned sector. num cannot + * underflow because num > bs->bl.write_zeroes_alignment. + */ + num -= (sector_num + num) % bs->bl.write_zeroes_alignment; + } + } + + /* limit request size */ + if (num > max_write_zeroes) { + num = max_write_zeroes; + } + + ret = -ENOTSUP; + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + } + + if (ret == -ENOTSUP) { + /* Fall back to bounce buffer if write zeroes is unsupported */ + int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + MAX_WRITE_ZEROES_BOUNCE_BUFFER); + num = MIN(num, max_xfer_len); + iov.iov_len = num * BDRV_SECTOR_SIZE; + if (iov.iov_base == NULL) { + iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + if (iov.iov_base == NULL) { + ret = -ENOMEM; + goto fail; + } + memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + } + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + + /* Keep bounce buffer around if it is big enough for all + * all future requests. + */ + if (num < max_xfer_len) { + qemu_vfree(iov.iov_base); + iov.iov_base = NULL; + } + } + + sector_num += num; + nb_sectors -= num; + } + +fail: + qemu_vfree(iov.iov_base); + return ret; +} + +/* + * Forwards an already correctly aligned write request to the BlockDriver. 
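bdrv_co_do_write_zeroes() splits each request so that its bulk is aligned to bl.write_zeroes_alignment: a short head brings the position up to the first aligned sector, the middle is issued in aligned, size-capped chunks, and any unaligned tail goes last; when the driver has no bdrv_co_write_zeroes callback it falls back to writing a zeroed bounce buffer. The chunking rule, restated standalone with example numbers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Reproduce the chunking of bdrv_co_do_write_zeroes() for one request:
 * alignment 8 sectors, request covering sectors [5, 25). */
int main(void)
{
    int64_t sector_num = 5;
    int nb_sectors = 20;
    int alignment = 8;            /* bs->bl.write_zeroes_alignment */
    int chunks[8], n = 0;

    while (nb_sectors > 0) {
        int num = nb_sectors;

        if (alignment && num > alignment) {
            if (sector_num % alignment != 0) {
                /* short head up to the first aligned sector */
                num = alignment - sector_num % alignment;
            } else if ((sector_num + num) % alignment != 0) {
                /* shorten so the chunk ends on an aligned sector */
                num -= (sector_num + num) % alignment;
            }
        }
        chunks[n++] = num;
        sector_num += num;
        nb_sectors -= num;
    }

    /* head of 3 sectors, aligned middle of 16, tail of 1 */
    assert(n == 3 && chunks[0] == 3 && chunks[1] == 16 && chunks[2] == 1);
    printf("chunks: %d %d %d\n", chunks[0], chunks[1], chunks[2]);
    return 0;
}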
+ */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + bool waited; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); + + if (ret >= 0) { + bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, + int64_t offset, + unsigned int bytes, + BdrvRequestFlags flags, + BdrvTrackedRequest *req) +{ + uint8_t *buf = NULL; + QEMUIOVector local_qiov; + struct iovec iov; + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + unsigned int head_padding_bytes, tail_padding_bytes; + int ret = 0; + + head_padding_bytes = offset & (align - 1); + tail_padding_bytes = align - ((offset + bytes) & (align - 1)); + + + assert(flags & BDRV_REQ_ZERO_WRITE); + if (head_padding_bytes || tail_padding_bytes) { + buf = qemu_blockalign(bs, align); + iov = (struct iovec) { + .iov_base = buf, + .iov_len = align, + }; + qemu_iovec_init_external(&local_qiov, &iov, 1); + } + if (head_padding_bytes) { + uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); + + /* RMW the unaligned part before head. */ + mark_request_serialising(req, align); + wait_serialising_requests(req); + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, + align, &local_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + memset(buf + head_padding_bytes, 0, zero_bytes); + ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, + &local_qiov, + flags & ~BDRV_REQ_ZERO_WRITE); + if (ret < 0) { + goto fail; + } + offset += zero_bytes; + bytes -= zero_bytes; + } + + assert(!bytes || (offset & (align - 1)) == 0); + if (bytes >= align) { + /* Write the aligned part in the middle. 
*/ + uint64_t aligned_bytes = bytes & ~(align - 1); + ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, + NULL, flags); + if (ret < 0) { + goto fail; + } + bytes -= aligned_bytes; + offset += aligned_bytes; + } + + assert(!bytes || (offset & (align - 1)) == 0); + if (bytes) { + assert(align == tail_padding_bytes + bytes); + /* RMW the unaligned part after tail. */ + mark_request_serialising(req, align); + wait_serialising_requests(req); + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, req, offset, align, + align, &local_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + memset(buf, 0, bytes); + ret = bdrv_aligned_pwritev(bs, req, offset, align, + &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); + } +fail: + qemu_vfree(buf); + return ret; + +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EPERM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + throttle_group_co_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
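When a write is not aligned to MAX(BDRV_SECTOR_SIZE, bs->request_alignment), bdrv_co_do_pwritev() turns it into a read-modify-write: it reads the aligned blocks around the head and tail, marks the request serialising so no overlapping request runs concurrently, and submits one padded, aligned write. The padding arithmetic with concrete numbers (the alignment value is only an example):

#include <assert.h>
#include <stdint.h>

#define ROUND_UP(n, d) ((((n) + (d) - 1) / (d)) * (d))

int main(void)
{
    uint64_t align = 512;                 /* e.g. MAX(BDRV_SECTOR_SIZE, request_alignment) */
    int64_t offset = 700;
    unsigned int bytes = 1000;            /* guest wants [700, 1700) */

    /* head padding: extend the request down to the previous aligned offset */
    if (offset & (align - 1)) {
        bytes += offset & (align - 1);    /* 1000 + 188 = 1188 */
        offset = offset & ~(align - 1);   /* 512 */
    }

    /* tail padding: extend the request up to the next aligned offset */
    if ((offset + bytes) & (align - 1)) {
        bytes = ROUND_UP(bytes, align);   /* 1536 */
    }

    assert(offset == 512 && bytes == 1536);
    /* the driver sees one aligned write covering [512, 2048) */
    assert(offset + bytes == 2048);
    return 0;
}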
+ */ + tracked_request_begin(&req, bs, offset, bytes, true); + + if (!qiov) { + ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); + goto out; + } + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? 
&local_qiov : qiov, + flags); + +fail: + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + } + qemu_vfree(head_buf); + qemu_vfree(tail_buf); +out: + tracked_request_end(&req); + return ret; +} + +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) +{ + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE | flags); +} + +int bdrv_flush_all(void) +{ + BlockDriverState *bs = NULL; + int result = 0; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + int ret; + + aio_context_acquire(aio_context); + ret = bdrv_flush(bs); + if (ret < 0 && !result) { + result = ret; + } + aio_context_release(aio_context); + } + + return result; +} + +typedef struct BdrvCoGetBlockStatusData { + BlockDriverState *bs; + BlockDriverState *base; + int64_t sector_num; + int nb_sectors; + int *pnum; + int64_t ret; + bool done; +} BdrvCoGetBlockStatusData; + +/* + * Returns the allocation status of the specified sectors. + * Drivers not implementing the functionality are assumed to not support + * backing files, hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. 
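bdrv_co_get_block_status() packs several facts into one int64_t: flag bits such as BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_ALLOCATED and BDRV_BLOCK_OFFSET_VALID in the low bits and, when the offset is valid, a sector-aligned offset ORed into the higher bits (the default path above ORs in sector_num * BDRV_SECTOR_SIZE). A decoding sketch using made-up stand-in constants; the real values live in block.h and may differ:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the BDRV_BLOCK_* constants (values assumed,
 * not copied from block.h); the layout idea is flags in the low bits and
 * a 512-byte-aligned offset in the remaining bits. */
#define BLK_DATA          0x01
#define BLK_ZERO          0x02
#define BLK_OFFSET_VALID  0x04
#define BLK_ALLOCATED     0x10
#define BLK_OFFSET_MASK   (~INT64_C(511))

int main(void)
{
    int64_t sector_num = 1024;
    /* what a protocol driver without get_block_status would report */
    int64_t ret = BLK_DATA | BLK_ALLOCATED | BLK_OFFSET_VALID
                  | (sector_num * 512);

    assert(ret & BLK_DATA);
    assert(!(ret & BLK_ZERO));
    assert((ret & BLK_OFFSET_MASK) == sector_num * 512);

    printf("allocated=%d zero=%d offset=%" PRId64 "\n",
           !!(ret & BLK_ALLOCATED), !!(ret & BLK_ZERO),
           ret & BLK_OFFSET_MASK);
    return 0;
}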
+ */ +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t total_sectors; + int64_t n; + int64_t ret, ret2; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; + } + + if (sector_num >= total_sectors) { + *pnum = 0; + return 0; + } + + n = total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_get_block_status) { + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; + if (bs->drv->protocol_name) { + ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); + } + return ret; + } + + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + *pnum = 0; + return ret; + } + + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID); + return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); + } + + if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { + ret |= BDRV_BLOCK_ALLOCATED; + } else { + if (bdrv_unallocated_blocks_are_zero(bs)) { + ret |= BDRV_BLOCK_ZERO; + } else if (bs->backing_hd) { + BlockDriverState *bs2 = bs->backing_hd; + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { + ret |= BDRV_BLOCK_ZERO; + } + } + } + + if (bs->file && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + int file_pnum; + + ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it + * is useful but not necessary. + */ + if (!file_pnum) { + /* !file_pnum indicates an offset at or beyond the EOF; it is + * perfectly valid for the format block driver to point to such + * offsets, so catch it and mark everything as zero */ + ret |= BDRV_BLOCK_ZERO; + } else { + /* Limit request to the range reported by the protocol driver */ + *pnum = file_pnum; + ret |= (ret2 & BDRV_BLOCK_ZERO); + } + } + } + + return ret; +} + +static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, + int *pnum) +{ + BlockDriverState *p; + int64_t ret = 0; + + assert(bs != base); + for (p = bs; p != base; p = p->backing_hd) { + ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); + if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { + break; + } + /* [sector_num, pnum] unallocated on this layer, which could be only + * the first part of [sector_num, nb_sectors]. */ + nb_sectors = MIN(nb_sectors, *pnum); + } + return ret; +} + +/* Coroutine wrapper for bdrv_get_block_status_above() */ +static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) +{ + BdrvCoGetBlockStatusData *data = opaque; + + data->ret = bdrv_co_get_block_status_above(data->bs, data->base, + data->sector_num, + data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_get_block_status_above(). + * + * See bdrv_co_get_block_status_above() for details. 
+ */ +int64_t bdrv_get_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + Coroutine *co; + BdrvCoGetBlockStatusData data = { + .bs = bs, + .base = base, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_get_block_status_above_co_entry(&data); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + aio_poll(aio_context, true); + } + } + return data.ret; +} + +int64_t bdrv_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + return bdrv_get_block_status_above(bs, bs->backing_hd, + sector_num, nb_sectors, pnum); +} + +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + return ret; + } + return !!(ret & BDRV_BLOCK_ALLOCATED); +} + +/* + * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int bdrv_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BlockDriverState *intermediate; + int ret, n = nb_sectors; + + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; + } + + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. 
+ */ + if (n > pnum_inter && + (intermediate == top || + sector_num + pnum_inter < intermediate->total_sectors)) { + n = pnum_inter; + } + + intermediate = intermediate->backing_hd; + } + + *pnum = n; + return 0; +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (!drv->bdrv_write_compressed) { + return -ENOTSUP; + } + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + + return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_writev_vmstate(bs, &qiov, pos); +} + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +{ + BlockDriver *drv = bs->drv; + + if (!drv) { + return -ENOMEDIUM; + } else if (drv->bdrv_save_vmstate) { + return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (bs->file) { + return bdrv_writev_vmstate(bs->file, qiov, pos); + } + + return -ENOTSUP; +} + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (drv->bdrv_load_vmstate) + return drv->bdrv_load_vmstate(bs, buf, pos, size); + if (bs->file) + return bdrv_load_vmstate(bs->file, buf, pos, size); + return -ENOTSUP; +} + +/**************************************************************/ +/* async I/Os */ + +BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, false); +} + +BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, + cb, opaque, true); +} + + +typedef struct MultiwriteCB { + int error; + int num_requests; + int num_callbacks; + struct { + BlockCompletionFunc *cb; + void *opaque; + QEMUIOVector *free_qiov; + } callbacks[]; +} MultiwriteCB; + +static void multiwrite_user_cb(MultiwriteCB *mcb) +{ + int i; + + for (i = 0; i < mcb->num_callbacks; i++) { + mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); + if (mcb->callbacks[i].free_qiov) { + qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + } + g_free(mcb->callbacks[i].free_qiov); + } +} + +static void multiwrite_cb(void *opaque, int ret) +{ + MultiwriteCB *mcb = opaque; + + trace_multiwrite_cb(mcb, ret); + + if (ret < 0 && !mcb->error) { + mcb->error = ret; + } + + mcb->num_requests--; + if (mcb->num_requests == 0) { + multiwrite_user_cb(mcb); + g_free(mcb); + } +} + +static int multiwrite_req_compare(const void *a, 
const void *b) +{ + const BlockRequest *req1 = a, *req2 = b; + + /* + * Note that we can't simply subtract req2->sector from req1->sector + * here as that could overflow the return value. + */ + if (req1->sector > req2->sector) { + return 1; + } else if (req1->sector < req2->sector) { + return -1; + } else { + return 0; + } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs, MultiwriteCB *mcb) +{ + int i, outidx; + + // Sort requests by start sector + qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + + // Check if adjacent requests touch the same clusters. If so, combine them, + // filling up gaps with zero sectors. + outidx = 0; + for (i = 1; i < num_reqs; i++) { + int merge = 0; + int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + + // Handle exactly sequential writes and overlapping writes. + if (reqs[i].sector <= oldreq_last) { + merge = 1; + } + + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + merge = 0; + } + + if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + + reqs[i].nb_sectors > bs->bl.max_transfer_length) { + merge = 0; + } + + if (merge) { + size_t size; + QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); + qemu_iovec_init(qiov, + reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + + // Add the first request to the merged one. If the requests are + // overlapping, drop the last sectors of the first request. + size = (reqs[i].sector - reqs[outidx].sector) << 9; + qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + + // We should need to add any zeros between the two requests + assert (reqs[i].sector <= oldreq_last); + + // Add the second request + qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + + // Add tail of first request, if necessary + if (qiov->size < reqs[outidx].qiov->size) { + qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, + reqs[outidx].qiov->size - qiov->size); + } + + reqs[outidx].nb_sectors = qiov->size >> 9; + reqs[outidx].qiov = qiov; + + mcb->callbacks[i].free_qiov = reqs[outidx].qiov; + } else { + outidx++; + reqs[outidx].sector = reqs[i].sector; + reqs[outidx].nb_sectors = reqs[i].nb_sectors; + reqs[outidx].qiov = reqs[i].qiov; + } + } + + block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); + + return outidx + 1; +} + +/* + * Submit multiple AIO write requests at once. + * + * On success, the function returns 0 and all requests in the reqs array have + * been submitted. In error case this function returns -1, and any of the + * requests may or may not be submitted yet. In particular, this means that the + * callback will be called for some of the requests, for others it won't. The + * caller must check the error field of the BlockRequest to wait for the right + * callbacks (if error != 0, no callback will be called). + * + * The implementation may modify the contents of the reqs array, e.g. to merge + * requests. However, the fields opaque and error are left unmodified as they + * are used to signal failure for a single request to the caller. 
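+ *
+ * Even when several requests are merged into a single I/O operation, the
+ * completion callback of every original request is still invoked
+ * individually once all submitted requests have completed.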
+ */ +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +{ + MultiwriteCB *mcb; + int i; + + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + + if (num_reqs == 0) { + return 0; + } + + // Create MultiwriteCB structure + mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); + mcb->num_requests = 0; + mcb->num_callbacks = num_reqs; + + for (i = 0; i < num_reqs; i++) { + mcb->callbacks[i].cb = reqs[i].cb; + mcb->callbacks[i].opaque = reqs[i].opaque; + } + + // Check for mergable requests + num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + + trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + + /* Run the aio requests. */ + mcb->num_requests = num_reqs; + for (i = 0; i < num_reqs; i++) { + bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, reqs[i].flags, + multiwrite_cb, mcb, + true); + } + + return 0; +} + +void bdrv_aio_cancel(BlockAIOCB *acb) +{ + qemu_aio_ref(acb); + bdrv_aio_cancel_async(acb); + while (acb->refcnt > 1) { + if (acb->aiocb_info->get_aio_context) { + aio_poll(acb->aiocb_info->get_aio_context(acb), true); + } else if (acb->bs) { + aio_poll(bdrv_get_aio_context(acb->bs), true); + } else { + abort(); + } + } + qemu_aio_unref(acb); +} + +/* Async version of aio cancel. The caller is not blocked if the acb implements + * cancel_async, otherwise we do nothing and let the request normally complete. + * In either case the completion callback must be called. */ +void bdrv_aio_cancel_async(BlockAIOCB *acb) +{ + if (acb->aiocb_info->cancel_async) { + acb->aiocb_info->cancel_async(acb); + } +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockAIOCBSync { + BlockAIOCB common; + QEMUBH *bh; + int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; +} BlockAIOCBSync; + +static const AIOCBInfo bdrv_em_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBSync), +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ + BlockAIOCBSync *acb = opaque; + + if (!acb->is_write && acb->ret >= 0) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_unref(acb); +} + +static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + int is_write) + +{ + BlockAIOCBSync *acb; + + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_try_blockalign(bs, qiov->size); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + + qemu_bh_schedule(acb->bh); + + return &acb->common; +} + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB 
*bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + +typedef struct BlockAIOCBCoroutine { + BlockAIOCB common; + BlockRequest req; + bool is_write; + bool need_bh; + bool *done; + QEMUBH* bh; +} BlockAIOCBCoroutine; + +static const AIOCBInfo bdrv_em_co_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBCoroutine), +}; + +static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +{ + if (!acb->need_bh) { + acb->common.cb(acb->common.opaque, acb->req.error); + qemu_aio_unref(acb); + } +} + +static void bdrv_co_em_bh(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + + assert(!acb->need_bh); + qemu_bh_delete(acb->bh); + bdrv_co_complete(acb); +} + +static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +{ + acb->need_bh = false; + if (acb->req.error != -EINPROGRESS) { + BlockDriverState *bs = acb->common.bs; + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); + } +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } else { + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } + + bdrv_co_complete(acb); +} + +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = qiov; + acb->req.flags = flags; + acb->is_write = is_write; + + co = qemu_coroutine_create(bdrv_co_do_rw); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_flush(bs); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_flush(bs, opaque); + + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + 
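/* Fill in the discard request before starting the coroutine. */
+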
acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCB *acb; + + acb = g_slice_alloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + acb->refcnt = 1; + return acb; +} + +void qemu_aio_ref(void *p) +{ + BlockAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ + BlockAIOCB *acb = p; + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_slice_free1(acb->aiocb_info->aiocb_size, acb); + } +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; + + if (is_write) { + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } + + trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + + if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || + bdrv_is_sg(bs)) { + return 0; + } + + /* Write back cached data to the OS even with cache=unsafe */ + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + return ret; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + goto flush_parent; + } + + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + if (bs->drv->bdrv_co_flush_to_disk) { + ret = bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. 
Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. + */ + ret = 0; + } + if (ret < 0) { + return ret; + } + + /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH + * in the case of cache=unsafe, so there are no useless flushes. + */ +flush_parent: + return bdrv_co_flush(bs->file); +} + +int bdrv_flush(BlockDriverState *bs) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_flush_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_flush_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ + DiscardCo *rwco = opaque; + + rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + int max_discard, ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } else if (bs->read_only) { + return -EPERM; + } + + /* Do nothing if disabled. */ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + return 0; + } + + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + return 0; + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); + while (nb_sectors > 0) { + int ret; + int num = nb_sectors; + + /* align request */ + if (bs->bl.discard_alignment && + num >= bs->bl.discard_alignment && + sector_num % bs->bl.discard_alignment) { + if (num > bs->bl.discard_alignment) { + num = bs->bl.discard_alignment; + } + num -= sector_num % bs->bl.discard_alignment; + } + + /* limit request size */ + if (num > max_discard) { + num = max_discard; + } + + if (bs->drv->bdrv_co_discard) { + ret = bs->drv->bdrv_co_discard(bs, sector_num, num); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + if (ret && ret != -ENOTSUP) { + return ret; + } + + sector_num += num; + nb_sectors -= num; + } + return 0; +} + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + Coroutine *co; + DiscardCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_discard_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_discard_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +/* needed for generic scsi interface */ + +int 
bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_ioctl) + return drv->bdrv_ioctl(bs, req, buf); + return -ENOTSUP; +} + +BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_aio_ioctl) + return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); + return NULL; +} + +void *qemu_blockalign(BlockDriverState *bs, size_t size) +{ + return qemu_memalign(bdrv_opt_mem_align(bs), size); +} + +void *qemu_blockalign0(BlockDriverState *bs, size_t size) +{ + return memset(qemu_blockalign(bs, size), 0, size); +} + +void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +{ + size_t align = bdrv_opt_mem_align(bs); + + /* Ensure that NULL is never returned on success */ + assert(align > 0); + if (size == 0) { + size = align; + } + + return qemu_try_memalign(align, size); +} + +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) +{ + void *mem = qemu_try_blockalign(bs, size); + + if (mem) { + memset(mem, 0, size); + } + + return mem; +} + +/* + * Check if all memory in this vector is sector aligned. + */ +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + size_t alignment = bdrv_min_mem_align(bs); + + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { + return false; + } + } + + return true; +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ + notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} + +void bdrv_io_plug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } else if (bs->file) { + bdrv_io_plug(bs->file); + } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } else if (bs->file) { + bdrv_io_unplug(bs->file); + } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_flush_io_queue) { + drv->bdrv_flush_io_queue(bs); + } else if (bs->file) { + bdrv_flush_io_queue(bs->file); + } + bdrv_start_throttled_reqs(bs); +} diff --git a/block/iscsi.c b/block/iscsi.c index be8af46ad..93f1ee4c6 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,7 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg - * Copyright (c) 2012-2014 Peter Lieven + * Copyright (c) 2012-2015 Peter Lieven * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -38,6 +38,7 @@ #include "qemu/iov.h" #include "sysemu/sysemu.h" #include "qmp-commands.h" +#include "qapi/qmp/qstring.h" #include #include @@ -57,9 +58,6 @@ typedef struct IscsiLun { int events; QEMUTimer *nop_timer; QEMUTimer *event_timer; - uint8_t lbpme; - uint8_t lbprz; - uint8_t has_write_same; struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; unsigned char *zeroblock; @@ -67,6 +65,12 @@ typedef struct IscsiLun { int cluster_sectors; bool use_16_for_rw; bool write_protected; + bool lbpme; + bool lbprz; + bool dpofua; + bool has_write_same; + bool force_next_flush; + bool request_timed_out; } IscsiLun; typedef struct IscsiTask { @@ -79,6 +83,7 @@ typedef struct IscsiTask { 
QEMUBH *bh; IscsiLun *iscsilun; QEMUTimer retry_timer; + bool force_next_flush; } IscsiTask; typedef struct IscsiAIOCB { @@ -96,11 +101,12 @@ typedef struct IscsiAIOCB { #endif } IscsiAIOCB; -#define EVENT_INTERVAL 250 +/* libiscsi uses time_t so its enough to process events every second */ +#define EVENT_INTERVAL 1000 #define NOP_INTERVAL 5000 #define MAX_NOP_FAILURES 3 #define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times) -static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048}; +static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768}; /* this threshold is a trade-off knob to choose between * the potential additional overhead of an extra GET_LBA_STATUS request @@ -163,6 +169,19 @@ static inline unsigned exp_random(double mean) return -mean * log((double)rand() / RAND_MAX); } +/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced + * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION + * macro was introduced in 1.11.0. So use the API_VERSION macro as + * a hint that the macros are defined and define them ourselves + * otherwise to keep the required libiscsi version at 1.9.0 */ +#if !defined(LIBISCSI_API_VERSION) +#define QEMU_SCSI_STATUS_TASK_SET_FULL 0x28 +#define QEMU_SCSI_STATUS_TIMEOUT 0x0f000002 +#else +#define QEMU_SCSI_STATUS_TASK_SET_FULL SCSI_STATUS_TASK_SET_FULL +#define QEMU_SCSI_STATUS_TIMEOUT SCSI_STATUS_TIMEOUT +#endif + static void iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, void *command_data, void *opaque) @@ -183,10 +202,19 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, iTask->do_retry = 1; goto out; } - if (status == SCSI_STATUS_BUSY) { + if (status == SCSI_STATUS_BUSY || + status == QEMU_SCSI_STATUS_TIMEOUT || + status == QEMU_SCSI_STATUS_TASK_SET_FULL) { unsigned retry_time = exp_random(iscsi_retry_times[iTask->retries - 1]); - error_report("iSCSI Busy (retry #%u in %u ms): %s", + if (status == QEMU_SCSI_STATUS_TIMEOUT) { + /* make sure the request is rescheduled AFTER the + * reconnect is initiated */ + retry_time = EVENT_INTERVAL * 2; + iTask->iscsilun->request_timed_out = true; + } + error_report("iSCSI Busy/TaskSetFull/TimeOut" + " (retry #%u in %u ms): %s", iTask->retries, retry_time, iscsi_get_error(iscsi)); aio_timer_init(iTask->iscsilun->aio_context, @@ -199,6 +227,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, } } error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); + } else { + iTask->iscsilun->force_next_flush |= iTask->force_next_flush; } out: @@ -268,20 +298,26 @@ iscsi_set_events(IscsiLun *iscsilun) iscsilun); iscsilun->events = ev; } - - /* newer versions of libiscsi may return zero events. In this - * case start a timer to ensure we are able to return to service - * once this situation changes. */ - if (!ev) { - timer_mod(iscsilun->event_timer, - qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); - } } -static void iscsi_timed_set_events(void *opaque) +static void iscsi_timed_check_events(void *opaque) { IscsiLun *iscsilun = opaque; + + /* check for timed out requests */ + iscsi_service(iscsilun->iscsi, 0); + + if (iscsilun->request_timed_out) { + iscsilun->request_timed_out = false; + iscsi_reconnect(iscsilun->iscsi); + } + + /* newer versions of libiscsi may return zero events. Ensure we are able + * to return to service once this situation changes. 
*/ iscsi_set_events(iscsilun); + + timer_mod(iscsilun->event_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); } static void @@ -369,6 +405,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; + int fua; if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; @@ -384,15 +421,17 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: + fua = iscsilun->dpofua && !bs->enable_write_cache; + iTask.force_next_flush = !fua; if (iscsilun->use_16_for_rw) { iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } else { iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } if (iTask.task == NULL) { @@ -460,7 +499,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, *pnum = nb_sectors; /* LUN does not support logical block provisioning */ - if (iscsilun->lbpme == 0) { + if (!iscsilun->lbpme) { goto out; } @@ -616,12 +655,12 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; - if (bs->sg) { + if (!iscsilun->force_next_flush) { return 0; } + iscsilun->force_next_flush = false; iscsi_co_init_iscsitask(iscsilun, &iTask); - retry: if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { @@ -917,6 +956,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, } iscsi_co_init_iscsitask(iscsilun, &iTask); + iTask.force_next_flush = true; retry: if (use_16_for_ws) { iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, @@ -1080,16 +1120,37 @@ static char *parse_initiator_name(const char *target) return iscsi_name; } +static int parse_timeout(const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *timeout; + + list = qemu_find_opts("iscsi"); + if (list) { + opts = qemu_opts_find(list, target); + if (!opts) { + opts = QTAILQ_FIRST(&list->head); + } + if (opts) { + timeout = qemu_opt_get(opts, "timeout"); + if (timeout) { + return atoi(timeout); + } + } + } + + return 0; +} + static void iscsi_nop_timed_event(void *opaque) { IscsiLun *iscsilun = opaque; - if (iscsi_get_nops_in_flight(iscsilun->iscsi) > MAX_NOP_FAILURES) { + if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) { error_report("iSCSI: NOP timeout. Reconnecting..."); - iscsi_reconnect(iscsilun->iscsi); - } - - if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) { + iscsilun->request_timed_out = true; + } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) { error_report("iSCSI: failed to sent NOP-Out. 
Disabling NOP messages."); return; } @@ -1121,8 +1182,8 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) } else { iscsilun->block_size = rc16->block_length; iscsilun->num_blocks = rc16->returned_lba + 1; - iscsilun->lbpme = rc16->lbpme; - iscsilun->lbprz = rc16->lbprz; + iscsilun->lbpme = !!rc16->lbpme; + iscsilun->lbprz = !!rc16->lbprz; iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); } } @@ -1153,6 +1214,10 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) if (task == NULL || task->status != SCSI_STATUS_GOOD) { error_setg(errp, "iSCSI: failed to send readcapacity10 command."); + } else if (!iscsilun->block_size || + iscsilun->block_size % BDRV_SECTOR_SIZE) { + error_setg(errp, "iSCSI: the target returned an invalid " + "block size of %d.", iscsilun->block_size); } if (task) { scsi_free_scsi_task(task); @@ -1247,17 +1312,21 @@ static void iscsi_attach_aio_context(BlockDriverState *bs, timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); - /* Prepare a timer for a delayed call to iscsi_set_events */ + /* Set up a timer for periodic calls to iscsi_set_events and to + * scan for command timeout */ iscsilun->event_timer = aio_timer_new(iscsilun->aio_context, QEMU_CLOCK_REALTIME, SCALE_MS, - iscsi_timed_set_events, iscsilun); + iscsi_timed_check_events, iscsilun); + timer_mod(iscsilun->event_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); } -static bool iscsi_is_write_protected(IscsiLun *iscsilun) +static void iscsi_modesense_sync(IscsiLun *iscsilun) { struct scsi_task *task; struct scsi_mode_sense *ms = NULL; - bool wrprotected = false; + iscsilun->write_protected = false; + iscsilun->dpofua = false; task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun, 1, SCSI_MODESENSE_PC_CURRENT, @@ -1278,13 +1347,13 @@ static bool iscsi_is_write_protected(IscsiLun *iscsilun) iscsi_get_error(iscsilun->iscsi)); goto out; } - wrprotected = ms->device_specific_parameter & 0x80; + iscsilun->write_protected = ms->device_specific_parameter & 0x80; + iscsilun->dpofua = ms->device_specific_parameter & 0x10; out: if (task) { scsi_free_scsi_task(task); } - return wrprotected; } /* @@ -1304,14 +1373,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, QemuOpts *opts; Error *local_err = NULL; const char *filename; - int i, ret = 0; - - if ((BDRV_SECTOR_SIZE % 512) != 0) { - error_setg(errp, "iSCSI: Invalid BDRV_SECTOR_SIZE. 
" - "BDRV_SECTOR_SIZE(%lld) is not a multiple " - "of 512", BDRV_SECTOR_SIZE); - return -EINVAL; - } + int i, ret = 0, timeout = 0; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); @@ -1381,6 +1443,16 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, goto out; } + /* timeout handling is broken in libiscsi before 1.15.0 */ + timeout = parse_timeout(iscsi_url->target); +#if defined(LIBISCSI_API_VERSION) && LIBISCSI_API_VERSION >= 20150621 + iscsi_set_timeout(iscsi, timeout); +#else + if (timeout) { + error_report("iSCSI: ignoring timeout value for libiscsi <1.15.0"); + } +#endif + if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) { error_setg(errp, "iSCSI: Failed to connect to LUN : %s", iscsi_get_error(iscsi)); @@ -1403,7 +1475,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, scsi_free_scsi_task(task); task = NULL; - iscsilun->write_protected = iscsi_is_write_protected(iscsilun); + iscsi_modesense_sync(iscsilun); + /* Check the write protect flag of the LUN if we want to write */ if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && iscsilun->write_protected) { @@ -1481,7 +1554,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; - if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) { + if (iscsilun->lbprz) { iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); if (iscsilun->allocationmap == NULL) { ret = -ENOMEM; @@ -1655,7 +1728,7 @@ out: static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { IscsiLun *iscsilun = bs->opaque; - bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; + bdi->unallocated_blocks_are_zero = iscsilun->lbprz; bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; return 0; @@ -1728,6 +1801,10 @@ static QemuOptsList qemu_iscsi_opts = { .name = "initiator-name", .type = QEMU_OPT_STRING, .help = "Initiator iqn name to use when connecting", + },{ + .name = "timeout", + .type = QEMU_OPT_NUMBER, + .help = "Request timeout in seconds (default 0 = no timeout)", }, { /* end of list */ } }, diff --git a/block/mirror.c b/block/mirror.c index bd079a439..b2fb4b9b1 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -14,11 +14,13 @@ #include "trace.h" #include "block/blockjob.h" #include "block/block_int.h" +#include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "qemu/bitmap.h" #define SLICE_TIME 100000000ULL /* ns */ #define MAX_IN_FLIGHT 16 +#define DEFAULT_MIRROR_BUF_SIZE (10 << 20) /* The mirroring buffer is a list of granularity-sized chunks. * Free chunks are organized in a list. @@ -58,6 +60,7 @@ typedef struct MirrorBlockJob { int sectors_in_flight; int ret; bool unmap; + bool waiting_for_io; } MirrorBlockJob; typedef struct MirrorOp { @@ -112,11 +115,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) qemu_iovec_destroy(&op->qiov); g_slice_free(MirrorOp, op); - /* Enter coroutine when it is not sleeping. The coroutine sleeps to - * rate-limit itself. The coroutine will eventually resume since there is - * a sleep timeout so don't wake it early. 
- */ - if (s->common.busy) { + if (s->waiting_for_io) { qemu_coroutine_enter(s->common.co, NULL); } } @@ -126,11 +125,9 @@ static void mirror_write_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, false, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; @@ -144,11 +141,9 @@ static void mirror_read_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, true, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; @@ -173,10 +168,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) s->sector_num = hbitmap_iter_next(&s->hbi); if (s->sector_num < 0) { - bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, - bdrv_get_dirty_count(source, s->dirty_bitmap)); + trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); assert(s->sector_num >= 0); } @@ -206,7 +200,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Wait for I/O to this cluster (from a previous iteration) to be done. */ while (test_bit(next_chunk, s->in_flight_bitmap)) { trace_mirror_yield_in_flight(s, sector_num, s->in_flight); + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; } do { @@ -242,7 +238,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) */ while (nb_chunks == 0 && s->buf_free_count < added_chunks) { trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight); + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; } if (s->buf_free_count < nb_chunks + added_chunks) { trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); @@ -291,8 +289,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) next_sector += sectors_per_chunk; } - bdrv_reset_dirty_bitmap(source, s->dirty_bitmap, sector_num, - nb_sectors); + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); /* Copy the dirty cluster. */ s->in_flight++; @@ -337,7 +334,9 @@ static void mirror_free_init(MirrorBlockJob *s) static void mirror_drain(MirrorBlockJob *s) { while (s->in_flight > 0) { + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; } } @@ -392,7 +391,7 @@ static void coroutine_fn mirror_run(void *opaque) MirrorBlockJob *s = opaque; MirrorExitData *data; BlockDriverState *bs = s->common.bs; - int64_t sector_num, end, sectors_per_chunk, length; + int64_t sector_num, end, length; uint64_t last_pause_ns; BlockDriverInfo bdi; char backing_filename[2]; /* we only need 2 characters because we are only @@ -446,16 +445,28 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; mirror_free_init(s); + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); if (!s->is_none_mode) { /* First part, loop on the sectors and initialize the dirty bitmap. 
*/ BlockDriverState *base = s->base; for (sector_num = 0; sector_num < end; ) { - int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1; - ret = bdrv_is_allocated_above(bs, base, - sector_num, next - sector_num, &n); + /* Just to make sure we are not exceeding int limit. */ + int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, + end - sector_num); + int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + + if (now - last_pause_ns > SLICE_TIME) { + last_pause_ns = now; + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); + } + + if (block_job_is_cancelled(&s->common)) { + goto immediate_exit; + } + + ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); if (ret < 0) { goto immediate_exit; @@ -463,16 +474,13 @@ static void coroutine_fn mirror_run(void *opaque) assert(n > 0); if (ret == 1) { - bdrv_set_dirty_bitmap(bs, s->dirty_bitmap, sector_num, n); - sector_num = next; - } else { - sector_num += n; + bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); } + sector_num += n; } } - bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi); - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); for (;;) { uint64_t delay_ns = 0; int64_t cnt; @@ -483,7 +491,7 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); /* s->common.offset contains the number of bytes already processed so * far, cnt is the number of dirty sectors remaining and * s->sectors_in_flight is the number of sectors currently being @@ -492,7 +500,7 @@ static void coroutine_fn mirror_run(void *opaque) (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; /* Note that even when no rate limit is applied we need to yield - * periodically with no pending I/O so that qemu_aio_flush() returns. + * periodically with no pending I/O so that bdrv_drain_all() returns. * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. 
*/ @@ -501,13 +509,12 @@ static void coroutine_fn mirror_run(void *opaque) if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); + s->waiting_for_io = true; qemu_coroutine_yield(); + s->waiting_for_io = false; continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); - if (delay_ns == 0) { - continue; - } } } @@ -533,7 +540,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = s->should_complete || block_job_is_cancelled(&s->common); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } } @@ -548,7 +555,7 @@ static void coroutine_fn mirror_run(void *opaque) */ trace_mirror_before_drain(s, cnt); bdrv_drain(bs); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } ret = 0; @@ -599,7 +606,7 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); if (speed < 0) { - error_set(errp, QERR_INVALID_PARAMETER, "speed"); + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); return; } ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); @@ -624,8 +631,8 @@ static void mirror_complete(BlockJob *job, Error **errp) return; } if (!s->synced) { - error_set(errp, QERR_BLOCK_JOB_NOT_READY, - bdrv_get_device_name(job->bs)); + error_setg(errp, QERR_BLOCK_JOB_NOT_READY, + bdrv_get_device_name(job->bs)); return; } @@ -651,7 +658,7 @@ static void mirror_complete(BlockJob *job, Error **errp) } s->should_complete = true; - block_job_resume(job); + block_job_enter(&s->common); } static const BlockJobDriver mirror_job_driver = { @@ -673,7 +680,7 @@ static const BlockJobDriver commit_active_job_driver = { static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, + int64_t speed, uint32_t granularity, int64_t buf_size, BlockdevOnError on_source_error, BlockdevOnError on_target_error, @@ -686,15 +693,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, MirrorBlockJob *s; if (granularity == 0) { - /* Choose the default granularity based on the target file's cluster - * size, clamped between 4k and 64k. 
*/ - BlockDriverInfo bdi; - if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) { - granularity = MAX(4096, bdi.cluster_size); - granularity = MIN(65536, granularity); - } else { - granularity = 65536; - } + granularity = bdrv_get_default_bitmap_granularity(target); } assert ((granularity & (granularity - 1)) == 0); @@ -702,10 +701,18 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { - error_set(errp, QERR_INVALID_PARAMETER, "on-source-error"); + error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); return; } + if (buf_size < 0) { + error_setg(errp, "Invalid parameter 'buf-size'"); + return; + } + + if (buf_size == 0) { + buf_size = DEFAULT_MIRROR_BUF_SIZE; + } s = block_job_create(driver, bs, speed, cb, opaque, errp); if (!s) { @@ -719,11 +726,13 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->is_none_mode = is_none_mode; s->base = base; s->granularity = granularity; - s->buf_size = MAX(buf_size, granularity); + s->buf_size = ROUND_UP(buf_size, granularity); s->unmap = unmap; - s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp); + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { + g_free(s->replaces); + block_job_release(bs); return; } bdrv_set_enable_write_cache(s->target, true); @@ -736,7 +745,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, void mirror_start(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, int64_t buf_size, + int64_t speed, uint32_t granularity, int64_t buf_size, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, bool unmap, @@ -746,6 +755,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, bool is_none_mode; BlockDriverState *base; + if (mode == MIRROR_SYNC_MODE_INCREMENTAL) { + error_setg(errp, "Sync mode 'incremental' not supported"); + return; + } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; mirror_start_job(bs, target, replaces, diff --git a/block/nfs.c b/block/nfs.c index c026ff688..02eb4e464 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -475,7 +475,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) aio_poll(client->aio_context, true); } - return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize); + return (task.ret < 0 ? 
task.ret : st.st_blocks * 512); } static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) diff --git a/block/null.c b/block/null.c index ec2bd27a4..7d083233f 100644 --- a/block/null.c +++ b/block/null.c @@ -12,8 +12,11 @@ #include "block/block_int.h" +#define NULL_OPT_LATENCY "latency-ns" + typedef struct { int64_t length; + int64_t latency_ns; } BDRVNullState; static QemuOptsList runtime_opts = { @@ -30,6 +33,12 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_SIZE, .help = "size of the null block", }, + { + .name = NULL_OPT_LATENCY, + .type = QEMU_OPT_NUMBER, + .help = "nanoseconds (approximated) to wait " + "before completing request", + }, { /* end of list */ } }, }; @@ -39,13 +48,20 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags, { QemuOpts *opts; BDRVNullState *s = bs->opaque; + int ret = 0; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &error_abort); s->length = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30); + s->latency_ns = + qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0); + if (s->latency_ns < 0) { + error_setg(errp, "latency-ns is invalid"); + ret = -EINVAL; + } qemu_opts_del(opts); - return 0; + return ret; } static void null_close(BlockDriverState *bs) @@ -58,28 +74,40 @@ static int64_t null_getlength(BlockDriverState *bs) return s->length; } +static coroutine_fn int null_co_common(BlockDriverState *bs) +{ + BDRVNullState *s = bs->opaque; + + if (s->latency_ns) { + co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME, + s->latency_ns); + } + return 0; +} + static coroutine_fn int null_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_flush(BlockDriverState *bs) { - return 0; + return null_co_common(bs); } typedef struct { BlockAIOCB common; QEMUBH *bh; + QEMUTimer timer; } NullAIOCB; static const AIOCBInfo null_aiocb_info = { @@ -94,15 +122,33 @@ static void null_bh_cb(void *opaque) qemu_aio_unref(acb); } +static void null_timer_cb(void *opaque) +{ + NullAIOCB *acb = opaque; + acb->common.cb(acb->common.opaque, 0); + timer_deinit(&acb->timer); + qemu_aio_unref(acb); +} + static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { NullAIOCB *acb; + BDRVNullState *s = bs->opaque; acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); - qemu_bh_schedule(acb->bh); + /* Only emulate latency after vcpu is running. 
*/ + if (s->latency_ns) { + aio_timer_init(bdrv_get_aio_context(bs), &acb->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + null_timer_cb, acb); + timer_mod_ns(&acb->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns); + } else { + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); + qemu_bh_schedule(acb->bh); + } return &acb->common; } @@ -131,6 +177,12 @@ static BlockAIOCB *null_aio_flush(BlockDriverState *bs, return null_aio_common(bs, cb, opaque); } +static int null_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + static BlockDriver bdrv_null_co = { .format_name = "null-co", .protocol_name = "null-co", @@ -143,6 +195,7 @@ static BlockDriver bdrv_null_co = { .bdrv_co_readv = null_co_readv, .bdrv_co_writev = null_co_writev, .bdrv_co_flush_to_disk = null_co_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static BlockDriver bdrv_null_aio = { @@ -157,6 +210,7 @@ static BlockDriver bdrv_null_aio = { .bdrv_aio_readv = null_aio_readv, .bdrv_aio_writev = null_aio_writev, .bdrv_aio_flush = null_aio_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static void bdrv_null_init(void) diff --git a/block/parallels.c b/block/parallels.c index 4f9cd8dd2..046b56844 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -2,8 +2,12 @@ * Block driver for Parallels disk image format * * Copyright (c) 2007 Alex Beregszaszi + * Copyright (c) 2015 Denis V. Lunev * - * This code is based on comparing different disk images created by Parallels. + * This code was originally based on comparing different disk images created + * by Parallels. Currently it is based on opened OpenVZ sources + * available at + * http://git.openvz.org/?p=ploop;a=summary * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,63 +30,539 @@ #include "qemu-common.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qemu/bitmap.h" +#include "qapi/util.h" /**************************************************************/ #define HEADER_MAGIC "WithoutFreeSpace" #define HEADER_MAGIC2 "WithouFreSpacExt" #define HEADER_VERSION 2 -#define HEADER_SIZE 64 +#define HEADER_INUSE_MAGIC (0x746F6E59) + +#define DEFAULT_CLUSTER_SIZE 1048576 /* 1 MiB */ + // always little-endian -struct parallels_header { +typedef struct ParallelsHeader { char magic[16]; // "WithoutFreeSpace" uint32_t version; uint32_t heads; uint32_t cylinders; uint32_t tracks; - uint32_t catalog_entries; + uint32_t bat_entries; uint64_t nb_sectors; uint32_t inuse; uint32_t data_off; char padding[12]; -} QEMU_PACKED; +} QEMU_PACKED ParallelsHeader; + + +typedef enum ParallelsPreallocMode { + PRL_PREALLOC_MODE_FALLOCATE = 0, + PRL_PREALLOC_MODE_TRUNCATE = 1, + PRL_PREALLOC_MODE_MAX = 2, +} ParallelsPreallocMode; + +static const char *prealloc_mode_lookup[] = { + "falloc", + "truncate", + NULL, +}; + typedef struct BDRVParallelsState { + /** Locking is conservative, the lock protects + * - image file extending (truncate, fallocate) + * - any access to block allocation table + */ CoMutex lock; - uint32_t *catalog_bitmap; - unsigned int catalog_size; + ParallelsHeader *header; + uint32_t header_size; + bool header_unclean; + + unsigned long *bat_dirty_bmap; + unsigned int bat_dirty_block; + + uint32_t *bat_bitmap; + unsigned int bat_size; + + int64_t data_end; + uint64_t prealloc_size; + ParallelsPreallocMode prealloc_mode; unsigned int tracks; unsigned int off_multiplier; } 
BDRVParallelsState; -static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename) + +#define PARALLELS_OPT_PREALLOC_MODE "prealloc-mode" +#define PARALLELS_OPT_PREALLOC_SIZE "prealloc-size" + +static QemuOptsList parallels_runtime_opts = { + .name = "parallels", + .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head), + .desc = { + { + .name = PARALLELS_OPT_PREALLOC_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Preallocation size on image expansion", + .def_value_str = "128MiB", + }, + { + .name = PARALLELS_OPT_PREALLOC_MODE, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode on image expansion " + "(allowed values: falloc, truncate)", + .def_value_str = "falloc", + }, + { /* end of list */ }, + }, +}; + + +static int64_t bat2sect(BDRVParallelsState *s, uint32_t idx) +{ + return (uint64_t)le32_to_cpu(s->bat_bitmap[idx]) * s->off_multiplier; +} + +static uint32_t bat_entry_off(uint32_t idx) +{ + return sizeof(ParallelsHeader) + sizeof(uint32_t) * idx; +} + +static int64_t seek_to_sector(BDRVParallelsState *s, int64_t sector_num) +{ + uint32_t index, offset; + + index = sector_num / s->tracks; + offset = sector_num % s->tracks; + + /* not allocated */ + if ((index >= s->bat_size) || (s->bat_bitmap[index] == 0)) { + return -1; + } + return bat2sect(s, index) + offset; +} + +static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num, + int nb_sectors) +{ + int ret = s->tracks - sector_num % s->tracks; + return MIN(nb_sectors, ret); +} + +static int64_t block_status(BDRVParallelsState *s, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t start_off = -2, prev_end_off = -2; + + *pnum = 0; + while (nb_sectors > 0 || start_off == -2) { + int64_t offset = seek_to_sector(s, sector_num); + int to_end; + + if (start_off == -2) { + start_off = offset; + prev_end_off = offset; + } else if (offset != prev_end_off) { + break; + } + + to_end = cluster_remainder(s, sector_num, nb_sectors); + nb_sectors -= to_end; + sector_num += to_end; + *pnum += to_end; + + if (offset > 0) { + prev_end_off += to_end; + } + } + return start_off; +} + +static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) { - const struct parallels_header *ph = (const void *)buf; + BDRVParallelsState *s = bs->opaque; + uint32_t idx, to_allocate, i; + int64_t pos, space; + + pos = block_status(s, sector_num, nb_sectors, pnum); + if (pos > 0) { + return pos; + } + + idx = sector_num / s->tracks; + if (idx >= s->bat_size) { + return -EINVAL; + } + + to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx; + space = to_allocate * s->tracks; + if (s->data_end + space > bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS) { + int ret; + space += s->prealloc_size; + if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) { + ret = bdrv_write_zeroes(bs->file, s->data_end, space, 0); + } else { + ret = bdrv_truncate(bs->file, + (s->data_end + space) << BDRV_SECTOR_BITS); + } + if (ret < 0) { + return ret; + } + } + + for (i = 0; i < to_allocate; i++) { + s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier); + s->data_end += s->tracks; + bitmap_set(s->bat_dirty_bmap, + bat_entry_off(idx) / s->bat_dirty_block, 1); + } + + return bat2sect(s, idx) + sector_num % s->tracks; +} + - if (buf_size < HEADER_SIZE) +static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) +{ + BDRVParallelsState *s = bs->opaque; + unsigned long size = DIV_ROUND_UP(s->header_size, s->bat_dirty_block); + unsigned long bit; + + 
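/* Write back only those blocks of the in-memory header/BAT copy that were
+     * dirtied since the previous flush, then clear the dirty bitmap. */
+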
qemu_co_mutex_lock(&s->lock); + + bit = find_first_bit(s->bat_dirty_bmap, size); + while (bit < size) { + uint32_t off = bit * s->bat_dirty_block; + uint32_t to_write = s->bat_dirty_block; + int ret; + + if (off + to_write > s->header_size) { + to_write = s->header_size - off; + } + ret = bdrv_pwrite(bs->file, off, (uint8_t *)s->header + off, to_write); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + bit = find_next_bit(s->bat_dirty_bmap, size, bit + 1); + } + bitmap_zero(s->bat_dirty_bmap, size); + + qemu_co_mutex_unlock(&s->lock); + return 0; +} + + +static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVParallelsState *s = bs->opaque; + int64_t offset; + + qemu_co_mutex_lock(&s->lock); + offset = block_status(s, sector_num, nb_sectors, pnum); + qemu_co_mutex_unlock(&s->lock); + + if (offset < 0) { return 0; + } + + return (offset << BDRV_SECTOR_BITS) | + BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; +} + +static coroutine_fn int parallels_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + BDRVParallelsState *s = bs->opaque; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + int ret = 0; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + while (nb_sectors > 0) { + int64_t position; + int n, nbytes; + + qemu_co_mutex_lock(&s->lock); + position = allocate_clusters(bs, sector_num, nb_sectors, &n); + qemu_co_mutex_unlock(&s->lock); + if (position < 0) { + ret = (int)position; + break; + } + + nbytes = n << BDRV_SECTOR_BITS; + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); + + ret = bdrv_co_writev(bs->file, position, n, &hd_qiov); + if (ret < 0) { + break; + } + + nb_sectors -= n; + sector_num += n; + bytes_done += nbytes; + } + + qemu_iovec_destroy(&hd_qiov); + return ret; +} + +static coroutine_fn int parallels_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + BDRVParallelsState *s = bs->opaque; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + int ret = 0; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + while (nb_sectors > 0) { + int64_t position; + int n, nbytes; + + qemu_co_mutex_lock(&s->lock); + position = block_status(s, sector_num, nb_sectors, &n); + qemu_co_mutex_unlock(&s->lock); + + nbytes = n << BDRV_SECTOR_BITS; + + if (position < 0) { + qemu_iovec_memset(qiov, bytes_done, 0, nbytes); + } else { + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); + + ret = bdrv_co_readv(bs->file, position, n, &hd_qiov); + if (ret < 0) { + break; + } + } + + nb_sectors -= n; + sector_num += n; + bytes_done += nbytes; + } + + qemu_iovec_destroy(&hd_qiov); + return ret; +} + + +static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVParallelsState *s = bs->opaque; + int64_t size, prev_off, high_off; + int ret; + uint32_t i; + bool flush_bat = false; + int cluster_size = s->tracks << BDRV_SECTOR_BITS; + + size = bdrv_getlength(bs->file); + if (size < 0) { + res->check_errors++; + return size; + } + + if (s->header_unclean) { + fprintf(stderr, "%s image was not closed correctly\n", + fix & BDRV_FIX_ERRORS ? 
"Repairing" : "ERROR"); + res->corruptions++; + if (fix & BDRV_FIX_ERRORS) { + /* parallels_close will do the job right */ + res->corruptions_fixed++; + s->header_unclean = false; + } + } + + res->bfi.total_clusters = s->bat_size; + res->bfi.compressed_clusters = 0; /* compression is not supported */ + + high_off = 0; + prev_off = 0; + for (i = 0; i < s->bat_size; i++) { + int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS; + if (off == 0) { + prev_off = 0; + continue; + } + + /* cluster outside the image */ + if (off > size) { + fprintf(stderr, "%s cluster %u is outside image\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); + res->corruptions++; + if (fix & BDRV_FIX_ERRORS) { + prev_off = 0; + s->bat_bitmap[i] = 0; + res->corruptions_fixed++; + flush_bat = true; + continue; + } + } + + res->bfi.allocated_clusters++; + if (off > high_off) { + high_off = off; + } + + if (prev_off != 0 && (prev_off + cluster_size) != off) { + res->bfi.fragmented_clusters++; + } + prev_off = off; + } + + if (flush_bat) { + ret = bdrv_pwrite_sync(bs->file, 0, s->header, s->header_size); + if (ret < 0) { + res->check_errors++; + return ret; + } + } + + res->image_end_offset = high_off + cluster_size; + if (size > res->image_end_offset) { + int64_t count; + count = DIV_ROUND_UP(size - res->image_end_offset, cluster_size); + fprintf(stderr, "%s space leaked at the end of the image %" PRId64 "\n", + fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR", + size - res->image_end_offset); + res->leaks += count; + if (fix & BDRV_FIX_LEAKS) { + ret = bdrv_truncate(bs->file, res->image_end_offset); + if (ret < 0) { + res->check_errors++; + return ret; + } + res->leaks_fixed += count; + } + } + + return 0; +} + + +static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int64_t total_size, cl_size; + uint8_t tmp[BDRV_SECTOR_SIZE]; + Error *local_err = NULL; + BlockDriverState *file; + uint32_t bat_entries, bat_sectors; + ParallelsHeader header; + int ret; + + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + cl_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, + DEFAULT_CLUSTER_SIZE), BDRV_SECTOR_SIZE); + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + + file = NULL; + ret = bdrv_open(&file, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + ret = bdrv_truncate(file, 0); + if (ret < 0) { + goto exit; + } + + bat_entries = DIV_ROUND_UP(total_size, cl_size); + bat_sectors = DIV_ROUND_UP(bat_entry_off(bat_entries), cl_size); + bat_sectors = (bat_sectors * cl_size) >> BDRV_SECTOR_BITS; + + memset(&header, 0, sizeof(header)); + memcpy(header.magic, HEADER_MAGIC2, sizeof(header.magic)); + header.version = cpu_to_le32(HEADER_VERSION); + /* don't care much about geometry, it is not used on image level */ + header.heads = cpu_to_le32(16); + header.cylinders = cpu_to_le32(total_size / BDRV_SECTOR_SIZE / 16 / 32); + header.tracks = cpu_to_le32(cl_size >> BDRV_SECTOR_BITS); + header.bat_entries = cpu_to_le32(bat_entries); + header.nb_sectors = cpu_to_le64(DIV_ROUND_UP(total_size, BDRV_SECTOR_SIZE)); + header.data_off = cpu_to_le32(bat_sectors); + + /* write all the data */ + memset(tmp, 0, sizeof(tmp)); + memcpy(tmp, &header, sizeof(header)); + + ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + ret = bdrv_write_zeroes(file, 1, bat_sectors 
- 1, 0); + if (ret < 0) { + goto exit; + } + ret = 0; + +done: + bdrv_unref(file); + return ret; + +exit: + error_setg_errno(errp, -ret, "Failed to create Parallels image"); + goto done; +} + + +static int parallels_probe(const uint8_t *buf, int buf_size, + const char *filename) +{ + const ParallelsHeader *ph = (const void *)buf; + + if (buf_size < sizeof(ParallelsHeader)) { + return 0; + } if ((!memcmp(ph->magic, HEADER_MAGIC, 16) || - !memcmp(ph->magic, HEADER_MAGIC2, 16)) && - (le32_to_cpu(ph->version) == HEADER_VERSION)) + !memcmp(ph->magic, HEADER_MAGIC2, 16)) && + (le32_to_cpu(ph->version) == HEADER_VERSION)) { return 100; + } return 0; } +static int parallels_update_header(BlockDriverState *bs) +{ + BDRVParallelsState *s = bs->opaque; + unsigned size = MAX(bdrv_opt_mem_align(bs->file), sizeof(ParallelsHeader)); + + if (size > s->header_size) { + size = s->header_size; + } + return bdrv_pwrite_sync(bs->file, 0, s->header, size); +} + static int parallels_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVParallelsState *s = bs->opaque; - int i; - struct parallels_header ph; - int ret; - - bs->read_only = 1; // no write support yet + ParallelsHeader ph; + int ret, size, i; + QemuOpts *opts = NULL; + Error *local_err = NULL; + char *buf; ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph)); if (ret < 0) { @@ -115,25 +595,90 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - s->catalog_size = le32_to_cpu(ph.catalog_entries); - if (s->catalog_size > INT_MAX / 4) { + s->bat_size = le32_to_cpu(ph.bat_entries); + if (s->bat_size > INT_MAX / sizeof(uint32_t)) { error_setg(errp, "Catalog too large"); ret = -EFBIG; goto fail; } - s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size); - if (s->catalog_size && s->catalog_bitmap == NULL) { + + size = bat_entry_off(s->bat_size); + s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file)); + s->header = qemu_try_blockalign(bs->file, s->header_size); + if (s->header == NULL) { ret = -ENOMEM; goto fail; } + s->data_end = le32_to_cpu(ph.data_off); + if (s->data_end == 0) { + s->data_end = ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE); + } + if (s->data_end < s->header_size) { + /* there is not enough unused space to fit to block align between BAT + and actual data. We can't avoid read-modify-write... */ + s->header_size = size; + } - ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4); + ret = bdrv_pread(bs->file, 0, s->header, s->header_size); if (ret < 0) { goto fail; } + s->bat_bitmap = (uint32_t *)(s->header + 1); + + for (i = 0; i < s->bat_size; i++) { + int64_t off = bat2sect(s, i); + if (off >= s->data_end) { + s->data_end = off + s->tracks; + } + } - for (i = 0; i < s->catalog_size; i++) - le32_to_cpus(&s->catalog_bitmap[i]); + if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) { + /* Image was not closed correctly. 
The check is mandatory */ + s->header_unclean = true; + if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { + error_setg(errp, "parallels: Image was not closed correctly; " + "cannot be opened read/write"); + ret = -EACCES; + goto fail; + } + } + + opts = qemu_opts_create(&parallels_runtime_opts, NULL, 0, &local_err); + if (local_err != NULL) { + goto fail_options; + } + + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err != NULL) { + goto fail_options; + } + + s->prealloc_size = + qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0); + s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS); + buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE); + s->prealloc_mode = qapi_enum_parse(prealloc_mode_lookup, buf, + PRL_PREALLOC_MODE_MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err); + g_free(buf); + if (local_err != NULL) { + goto fail_options; + } + if (!bdrv_has_zero_init(bs->file) || + bdrv_truncate(bs->file, bdrv_getlength(bs->file)) != 0) { + s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE; + } + + if (flags & BDRV_O_RDWR) { + s->header->inuse = cpu_to_le32(HEADER_INUSE_MAGIC); + ret = parallels_update_header(bs); + if (ret < 0) { + goto fail; + } + } + + s->bat_dirty_block = 4 * getpagesize(); + s->bat_dirty_bmap = + bitmap_new(DIV_ROUND_UP(s->header_size, s->bat_dirty_block)); qemu_co_mutex_init(&s->lock); return 0; @@ -142,67 +687,67 @@ fail_format: error_setg(errp, "Image not in Parallels format"); ret = -EINVAL; fail: - g_free(s->catalog_bitmap); + qemu_vfree(s->header); return ret; + +fail_options: + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; } -static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) + +static void parallels_close(BlockDriverState *bs) { BDRVParallelsState *s = bs->opaque; - uint32_t index, offset; - - index = sector_num / s->tracks; - offset = sector_num % s->tracks; - /* not allocated */ - if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0)) - return -1; - return - ((uint64_t)s->catalog_bitmap[index] * s->off_multiplier + offset) * 512; -} + if (bs->open_flags & BDRV_O_RDWR) { + s->header->inuse = 0; + parallels_update_header(bs); + } -static int parallels_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - while (nb_sectors > 0) { - int64_t position = seek_to_sector(bs, sector_num); - if (position >= 0) { - if (bdrv_pread(bs->file, position, buf, 512) != 512) - return -1; - } else { - memset(buf, 0, 512); - } - nb_sectors--; - sector_num++; - buf += 512; + if (bs->open_flags & BDRV_O_RDWR) { + bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS); } - return 0; -} -static coroutine_fn int parallels_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVParallelsState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = parallels_read(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; + g_free(s->bat_dirty_bmap); + qemu_vfree(s->header); } -static void parallels_close(BlockDriverState *bs) -{ - BDRVParallelsState *s = bs->opaque; - g_free(s->catalog_bitmap); -} +static QemuOptsList parallels_create_opts = { + .name = "parallels-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(parallels_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size", + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Parallels image cluster size", + .def_value_str = stringify(DEFAULT_CLUSTER_SIZE),
+ }, + { /* end of list */ } + } +}; static BlockDriver bdrv_parallels = { .format_name = "parallels", .instance_size = sizeof(BDRVParallelsState), .bdrv_probe = parallels_probe, .bdrv_open = parallels_open, - .bdrv_read = parallels_co_read, .bdrv_close = parallels_close, + .bdrv_co_get_block_status = parallels_co_get_block_status, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_flush_to_os = parallels_co_flush_to_os, + .bdrv_co_readv = parallels_co_readv, + .bdrv_co_writev = parallels_co_writev, + + .bdrv_create = parallels_create, + .bdrv_check = parallels_check, + .create_opts = &parallels_create_opts, }; static void bdrv_parallels_init(void) diff --git a/block/qapi.c b/block/qapi.c index 8a19aed44..2ce509711 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -24,6 +24,7 @@ #include "block/qapi.h" #include "block/block_int.h" +#include "block/throttle-groups.h" #include "block/write-threshold.h" #include "qmp-commands.h" #include "qapi-visit.h" @@ -31,8 +32,10 @@ #include "qapi/qmp/types.h" #include "sysemu/block-backend.h" -BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) { + ImageInfo **p_image_info; + BlockDriverState *bs0; BlockDeviceInfo *info = g_malloc0(sizeof(*info)); info->file = g_strdup(bs->filename); @@ -63,7 +66,9 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) if (bs->io_limits_enabled) { ThrottleConfig cfg; - throttle_get_config(&bs->throttle_state, &cfg); + + throttle_group_get_config(bs, &cfg); + info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; info->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg; @@ -88,10 +93,32 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) info->has_iops_size = cfg.op_size; info->iops_size = cfg.op_size; + + info->has_group = true; + info->group = g_strdup(throttle_group_get_name(bs)); } info->write_threshold = bdrv_write_threshold_get(bs); + bs0 = bs; + p_image_info = &info->image; + while (1) { + Error *local_err = NULL; + bdrv_query_image_info(bs0, p_image_info, &local_err); + if (local_err) { + error_propagate(errp, local_err); + qapi_free_BlockDeviceInfo(info); + return NULL; + } + if (bs0->drv && bs0->backing_hd) { + bs0 = bs0->backing_hd; + (*p_image_info)->has_backing_image = true; + p_image_info = &((*p_image_info)->backing_image); + } else { + break; + } + } + return info; } @@ -264,9 +291,6 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, { BlockInfo *info = g_malloc0(sizeof(*info)); BlockDriverState *bs = blk_bs(blk); - BlockDriverState *bs0; - ImageInfo **p_image_info; - Error *local_err = NULL; info->device = g_strdup(blk_name(blk)); info->type = g_strdup("unknown"); info->locked = blk_dev_is_medium_locked(blk); @@ -289,23 +313,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, if (bs->drv) { info->has_inserted = true; - info->inserted = bdrv_block_device_info(bs); - - bs0 = bs; - p_image_info = &info->inserted->image; - while (1) { - bdrv_query_image_info(bs0, p_image_info, &local_err); - if (local_err) { - error_propagate(errp, local_err); - goto err; - } - if (bs0->drv && bs0->backing_hd) { - bs0 = bs0->backing_hd; - (*p_image_info)->has_backing_image = true; - p_image_info = &((*p_image_info)->backing_image); - } else { - break; - } + info->inserted = bdrv_block_device_info(bs, errp); + if (info->inserted == NULL) { + goto err; + } } @@ -510,18 +520,9 @@ static void dump_qobject(fprintf_function func_fprintf, void *f, }
case QTYPE_QBOOL: { QBool *value = qobject_to_qbool(obj); - func_fprintf(f, "%s", qbool_get_int(value) ? "true" : "false"); - break; - } - case QTYPE_QERROR: { - QString *value = qerror_human((QError *)obj); - func_fprintf(f, "%s", qstring_get_str(value)); - QDECREF(value); + func_fprintf(f, "%s", qbool_get_bool(value) ? "true" : "false"); break; } - case QTYPE_NONE: - break; - case QTYPE_MAX: default: abort(); } diff --git a/block/qcow.c b/block/qcow.c index 055896910..01fba54ce 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -25,7 +25,8 @@ #include "block/block_int.h" #include "qemu/module.h" #include -#include "qemu/aes.h" +#include "qapi/qmp/qerror.h" +#include "crypto/cipher.h" #include "migration/migration.h" /**************************************************************/ @@ -71,10 +72,8 @@ typedef struct BDRVQcowState { uint8_t *cluster_cache; uint8_t *cluster_data; uint64_t cluster_cache_offset; - uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + QCryptoCipher *cipher; /* NULL if no key yet */ uint32_t crypt_method_header; - AES_KEY aes_encrypt_key; - AES_KEY aes_decrypt_key; CoMutex lock; Error *migration_blocker; } BDRVQcowState; @@ -123,8 +122,8 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, char version[64]; snprintf(version, sizeof(version), "QCOW version %" PRIu32, header.version); - error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow", version); + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "qcow", version); ret = -ENOTSUP; goto fail; } @@ -153,6 +152,11 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } + if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { + error_setg(errp, "AES cipher not available"); + ret = -EINVAL; + goto fail; + } s->crypt_method_header = header.crypt_method; if (s->crypt_method_header) { bs->encrypted = 1; @@ -229,9 +233,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when qcow images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "qcow", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The qcow format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->lock); @@ -259,6 +263,7 @@ static int qcow_set_key(BlockDriverState *bs, const char *key) BDRVQcowState *s = bs->opaque; uint8_t keybuf[16]; int len, i; + Error *err; memset(keybuf, 0, 16); len = strlen(key); @@ -269,38 +274,68 @@ static int qcow_set_key(BlockDriverState *bs, const char *key) for(i = 0;i < len;i++) { keybuf[i] = key[i]; } - s->crypt_method = s->crypt_method_header; - - if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) - return -1; - if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + assert(bs->encrypted); + + qcrypto_cipher_free(s->cipher); + s->cipher = qcrypto_cipher_new( + QCRYPTO_CIPHER_ALG_AES_128, + QCRYPTO_CIPHER_MODE_CBC, + keybuf, G_N_ELEMENTS(keybuf), + &err); + + if (!s->cipher) { + /* XXX would be nice if errors in this method could + * be properly propagate to the caller. Would need + * the bdrv_set_key() API signature to be fixed. */ + error_free(err); return -1; + } return 0; } /* The crypt function is compatible with the linux cryptoloop algorithm for < 4 GB images. 
NOTE: out_buf == in_buf is supported */ -static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, int enc, - const AES_KEY *key) +static int encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, bool enc, Error **errp) { union { uint64_t ll[2]; uint8_t b[16]; } ivec; int i; + int ret; for(i = 0; i < nb_sectors; i++) { ivec.ll[0] = cpu_to_le64(sector_num); ivec.ll[1] = 0; - AES_cbc_encrypt(in_buf, out_buf, 512, key, - ivec.b, enc); + if (qcrypto_cipher_setiv(s->cipher, + ivec.b, G_N_ELEMENTS(ivec.b), + errp) < 0) { + return -1; + } + if (enc) { + ret = qcrypto_cipher_encrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } else { + ret = qcrypto_cipher_decrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } + if (ret < 0) { + return -1; + } sector_num++; in_buf += 512; out_buf += 512; } + return 0; } /* 'allocate' is: @@ -411,17 +446,23 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, bdrv_truncate(bs->file, cluster_offset + s->cluster_size); /* if encrypted, we must initialize the cluster content which won't be written */ - if (s->crypt_method && + if (bs->encrypted && (n_end - n_start) < s->cluster_sectors) { uint64_t start_sect; + assert(s->cipher); start_sect = (offset & ~(s->cluster_size - 1)) >> 9; memset(s->cluster_data + 512, 0x00, 512); for(i = 0; i < s->cluster_sectors; i++) { if (i < n_start || i >= n_end) { - encrypt_sectors(s, start_sect + i, - s->cluster_data, - s->cluster_data + 512, 1, 1, - &s->aes_encrypt_key); + Error *err = NULL; + if (encrypt_sectors(s, start_sect + i, + s->cluster_data, + s->cluster_data + 512, 1, + true, &err) < 0) { + error_free(err); + errno = EIO; + return -1; + } if (bdrv_pwrite(bs->file, cluster_offset + i * 512, s->cluster_data, 512) != 512) return -1; @@ -461,7 +502,7 @@ static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, if (!cluster_offset) { return 0; } - if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypt_method) { + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->cipher) { return BDRV_BLOCK_DATA; } cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); @@ -528,6 +569,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector hd_qiov; uint8_t *buf; void *orig_buf; + Error *err = NULL; if (qiov->niov > 1) { buf = orig_buf = qemu_try_blockalign(bs, qiov->size); @@ -590,10 +632,12 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, if (ret < 0) { break; } - if (s->crypt_method) { - encrypt_sectors(s, sector_num, buf, buf, - n, 0, - &s->aes_decrypt_key); + if (bs->encrypted) { + assert(s->cipher); + if (encrypt_sectors(s, sector_num, buf, buf, + n, false, &err) < 0) { + goto fail; + } } } ret = 0; @@ -614,6 +658,7 @@ done: return ret; fail: + error_free(err); ret = -EIO; goto done; } @@ -661,12 +706,18 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, ret = -EIO; break; } - if (s->crypt_method) { + if (bs->encrypted) { + Error *err = NULL; + assert(s->cipher); if (!cluster_data) { cluster_data = g_malloc0(s->cluster_size); } - encrypt_sectors(s, sector_num, cluster_data, buf, - n, 1, &s->aes_encrypt_key); + if (encrypt_sectors(s, sector_num, cluster_data, buf, + n, true, &err) < 0) { + error_free(err); + ret = -EIO; + break; + } src_buf = cluster_data; } else { src_buf = buf; @@ -703,6 +754,8 @@ static void qcow_close(BlockDriverState *bs) { BDRVQcowState *s = 
bs->opaque; + qcrypto_cipher_free(s->cipher); + s->cipher = NULL; g_free(s->l1_table); qemu_vfree(s->l2_cache); g_free(s->cluster_cache); diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index b1155492a..53b8afc3d 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -28,62 +28,68 @@ #include "trace.h" typedef struct Qcow2CachedTable { - void* table; - int64_t offset; - bool dirty; - int cache_hits; - int ref; + int64_t offset; + bool dirty; + uint64_t lru_counter; + int ref; } Qcow2CachedTable; struct Qcow2Cache { - Qcow2CachedTable* entries; - struct Qcow2Cache* depends; + Qcow2CachedTable *entries; + struct Qcow2Cache *depends; int size; bool depends_on_flush; + void *table_array; + uint64_t lru_counter; }; +static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs, + Qcow2Cache *c, int table) +{ + BDRVQcowState *s = bs->opaque; + return (uint8_t *) c->table_array + (size_t) table * s->cluster_size; +} + +static inline int qcow2_cache_get_table_idx(BlockDriverState *bs, + Qcow2Cache *c, void *table) +{ + BDRVQcowState *s = bs->opaque; + ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array; + int idx = table_offset / s->cluster_size; + assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0); + return idx; +} + Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) { BDRVQcowState *s = bs->opaque; Qcow2Cache *c; - int i; c = g_new0(Qcow2Cache, 1); c->size = num_tables; c->entries = g_try_new0(Qcow2CachedTable, num_tables); - if (!c->entries) { - goto fail; - } - - for (i = 0; i < c->size; i++) { - c->entries[i].table = qemu_try_blockalign(bs->file, s->cluster_size); - if (c->entries[i].table == NULL) { - goto fail; - } + c->table_array = qemu_try_blockalign(bs->file, + (size_t) num_tables * s->cluster_size); + + if (!c->entries || !c->table_array) { + qemu_vfree(c->table_array); + g_free(c->entries); + g_free(c); + c = NULL; } return c; - -fail: - if (c->entries) { - for (i = 0; i < c->size; i++) { - qemu_vfree(c->entries[i].table); - } - } - g_free(c->entries); - g_free(c); - return NULL; } -int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c) +int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c) { int i; for (i = 0; i < c->size; i++) { assert(c->entries[i].ref == 0); - qemu_vfree(c->entries[i].table); } + qemu_vfree(c->table_array); g_free(c->entries); g_free(c); @@ -151,8 +157,8 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); } - ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table, - s->cluster_size); + ret = bdrv_pwrite(bs->file, c->entries[i].offset, + qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); if (ret < 0) { return ret; } @@ -228,42 +234,12 @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) for (i = 0; i < c->size; i++) { assert(c->entries[i].ref == 0); c->entries[i].offset = 0; - c->entries[i].cache_hits = 0; + c->entries[i].lru_counter = 0; } - return 0; -} - -static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) -{ - int i; - int min_count = INT_MAX; - int min_index = -1; - - - for (i = 0; i < c->size; i++) { - if (c->entries[i].ref) { - continue; - } - - if (c->entries[i].cache_hits < min_count) { - min_index = i; - min_count = c->entries[i].cache_hits; - } - - /* Give newer hits priority */ - /* TODO Check how to optimize the replacement strategy */ - if (c->entries[i].cache_hits > 1) { - c->entries[i].cache_hits /= 2; - } - } + c->lru_counter = 0; - if (min_index == 
-1) { - /* This can't happen in current synchronous code, but leave the check - * here as a reminder for whoever starts using AIO with the cache */ - abort(); - } - return min_index; + return 0; } static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, @@ -272,24 +248,39 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, BDRVQcowState *s = bs->opaque; int i; int ret; + int lookup_index; + uint64_t min_lru_counter = UINT64_MAX; + int min_lru_index = -1; trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, offset, read_from_disk); /* Check if the table is already cached */ - for (i = 0; i < c->size; i++) { - if (c->entries[i].offset == offset) { + i = lookup_index = (offset / s->cluster_size * 4) % c->size; + do { + const Qcow2CachedTable *t = &c->entries[i]; + if (t->offset == offset) { goto found; } + if (t->ref == 0 && t->lru_counter < min_lru_counter) { + min_lru_counter = t->lru_counter; + min_lru_index = i; + } + if (++i == c->size) { + i = 0; + } + } while (i != lookup_index); + + if (min_lru_index == -1) { + /* This can't happen in current synchronous code, but leave the check + * here as a reminder for whoever starts using AIO with the cache */ + abort(); } - /* If not, write a table back and replace it */ - i = qcow2_cache_find_entry_to_replace(c); + /* Cache miss: write a table back and replace it */ + i = min_lru_index; trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), c == s->l2_table_cache, i); - if (i < 0) { - return i; - } ret = qcow2_cache_entry_flush(bs, c, i); if (ret < 0) { @@ -304,22 +295,19 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); } - ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size); + ret = bdrv_pread(bs->file, offset, qcow2_cache_get_table_addr(bs, c, i), + s->cluster_size); if (ret < 0) { return ret; } } - /* Give the table some hits for the start so that it won't be replaced - * immediately. The number 32 is completely arbitrary. 
*/ - c->entries[i].cache_hits = 32; c->entries[i].offset = offset; /* And return the right table */ found: - c->entries[i].cache_hits++; c->entries[i].ref++; - *table = c->entries[i].table; + *table = qcow2_cache_get_table_addr(bs, c, i); trace_qcow2_cache_get_done(qemu_coroutine_self(), c == s->l2_table_cache, i); @@ -339,36 +327,24 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, return qcow2_cache_do_get(bs, c, offset, table, false); } -int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) +void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) { - int i; + int i = qcow2_cache_get_table_idx(bs, c, *table); - for (i = 0; i < c->size; i++) { - if (c->entries[i].table == *table) { - goto found; - } - } - return -ENOENT; - -found: c->entries[i].ref--; *table = NULL; + if (c->entries[i].ref == 0) { + c->entries[i].lru_counter = ++c->lru_counter; + } + assert(c->entries[i].ref >= 0); - return 0; } -void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) +void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, + void *table) { - int i; - - for (i = 0; i < c->size; i++) { - if (c->entries[i].table == table) { - goto found; - } - } - abort(); - -found: + int i = qcow2_cache_get_table_idx(bs, c, table); + assert(c->entries[i].offset != 0); c->entries[i].dirty = true; } diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index ed2b44d29..7e94fe70e 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -253,17 +253,14 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) memcpy(l2_table, old_table, s->cluster_size); - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table); - if (ret < 0) { - goto fail; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table); } /* write the l2 table to the file */ BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); trace_qcow2_l2_allocate_write_l2(bs, l1_index); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); ret = qcow2_cache_flush(bs, s->l2_table_cache); if (ret < 0) { goto fail; @@ -301,7 +298,7 @@ fail: * as contiguous. (This allows it, for example, to stop at the first compressed * cluster which may require a different handling) */ -static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, +static int count_contiguous_clusters(int nb_clusters, int cluster_size, uint64_t *l2_table, uint64_t stop_flags) { int i; @@ -324,7 +321,7 @@ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, return i; } -static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) +static int count_contiguous_free_clusters(int nb_clusters, uint64_t *l2_table) { int i; @@ -342,26 +339,47 @@ static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_tab /* The crypt function is compatible with the linux cryptoloop algorithm for < 4 GB images. 
NOTE: out_buf == in_buf is supported */ -void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, int enc, - const AES_KEY *key) +int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, bool enc, + Error **errp) { union { uint64_t ll[2]; uint8_t b[16]; } ivec; int i; + int ret; for(i = 0; i < nb_sectors; i++) { ivec.ll[0] = cpu_to_le64(sector_num); ivec.ll[1] = 0; - AES_cbc_encrypt(in_buf, out_buf, 512, key, - ivec.b, enc); + if (qcrypto_cipher_setiv(s->cipher, + ivec.b, G_N_ELEMENTS(ivec.b), + errp) < 0) { + return -1; + } + if (enc) { + ret = qcrypto_cipher_encrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } else { + ret = qcrypto_cipher_decrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } + if (ret < 0) { + return -1; + } sector_num++; in_buf += 512; out_buf += 512; } + return 0; } static int coroutine_fn copy_sectors(BlockDriverState *bs, @@ -403,10 +421,16 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, goto out; } - if (s->crypt_method) { - qcow2_encrypt_sectors(s, start_sect + n_start, - iov.iov_base, iov.iov_base, n, 1, - &s->aes_encrypt_key); + if (bs->encrypted) { + Error *err = NULL; + assert(s->cipher); + if (qcow2_encrypt_sectors(s, start_sect + n_start, + iov.iov_base, iov.iov_base, n, + true, &err) < 0) { + ret = -EIO; + error_free(err); + goto out; + } } ret = qcow2_pre_write_overlap_check(bs, 0, @@ -471,6 +495,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, if (nb_needed > nb_available) { nb_needed = nb_available; } + assert(nb_needed <= INT_MAX); *cluster_offset = 0; @@ -506,6 +531,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); *cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */ nb_clusters = size_to_clusters(s, nb_needed << 9); ret = qcow2_get_cluster_type(*cluster_offset); @@ -692,12 +719,9 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, /* compressed clusters never have the copied flag */ BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); l2_table[l2_index] = cpu_to_be64(cluster_offset); - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return 0; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); return cluster_offset; } @@ -771,7 +795,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) if (ret < 0) { goto err; } - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); assert(l2_index + m->nb_clusters <= s->l2_size); for (i = 0; i < m->nb_clusters; i++) { @@ -789,10 +813,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - goto err; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); /* * If this was a COW, we need to decrease the refcount of the old cluster. 
@@ -942,9 +963,9 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, int l2_index; uint64_t cluster_offset; uint64_t *l2_table; - unsigned int nb_clusters; + uint64_t nb_clusters; unsigned int keep_clusters; - int ret, pret; + int ret; trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, *bytes); @@ -961,6 +982,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, l2_index = offset_to_l2_index(s, guest_offset); nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); /* Find L2 entry for the first involved cluster */ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); @@ -1011,10 +1033,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, /* Cleanup */ out: - pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (pret < 0) { - return pret; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); /* Only return a host offset if we actually made progress. Otherwise we * would make requirements for handle_alloc() that it can't fulfill */ @@ -1046,7 +1065,7 @@ out: * restarted, but the whole request should not be failed. */ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, unsigned int *nb_clusters) + uint64_t *host_offset, uint64_t *nb_clusters) { BDRVQcowState *s = bs->opaque; @@ -1064,7 +1083,7 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, *host_offset = cluster_offset; return 0; } else { - int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); + int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); if (ret < 0) { return ret; } @@ -1100,7 +1119,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, int l2_index; uint64_t *l2_table; uint64_t entry; - unsigned int nb_clusters; + uint64_t nb_clusters; int ret; uint64_t alloc_cluster_offset; @@ -1118,6 +1137,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, l2_index = offset_to_l2_index(s, guest_offset); nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); /* Find L2 entry for the first involved cluster */ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); @@ -1139,10 +1159,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, * wrong with our code. */ assert(nb_clusters > 0); - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); /* Allocate, if necessary at a given offset in the image file */ alloc_cluster_offset = start_of_cluster(s, *host_offset); @@ -1414,7 +1431,8 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) * clusters. 
*/ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters, enum qcow2_discard_type type, bool full_discard) + uint64_t nb_clusters, enum qcow2_discard_type type, + bool full_discard) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table; @@ -1429,6 +1447,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, /* Limit nb_clusters to one L2 table */ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); for (i = 0; i < nb_clusters; i++) { uint64_t old_l2_entry; @@ -1470,7 +1489,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, } /* First remove L2 entries */ - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); if (!full_discard && s->qcow_version >= 3) { l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); } else { @@ -1481,10 +1500,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, qcow2_free_any_clusters(bs, old_l2_entry, 1, type); } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); return nb_clusters; } @@ -1494,7 +1510,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, { BDRVQcowState *s = bs->opaque; uint64_t end_offset; - unsigned int nb_clusters; + uint64_t nb_clusters; int ret; end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); @@ -1536,7 +1552,7 @@ fail: * clusters. */ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters) + uint64_t nb_clusters) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table; @@ -1551,6 +1567,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, /* Limit nb_clusters to one L2 table */ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); for (i = 0; i < nb_clusters; i++) { uint64_t old_offset; @@ -1558,7 +1575,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, old_offset = be64_to_cpu(l2_table[l2_index + i]); /* Update L2 entries */ - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); if (old_offset & QCOW_OFLAG_COMPRESSED) { l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); @@ -1567,10 +1584,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, } } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - return ret; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); return nb_clusters; } @@ -1578,7 +1592,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) { BDRVQcowState *s = bs->opaque; - unsigned int nb_clusters; + uint64_t nb_clusters; int ret; /* The zero flag is only supported by version 3 and newer */ @@ -1760,14 +1774,10 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, if (is_active_l1) { if (l2_dirty) { - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); qcow2_cache_depends_on_flush(s->l2_table_cache); } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); - if (ret < 0) { - l2_table = NULL; - goto fail; - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); } else { if 
(l2_dirty) { ret = qcow2_pre_write_overlap_check(bs, @@ -1798,12 +1808,7 @@ fail: if (!is_active_l1) { qemu_vfree(l2_table); } else { - if (ret < 0) { - qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); - } else { - ret = qcow2_cache_put(bs, s->l2_table_cache, - (void **)&l2_table); - } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); } } return ret; diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 63c00858d..0b6c302ee 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -265,10 +265,7 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, block_index = cluster_index & (s->refcount_block_size - 1); *refcount = s->get_refcount(refcount_block, block_index); - ret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); - if (ret < 0) { - return ret; - } + qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); return 0; } @@ -424,7 +421,7 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Now the new refcount block needs to be written to disk */ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); - qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block); + qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block); ret = qcow2_cache_flush(bs, s->refcount_block_cache); if (ret < 0) { goto fail_block; @@ -448,10 +445,7 @@ static int alloc_refcount_block(BlockDriverState *bs, return -EAGAIN; } - ret = qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); - if (ret < 0) { - goto fail_block; - } + qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); /* * If we come here, we need to grow the refcount table. Again, a new @@ -723,13 +717,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, /* Load the refcount block and allocate it if needed */ if (table_index != old_table_index) { if (refcount_block) { - ret = qcow2_cache_put(bs, s->refcount_block_cache, - &refcount_block); - if (ret < 0) { - goto fail; - } + qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); } - ret = alloc_refcount_block(bs, cluster_index, &refcount_block); if (ret < 0) { goto fail; @@ -737,7 +726,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, } old_table_index = table_index; - qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block); + qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, + refcount_block); /* we can update the count and save it */ block_index = cluster_index & (s->refcount_block_size - 1); @@ -773,11 +763,7 @@ fail: /* Write last changed block to disk */ if (refcount_block) { - int wret; - wret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); - if (wret < 0) { - return ret < 0 ? 
ret : wret; - } + qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); } /* @@ -889,8 +875,8 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) return offset; } -int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int nb_clusters) +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters) { BDRVQcowState *s = bs->opaque; uint64_t cluster_index, refcount; @@ -954,19 +940,21 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) } free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); - if (!offset || free_in_cluster < size) { - int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); - if (new_cluster < 0) { - return new_cluster; - } + do { + if (!offset || free_in_cluster < size) { + int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); + if (new_cluster < 0) { + return new_cluster; + } - if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { - offset = new_cluster; + if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { + offset = new_cluster; + } } - } - assert(offset); - ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); + assert(offset); + ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); if (ret < 0) { return ret; } @@ -1182,15 +1170,12 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, s->refcount_block_cache); } l2_table[j] = cpu_to_be64(offset); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, + l2_table); } } - ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - if (ret < 0) { - goto fail; - } - + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); if (addend != 0) { ret = qcow2_update_cluster_refcount(bs, l2_offset >> @@ -1274,7 +1259,7 @@ static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries) static int realloc_refcount_array(BDRVQcowState *s, void **array, int64_t *size, int64_t new_size) { - size_t old_byte_size, new_byte_size; + int64_t old_byte_size, new_byte_size; void *new_ptr; /* Round to clusters so the array can be directly written to disk */ @@ -1290,13 +1275,17 @@ static int realloc_refcount_array(BDRVQcowState *s, void **array, assert(new_byte_size > 0); + if (new_byte_size > SIZE_MAX) { + return -ENOMEM; + } + new_ptr = g_try_realloc(*array, new_byte_size); if (!new_ptr) { return -ENOMEM; } if (new_byte_size > old_byte_size) { - memset((void *)((uintptr_t)new_ptr + old_byte_size), 0, + memset((char *)new_ptr + old_byte_size, 0, new_byte_size - old_byte_size); } @@ -2455,7 +2444,7 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, if (ret < 0) { return ret; } else if (ret > 0) { - int metadata_ol_bitnr = ffs(ret) - 1; + int metadata_ol_bitnr = ctz32(ret); assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 2aa9dcb1d..b6f58c13e 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -25,6 +25,7 @@ #include "qemu-common.h" #include "block/block_int.h" #include "block/qcow2.h" +#include "qemu/error-report.h" void qcow2_free_snapshots(BlockDriverState *bs) { @@ -351,10 +352,8 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) memset(sn, 0, sizeof(*sn)); - /* Generate an ID if it wasn't passed */ - if (sn_info->id_str[0] == '\0') { - 
find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); - } + /* Generate an ID */ + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); /* Check that the ID is unique */ if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { diff --git a/block/qcow2.c b/block/qcow2.c index 316a8db22..76c331b38 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -25,7 +25,6 @@ #include "block/block_int.h" #include "qemu/module.h" #include -#include "qemu/aes.h" #include "block/qcow2.h" #include "qemu/error-report.h" #include "qapi/qmp/qerror.h" @@ -207,8 +206,8 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, vsnprintf(msg, sizeof(msg), fmt, ap); va_end(ap); - error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow2", msg); + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "qcow2", msg); } static void report_unsupported_feature(BlockDriverState *bs, @@ -483,9 +482,11 @@ static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2, }; -static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size, +static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, + uint64_t *l2_cache_size, uint64_t *refcount_cache_size, Error **errp) { + BDRVQcowState *s = bs->opaque; uint64_t combined_cache_size; bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set; @@ -525,7 +526,9 @@ static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size, } } else { if (!l2_cache_size_set && !refcount_cache_size_set) { - *l2_cache_size = DEFAULT_L2_CACHE_BYTE_SIZE; + *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE, + (uint64_t)DEFAULT_L2_CACHE_CLUSTERS + * s->cluster_size); *refcount_cache_size = *l2_cache_size / DEFAULT_L2_REFCOUNT_SIZE_RATIO; } else if (!l2_cache_size_set) { @@ -695,6 +698,11 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } + if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { + error_setg(errp, "AES cipher not available"); + ret = -EINVAL; + goto fail; + } s->crypt_method_header = header.crypt_method; if (s->crypt_method_header) { bs->encrypted = 1; @@ -803,7 +811,8 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - read_cache_sizes(opts, &l2_cache_size, &refcount_cache_size, &local_err); + read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size, + &local_err); if (local_err) { error_propagate(errp, local_err); ret = -EINVAL; @@ -1027,6 +1036,7 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) BDRVQcowState *s = bs->opaque; uint8_t keybuf[16]; int len, i; + Error *err = NULL; memset(keybuf, 0, 16); len = strlen(key); @@ -1037,30 +1047,22 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) for(i = 0;i < len;i++) { keybuf[i] = key[i]; } - s->crypt_method = s->crypt_method_header; + assert(bs->encrypted); - if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) - return -1; - if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + qcrypto_cipher_free(s->cipher); + s->cipher = qcrypto_cipher_new( + QCRYPTO_CIPHER_ALG_AES_128, + QCRYPTO_CIPHER_MODE_CBC, + keybuf, G_N_ELEMENTS(keybuf), + &err); + + if (!s->cipher) { + /* XXX would be nice if errors in this method could + * be properly propagate to the caller. Would need + * the bdrv_set_key() API signature to be fixed. 
*/ + error_free(err); return -1; -#if 0 - /* test */ - { - uint8_t in[16]; - uint8_t out[16]; - uint8_t tmp[16]; - for(i=0;i<16;i++) - in[i] = i; - AES_encrypt(in, tmp, &s->aes_encrypt_key); - AES_decrypt(tmp, out, &s->aes_decrypt_key); - for(i = 0; i < 16; i++) - printf(" %02x", tmp[i]); - printf("\n"); - for(i = 0; i < 16; i++) - printf(" %02x", out[i]); - printf("\n"); } -#endif return 0; } @@ -1103,7 +1105,7 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, } if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && - !s->crypt_method) { + !s->cipher) { index_in_cluster = sector_num & (s->cluster_sectors - 1); cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; @@ -1153,7 +1155,7 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, /* prepare next request */ cur_nr_sectors = remaining_sectors; - if (s->crypt_method) { + if (s->cipher) { cur_nr_sectors = MIN(cur_nr_sectors, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); } @@ -1224,7 +1226,9 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, goto fail; } - if (s->crypt_method) { + if (bs->encrypted) { + assert(s->cipher); + /* * For encrypted images, read everything into a temporary * contiguous buffer on which the AES functions can work. @@ -1255,9 +1259,16 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, if (ret < 0) { goto fail; } - if (s->crypt_method) { - qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key); + if (bs->encrypted) { + assert(s->cipher); + Error *err = NULL; + if (qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, false, + &err) < 0) { + error_free(err); + ret = -EIO; + goto fail; + } qemu_iovec_from_buf(qiov, bytes_done, cluster_data, 512 * cur_nr_sectors); } @@ -1315,7 +1326,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, trace_qcow2_writev_start_part(qemu_coroutine_self()); index_in_cluster = sector_num & (s->cluster_sectors - 1); cur_nr_sectors = remaining_sectors; - if (s->crypt_method && + if (bs->encrypted && cur_nr_sectors > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { cur_nr_sectors = @@ -1334,7 +1345,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_nr_sectors * 512); - if (s->crypt_method) { + if (bs->encrypted) { + Error *err = NULL; + assert(s->cipher); if (!cluster_data) { cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS @@ -1349,8 +1362,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); - qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key); + if (qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, + true, &err) < 0) { + error_free(err); + ret = -EIO; + goto fail; + } qemu_iovec_reset(&hd_qiov); qemu_iovec_add(&hd_qiov, cluster_data, @@ -1456,6 +1474,9 @@ static void qcow2_close(BlockDriverState *bs) qcow2_cache_destroy(bs, s->l2_table_cache); qcow2_cache_destroy(bs, s->refcount_block_cache); + qcrypto_cipher_free(s->cipher); + s->cipher = NULL; + g_free(s->unknown_header_fields); cleanup_unknown_header_ext(bs); @@ -1472,9 +1493,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) { 
BDRVQcowState *s = bs->opaque; int flags = s->flags; - AES_KEY aes_encrypt_key; - AES_KEY aes_decrypt_key; - uint32_t crypt_method = 0; + QCryptoCipher *cipher = NULL; QDict *options; Error *local_err = NULL; int ret; @@ -1484,11 +1503,8 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) * that means we don't have to worry about reopening them here. */ - if (s->crypt_method) { - crypt_method = s->crypt_method; - memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key)); - memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key)); - } + cipher = s->cipher; + s->cipher = NULL; qcow2_close(bs); @@ -1513,11 +1529,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) return; } - if (crypt_method) { - s->crypt_method = crypt_method; - memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key)); - memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key)); - } + s->cipher = cipher; } static size_t header_ext_add(char *buf, uint32_t magic, const void *s, @@ -1802,7 +1814,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, { /* Calculate cluster_bits */ int cluster_bits; - cluster_bits = ffs(cluster_size) - 1; + cluster_bits = ctz32(cluster_size); if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) { @@ -2110,7 +2122,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) goto finish; } - refcount_order = ffs(refcount_bits) - 1; + refcount_order = ctz32(refcount_bits); ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, cluster_size, prealloc, opts, version, refcount_order, @@ -2718,8 +2730,9 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) { encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT, - s->crypt_method); - if (encrypt != !!s->crypt_method) { + !!s->cipher); + + if (encrypt != !!s->cipher) { fprintf(stderr, "Changing the encryption flag is not " "supported.\n"); return -ENOTSUP; @@ -2824,6 +2837,7 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, int64_t size, const char *message_format, ...) 
{ BDRVQcowState *s = bs->opaque; + const char *node_name; char *message; va_list ap; @@ -2847,8 +2861,11 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, "corruption events will be suppressed\n", message); } - qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message, - offset >= 0, offset, size >= 0, size, + node_name = bdrv_get_node_name(bs); + qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), + *node_name != '\0', node_name, + message, offset >= 0, offset, + size >= 0, size, fatal, &error_abort); g_free(message); diff --git a/block/qcow2.h b/block/qcow2.h index 2f2094959..4b5a6afc8 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -25,7 +25,7 @@ #ifndef BLOCK_QCOW2_H #define BLOCK_QCOW2_H -#include "qemu/aes.h" +#include "crypto/cipher.h" #include "block/coroutine.h" //#define DEBUG_ALLOC @@ -68,6 +68,8 @@ /* Must be at least 4 to cover all cases of refcount table growth */ #define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */ +/* Whichever is more */ +#define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */ #define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */ /* The refblock cache needs only a fourth of the L2 cache size to cover as many @@ -251,10 +253,8 @@ typedef struct BDRVQcowState { CoMutex lock; - uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + QCryptoCipher *cipher; /* current cipher, NULL if no key yet */ uint32_t crypt_method_header; - AES_KEY aes_encrypt_key; - AES_KEY aes_decrypt_key; uint64_t snapshots_offset; int snapshots_size; unsigned int nb_snapshots; @@ -412,7 +412,7 @@ static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset) return offset & (s->cluster_size - 1); } -static inline int size_to_clusters(BDRVQcowState *s, int64_t size) +static inline uint64_t size_to_clusters(BDRVQcowState *s, uint64_t size) { return (size + (s->cluster_size - 1)) >> s->cluster_bits; } @@ -506,8 +506,8 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, enum qcow2_discard_type type); int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); -int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int nb_clusters); +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters); int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); void qcow2_free_clusters(BlockDriverState *bs, int64_t offset, int64_t size, @@ -534,10 +534,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); void qcow2_l2_cache_reset(BlockDriverState *bs); int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); -void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, int enc, - const AES_KEY *key); +int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, bool enc, Error **errp); int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, int *num, uint64_t *cluster_offset); @@ -575,7 +574,8 @@ int qcow2_read_snapshots(BlockDriverState *bs); Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); -void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table); +void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, + void *table); int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); int 
qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); @@ -587,6 +587,6 @@ int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); -int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); +void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); #endif diff --git a/block/qed.c b/block/qed.c index 892b13c80..954ed007c 100644 --- a/block/qed.c +++ b/block/qed.c @@ -407,8 +407,8 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, char buf[64]; snprintf(buf, sizeof(buf), "%" PRIx64, s->header.features & ~QED_FEATURE_MASK); - error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "QED", buf); + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "QED", buf); return -ENOTSUP; } if (!qed_is_cluster_size_valid(s->header.cluster_size)) { @@ -436,9 +436,9 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->table_nelems = (s->header.cluster_size * s->header.table_size) / sizeof(uint64_t); - s->l2_shift = ffs(s->header.cluster_size) - 1; + s->l2_shift = ctz32(s->header.cluster_size); s->l2_mask = s->table_nelems - 1; - s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + s->l1_shift = s->l2_shift + ctz32(s->table_nelems); /* Header size calculation must not overflow uint32_t */ if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { diff --git a/block/quorum.c b/block/quorum.c index 437b12251..2f6c45f76 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -13,16 +13,16 @@ * See the COPYING file in the top-level directory. */ -#include -#include #include "block/block_int.h" #include "qapi/qmp/qbool.h" #include "qapi/qmp/qdict.h" +#include "qapi/qmp/qerror.h" #include "qapi/qmp/qint.h" #include "qapi/qmp/qjson.h" #include "qapi/qmp/qlist.h" #include "qapi/qmp/qstring.h" #include "qapi-event.h" +#include "crypto/hash.h" #define HASH_LENGTH 32 @@ -33,7 +33,7 @@ /* This union holds a vote hash value */ typedef union QuorumVoteValue { - char h[HASH_LENGTH]; /* SHA-256 hash */ + uint8_t h[HASH_LENGTH]; /* SHA-256 hash */ int64_t l; /* simpler 64 bits hash */ } QuorumVoteValue; @@ -226,10 +226,7 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) static void quorum_report_failure(QuorumAIOCB *acb) { - const char *reference = bdrv_get_device_name(acb->common.bs)[0] ? 
- bdrv_get_device_name(acb->common.bs) : - acb->common.bs->node_name; - + const char *reference = bdrv_get_device_or_node_name(acb->common.bs); qapi_event_send_quorum_failure(reference, acb->sector_num, acb->nb_sectors, &error_abort); } @@ -430,25 +427,21 @@ static void quorum_free_vote_list(QuorumVotes *votes) static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash) { - int j, ret; - gnutls_hash_hd_t dig; QEMUIOVector *qiov = &acb->qcrs[i].qiov; - - ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); - - if (ret < 0) { - return ret; + size_t len = sizeof(hash->h); + uint8_t *data = hash->h; + + /* XXX - would be nice if we could pass in the Error ** + * and propagate that back, but this quorum code is + * restricted to just errno values currently */ + if (qcrypto_hash_bytesv(QCRYPTO_HASH_ALG_SHA256, + qiov->iov, qiov->niov, + &data, &len, + NULL) < 0) { + return -EINVAL; } - for (j = 0; j < qiov->niov; j++) { - ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len); - if (ret < 0) { - break; - } - } - - gnutls_hash_deinit(dig, (void *) hash); - return ret; + return 0; } static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) @@ -803,8 +796,8 @@ static int quorum_valid_threshold(int threshold, int num_children, Error **errp) { if (threshold < 1) { - error_set(errp, QERR_INVALID_PARAMETER_VALUE, - "vote-threshold", "value >= 1"); + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "vote-threshold", "value >= 1"); return -ERANGE; } @@ -869,25 +862,18 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, Error *local_err = NULL; QemuOpts *opts = NULL; bool *opened; - QDict *sub = NULL; - QList *list = NULL; - const QListEntry *lentry; int i; int ret = 0; qdict_flatten(options); - qdict_extract_subqdict(options, &sub, "children."); - qdict_array_split(sub, &list); - if (qdict_size(sub)) { - error_setg(&local_err, "Invalid option children.%s", - qdict_first(sub)->key); + /* count how many different children are present */ + s->num_children = qdict_array_entries(options, "children."); + if (s->num_children < 0) { + error_setg(&local_err, "Option children is not a valid array"); ret = -EINVAL; goto exit; } - - /* count how many different children are present */ - s->num_children = qlist_size(list); if (s->num_children < 2) { error_setg(&local_err, "Number of provided children must be greater than 1"); @@ -940,37 +926,17 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, s->bs = g_new0(BlockDriverState *, s->num_children); opened = g_new0(bool, s->num_children); - for (i = 0, lentry = qlist_first(list); lentry; - lentry = qlist_next(lentry), i++) { - QDict *d; - QString *string; - - switch (qobject_type(lentry->value)) - { - /* List of options */ - case QTYPE_QDICT: - d = qobject_to_qdict(lentry->value); - QINCREF(d); - ret = bdrv_open(&s->bs[i], NULL, NULL, d, flags, NULL, - &local_err); - break; - - /* QMP reference */ - case QTYPE_QSTRING: - string = qobject_to_qstring(lentry->value); - ret = bdrv_open(&s->bs[i], NULL, qstring_get_str(string), NULL, - flags, NULL, &local_err); - break; - - default: - error_setg(&local_err, "Specification of child block device %i " - "is invalid", i); - ret = -EINVAL; - } + for (i = 0; i < s->num_children; i++) { + char indexstr[32]; + ret = snprintf(indexstr, 32, "children.%d", i); + assert(ret < 32); + ret = bdrv_open_image(&s->bs[i], NULL, options, indexstr, bs, + &child_format, false, &local_err); if (ret < 0) { goto close_exit; } + opened[i] = true; } @@ -993,8 +959,6 
@@ exit: if (local_err) { error_propagate(errp, local_err); } - QDECREF(list); - QDECREF(sub); return ret; } @@ -1056,9 +1020,9 @@ static void quorum_refresh_filename(BlockDriverState *bs) qdict_put_obj(opts, QUORUM_OPT_VOTE_THRESHOLD, QOBJECT(qint_from_int(s->threshold))); qdict_put_obj(opts, QUORUM_OPT_BLKVERIFY, - QOBJECT(qbool_from_int(s->is_blkverify))); + QOBJECT(qbool_from_bool(s->is_blkverify))); qdict_put_obj(opts, QUORUM_OPT_REWRITE, - QOBJECT(qbool_from_int(s->rewrite_corrupted))); + QOBJECT(qbool_from_bool(s->rewrite_corrupted))); qdict_put_obj(opts, "children", QOBJECT(children)); bs->full_open_options = opts; @@ -1091,6 +1055,10 @@ static BlockDriver bdrv_quorum = { static void bdrv_quorum_init(void) { + if (!qcrypto_hash_supports(QCRYPTO_HASH_ALG_SHA256)) { + /* SHA256 hash support is required for quorum device */ + return; + } bdrv_register(&bdrv_quorum); } diff --git a/block/raw-posix.c b/block/raw-posix.c index 24d85826c..855febed5 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -22,6 +22,7 @@ * THE SOFTWARE. */ #include "qemu-common.h" +#include "qemu/error-report.h" #include "qemu/timer.h" #include "qemu/log.h" #include "block/block_int.h" @@ -31,6 +32,7 @@ #include "qemu/iov.h" #include "raw-aio.h" #include "qapi/util.h" +#include "qapi/qmp/qstring.h" #if defined(__APPLE__) && (__MACH__) #include @@ -57,6 +59,7 @@ #include #include #include +#include #ifdef __s390__ #include #endif @@ -94,15 +97,19 @@ #include #endif -//#define DEBUG_FLOPPY - //#define DEBUG_BLOCK -#if defined(DEBUG_BLOCK) -#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \ - { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0) + +#ifdef DEBUG_BLOCK +# define DEBUG_BLOCK_PRINT 1 #else -#define DEBUG_BLOCK_PRINT(formatCstr, ...) +# define DEBUG_BLOCK_PRINT 0 #endif +#define DPRINTF(fmt, ...) \ +do { \ + if (DEBUG_BLOCK_PRINT) { \ + printf(fmt, ## __VA_ARGS__); \ + } \ +} while (0) /* OS X does not have O_DSYNC */ #ifndef O_DSYNC @@ -301,10 +308,11 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) { BDRVRawState *s = bs->opaque; char *buf; + size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); - /* For /dev/sg devices the alignment is not really used. + /* For SCSI generic devices the alignment is not really used. With buffered I/O, we don't have any restrictions. 
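The raw-posix.c hunk above drops the old #ifdef DEBUG_FLOPPY / DEBUG_BLOCK_PRINT conditionals in favour of a DPRINTF() macro whose printf() call is always compiled and merely skipped at run time when DEBUG_BLOCK_PRINT is 0, so format strings keep being type-checked. A minimal standalone sketch of that pattern (the macro and symbol names here are illustrative, not the driver's own):

    #include <stdio.h>

    /* Flip to 1 to enable debug output.  The printf() call below is
     * always compiled, so its format string and arguments are checked
     * by the compiler even while debugging is disabled. */
    #define DEBUG_EXAMPLE 0

    #define DPRINTF(fmt, ...)                              \
        do {                                               \
            if (DEBUG_EXAMPLE) {                           \
                printf("example: " fmt, ## __VA_ARGS__);   \
            }                                              \
        } while (0)

    int main(void)
    {
        int fd = -1;

        DPRINTF("opened fd=%d\n", fd);   /* optimised away when disabled */
        return 0;
    }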
*/ - if (bs->sg || !s->needs_alignment) { + if (bdrv_is_sg(bs) || !s->needs_alignment) { bs->request_alignment = 1; s->buf_align = 1; return; @@ -330,9 +338,9 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) /* If we could not get the sizes so far, we can only guess them */ if (!s->buf_align) { size_t align; - buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { - if (raw_is_io_aligned(fd, buf + align, MAX_BLOCKSIZE)) { + buf = qemu_memalign(max_align, 2 * max_align); + for (align = 512; align <= max_align; align <<= 1) { + if (raw_is_io_aligned(fd, buf + align, max_align)) { s->buf_align = align; break; } @@ -342,8 +350,8 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) if (!bs->request_alignment) { size_t align; - buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + buf = qemu_memalign(s->buf_align, max_align); + for (align = 512; align <= max_align; align <<= 1) { if (raw_is_io_aligned(fd, buf, align)) { bs->request_alignment = align; break; @@ -725,7 +733,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) BDRVRawState *s = bs->opaque; raw_probe_alignment(bs, s->fd, errp); - bs->bl.opt_mem_alignment = s->buf_align; + bs->bl.min_mem_alignment = s->buf_align; + bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); } static int check_for_dasd(int fd) @@ -1016,6 +1025,7 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) { struct xfs_flock64 fl; + int err; memset(&fl, 0, sizeof(fl)); fl.l_whence = SEEK_SET; @@ -1023,8 +1033,9 @@ static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) fl.l_len = bytes; if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { - DEBUG_BLOCK_PRINT("cannot write zero range (%s)\n", strerror(errno)); - return -errno; + err = errno; + DPRINTF("cannot write zero range (%s)\n", strerror(errno)); + return -err; } return 0; @@ -1033,6 +1044,7 @@ static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) { struct xfs_flock64 fl; + int err; memset(&fl, 0, sizeof(fl)); fl.l_whence = SEEK_SET; @@ -1040,8 +1052,9 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) fl.l_len = bytes; if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { - DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno)); - return -errno; + err = errno; + DPRINTF("cannot punch hole (%s)\n", strerror(errno)); + return -err; } return 0; @@ -1846,8 +1859,9 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, *pnum = nb_sectors; ret = BDRV_BLOCK_DATA; } else if (data == start) { - /* On a data extent, compute sectors to the end of the extent. */ - *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); + /* On a data extent, compute sectors to the end of the extent, + * possibly including a partial sector at EOF. */ + *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); ret = BDRV_BLOCK_DATA; } else { /* On a hole, compute sectors to the beginning of the next extent. 
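raw_probe_alignment() above now probes up to MAX(MAX_BLOCKSIZE, getpagesize()) instead of a fixed MAX_BLOCKSIZE: it allocates one maximally aligned buffer and retries O_DIRECT reads at growing power-of-two offsets into it until the kernel accepts one, which reveals the required buffer alignment. A rough standalone sketch of that probing idea, assuming Linux and a file that can be opened with O_DIRECT (error handling trimmed; this is not the driver's code):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Find the smallest buffer alignment accepted for O_DIRECT reads by
     * trying 512, 1024, ... until a pread() stops failing with EINVAL. */
    static size_t probe_buf_align(int fd, size_t max_align)
    {
        char *buf;
        size_t align;

        if (posix_memalign((void **)&buf, max_align, 2 * max_align)) {
            return 0;
        }
        for (align = 512; align <= max_align; align <<= 1) {
            if (pread(fd, buf + align, max_align, 0) >= 0) {
                break;              /* first alignment the kernel accepts */
            }
        }
        free(buf);
        return align <= max_align ? align : 0;
    }

    int main(int argc, char **argv)
    {
        int fd = open(argc > 1 ? argv[1] : "/tmp/test.img",
                      O_RDONLY | O_DIRECT);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        printf("buffer alignment: %zu bytes\n", probe_buf_align(fd, 65536));
        close(fd);
        return 0;
    }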
*/ @@ -2073,15 +2087,38 @@ static void hdev_parse_filename(const char *filename, QDict *options, qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); } +static bool hdev_is_sg(BlockDriverState *bs) +{ + +#if defined(__linux__) + + struct stat st; + struct sg_scsi_id scsiid; + int sg_version; + + if (stat(bs->filename, &st) >= 0 && S_ISCHR(st.st_mode) && + !bdrv_ioctl(bs, SG_GET_VERSION_NUM, &sg_version) && + !bdrv_ioctl(bs, SG_GET_SCSI_ID, &scsiid)) { + DPRINTF("SG device found: type=%d, version=%d\n", + scsiid.scsi_type, sg_version); + return true; + } + +#endif + + return false; +} + static int hdev_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVRawState *s = bs->opaque; Error *local_err = NULL; int ret; - const char *filename = qdict_get_str(options, "filename"); #if defined(__APPLE__) && defined(__MACH__) + const char *filename = qdict_get_str(options, "filename"); + if (strstart(filename, "/dev/cdrom", NULL)) { kern_return_t kernResult; io_iterator_t mediaIterator; @@ -2110,16 +2147,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags, #endif s->type = FTYPE_FILE; -#if defined(__linux__) - { - char resolved_path[ MAXPATHLEN ], *temp; - - temp = realpath(filename, resolved_path); - if (temp && strstart(temp, "/dev/sg", NULL)) { - bs->sg = 1; - } - } -#endif ret = raw_open_common(bs, options, flags, 0, &local_err); if (ret < 0) { @@ -2129,6 +2156,9 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags, return ret; } + /* Since this does ioctl the device must be already opened */ + bs->sg = hdev_is_sg(bs); + if (flags & BDRV_O_RDWR) { ret = check_hdev_writable(s); if (ret < 0) { @@ -2157,16 +2187,12 @@ static int fd_open(BlockDriverState *bs) (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { qemu_close(s->fd); s->fd = -1; -#ifdef DEBUG_FLOPPY - printf("Floppy closed\n"); -#endif + DPRINTF("Floppy closed\n"); } if (s->fd < 0) { if (s->fd_got_error && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_error_time) < FD_OPEN_TIMEOUT) { -#ifdef DEBUG_FLOPPY - printf("No floppy (open delayed)\n"); -#endif + DPRINTF("No floppy (open delayed)\n"); return -EIO; } s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK); @@ -2175,14 +2201,10 @@ static int fd_open(BlockDriverState *bs) s->fd_got_error = 1; if (last_media_present) s->fd_media_changed = 1; -#ifdef DEBUG_FLOPPY - printf("No floppy\n"); -#endif + DPRINTF("No floppy\n"); return -EIO; } -#ifdef DEBUG_FLOPPY - printf("Floppy opened\n"); -#endif + DPRINTF("Floppy opened\n"); } if (!last_media_present) s->fd_media_changed = 1; @@ -2408,7 +2430,8 @@ static int floppy_probe_device(const char *filename) struct stat st; if (strstart(filename, "/dev/fd", NULL) && - !strstart(filename, "/dev/fdset/", NULL)) { + !strstart(filename, "/dev/fdset/", NULL) && + !strstart(filename, "/dev/fd/", NULL)) { prio = 50; } @@ -2450,9 +2473,7 @@ static int floppy_media_changed(BlockDriverState *bs) fd_open(bs); ret = s->fd_media_changed; s->fd_media_changed = 0; -#ifdef DEBUG_FLOPPY - printf("Floppy changed=%d\n", ret); -#endif + DPRINTF("Floppy changed=%d\n", ret); return ret; } diff --git a/block/raw-win32.c b/block/raw-win32.c index dae5d2fee..68f2338ac 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -29,6 +29,7 @@ #include "trace.h" #include "block/thread-pool.h" #include "qemu/iov.h" +#include "qapi/qmp/qstring.h" #include #include diff --git a/block/rbd.c b/block/rbd.c index f3ab2ddd5..a60a19d58 100644 --- a/block/rbd.c +++ 
b/block/rbd.c @@ -74,25 +74,18 @@ typedef struct RBDAIOCB { QEMUIOVector *qiov; char *bounce; RBDAIOCmd cmd; - int64_t sector_num; int error; struct BDRVRBDState *s; - int status; } RBDAIOCB; typedef struct RADOSCB { - int rcbid; RBDAIOCB *acb; struct BDRVRBDState *s; - int done; int64_t size; char *buf; int64_t ret; } RADOSCB; -#define RBD_FD_READ 0 -#define RBD_FD_WRITE 1 - typedef struct BDRVRBDState { rados_t cluster; rados_ioctx_t io_ctx; @@ -235,7 +228,9 @@ static char *qemu_rbd_parse_clientname(const char *conf, char *clientname) return NULL; } -static int qemu_rbd_set_conf(rados_t cluster, const char *conf, Error **errp) +static int qemu_rbd_set_conf(rados_t cluster, const char *conf, + bool only_read_conf_file, + Error **errp) { char *p, *buf; char name[RBD_MAX_CONF_NAME_SIZE]; @@ -267,14 +262,18 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf, Error **errp) qemu_rbd_unescape(value); if (strcmp(name, "conf") == 0) { - ret = rados_conf_read_file(cluster, value); - if (ret < 0) { - error_setg(errp, "error reading conf file %s", value); - break; + /* read the conf file alone, so it doesn't override more + specific settings for a particular device */ + if (only_read_conf_file) { + ret = rados_conf_read_file(cluster, value); + if (ret < 0) { + error_setg(errp, "error reading conf file %s", value); + break; + } } } else if (strcmp(name, "id") == 0) { /* ignore, this is parsed by qemu_rbd_parse_clientname() */ - } else { + } else if (!only_read_conf_file) { ret = rados_conf_set(cluster, name, value); if (ret < 0) { error_setg(errp, "invalid conf option %s", name); @@ -325,7 +324,7 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) error_setg(errp, "obj size too small"); return -EINVAL; } - obj_order = ffs(objsize) - 1; + obj_order = ctz32(objsize); } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); @@ -337,10 +336,15 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) if (strstr(conf, "conf=") == NULL) { /* try default location, but ignore failure */ rados_conf_read_file(cluster, NULL); + } else if (conf[0] != '\0' && + qemu_rbd_set_conf(cluster, conf, true, &local_err) < 0) { + rados_shutdown(cluster); + error_propagate(errp, local_err); + return -EIO; } if (conf[0] != '\0' && - qemu_rbd_set_conf(cluster, conf, &local_err) < 0) { + qemu_rbd_set_conf(cluster, conf, false, &local_err) < 0) { rados_shutdown(cluster); error_propagate(errp, local_err); return -EIO; @@ -405,7 +409,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) } qemu_vfree(acb->bounce); acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); - acb->status = 0; qemu_aio_unref(acb); } @@ -468,6 +471,23 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, s->snap = g_strdup(snap_buf); } + if (strstr(conf, "conf=") == NULL) { + /* try default location, but ignore failure */ + rados_conf_read_file(s->cluster, NULL); + } else if (conf[0] != '\0') { + r = qemu_rbd_set_conf(s->cluster, conf, true, errp); + if (r < 0) { + goto failed_shutdown; + } + } + + if (conf[0] != '\0') { + r = qemu_rbd_set_conf(s->cluster, conf, false, errp); + if (r < 0) { + goto failed_shutdown; + } + } + /* * Fallback to more conservative semantics if setting cache * options fails. 
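The rbd.c changes above give qemu_rbd_set_conf() an only_read_conf_file flag and call it twice: a first pass that only loads the conf= file and a second pass that applies the remaining key=value options, so per-device settings are no longer clobbered by the file that used to be read after them. A simplified sketch of that two-pass idea over a flattened "key=value:key=value" string; apply_conf_file() and apply_setting() are hypothetical stand-ins for rados_conf_read_file()/rados_conf_set():

    #include <stdio.h>
    #include <string.h>

    static int apply_conf_file(const char *path)
    {
        printf("file: %s\n", path);
        return 0;
    }

    static int apply_setting(const char *key, const char *val)
    {
        printf("set:  %s=%s\n", key, val);
        return 0;
    }

    /* Walk the option string; when only_conf_file is set, act on the
     * "conf" key alone, otherwise act on everything except "conf". */
    static int set_conf(const char *conf, int only_conf_file)
    {
        char buf[256], *p, *save;

        snprintf(buf, sizeof(buf), "%s", conf);
        for (p = strtok_r(buf, ":", &save); p; p = strtok_r(NULL, ":", &save)) {
            char *eq = strchr(p, '=');

            if (!eq) {
                return -1;
            }
            *eq = '\0';
            if (strcmp(p, "conf") == 0) {
                if (only_conf_file && apply_conf_file(eq + 1) < 0) {
                    return -1;
                }
            } else if (!only_conf_file && apply_setting(p, eq + 1) < 0) {
                return -1;
            }
        }
        return 0;
    }

    int main(void)
    {
        const char *conf = "conf=/etc/ceph/ceph.conf:rbd_cache=true";

        set_conf(conf, 1);   /* pass 1: load the conf file first */
        set_conf(conf, 0);   /* pass 2: explicit settings win over the file */
        return 0;
    }

(The real code also skips the "id" key, which is handled separately by qemu_rbd_parse_clientname().)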
Ignore errors from setting rbd_cache because the @@ -481,18 +501,6 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, rados_conf_set(s->cluster, "rbd_cache", "true"); } - if (strstr(conf, "conf=") == NULL) { - /* try default location, but ignore failure */ - rados_conf_read_file(s->cluster, NULL); - } - - if (conf[0] != '\0') { - r = qemu_rbd_set_conf(s->cluster, conf, errp); - if (r < 0) { - goto failed_shutdown; - } - } - r = rados_connect(s->cluster); if (r < 0) { error_setg(errp, "error connecting"); @@ -621,7 +629,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, acb->error = 0; acb->s = s; acb->bh = NULL; - acb->status = -EINPROGRESS; if (cmd == RBD_AIO_WRITE) { qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); @@ -633,7 +640,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, size = nb_sectors * BDRV_SECTOR_SIZE; rcb = g_new(RADOSCB, 1); - rcb->done = 0; rcb->acb = acb; rcb->buf = buf; rcb->s = acb->s; diff --git a/block/sheepdog.c b/block/sheepdog.c index c14172cfa..9585beb73 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -318,6 +318,10 @@ enum AIOCBState { AIOCB_DISCARD_OBJ, }; +#define AIOCBOverwrapping(x, y) \ + (!(x->max_affect_data_idx < y->min_affect_data_idx \ + || y->max_affect_data_idx < x->min_affect_data_idx)) + struct SheepdogAIOCB { BlockAIOCB common; @@ -334,6 +338,11 @@ struct SheepdogAIOCB { bool cancelable; int nr_pending; + + uint32_t min_affect_data_idx; + uint32_t max_affect_data_idx; + + QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings; }; typedef struct BDRVSheepdogState { @@ -362,8 +371,10 @@ typedef struct BDRVSheepdogState { /* Every aio request must be linked to either of these queues. */ QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; - QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head; QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; + + CoQueue overwrapping_queue; + QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head; } BDRVSheepdogState; static const char * sd_strerror(int err) @@ -498,13 +509,7 @@ static void sd_aio_cancel(BlockAIOCB *blockacb) AIOReq *aioreq, *next; if (sd_acb_cancelable(acb)) { - /* Remove outstanding requests from pending and failed queues. */ - QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings, - next) { - if (aioreq->aiocb == acb) { - free_aio_req(s, aioreq); - } - } + /* Remove outstanding requests from failed queue. 
*/ QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, next) { if (aioreq->aiocb == acb) { @@ -529,6 +534,10 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, int64_t sector_num, int nb_sectors) { SheepdogAIOCB *acb; + uint32_t object_size; + BDRVSheepdogState *s = bs->opaque; + + object_size = (UINT32_C(1) << s->inode.block_size_shift); acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL); @@ -542,6 +551,11 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, acb->coroutine = qemu_coroutine_self(); acb->ret = 0; acb->nr_pending = 0; + + acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; + acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE + + acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size; + return acb; } @@ -703,38 +717,6 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); static int get_sheep_fd(BDRVSheepdogState *s, Error **errp); static void co_write_request(void *opaque); -static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) -{ - AIOReq *aio_req; - - QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) { - if (aio_req->oid == oid) { - return aio_req; - } - } - - return NULL; -} - -/* - * This function searchs pending requests to the object `oid', and - * sends them. - */ -static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid) -{ - AIOReq *aio_req; - SheepdogAIOCB *acb; - - while ((aio_req = find_pending_req(s, oid)) != NULL) { - acb = aio_req->aiocb; - /* move aio_req from pending list to inflight one */ - QLIST_REMOVE(aio_req, aio_siblings); - QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - acb->aiocb_type); - } -} - static coroutine_fn void reconnect_to_sdog(void *opaque) { BDRVSheepdogState *s = opaque; @@ -840,12 +822,6 @@ static void coroutine_fn aio_read_response(void *opaque) s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx); s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx); } - /* - * Some requests may be blocked because simultaneous - * create requests are not allowed, so we search the - * pending requests here. - */ - send_pending_req(s, aio_req->oid); } break; case AIOCB_READ_UDATA: @@ -1341,30 +1317,6 @@ out: return ret; } -/* Return true if the specified request is linked to the pending list. */ -static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req) -{ - AIOReq *areq; - QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { - if (areq != aio_req && areq->oid == aio_req->oid) { - /* - * Sheepdog cannot handle simultaneous create requests to the same - * object, so we cannot send the request until the previous request - * finishes. 
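In the sheepdog changes above, each SheepdogAIOCB now records the first and last object index it touches (min_affect_data_idx/max_affect_data_idx, computed from the sector range and the object size), and the AIOCBOverwrapping() macro treats two requests as conflicting unless one range ends before the other starts. A small self-contained sketch of that interval test (struct and field names are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct Req {
        uint32_t min_idx;   /* first object index touched by the request */
        uint32_t max_idx;   /* last object index touched by the request */
    } Req;

    /* Object index range covered by [sector, sector + nb_sectors). */
    static Req make_req(int64_t sector, int nb_sectors, uint32_t object_size)
    {
        Req r;

        r.min_idx = sector * 512 / object_size;
        r.max_idx = (sector + nb_sectors) * 512 / object_size;
        return r;
    }

    /* Conflict unless one range ends strictly before the other begins. */
    static int overlaps(const Req *a, const Req *b)
    {
        return !(a->max_idx < b->min_idx || b->max_idx < a->min_idx);
    }

    int main(void)
    {
        uint32_t obj = 1u << 22;              /* 4 MiB objects */
        Req a = make_req(0, 16384, obj);      /* sectors 0..16383  -> idx 0..2 */
        Req b = make_req(20480, 8, obj);      /* sector 20480      -> idx 2..2 */

        printf("overlap: %d\n", overlaps(&a, &b));   /* prints 1 */
        return 0;
    }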
- */ - DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid); - aio_req->flags = 0; - aio_req->base_oid = 0; - aio_req->create = false; - QLIST_REMOVE(aio_req, aio_siblings); - QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); - return true; - } - } - - return false; -} - static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) { SheepdogAIOCB *acb = aio_req->aiocb; @@ -1379,10 +1331,6 @@ static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) goto out; } - if (check_simultaneous_create(s, aio_req)) { - return; - } - if (s->inode.data_vdi_id[idx]) { aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); aio_req->flags |= SD_FLAG_CMD_COW; @@ -1458,8 +1406,8 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, filename = qemu_opt_get(opts, "filename"); QLIST_INIT(&s->inflight_aio_head); - QLIST_INIT(&s->pending_aio_head); QLIST_INIT(&s->failed_aio_head); + QLIST_INIT(&s->inflight_aiocb_head); s->fd = -1; memset(vdi, 0, sizeof(vdi)); @@ -1524,6 +1472,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; pstrcpy(s->name, sizeof(s->name), vdi); qemu_co_mutex_init(&s->lock); + qemu_co_queue_init(&s->overwrapping_queue); qemu_opts_del(opts); g_free(buf); return 0; @@ -1716,7 +1665,7 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) if ((object_size - 1) & object_size) { /* not a power of 2? */ return -EINVAL; } - obj_order = ffs(object_size) - 1; + obj_order = ctz32(object_size); if (obj_order < 20 || obj_order > 31) { return -EINVAL; } @@ -2195,12 +2144,6 @@ static int coroutine_fn sd_co_rw_vector(void *p) old_oid, done); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - if (create) { - if (check_simultaneous_create(s, aio_req)) { - goto done; - } - } - add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, acb->aiocb_type); done: @@ -2215,6 +2158,20 @@ out: return 1; } +static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb) +{ + SheepdogAIOCB *cb; + + QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { + if (AIOCBOverwrapping(aiocb, cb)) { + return true; + } + } + + QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings); + return false; +} + static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -2234,14 +2191,25 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, acb->aio_done_func = sd_write_done; acb->aiocb_type = AIOCB_WRITE_UDATA; +retry: + if (check_overwrapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overwrapping_queue); + goto retry; + } + ret = sd_co_rw_vector(acb); if (ret <= 0) { + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); qemu_aio_unref(acb); return ret; } qemu_coroutine_yield(); + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); + return acb->ret; } @@ -2250,19 +2218,30 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, { SheepdogAIOCB *acb; int ret; + BDRVSheepdogState *s = bs->opaque; acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); acb->aiocb_type = AIOCB_READ_UDATA; acb->aio_done_func = sd_finish_aiocb; +retry: + if (check_overwrapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overwrapping_queue); + goto retry; + } + ret = sd_co_rw_vector(acb); if (ret <= 0) { + QLIST_REMOVE(acb, aiocb_siblings); + 
qemu_co_queue_restart_all(&s->overwrapping_queue); qemu_aio_unref(acb); return ret; } qemu_coroutine_yield(); + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); return acb->ret; } @@ -2341,6 +2320,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) if (ret < 0) { error_report("failed to create inode for snapshot: %s", error_get_pretty(local_err)); + error_free(local_err); goto cleanup; } @@ -2609,14 +2589,25 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, acb->aiocb_type = AIOCB_DISCARD_OBJ; acb->aio_done_func = sd_finish_aiocb; +retry: + if (check_overwrapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overwrapping_queue); + goto retry; + } + ret = sd_co_rw_vector(acb); if (ret <= 0) { + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); qemu_aio_unref(acb); return ret; } qemu_coroutine_yield(); + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overwrapping_queue); + return acb->ret; } diff --git a/block/snapshot.c b/block/snapshot.c index 698e1a1d5..49e143e99 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -24,6 +24,7 @@ #include "block/snapshot.h" #include "block/block_int.h" +#include "qapi/qmp/qerror.h" QemuOptsList internal_snapshot_opts = { .name = "snapshot", @@ -229,7 +230,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs, { BlockDriver *drv = bs->drv; if (!drv) { - error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); + error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } if (!snapshot_id && !name) { @@ -238,7 +239,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs, } /* drain all pending i/o before deleting snapshot */ - bdrv_drain_all(); + bdrv_drain(bs); if (drv->bdrv_snapshot_delete) { return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); @@ -246,9 +247,9 @@ int bdrv_snapshot_delete(BlockDriverState *bs, if (bs->file) { return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "internal snapshot deletion"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshot deletion", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } @@ -315,7 +316,7 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, BlockDriver *drv = bs->drv; if (!drv) { - error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); + error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } if (!snapshot_id && !name) { @@ -329,9 +330,9 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, if (drv->bdrv_snapshot_load_tmp) { return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "temporarily load internal snapshot"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support temporarily loading internal snapshots", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } diff --git a/block/ssh.c b/block/ssh.c index f466cbf39..8d0673903 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -30,9 +30,11 @@ #include #include "block/block_int.h" +#include "qemu/error-report.h" #include "qemu/sockets.h" #include "qemu/uri.h" #include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" /* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements 
in * this block driver code. @@ -561,7 +563,7 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, /* Open the socket and connect. */ s->sock = inet_connect(s->hostport, errp); if (s->sock < 0) { - ret = -errno; + ret = -EIO; goto err; } diff --git a/block/stream.c index a628901f6..ab0bd057f 100644 --- a/block/stream.c +++ b/block/stream.c @@ -14,6 +14,7 @@ #include "trace.h" #include "block/block_int.h" #include "block/blockjob.h" +#include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" enum { @@ -227,7 +228,7 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) StreamBlockJob *s = container_of(job, StreamBlockJob, common); if (speed < 0) { - error_set(errp, QERR_INVALID_PARAMETER, "speed"); + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); return; } ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); @@ -250,7 +251,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, if ((on_error == BLOCKDEV_ON_ERROR_STOP || on_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { - error_set(errp, QERR_INVALID_PARAMETER, "on-error"); + error_setg(errp, QERR_INVALID_PARAMETER, "on-error"); return; } diff --git a/block/throttle-groups.c b/block/throttle-groups.c new file mode 100644 index 000000000..1abc6fcae --- /dev/null +++ b/block/throttle-groups.c @@ -0,0 +1,501 @@ +/* + * QEMU block throttling group infrastructure + * + * Copyright (C) Nodalink, EURL. 2014 + * Copyright (C) Igalia, S.L. 2015 + * + * Authors: + * Benoît Canet + * Alberto Garcia + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 or + * (at your option) version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "block/throttle-groups.h" +#include "qemu/queue.h" +#include "qemu/thread.h" +#include "sysemu/qtest.h" + +/* The ThrottleGroup structure (with its ThrottleState) is shared + * among different BlockDriverState and it's independent from + * AioContext, so in order to use it from different threads it needs + * its own locking. + * + * This locking is however handled internally in this file, so it's + * mostly transparent to outside users (but see the documentation in + * throttle_groups_lock()). + * + * The whole ThrottleGroup structure is private and invisible to + * outside users, that only use it through its ThrottleState. + * + * In addition to the ThrottleGroup structure, BlockDriverState has + * fields that need to be accessed by other members of the group and + * therefore also need to be protected by this lock. Once a BDS is + * registered in a group those fields can be accessed by other threads + * any time. + * + * Again, all this is handled internally and is mostly transparent to + * the outside. The 'throttle_timers' field however has an additional + * constraint because it may be temporarily invalid (see for example + * bdrv_set_aio_context()). Therefore in this file a thread will + * access some other BDS's timers only after verifying that that BDS + * has throttled requests in the queue.
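Alongside the throttling work, this import keeps converting error_set() with QERR_ macros into error_setg() with plain format strings (see the stream.c and snapshot.c hunks) and adds the missing error_free() after error_report() in sd_snapshot_create(). A short sketch of the local-error idiom those call sites rely on, assuming QEMU's qapi/error.h and qemu/error-report.h APIs; the helper names are made up:

    #include "qapi/error.h"
    #include "qemu/error-report.h"

    /* Hypothetical helper that fails through an Error object. */
    static int do_step(int ok, Error **errp)
    {
        if (!ok) {
            error_setg(errp, "step failed: %s", "invalid argument");
            return -1;
        }
        return 0;
    }

    /* Mid-level caller: hand the local error up to its own caller. */
    static int do_sequence(int ok, Error **errp)
    {
        Error *local_err = NULL;

        if (do_step(ok, &local_err) < 0) {
            error_propagate(errp, local_err);
            return -1;
        }
        return 0;
    }

    /* Top-level caller: consume the error, as sd_snapshot_create() does. */
    static void run(void)
    {
        Error *local_err = NULL;

        if (do_sequence(0, &local_err) < 0) {
            error_report("operation failed: %s", error_get_pretty(local_err));
            error_free(local_err);
        }
    }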
+ */ +typedef struct ThrottleGroup { + char *name; /* This is constant during the lifetime of the group */ + + QemuMutex lock; /* This lock protects the following four fields */ + ThrottleState ts; + QLIST_HEAD(, BlockDriverState) head; + BlockDriverState *tokens[2]; + bool any_timer_armed[2]; + + /* These two are protected by the global throttle_groups_lock */ + unsigned refcount; + QTAILQ_ENTRY(ThrottleGroup) list; +} ThrottleGroup; + +static QemuMutex throttle_groups_lock; +static QTAILQ_HEAD(, ThrottleGroup) throttle_groups = + QTAILQ_HEAD_INITIALIZER(throttle_groups); + +/* Increments the reference count of a ThrottleGroup given its name. + * + * If no ThrottleGroup is found with the given name a new one is + * created. + * + * @name: the name of the ThrottleGroup + * @ret: the ThrottleGroup + */ +static ThrottleGroup *throttle_group_incref(const char *name) +{ + ThrottleGroup *tg = NULL; + ThrottleGroup *iter; + + qemu_mutex_lock(&throttle_groups_lock); + + /* Look for an existing group with that name */ + QTAILQ_FOREACH(iter, &throttle_groups, list) { + if (!strcmp(name, iter->name)) { + tg = iter; + break; + } + } + + /* Create a new one if not found */ + if (!tg) { + tg = g_new0(ThrottleGroup, 1); + tg->name = g_strdup(name); + qemu_mutex_init(&tg->lock); + throttle_init(&tg->ts); + QLIST_INIT(&tg->head); + + QTAILQ_INSERT_TAIL(&throttle_groups, tg, list); + } + + tg->refcount++; + + qemu_mutex_unlock(&throttle_groups_lock); + + return tg; +} + +/* Decrease the reference count of a ThrottleGroup. + * + * When the reference count reaches zero the ThrottleGroup is + * destroyed. + * + * @tg: The ThrottleGroup to unref + */ +static void throttle_group_unref(ThrottleGroup *tg) +{ + qemu_mutex_lock(&throttle_groups_lock); + if (--tg->refcount == 0) { + QTAILQ_REMOVE(&throttle_groups, tg, list); + qemu_mutex_destroy(&tg->lock); + g_free(tg->name); + g_free(tg); + } + qemu_mutex_unlock(&throttle_groups_lock); +} + +/* Get the name from a BlockDriverState's ThrottleGroup. The name (and + * the pointer) is guaranteed to remain constant during the lifetime + * of the group. + * + * @bs: a BlockDriverState that is member of a throttling group + * @ret: the name of the group. + */ +const char *throttle_group_get_name(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + return tg->name; +} + +/* Return the next BlockDriverState in the round-robin sequence, + * simulating a circular list. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @ret: the next BlockDriverState in the sequence + */ +static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + BlockDriverState *next = QLIST_NEXT(bs, round_robin); + + if (!next) { + return QLIST_FIRST(&tg->head); + } + + return next; +} + +/* Return the next BlockDriverState in the round-robin sequence with + * pending I/O requests. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @is_write: the type of operation (read/write) + * @ret: the next BlockDriverState with pending requests, or bs + * if there is none. 
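throttle_group_incref() and throttle_group_unref() above form a small get-or-create registry of named groups: a global mutex guards the list of ThrottleGroups, a lookup by name creates the group on first use, and the group is torn down when its last member drops the reference. The same shape reduced to a standalone sketch, using pthreads and a singly linked list in place of QemuMutex and QTAILQ (all names here are illustrative):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct Group {
        char *name;
        unsigned refcount;
        struct Group *next;
    } Group;

    static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
    static Group *groups;

    /* Look up a group by name, creating it on first use; takes a reference. */
    static Group *group_incref(const char *name)
    {
        Group *g;

        pthread_mutex_lock(&registry_lock);
        for (g = groups; g; g = g->next) {
            if (!strcmp(g->name, name)) {
                break;
            }
        }
        if (!g) {
            g = calloc(1, sizeof(*g));
            g->name = strdup(name);
            g->next = groups;
            groups = g;
        }
        g->refcount++;
        pthread_mutex_unlock(&registry_lock);
        return g;
    }

    /* Drop a reference; the group is destroyed when the last user leaves. */
    static void group_unref(Group *g)
    {
        pthread_mutex_lock(&registry_lock);
        if (--g->refcount == 0) {
            Group **p;

            for (p = &groups; *p != g; p = &(*p)->next) {
                /* walk to the link that points at g */
            }
            *p = g->next;
            free(g->name);
            free(g);
        }
        pthread_mutex_unlock(&registry_lock);
    }

    int main(void)
    {
        Group *a = group_incref("disk-group-0");
        Group *b = group_incref("disk-group-0");

        printf("same group object: %d\n", a == b);   /* prints 1 */
        group_unref(b);
        group_unref(a);
        return 0;
    }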
+ */ +static BlockDriverState *next_throttle_token(BlockDriverState *bs, + bool is_write) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockDriverState *token, *start; + + start = token = tg->tokens[is_write]; + + /* get next bs round in round robin style */ + token = throttle_group_next_bs(token); + while (token != start && !token->pending_reqs[is_write]) { + token = throttle_group_next_bs(token); + } + + /* If no IO are queued for scheduling on the next round robin token + * then decide the token is the current bs because chances are + * the current bs get the current request queued. + */ + if (token == start && !token->pending_reqs[is_write]) { + token = bs; + } + + return token; +} + +/* Check if the next I/O request for a BlockDriverState needs to be + * throttled or not. If there's no timer set in this group, set one + * and update the token accordingly. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @is_write: the type of operation (read/write) + * @ret: whether the I/O request needs to be throttled or not + */ +static bool throttle_group_schedule_timer(BlockDriverState *bs, + bool is_write) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleTimers *tt = &bs->throttle_timers; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + bool must_wait; + + /* Check if any of the timers in this group is already armed */ + if (tg->any_timer_armed[is_write]) { + return true; + } + + must_wait = throttle_schedule_timer(ts, tt, is_write); + + /* If a timer just got armed, set bs as the current token */ + if (must_wait) { + tg->tokens[is_write] = bs; + tg->any_timer_armed[is_write] = true; + } + + return must_wait; +} + +/* Look for the next pending I/O request and schedule it. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @is_write: the type of operation (read/write) + */ +static void schedule_next_request(BlockDriverState *bs, bool is_write) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + bool must_wait; + BlockDriverState *token; + + /* Check if there's any pending request to schedule next */ + token = next_throttle_token(bs, is_write); + if (!token->pending_reqs[is_write]) { + return; + } + + /* Set a timer for the request if it needs to be throttled */ + must_wait = throttle_group_schedule_timer(token, is_write); + + /* If it doesn't have to wait, queue it for immediate execution */ + if (!must_wait) { + /* Give preference to requests from the current bs */ + if (qemu_in_coroutine() && + qemu_co_queue_next(&bs->throttled_reqs[is_write])) { + token = bs; + } else { + ThrottleTimers *tt = &token->throttle_timers; + int64_t now = qemu_clock_get_ns(tt->clock_type); + timer_mod(tt->timers[is_write], now + 1); + tg->any_timer_armed[is_write] = true; + } + tg->tokens[is_write] = token; + } +} + +/* Check if an I/O request needs to be throttled, wait and set a timer + * if necessary, and schedule the next request using a round robin + * algorithm. + * + * @bs: the current BlockDriverState + * @bytes: the number of bytes for this I/O + * @is_write: the type of operation (read/write) + */ +void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, + unsigned int bytes, + bool is_write) +{ + bool must_wait; + BlockDriverState *token; + + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); + + /* First we check if this I/O has to be throttled. 
*/ + token = next_throttle_token(bs, is_write); + must_wait = throttle_group_schedule_timer(token, is_write); + + /* Wait if there's a timer set or queued requests of this type */ + if (must_wait || bs->pending_reqs[is_write]) { + bs->pending_reqs[is_write]++; + qemu_mutex_unlock(&tg->lock); + qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + qemu_mutex_lock(&tg->lock); + bs->pending_reqs[is_write]--; + } + + /* The I/O will be executed, so do the accounting */ + throttle_account(bs->throttle_state, is_write, bytes); + + /* Schedule the next request */ + schedule_next_request(bs, is_write); + + qemu_mutex_unlock(&tg->lock); +} + +/* Update the throttle configuration for a particular group. Similar + * to throttle_config(), but guarantees atomicity within the + * throttling group. + * + * @bs: a BlockDriverState that is member of the group + * @cfg: the configuration to set + */ +void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) +{ + ThrottleTimers *tt = &bs->throttle_timers; + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); + /* throttle_config() cancels the timers */ + if (timer_pending(tt->timers[0])) { + tg->any_timer_armed[0] = false; + } + if (timer_pending(tt->timers[1])) { + tg->any_timer_armed[1] = false; + } + throttle_config(ts, tt, cfg); + qemu_mutex_unlock(&tg->lock); +} + +/* Get the throttle configuration from a particular group. Similar to + * throttle_get_config(), but guarantees atomicity within the + * throttling group. + * + * @bs: a BlockDriverState that is member of the group + * @cfg: the configuration will be written here + */ +void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); + throttle_get_config(ts, cfg); + qemu_mutex_unlock(&tg->lock); +} + +/* ThrottleTimers callback. This wakes up a request that was waiting + * because it had been throttled. + * + * @bs: the BlockDriverState whose request had been throttled + * @is_write: the type of operation (read/write) + */ +static void timer_cb(BlockDriverState *bs, bool is_write) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + bool empty_queue; + + /* The timer has just been fired, so we can update the flag */ + qemu_mutex_lock(&tg->lock); + tg->any_timer_armed[is_write] = false; + qemu_mutex_unlock(&tg->lock); + + /* Run the request that was waiting for this timer */ + empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]); + + /* If the request queue was empty then we have to take care of + * scheduling the next one */ + if (empty_queue) { + qemu_mutex_lock(&tg->lock); + schedule_next_request(bs, is_write); + qemu_mutex_unlock(&tg->lock); + } +} + +static void read_timer_cb(void *opaque) +{ + timer_cb(opaque, false); +} + +static void write_timer_cb(void *opaque) +{ + timer_cb(opaque, true); +} + +/* Register a BlockDriverState in the throttling group, also + * initializing its timers and updating its throttle_state pointer to + * point to it. If a throttling group with that name does not exist + * yet, it will be created. 
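next_throttle_token() and schedule_next_request() above walk the group's members in a circular order, skip members with no queued requests, and fall back to the current member when nobody else is waiting, which is what spreads the shared budget fairly across the group. A standalone sketch of that round-robin walk over a small fixed array (the real code walks a QLIST of BlockDriverStates):

    #include <stdio.h>

    #define MEMBERS 4

    static int pending[MEMBERS];    /* queued request count per member */

    static int next_member(int i)
    {
        return (i + 1) % MEMBERS;   /* circular successor */
    }

    /* Starting after the current token, find the next member with pending
     * requests; if nobody is waiting, hand the token to the caller (cur). */
    static int next_token(int token, int cur)
    {
        int start = token;

        token = next_member(token);
        while (token != start && !pending[token]) {
            token = next_member(token);
        }
        if (token == start && !pending[token]) {
            token = cur;
        }
        return token;
    }

    int main(void)
    {
        pending[2] = 1;                        /* only member 2 is waiting */
        printf("%d\n", next_token(0, 3));      /* prints 2 */

        pending[2] = 0;
        printf("%d\n", next_token(0, 3));      /* nobody waiting: prints 3 */
        return 0;
    }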
+ * + * @bs: the BlockDriverState to insert + * @groupname: the name of the group + */ +void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) +{ + int i; + ThrottleGroup *tg = throttle_group_incref(groupname); + int clock_type = QEMU_CLOCK_REALTIME; + + if (qtest_enabled()) { + /* For testing block IO throttling only */ + clock_type = QEMU_CLOCK_VIRTUAL; + } + + bs->throttle_state = &tg->ts; + + qemu_mutex_lock(&tg->lock); + /* If the ThrottleGroup is new set this BlockDriverState as the token */ + for (i = 0; i < 2; i++) { + if (!tg->tokens[i]) { + tg->tokens[i] = bs; + } + } + + QLIST_INSERT_HEAD(&tg->head, bs, round_robin); + + throttle_timers_init(&bs->throttle_timers, + bdrv_get_aio_context(bs), + clock_type, + read_timer_cb, + write_timer_cb, + bs); + + qemu_mutex_unlock(&tg->lock); +} + +/* Unregister a BlockDriverState from its group, removing it from the + * list, destroying the timers and setting the throttle_state pointer + * to NULL. + * + * The group will be destroyed if it's empty after this operation. + * + * @bs: the BlockDriverState to remove + */ +void throttle_group_unregister_bs(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + int i; + + qemu_mutex_lock(&tg->lock); + for (i = 0; i < 2; i++) { + if (tg->tokens[i] == bs) { + BlockDriverState *token = throttle_group_next_bs(bs); + /* Take care of the case where this is the last bs in the group */ + if (token == bs) { + token = NULL; + } + tg->tokens[i] = token; + } + } + + /* remove the current bs from the list */ + QLIST_REMOVE(bs, round_robin); + throttle_timers_destroy(&bs->throttle_timers); + qemu_mutex_unlock(&tg->lock); + + throttle_group_unref(tg); + bs->throttle_state = NULL; +} + +/* Acquire the lock of this throttling group. + * + * You won't normally need to use this. None of the functions from the + * ThrottleGroup API require you to acquire the lock since all of them + * deal with it internally. + * + * This should only be used in exceptional cases when you want to + * access the protected fields of a BlockDriverState directly + * (e.g. bdrv_swap()). + * + * @bs: a BlockDriverState that is member of the group + */ +void throttle_group_lock(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); +} + +/* Release the lock of this throttling group. + * + * See the comments in throttle_group_lock(). 
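Taken together, the functions above give the block layer a small public surface: join a named group, set or query its shared limits, leave the group. A hedged sketch of a call site, assuming the ThrottleConfig/LeakyBucket layout from QEMU's existing qemu/throttle.h; the group name and numbers are arbitrary:

    #include <string.h>
    #include "block/block_int.h"
    #include "block/throttle-groups.h"
    #include "qemu/throttle.h"

    /* Put a BlockDriverState under a shared 10 MB/s total-bandwidth budget.
     * Every BDS registered under the same group name shares this budget. */
    static void enable_shared_limit(BlockDriverState *bs)
    {
        ThrottleConfig cfg;

        memset(&cfg, 0, sizeof(cfg));
        cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;

        throttle_group_register_bs(bs, "disk-group-0");
        throttle_group_config(bs, &cfg);
    }

    /* When throttling is switched off or the device goes away: */
    static void disable_shared_limit(BlockDriverState *bs)
    {
        throttle_group_unregister_bs(bs);
    }

In the full patch these entry points are reached through the block layer's own I/O-limit setup (bdrv_io_limits_enable() and friends in block.c, not shown here), which also initializes the BDS's throttled request queues, rather than being called directly by drivers.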
+ */ +void throttle_group_unlock(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + qemu_mutex_unlock(&tg->lock); +} + +static void throttle_groups_init(void) +{ + qemu_mutex_init(&throttle_groups_lock); +} + +block_init(throttle_groups_init); diff --git a/block/vdi.c b/block/vdi.c index 53bd02fe2..7642ef359 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -502,9 +502,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when vdi images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vdi", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vdi format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->write_lock); diff --git a/block/vhdx-log.c b/block/vhdx-log.c index 6547bec40..47fec63c6 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -19,6 +19,7 @@ */ #include "qemu-common.h" #include "block/block_int.h" +#include "qemu/error-report.h" #include "qemu/module.h" #include "block/vhdx.h" diff --git a/block/vhdx.c b/block/vhdx.c index bb3ed45d5..0776de717 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1002,9 +1002,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, /* TODO: differencing files */ /* Disable migration when VHDX images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vhdx", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); return 0; @@ -1269,7 +1269,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); memset(iov1.iov_base, 0, iov1.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, - sinfo.block_offset); + iov1.iov_len); sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; } @@ -1285,7 +1285,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); memset(iov2.iov_base, 0, iov2.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, - sinfo.block_offset); + iov2.iov_len); sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; } } diff --git a/block/vmdk.c b/block/vmdk.c index 4c71cde3b..fbaab67c8 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -25,6 +25,8 @@ #include "qemu-common.h" #include "block/block_int.h" +#include "qapi/qmp/qerror.h" +#include "qemu/error-report.h" #include "qemu/module.h" #include "migration/migration.h" #include @@ -321,37 +323,13 @@ static int vmdk_is_cid_valid(BlockDriverState *bs) return 1; } -/* Queue extents, if any, for reopen() */ +/* We have nothing to do for VMDK reopen, stubs just return success */ static int vmdk_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { - BDRVVmdkState *s; - int ret = -1; - int i; - VmdkExtent *e; - assert(state != NULL); assert(state->bs != NULL); - - if (queue == NULL) { - error_setg(errp, "No reopen queue for VMDK extents"); - goto exit; - } - - s = state->bs->opaque; - - assert(s != NULL); - - for (i = 0; i < s->num_extents; i++) { - e = &s->extents[i]; - if (e->file != state->bs->file) { - bdrv_reopen_queue(queue, e->file, state->flags); - } - } - ret = 0; - -exit: - return ret; + return 0; 
 }
 
 static int vmdk_parent_open(BlockDriverState *bs)
@@ -524,7 +502,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
     }
     ret = vmdk_add_extent(bs, file, false,
                           le32_to_cpu(header.disk_sectors),
-                          le32_to_cpu(header.l1dir_offset) << 9,
+                          (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
                           0,
                           le32_to_cpu(header.l1dir_size),
                           4096,
@@ -543,7 +521,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
 }
 
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
-                               Error **errp);
+                               QDict *options, Error **errp);
 
 static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
                             Error **errp)
@@ -582,7 +560,7 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
 
 static int vmdk_open_vmdk4(BlockDriverState *bs,
                            BlockDriverState *file,
-                           int flags, Error **errp)
+                           int flags, QDict *options, Error **errp)
 {
     int ret;
     uint32_t magic;
@@ -606,7 +584,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
             if (!buf) {
                 return -EINVAL;
             }
-            ret = vmdk_open_desc_file(bs, flags, buf, errp);
+            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
             g_free(buf);
             return ret;
         }
@@ -669,8 +647,8 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
         char buf[64];
         snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
                  le32_to_cpu(header.version));
-        error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-                  bdrv_get_device_name(bs), "vmdk", buf);
+        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+                   bdrv_get_device_or_node_name(bs), "vmdk", buf);
         return -ENOTSUP;
     } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
         /* VMware KB 2064959 explains that version 3 added support for
@@ -763,7 +741,7 @@ static int vmdk_parse_description(const char *desc, const char *opt_name,
 /* Open an extent file and append to bs array */
 static int vmdk_open_sparse(BlockDriverState *bs,
                             BlockDriverState *file, int flags,
-                            char *buf, Error **errp)
+                            char *buf, QDict *options, Error **errp)
 {
     uint32_t magic;
 
@@ -773,7 +751,7 @@ static int vmdk_open_sparse(BlockDriverState *bs,
             return vmdk_open_vmfs_sparse(bs, file, flags, errp);
             break;
         case VMDK4_MAGIC:
-            return vmdk_open_vmdk4(bs, file, flags, errp);
+            return vmdk_open_vmdk4(bs, file, flags, options, errp);
             break;
         default:
             error_setg(errp, "Image not in VMDK format");
@@ -783,7 +761,8 @@ static int vmdk_open_sparse(BlockDriverState *bs,
 }
 
 static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
-                              const char *desc_file_path, Error **errp)
+                              const char *desc_file_path, QDict *options,
+                              Error **errp)
 {
     int ret;
     int matches;
@@ -797,6 +776,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
     BlockDriverState *extent_file;
     BDRVVmdkState *s = bs->opaque;
     VmdkExtent *extent;
+    char extent_opt_prefix[32];
 
     while (*p) {
         /* parse extent line in one of below formats:
@@ -846,8 +826,12 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
         extent_path = g_malloc0(PATH_MAX);
         path_combine(extent_path, PATH_MAX, desc_file_path, fname);
         extent_file = NULL;
-        ret = bdrv_open(&extent_file, extent_path, NULL, NULL,
-                        bs->open_flags | BDRV_O_PROTOCOL, NULL, errp);
+
+        ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents);
+        assert(ret < 32);
+
+        ret = bdrv_open_image(&extent_file, extent_path, options,
+                              extent_opt_prefix, bs, &child_file, false, errp);
         g_free(extent_path);
         if (ret) {
             return ret;
@@ -870,7 +854,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
             if (!buf) {
                 ret = -EINVAL;
             } else {
-                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp);
+                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
+                                       options, errp);
             }
             g_free(buf);
             if (ret) {
@@ -898,7 +883,7 @@ next_line:
 }
 
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
-                               Error **errp)
+                               QDict *options, Error **errp)
 {
     int ret;
     char ct[128];
@@ -920,7 +905,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
     }
     s->create_type = g_strdup(ct);
     s->desc_offset = 0;
-    ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, errp);
+    ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, options, errp);
 exit:
     return ret;
 }
@@ -942,11 +927,11 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
     switch (magic) {
         case VMDK3_MAGIC:
         case VMDK4_MAGIC:
-            ret = vmdk_open_sparse(bs, bs->file, flags, buf, errp);
+            ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, errp);
             s->desc_offset = 0x200;
             break;
         default:
-            ret = vmdk_open_desc_file(bs, flags, buf, errp);
+            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
             break;
     }
     if (ret) {
@@ -963,9 +948,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
     qemu_co_mutex_init(&s->lock);
 
     /* Disable migration when VMDK images are used */
-    error_set(&s->migration_blocker,
-              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vmdk", bdrv_get_device_name(bs), "live migration");
+    error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
+               "does not support live migration",
+               bdrv_get_device_or_node_name(bs));
     migrate_add_blocker(s->migration_blocker);
     g_free(buf);
     return 0;
@@ -1705,12 +1690,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     /* write all the data */
     ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic));
     if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
+        error_setg(errp, QERR_IO_ERROR);
         goto exit;
     }
     ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header));
     if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
+        error_setg(errp, QERR_IO_ERROR);
         goto exit;
     }
 
@@ -1730,7 +1715,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
                       gd_buf, gd_buf_size);
     if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
+        error_setg(errp, QERR_IO_ERROR);
         goto exit;
     }
 
@@ -1742,7 +1727,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
                       gd_buf, gd_buf_size);
     if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
+        error_setg(errp, QERR_IO_ERROR);
         goto exit;
     }
 
diff --git a/block/vpc.c b/block/vpc.c
index 8ab30d600..3e385d9fb 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -328,9 +328,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
     qemu_co_mutex_init(&s->lock);
 
     /* Disable migration when VHD images are used */
-    error_set(&s->migration_blocker,
-              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "vpc", bdrv_get_device_name(bs), "live migration");
+    error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
+               "does not support live migration",
+               bdrv_get_device_or_node_name(bs));
     migrate_add_blocker(s->migration_blocker);
 
     return 0;
diff --git a/block/vvfat.c b/block/vvfat.c
index 9be632f40..206869712 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -30,6 +30,7 @@
 #include "migration/migration.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qstring.h"
 
 #ifndef S_IWGRP
 #define S_IWGRP 0
@@ -322,6 +323,7 @@ typedef struct BDRVVVFATState {
 
     int fat_type; /* 16 or 32 */
     array_t fat,directory,mapping;
+    char volume_label[11];
 
     unsigned int cluster_size;
     unsigned int sectors_per_cluster;
@@ -859,7 +861,7 @@ static int init_directories(BDRVVVFATState* s,
     {
         direntry_t* entry=array_get_next(&(s->directory));
         entry->attributes=0x28; /* archive | volume label */
-        memcpy(entry->name, "QEMU VVFAT ", sizeof(entry->name));
+        memcpy(entry->name, s->volume_label, sizeof(entry->name));
     }
 
     /* Now build FAT, and write back information into directory */
@@ -968,7 +970,8 @@ static int init_directories(BDRVVVFATState* s,
 
     bootsector->u.fat16.signature=0x29;
     bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd);
-    memcpy(bootsector->u.fat16.volume_label,"QEMU VVFAT ",11);
+    memcpy(bootsector->u.fat16.volume_label, s->volume_label,
+           sizeof(bootsector->u.fat16.volume_label));
     memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12   ":s->fat_type==16?"FAT16   ":"FAT32   "),8);
     bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa;
 
@@ -1007,6 +1010,11 @@ static QemuOptsList runtime_opts = {
             .type = QEMU_OPT_BOOL,
             .help = "Create a floppy rather than a hard disk image",
         },
+        {
+            .name = "label",
+            .type = QEMU_OPT_STRING,
+            .help = "Use a volume label other than QEMU VVFAT",
+        },
         {
             .name = "rw",
             .type = QEMU_OPT_BOOL,
@@ -1059,8 +1067,8 @@ static void vvfat_parse_filename(const char *filename, QDict *options,
     /* Fill in the options QDict */
     qdict_put(options, "dir", qstring_from_str(filename));
     qdict_put(options, "fat-type", qint_from_int(fat_type));
-    qdict_put(options, "floppy", qbool_from_int(floppy));
-    qdict_put(options, "rw", qbool_from_int(rw));
+    qdict_put(options, "floppy", qbool_from_bool(floppy));
+    qdict_put(options, "rw", qbool_from_bool(rw));
 }
 
 static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
@@ -1069,7 +1077,7 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVVVFATState *s = bs->opaque;
     int cyls, heads, secs;
     bool floppy;
-    const char *dirname;
+    const char *dirname, *label;
     QemuOpts *opts;
     Error *local_err = NULL;
     int ret;
@@ -1096,6 +1104,18 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
     s->fat_type = qemu_opt_get_number(opts, "fat-type", 0);
     floppy = qemu_opt_get_bool(opts, "floppy", false);
 
+    memset(s->volume_label, ' ', sizeof(s->volume_label));
+    label = qemu_opt_get(opts, "label");
+    if (label) {
+        size_t label_length = strlen(label);
+        if (label_length > 11) {
+            error_setg(errp, "vvfat label cannot be longer than 11 bytes");
+            ret = -EINVAL;
+            goto fail;
+        }
+        memcpy(s->volume_label, label, label_length);
+    }
+
     if (floppy) {
         /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */
         if (!s->fat_type) {
@@ -1180,9 +1200,10 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
 
     /* Disable migration when vvfat is used rw */
     if (s->qcow) {
-        error_set(&s->migration_blocker,
-                  QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-                  "vvfat (rw)", bdrv_get_device_name(bs), "live migration");
+        error_setg(&s->migration_blocker,
+                   "The vvfat (rw) format used by node '%s' "
+                   "does not support live migration",
+                   bdrv_get_device_or_node_name(bs));
         migrate_add_blocker(s->migration_blocker);
     }
 
-- 
cgit v1.2.3