author     hyokeun <hyokeun.jeon@samsung.com>  2016-09-06 14:09:22 +0900
committer  hyokeun <hyokeun.jeon@samsung.com>  2016-09-06 14:09:22 +0900
commit     bd54c25035217800f3b1d39f6472d599cd602d5a
tree       299417fe96f546225439ff92b27ac3e55909a970 /block
parent     186efde2677c31fb40d154a81a5f3731eab52414
Imported Upstream version 2.7.0 (tag: upstream/2.7.0)
Diffstat (limited to 'block')
56 files changed, 4940 insertions, 3544 deletions
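
Most of the changes below follow two mechanical conversions that run through the whole QEMU 2.7 block layer: I/O moves from sector-addressed BlockDriverState functions to byte-addressed BlockBackend functions, and filter drivers issue child I/O against the BdrvChild rather than its BlockDriverState. A minimal sketch of the two call shapes (an illustrative fragment, not standalone code; bs, blk and qiov stand for a BlockDriverState, a BlockBackend and a QEMUIOVector):

    /* old: sector-based, BlockDriverState-level */
    ret = bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);

    /* new: byte-based, BlockBackend-level; sectors convert to bytes
     * via BDRV_SECTOR_BITS (sector_num << 9 == byte offset) */
    ret = blk_co_preadv(blk, sector_num << BDRV_SECTOR_BITS,
                        nb_sectors << BDRV_SECTOR_BITS, &qiov, 0);
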
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 44a541622..2593a2f8a 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
-block-obj-y += null.o mirror.o io.o
+block-obj-y += null.o mirror.o commit.o io.o
 block-obj-y += throttle-groups.o
 
 block-obj-y += nbd.o nbd-client.o sheepdog.o
@@ -26,7 +26,6 @@ block-obj-y += write-threshold.o
 block-obj-y += crypto.o
 
 common-obj-y += stream.o
-common-obj-y += commit.o
 common-obj-y += backup.o
 
 iscsi.o-cflags     := $(LIBISCSI_CFLAGS)
diff --git a/block/archipelago.c b/block/archipelago.c
index b9f5e69d4..37b8aca78 100644
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -974,11 +974,9 @@ err_exit2:
 
 static int64_t qemu_archipelago_getlength(BlockDriverState *bs)
 {
-    int64_t ret;
     BDRVArchipelagoState *s = bs->opaque;
 
-    ret = archipelago_volume_info(s);
-    return ret;
+    return archipelago_volume_info(s);
 }
 
 static int qemu_archipelago_truncate(BlockDriverState *bs, int64_t offset)
diff --git a/block/backup.c b/block/backup.c
index 491fd1406..2c0532314 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -36,7 +36,7 @@ typedef struct CowRequest {
 
 typedef struct BackupBlockJob {
     BlockJob common;
-    BlockDriverState *target;
+    BlockBackend *target;
     /* bitmap for sync=incremental */
     BdrvDirtyBitmap *sync_bitmap;
     MirrorSyncMode sync_mode;
@@ -47,6 +47,7 @@ typedef struct BackupBlockJob {
     uint64_t sectors_read;
     unsigned long *done_bitmap;
     int64_t cluster_size;
+    NotifierWithReturn before_write;
     QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;
 
@@ -93,12 +94,12 @@ static void cow_request_end(CowRequest *req)
     qemu_co_queue_restart_all(&req->wait_queue);
 }
 
-static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                       int64_t sector_num, int nb_sectors,
                                       bool *error_is_read,
                                       bool is_write_notifier)
 {
-    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    BlockBackend *blk = job->common.blk;
     CowRequest cow_request;
     struct iovec iov;
     QEMUIOVector bounce_qiov;
@@ -131,20 +132,15 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                       start * sectors_per_cluster);
         if (!bounce_buffer) {
-            bounce_buffer = qemu_blockalign(bs, job->cluster_size);
+            bounce_buffer = blk_blockalign(blk, job->cluster_size);
         }
         iov.iov_base = bounce_buffer;
         iov.iov_len = n * BDRV_SECTOR_SIZE;
         qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
-        if (is_write_notifier) {
-            ret = bdrv_co_readv_no_serialising(bs,
-                                               start * sectors_per_cluster,
-                                               n, &bounce_qiov);
-        } else {
-            ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
-                                &bounce_qiov);
-        }
+        ret = blk_co_preadv(blk, start * job->cluster_size,
+                            bounce_qiov.size, &bounce_qiov,
+                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
         if (ret < 0) {
             trace_backup_do_cow_read_fail(job, start, ret);
             if (error_is_read) {
@@ -154,13 +150,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
         }
 
         if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
-            ret = bdrv_co_write_zeroes(job->target,
-                                       start * sectors_per_cluster,
-                                       n, BDRV_REQ_MAY_UNMAP);
+            ret = blk_co_pwrite_zeroes(job->target, start * job->cluster_size,
+                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
         } else {
-            ret = bdrv_co_writev(job->target,
-                                 start * sectors_per_cluster, n,
-                                 &bounce_qiov);
+            ret = blk_co_pwritev(job->target, start * job->cluster_size,
+                                 bounce_qiov.size, &bounce_qiov, 0);
         }
         if (ret < 0) {
             trace_backup_do_cow_write_fail(job, start, ret);
@@ -197,14 +191,16 @@ static int coroutine_fn backup_before_write_notify(
         NotifierWithReturn *notifier,
         void *opaque)
 {
+    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
     BdrvTrackedRequest *req = opaque;
     int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
     int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;
 
+    assert(req->bs == blk_bs(job->common.blk));
     assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
-    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
+    return backup_do_cow(job, sector_num, nb_sectors, NULL, true);
 }
 
 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -218,19 +214,10 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
     ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 }
 
-static void backup_iostatus_reset(BlockJob *job)
-{
-    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
-
-    if (s->target->blk) {
-        blk_iostatus_reset(s->target->blk);
-    }
-}
-
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
 {
     BdrvDirtyBitmap *bm;
-    BlockDriverState *bs = job->common.bs;
+    BlockDriverState *bs = blk_bs(job->common.blk);
 
     if (ret < 0 || block_job_is_cancelled(&job->common)) {
         /* Merge the successor back into the parent, delete nothing. */
@@ -259,24 +246,31 @@ static void backup_abort(BlockJob *job)
     }
 }
 
+static void backup_attached_aio_context(BlockJob *job, AioContext *aio_context)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    blk_set_aio_context(s->target, aio_context);
+}
+
 static const BlockJobDriver backup_job_driver = {
-    .instance_size  = sizeof(BackupBlockJob),
-    .job_type       = BLOCK_JOB_TYPE_BACKUP,
-    .set_speed      = backup_set_speed,
-    .iostatus_reset = backup_iostatus_reset,
-    .commit         = backup_commit,
-    .abort          = backup_abort,
+    .instance_size          = sizeof(BackupBlockJob),
+    .job_type               = BLOCK_JOB_TYPE_BACKUP,
+    .set_speed              = backup_set_speed,
+    .commit                 = backup_commit,
+    .abort                  = backup_abort,
+    .attached_aio_context   = backup_attached_aio_context,
 };
 
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                             bool read, int error)
 {
     if (read) {
-        return block_job_error_action(&job->common, job->common.bs,
-                                      job->on_source_error, true, error);
+        return block_job_error_action(&job->common, job->on_source_error,
+                                      true, error);
     } else {
-        return block_job_error_action(&job->common, job->target,
-                                      job->on_target_error, false, error);
+        return block_job_error_action(&job->common, job->on_target_error,
+                                      false, error);
     }
 }
 
@@ -289,7 +283,7 @@ static void backup_complete(BlockJob *job, void *opaque)
     BackupBlockJob *s = container_of(job, BackupBlockJob, common);
     BackupCompleteData *data = opaque;
 
-    bdrv_unref(s->target);
+    blk_unref(s->target);
 
     block_job_completed(job, data->ret);
     g_free(data);
@@ -331,7 +325,6 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
     int64_t end;
     int64_t last_cluster = -1;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
-    BlockDriverState *bs = job->common.bs;
     HBitmapIter hbi;
 
     granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
@@ -353,7 +346,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
             if (yield_and_check(job)) {
                 return ret;
             }
-            ret = backup_do_cow(bs, cluster * sectors_per_cluster,
+            ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                 sectors_per_cluster, &error_is_read,
                                 false);
             if ((ret < 0) &&
@@ -386,12 +379,8 @@ static void coroutine_fn backup_run(void *opaque)
 {
     BackupBlockJob *job = opaque;
     BackupCompleteData *data;
-    BlockDriverState *bs = job->common.bs;
-    BlockDriverState *target = job->target;
-    BlockdevOnError on_target_error = job->on_target_error;
-    NotifierWithReturn before_write = {
-        .notify = backup_before_write_notify,
-    };
+    BlockDriverState *bs = blk_bs(job->common.blk);
+    BlockBackend *target = job->target;
     int64_t start, end;
     int64_t sectors_per_cluster = cluster_size_sectors(job);
     int ret = 0;
@@ -404,20 +393,14 @@ static void coroutine_fn backup_run(void *opaque)
 
     job->done_bitmap = bitmap_new(end);
 
-    if (target->blk) {
-        blk_set_on_error(target->blk, on_target_error, on_target_error);
-        blk_iostatus_enable(target->blk);
-    }
-
-    bdrv_add_before_write_notifier(bs, &before_write);
+    job->before_write.notify = backup_before_write_notify;
+    bdrv_add_before_write_notifier(bs, &job->before_write);
 
     if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
         while (!block_job_is_cancelled(&job->common)) {
             /* Yield until the job is cancelled.  We just let our before_write
              * notify callback service CoW requests. */
-            job->common.busy = false;
-            qemu_coroutine_yield();
-            job->common.busy = true;
+            block_job_yield(&job->common);
         }
     } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
         ret = backup_run_incremental(job);
@@ -461,7 +444,7 @@ static void coroutine_fn backup_run(void *opaque)
                 }
             }
             /* FULL sync mode we copy the whole drive. */
-            ret = backup_do_cow(bs, start * sectors_per_cluster,
+            ret = backup_do_cow(job, start * sectors_per_cluster,
                                 sectors_per_cluster, &error_is_read,
                                 false);
             if (ret < 0) {
                 /* Depending on error action, fail now or retry cluster */
@@ -477,26 +460,23 @@ static void coroutine_fn backup_run(void *opaque)
         }
     }
 
-    notifier_with_return_remove(&before_write);
+    notifier_with_return_remove(&job->before_write);
 
     /* wait until pending backup_do_cow() calls have completed */
     qemu_co_rwlock_wrlock(&job->flush_rwlock);
     qemu_co_rwlock_unlock(&job->flush_rwlock);
     g_free(job->done_bitmap);
 
-    if (target->blk) {
-        blk_iostatus_disable(target->blk);
-    }
-    bdrv_op_unblock_all(target, job->common.blocker);
+    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);
 
     data = g_malloc(sizeof(*data));
     data->ret = ret;
     block_job_defer_to_main_loop(&job->common, backup_complete, data);
 }
 
-void backup_start(BlockDriverState *bs, BlockDriverState *target,
-                  int64_t speed, MirrorSyncMode sync_mode,
-                  BdrvDirtyBitmap *sync_bitmap,
+void backup_start(const char *job_id, BlockDriverState *bs,
+                  BlockDriverState *target, int64_t speed,
+                  MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockCompletionFunc *cb, void *opaque,
@@ -504,24 +484,17 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
 {
     int64_t len;
     BlockDriverInfo bdi;
+    BackupBlockJob *job = NULL;
     int ret;
 
     assert(bs);
     assert(target);
-    assert(cb);
 
     if (bs == target) {
         error_setg(errp, "Source and target cannot be the same");
         return;
     }
 
-    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
-         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
-        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
-        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
-        return;
-    }
-
     if (!bdrv_is_inserted(bs)) {
         error_setg(errp, "Device is not inserted: %s",
                    bdrv_get_device_name(bs));
@@ -568,15 +541,17 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
         goto error;
     }
 
-    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
-                                           cb, opaque, errp);
+    job = block_job_create(job_id, &backup_job_driver, bs, speed,
+                           cb, opaque, errp);
     if (!job) {
         goto error;
     }
 
+    job->target = blk_new();
+    blk_insert_bs(job->target, target);
+
     job->on_source_error = on_source_error;
     job->on_target_error = on_target_error;
-    job->target = target;
     job->sync_mode = sync_mode;
     job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                        sync_bitmap : NULL;
@@ -584,7 +559,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
     /* If there is no backing file on the target, we cannot rely on COW if our
      * backup cluster size is smaller than the target cluster size. Even for
      * targets with a backing file, try to avoid COW if possible. */
-    ret = bdrv_get_info(job->target, &bdi);
+    ret = bdrv_get_info(target, &bdi);
     if (ret < 0 && !target->backing) {
         error_setg_errno(errp, -ret,
             "Couldn't determine the cluster size of the target image, "
@@ -601,13 +576,17 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
     bdrv_op_block_all(target, job->common.blocker);
     job->common.len = len;
-    job->common.co = qemu_coroutine_create(backup_run);
+    job->common.co = qemu_coroutine_create(backup_run, job);
     block_job_txn_add_job(txn, &job->common);
-    qemu_coroutine_enter(job->common.co, job);
+    qemu_coroutine_enter(job->common.co);
     return;
 
 error:
     if (sync_bitmap) {
         bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
     }
+    if (job) {
+        blk_unref(job->target);
+        block_job_unref(&job->common);
+    }
 }
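
backup.c now embeds the before_write notifier in the job struct so the callback can recover the job with container_of instead of going through bs->job. A self-contained sketch of that pattern (struct and field names here are illustrative stand-ins, not the QEMU ones):

    #include <stddef.h>

    struct notifier { int (*notify)(struct notifier *n, void *data); };

    struct job {
        int id;
        struct notifier before_write;   /* embedded, as in BackupBlockJob */
    };

    /* same idea as QEMU's container_of() from qemu/osdep.h */
    #define container_of(ptr, type, member) \
        ((type *) ((char *) (ptr) - offsetof(type, member)))

    static int on_write(struct notifier *n, void *data)
    {
        /* recover the enclosing job from the embedded notifier field */
        struct job *j = container_of(n, struct job, before_write);
        (void) data;
        return j->id;
    }
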
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 20d25bda6..d5db16681 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -37,6 +37,10 @@ typedef struct BDRVBlkdebugState {
     int state;
     int new_state;
+    int align;
+
+    /* For blkdebug_refresh_filename() */
+    char *config_file;
 
     QLIST_HEAD(, BlkdebugRule) rules[BLKDBG__MAX];
     QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
@@ -350,7 +354,6 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVBlkdebugState *s = bs->opaque;
     QemuOpts *opts;
     Error *local_err = NULL;
-    const char *config;
     uint64_t align;
     int ret;
 
@@ -363,8 +366,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* Read rules from config file or command line options */
-    config = qemu_opt_get(opts, "config");
-    ret = read_config(s, config, options, errp);
+    s->config_file = g_strdup(qemu_opt_get(opts, "config"));
+    ret = read_config(s, s->config_file, options, errp);
     if (ret) {
         goto out;
     }
@@ -382,10 +385,10 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* Set request alignment */
-    align = qemu_opt_get_size(opts, "align", bs->request_alignment);
-    if (align > 0 && align < INT_MAX && !(align & (align - 1))) {
-        bs->request_alignment = align;
-    } else {
+    align = qemu_opt_get_size(opts, "align", 0);
+    if (align < INT_MAX && is_power_of_2(align)) {
+        s->align = align;
+    } else if (align) {
         error_setg(errp, "Invalid alignment");
         ret = -EINVAL;
         goto fail_unref;
@@ -397,6 +400,9 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
 fail_unref:
     bdrv_unref_child(bs, bs->file);
 out:
+    if (ret < 0) {
+        g_free(s->config_file);
+    }
     qemu_opts_del(opts);
     return ret;
 }
@@ -456,7 +462,7 @@ static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
         return inject_error(bs, cb, opaque, rule);
     }
 
-    return bdrv_aio_readv(bs->file->bs, sector_num, qiov, nb_sectors,
+    return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors,
                           cb, opaque);
 }
 
@@ -479,7 +485,7 @@ static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
         return inject_error(bs, cb, opaque, rule);
     }
 
-    return bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors,
+    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
                            cb, opaque);
 }
 
@@ -514,6 +520,8 @@ static void blkdebug_close(BlockDriverState *bs)
             remove_rule(rule);
         }
     }
+
+    g_free(s->config_file);
 }
 
 static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule)
@@ -620,7 +628,7 @@ static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)
 
     QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) {
         if (!strcmp(r->tag, tag)) {
-            qemu_coroutine_enter(r->co, NULL);
+            qemu_coroutine_enter(r->co);
             return 0;
         }
     }
@@ -646,7 +654,7 @@ static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs,
     }
     QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) {
         if (!strcmp(r->tag, tag)) {
-            qemu_coroutine_enter(r->co, NULL);
+            qemu_coroutine_enter(r->co);
             ret = 0;
         }
     }
@@ -678,6 +686,7 @@ static int blkdebug_truncate(BlockDriverState *bs, int64_t offset)
 
 static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
 {
+    BDRVBlkdebugState *s = bs->opaque;
     QDict *opts;
     const QDictEntry *e;
     bool force_json = false;
@@ -699,8 +708,7 @@ static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
 
     if (!force_json && bs->file->bs->exact_filename[0]) {
         snprintf(bs->exact_filename, sizeof(bs->exact_filename),
-                 "blkdebug:%s:%s",
-                 qdict_get_try_str(options, "config") ?: "",
+                 "blkdebug:%s:%s", s->config_file ?: "",
                  bs->file->bs->exact_filename);
     }
 
@@ -720,6 +728,15 @@ static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
     bs->full_open_options = opts;
 }
 
+static void blkdebug_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+
+    if (s->align) {
+        bs->bl.request_alignment = s->align;
+    }
+}
+
 static int blkdebug_reopen_prepare(BDRVReopenState *reopen_state,
                                    BlockReopenQueue *queue, Error **errp)
 {
@@ -738,6 +755,7 @@ static BlockDriver bdrv_blkdebug = {
     .bdrv_getlength         = blkdebug_getlength,
     .bdrv_truncate          = blkdebug_truncate,
     .bdrv_refresh_filename  = blkdebug_refresh_filename,
+    .bdrv_refresh_limits    = blkdebug_refresh_limits,
 
     .bdrv_aio_readv         = blkdebug_aio_readv,
     .bdrv_aio_writev        = blkdebug_aio_writev,
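
Several files in this import repeat one mechanical change to the coroutine API: the entry argument moves from qemu_coroutine_enter() to qemu_coroutine_create(). A sketch of the before/after shape (a fragment, not standalone code; co_fn and arg are placeholders):

    /* before QEMU 2.7 */
    co = qemu_coroutine_create(co_fn);
    qemu_coroutine_enter(co, arg);

    /* from 2.7 on: the argument is bound at creation time */
    co = qemu_coroutine_create(co_fn, arg);
    qemu_coroutine_enter(co);
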
diff --git a/block/blkreplay.c b/block/blkreplay.c
index 42f1813af..30f9d5ff6 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -65,7 +65,7 @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
 static void blkreplay_bh_cb(void *opaque)
 {
     Request *req = opaque;
-    qemu_coroutine_enter(req->co, NULL);
+    qemu_coroutine_enter(req->co);
     qemu_bh_delete(req->bh);
     g_free(req);
 }
@@ -81,44 +81,44 @@ static void block_request_create(uint64_t reqid, BlockDriverState *bs,
     replay_block_event(req->bh, reqid);
 }
 
-static int coroutine_fn blkreplay_co_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs,
+    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov);
+    int ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
     return ret;
 }
 
-static int coroutine_fn blkreplay_co_writev(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs,
+    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_writev(bs->file->bs, sector_num, nb_sectors, qiov);
+    int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
     return ret;
 }
 
-static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
+    int64_t offset, int count, BdrvRequestFlags flags)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags);
+    int ret = bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
     return ret;
 }
 
-static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors)
+static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs,
+    int64_t offset, int count)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_discard(bs->file->bs, sector_num, nb_sectors);
+    int ret = bdrv_co_pdiscard(bs->file->bs, offset, count);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
@@ -144,11 +144,11 @@ static BlockDriver bdrv_blkreplay = {
     .bdrv_close             = blkreplay_close,
     .bdrv_getlength         = blkreplay_getlength,
 
-    .bdrv_co_readv          = blkreplay_co_readv,
-    .bdrv_co_writev         = blkreplay_co_writev,
+    .bdrv_co_preadv         = blkreplay_co_preadv,
+    .bdrv_co_pwritev        = blkreplay_co_pwritev,
 
-    .bdrv_co_write_zeroes   = blkreplay_co_write_zeroes,
-    .bdrv_co_discard        = blkreplay_co_discard,
+    .bdrv_co_pwrite_zeroes  = blkreplay_co_pwrite_zeroes,
+    .bdrv_co_pdiscard       = blkreplay_co_pdiscard,
 
     .bdrv_co_flush          = blkreplay_co_flush,
 };
diff --git a/block/blkverify.c b/block/blkverify.c
index 9414b7a84..da62d7596 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -247,9 +247,9 @@ static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs,
     qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov);
     qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf);
 
-    bdrv_aio_readv(s->test_file->bs, sector_num, qiov, nb_sectors,
+    bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors,
                    blkverify_aio_cb, acb);
-    bdrv_aio_readv(bs->file->bs, sector_num, &acb->raw_qiov, nb_sectors,
+    bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors,
                    blkverify_aio_cb, acb);
     return &acb->common;
 }
@@ -262,9 +262,9 @@ static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs,
     BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov,
                                             nb_sectors, cb, opaque);
 
-    bdrv_aio_writev(s->test_file->bs, sector_num, qiov, nb_sectors,
+    bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors,
                     blkverify_aio_cb, acb);
-    bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors,
+    bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
                     blkverify_aio_cb, acb);
     return &acb->common;
 }
@@ -293,22 +293,6 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs,
     return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate);
 }
 
-/* Propagate AioContext changes to ->test_file */
-static void blkverify_detach_aio_context(BlockDriverState *bs)
-{
-    BDRVBlkverifyState *s = bs->opaque;
-
-    bdrv_detach_aio_context(s->test_file->bs);
-}
-
-static void blkverify_attach_aio_context(BlockDriverState *bs,
-                                         AioContext *new_context)
-{
-    BDRVBlkverifyState *s = bs->opaque;
-
-    bdrv_attach_aio_context(s->test_file->bs, new_context);
-}
-
 static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options)
 {
     BDRVBlkverifyState *s = bs->opaque;
@@ -356,9 +340,6 @@ static BlockDriver bdrv_blkverify = {
     .bdrv_aio_writev                  = blkverify_aio_writev,
     .bdrv_aio_flush                   = blkverify_aio_flush,
 
-    .bdrv_attach_aio_context          = blkverify_attach_aio_context,
-    .bdrv_detach_aio_context          = blkverify_detach_aio_context,
-
     .is_filter                        = true,
     .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter,
 };
diff --git a/block/block-backend.c b/block/block-backend.c
index 16c9d5e0f..effa03892 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1,7 +1,7 @@
 /*
  * QEMU Block backends
  *
- * Copyright (C) 2014 Red Hat, Inc.
+ * Copyright (C) 2014-2016 Red Hat, Inc.
  *
  * Authors:
  *  Markus Armbruster <armbru@redhat.com>,
@@ -19,6 +19,7 @@
 #include "sysemu/sysemu.h"
 #include "qapi-event.h"
 #include "qemu/id.h"
+#include "trace.h"
 
 /* Number of coroutines to reserve per attached device model */
 #define COROUTINE_POOL_RESERVATION 64
@@ -34,6 +35,7 @@ struct BlockBackend {
     DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
     QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
     QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
+    BlockBackendPublic public;
 
     void *dev;                  /* attached device model, if any */
     /* TODO change to DeviceState when all users are qdevified */
@@ -74,6 +76,7 @@ static const AIOCBInfo block_backend_aiocb_info = {
 };
 
 static void drive_info_del(DriveInfo *dinfo);
+static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
 
 /* All BlockBackends */
 static QTAILQ_HEAD(, BlockBackend) block_backends =
@@ -90,9 +93,26 @@ static void blk_root_inherit_options(int *child_flags, QDict *child_options,
     /* We're not supposed to call this function for root nodes */
     abort();
 }
+static void blk_root_drained_begin(BdrvChild *child);
+static void blk_root_drained_end(BdrvChild *child);
+
+static void blk_root_change_media(BdrvChild *child, bool load);
+static void blk_root_resize(BdrvChild *child);
+
+static const char *blk_root_get_name(BdrvChild *child)
+{
+    return blk_name(child->opaque);
+}
 
 static const BdrvChildRole child_root = {
-    .inherit_options = blk_root_inherit_options,
+    .inherit_options    = blk_root_inherit_options,
+
+    .change_media       = blk_root_change_media,
+    .resize             = blk_root_resize,
+    .get_name           = blk_root_get_name,
+
+    .drained_begin      = blk_root_drained_begin,
+    .drained_end        = blk_root_drained_end,
 };
 
 /*
@@ -100,40 +120,26 @@ static const BdrvChildRole child_root = {
  * Store an error through @errp on failure, unless it's null.
  * Return the new BlockBackend on success, null on failure.
  */
-BlockBackend *blk_new(Error **errp)
+BlockBackend *blk_new(void)
 {
     BlockBackend *blk;
 
     blk = g_new0(BlockBackend, 1);
     blk->refcnt = 1;
-    notifier_list_init(&blk->remove_bs_notifiers);
-    notifier_list_init(&blk->insert_bs_notifiers);
-    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
-    return blk;
-}
+    blk_set_enable_write_cache(blk, true);
 
-/*
- * Create a new BlockBackend with a new BlockDriverState attached.
- * Otherwise just like blk_new(), which see.
- */
-BlockBackend *blk_new_with_bs(Error **errp)
-{
-    BlockBackend *blk;
-    BlockDriverState *bs;
+    qemu_co_queue_init(&blk->public.throttled_reqs[0]);
+    qemu_co_queue_init(&blk->public.throttled_reqs[1]);
 
-    blk = blk_new(errp);
-    if (!blk) {
-        return NULL;
-    }
+    notifier_list_init(&blk->remove_bs_notifiers);
+    notifier_list_init(&blk->insert_bs_notifiers);
 
-    bs = bdrv_new_root();
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    bs->blk = blk;
+    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
     return blk;
 }
 
 /*
- * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState.
+ * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
  *
  * Just as with bdrv_open(), after having called this function the reference to
  * @options belongs to the block layer (even on failure).
@@ -148,21 +154,16 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp)
 {
     BlockBackend *blk;
-    int ret;
-
-    blk = blk_new_with_bs(errp);
-    if (!blk) {
-        QDECREF(options);
-        return NULL;
-    }
+    BlockDriverState *bs;
 
-    ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp);
-    if (ret < 0) {
+    blk = blk_new();
+    bs = bdrv_open(filename, reference, options, flags, errp);
+    if (!bs) {
         blk_unref(blk);
         return NULL;
     }
 
-    blk_set_enable_write_cache(blk, true);
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
 
     return blk;
 }
@@ -177,10 +178,6 @@ static void blk_delete(BlockBackend *blk)
     }
     assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
     assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
-    if (blk->root_state.throttle_state) {
-        g_free(blk->root_state.throttle_group);
-        throttle_group_unref(blk->root_state.throttle_state);
-    }
     QTAILQ_REMOVE(&block_backends, blk, link);
     drive_info_del(blk->legacy_dinfo);
     block_acct_cleanup(&blk->stats);
@@ -267,28 +264,45 @@ BlockBackend *blk_next(BlockBackend *blk)
            : QTAILQ_FIRST(&monitor_block_backends);
 }
 
-/*
- * Iterates over all BlockDriverStates which are attached to a BlockBackend.
- * This function is for use by bdrv_next().
- *
- * @bs must be NULL or a BDS that is attached to a BB.
- */
-BlockDriverState *blk_next_root_bs(BlockDriverState *bs)
+/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
+ * the monitor or attached to a BlockBackend */
+BlockDriverState *bdrv_next(BdrvNextIterator *it)
 {
-    BlockBackend *blk;
+    BlockDriverState *bs;
 
-    if (bs) {
-        assert(bs->blk);
-        blk = bs->blk;
-    } else {
-        blk = NULL;
+    /* First, return all root nodes of BlockBackends. In order to avoid
+     * returning a BDS twice when multiple BBs refer to it, we only return it
+     * if the BB is the first one in the parent list of the BDS. */
+    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
+        do {
+            it->blk = blk_all_next(it->blk);
+            bs = it->blk ? blk_bs(it->blk) : NULL;
+        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
+
+        if (bs) {
+            return bs;
+        }
+        it->phase = BDRV_NEXT_MONITOR_OWNED;
     }
 
+    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
+     * BDSes that are attached to a BlockBackend here; they have been handled
+     * by the above block already */
     do {
-        blk = blk_all_next(blk);
-    } while (blk && !blk->root);
+        it->bs = bdrv_next_monitor_owned(it->bs);
+        bs = it->bs;
+    } while (bs && bdrv_has_blk(bs));
+
+    return bs;
+}
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it)
+{
+    *it = (BdrvNextIterator) {
+        .phase = BDRV_NEXT_BACKEND_ROOTS,
+    };
 
-    return blk ? blk->root->bs : NULL;
+    return bdrv_next(it);
 }
 
 /*
@@ -375,6 +389,26 @@ BlockDriverState *blk_bs(BlockBackend *blk)
     return blk->root ? blk->root->bs : NULL;
 }
 
+static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
+{
+    BdrvChild *child;
+    QLIST_FOREACH(child, &bs->parents, next_parent) {
+        if (child->role == &child_root) {
+            return child->opaque;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Returns true if @bs has an associated BlockBackend.
+ */
+bool bdrv_has_blk(BlockDriverState *bs)
+{
+    return bdrv_first_blk(bs) != NULL;
+}
+
 /*
  * Return @blk's DriveInfo if any, else null.
 */
@@ -411,17 +445,33 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
 }
 
 /*
+ * Returns a pointer to the publicly accessible fields of @blk.
+ */
+BlockBackendPublic *blk_get_public(BlockBackend *blk)
+{
+    return &blk->public;
+}
+
+/*
+ * Returns a BlockBackend given the associated @public fields.
+ */
+BlockBackend *blk_by_public(BlockBackendPublic *public)
+{
+    return container_of(public, BlockBackend, public);
+}
+
+/*
 * Disassociates the currently associated BlockDriverState from @blk.
 */
 void blk_remove_bs(BlockBackend *blk)
 {
-    assert(blk->root->bs->blk == blk);
-
     notifier_list_notify(&blk->remove_bs_notifiers, blk);
+    if (blk->public.throttle_state) {
+        throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+    }
 
     blk_update_root_state(blk);
 
-    blk->root->bs->blk = NULL;
     bdrv_root_unref_child(blk->root);
     blk->root = NULL;
 }
@@ -431,12 +481,14 @@ void blk_remove_bs(BlockBackend *blk)
 */
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
 {
-    assert(!blk->root && !bs->blk);
     bdrv_ref(bs);
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    bs->blk = blk;
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
 
     notifier_list_notify(&blk->insert_bs_notifiers, blk);
+    if (blk->public.throttle_state) {
+        throttle_timers_attach_aio_context(
+            &blk->public.throttle_timers, bdrv_get_aio_context(bs));
+    }
 }
 
 /*
@@ -525,6 +577,11 @@ void blk_dev_change_media_cb(BlockBackend *blk, bool load)
     }
 }
 
+static void blk_root_change_media(BdrvChild *child, bool load)
+{
+    blk_dev_change_media_cb(child->opaque, load);
+}
+
 /*
 * Does @blk's attached device model have removable media?
 * %true if no device model is attached.
@@ -579,8 +636,10 @@ bool blk_dev_is_medium_locked(BlockBackend *blk)
 /*
 * Notify @blk's attached device model of a backend size change.
 */
-void blk_dev_resize_cb(BlockBackend *blk)
+static void blk_root_resize(BdrvChild *child)
 {
+    BlockBackend *blk = child->opaque;
+
     if (blk->dev_ops && blk->dev_ops->resize_cb) {
         blk->dev_ops->resize_cb(blk->dev_opaque);
     }
@@ -683,34 +742,50 @@ static int blk_check_request(BlockBackend *blk, int64_t sector_num,
                                   nb_sectors * BDRV_SECTOR_SIZE);
 }
 
-static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
+int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
+                               unsigned int bytes, QEMUIOVector *qiov,
+                               BdrvRequestFlags flags)
 {
-    int ret = blk_check_byte_request(blk, offset, bytes);
+    int ret;
+
+    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+
+    ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
 
-    return bdrv_co_do_preadv(blk_bs(blk), offset, bytes, qiov, flags);
+    /* throttling disk I/O */
+    if (blk->public.throttle_state) {
+        throttle_group_co_io_limits_intercept(blk, bytes, false);
+    }
+
+    return bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
 }
 
-static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                                       unsigned int bytes, QEMUIOVector *qiov,
-                                       BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                                unsigned int bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags)
 {
     int ret;
 
+    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+
     ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
 
+    /* throttling disk I/O */
+    if (blk->public.throttle_state) {
+        throttle_group_co_io_limits_intercept(blk, bytes, true);
+    }
+
     if (!blk->enable_write_cache) {
         flags |= BDRV_REQ_FUA;
     }
 
-    return bdrv_co_do_pwritev(blk_bs(blk), offset, bytes, qiov, flags);
+    return bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
 }
 
 typedef struct BlkRwCo {
@@ -761,8 +836,8 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
         .ret    = NOT_DONE,
     };
 
-    co = qemu_coroutine_create(co_entry);
-    qemu_coroutine_enter(co, &rwco);
+    co = qemu_coroutine_create(co_entry, &rwco);
+    qemu_coroutine_enter(co);
 
     aio_context = blk_get_aio_context(blk);
     while (rwco.ret == NOT_DONE) {
@@ -772,55 +847,32 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
     return rwco.ret;
 }
 
-static int blk_rw(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-                  int nb_sectors, CoroutineEntry co_entry,
-                  BdrvRequestFlags flags)
+int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
+                          int count)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return -EINVAL;
-    }
-
-    return blk_prw(blk, sector_num << BDRV_SECTOR_BITS, buf,
-                   nb_sectors << BDRV_SECTOR_BITS, co_entry, flags);
-}
-
-int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-             int nb_sectors)
-{
-    return blk_rw(blk, sector_num, buf, nb_sectors, blk_read_entry, 0);
-}
-
-int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf,
-                         int nb_sectors)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    bool enabled;
     int ret;
 
-    ret = blk_check_request(blk, sector_num, nb_sectors);
+    ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return ret;
     }
 
-    enabled = bs->io_limits_enabled;
-    bs->io_limits_enabled = false;
-    ret = blk_read(blk, sector_num, buf, nb_sectors);
-    bs->io_limits_enabled = enabled;
+    blk_root_drained_begin(blk->root);
+    ret = blk_pread(blk, offset, buf, count);
+    blk_root_drained_end(blk->root);
     return ret;
 }
 
-int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
-              int nb_sectors)
+int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                      int count, BdrvRequestFlags flags)
 {
-    return blk_rw(blk, sector_num, (uint8_t*) buf, nb_sectors,
-                  blk_write_entry, 0);
+    return blk_prw(blk, offset, NULL, count, blk_write_entry,
+                   flags | BDRV_REQ_ZERO_WRITE);
 }
 
-int blk_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                     int nb_sectors, BdrvRequestFlags flags)
+int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 {
-    return blk_rw(blk, sector_num, NULL, nb_sectors, blk_write_entry,
-                  flags | BDRV_REQ_ZERO_WRITE);
+    return bdrv_make_zero(blk->root, flags);
 }
 
 static void error_callback_bh(void *opaque)
@@ -898,8 +950,8 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
     acb->bh = NULL;
     acb->has_returned = false;
 
-    co = qemu_coroutine_create(co_entry);
-    qemu_coroutine_enter(co, acb);
+    co = qemu_coroutine_create(co_entry, acb);
+    qemu_coroutine_enter(co);
 
     acb->has_returned = true;
     if (acb->rwco.ret != NOT_DONE) {
@@ -932,18 +984,12 @@ static void blk_aio_write_entry(void *opaque)
     blk_aio_complete(acb);
 }
 
-BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                                 int nb_sectors, BdrvRequestFlags flags,
-                                 BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                  int count, BdrvRequestFlags flags,
+                                  BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS,
-                        nb_sectors << BDRV_SECTOR_BITS, NULL,
-                        blk_aio_write_entry, flags | BDRV_REQ_ZERO_WRITE,
-                        cb, opaque);
+    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
+                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
 }
 
 int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
@@ -955,9 +1001,11 @@ int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
     return count;
 }
 
-int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count)
+int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
+               BdrvRequestFlags flags)
 {
-    int ret = blk_prw(blk, offset, (void*) buf, count, blk_write_entry, 0);
+    int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
+                      flags);
     if (ret < 0) {
         return ret;
     }
@@ -991,30 +1039,20 @@ int64_t blk_nb_sectors(BlockBackend *blk)
     return bdrv_nb_sectors(blk_bs(blk));
 }
 
-BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
-                          QEMUIOVector *iov, int nb_sectors,
-                          BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
+                           QEMUIOVector *qiov, BdrvRequestFlags flags,
+                           BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
-                        blk_aio_read_entry, 0, cb, opaque);
+    return blk_aio_prwv(blk, offset, qiov->size, qiov,
+                        blk_aio_read_entry, flags, cb, opaque);
 }
 
-BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
-                           QEMUIOVector *iov, int nb_sectors,
-                           BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
+                            QEMUIOVector *qiov, BdrvRequestFlags flags,
+                            BlockCompletionFunc *cb, void *opaque)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return blk_abort_aio_request(blk, cb, opaque, -EINVAL);
-    }
-
-    assert(nb_sectors << BDRV_SECTOR_BITS == iov->size);
-    return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov,
-                        blk_aio_write_entry, 0, cb, opaque);
+    return blk_aio_prwv(blk, offset, qiov->size, qiov,
+                        blk_aio_write_entry, flags, cb, opaque);
 }
 
 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
@@ -1027,16 +1065,16 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk,
     return bdrv_aio_flush(blk_bs(blk), cb, opaque);
 }
 
-BlockAIOCB *blk_aio_discard(BlockBackend *blk,
-                            int64_t sector_num, int nb_sectors,
-                            BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
+                             int64_t offset, int count,
+                             BlockCompletionFunc *cb, void *opaque)
 {
-    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    int ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return blk_abort_aio_request(blk, cb, opaque, ret);
     }
 
-    return bdrv_aio_discard(blk_bs(blk), sector_num, nb_sectors, cb, opaque);
+    return bdrv_aio_pdiscard(blk_bs(blk), offset, count, cb, opaque);
 }
 
 void blk_aio_cancel(BlockAIOCB *acb)
@@ -1049,20 +1087,6 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
     bdrv_aio_cancel_async(acb);
 }
 
-int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs)
-{
-    int i, ret;
-
-    for (i = 0; i < num_reqs; i++) {
-        ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors);
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
-    return bdrv_aio_multiwrite(blk_bs(blk), reqs, num_reqs);
-}
-
 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
     if (!blk_is_available(blk)) {
@@ -1082,14 +1106,14 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
     return bdrv_aio_ioctl(blk_bs(blk), req, buf, cb, opaque);
 }
 
-int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count)
 {
-    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    int ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return ret;
     }
 
-    return bdrv_co_discard(blk_bs(blk), sector_num, nb_sectors);
+    return bdrv_co_pdiscard(blk_bs(blk), offset, count);
 }
 
 int blk_co_flush(BlockBackend *blk)
@@ -1149,6 +1173,7 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
         return BLOCK_ERROR_ACTION_REPORT;
     case BLOCKDEV_ON_ERROR_IGNORE:
         return BLOCK_ERROR_ACTION_IGNORE;
+    case BLOCKDEV_ON_ERROR_AUTO:
     default:
         abort();
     }
@@ -1284,15 +1309,16 @@ int blk_get_flags(BlockBackend *blk)
     }
 }
 
-int blk_get_max_transfer_length(BlockBackend *blk)
+/* Returns the maximum transfer length, in bytes; guaranteed nonzero */
+uint32_t blk_get_max_transfer(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
+    uint32_t max = 0;
 
     if (bs) {
-        return bs->bl.max_transfer_length;
-    } else {
-        return 0;
+        max = bs->bl.max_transfer;
     }
+    return MIN_NON_ZERO(max, INT_MAX);
 }
 
 int blk_get_max_iov(BlockBackend *blk)
@@ -1375,7 +1401,14 @@ void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
     BlockDriverState *bs = blk_bs(blk);
 
     if (bs) {
+        if (blk->public.throttle_state) {
+            throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+        }
         bdrv_set_aio_context(bs, new_context);
+        if (blk->public.throttle_state) {
+            throttle_timers_attach_aio_context(&blk->public.throttle_timers,
+                                               new_context);
+        }
     }
 }
 
@@ -1444,15 +1477,10 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
     return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
 }
 
-int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num,
-                                     int nb_sectors, BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                      int count, BdrvRequestFlags flags)
 {
-    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
-        return -EINVAL;
-    }
-
-    return blk_co_pwritev(blk, sector_num << BDRV_SECTOR_BITS,
-                          nb_sectors << BDRV_SECTOR_BITS, NULL,
+    return blk_co_pwritev(blk, offset, count, NULL,
                           flags | BDRV_REQ_ZERO_WRITE);
 }
 
@@ -1476,14 +1504,14 @@ int blk_truncate(BlockBackend *blk, int64_t offset)
     return bdrv_truncate(blk_bs(blk), offset);
 }
 
-int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+int blk_pdiscard(BlockBackend *blk, int64_t offset, int count)
 {
-    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    int ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return ret;
     }
 
-    return bdrv_discard(blk_bs(blk), sector_num, nb_sectors);
+    return bdrv_pdiscard(blk_bs(blk), offset, count);
 }
 
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
@@ -1545,19 +1573,6 @@ void blk_update_root_state(BlockBackend *blk)
     blk->root_state.open_flags    = blk->root->bs->open_flags;
     blk->root_state.read_only     = blk->root->bs->read_only;
     blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
-
-    if (blk->root_state.throttle_group) {
-        g_free(blk->root_state.throttle_group);
-        throttle_group_unref(blk->root_state.throttle_state);
-    }
-    if (blk->root->bs->throttle_state) {
-        const char *name = throttle_group_get_name(blk->root->bs);
-        blk->root_state.throttle_group = g_strdup(name);
-        blk->root_state.throttle_state = throttle_group_incref(name);
-    } else {
-        blk->root_state.throttle_group = NULL;
-        blk->root_state.throttle_state = NULL;
-    }
 }
 
 /*
@@ -1568,9 +1583,6 @@ void blk_update_root_state(BlockBackend *blk)
 void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
 {
     bs->detect_zeroes = blk->root_state.detect_zeroes;
-    if (blk->root_state.throttle_group) {
-        bdrv_io_limits_enable(bs, blk->root_state.throttle_group);
-    }
 }
 
 /*
@@ -1633,3 +1645,62 @@ int blk_flush_all(void)
 
     return result;
 }
+
+
+/* throttling disk I/O limits */
+void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
+{
+    throttle_group_config(blk, cfg);
+}
+
+void blk_io_limits_disable(BlockBackend *blk)
+{
+    assert(blk->public.throttle_state);
+    bdrv_drained_begin(blk_bs(blk));
+    throttle_group_unregister_blk(blk);
+    bdrv_drained_end(blk_bs(blk));
+}
+
+/* should be called before blk_set_io_limits if a limit is set */
+void blk_io_limits_enable(BlockBackend *blk, const char *group)
+{
+    assert(!blk->public.throttle_state);
+    throttle_group_register_blk(blk, group);
+}
+
+void blk_io_limits_update_group(BlockBackend *blk, const char *group)
+{
+    /* this BB is not part of any group */
+    if (!blk->public.throttle_state) {
+        return;
+    }
+
+    /* this BB is a part of the same group than the one we want */
+    if (!g_strcmp0(throttle_group_get_name(blk), group)) {
+        return;
+    }
+
+    /* need to change the group this bs belong to */
+    blk_io_limits_disable(blk);
+    blk_io_limits_enable(blk, group);
+}
+
+static void blk_root_drained_begin(BdrvChild *child)
+{
+    BlockBackend *blk = child->opaque;
+
+    /* Note that blk->root may not be accessible here yet if we are just
+     * attaching to a BlockDriverState that is drained. Use child instead. */
+
+    if (blk->public.io_limits_disabled++ == 0) {
+        throttle_group_restart_blk(blk);
+    }
+}
+
+static void blk_root_drained_end(BdrvChild *child)
+{
+    BlockBackend *blk = child->opaque;
+
+    assert(blk->public.io_limits_disabled);
+    --blk->public.io_limits_disabled;
+}
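
block-backend.c replaces blk_next_root_bs() with a two-phase iterator. Typical use of the new interface, as suggested by the bdrv_first()/bdrv_next() pair added above (the loop body is illustrative; the types come from the QEMU headers):

    BdrvNextIterator it;
    BlockDriverState *bs;

    /* visit every top-level BDS exactly once: first the root nodes of all
     * BlockBackends, then monitor-owned nodes without a BB attached */
    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        /* ... operate on bs ... */
    }
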
diff --git a/block/bochs.c b/block/bochs.c
index af8b7abdf..8c9652ebe 100644
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -27,6 +27,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 
 /**************************************************************/
 
@@ -103,9 +104,9 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
     struct bochs_header bochs;
     int ret;
 
-    bs->read_only = 1; // no write support yet
+    bs->read_only = true; /* no write support yet */
 
-    ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs));
+    ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs));
     if (ret < 0) {
         return ret;
     }
@@ -139,7 +140,7 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
         return -ENOMEM;
     }
 
-    ret = bdrv_pread(bs->file->bs, le32_to_cpu(bochs.header), s->catalog_bitmap,
+    ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap,
                      s->catalog_size * 4);
     if (ret < 0) {
         goto fail;
@@ -187,6 +188,11 @@ fail:
     return ret;
 }
 
+static void bochs_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
+}
+
 static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
 {
     BDRVBochsState *s = bs->opaque;
@@ -208,7 +214,7 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
         (s->extent_blocks + s->bitmap_blocks));
 
     /* read in bitmap for current extent */
-    ret = bdrv_pread(bs->file->bs, bitmap_offset + (extent_offset / 8),
+    ret = bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8),
                      &bitmap_entry, 1);
     if (ret < 0) {
         return ret;
@@ -221,38 +227,52 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
     return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
 }
 
-static int bochs_read(BlockDriverState *bs, int64_t sector_num,
-                      uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
+    BDRVBochsState *s = bs->opaque;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    uint64_t bytes_done = 0;
+    QEMUIOVector local_qiov;
     int ret;
 
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_iovec_init(&local_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+
     while (nb_sectors > 0) {
         int64_t block_offset = seek_to_sector(bs, sector_num);
         if (block_offset < 0) {
-            return block_offset;
-        } else if (block_offset > 0) {
-            ret = bdrv_pread(bs->file->bs, block_offset, buf, 512);
+            ret = block_offset;
+            goto fail;
+        }
+
+        qemu_iovec_reset(&local_qiov);
+        qemu_iovec_concat(&local_qiov, qiov, bytes_done, 512);
+
+        if (block_offset > 0) {
+            ret = bdrv_co_preadv(bs->file, block_offset, 512,
+                                 &local_qiov, 0);
             if (ret < 0) {
-                return ret;
+                goto fail;
             }
         } else {
-            memset(buf, 0, 512);
+            qemu_iovec_memset(&local_qiov, 0, 0, 512);
         }
         nb_sectors--;
         sector_num++;
-        buf += 512;
+        bytes_done += 512;
     }
-    return 0;
-}
 
-static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVBochsState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = bochs_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
     qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&local_qiov);
+
     return ret;
 }
 
@@ -267,7 +287,8 @@ static BlockDriver bdrv_bochs = {
     .instance_size	= sizeof(BDRVBochsState),
     .bdrv_probe		= bochs_probe,
     .bdrv_open		= bochs_open,
-    .bdrv_read		= bochs_co_read,
+    .bdrv_refresh_limits = bochs_refresh_limits,
+    .bdrv_co_preadv = bochs_co_preadv,
     .bdrv_close		= bochs_close,
 };
diff --git a/block/cloop.c b/block/cloop.c
index a84f14019..7b75f7ef7 100644
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -26,6 +26,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qemu/bswap.h"
 #include <zlib.h>
 
 /* Maximum compressed block size */
@@ -65,10 +66,10 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
     uint32_t offsets_size, max_compressed_block_size = 1, i;
     int ret;
 
-    bs->read_only = 1;
+    bs->read_only = true;
 
     /* read header */
-    ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4);
+    ret = bdrv_pread(bs->file, 128, &s->block_size, 4);
     if (ret < 0) {
         return ret;
     }
@@ -94,7 +95,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
         return -EINVAL;
     }
 
-    ret = bdrv_pread(bs->file->bs, 128 + 4, &s->n_blocks, 4);
+    ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4);
     if (ret < 0) {
         return ret;
     }
@@ -125,7 +126,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
         return -ENOMEM;
     }
 
-    ret = bdrv_pread(bs->file->bs, 128 + 4 + 4, s->offsets, offsets_size);
+    ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size);
     if (ret < 0) {
         goto fail;
     }
@@ -197,6 +198,11 @@ fail:
     return ret;
 }
 
+static void cloop_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */
+}
+
 static inline int cloop_read_block(BlockDriverState *bs, int block_num)
 {
     BDRVCloopState *s = bs->opaque;
@@ -205,7 +211,7 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
         int ret;
         uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num];
 
-        ret = bdrv_pread(bs->file->bs, s->offsets[block_num],
+        ret = bdrv_pread(bs->file, s->offsets[block_num],
                          s->compressed_block, bytes);
         if (ret != bytes) {
             return -1;
@@ -229,33 +235,38 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num)
     return 0;
 }
 
-static int cloop_read(BlockDriverState *bs, int64_t sector_num,
-                      uint8_t *buf, int nb_sectors)
+static int coroutine_fn
+cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                QEMUIOVector *qiov, int flags)
 {
     BDRVCloopState *s = bs->opaque;
-    int i;
+    uint64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+    int ret, i;
+
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    qemu_co_mutex_lock(&s->lock);
 
     for (i = 0; i < nb_sectors; i++) {
+        void *data;
         uint32_t sector_offset_in_block =
             ((sector_num + i) % s->sectors_per_block),
             block_num = (sector_num + i) / s->sectors_per_block;
+
         if (cloop_read_block(bs, block_num) != 0) {
-            return -1;
+            ret = -EIO;
+            goto fail;
         }
-        memcpy(buf + i * 512,
-               s->uncompressed_block + sector_offset_in_block * 512, 512);
+
+        data = s->uncompressed_block + sector_offset_in_block * 512;
+        qemu_iovec_from_buf(qiov, i * 512, data, 512);
     }
-    return 0;
-}
 
-static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num,
-                                      uint8_t *buf, int nb_sectors)
-{
-    int ret;
-    BDRVCloopState *s = bs->opaque;
-    qemu_co_mutex_lock(&s->lock);
-    ret = cloop_read(bs, sector_num, buf, nb_sectors);
+    ret = 0;
+fail:
     qemu_co_mutex_unlock(&s->lock);
+
     return ret;
 }
 
@@ -273,7 +284,8 @@ static BlockDriver bdrv_cloop = {
     .instance_size  = sizeof(BDRVCloopState),
     .bdrv_probe     = cloop_probe,
     .bdrv_open      = cloop_open,
-    .bdrv_read      = cloop_co_read,
+    .bdrv_refresh_limits = cloop_refresh_limits,
+    .bdrv_co_preadv = cloop_co_preadv,
     .bdrv_close     = cloop_close,
 };
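
bochs and cloop both convert .bdrv_read into .bdrv_co_preadv by copying each decoded sector into the caller's scatter/gather list instead of a flat buffer. The core of that pattern, assuming a 512-byte sector held in a local buffer named block (a simplified stand-in for s->uncompressed_block):

    /* copy one 512-byte sector into the guest's QEMUIOVector, at byte
     * position i * 512 within the request */
    qemu_iovec_from_buf(qiov, i * 512,
                        block + sector_offset_in_block * 512, 512);
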
g_free(s->backing_file_str); + blk_unref(s->top); + blk_unref(s->base); block_job_completed(&s->common, ret); g_free(data); } @@ -102,42 +112,39 @@ static void coroutine_fn commit_run(void *opaque) { CommitBlockJob *s = opaque; CommitCompleteData *data; - BlockDriverState *top = s->top; - BlockDriverState *base = s->base; int64_t sector_num, end; + uint64_t delay_ns = 0; int ret = 0; int n = 0; void *buf = NULL; int bytes_written = 0; int64_t base_len; - ret = s->common.len = bdrv_getlength(top); + ret = s->common.len = blk_getlength(s->top); if (s->common.len < 0) { goto out; } - ret = base_len = bdrv_getlength(base); + ret = base_len = blk_getlength(s->base); if (base_len < 0) { goto out; } if (base_len < s->common.len) { - ret = bdrv_truncate(base, s->common.len); + ret = blk_truncate(s->base, s->common.len); if (ret) { goto out; } } end = s->common.len >> BDRV_SECTOR_BITS; - buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE); + buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE); for (sector_num = 0; sector_num < end; sector_num += n) { - uint64_t delay_ns = 0; bool copy; -wait: /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ @@ -146,25 +153,20 @@ wait: break; } /* Copy if allocated above the base */ - ret = bdrv_is_allocated_above(top, base, sector_num, + ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), + sector_num, COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); copy = (ret == 1); trace_commit_one_iteration(s, sector_num, n, ret); if (copy) { - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, n); - if (delay_ns > 0) { - goto wait; - } - } - ret = commit_populate(top, base, sector_num, n, buf); + ret = commit_populate(s->top, s->base, sector_num, n, buf); bytes_written += n * BDRV_SECTOR_SIZE; } if (ret < 0) { - if (s->on_error == BLOCKDEV_ON_ERROR_STOP || - s->on_error == BLOCKDEV_ON_ERROR_REPORT|| - (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) { + BlockErrorAction action = + block_job_error_action(&s->common, false, s->on_error, -ret); + if (action == BLOCK_ERROR_ACTION_REPORT) { goto out; } else { n = 0; @@ -173,6 +175,10 @@ wait: } /* Publish progress */ s->common.offset += n * BDRV_SECTOR_SIZE; + + if (copy && s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, n); + } } ret = 0; @@ -202,8 +208,8 @@ static const BlockJobDriver commit_job_driver = { .set_speed = commit_set_speed, }; -void commit_start(BlockDriverState *bs, BlockDriverState *base, - BlockDriverState *top, int64_t speed, +void commit_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *base, BlockDriverState *top, int64_t speed, BlockdevOnError on_error, BlockCompletionFunc *cb, void *opaque, const char *backing_file_str, Error **errp) { @@ -214,13 +220,6 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, BlockDriverState *overlay_bs; Error *local_err = NULL; - if ((on_error == BLOCKDEV_ON_ERROR_STOP || - on_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, "Invalid parameter combination"); - return; - } - assert(top != bs); if (top == base) { error_setg(errp, "Invalid files for merge: top and base are the same"); @@ -234,6 +233,12 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, return; } + s = block_job_create(job_id, &commit_job_driver, bs, speed, + cb, opaque, errp); + if (!s) { + return; + } + orig_base_flags = bdrv_get_flags(base); orig_overlay_flags = 
bdrv_get_flags(overlay_bs); @@ -250,18 +255,18 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, bdrv_reopen_multiple(reopen_queue, &local_err); if (local_err != NULL) { error_propagate(errp, local_err); + block_job_unref(&s->common); return; } } - s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp); - if (!s) { - return; - } + s->base = blk_new(); + blk_insert_bs(s->base, base); + + s->top = blk_new(); + blk_insert_bs(s->top, top); - s->base = base; - s->top = top; s->active = bs; s->base_flags = orig_base_flags; @@ -270,8 +275,129 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, s->backing_file_str = g_strdup(backing_file_str); s->on_error = on_error; - s->common.co = qemu_coroutine_create(commit_run); + s->common.co = qemu_coroutine_create(commit_run, s); trace_commit_start(bs, base, top, s, s->common.co, opaque); - qemu_coroutine_enter(s->common.co, s); + qemu_coroutine_enter(s->common.co); +} + + +#define COMMIT_BUF_SECTORS 2048 + +/* commit COW file into the raw image */ +int bdrv_commit(BlockDriverState *bs) +{ + BlockBackend *src, *backing; + BlockDriver *drv = bs->drv; + int64_t sector, total_sectors, length, backing_length; + int n, ro, open_flags; + int ret = 0; + uint8_t *buf = NULL; + + if (!drv) + return -ENOMEDIUM; + + if (!bs->backing) { + return -ENOTSUP; + } + + if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) || + bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) { + return -EBUSY; + } + + ro = bs->backing->bs->read_only; + open_flags = bs->backing->bs->open_flags; + + if (ro) { + if (bdrv_reopen(bs->backing->bs, open_flags | BDRV_O_RDWR, NULL)) { + return -EACCES; + } + } + + src = blk_new(); + blk_insert_bs(src, bs); + + backing = blk_new(); + blk_insert_bs(backing, bs->backing->bs); + + length = blk_getlength(src); + if (length < 0) { + ret = length; + goto ro_cleanup; + } + + backing_length = blk_getlength(backing); + if (backing_length < 0) { + ret = backing_length; + goto ro_cleanup; + } + + /* If our top snapshot is larger than the backing file image, + * grow the backing file image if possible. If not possible, + * we must return an error */ + if (length > backing_length) { + ret = blk_truncate(backing, length); + if (ret < 0) { + goto ro_cleanup; + } + } + + total_sectors = length >> BDRV_SECTOR_BITS; + + /* blk_try_blockalign() for src will choose an alignment that works for + * backing as well, so no need to compare the alignment manually. */ + buf = blk_try_blockalign(src, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); + if (buf == NULL) { + ret = -ENOMEM; + goto ro_cleanup; + } + + for (sector = 0; sector < total_sectors; sector += n) { + ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n); + if (ret < 0) { + goto ro_cleanup; + } + if (ret) { + ret = blk_pread(src, sector * BDRV_SECTOR_SIZE, buf, + n * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto ro_cleanup; + } + + ret = blk_pwrite(backing, sector * BDRV_SECTOR_SIZE, buf, + n * BDRV_SECTOR_SIZE, 0); + if (ret < 0) { + goto ro_cleanup; + } + } + } + + if (drv->bdrv_make_empty) { + ret = drv->bdrv_make_empty(bs); + if (ret < 0) { + goto ro_cleanup; + } + blk_flush(src); + } + + /* + * Make sure all data we wrote to the backing device is actually + * stable on disk. 
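 * (Illustrative aside: this flush is also ordered before ro_cleanup below,
 * so the copied data reaches the backing file before it may be reopened
 * read-only again.)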
+ */ + blk_flush(backing); + + ret = 0; +ro_cleanup: + qemu_vfree(buf); + + blk_unref(src); + blk_unref(backing); + + if (ro) { + /* ignoring error return here */ + bdrv_reopen(bs->backing->bs, open_flags & ~BDRV_O_RDWR, NULL); + } + + return ret; } diff --git a/block/crypto.c b/block/crypto.c index 1903e84fb..7f61e1268 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -64,7 +64,7 @@ static ssize_t block_crypto_read_func(QCryptoBlock *block, BlockDriverState *bs = opaque; ssize_t ret; - ret = bdrv_pread(bs->file->bs, offset, buf, buflen); + ret = bdrv_pread(bs->file, offset, buf, buflen); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read encryption header"); return ret; @@ -91,7 +91,7 @@ static ssize_t block_crypto_write_func(QCryptoBlock *block, struct BlockCryptoCreateData *data = opaque; ssize_t ret; - ret = blk_pwrite(data->blk, offset, buf, buflen); + ret = blk_pwrite(data->blk, offset, buf, buflen, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write encryption header"); return ret; @@ -193,18 +193,16 @@ block_crypto_open_opts_init(QCryptoBlockFormat format, QemuOpts *opts, Error **errp) { - OptsVisitor *ov; + Visitor *v; QCryptoBlockOpenOptions *ret = NULL; Error *local_err = NULL; - Error *end_err = NULL; ret = g_new0(QCryptoBlockOpenOptions, 1); ret->format = format; - ov = opts_visitor_new(opts); + v = opts_visitor_new(opts); - visit_start_struct(opts_get_visitor(ov), - NULL, NULL, 0, &local_err); + visit_start_struct(v, NULL, NULL, 0, &local_err); if (local_err) { goto out; } @@ -212,16 +210,18 @@ block_crypto_open_opts_init(QCryptoBlockFormat format, switch (format) { case Q_CRYPTO_BLOCK_FORMAT_LUKS: visit_type_QCryptoBlockOptionsLUKS_members( - opts_get_visitor(ov), &ret->u.luks, &local_err); + v, &ret->u.luks, &local_err); break; default: error_setg(&local_err, "Unsupported block format %d", format); break; } + if (!local_err) { + visit_check_struct(v, &local_err); + } - visit_end_struct(opts_get_visitor(ov), &end_err); - error_propagate(&local_err, end_err); + visit_end_struct(v, NULL); out: if (local_err) { @@ -229,7 +229,7 @@ block_crypto_open_opts_init(QCryptoBlockFormat format, qapi_free_QCryptoBlockOpenOptions(ret); ret = NULL; } - opts_visitor_cleanup(ov); + visit_free(v); return ret; } @@ -239,18 +239,16 @@ block_crypto_create_opts_init(QCryptoBlockFormat format, QemuOpts *opts, Error **errp) { - OptsVisitor *ov; + Visitor *v; QCryptoBlockCreateOptions *ret = NULL; Error *local_err = NULL; - Error *end_err = NULL; ret = g_new0(QCryptoBlockCreateOptions, 1); ret->format = format; - ov = opts_visitor_new(opts); + v = opts_visitor_new(opts); - visit_start_struct(opts_get_visitor(ov), - NULL, NULL, 0, &local_err); + visit_start_struct(v, NULL, NULL, 0, &local_err); if (local_err) { goto out; } @@ -258,16 +256,18 @@ block_crypto_create_opts_init(QCryptoBlockFormat format, switch (format) { case Q_CRYPTO_BLOCK_FORMAT_LUKS: visit_type_QCryptoBlockCreateOptionsLUKS_members( - opts_get_visitor(ov), &ret->u.luks, &local_err); + v, &ret->u.luks, &local_err); break; default: error_setg(&local_err, "Unsupported block format %d", format); break; } + if (!local_err) { + visit_check_struct(v, &local_err); + } - visit_end_struct(opts_get_visitor(ov), &end_err); - error_propagate(&local_err, end_err); + visit_end_struct(v, NULL); out: if (local_err) { @@ -275,7 +275,7 @@ block_crypto_create_opts_init(QCryptoBlockFormat format, qapi_free_QCryptoBlockCreateOptions(ret); ret = NULL; } - opts_visitor_cleanup(ov); + visit_free(v); return ret; } @@ -320,8 +320,8 @@ 
static int block_crypto_open_generic(QCryptoBlockFormat format, goto cleanup; } - bs->encrypted = 1; - bs->valid_key = 1; + bs->encrypted = true; + bs->valid_key = true; ret = 0; cleanup: @@ -426,7 +426,7 @@ block_crypto_co_readv(BlockDriverState *bs, int64_t sector_num, qemu_iovec_reset(&hd_qiov); qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512); - ret = bdrv_co_readv(bs->file->bs, + ret = bdrv_co_readv(bs->file, payload_offset + sector_num, cur_nr_sectors, &hd_qiov); if (ret < 0) { @@ -505,7 +505,7 @@ block_crypto_co_writev(BlockDriverState *bs, int64_t sector_num, qemu_iovec_reset(&hd_qiov); qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512); - ret = bdrv_co_writev(bs->file->bs, + ret = bdrv_co_writev(bs->file, payload_offset + sector_num, cur_nr_sectors, &hd_qiov); if (ret < 0) { @@ -563,6 +563,53 @@ static int block_crypto_create_luks(const char *filename, filename, opts, errp); } +static int block_crypto_get_info_luks(BlockDriverState *bs, + BlockDriverInfo *bdi) +{ + BlockDriverInfo subbdi; + int ret; + + ret = bdrv_get_info(bs->file->bs, &subbdi); + if (ret != 0) { + return ret; + } + + bdi->unallocated_blocks_are_zero = false; + bdi->can_write_zeroes_with_unmap = false; + bdi->cluster_size = subbdi.cluster_size; + + return 0; +} + +static ImageInfoSpecific * +block_crypto_get_specific_info_luks(BlockDriverState *bs) +{ + BlockCrypto *crypto = bs->opaque; + ImageInfoSpecific *spec_info; + QCryptoBlockInfo *info; + + info = qcrypto_block_get_info(crypto->block, NULL); + if (!info) { + return NULL; + } + if (info->format != Q_CRYPTO_BLOCK_FORMAT_LUKS) { + qapi_free_QCryptoBlockInfo(info); + return NULL; + } + + spec_info = g_new(ImageInfoSpecific, 1); + spec_info->type = IMAGE_INFO_SPECIFIC_KIND_LUKS; + spec_info->u.luks.data = g_new(QCryptoBlockInfoLUKS, 1); + *spec_info->u.luks.data = info->u.luks; + + /* Blank out pointers we've just stolen to avoid double free */ + memset(&info->u.luks, 0, sizeof(info->u.luks)); + + qapi_free_QCryptoBlockInfo(info); + + return spec_info; +} + BlockDriver bdrv_crypto_luks = { .format_name = "luks", .instance_size = sizeof(BlockCrypto), @@ -576,6 +623,8 @@ BlockDriver bdrv_crypto_luks = { .bdrv_co_readv = block_crypto_co_readv, .bdrv_co_writev = block_crypto_co_writev, .bdrv_getlength = block_crypto_getlength, + .bdrv_get_info = block_crypto_get_info_luks, + .bdrv_get_specific_info = block_crypto_get_specific_info_luks, }; static void block_crypto_init(void) diff --git a/block/curl.c b/block/curl.c index 5a8f8b623..426fb4d67 100644 --- a/block/curl.c +++ b/block/curl.c @@ -36,10 +36,16 @@ // #define DEBUG_VERBOSE #ifdef DEBUG_CURL -#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) +#define DEBUG_CURL_PRINT 1 #else -#define DPRINTF(fmt, ...) do { } while (0) +#define DEBUG_CURL_PRINT 0 #endif +#define DPRINTF(fmt, ...) 
\ + do { \ + if (DEBUG_CURL_PRINT) { \ + fprintf(stderr, fmt, ## __VA_ARGS__); \ + } \ + } while (0) #if LIBCURL_VERSION_NUM >= 0x071000 /* The multi interface timer callback was introduced in 7.16.0 */ @@ -163,7 +169,7 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, state->sock_fd = fd; s = state->s; - DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); + DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, (int)fd); switch (action) { case CURL_POLL_IN: aio_set_fd_handler(s->aio_context, fd, false, diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 4902ca557..f2bfdcfde 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -326,14 +326,14 @@ void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) } void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) + int64_t cur_sector, int64_t nr_sectors) { assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) + int64_t cur_sector, int64_t nr_sectors) { assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); @@ -361,7 +361,7 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) } void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) + int64_t nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { diff --git a/block/dmg.c b/block/dmg.c index a496eb7c9..b0ed89baa 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -32,7 +32,6 @@ #ifdef CONFIG_BZIP2 #include <bzlib.h> #endif -#include <glib.h> enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used @@ -87,7 +86,7 @@ static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result) uint64_t buffer; int ret; - ret = bdrv_pread(bs->file->bs, offset, &buffer, 8); + ret = bdrv_pread(bs->file, offset, &buffer, 8); if (ret < 0) { return ret; } @@ -101,7 +100,7 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) uint32_t buffer; int ret; - ret = bdrv_pread(bs->file->bs, offset, &buffer, 4); + ret = bdrv_pread(bs->file, offset, &buffer, 4); if (ret < 0) { return ret; } @@ -154,8 +153,9 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, } } -static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) +static int64_t dmg_find_koly_offset(BdrvChild *file, Error **errp) { + BlockDriverState *file_bs = file->bs; int64_t length; int64_t offset = 0; uint8_t buffer[515]; @@ -179,7 +179,7 @@ static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) offset = length - 511 - 512; } length = length < 515 ? 
length : 515; - ret = bdrv_pread(file_bs, offset, buffer, length); + ret = bdrv_pread(file, offset, buffer, length); if (ret < 0) { error_setg_errno(errp, -ret, "Failed while reading UDIF trailer"); return ret; @@ -356,7 +356,7 @@ static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, offset += 4; buffer = g_realloc(buffer, count); - ret = bdrv_pread(bs->file->bs, offset, buffer, count); + ret = bdrv_pread(bs->file, offset, buffer, count); if (ret < 0) { goto fail; } @@ -393,7 +393,7 @@ static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds, buffer = g_malloc(info_length + 1); buffer[info_length] = '\0'; - ret = bdrv_pread(bs->file->bs, info_begin, buffer, info_length); + ret = bdrv_pread(bs->file, info_begin, buffer, info_length); if (ret != info_length) { ret = -EINVAL; goto fail; @@ -439,7 +439,8 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, int64_t offset; int ret; - bs->read_only = 1; + bs->read_only = true; + s->n_chunks = 0; s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; /* used by dmg_read_mish_block to keep track of the current I/O position */ @@ -448,7 +449,7 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, ds.max_sectors_per_chunk = 1; /* locate the UDIF trailer */ - offset = dmg_find_koly_offset(bs->file->bs, errp); + offset = dmg_find_koly_offset(bs->file, errp); if (offset < 0) { ret = offset; goto fail; @@ -546,6 +547,11 @@ fail: return ret; } +static void dmg_refresh_limits(BlockDriverState *bs, Error **errp) +{ + bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */ +} + static inline int is_sector_in_chunk(BDRVDMGState* s, uint32_t chunk_num, uint64_t sector_num) { @@ -594,7 +600,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) case 0x80000005: { /* zlib compressed */ /* we need to buffer, because only the chunk as whole can be * inflated. */ - ret = bdrv_pread(bs->file->bs, s->offsets[chunk], + ret = bdrv_pread(bs->file, s->offsets[chunk], s->compressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; @@ -618,7 +624,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) case 0x80000006: /* bzip2 compressed */ /* we need to buffer, because only the chunk as whole can be * inflated. 
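 * (Illustrative aside: the driver serves whole 512-byte sectors out of
 * these buffered chunks, which is why the dmg_refresh_limits() added above
 * pins bs->bl.request_alignment to BDRV_SECTOR_SIZE; with the byte-based
 * .bdrv_co_preadv interface the default alignment would otherwise be
 * 1 byte, as the block/io.c change below shows.)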
*/ - ret = bdrv_pread(bs->file->bs, s->offsets[chunk], + ret = bdrv_pread(bs->file, s->offsets[chunk], s->compressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; @@ -643,7 +649,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) break; #endif /* CONFIG_BZIP2 */ case 1: /* copy */ - ret = bdrv_pread(bs->file->bs, s->offsets[chunk], + ret = bdrv_pread(bs->file, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; @@ -659,38 +665,42 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) return 0; } -static int dmg_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVDMGState *s = bs->opaque; - int i; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int ret, i; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + qemu_co_mutex_lock(&s->lock); for (i = 0; i < nb_sectors; i++) { uint32_t sector_offset_in_chunk; + void *data; + if (dmg_read_chunk(bs, sector_num + i) != 0) { - return -1; + ret = -EIO; + goto fail; } /* Special case: current chunk is all zeroes. Do not perform a memcpy as * s->uncompressed_chunk may be too small to cover the large all-zeroes * section. dmg_read_chunk is called to find s->current_chunk */ if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ - memset(buf + i * 512, 0, 512); + qemu_iovec_memset(qiov, i * 512, 0, 512); continue; } sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; - memcpy(buf + i * 512, - s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); + data = s->uncompressed_chunk + sector_offset_in_chunk * 512; + qemu_iovec_from_buf(qiov, i * 512, data, 512); } - return 0; -} -static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVDMGState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = dmg_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: qemu_co_mutex_unlock(&s->lock); return ret; } @@ -715,7 +725,8 @@ static BlockDriver bdrv_dmg = { .instance_size = sizeof(BDRVDMGState), .bdrv_probe = dmg_probe, .bdrv_open = dmg_open, - .bdrv_read = dmg_co_read, + .bdrv_refresh_limits = dmg_refresh_limits, + .bdrv_co_preadv = dmg_co_preadv, .bdrv_close = dmg_close, }; diff --git a/block/gluster.c b/block/gluster.c index a8aaacf64..01b479fbb 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -11,7 +11,27 @@ #include <glusterfs/api/glfs.h> #include "block/block_int.h" #include "qapi/error.h" +#include "qapi/qmp/qerror.h" #include "qemu/uri.h" +#include "qemu/error-report.h" + +#define GLUSTER_OPT_FILENAME "filename" +#define GLUSTER_OPT_VOLUME "volume" +#define GLUSTER_OPT_PATH "path" +#define GLUSTER_OPT_TYPE "type" +#define GLUSTER_OPT_SERVER_PATTERN "server." 
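/* Illustrative aside (not part of the patch): GLUSTER_OPT_SERVER_PATTERN
 * is the key prefix under which qemu_gluster_parse_json() below expects
 * the flattened 'server' array, i.e. command-line keys of the form
 *
 *     file.server.0.type=tcp
 *     file.server.0.host=1.2.3.4
 *     file.server.0.port=24007
 *     file.server.1.type=unix
 *     file.server.1.socket=/var/run/glusterd.socket
 *
 * (host, port and socket values are examples only); qdict_array_entries()
 * counts these indexed entries to learn how many servers were given.
 */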
+#define GLUSTER_OPT_HOST "host" +#define GLUSTER_OPT_PORT "port" +#define GLUSTER_OPT_TO "to" +#define GLUSTER_OPT_IPV4 "ipv4" +#define GLUSTER_OPT_IPV6 "ipv6" +#define GLUSTER_OPT_SOCKET "socket" +#define GLUSTER_OPT_DEBUG "debug" +#define GLUSTER_DEFAULT_PORT 24007 +#define GLUSTER_DEBUG_DEFAULT 4 +#define GLUSTER_DEBUG_MAX 9 + +#define GERR_INDEX_HINT "hint: check in 'server' array index '%d'\n" typedef struct GlusterAIOCB { int64_t size; @@ -24,28 +44,145 @@ typedef struct GlusterAIOCB { typedef struct BDRVGlusterState { struct glfs *glfs; struct glfs_fd *fd; + bool supports_seek_data; + int debug_level; } BDRVGlusterState; -typedef struct GlusterConf { - char *server; - int port; - char *volname; - char *image; - char *transport; -} GlusterConf; +typedef struct BDRVGlusterReopenState { + struct glfs *glfs; + struct glfs_fd *fd; +} BDRVGlusterReopenState; -static void qemu_gluster_gconf_free(GlusterConf *gconf) -{ - if (gconf) { - g_free(gconf->server); - g_free(gconf->volname); - g_free(gconf->image); - g_free(gconf->transport); - g_free(gconf); + +static QemuOptsList qemu_gluster_create_opts = { .name = "qemu-gluster-create-opts", .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head), .desc = { { .name = BLOCK_OPT_SIZE, .type = QEMU_OPT_SIZE, .help = "Virtual disk size" }, { .name = BLOCK_OPT_PREALLOC, .type = QEMU_OPT_STRING, .help = "Preallocation mode (allowed values: off, full)" }, { .name = GLUSTER_OPT_DEBUG, .type = QEMU_OPT_NUMBER, .help = "Gluster log level, valid range is 0-9", }, { /* end of list */ } } -} +}; -static int parse_volume_options(GlusterConf *gconf, char *path) +static QemuOptsList runtime_opts = { .name = "gluster", .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), .desc = { { .name = GLUSTER_OPT_FILENAME, .type = QEMU_OPT_STRING, .help = "URL to the gluster image", }, { .name = GLUSTER_OPT_DEBUG, .type = QEMU_OPT_NUMBER, .help = "Gluster log level, valid range is 0-9", }, { /* end of list */ } }, }; +static QemuOptsList runtime_json_opts = { .name = "gluster_json", .head = QTAILQ_HEAD_INITIALIZER(runtime_json_opts.head), .desc = { { .name = GLUSTER_OPT_VOLUME, .type = QEMU_OPT_STRING, .help = "name of gluster volume where VM image resides", }, { .name = GLUSTER_OPT_PATH, .type = QEMU_OPT_STRING, .help = "absolute path to image file in gluster volume", }, { .name = GLUSTER_OPT_DEBUG, .type = QEMU_OPT_NUMBER, .help = "Gluster log level, valid range is 0-9", }, { /* end of list */ } }, }; +static QemuOptsList runtime_type_opts = { .name = "gluster_type", .head = QTAILQ_HEAD_INITIALIZER(runtime_type_opts.head), .desc = { { .name = GLUSTER_OPT_TYPE, .type = QEMU_OPT_STRING, .help = "tcp|unix", }, { /* end of list */ } }, }; +static QemuOptsList runtime_unix_opts = { .name = "gluster_unix", .head = QTAILQ_HEAD_INITIALIZER(runtime_unix_opts.head), .desc = { { .name = GLUSTER_OPT_SOCKET, .type = QEMU_OPT_STRING, .help = "socket file path", }, { /* end of list */ } }, }; +static QemuOptsList runtime_tcp_opts = { .name = "gluster_tcp", .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head), .desc = { { .name = GLUSTER_OPT_TYPE, .type = QEMU_OPT_STRING, .help = "tcp|unix", }, { .name = GLUSTER_OPT_HOST, .type = QEMU_OPT_STRING, .help = "host address (hostname/ipv4/ipv6 addresses)", }, { .name = GLUSTER_OPT_PORT, .type = QEMU_OPT_NUMBER, .help = "port number on which 
glusterd is listening (default 24007)", + }, + { + .name = "to", + .type = QEMU_OPT_NUMBER, + .help = "max port number, not supported by gluster", + }, + { + .name = "ipv4", + .type = QEMU_OPT_BOOL, + .help = "ipv4 bool value, not supported by gluster", + }, + { + .name = "ipv6", + .type = QEMU_OPT_BOOL, + .help = "ipv6 bool value, not supported by gluster", + }, + { /* end of list */ } + }, +}; + +static int parse_volume_options(BlockdevOptionsGluster *gconf, char *path) { char *p, *q; @@ -59,31 +196,29 @@ static int parse_volume_options(GlusterConf *gconf, char *path) if (*p == '\0') { return -EINVAL; } - gconf->volname = g_strndup(q, p - q); + gconf->volume = g_strndup(q, p - q); - /* image */ + /* path */ p += strspn(p, "/"); if (*p == '\0') { return -EINVAL; } - gconf->image = g_strdup(p); + gconf->path = g_strdup(p); return 0; } /* - * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...] + * file=gluster[+transport]://[host[:port]]/volume/path[?socket=...] * * 'gluster' is the protocol. * * 'transport' specifies the transport type used to connect to gluster * management daemon (glusterd). Valid transport types are - * tcp, unix and rdma. If a transport type isn't specified, then tcp - * type is assumed. + * tcp or unix. If a transport type isn't specified, then tcp type is assumed. * - * 'server' specifies the server where the volume file specification for - * the given volume resides. This can be either hostname, ipv4 address - * or ipv6 address. ipv6 address needs to be within square brackets [ ]. - * If transport type is 'unix', then 'server' field should not be specified. + * 'host' specifies the host where the volume file specification for + * the given volume resides. This can be either hostname or ipv4 address. + * If transport type is 'unix', then 'host' field should not be specified. * The 'socket' field needs to be populated with the path to unix domain * socket. * @@ -92,23 +227,22 @@ static int parse_volume_options(GlusterConf *gconf, char *path) * default port. If the transport type is unix, then 'port' should not be * specified. * - * 'volname' is the name of the gluster volume which contains the VM image. + * 'volume' is the name of the gluster volume which contains the VM image. * - * 'image' is the path to the actual VM image that resides on gluster volume. + * 'path' is the path to the actual VM image that resides on gluster volume. 
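 *
 * Note that 'gluster+rdma' is still parsed for backward compatibility, but
 * it falls back to tcp with a warning (see the comment above
 * bdrv_gluster_rdma at the end of this file).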
* * Examples: * * file=gluster://1.2.3.4/testvol/a.img * file=gluster+tcp://1.2.3.4/testvol/a.img * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img - * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img - * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img - * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img + * file=gluster+tcp://host.domain.com:24007/testvol/dir/a.img * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket - * file=gluster+rdma://1.2.3.4:24007/testvol/a.img */ -static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) +static int qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf, + const char *filename) { + GlusterServer *gsconf; URI *uri; QueryParams *qp = NULL; bool is_unix = false; @@ -119,16 +253,21 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) return -EINVAL; } + gconf->server = g_new0(GlusterServerList, 1); + gconf->server->value = gsconf = g_new0(GlusterServer, 1); + /* transport */ if (!uri->scheme || !strcmp(uri->scheme, "gluster")) { - gconf->transport = g_strdup("tcp"); + gsconf->type = GLUSTER_TRANSPORT_TCP; } else if (!strcmp(uri->scheme, "gluster+tcp")) { - gconf->transport = g_strdup("tcp"); + gsconf->type = GLUSTER_TRANSPORT_TCP; } else if (!strcmp(uri->scheme, "gluster+unix")) { - gconf->transport = g_strdup("unix"); + gsconf->type = GLUSTER_TRANSPORT_UNIX; is_unix = true; } else if (!strcmp(uri->scheme, "gluster+rdma")) { - gconf->transport = g_strdup("rdma"); + gsconf->type = GLUSTER_TRANSPORT_TCP; + error_report("Warning: rdma feature is not supported, falling " + "back to tcp"); } else { ret = -EINVAL; goto out; @@ -154,10 +293,14 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) ret = -EINVAL; goto out; } - gconf->server = g_strdup(qp->p[0].value); + gsconf->u.q_unix.path = g_strdup(qp->p[0].value); } else { - gconf->server = g_strdup(uri->server ? uri->server : "localhost"); - gconf->port = uri->port; + gsconf->u.tcp.host = g_strdup(uri->server ? 
uri->server : "localhost"); + if (uri->port) { + gsconf->u.tcp.port = g_strdup_printf("%d", uri->port); + } else { + gsconf->u.tcp.port = g_strdup_printf("%d", GLUSTER_DEFAULT_PORT); + } } out: @@ -168,52 +311,62 @@ out: return ret; } -static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename, - Error **errp) +static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf, + Error **errp) { - struct glfs *glfs = NULL; + struct glfs *glfs; int ret; int old_errno; + GlusterServerList *server; - ret = qemu_gluster_parseuri(gconf, filename); - if (ret < 0) { - error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/" - "volname/image[?socket=...]"); - errno = -ret; - goto out; - } - - glfs = glfs_new(gconf->volname); + glfs = glfs_new(gconf->volume); if (!glfs) { goto out; } - ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server, - gconf->port); - if (ret < 0) { - goto out; + for (server = gconf->server; server; server = server->next) { + if (server->value->type == GLUSTER_TRANSPORT_UNIX) { + ret = glfs_set_volfile_server(glfs, + GlusterTransport_lookup[server->value->type], + server->value->u.q_unix.path, 0); + } else { + ret = glfs_set_volfile_server(glfs, + GlusterTransport_lookup[server->value->type], + server->value->u.tcp.host, + atoi(server->value->u.tcp.port)); + } + + if (ret < 0) { + goto out; + } } - /* - * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when - * GlusterFS makes GF_LOG_* macros available to libgfapi users. - */ - ret = glfs_set_logging(glfs, "-", 4); + ret = glfs_set_logging(glfs, "-", gconf->debug_level); if (ret < 0) { goto out; } ret = glfs_init(glfs); if (ret) { - error_setg_errno(errp, errno, - "Gluster connection failed for server=%s port=%d " - "volume=%s image=%s transport=%s", gconf->server, - gconf->port, gconf->volname, gconf->image, - gconf->transport); + error_setg(errp, "Gluster connection for volume %s, path %s failed" + " to connect", gconf->volume, gconf->path); + for (server = gconf->server; server; server = server->next) { + if (server->value->type == GLUSTER_TRANSPORT_UNIX) { + error_append_hint(errp, "hint: failed on socket %s ", + server->value->u.q_unix.path); + } else { + error_append_hint(errp, "hint: failed on host %s and port %s ", + server->value->u.tcp.host, + server->value->u.tcp.port); + } + } + + error_append_hint(errp, "Please refer to gluster logs for more info\n"); /* glfs_init sometimes doesn't set errno although docs suggest that */ - if (errno == 0) + if (errno == 0) { errno = EINVAL; + } goto out; } @@ -228,13 +381,233 @@ out: return NULL; } +static int qapi_enum_parse(const char *opt) +{ + int i; + + if (!opt) { + return GLUSTER_TRANSPORT__MAX; + } + + for (i = 0; i < GLUSTER_TRANSPORT__MAX; i++) { + if (!strcmp(opt, GlusterTransport_lookup[i])) { + return i; + } + } + + return i; +} + +/* + * Convert the json formatted command line into qapi. 
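 * A sketch of such a command line (volume, path and server values are
 * illustrative only):
 *
 *     -drive driver=qcow2,file.driver=gluster,file.volume=testvol,
 *            file.path=/path/a.qcow2,file.server.0.type=tcp,
 *            file.server.0.host=1.2.3.4,file.server.0.port=24007
 *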
+*/ +static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf, + QDict *options, Error **errp) +{ + QemuOpts *opts; + GlusterServer *gsconf; + GlusterServerList *curr = NULL; + QDict *backing_options = NULL; + Error *local_err = NULL; + char *str = NULL; + const char *ptr; + size_t num_servers; + int i; + + /* create opts info from runtime_json_opts list */ + opts = qemu_opts_create(&runtime_json_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + goto out; + } + + num_servers = qdict_array_entries(options, GLUSTER_OPT_SERVER_PATTERN); + if (num_servers < 1) { + error_setg(&local_err, QERR_MISSING_PARAMETER, "server"); + goto out; + } + + ptr = qemu_opt_get(opts, GLUSTER_OPT_VOLUME); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_VOLUME); + goto out; + } + gconf->volume = g_strdup(ptr); + + ptr = qemu_opt_get(opts, GLUSTER_OPT_PATH); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_PATH); + goto out; + } + gconf->path = g_strdup(ptr); + qemu_opts_del(opts); + + for (i = 0; i < num_servers; i++) { + str = g_strdup_printf(GLUSTER_OPT_SERVER_PATTERN"%d.", i); + qdict_extract_subqdict(options, &backing_options, str); + + /* create opts info from runtime_type_opts list */ + opts = qemu_opts_create(&runtime_type_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, backing_options, &local_err); + if (local_err) { + goto out; + } + + ptr = qemu_opt_get(opts, GLUSTER_OPT_TYPE); + gsconf = g_new0(GlusterServer, 1); + gsconf->type = qapi_enum_parse(ptr); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_TYPE); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + + } + if (gsconf->type == GLUSTER_TRANSPORT__MAX) { + error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, + GLUSTER_OPT_TYPE, "tcp or unix"); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + } + qemu_opts_del(opts); + + if (gsconf->type == GLUSTER_TRANSPORT_TCP) { + /* create opts info from runtime_tcp_opts list */ + opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, backing_options, &local_err); + if (local_err) { + goto out; + } + + ptr = qemu_opt_get(opts, GLUSTER_OPT_HOST); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, + GLUSTER_OPT_HOST); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + } + gsconf->u.tcp.host = g_strdup(ptr); + ptr = qemu_opt_get(opts, GLUSTER_OPT_PORT); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, + GLUSTER_OPT_PORT); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + } + gsconf->u.tcp.port = g_strdup(ptr); + + /* defend for unsupported fields in InetSocketAddress, + * i.e. 
@ipv4, @ipv6 and @to + */ + ptr = qemu_opt_get(opts, GLUSTER_OPT_TO); + if (ptr) { + gsconf->u.tcp.has_to = true; + } + ptr = qemu_opt_get(opts, GLUSTER_OPT_IPV4); + if (ptr) { + gsconf->u.tcp.has_ipv4 = true; + } + ptr = qemu_opt_get(opts, GLUSTER_OPT_IPV6); + if (ptr) { + gsconf->u.tcp.has_ipv6 = true; + } + if (gsconf->u.tcp.has_to) { + error_setg(&local_err, "Parameter 'to' not supported"); + goto out; + } + if (gsconf->u.tcp.has_ipv4 || gsconf->u.tcp.has_ipv6) { + error_setg(&local_err, "Parameters 'ipv4/ipv6' not supported"); + goto out; + } + qemu_opts_del(opts); + } else { + /* create opts info from runtime_unix_opts list */ + opts = qemu_opts_create(&runtime_unix_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, backing_options, &local_err); + if (local_err) { + goto out; + } + + ptr = qemu_opt_get(opts, GLUSTER_OPT_SOCKET); + if (!ptr) { + error_setg(&local_err, QERR_MISSING_PARAMETER, + GLUSTER_OPT_SOCKET); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + } + gsconf->u.q_unix.path = g_strdup(ptr); + qemu_opts_del(opts); + } + + if (gconf->server == NULL) { + gconf->server = g_new0(GlusterServerList, 1); + gconf->server->value = gsconf; + curr = gconf->server; + } else { + curr->next = g_new0(GlusterServerList, 1); + curr->next->value = gsconf; + curr = curr->next; + } + + qdict_del(backing_options, str); + g_free(str); + str = NULL; + } + + return 0; + +out: + error_propagate(errp, local_err); + qemu_opts_del(opts); + if (str) { + qdict_del(backing_options, str); + g_free(str); + } + errno = EINVAL; + return -errno; +} + +static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf, + const char *filename, + QDict *options, Error **errp) +{ + int ret; + if (filename) { + ret = qemu_gluster_parse_uri(gconf, filename); + if (ret < 0) { + error_setg(errp, "invalid URI"); + error_append_hint(errp, "Usage: file=gluster[+transport]://" + "[host[:port]]/volume/path[?socket=...]\n"); + errno = -ret; + return NULL; + } + } else { + ret = qemu_gluster_parse_json(gconf, options, errp); + if (ret < 0) { + error_append_hint(errp, "Usage: " + "-drive driver=qcow2,file.driver=gluster," + "file.volume=testvol,file.path=/path/a.qcow2" + "[,file.debug=9],file.server.0.type=tcp," + "file.server.0.host=1.2.3.4," + "file.server.0.port=24007," + "file.server.1.type=unix," + "file.server.1.socket=/var/run/glusterd.socket ..." + "\n"); + errno = -ret; + return NULL; + } + + } + + return qemu_gluster_glfs_init(gconf, errp); +} + static void qemu_gluster_complete_aio(void *opaque) { GlusterAIOCB *acb = (GlusterAIOCB *)opaque; qemu_bh_delete(acb->bh); acb->bh = NULL; - qemu_coroutine_enter(acb->coroutine, NULL); + qemu_coroutine_enter(acb->coroutine); } /* @@ -256,20 +629,6 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) qemu_bh_schedule(acb->bh); } -/* TODO Convert to fine grained options */ -static QemuOptsList runtime_opts = { - .name = "gluster", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "URL to the gluster image", - }, - { /* end of list */ } - }, -}; - static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags) { assert(open_flags != NULL); @@ -287,13 +646,35 @@ } } +/* + * Do SEEK_DATA/HOLE to detect if it is functional. Older broken versions of + * gfapi incorrectly return the current offset when SEEK_DATA/HOLE is used. 
+ * - Corrected versions return -1 and set errno to EINVAL. + * - Versions that support SEEK_DATA/HOLE correctly, will return -1 and set + * errno to ENXIO when SEEK_DATA is called with a position of EOF. + */ +static bool qemu_gluster_test_seek(struct glfs_fd *fd) +{ + off_t ret, eof; + + eof = glfs_lseek(fd, 0, SEEK_END); + if (eof < 0) { + /* this should never occur */ + return false; + } + + /* this should always fail with ENXIO if SEEK_DATA is supported */ + ret = glfs_lseek(fd, eof, SEEK_DATA); + return (ret < 0) && (errno == ENXIO); +} + static int qemu_gluster_open(BlockDriverState *bs, QDict *options, int bdrv_flags, Error **errp) { BDRVGlusterState *s = bs->opaque; int open_flags = 0; int ret = 0; - GlusterConf *gconf = g_new0(GlusterConf, 1); + BlockdevOptionsGluster *gconf = NULL; QemuOpts *opts; Error *local_err = NULL; const char *filename; @@ -306,9 +687,20 @@ static int qemu_gluster_open(BlockDriverState *bs, QDict *options, goto out; } - filename = qemu_opt_get(opts, "filename"); + filename = qemu_opt_get(opts, GLUSTER_OPT_FILENAME); - s->glfs = qemu_gluster_init(gconf, filename, errp); + s->debug_level = qemu_opt_get_number(opts, GLUSTER_OPT_DEBUG, + GLUSTER_DEBUG_DEFAULT); + if (s->debug_level < 0) { + s->debug_level = 0; + } else if (s->debug_level > GLUSTER_DEBUG_MAX) { + s->debug_level = GLUSTER_DEBUG_MAX; + } + + gconf = g_new0(BlockdevOptionsGluster, 1); + gconf->debug_level = s->debug_level; + gconf->has_debug_level = true; + s->glfs = qemu_gluster_init(gconf, filename, options, errp); if (!s->glfs) { ret = -errno; goto out; @@ -333,14 +725,16 @@ static int qemu_gluster_open(BlockDriverState *bs, QDict *options, qemu_gluster_parse_flags(bdrv_flags, &open_flags); - s->fd = glfs_open(s->glfs, gconf->image, open_flags); + s->fd = glfs_open(s->glfs, gconf->path, open_flags); if (!s->fd) { ret = -errno; } + s->supports_seek_data = qemu_gluster_test_seek(s->fd); + out: qemu_opts_del(opts); - qemu_gluster_gconf_free(gconf); + qapi_free_BlockdevOptionsGluster(gconf); if (!ret) { return ret; } @@ -353,31 +747,29 @@ out: return ret; } -typedef struct BDRVGlusterReopenState { - struct glfs *glfs; - struct glfs_fd *fd; -} BDRVGlusterReopenState; - - static int qemu_gluster_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { int ret = 0; + BDRVGlusterState *s; BDRVGlusterReopenState *reop_s; - GlusterConf *gconf = NULL; + BlockdevOptionsGluster *gconf; int open_flags = 0; assert(state != NULL); assert(state->bs != NULL); + s = state->bs->opaque; + state->opaque = g_new0(BDRVGlusterReopenState, 1); reop_s = state->opaque; qemu_gluster_parse_flags(state->flags, &open_flags); - gconf = g_new0(GlusterConf, 1); - - reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp); + gconf = g_new0(BlockdevOptionsGluster, 1); + gconf->debug_level = s->debug_level; + gconf->has_debug_level = true; + reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, NULL, errp); if (reop_s->glfs == NULL) { ret = -errno; goto exit; @@ -393,7 +785,7 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state, } #endif - reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags); + reop_s->fd = glfs_open(reop_s->glfs, gconf->path, open_flags); if (reop_s->fd == NULL) { /* reops->glfs will be cleaned up in _abort */ ret = -errno; @@ -402,7 +794,7 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state, exit: /* state->opaque will be freed in either the _abort or _commit */ - qemu_gluster_gconf_free(gconf); + qapi_free_BlockdevOptionsGluster(gconf); 
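/* Illustrative aside (not part of the patch): qemu_gluster_test_seek()
 * above is a one-time capability probe, in rough outline:
 *
 *     eof = glfs_lseek(fd, 0, SEEK_END);
 *     ret = glfs_lseek(fd, eof, SEEK_DATA);
 *     supported = (ret < 0 && errno == ENXIO);
 *
 * The result is cached in s->supports_seek_data at open time and later
 * lets find_allocation() return -ENOTSUP instead of trusting a broken
 * SEEK_DATA implementation.
 */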
return ret; } @@ -454,14 +846,14 @@ static void qemu_gluster_reopen_abort(BDRVReopenState *state) } #ifdef CONFIG_GLUSTERFS_ZEROFILL -static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int size, + BdrvRequestFlags flags) { int ret; GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; - off_t size = nb_sectors * BDRV_SECTOR_SIZE; - off_t offset = sector_num * BDRV_SECTOR_SIZE; acb.size = size; acb.ret = 0; @@ -483,7 +875,7 @@ static inline bool gluster_supports_zerofill(void) } static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, - int64_t size) + int64_t size) { return glfs_zerofill(fd, offset, size); } @@ -495,7 +887,7 @@ static inline bool gluster_supports_zerofill(void) } static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, - int64_t size) + int64_t size) { return 0; } @@ -504,15 +896,25 @@ static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, static int qemu_gluster_create(const char *filename, QemuOpts *opts, Error **errp) { + BlockdevOptionsGluster *gconf; struct glfs *glfs; struct glfs_fd *fd; int ret = 0; int prealloc = 0; int64_t total_size = 0; char *tmp = NULL; - GlusterConf *gconf = g_new0(GlusterConf, 1); - glfs = qemu_gluster_init(gconf, filename, errp); + gconf = g_new0(BlockdevOptionsGluster, 1); + gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG, + GLUSTER_DEBUG_DEFAULT); + if (gconf->debug_level < 0) { + gconf->debug_level = 0; + } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) { + gconf->debug_level = GLUSTER_DEBUG_MAX; + } + gconf->has_debug_level = true; + + glfs = qemu_gluster_init(gconf, filename, NULL, errp); if (!glfs) { ret = -errno; goto out; @@ -524,19 +926,17 @@ static int qemu_gluster_create(const char *filename, tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); if (!tmp || !strcmp(tmp, "off")) { prealloc = 0; - } else if (!strcmp(tmp, "full") && - gluster_supports_zerofill()) { + } else if (!strcmp(tmp, "full") && gluster_supports_zerofill()) { prealloc = 1; } else { error_setg(errp, "Invalid preallocation mode: '%s'" - " or GlusterFS doesn't support zerofill API", - tmp); + " or GlusterFS doesn't support zerofill API", tmp); ret = -EINVAL; goto out; } - fd = glfs_creat(glfs, gconf->image, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR); + fd = glfs_creat(glfs, gconf->path, + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR); if (!fd) { ret = -errno; } else { @@ -554,7 +954,7 @@ static int qemu_gluster_create(const char *filename, } out: g_free(tmp); - qemu_gluster_gconf_free(gconf); + qapi_free_BlockdevOptionsGluster(gconf); if (glfs) { glfs_fini(glfs); } @@ -562,7 +962,8 @@ out: } static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) + int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov, int write) { int ret; GlusterAIOCB acb; @@ -577,10 +978,10 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, if (write) { ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, - gluster_finish_aiocb, &acb); + gluster_finish_aiocb, &acb); } else { ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0, - gluster_finish_aiocb, &acb); + gluster_finish_aiocb, &acb); } if (ret < 0) { @@ -605,13 +1006,17 @@ static int qemu_gluster_truncate(BlockDriverState *bs, 
int64_t offset) } static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t sector_num, + int nb_sectors, + QEMUIOVector *qiov) { return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0); } static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t sector_num, + int nb_sectors, + QEMUIOVector *qiov) { return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } @@ -672,14 +1077,12 @@ error: } #ifdef CONFIG_GLUSTERFS_DISCARD -static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs, + int64_t offset, int size) { int ret; GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; - size_t size = nb_sectors * BDRV_SECTOR_SIZE; - off_t offset = sector_num * BDRV_SECTOR_SIZE; acb.size = 0; acb.ret = 0; @@ -729,29 +1132,164 @@ static int qemu_gluster_has_zero_init(BlockDriverState *bs) return 0; } -static QemuOptsList qemu_gluster_create_opts = { - .name = "qemu-gluster-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_PREALLOC, - .type = QEMU_OPT_STRING, - .help = "Preallocation mode (allowed values: off, full)" - }, - { /* end of list */ } +/* + * Find allocation range in @bs around offset @start. + * May change underlying file descriptor's file offset. + * If @start is not in a hole, store @start in @data, and the + * beginning of the next hole in @hole, and return 0. + * If @start is in a non-trailing hole, store @start in @hole and the + * beginning of the next non-hole in @data, and return 0. + * If @start is in a trailing hole or beyond EOF, return -ENXIO. + * If we can't find out, return a negative errno other than -ENXIO. + * + * (Shamefully copied from raw-posix.c, only minuscule adaptations.) + */ +static int find_allocation(BlockDriverState *bs, off_t start, + off_t *data, off_t *hole) +{ + BDRVGlusterState *s = bs->opaque; + off_t offs; + + if (!s->supports_seek_data) { + return -ENOTSUP; } -}; + + /* + * SEEK_DATA cases: + * D1. offs == start: start is in data + * D2. offs > start: start is in a hole, next data at offs + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole + * or start is beyond EOF + * If the latter happens, the file has been truncated behind + * our back since we opened it. All bets are off then. + * Treating like a trailing hole is simplest. + * D4. offs < 0, errno != ENXIO: we learned nothing + */ + offs = glfs_lseek(s->fd, start, SEEK_DATA); + if (offs < 0) { + return -errno; /* D3 or D4 */ + } + assert(offs >= start); + + if (offs > start) { + /* D2: in hole, next data at offs */ + *hole = start; + *data = offs; + return 0; + } + + /* D1: in data, end not yet known */ + + /* + * SEEK_HOLE cases: + * H1. offs == start: start is in a hole + * If this happens here, a hole has been dug behind our back + * since the previous lseek(). + * H2. offs > start: either start is in data, next hole at offs, + * or start is in trailing hole, EOF at offs + * Linux treats trailing holes like any other hole: offs == + * start. Solaris seeks to EOF instead: offs > start (blech). + * If that happens here, a hole has been dug behind our back + * since the previous lseek(). + * H3. 
offs < 0, errno = ENXIO: start is beyond EOF + * If this happens, the file has been truncated behind our + * back since we opened it. Treat it like a trailing hole. + * H4. offs < 0, errno != ENXIO: we learned nothing + * Pretend we know nothing at all, i.e. "forget" about D1. + */ + offs = glfs_lseek(s->fd, start, SEEK_HOLE); + if (offs < 0) { + return -errno; /* D1 and (H3 or H4) */ + } + assert(offs >= start); + + if (offs > start) { + /* + * D1 and H2: either in data, next hole at offs, or it was in + * data but is now in a trailing hole. In the latter case, + * all bets are off. Treating it as if it there was data all + * the way to EOF is safe, so simply do that. + */ + *data = start; + *hole = offs; + return 0; + } + + /* D1 and H1 */ + return -EBUSY; +} + +/* + * Returns the allocation status of the specified sectors. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + * + * (Based on raw_co_get_block_status() from raw-posix.c.) + */ +static int64_t coroutine_fn qemu_gluster_co_get_block_status( + BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, + BlockDriverState **file) +{ + BDRVGlusterState *s = bs->opaque; + off_t start, data = 0, hole = 0; + int64_t total_size; + int ret = -EINVAL; + + if (!s->fd) { + return ret; + } + + start = sector_num * BDRV_SECTOR_SIZE; + total_size = bdrv_getlength(bs); + if (total_size < 0) { + return total_size; + } else if (start >= total_size) { + *pnum = 0; + return 0; + } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { + nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); + } + + ret = find_allocation(bs, start, &data, &hole); + if (ret == -ENXIO) { + /* Trailing hole */ + *pnum = nb_sectors; + ret = BDRV_BLOCK_ZERO; + } else if (ret < 0) { + /* No info available, so pretend there are no holes */ + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA; + } else if (data == start) { + /* On a data extent, compute sectors to the end of the extent, + * possibly including a partial sector at EOF. */ + *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); + ret = BDRV_BLOCK_DATA; + } else { + /* On a hole, compute sectors to the beginning of the next extent. 
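 * For example (illustrative numbers): with start == hole at byte 0 and
 * the next data extent at byte 4096, *pnum becomes MIN(nb_sectors, 8)
 * sectors, reported as BDRV_BLOCK_ZERO.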
*/ + assert(hole == start); + *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); + ret = BDRV_BLOCK_ZERO; + } + + *file = bs; + + return ret | BDRV_BLOCK_OFFSET_VALID | start; +} + static BlockDriver bdrv_gluster = { .format_name = "gluster", .protocol_name = "gluster", .instance_size = sizeof(BDRVGlusterState), - .bdrv_needs_filename = true, + .bdrv_needs_filename = false, .bdrv_file_open = qemu_gluster_open, .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, .bdrv_reopen_commit = qemu_gluster_reopen_commit, @@ -766,11 +1304,12 @@ static BlockDriver bdrv_gluster = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif + .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, .create_opts = &qemu_gluster_create_opts, }; @@ -778,7 +1317,7 @@ static BlockDriver bdrv_gluster_tcp = { .format_name = "gluster", .protocol_name = "gluster+tcp", .instance_size = sizeof(BDRVGlusterState), - .bdrv_needs_filename = true, + .bdrv_needs_filename = false, .bdrv_file_open = qemu_gluster_open, .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, .bdrv_reopen_commit = qemu_gluster_reopen_commit, @@ -793,11 +1332,12 @@ static BlockDriver bdrv_gluster_tcp = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif + .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, .create_opts = &qemu_gluster_create_opts, }; @@ -820,14 +1360,21 @@ static BlockDriver bdrv_gluster_unix = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif + .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, .create_opts = &qemu_gluster_create_opts, }; +/* rdma is deprecated (actually never supported for volfile fetch). + * Let's maintain it for the protocol compatibility, to make sure things + * won't break immediately. For now, gluster+rdma will fall back to gluster+tcp + * protocol with a warning. 
+ * TODO: remove gluster+rdma interface support + */ static BlockDriver bdrv_gluster_rdma = { .format_name = "gluster", .protocol_name = "gluster+rdma", @@ -847,11 +1394,12 @@ static BlockDriver bdrv_gluster_rdma = { .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, + .bdrv_co_pdiscard = qemu_gluster_co_pdiscard, #endif #ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, + .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes, #endif + .bdrv_co_get_block_status = qemu_gluster_co_get_block_status, .create_opts = &qemu_gluster_create_opts, }; diff --git a/block/io.c b/block/io.c index a7dbf85b1..420944d80 100644 --- a/block/io.c +++ b/block/io.c @@ -27,118 +27,54 @@ #include "sysemu/block-backend.h" #include "block/blockjob.h" #include "block/block_int.h" -#include "block/throttle-groups.h" #include "qemu/cutils.h" #include "qapi/error.h" #include "qemu/error-report.h" #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); +static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, + int64_t offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags); -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) +static void bdrv_parent_drained_begin(BlockDriverState *bs) { - int i; - - throttle_group_config(bs, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } -} - -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) -{ - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; + BdrvChild *c; - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->drained_begin) { + c->role->drained_begin(c); } } - - bs->io_limits_enabled = enabled; - - return drained; -} - -void bdrv_io_limits_disable(BlockDriverState *bs) -{ - bs->io_limits_enabled = false; - bdrv_start_throttled_reqs(bs); - throttle_group_unregister_bs(bs); -} - -/* should be called before bdrv_set_io_limits if a limit is set */ -void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) -{ - 
assert(!bs->io_limits_enabled); - throttle_group_register_bs(bs, group); - bs->io_limits_enabled = true; } -void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) +static void bdrv_parent_drained_end(BlockDriverState *bs) { - /* this bs is not part of any group */ - if (!bs->throttle_state) { - return; - } + BdrvChild *c; - /* this bs is a part of the same group than the one we want */ - if (!g_strcmp0(throttle_group_get_name(bs), group)) { - return; + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (c->role->drained_end) { + c->role->drained_end(c); + } } - - /* need to change the group this bs belong to */ - bdrv_io_limits_disable(bs); - bdrv_io_limits_enable(bs, group); } -void bdrv_setup_io_funcs(BlockDriver *bdrv) +static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) { - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. - */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } + dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); + dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer); + dst->opt_mem_alignment = MAX(dst->opt_mem_alignment, + src->opt_mem_alignment); + dst->min_mem_alignment = MAX(dst->min_mem_alignment, + src->min_mem_alignment); + dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov); } void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) @@ -152,6 +88,9 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) return; } + /* Default alignment based on whether driver has byte interface */ + bs->bl.request_alignment = drv->bdrv_co_preadv ? 
1 : 512; + /* Take some limits from the children as a default */ if (bs->file) { bdrv_refresh_limits(bs->file->bs, &local_err); @@ -159,11 +98,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) error_propagate(errp, local_err); return; } - bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length; - bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment; - bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment; - bs->bl.max_iov = bs->file->bs->bl.max_iov; + bdrv_merge_limits(&bs->bl, &bs->file->bs->bl); } else { bs->bl.min_mem_alignment = 512; bs->bl.opt_mem_alignment = getpagesize(); @@ -178,21 +113,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) error_propagate(errp, local_err); return; } - bs->bl.opt_transfer_length = - MAX(bs->bl.opt_transfer_length, - bs->backing->bs->bl.opt_transfer_length); - bs->bl.max_transfer_length = - MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing->bs->bl.max_transfer_length); - bs->bl.opt_mem_alignment = - MAX(bs->bl.opt_mem_alignment, - bs->backing->bs->bl.opt_mem_alignment); - bs->bl.min_mem_alignment = - MAX(bs->bl.min_mem_alignment, - bs->backing->bs->bl.min_mem_alignment); - bs->bl.max_iov = - MIN(bs->bl.max_iov, - bs->backing->bs->bl.max_iov); + bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl); } /* Then let the driver override it */ @@ -225,12 +146,6 @@ bool bdrv_requests_pending(BlockDriverState *bs) if (!QLIST_EMPTY(&bs->tracked_requests)) { return true; } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } QLIST_FOREACH(child, &bs->children, next) { if (bdrv_requests_pending(child->bs)) { @@ -260,18 +175,29 @@ typedef struct { bool done; } BdrvCoDrainData; +static void bdrv_drain_poll(BlockDriverState *bs) +{ + bool busy = true; + + while (busy) { + /* Keep iterating */ + busy = bdrv_requests_pending(bs); + busy |= aio_poll(bdrv_get_aio_context(bs), busy); + } +} + static void bdrv_co_drain_bh_cb(void *opaque) { BdrvCoDrainData *data = opaque; Coroutine *co = data->co; qemu_bh_delete(data->bh); - bdrv_drain(data->bs); + bdrv_drain_poll(data->bs); data->done = true; - qemu_coroutine_enter(co, NULL); + qemu_coroutine_enter(co); } -void coroutine_fn bdrv_co_drain(BlockDriverState *bs) +static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) { BdrvCoDrainData data; @@ -294,6 +220,34 @@ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) assert(data.done); } +void bdrv_drained_begin(BlockDriverState *bs) +{ + if (!bs->quiesce_counter++) { + aio_disable_external(bdrv_get_aio_context(bs)); + bdrv_parent_drained_begin(bs); + } + + bdrv_io_unplugged_begin(bs); + bdrv_drain_recurse(bs); + if (qemu_in_coroutine()) { + bdrv_co_yield_to_drain(bs); + } else { + bdrv_drain_poll(bs); + } + bdrv_io_unplugged_end(bs); +} + +void bdrv_drained_end(BlockDriverState *bs) +{ + assert(bs->quiesce_counter > 0); + if (--bs->quiesce_counter > 0) { + return; + } + + bdrv_parent_drained_end(bs); + aio_enable_external(bdrv_get_aio_context(bs)); +} + /* * Wait for pending requests to complete on a single BlockDriverState subtree, * and suspend block driver's internal I/O until next request arrives. @@ -305,21 +259,17 @@ void coroutine_fn bdrv_co_drain(BlockDriverState *bs) * not depend on events in other AioContexts. In that case, use * bdrv_drain_all() instead. 
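 *
 * A drained section can also be opened explicitly; bdrv_drain() below is
 * simply the two calls back to back:
 *
 *     bdrv_drained_begin(bs);
 *     ... no new external I/O is dispatched here ...
 *     bdrv_drained_end(bs);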
*/ -void bdrv_drain(BlockDriverState *bs) +void coroutine_fn bdrv_co_drain(BlockDriverState *bs) { - bool busy = true; + assert(qemu_in_coroutine()); + bdrv_drained_begin(bs); + bdrv_drained_end(bs); +} - bdrv_drain_recurse(bs); - if (qemu_in_coroutine()) { - bdrv_co_drain(bs); - return; - } - while (busy) { - /* Keep iterating */ - bdrv_flush_io_queue(bs); - busy = bdrv_requests_pending(bs); - busy |= aio_poll(bdrv_get_aio_context(bs), busy); - } +void bdrv_drain(BlockDriverState *bs) +{ + bdrv_drained_begin(bs); + bdrv_drained_end(bs); } /* @@ -332,16 +282,25 @@ void bdrv_drain_all(void) { /* Always run first iteration so any pending completion BHs run */ bool busy = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; + BlockJob *job = NULL; GSList *aio_ctxs = NULL, *ctx; - while ((bs = bdrv_next(bs))) { + while ((job = block_job_next(job))) { + AioContext *aio_context = blk_get_aio_context(job->blk); + + aio_context_acquire(aio_context); + block_job_pause(job); + aio_context_release(aio_context); + } + + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - if (bs->job) { - block_job_pause(bs->job); - } + bdrv_parent_drained_begin(bs); + bdrv_io_unplugged_begin(bs); bdrv_drain_recurse(bs); aio_context_release(aio_context); @@ -361,12 +320,10 @@ void bdrv_drain_all(void) for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { AioContext *aio_context = ctx->data; - bs = NULL; aio_context_acquire(aio_context); - while ((bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (aio_context == bdrv_get_aio_context(bs)) { - bdrv_flush_io_queue(bs); if (bdrv_requests_pending(bs)) { busy = true; aio_poll(aio_context, busy); @@ -378,17 +335,24 @@ void bdrv_drain_all(void) } } - bs = NULL; - while ((bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - if (bs->job) { - block_job_resume(bs->job); - } + bdrv_io_unplugged_end(bs); + bdrv_parent_drained_end(bs); aio_context_release(aio_context); } g_slist_free(aio_ctxs); + + job = NULL; + while ((job = block_job_next(job))) { + AioContext *aio_context = blk_get_aio_context(job->blk); + + aio_context_acquire(aio_context); + block_job_resume(job); + aio_context_release(aio_context); + } } /** @@ -447,12 +411,12 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) } /** - * Round a region to cluster boundaries + * Round a region to cluster boundaries (sector-based) */ -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) +void bdrv_round_sectors_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) { BlockDriverInfo bdi; @@ -467,6 +431,26 @@ void bdrv_round_to_clusters(BlockDriverState *bs, } } +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t offset, unsigned int bytes, + int64_t *cluster_offset, + unsigned int *cluster_bytes) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_offset = offset; + *cluster_bytes = bytes; + } else { + int64_t c = bdi.cluster_size; + *cluster_offset = QEMU_ALIGN_DOWN(offset, c); + *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); + } +} + static int 
bdrv_get_cluster_size(BlockDriverState *bs) { BlockDriverInfo bdi; @@ -474,7 +458,7 @@ static int bdrv_get_cluster_size(BlockDriverState *bs) ret = bdrv_get_info(bs, &bdi); if (ret < 0 || bdi.cluster_size == 0) { - return bs->request_alignment; + return bs->bl.request_alignment; } else { return bdi.cluster_size; } @@ -568,7 +552,7 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, } typedef struct RwCo { - BlockDriverState *bs; + BdrvChild *child; int64_t offset; QEMUIOVector *qiov; bool is_write; @@ -581,26 +565,26 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) RwCo *rwco = opaque; if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } } /* * Process a vectored synchronous request using coroutines */ -static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, +static int bdrv_prwv_co(BdrvChild *child, int64_t offset, QEMUIOVector *qiov, bool is_write, BdrvRequestFlags flags) { Coroutine *co; RwCo rwco = { - .bs = bs, + .child = child, .offset = offset, .qiov = qiov, .is_write = is_write, @@ -608,25 +592,14 @@ static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, .flags = flags, }; - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function has to be disabled here - * if it has been enabled. - */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } - if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ bdrv_rw_co_entry(&rwco); } else { - AioContext *aio_context = bdrv_get_aio_context(bs); + AioContext *aio_context = bdrv_get_aio_context(child->bs); - co = qemu_coroutine_create(bdrv_rw_co_entry); - qemu_coroutine_enter(co, &rwco); + co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco); + qemu_coroutine_enter(co); while (rwco.ret == NOT_DONE) { aio_poll(aio_context, true); } @@ -637,7 +610,7 @@ static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, /* * Process a synchronous request using coroutines */ -static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, +static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf, int nb_sectors, bool is_write, BdrvRequestFlags flags) { QEMUIOVector qiov; @@ -651,15 +624,15 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, } qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS, &qiov, is_write, flags); } /* return < 0 if error. See bdrv_write() for the return codes */ -int bdrv_read(BlockDriverState *bs, int64_t sector_num, +int bdrv_read(BdrvChild *child, int64_t sector_num, uint8_t *buf, int nb_sectors) { - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); + return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0); } /* Return < 0 if error. 
Important errors are: @@ -668,30 +641,39 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, -EINVAL Invalid sector number or nb_sectors -EACCES Trying to write a read-only device */ -int bdrv_write(BlockDriverState *bs, int64_t sector_num, +int bdrv_write(BdrvChild *child, int64_t sector_num, const uint8_t *buf, int nb_sectors) { - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); + return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0); } -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, + int count, BdrvRequestFlags flags) { - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = NULL, + .iov_len = count, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(child, offset, &qiov, true, + BDRV_REQ_ZERO_WRITE | flags); } /* - * Completely zero out a block device with the help of bdrv_write_zeroes. + * Completely zero out a block device with the help of bdrv_pwrite_zeroes. * The operation is sped up by checking the block status and only writing * zeroes to the device if they currently do not return zeroes. Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP, + * BDRV_REQ_FUA). * * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). */ -int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) { int64_t target_sectors, ret, nb_sectors, sector_num = 0; + BlockDriverState *bs = child->bs; BlockDriverState *file; int n; @@ -715,7 +697,8 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) sector_num += n; continue; } - ret = bdrv_write_zeroes(bs, sector_num, n, flags); + ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, flags); if (ret < 0) { error_report("error writing zeroes at sector %" PRId64 ": %s", sector_num, strerror(-ret)); @@ -725,33 +708,39 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(child, offset, qiov, false, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) { QEMUIOVector qiov; struct iovec iov = { .iov_base = (void *)buf, .iov_len = bytes, }; - int ret; if (bytes < 0) { return -EINVAL; } qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); - if (ret < 0) { - return ret; - } - - return bytes; + return bdrv_preadv(child, offset, &qiov); } -int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) { int ret; - ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + ret = bdrv_prwv_co(child, offset, qiov, true, 0); if (ret < 0) { return ret; } @@ -759,8 +748,7 @@ int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) return qiov->size; } -int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int bytes) +int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) { QEMUIOVector qiov; struct iovec 
iov = { @@ -773,7 +761,7 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, } qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_pwritev(bs, offset, &qiov); + return bdrv_pwritev(child, offset, &qiov); } /* @@ -782,17 +770,17 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, * * Returns 0 on success, -errno in error cases. */ -int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, - const void *buf, int count) +int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, + const void *buf, int count) { int ret; - ret = bdrv_pwrite(bs, offset, buf, count); + ret = bdrv_pwrite(child, offset, buf, count); if (ret < 0) { return ret; } - ret = bdrv_flush(bs); + ret = bdrv_flush(child->bs); if (ret < 0) { return ret; } @@ -800,8 +788,117 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine); +} + +static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int64_t sector_num; + unsigned int nb_sectors; + + assert(!(flags & ~BDRV_REQ_MASK)); + + if (drv->bdrv_co_preadv) { + return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); + } + + sector_num = offset >> BDRV_SECTOR_BITS; + nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); + + if (drv->bdrv_co_readv) { + return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + return co.ret; + } + } +} + +static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int64_t sector_num; + unsigned int nb_sectors; + int ret; + + assert(!(flags & ~BDRV_REQ_MASK)); + + if (drv->bdrv_co_pwritev) { + ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; + goto emulate_flags; + } + + sector_num = offset >> BDRV_SECTOR_BITS; + nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); + + if (drv->bdrv_co_writev_flags) { + ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, + flags & bs->supported_write_flags); + flags &= ~bs->supported_write_flags; + } else if (drv->bdrv_co_writev) { + assert(!bs->supported_write_flags); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + +emulate_flags: + if (ret == 0 && (flags & BDRV_REQ_FUA)) { + ret = bdrv_co_flush(bs); + } + + return ret; +} + 
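The bdrv_driver_pwritev() hunk above introduces the flag-passthrough pattern this series relies on: hand the driver only the flags it declares in supported_write_flags, strip those from the local copy, and emulate whatever is left over, here BDRV_REQ_FUA by issuing a flush once the write has succeeded. The sketch below is a minimal standalone illustration of that fallback logic, not QEMU code; the Dev type and its write/flush callbacks are hypothetical stand-ins for the driver interface.

#include <stdio.h>

/* Hypothetical stand-in for BDRV_REQ_FUA. */
enum { REQ_FUA = 1 << 0 };

typedef struct Dev Dev;
struct Dev {
    unsigned supported_write_flags;      /* flags the backend honours natively */
    int (*write)(Dev *d, unsigned flags);
    int (*flush)(Dev *d);
};

/* Same shape as the emulate_flags path above: forward only the supported
 * flags, then emulate the leftovers (FUA -> explicit flush) on success. */
static int dev_pwritev(Dev *d, unsigned flags)
{
    int ret = d->write(d, flags & d->supported_write_flags);

    flags &= ~d->supported_write_flags;
    if (ret == 0 && (flags & REQ_FUA)) {
        ret = d->flush(d);               /* no native FUA: flush instead */
    }
    return ret;
}

static int stub_write(Dev *d, unsigned flags)
{
    (void)d;
    printf("write, flags=%#x\n", flags);
    return 0;
}

static int stub_flush(Dev *d)
{
    (void)d;
    printf("flush\n");
    return 0;
}

int main(void)
{
    Dev no_fua = { 0, stub_write, stub_flush };
    return dev_pwritev(&no_fua, REQ_FUA);   /* prints the write, then a flush */
}

With a backend that sets REQ_FUA in supported_write_flags, the flag is forwarded as-is and the flush branch never runs, which is how bs->supported_write_flags short-circuits the emulation in the hunk above.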
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) + int64_t offset, unsigned int bytes, QEMUIOVector *qiov) { /* Perform I/O through a temporary buffer so that users who scribble over * their read buffer while the operation is in progress do not end up @@ -813,21 +910,20 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, BlockDriver *drv = bs->drv; struct iovec iov; QEMUIOVector bounce_qiov; - int64_t cluster_sector_num; - int cluster_nb_sectors; + int64_t cluster_offset; + unsigned int cluster_bytes; size_t skip_bytes; int ret; /* Cover entire cluster so no additional backing file I/O is required when * allocating cluster in the image file. */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); - trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); + trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, + cluster_offset, cluster_bytes); - iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_len = cluster_bytes; iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); if (bounce_buffer == NULL) { ret = -ENOMEM; @@ -836,22 +932,24 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, qemu_iovec_init_external(&bounce_qiov, &iov, 1); - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); + ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes, + &bounce_qiov, 0); if (ret < 0) { goto err; } - if (drv->bdrv_co_write_zeroes && + if (drv->bdrv_co_pwrite_zeroes && buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); + /* FIXME: Should we (perhaps conditionally) be setting + * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy + * that still correctly reads as zero? */ + ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0); } else { /* This does not change the data on the disk, it is not necessary * to flush even in cache=writethrough mode. */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); + ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes, + &bounce_qiov, 0); } if (ret < 0) { @@ -862,9 +960,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, goto err; } - skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); + skip_bytes = offset - cluster_offset; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes); err: qemu_vfree(bounce_buffer); @@ -873,23 +970,31 @@ err: /* * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. + * handles copy on read, zeroing after EOF, and fragmentation of large + * reads; any other features must be implemented by the caller. 
*/ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, int64_t align, QEMUIOVector *qiov, int flags) { - BlockDriver *drv = bs->drv; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int64_t total_bytes, max_bytes; + int ret = 0; + uint64_t bytes_remaining = bytes; + int max_transfer; - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(is_power_of_2(align)); + assert((offset & (align - 1)) == 0); + assert((bytes & (align - 1)) == 0); assert(!qiov || bytes == qiov->size); assert((bs->open_flags & BDRV_O_NO_IO) == 0); + max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), + align); + + /* TODO: We would need a per-BDS .supported_read_flags and + * potential fallback support, if we ever implement any read flags + * to pass through to drivers. For now, there aren't any + * passthrough flags. */ + assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { @@ -906,76 +1011,77 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } if (flags & BDRV_REQ_COPY_ON_READ) { + int64_t start_sector = offset >> BDRV_SECTOR_BITS; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + unsigned int nb_sectors = end_sector - start_sector; int pnum; - ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum); if (ret < 0) { goto out; } if (!ret || pnum != nb_sectors) { - ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov); goto out; } } - /* Forward the request to the BlockDriver */ - if (!bs->zero_beyond_eof) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else { - /* Read zeros after EOF */ - int64_t total_sectors, max_nb_sectors; + /* Forward the request to the BlockDriver, possibly fragmenting it */ + total_bytes = bdrv_getlength(bs); + if (total_bytes < 0) { + ret = total_bytes; + goto out; + } - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - ret = total_sectors; - goto out; - } + max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); + if (bytes <= max_bytes && bytes <= max_transfer) { + ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); + goto out; + } + + while (bytes_remaining) { + int num; - max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), - align >> BDRV_SECTOR_BITS); - if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else if (max_nb_sectors > 0) { + if (max_bytes) { QEMUIOVector local_qiov; + num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); + assert(num); qemu_iovec_init(&local_qiov, qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, - max_nb_sectors * BDRV_SECTOR_SIZE); - - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); + ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, + num, &local_qiov, 0); + max_bytes -= num; qemu_iovec_destroy(&local_qiov); } else { - ret = 0; + num = bytes_remaining; + ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, + bytes_remaining); } - - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_sectors < sector_num 
+ nb_sectors) { - uint64_t offset = MAX(0, total_sectors - sector_num); - uint64_t bytes = (sector_num + nb_sectors - offset) * - BDRV_SECTOR_SIZE; - qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + if (ret < 0) { + goto out; } + bytes_remaining -= num; } out: - return ret; + return ret < 0 ? ret : 0; } /* * Handle a read request in coroutine context */ -int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, +int coroutine_fn bdrv_co_preadv(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { + BlockDriverState *bs = child->bs; BlockDriver *drv = bs->drv; BdrvTrackedRequest req; - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint64_t align = bs->bl.request_alignment; uint8_t *head_buf = NULL; uint8_t *tail_buf = NULL; QEMUIOVector local_qiov; @@ -996,11 +1102,6 @@ int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, flags |= BDRV_REQ_COPY_ON_READ; } - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - throttle_group_co_io_limits_intercept(bs, bytes, false); - } - /* Align read if necessary by padding qiov */ if (offset & (align - 1)) { head_buf = qemu_blockalign(bs, align); @@ -1041,7 +1142,7 @@ int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, return ret; } -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, +static int coroutine_fn bdrv_co_do_readv(BdrvChild *child, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1049,67 +1150,56 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, return -EINVAL; } - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); } -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { - trace_bdrv_co_readv(bs, sector_num, nb_sectors); + trace_bdrv_co_readv(child->bs, sector_num, nb_sectors); - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); + return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0); } -int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); +/* Maximum buffer for write zeroes fallback, in bytes */ +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_NO_SERIALISING); -} - -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - -#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 - -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; QEMUIOVector qiov; struct iovec iov = {0}; int ret = 0; + bool need_flush = false; + int head = 0; + int tail = 0; + + int max_write_zeroes = 
MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); + int alignment = MAX(bs->bl.pwrite_zeroes_alignment, + bs->bl.request_alignment); - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, - BDRV_REQUEST_MAX_SECTORS); + assert(alignment % bs->bl.request_alignment == 0); + head = offset % alignment; + tail = (offset + count) % alignment; + max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); + assert(max_write_zeroes >= bs->bl.request_alignment); - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; + while (count > 0 && !ret) { + int num = count; /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. + * to be aligned, and that unaligned requests do not cross cluster + * boundaries. */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. */ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } + if (head) { + /* Make a small request up to the first aligned sector. */ + num = MIN(count, alignment - head); + head = 0; + } else if (tail && num > alignment) { + /* Shorten the request to the last aligned sector. */ + num -= tail; } /* limit request size */ @@ -1119,64 +1209,90 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, ret = -ENOTSUP; /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + if (drv->bdrv_co_pwrite_zeroes) { + ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, + flags & bs->supported_zero_flags); + if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && + !(bs->supported_zero_flags & BDRV_REQ_FUA)) { + need_flush = true; + } + } else { + assert(!bs->supported_zero_flags); } if (ret == -ENOTSUP) { /* Fall back to bounce buffer if write zeroes is unsupported */ - int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_WRITE_ZEROES_BOUNCE_BUFFER); - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; + BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; + + if ((flags & BDRV_REQ_FUA) && + !(bs->supported_write_flags & BDRV_REQ_FUA)) { + /* No need for bdrv_driver_pwritev() to do a fallback + * flush on each chunk; use just one at the end */ + write_flags &= ~BDRV_REQ_FUA; + need_flush = true; + } + num = MIN(num, max_transfer); + iov.iov_len = num; if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + iov.iov_base = qemu_try_blockalign(bs, num); if (iov.iov_base == NULL) { ret = -ENOMEM; goto fail; } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + memset(iov.iov_base, 0, num); } qemu_iovec_init_external(&qiov, &iov, 1); - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags); /* Keep bounce buffer around if it is big enough for all * future requests. 
*/ - if (num < max_xfer_len) { + if (num < max_transfer) { qemu_vfree(iov.iov_base); iov.iov_base = NULL; } } - sector_num += num; - nb_sectors -= num; + offset += num; + count -= num; } fail: + if (ret == 0 && need_flush) { + ret = bdrv_co_flush(bs); + } qemu_vfree(iov.iov_base); return ret; } /* - * Forwards an already correctly aligned write request to the BlockDriver. + * Forwards an already correctly aligned write request to the BlockDriver, + * after possibly fragmenting it. */ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) + int64_t align, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; bool waited; int ret; - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + int64_t start_sector = offset >> BDRV_SECTOR_BITS; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + uint64_t bytes_remaining = bytes; + int max_transfer; - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(is_power_of_2(align)); + assert((offset & (align - 1)) == 0); + assert((bytes & (align - 1)) == 0); assert(!qiov || bytes == qiov->size); assert((bs->open_flags & BDRV_O_NO_IO) == 0); + assert(!(flags & ~BDRV_REQ_MASK)); + max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), + align); waited = wait_serialising_requests(req); assert(!waited || !req->serialising); @@ -1186,7 +1302,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) { flags |= BDRV_REQ_ZERO_WRITE; if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { @@ -1198,32 +1314,48 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else if (drv->bdrv_co_writev_flags) { + ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); + } else if (bytes <= max_transfer) { bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, - flags); + ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); } else { - assert(drv->supported_write_flags == 0); bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } - bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); + while (bytes_remaining) { + int num = MIN(bytes_remaining, max_transfer); + QEMUIOVector local_qiov; + int local_flags = flags; + + assert(num); + if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && + !(bs->supported_write_flags & BDRV_REQ_FUA)) { + /* If FUA is going to be emulated by flush, we only + * need to flush on the last iteration */ + local_flags &= ~BDRV_REQ_FUA; + } + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); - if (ret == 0 && (flags & BDRV_REQ_FUA) && - !(drv->supported_write_flags & BDRV_REQ_FUA)) - { - ret = bdrv_co_flush(bs); + ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, + num, &local_qiov, 
local_flags); + qemu_iovec_destroy(&local_qiov); + if (ret < 0) { + break; + } + bytes_remaining -= num; + } } + bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - bdrv_set_dirty(bs, sector_num, nb_sectors); + ++bs->write_gen; + bdrv_set_dirty(bs, start_sector, end_sector - start_sector); if (bs->wr_highest_offset < offset + bytes) { bs->wr_highest_offset = offset + bytes; } if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + bs->total_sectors = MAX(bs->total_sectors, end_sector); + ret = 0; } return ret; @@ -1238,7 +1370,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, uint8_t *buf = NULL; QEMUIOVector local_qiov; struct iovec iov; - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint64_t align = bs->bl.request_alignment; unsigned int head_padding_bytes, tail_padding_bytes; int ret = 0; @@ -1271,7 +1403,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, memset(buf + head_padding_bytes, 0, zero_bytes); ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, - &local_qiov, + align, &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); if (ret < 0) { goto fail; @@ -1284,7 +1416,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, if (bytes >= align) { /* Write the aligned part in the middle. */ uint64_t aligned_bytes = bytes & ~(align - 1); - ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, + ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, align, NULL, flags); if (ret < 0) { goto fail; @@ -1308,7 +1440,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); memset(buf, 0, bytes); - ret = bdrv_aligned_pwritev(bs, req, offset, align, + ret = bdrv_aligned_pwritev(bs, req, offset, align, align, &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); } fail: @@ -1320,13 +1452,13 @@ fail: /* * Handle a write request in coroutine context */ -int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, +int coroutine_fn bdrv_co_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { + BlockDriverState *bs = child->bs; BdrvTrackedRequest req; - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint64_t align = bs->bl.request_alignment; uint8_t *head_buf = NULL; uint8_t *tail_buf = NULL; QEMUIOVector local_qiov; @@ -1346,11 +1478,6 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, return ret; } - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - throttle_group_co_io_limits_intercept(bs, bytes, true); - } - /* * Align write if necessary by performing a read-modify-write cycle. * Pad qiov with the read parts and be sure to have a tracked request not @@ -1392,6 +1519,14 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, bytes += offset & (align - 1); offset = offset & ~(align - 1); + + /* We have read the tail already if the request is smaller + * than one aligned block. + */ + if (bytes < align) { + qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes); + bytes = align; + } } if ((offset + bytes) & (align - 1)) { @@ -1431,7 +1566,7 @@ int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, bytes = ROUND_UP(bytes, align); } - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, align, use_local_qiov ? 
&local_qiov : qiov, flags); @@ -1447,7 +1582,7 @@ out: return ret; } -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, +static int coroutine_fn bdrv_co_do_writev(BdrvChild *child, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1455,30 +1590,29 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, return -EINVAL; } - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); } -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, +int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - trace_bdrv_co_writev(bs, sector_num, nb_sectors); + trace_bdrv_co_writev(child->bs, sector_num, nb_sectors); - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); + return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0); } -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) +int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, + int count, BdrvRequestFlags flags) { - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags); - if (!(bs->open_flags & BDRV_O_UNMAP)) { + if (!(child->bs->open_flags & BDRV_O_UNMAP)) { flags &= ~BDRV_REQ_MAY_UNMAP; } - return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); + return bdrv_co_pwritev(child, offset, count, NULL, + BDRV_REQ_ZERO_WRITE | flags); } typedef struct BdrvCoGetBlockStatusData { @@ -1663,8 +1797,9 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs, } else { AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); - qemu_coroutine_enter(co, &data); + co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry, + &data); + qemu_coroutine_enter(co); while (!data.done) { aio_poll(aio_context, true); } @@ -1766,273 +1901,134 @@ int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); } -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = size, - }; +typedef struct BdrvVmstateCo { + BlockDriverState *bs; + QEMUIOVector *qiov; + int64_t pos; + bool is_read; + int ret; +} BdrvVmstateCo; - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_writev_vmstate(bs, &qiov, pos); -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +static int coroutine_fn +bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, + bool is_read) { BlockDriver *drv = bs->drv; if (!drv) { return -ENOMEDIUM; - } else if (drv->bdrv_save_vmstate) { - return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (drv->bdrv_load_vmstate) { + return is_read ? 
drv->bdrv_load_vmstate(bs, qiov, pos) + : drv->bdrv_save_vmstate(bs, qiov, pos); } else if (bs->file) { - return bdrv_writev_vmstate(bs->file->bs, qiov, pos); + return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); } return -ENOTSUP; } -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (drv->bdrv_load_vmstate) - return drv->bdrv_load_vmstate(bs, buf, pos, size); - if (bs->file) - return bdrv_load_vmstate(bs->file->bs, buf, pos, size); - return -ENOTSUP; -} - -/**************************************************************/ -/* async I/Os */ - -BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); -} - -BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) { - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); + BdrvVmstateCo *co = opaque; + co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); } -BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) +static inline int +bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, + bool is_read) { - trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, - BDRV_REQ_ZERO_WRITE | flags, - cb, opaque, true); -} - - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) -{ - int i; + if (qemu_in_coroutine()) { + return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); + } else { + BdrvVmstateCo data = { + .bs = bs, + .qiov = qiov, + .pos = pos, + .is_read = is_read, + .ret = -EINPROGRESS, + }; + Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + qemu_coroutine_enter(co); + while (data.ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(bs), true); } - g_free(mcb->callbacks[i].free_qiov); + return data.ret; } } -static void multiwrite_cb(void *opaque, int ret) +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) { - MultiwriteCB *mcb = opaque; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + int ret; - trace_multiwrite_cb(mcb, ret); + qemu_iovec_init_external(&qiov, &iov, 1); - if (ret < 0 && !mcb->error) { - mcb->error = ret; + ret = bdrv_writev_vmstate(bs, &qiov, pos); + if (ret < 0) { + return ret; } - mcb->num_requests--; - if (mcb->num_requests == 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } + return size; } -static int multiwrite_req_compare(const void *a, const void *b) +int bdrv_writev_vmstate(BlockDriverState *bs, 
QEMUIOVector *qiov, int64_t pos) { - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } + return bdrv_rw_vmstate(bs, qiov, pos, false); } -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) { - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. - if (reqs[i].sector <= oldreq_last) { - merge = 1; - } - - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > - bs->bl.max_iov) { - merge = 0; - } - - if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + - reqs[i].nb_sectors > bs->bl.max_transfer_length) { - merge = 0; - } - - if (merge) { - size_t size; - QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); - qemu_iovec_init(qiov, - reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); - - // Add the first request to the merged one. If the requests are - // overlapping, drop the last sectors of the first request. - size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); - - // We shouldn't need to add any zeros between the two requests - assert (reqs[i].sector <= oldreq_last); - - // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); - - // Add tail of first request, if necessary - if (qiov->size < reqs[outidx].qiov->size) { - qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, - reqs[outidx].qiov->size - qiov->size); - } - - reqs[outidx].nb_sectors = qiov->size >> 9; - reqs[outidx].qiov = qiov; - - mcb->callbacks[i].free_qiov = reqs[outidx].qiov; - } else { - outidx++; - reqs[outidx].sector = reqs[i].sector; - reqs[outidx].nb_sectors = reqs[i].nb_sectors; - reqs[outidx].qiov = reqs[i].qiov; - } - } + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = buf, + .iov_len = size, + }; + int ret; - if (bs->blk) { - block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, - num_reqs - outidx - 1); + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_readv_vmstate(bs, &qiov, pos); + if (ret < 0) { + return ret; } - return outidx + 1; + return size; } -/* - * Submit multiple AIO write requests at once. - * - * On success, the function returns 0 and all requests in the reqs array have - * been submitted. In error case this function returns -1, and any of the - * requests may or may not be submitted yet. In particular, this means that the - * callback will be called for some of the requests, for others it won't. The - * caller must check the error field of the BlockRequest to wait for the right - * callbacks (if error != 0, no callback will be called). - * - * The implementation may modify the contents of the reqs array, e.g. to merge - * requests. 
However, the fields opaque and error are left unmodified as they - * are used to signal failure for a single request to the caller. - */ -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { - MultiwriteCB *mcb; - int i; - - /* don't submit writes if we don't have a medium */ - if (bs->drv == NULL) { - for (i = 0; i < num_reqs; i++) { - reqs[i].error = -ENOMEDIUM; - } - return -1; - } - - if (num_reqs == 0) { - return 0; - } - - // Create MultiwriteCB structure - mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); - mcb->num_requests = 0; - mcb->num_callbacks = num_reqs; + return bdrv_rw_vmstate(bs, qiov, pos, true); +} - for (i = 0; i < num_reqs; i++) { - mcb->callbacks[i].cb = reqs[i].cb; - mcb->callbacks[i].opaque = reqs[i].opaque; - } +/**************************************************************/ +/* async I/Os */ - // Check for mergable requests - num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); +BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque); - trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); + return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, + 0, cb, opaque, false); +} - /* Run the aio requests. */ - mcb->num_requests = num_reqs; - for (i = 0; i < num_reqs; i++) { - bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, reqs[i].flags, - multiwrite_cb, mcb, - true); - } +BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque); - return 0; + assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size); + return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov, + 0, cb, opaque, true); } void bdrv_aio_cancel(BlockAIOCB *acb) @@ -2064,82 +2060,30 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb) /**************************************************************/ /* async block device emulation */ -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; - -static void bdrv_aio_bh_cb(void *opaque) -{ - BlockAIOCBSync *acb = opaque; - - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); -} - -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int is_write) - -{ - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, 
sector_num, acb->bounce, nb_sectors); - } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); - - return &acb->common; -} - -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); -} +typedef struct BlockRequest { + union { + /* Used during read, write, trim */ + struct { + int64_t offset; + int bytes; + int flags; + QEMUIOVector *qiov; + }; + /* Used during ioctl */ + struct { + int req; + void *buf; + }; + }; + BlockCompletionFunc *cb; + void *opaque; + int error; +} BlockRequest; typedef struct BlockAIOCBCoroutine { BlockAIOCB common; + BdrvChild *child; BlockRequest req; bool is_write; bool need_bh; @@ -2183,42 +2127,40 @@ static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) static void coroutine_fn bdrv_co_do_rw(void *opaque) { BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset, + acb->req.qiov->size, acb->req.qiov, acb->req.flags); } else { - acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset, + acb->req.qiov->size, acb->req.qiov, acb->req.flags); } bdrv_co_complete(acb); } -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) +static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, + int64_t offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) { Coroutine *co; BlockAIOCBCoroutine *acb; - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque); + acb->child = child; acb->need_bh = true; acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; + acb->req.offset = offset; acb->req.qiov = qiov; acb->req.flags = flags; acb->is_write = is_write; - co = qemu_coroutine_create(bdrv_co_do_rw); - qemu_coroutine_enter(co, acb); + co = qemu_coroutine_create(bdrv_co_do_rw, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; @@ -2245,38 +2187,37 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, acb->need_bh = true; acb->req.error = -EINPROGRESS; - co = qemu_coroutine_create(bdrv_aio_flush_co_entry); - qemu_coroutine_enter(co, acb); + co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; } -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque) { BlockAIOCBCoroutine *acb = opaque; BlockDriverState *bs = acb->common.bs; - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + acb->req.error = bdrv_co_pdiscard(bs, 
acb->req.offset, acb->req.bytes); bdrv_co_complete(acb); } -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count, + BlockCompletionFunc *cb, void *opaque) { Coroutine *co; BlockAIOCBCoroutine *acb; - trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + trace_bdrv_aio_pdiscard(bs, offset, count, opaque); acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); acb->need_bh = true; acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - co = qemu_coroutine_create(bdrv_aio_discard_co_entry); - qemu_coroutine_enter(co, acb); + acb->req.offset = offset; + acb->req.bytes = count; + co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; @@ -2314,62 +2255,15 @@ void qemu_aio_unref(void *p) /**************************************************************/ /* Coroutine block device emulation */ -typedef struct CoroutineIOCompletion { - Coroutine *coroutine; +typedef struct FlushCo { + BlockDriverState *bs; int ret; -} CoroutineIOCompletion; - -static void bdrv_co_io_em_complete(void *opaque, int ret) -{ - CoroutineIOCompletion *co = opaque; - - co->ret = ret; - qemu_coroutine_enter(co->coroutine, NULL); -} - -static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *iov, - bool is_write) -{ - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; - - if (is_write) { - acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } else { - acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } - - trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); - if (!acb) { - return -EIO; - } - qemu_coroutine_yield(); - - return co.ret; -} - -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); -} +} FlushCo; -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); -} static void coroutine_fn bdrv_flush_co_entry(void *opaque) { - RwCo *rwco = opaque; + FlushCo *rwco = opaque; rwco->ret = bdrv_co_flush(rwco->bs); } @@ -2386,6 +2280,15 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); + int current_gen = bs->write_gen; + + /* Wait until any previous flushes are completed */ + while (bs->active_flush_req != NULL) { + qemu_co_queue_wait(&bs->flush_queue); + } + + bs->active_flush_req = &req; + /* Write back all layers by calling one driver function */ if (bs->drv->bdrv_co_flush) { ret = bs->drv->bdrv_co_flush(bs); @@ -2406,6 +2309,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) goto flush_parent; } + /* Check if we really need to flush anything */ + if (bs->flushed_gen == current_gen) { + goto flush_parent; + } + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); if (bs->drv->bdrv_co_flush_to_disk) { ret = bs->drv->bdrv_co_flush_to_disk(bs); @@ -2436,6 +2344,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) */ ret = 0; } + if (ret < 0) { goto out; } @@ -2446,6 +2355,12 @@ int coroutine_fn 
bdrv_co_flush(BlockDriverState *bs) flush_parent: ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; out: + /* Notify any pending flushes that we have completed */ + bs->flushed_gen = current_gen; + bs->active_flush_req = NULL; + /* Return value is ignored - it's ok if wait queue is empty */ + qemu_co_queue_next(&bs->flush_queue); + tracked_request_end(&req); return ret; } @@ -2453,51 +2368,52 @@ out: int bdrv_flush(BlockDriverState *bs) { Coroutine *co; - RwCo rwco = { + FlushCo flush_co = { .bs = bs, .ret = NOT_DONE, }; if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ - bdrv_flush_co_entry(&rwco); + bdrv_flush_co_entry(&flush_co); } else { AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_flush_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { + co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co); + qemu_coroutine_enter(co); + while (flush_co.ret == NOT_DONE) { aio_poll(aio_context, true); } } - return rwco.ret; + return flush_co.ret; } typedef struct DiscardCo { BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; + int64_t offset; + int count; int ret; } DiscardCo; -static void coroutine_fn bdrv_discard_co_entry(void *opaque) +static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) { DiscardCo *rwco = opaque; - rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); + rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count); } -int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, + int count) { BdrvTrackedRequest req; - int max_discard, ret; + int max_pdiscard, ret; + int head, align; if (!bs->drv) { return -ENOMEDIUM; } - ret = bdrv_check_request(bs, sector_num, nb_sectors); + ret = bdrv_check_byte_request(bs, offset, count); if (ret < 0) { return ret; } else if (bs->read_only) { @@ -2510,44 +2426,49 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return 0; } - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { return 0; } - tracked_request_begin(&req, bs, sector_num, nb_sectors, - BDRV_TRACKED_DISCARD); - bdrv_set_dirty(bs, sector_num, nb_sectors); + /* Discard is advisory, so ignore any unaligned head or tail */ + align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); + assert(align % bs->bl.request_alignment == 0); + head = offset % align; + if (head) { + head = MIN(count, align - head); + count -= head; + offset += head; + } + count = QEMU_ALIGN_DOWN(count, align); + if (!count) { + return 0; + } - max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); - while (nb_sectors > 0) { - int ret; - int num = nb_sectors; - - /* align request */ - if (bs->bl.discard_alignment && - num >= bs->bl.discard_alignment && - sector_num % bs->bl.discard_alignment) { - if (num > bs->bl.discard_alignment) { - num = bs->bl.discard_alignment; - } - num -= sector_num % bs->bl.discard_alignment; - } + tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD); - /* limit request size */ - if (num > max_discard) { - num = max_discard; - } + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); + if (ret < 0) { + goto out; + } - if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, sector_num, num); + max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), + 
align); + assert(max_pdiscard); + + while (count > 0) { + int ret; + int num = MIN(count, max_pdiscard); + + if (bs->drv->bdrv_co_pdiscard) { + ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); } else { BlockAIOCB *acb; CoroutineIOCompletion co = { .coroutine = qemu_coroutine_self(), }; - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); + acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, + bdrv_co_io_em_complete, &co); if (acb == NULL) { ret = -EIO; goto out; @@ -2560,33 +2481,36 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, goto out; } - sector_num += num; - nb_sectors -= num; + offset += num; + count -= num; } ret = 0; out: + ++bs->write_gen; + bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS, + req.bytes >> BDRV_SECTOR_BITS); tracked_request_end(&req); return ret; } -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count) { Coroutine *co; DiscardCo rwco = { .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, + .offset = offset, + .count = count, .ret = NOT_DONE, }; if (qemu_in_coroutine()) { /* Fast-path if already in coroutine context */ - bdrv_discard_co_entry(&rwco); + bdrv_pdiscard_co_entry(&rwco); } else { AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_discard_co_entry); - qemu_coroutine_enter(co, &rwco); + co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); + qemu_coroutine_enter(co); while (rwco.ret == NOT_DONE) { aio_poll(aio_context, true); } @@ -2595,19 +2519,6 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) return rwco.ret; } -typedef struct { - CoroutineIOCompletion *co; - QEMUBH *bh; -} BdrvIoctlCompletionData; - -static void bdrv_ioctl_bh_cb(void *opaque) -{ - BdrvIoctlCompletionData *data = opaque; - - bdrv_co_io_em_complete(data->co, -ENOTSUP); - qemu_bh_delete(data->bh); -} - static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) { BlockDriver *drv = bs->drv; @@ -2625,11 +2536,8 @@ static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); if (!acb) { - BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); - data->bh = aio_bh_new(bdrv_get_aio_context(bs), - bdrv_ioctl_bh_cb, data); - data->co = &co; - qemu_bh_schedule(data->bh); + co.ret = -ENOTSUP; + goto out; } qemu_coroutine_yield(); out: @@ -2664,9 +2572,9 @@ int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) /* Fast-path if already in coroutine context */ bdrv_co_ioctl_entry(&data); } else { - Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); + Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry, &data); - qemu_coroutine_enter(co, &data); + qemu_coroutine_enter(co); while (data.ret == -EINPROGRESS) { aio_poll(bdrv_get_aio_context(bs), true); } @@ -2694,8 +2602,8 @@ BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, acb->req.error = -EINPROGRESS; acb->req.req = req; acb->req.buf = buf; - co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); - qemu_coroutine_enter(co, acb); + co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry, acb); + qemu_coroutine_enter(co); bdrv_co_maybe_schedule_bh(acb); return &acb->common; @@ -2763,48 +2671,66 @@ void bdrv_add_before_write_notifier(BlockDriverState *bs, void bdrv_io_plug(BlockDriverState *bs) { - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_plug) { - drv->bdrv_io_plug(bs); - } else 
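Both bdrv_flush and the renamed bdrv_pdiscard above share one synchronous-wrapper shape: run the coroutine entry directly if the caller is already in coroutine context, otherwise create a coroutine and poll the AioContext until the NOT_DONE sentinel is overwritten. A toy model of that shape, with plain functions standing in for the real coroutine and aio_poll machinery:

```c
#include <stdbool.h>
#include <stdio.h>

#define NOT_DONE 0x7fffffff            /* in-flight sentinel, as in the patch */

typedef struct { int ret; } FlushCo;

/* stand-in for the coroutine entry point (cf. bdrv_flush_co_entry) */
static void flush_entry(FlushCo *co)
{
    co->ret = 0;                       /* pretend the flush succeeded */
}

/* stand-in for one aio_poll() pass that eventually completes the op */
static void poll_once(FlushCo *co)
{
    flush_entry(co);
}

static int my_flush(bool in_coroutine)
{
    FlushCo co = { .ret = NOT_DONE };

    if (in_coroutine) {
        flush_entry(&co);              /* fast path: call directly */
    } else {
        while (co.ret == NOT_DONE) {   /* slow path: drive the event loop */
            poll_once(&co);
        }
    }
    return co.ret;
}

int main(void)
{
    printf("flush returned %d\n", my_flush(false));
    return 0;
}
```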
if (bs->file) { - bdrv_io_plug(bs->file->bs); + BdrvChild *child; + + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_plug(child->bs); + } + + if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } } } void bdrv_io_unplug(BlockDriverState *bs) { - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_unplug) { - drv->bdrv_io_unplug(bs); - } else if (bs->file) { - bdrv_io_unplug(bs->file->bs); + BdrvChild *child; + + assert(bs->io_plugged); + if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } } -} -void bdrv_flush_io_queue(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_flush_io_queue) { - drv->bdrv_flush_io_queue(bs); - } else if (bs->file) { - bdrv_flush_io_queue(bs->file->bs); + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_unplug(child->bs); } - bdrv_start_throttled_reqs(bs); } -void bdrv_drained_begin(BlockDriverState *bs) +void bdrv_io_unplugged_begin(BlockDriverState *bs) { - if (!bs->quiesce_counter++) { - aio_disable_external(bdrv_get_aio_context(bs)); + BdrvChild *child; + + if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } + } + + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_unplugged_begin(child->bs); } - bdrv_drain(bs); } -void bdrv_drained_end(BlockDriverState *bs) +void bdrv_io_unplugged_end(BlockDriverState *bs) { - assert(bs->quiesce_counter > 0); - if (--bs->quiesce_counter > 0) { - return; + BdrvChild *child; + + assert(bs->io_plug_disabled); + QLIST_FOREACH(child, &bs->children, next) { + bdrv_io_unplugged_end(child->bs); + } + + if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) { + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } } - aio_enable_external(bdrv_get_aio_context(bs)); }
diff --git a/block/iscsi.c b/block/iscsi.c index 302baf84c..95ce9e139 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,7 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> - * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de> + * Copyright (c) 2012-2016 Peter Lieven <pl@kamp.de> * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -46,7 +46,6 @@ #ifdef __linux__ #include <scsi/sg.h> -#include <block/scsi.h> #endif typedef struct IscsiLun { @@ -62,7 +61,23 @@ typedef struct IscsiLun { struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; unsigned char *zeroblock; - unsigned long *allocationmap; + /* The allocmap tracks which clusters (pages) on the iSCSI target are + * allocated and which are not. In case a target returns zeros for + * unallocated pages (iscsilun->lbprz) we can directly return zeros instead + * of reading zeros over the wire if a read request falls within an + * unallocated block. As there are 3 possible states we need 2 bitmaps to + * track. allocmap_valid keeps track of whether QEMU's information about a + * page is valid. allocmap tracks if a page is allocated or not. In case QEMU has no + * valid information about a page, the corresponding allocmap entry should be + * switched to unallocated as well, to force a new lookup of the allocation + * status, since lookups are generally skipped if a page is suspected to be + * allocated. If an iSCSI target is opened with cache.direct = on, the + * allocmap_valid bitmap does not exist, turning all cached information invalid so + * that a fresh lookup is made for any page even if the allocmap entry says + * it is unallocated. */ + unsigned long *allocmap; + unsigned long *allocmap_valid; + long allocmap_size; int cluster_sectors; bool use_16_for_rw; bool write_protected; @@ -153,7 +168,7 @@ static void iscsi_co_generic_bh_cb(void *opaque) struct IscsiTask *iTask = opaque; iTask->complete = 1; qemu_bh_delete(iTask->bh); - qemu_coroutine_enter(iTask->co, NULL); + qemu_coroutine_enter(iTask->co); } static void iscsi_retry_timer_expired(void *opaque) @@ -161,7 +176,7 @@ static void iscsi_retry_timer_expired(void *opaque) struct IscsiTask *iTask = opaque; iTask->complete = 1; if (iTask->co) { - qemu_coroutine_enter(iTask->co, NULL); + qemu_coroutine_enter(iTask->co); } } @@ -401,53 +416,159 @@ static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun) return sector * BDRV_SECTOR_SIZE / iscsilun->block_size; } -static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, - IscsiLun *iscsilun) +static bool is_byte_request_lun_aligned(int64_t offset, int count, + IscsiLun *iscsilun) { - if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size || - (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) { - error_report("iSCSI misaligned request: " - "iscsilun->block_size %u, sector_num %" PRIi64 - ", nb_sectors %d", - iscsilun->block_size, sector_num, nb_sectors); - return 0; - } - return 1; + if (offset % iscsilun->block_size || count % iscsilun->block_size) { + error_report("iSCSI misaligned request: " + "iscsilun->block_size %u, offset %" PRIi64 + ", count %d", + iscsilun->block_size, offset, count); + return false; + } + return true; } -static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun) +static bool is_sector_request_lun_aligned(int64_t sector_num, int nb_sectors, + IscsiLun *iscsilun) { - return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, - iscsilun), - iscsilun->cluster_sectors)); + assert(nb_sectors <= BDRV_REQUEST_MAX_SECTORS); + return is_byte_request_lun_aligned(sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, + iscsilun); } -static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) +static void iscsi_allocmap_free(IscsiLun *iscsilun) { - if (iscsilun->allocationmap == NULL) { - return; + g_free(iscsilun->allocmap); + g_free(iscsilun->allocmap_valid); + iscsilun->allocmap = NULL; + iscsilun->allocmap_valid = NULL; +} + + +static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags) +{ + iscsi_allocmap_free(iscsilun); + + iscsilun->allocmap_size = + DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun), + iscsilun->cluster_sectors); + + iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size); + if (!iscsilun->allocmap) { + return -ENOMEM; + } + + if (open_flags & BDRV_O_NOCACHE) { + /* in case that cache.direct = on all allocmap entries are + * treated as invalid to force a relookup of the block + * status on every read request */ + return 0; } - bitmap_set(iscsilun->allocationmap, - sector_num / iscsilun->cluster_sectors, - DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); + +
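As the new IscsiLun comment explains, allocmap and allocmap_valid together encode three states per cluster. A self-contained illustration of the lookup order, using a single machine word per bitmap and hard-coded values purely for demonstration:

```c
#include <stdio.h>

enum cluster_state { ST_UNKNOWN, ST_ALLOCATED, ST_UNALLOCATED };

static unsigned long allocmap, allocmap_valid;  /* one word is enough here */

static enum cluster_state cluster_state(int i)
{
    if (!(allocmap_valid & (1UL << i))) {
        return ST_UNKNOWN;             /* no (or stale) cached information */
    }
    return (allocmap & (1UL << i)) ? ST_ALLOCATED : ST_UNALLOCATED;
}

int main(void)
{
    allocmap_valid = 0x6;              /* clusters 1 and 2 have known status */
    allocmap       = 0x2;              /* cluster 1 is allocated */

    for (int i = 0; i < 3; i++) {
        printf("cluster %d -> %d\n", i, cluster_state(i));
    }
    /* prints 0 (unknown), 1 (allocated), 2 (unallocated) */
    return 0;
}
```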
iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size); + if (!iscsilun->allocmap_valid) { + /* if we are under memory pressure free the allocmap as well */ + iscsi_allocmap_free(iscsilun); + return -ENOMEM; + } + + return 0; } -static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) +static void +iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors, bool allocated, bool valid) { - int64_t cluster_num, nb_clusters; - if (iscsilun->allocationmap == NULL) { + int64_t cl_num_expanded, nb_cls_expanded, cl_num_shrunk, nb_cls_shrunk; + + if (iscsilun->allocmap == NULL) { return; } - cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); - nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors - - cluster_num; - if (nb_clusters > 0) { - bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters); + /* expand to entirely contain all affected clusters */ + cl_num_expanded = sector_num / iscsilun->cluster_sectors; + nb_cls_expanded = DIV_ROUND_UP(sector_num + nb_sectors, + iscsilun->cluster_sectors) - cl_num_expanded; + /* shrink to touch only completely contained clusters */ + cl_num_shrunk = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); + nb_cls_shrunk = (sector_num + nb_sectors) / iscsilun->cluster_sectors + - cl_num_shrunk; + if (allocated) { + bitmap_set(iscsilun->allocmap, cl_num_expanded, nb_cls_expanded); + } else { + bitmap_clear(iscsilun->allocmap, cl_num_shrunk, nb_cls_shrunk); + } + + if (iscsilun->allocmap_valid == NULL) { + return; + } + if (valid) { + bitmap_set(iscsilun->allocmap_valid, cl_num_shrunk, nb_cls_shrunk); + } else { + bitmap_clear(iscsilun->allocmap_valid, cl_num_expanded, + nb_cls_expanded); + } +} + +static void +iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, true, true); +} + +static void +iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + /* Note: if cache.direct=on the fifth argument to iscsi_allocmap_update + * is ignored, so this will in effect be an iscsi_allocmap_set_invalid. 
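iscsi_allocmap_update above rounds in the conservative direction: state that can only cause extra work (allocated, invalid) is applied to every cluster the range touches, while state that lets QEMU skip work (unallocated, valid) is applied only to clusters the range covers completely. The two roundings, with illustrative numbers:

```c
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    /* illustrative: 8 sectors per cluster, request covers sectors [10, 26) */
    int64_t cluster_sectors = 8, sector_num = 10, nb_sectors = 16;

    /* expanded: every cluster the request touches, even partially */
    int64_t exp_first = sector_num / cluster_sectors;
    int64_t exp_count = DIV_ROUND_UP(sector_num + nb_sectors,
                                     cluster_sectors) - exp_first;

    /* shrunk: only clusters completely contained in the request */
    int64_t shr_first = DIV_ROUND_UP(sector_num, cluster_sectors);
    int64_t shr_count = (sector_num + nb_sectors) / cluster_sectors
                        - shr_first;

    printf("expanded: [%lld, +%lld)  shrunk: [%lld, +%lld)\n",
           (long long)exp_first, (long long)exp_count,
           (long long)shr_first, (long long)shr_count);
    /* expanded: clusters [1, +3)  shrunk: cluster [2, +1) */
    return 0;
}
```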
+ */ + iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, true); +} + +static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, false); +} + +static void iscsi_allocmap_invalidate(IscsiLun *iscsilun) +{ + if (iscsilun->allocmap) { + bitmap_zero(iscsilun->allocmap, iscsilun->allocmap_size); + } + if (iscsilun->allocmap_valid) { + bitmap_zero(iscsilun->allocmap_valid, iscsilun->allocmap_size); } } +static inline bool +iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + unsigned long size; + if (iscsilun->allocmap == NULL) { + return true; + } + size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); + return !(find_next_bit(iscsilun->allocmap, size, + sector_num / iscsilun->cluster_sectors) == size); +} + +static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun, + int64_t sector_num, int nb_sectors) +{ + unsigned long size; + if (iscsilun->allocmap_valid == NULL) { + return false; + } + size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); + return (find_next_zero_bit(iscsilun->allocmap_valid, size, + sector_num / iscsilun->cluster_sectors) == size); +} + static int coroutine_fn iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov, int flags) @@ -456,23 +577,23 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; - bool fua; + bool fua = flags & BDRV_REQ_FUA; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (fua) { + assert(iscsilun->dpofua); + } + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } - if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { - error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len " - "of %d sectors", nb_sectors, bs->bl.max_transfer_length); - return -EINVAL; + if (bs->bl.max_transfer) { + assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer); } lba = sector_qemu2lun(sector_num, iscsilun); num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: - fua = iscsilun->dpofua && (flags & BDRV_REQ_FUA); if (iscsilun->use_16_for_rw) { iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, @@ -505,34 +626,17 @@ retry: } if (iTask.status != SCSI_STATUS_GOOD) { + iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors); return iTask.err_code; } - iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors); return 0; } -static int coroutine_fn -iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return iscsi_co_writev_flags(bs, sector_num, nb_sectors, iov, 0); -} -static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, - int64_t sector_num, int nb_sectors) -{ - unsigned long size; - if (iscsilun->allocationmap == NULL) { - return true; - } - size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); - return !(find_next_bit(iscsilun->allocationmap, size, - sector_num / iscsilun->cluster_sectors) == size); -} - static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, @@ -546,7 +650,7 @@ static int64_t coroutine_fn 
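iscsi_allocmap_is_allocated above asks "is any cluster in this range allocated?" by checking whether find_next_bit finds a set bit before the end of the range; iscsi_allocmap_is_valid does the mirror-image test with find_next_zero_bit. A toy single-word version of the emptiness test (the real code uses QEMU's multi-word bitmap helpers):

```c
#include <stdbool.h>
#include <stdio.h>

/* toy find_next_bit over one word; returns size when no bit is found,
 * mimicking the convention of the real helper */
static unsigned find_next_bit(unsigned long map, unsigned size, unsigned start)
{
    for (unsigned i = start; i < size; i++) {
        if (map & (1UL << i)) {
            return i;
        }
    }
    return size;
}

/* the range counts as "allocated" if any cluster inside it has its bit set */
static bool any_allocated(unsigned long map, unsigned first, unsigned last)
{
    return find_next_bit(map, last + 1, first) != last + 1;
}

int main(void)
{
    unsigned long map = 0x10;                  /* only cluster 4 allocated */

    printf("%d %d\n",
           any_allocated(map, 0, 3),           /* 0: clusters 0-3 clear */
           any_allocated(map, 2, 6));          /* 1: cluster 4 is set */
    return 0;
}
```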
iscsi_co_get_block_status(BlockDriverState *bs, iscsi_co_init_iscsitask(iscsilun, &iTask); - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { ret = -EINVAL; goto out; } @@ -616,9 +720,9 @@ retry: } if (ret & BDRV_BLOCK_ZERO) { - iscsi_allocationmap_clear(iscsilun, sector_num, *pnum); + iscsi_allocmap_set_unallocated(iscsilun, sector_num, *pnum); } else { - iscsi_allocationmap_set(iscsilun, sector_num, *pnum); + iscsi_allocmap_set_allocated(iscsilun, sector_num, *pnum); } if (*pnum > nb_sectors) { @@ -643,26 +747,40 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, uint64_t lba; uint32_t num_sectors; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } - if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { - error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len " - "of %d sectors", nb_sectors, bs->bl.max_transfer_length); - return -EINVAL; + if (bs->bl.max_transfer) { + assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer); } - if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES && - !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { - int64_t ret; + /* if cache.direct is off and we have a valid entry in our allocation map + * we can skip checking the block status and directly return zeroes if + * the request falls within an unallocated area */ + if (iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) && + !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) { + qemu_iovec_memset(iov, 0, 0x00, iov->size); + return 0; + } + + if (nb_sectors >= ISCSI_CHECKALLOC_THRES && + !iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) && + !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) { int pnum; BlockDriverState *file; - ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum, &file); + /* check the block status from the beginning of the cluster + * containing the start sector */ + int64_t ret = iscsi_co_get_block_status(bs, + sector_num - sector_num % iscsilun->cluster_sectors, + BDRV_REQUEST_MAX_SECTORS, &pnum, &file); if (ret < 0) { return ret; } - if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) { + /* if the whole request falls into an unallocated area we can avoid + * to read and directly return zeroes instead */ + if (ret & BDRV_BLOCK_ZERO && + pnum >= nb_sectors + sector_num % iscsilun->cluster_sectors) { qemu_iovec_memset(iov, 0, 0x00, iov->size); return 0; } @@ -766,6 +884,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, acb->ioh->driver_status = 0; acb->ioh->host_status = 0; acb->ioh->resid = 0; + acb->ioh->status = status; #define SG_ERR_DRIVER_SENSE 0x08 @@ -837,6 +956,13 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, return &acb->common; } + if (acb->ioh->cmd_len > SCSI_CDB_MAX_SIZE) { + error_report("iSCSI: ioctl error CDB exceeds max size (%d > %d)", + acb->ioh->cmd_len, SCSI_CDB_MAX_SIZE); + qemu_aio_unref(acb); + return NULL; + } + acb->task = malloc(sizeof(struct scsi_task)); if (acb->task == NULL) { error_report("iSCSI: Failed to allocate task for scsi command. 
%s", @@ -916,29 +1042,26 @@ iscsi_getlength(BlockDriverState *bs) } static int -coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; struct unmap_list list; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; - } + assert(is_byte_request_lun_aligned(offset, count, iscsilun)); if (!iscsilun->lbp.lbpu) { /* UNMAP is not supported by the target */ return 0; } - list.lba = sector_qemu2lun(sector_num, iscsilun); - list.num = sector_qemu2lun(nb_sectors, iscsilun); + list.lba = offset / iscsilun->block_size; + list.num = count / iscsilun->block_size; iscsi_co_init_iscsitask(iscsilun, &iTask); retry: if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, - iscsi_co_generic_cb, &iTask) == NULL) { + iscsi_co_generic_cb, &iTask) == NULL) { return -ENOMEM; } @@ -968,14 +1091,15 @@ retry: return iTask.err_code; } - iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); return 0; } static int -coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; @@ -983,8 +1107,8 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, uint32_t nb_blocks; bool use_16_for_ws = iscsilun->use_16_for_rw; - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; + if (!is_byte_request_lun_aligned(offset, count, iscsilun)) { + return -ENOTSUP; } if (flags & BDRV_REQ_MAY_UNMAP) { @@ -1005,8 +1129,8 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, return -ENOTSUP; } - lba = sector_qemu2lun(sector_num, iscsilun); - nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); + lba = offset / iscsilun->block_size; + nb_blocks = count / iscsilun->block_size; if (iscsilun->zeroblock == NULL) { iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size); @@ -1058,13 +1182,17 @@ retry: } if (iTask.status != SCSI_STATUS_GOOD) { + iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); return iTask.err_code; } if (flags & BDRV_REQ_MAY_UNMAP) { - iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); } else { - iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + iscsi_allocmap_set_allocated(iscsilun, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); } return 0; @@ -1555,6 +1683,10 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, task = NULL; iscsi_modesense_sync(iscsilun); + if (iscsilun->dpofua) { + bs->supported_write_flags = BDRV_REQ_FUA; + } + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; /* Check the write protect flag of the LUN if we want to write */ if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && @@ -1571,14 +1703,13 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, goto out; } bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); - bs->request_alignment = iscsilun->block_size; /* We don't have any emulation for devices other than disks and CD-ROMs, so * this must be sg ioctl 
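Because iscsi_co_pdiscard above now receives byte offsets, building the UNMAP descriptor reduces to two divisions by the LUN block size, and alignment is asserted rather than re-checked (the block layer already splits unaligned requests). A trivial sketch; the struct only mirrors the shape of libiscsi's unmap_list, and the sample values are arbitrary:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct unmap_entry { uint64_t lba; uint32_t num; };

static struct unmap_entry byte_range_to_unmap(int64_t offset, int count,
                                              int block_size)
{
    struct unmap_entry e;

    /* caller guarantees alignment, as the driver's assert does */
    assert(offset % block_size == 0 && count % block_size == 0);
    e.lba = offset / block_size;
    e.num = count / block_size;
    return e;
}

int main(void)
{
    struct unmap_entry e = byte_range_to_unmap(1048576, 524288, 512);

    printf("lba=%llu num=%u\n", (unsigned long long)e.lba, e.num);
    /* lba=2048 num=1024 */
    return 0;
}
```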
compatible. We force it to be sg, otherwise qemu * will try to read from the device to guess the image format. */ if (iscsilun->type != TYPE_DISK && iscsilun->type != TYPE_ROM) { - bs->sg = 1; + bs->sg = true; } task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, @@ -1634,10 +1765,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; if (iscsilun->lbprz) { - iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); - if (iscsilun->allocationmap == NULL) { - ret = -ENOMEM; - } + ret = iscsi_allocmap_init(iscsilun, bs->open_flags); } } @@ -1674,48 +1802,54 @@ static void iscsi_close(BlockDriverState *bs) } iscsi_destroy_context(iscsi); g_free(iscsilun->zeroblock); - g_free(iscsilun->allocationmap); + iscsi_allocmap_free(iscsilun); memset(iscsilun, 0, sizeof(IscsiLun)); } -static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun) -{ - return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1); -} - static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) { /* We don't actually refresh here, but just return data queried in * iscsi_open(): iscsi targets don't change their limits. */ IscsiLun *iscsilun = bs->opaque; - uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff; + uint64_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff; + + bs->bl.request_alignment = iscsilun->block_size; if (iscsilun->bl.max_xfer_len) { max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len); } - bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun); + if (max_xfer_len * iscsilun->block_size < INT_MAX) { + bs->bl.max_transfer = max_xfer_len * iscsilun->block_size; + } if (iscsilun->lbp.lbpu) { - if (iscsilun->bl.max_unmap < 0xffffffff) { - bs->bl.max_discard = - sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun); + if (iscsilun->bl.max_unmap < 0xffffffff / iscsilun->block_size) { + bs->bl.max_pdiscard = + iscsilun->bl.max_unmap * iscsilun->block_size; } - bs->bl.discard_alignment = - sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + bs->bl.pdiscard_alignment = + iscsilun->bl.opt_unmap_gran * iscsilun->block_size; + } else { + bs->bl.pdiscard_alignment = iscsilun->block_size; } - if (iscsilun->bl.max_ws_len < 0xffffffff) { - bs->bl.max_write_zeroes = - sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun); + if (iscsilun->bl.max_ws_len < 0xffffffff / iscsilun->block_size) { + bs->bl.max_pwrite_zeroes = + iscsilun->bl.max_ws_len * iscsilun->block_size; } if (iscsilun->lbp.lbpws) { - bs->bl.write_zeroes_alignment = - sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + bs->bl.pwrite_zeroes_alignment = + iscsilun->bl.opt_unmap_gran * iscsilun->block_size; + } else { + bs->bl.pwrite_zeroes_alignment = iscsilun->block_size; + } + if (iscsilun->bl.opt_xfer_len && + iscsilun->bl.opt_xfer_len < INT_MAX / iscsilun->block_size) { + bs->bl.opt_transfer = pow2floor(iscsilun->bl.opt_xfer_len * + iscsilun->block_size); } - bs->bl.opt_transfer_length = - sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); } /* Note that this will not re-establish a connection with an iSCSI target - it @@ -1732,6 +1866,16 @@ static int iscsi_reopen_prepare(BDRVReopenState *state, return 0; } +static void iscsi_reopen_commit(BDRVReopenState *reopen_state) +{ + IscsiLun *iscsilun = reopen_state->bs->opaque; + + /* the cache.direct status might have changed */ + if (iscsilun->allocmap != NULL) 
{ + iscsi_allocmap_init(iscsilun, reopen_state->flags); + } +} + static int iscsi_truncate(BlockDriverState *bs, int64_t offset) { IscsiLun *iscsilun = bs->opaque; @@ -1751,9 +1895,8 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset) return -EINVAL; } - if (iscsilun->allocationmap != NULL) { - g_free(iscsilun->allocationmap); - iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); + if (iscsilun->allocmap != NULL) { + iscsi_allocmap_init(iscsilun, bs->open_flags); } return 0; @@ -1813,6 +1956,13 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } +static void iscsi_invalidate_cache(BlockDriverState *bs, + Error **errp) +{ + IscsiLun *iscsilun = bs->opaque; + iscsi_allocmap_invalidate(iscsilun); +} + static QemuOptsList iscsi_create_opts = { .name = "iscsi-create-opts", .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), @@ -1836,7 +1986,9 @@ static BlockDriver bdrv_iscsi = { .bdrv_close = iscsi_close, .bdrv_create = iscsi_create, .create_opts = &iscsi_create_opts, - .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_invalidate_cache = iscsi_invalidate_cache, .bdrv_getlength = iscsi_getlength, .bdrv_get_info = iscsi_get_info, @@ -1844,12 +1996,10 @@ static BlockDriver bdrv_iscsi = { .bdrv_refresh_limits = iscsi_refresh_limits, .bdrv_co_get_block_status = iscsi_co_get_block_status, - .bdrv_co_discard = iscsi_co_discard, - .bdrv_co_write_zeroes = iscsi_co_write_zeroes, + .bdrv_co_pdiscard = iscsi_co_pdiscard, + .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, .bdrv_co_readv = iscsi_co_readv, - .bdrv_co_writev = iscsi_co_writev, .bdrv_co_writev_flags = iscsi_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ diff --git a/block/linux-aio.c b/block/linux-aio.c index 805757e02..e906abebb 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -11,8 +11,10 @@ #include "qemu-common.h" #include "block/aio.h" #include "qemu/queue.h" +#include "block/block.h" #include "block/raw-aio.h" #include "qemu/event_notifier.h" +#include "qemu/coroutine.h" #include <libaio.h> @@ -26,11 +28,10 @@ */ #define MAX_EVENTS 128 -#define MAX_QUEUED_IO 128 - struct qemu_laiocb { BlockAIOCB common; - struct qemu_laio_state *ctx; + Coroutine *co; + LinuxAioState *ctx; struct iocb iocb; ssize_t ret; size_t nbytes; @@ -41,12 +42,15 @@ struct qemu_laiocb { typedef struct { int plugged; - unsigned int n; + unsigned int in_queue; + unsigned int in_flight; bool blocked; QSIMPLEQ_HEAD(, qemu_laiocb) pending; } LaioQueue; -struct qemu_laio_state { +struct LinuxAioState { + AioContext *aio_context; + io_context_t ctx; EventNotifier e; @@ -60,7 +64,7 @@ struct qemu_laio_state { int event_max; }; -static void ioq_submit(struct qemu_laio_state *s); +static void ioq_submit(LinuxAioState *s); static inline ssize_t io_event_ret(struct io_event *ev) { @@ -70,8 +74,7 @@ static inline ssize_t io_event_ret(struct io_event *ev) /* * Completes an AIO request (calls the callback and frees the ACB). 
*/ -static void qemu_laio_process_completion(struct qemu_laio_state *s, - struct qemu_laiocb *laiocb) +static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) { int ret; @@ -85,13 +88,18 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, qemu_iovec_memset(laiocb->qiov, ret, 0, laiocb->qiov->size - ret); } else { - ret = -EINVAL; + ret = -ENOSPC; } } } - laiocb->common.cb(laiocb->common.opaque, ret); - qemu_aio_unref(laiocb); + laiocb->ret = ret; + if (laiocb->co) { + qemu_coroutine_enter(laiocb->co); + } else { + laiocb->common.cb(laiocb->common.opaque, ret); + qemu_aio_unref(laiocb); + } } /* The completion BH fetches completed I/O requests and invokes their @@ -99,7 +107,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, * * The function is somewhat tricky because it supports nested event loops, for * example when a request callback invokes aio_poll(). In order to do this, - * the completion events array and index are kept in qemu_laio_state. The BH + * the completion events array and index are kept in LinuxAioState. The BH * reschedules itself as long as there are completions pending so it will * either be called again in a nested event loop or will be called after all * events have been completed. When there are no events left to complete, the @@ -107,7 +115,7 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, */ static void qemu_laio_completion_bh(void *opaque) { - struct qemu_laio_state *s = opaque; + LinuxAioState *s = opaque; /* Fetch more completion events when empty */ if (s->event_idx == s->event_max) { @@ -122,6 +130,7 @@ static void qemu_laio_completion_bh(void *opaque) s->event_max = 0; return; /* no more events */ } + s->io_q.in_flight -= s->event_max; } /* Reschedule so nested event loops see currently pending completions */ @@ -136,20 +145,22 @@ static void qemu_laio_completion_bh(void *opaque) laiocb->ret = io_event_ret(&s->events[s->event_idx]); s->event_idx++; - qemu_laio_process_completion(s, laiocb); + qemu_laio_process_completion(laiocb); } if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } + + qemu_bh_cancel(s->completion_bh); } static void qemu_laio_completion_cb(EventNotifier *e) { - struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); + LinuxAioState *s = container_of(e, LinuxAioState, e); if (event_notifier_test_and_clear(&s->e)) { - qemu_bh_schedule(s->completion_bh); + qemu_laio_completion_bh(s); } } @@ -181,22 +192,26 @@ static void ioq_init(LaioQueue *io_q) { QSIMPLEQ_INIT(&io_q->pending); io_q->plugged = 0; - io_q->n = 0; + io_q->in_queue = 0; + io_q->in_flight = 0; io_q->blocked = false; } -static void ioq_submit(struct qemu_laio_state *s) +static void ioq_submit(LinuxAioState *s) { int ret, len; struct qemu_laiocb *aiocb; - struct iocb *iocbs[MAX_QUEUED_IO]; + struct iocb *iocbs[MAX_EVENTS]; QSIMPLEQ_HEAD(, qemu_laiocb) completed; do { + if (s->io_q.in_flight >= MAX_EVENTS) { + break; + } len = 0; QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) { iocbs[len++] = &aiocb->iocb; - if (len == MAX_QUEUED_IO) { + if (s->io_q.in_flight + len >= MAX_EVENTS) { break; } } @@ -206,55 +221,43 @@ static void ioq_submit(struct qemu_laio_state *s) break; } if (ret < 0) { - abort(); + /* Fail the first request, retry the rest */ + aiocb = QSIMPLEQ_FIRST(&s->io_q.pending); + QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next); + s->io_q.in_queue--; + aiocb->ret = ret; + qemu_laio_process_completion(aiocb); + continue; } - s->io_q.n -= ret; + s->io_q.in_flight 
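The reworked ioq_submit above never lets in-flight requests exceed MAX_EVENTS, the size of the kernel AIO context set up in this file, and it now tracks queued versus submitted requests separately as in_queue and in_flight. A sketch of just that batching arithmetic:

```c
#include <stdio.h>

#define MAX_EVENTS 128   /* ring size used for the AIO context in the patch */

/* how many queued requests may be submitted right now? */
static int batch_len(int in_flight, int in_queue)
{
    int room = MAX_EVENTS - in_flight;

    if (room <= 0) {
        return 0;                         /* ring full: stay blocked */
    }
    return in_queue < room ? in_queue : room;
}

int main(void)
{
    printf("%d\n", batch_len(100, 50));   /* 28: capped by the ring */
    printf("%d\n", batch_len(0, 5));      /* 5: everything fits */
    return 0;
}
```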
+= ret; + s->io_q.in_queue -= ret; aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb); QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed); } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending)); - s->io_q.blocked = (s->io_q.n > 0); + s->io_q.blocked = (s->io_q.in_queue > 0); } -void laio_io_plug(BlockDriverState *bs, void *aio_ctx) +void laio_io_plug(BlockDriverState *bs, LinuxAioState *s) { - struct qemu_laio_state *s = aio_ctx; - s->io_q.plugged++; } -void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) +void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s) { - struct qemu_laio_state *s = aio_ctx; - - assert(s->io_q.plugged > 0 || !unplug); - - if (unplug && --s->io_q.plugged > 0) { - return; - } - - if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { + assert(s->io_q.plugged); + if (--s->io_q.plugged == 0 && + !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } } -BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type) +static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, + int type) { - struct qemu_laio_state *s = aio_ctx; - struct qemu_laiocb *laiocb; - struct iocb *iocbs; - off_t offset = sector_num * 512; - - laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque); - laiocb->nbytes = nb_sectors * 512; - laiocb->ctx = s; - laiocb->ret = -EINPROGRESS; - laiocb->is_read = (type == QEMU_AIO_READ); - laiocb->qiov = qiov; - - iocbs = &laiocb->iocb; + LinuxAioState *s = laiocb->ctx; + struct iocb *iocbs = &laiocb->iocb; + QEMUIOVector *qiov = laiocb->qiov; switch (type) { case QEMU_AIO_WRITE: @@ -267,43 +270,83 @@ BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, default: fprintf(stderr, "%s: invalid AIO request type 0x%x.\n", __func__, type); - goto out_free_aiocb; + return -EIO; } io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); - s->io_q.n++; + s->io_q.in_queue++; if (!s->io_q.blocked && - (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) { + (!s->io_q.plugged || + s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) { ioq_submit(s); } - return &laiocb->common; -out_free_aiocb: - qemu_aio_unref(laiocb); - return NULL; + return 0; } -void laio_detach_aio_context(void *s_, AioContext *old_context) +int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, + uint64_t offset, QEMUIOVector *qiov, int type) { - struct qemu_laio_state *s = s_; + int ret; + struct qemu_laiocb laiocb = { + .co = qemu_coroutine_self(), + .nbytes = qiov->size, + .ctx = s, + .is_read = (type == QEMU_AIO_READ), + .qiov = qiov, + }; + + ret = laio_do_submit(fd, &laiocb, offset, type); + if (ret < 0) { + return ret; + } + + qemu_coroutine_yield(); + return laiocb.ret; +} + +BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int type) +{ + struct qemu_laiocb *laiocb; + off_t offset = sector_num * BDRV_SECTOR_SIZE; + int ret; + + laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque); + laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE; + laiocb->ctx = s; + laiocb->ret = -EINPROGRESS; + laiocb->is_read = (type == QEMU_AIO_READ); + laiocb->qiov = qiov; + + ret = laio_do_submit(fd, laiocb, offset, type); + if (ret < 0) { + qemu_aio_unref(laiocb); + return NULL; + } + return 
&laiocb->common; +} + +void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context) +{ aio_set_event_notifier(old_context, &s->e, false, NULL); qemu_bh_delete(s->completion_bh); } -void laio_attach_aio_context(void *s_, AioContext *new_context) +void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context) { - struct qemu_laio_state *s = s_; - + s->aio_context = new_context; s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); aio_set_event_notifier(new_context, &s->e, false, qemu_laio_completion_cb); } -void *laio_init(void) +LinuxAioState *laio_init(void) { - struct qemu_laio_state *s; + LinuxAioState *s; s = g_malloc0(sizeof(*s)); if (event_notifier_init(&s->e, false) < 0) { @@ -325,10 +368,8 @@ out_free_state: return NULL; } -void laio_cleanup(void *s_) +void laio_cleanup(LinuxAioState *s) { - struct qemu_laio_state *s = s_; - event_notifier_cleanup(&s->e); if (io_destroy(s->ctx) != 0) { diff --git a/block/mirror.c b/block/mirror.c index 039f48125..e0b3f4180 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -20,11 +20,12 @@ #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "qemu/bitmap.h" -#include "qemu/error-report.h" #define SLICE_TIME 100000000ULL /* ns */ #define MAX_IN_FLIGHT 16 -#define DEFAULT_MIRROR_BUF_SIZE (10 << 20) +#define MAX_IO_SECTORS ((1 << 20) >> BDRV_SECTOR_BITS) /* 1 Mb */ +#define DEFAULT_MIRROR_BUF_SIZE \ + (MAX_IN_FLIGHT * MAX_IO_SECTORS * BDRV_SECTOR_SIZE) /* The mirroring buffer is a list of granularity-sized chunks. * Free chunks are organized in a list. @@ -36,7 +37,7 @@ typedef struct MirrorBuffer { typedef struct MirrorBlockJob { BlockJob common; RateLimit limit; - BlockDriverState *target; + BlockBackend *target; BlockDriverState *base; /* The name of the graph node to replace */ char *replaces; @@ -45,6 +46,7 @@ typedef struct MirrorBlockJob { /* Used to block operations on the drive-mirror-replace target */ Error *replace_blocker; bool is_none_mode; + BlockMirrorBackingMode backing_mode; BlockdevOnError on_source_error, on_target_error; bool synced; bool should_complete; @@ -58,9 +60,10 @@ typedef struct MirrorBlockJob { QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; int buf_free_count; + uint64_t last_pause_ns; unsigned long *in_flight_bitmap; int in_flight; - int sectors_in_flight; + int64_t sectors_in_flight; int ret; bool unmap; bool waiting_for_io; @@ -80,11 +83,11 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, { s->synced = false; if (read) { - return block_job_error_action(&s->common, s->common.bs, - s->on_source_error, true, error); + return block_job_error_action(&s->common, s->on_source_error, + true, error); } else { - return block_job_error_action(&s->common, s->target, - s->on_target_error, false, error); + return block_job_error_action(&s->common, s->on_target_error, + false, error); } } @@ -121,7 +124,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) g_free(op); if (s->waiting_for_io) { - qemu_coroutine_enter(s->common.co, NULL); + qemu_coroutine_enter(s->common.co); } } @@ -157,8 +160,8 @@ static void mirror_read_complete(void *opaque, int ret) mirror_iteration_done(op, ret); return; } - bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors, - mirror_write_complete, op); + blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov, + 0, mirror_write_complete, op); } static inline void mirror_clip_sectors(MirrorBlockJob *s, @@ -186,8 +189,9 @@ static int mirror_cow_align(MirrorBlockJob *s, need_cow |= 
!test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors, s->cow_bitmap); if (need_cow) { - bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors, - &align_sector_num, &align_nb_sectors); + bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num, + *nb_sectors, &align_sector_num, + &align_nb_sectors); } if (align_nb_sectors > max_sectors) { @@ -217,23 +221,29 @@ static inline void mirror_wait_for_io(MirrorBlockJob *s) } /* Submit async read while handling COW. - * Returns: nb_sectors if no alignment is necessary, or + * Returns: The number of sectors copied after and including sector_num, + * excluding any sectors copied prior to sector_num due to alignment. + * This will be nb_sectors if no alignment is necessary, or * (new_end - sector_num) if tail is rounded up or down due to * alignment or buffer limit. */ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, int nb_sectors) { - BlockDriverState *source = s->common.bs; + BlockBackend *source = s->common.blk; int sectors_per_chunk, nb_chunks; - int ret = nb_sectors; + int ret; MirrorOp *op; + int max_sectors; sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + max_sectors = sectors_per_chunk * s->max_iov; /* We can only handle as much as buf_size at a time. */ nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors); + nb_sectors = MIN(max_sectors, nb_sectors); assert(nb_sectors); + ret = nb_sectors; if (s->cow_bitmap) { ret += mirror_cow_align(s, &sector_num, &nb_sectors); @@ -274,7 +284,7 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, s->sectors_in_flight += nb_sectors; trace_mirror_one_iteration(s, sector_num, nb_sectors); - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, + blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0, mirror_read_complete, op); return ret; } @@ -296,10 +306,12 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s, s->in_flight++; s->sectors_in_flight += nb_sectors; if (is_discard) { - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, + blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS, + op->nb_sectors << BDRV_SECTOR_BITS, mirror_write_complete, op); } else { - bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, + blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE, + op->nb_sectors * BDRV_SECTOR_SIZE, s->unmap ? BDRV_REQ_MAY_UNMAP : 0, mirror_write_complete, op); } @@ -307,13 +319,16 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s, static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) { - BlockDriverState *source = s->common.bs; + BlockDriverState *source = blk_bs(s->common.blk); int64_t sector_num, first_chunk; uint64_t delay_ns = 0; /* At least the first dirty chunk is mirrored in one iteration.
*/ int nb_chunks = 1; int64_t end = s->bdev_length / BDRV_SECTOR_SIZE; int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target)); + int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT, + MAX_IO_SECTORS); sector_num = hbitmap_iter_next(&s->hbi); if (sector_num < 0) { @@ -325,10 +340,12 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) first_chunk = sector_num / sectors_per_chunk; while (test_bit(first_chunk, s->in_flight_bitmap)) { - trace_mirror_yield_in_flight(s, first_chunk, s->in_flight); + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); mirror_wait_for_io(s); } + block_job_pause_point(&s->common); + /* Find the number of consecutive dirty chunks following the first dirty * one, and wait for in flight requests in them. */ while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) { @@ -362,7 +379,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks); while (nb_chunks > 0 && sector_num < end) { int ret; - int io_sectors; + int io_sectors, io_sectors_acct; BlockDriverState *file; enum MirrorMethod { MIRROR_METHOD_COPY, @@ -375,7 +392,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) nb_chunks * sectors_per_chunk, &io_sectors, &file); if (ret < 0) { - io_sectors = nb_chunks * sectors_per_chunk; + io_sectors = MIN(nb_chunks * sectors_per_chunk, max_io_sectors); + } else if (ret & BDRV_BLOCK_DATA) { + io_sectors = MIN(io_sectors, max_io_sectors); } io_sectors -= io_sectors % sectors_per_chunk; @@ -384,8 +403,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) { int64_t target_sector_num; int target_nb_sectors; - bdrv_round_to_clusters(s->target, sector_num, io_sectors, - &target_sector_num, &target_nb_sectors); + bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num, + io_sectors, &target_sector_num, + &target_nb_sectors); if (target_sector_num == sector_num && target_nb_sectors == io_sectors) { mirror_method = ret & BDRV_BLOCK_ZERO ?
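mirror_iteration decides per range whether to copy, write zeroes, or discard, based on the block-status bits returned for the source; only ranges that round exactly to target clusters may skip the plain copy. A compact model of that decision, where the flag values and the aligned parameter are illustrative simplifications:

```c
#include <stdio.h>

#define BLOCK_DATA 0x01   /* illustrative stand-ins for BDRV_BLOCK_* */
#define BLOCK_ZERO 0x02

enum { METHOD_COPY, METHOD_ZERO, METHOD_DISCARD };

/* negative (error) status is treated as "copy" by the caller and is
 * not modelled here */
static int pick_method(int status, int cluster_aligned)
{
    if ((status & BLOCK_DATA) || !cluster_aligned) {
        return METHOD_COPY;
    }
    return (status & BLOCK_ZERO) ? METHOD_ZERO : METHOD_DISCARD;
}

int main(void)
{
    printf("%d %d %d\n",
           pick_method(BLOCK_DATA, 1),    /* 0: data present -> copy */
           pick_method(BLOCK_ZERO, 1),    /* 1: reads as zero -> write zeroes */
           pick_method(0, 1));            /* 2: unallocated -> discard */
    return 0;
}
```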
@@ -394,16 +414,30 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) } } + while (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); + mirror_wait_for_io(s); + } + + if (s->ret < 0) { + return 0; + } + mirror_clip_sectors(s, sector_num, &io_sectors); switch (mirror_method) { case MIRROR_METHOD_COPY: io_sectors = mirror_do_read(s, sector_num, io_sectors); + io_sectors_acct = io_sectors; break; case MIRROR_METHOD_ZERO: - mirror_do_zero_or_discard(s, sector_num, io_sectors, false); - break; case MIRROR_METHOD_DISCARD: - mirror_do_zero_or_discard(s, sector_num, io_sectors, true); + mirror_do_zero_or_discard(s, sector_num, io_sectors, + mirror_method == MIRROR_METHOD_DISCARD); + if (write_zeroes_ok) { + io_sectors_acct = 0; + } else { + io_sectors_acct = io_sectors; + } break; default: abort(); @@ -411,7 +445,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) assert(io_sectors); sector_num += io_sectors; nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk); - delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors); + if (s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct); + } } return delay_ns; } @@ -449,7 +485,8 @@ static void mirror_exit(BlockJob *job, void *opaque) MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); MirrorExitData *data = opaque; AioContext *replace_aio_context = NULL; - BlockDriverState *src = s->common.bs; + BlockDriverState *src = blk_bs(s->common.blk); + BlockDriverState *target_bs = blk_bs(s->target); /* Make sure that the source BDS doesn't go away before we called * block_job_completed(). */ @@ -461,26 +498,25 @@ static void mirror_exit(BlockJob *job, void *opaque) } if (s->should_complete && data->ret == 0) { - BlockDriverState *to_replace = s->common.bs; + BlockDriverState *to_replace = src; if (s->to_replace) { to_replace = s->to_replace; } - /* This was checked in mirror_start_job(), but meanwhile one of the - * nodes could have been newly attached to a BlockBackend. */ - if (to_replace->blk && s->target->blk) { - error_report("block job: Can't create node with two BlockBackends"); - data->ret = -EINVAL; - goto out; + if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) { + bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL); } - if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { - bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); - } - bdrv_replace_in_backing_chain(to_replace, s->target); - } + /* The mirror job has no requests in flight any more, but we need to + * drain potential other users of the BDS before changing the graph. 
*/ + bdrv_drained_begin(target_bs); + bdrv_replace_in_backing_chain(to_replace, target_bs); + bdrv_drained_end(target_bs); -out: + /* We just changed the BDS the job BB refers to */ + blk_remove_bs(job->blk); + blk_insert_bs(job->blk, src); + } if (s->to_replace) { bdrv_op_unblock_all(s->to_replace, s->replace_blocker); error_free(s->replace_blocker); @@ -490,29 +526,102 @@ out: aio_context_release(replace_aio_context); } g_free(s->replaces); - bdrv_op_unblock_all(s->target, s->common.blocker); - bdrv_unref(s->target); + bdrv_op_unblock_all(target_bs, s->common.blocker); + blk_unref(s->target); block_job_completed(&s->common, data->ret); g_free(data); bdrv_drained_end(src); - if (qemu_get_aio_context() == bdrv_get_aio_context(src)) { - aio_enable_external(iohandler_get_aio_context()); - } bdrv_unref(src); } +static void mirror_throttle(MirrorBlockJob *s) +{ + int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + + if (now - s->last_pause_ns > SLICE_TIME) { + s->last_pause_ns = now; + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); + } else { + block_job_pause_point(&s->common); + } +} + +static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) +{ + int64_t sector_num, end; + BlockDriverState *base = s->base; + BlockDriverState *bs = blk_bs(s->common.blk); + BlockDriverState *target_bs = blk_bs(s->target); + int ret, n; + + end = s->bdev_length / BDRV_SECTOR_SIZE; + + if (base == NULL && !bdrv_has_zero_init(target_bs)) { + if (!bdrv_can_write_zeroes_with_unmap(target_bs)) { + bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end); + return 0; + } + + for (sector_num = 0; sector_num < end; ) { + int nb_sectors = MIN(end - sector_num, + QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS); + + mirror_throttle(s); + + if (block_job_is_cancelled(&s->common)) { + return 0; + } + + if (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1); + mirror_wait_for_io(s); + continue; + } + + mirror_do_zero_or_discard(s, sector_num, nb_sectors, false); + sector_num += nb_sectors; + } + + mirror_drain(s); + } + + /* First part, loop on the sectors and initialize the dirty bitmap. */ + for (sector_num = 0; sector_num < end; ) { + /* Just to make sure we are not exceeding int limit. */ + int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, + end - sector_num); + + mirror_throttle(s); + + if (block_job_is_cancelled(&s->common)) { + return 0; + } + + ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); + if (ret < 0) { + return ret; + } + + assert(n > 0); + if (ret == 1) { + bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); + } + sector_num += n; + } + return 0; +} + static void coroutine_fn mirror_run(void *opaque) { MirrorBlockJob *s = opaque; MirrorExitData *data; - BlockDriverState *bs = s->common.bs; - int64_t sector_num, end, length; - uint64_t last_pause_ns; + BlockDriverState *bs = blk_bs(s->common.blk); + BlockDriverState *target_bs = blk_bs(s->target); + int64_t length; BlockDriverInfo bdi; char backing_filename[2]; /* we only need 2 characters because we are only checking for a NULL string */ int ret = 0; - int n; int target_cluster_size = BDRV_SECTOR_SIZE; if (block_job_is_cancelled(&s->common)) { @@ -541,20 +650,19 @@ static void coroutine_fn mirror_run(void *opaque) * the destination do COW. Instead, we copy sectors around the * dirty data if needed. We need a bitmap to do that. 
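The new mirror_throttle helper centralizes the pacing that mirror_run previously did inline: yield with a sleep at most once per SLICE_TIME, and otherwise merely offer a pause point to the job machinery. A runnable sketch of the timing logic, with POSIX clock_gettime standing in for qemu_clock_get_ns and strings standing in for the two block-job calls:

```c
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define SLICE_TIME 100000000LL            /* 100 ms in ns, as in the patch */

static int64_t now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static const char *throttle(int64_t *last_pause_ns)
{
    int64_t now = now_ns();

    if (now - *last_pause_ns > SLICE_TIME) {
        *last_pause_ns = now;
        return "sleep";                   /* block_job_sleep_ns() here */
    }
    return "pause point";                 /* block_job_pause_point() here */
}

int main(void)
{
    int64_t last = now_ns() - 2 * SLICE_TIME;   /* pretend we are overdue */

    puts(throttle(&last));                /* "sleep": the slice expired */
    puts(throttle(&last));                /* "pause point": slice is fresh */
    return 0;
}
```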
*/ - bdrv_get_backing_filename(s->target, backing_filename, + bdrv_get_backing_filename(target_bs, backing_filename, sizeof(backing_filename)); - if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) { + if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) { target_cluster_size = bdi.cluster_size; } - if (backing_filename[0] && !s->target->backing + if (backing_filename[0] && !target_bs->backing && s->granularity < target_cluster_size) { s->buf_size = MAX(s->buf_size, target_cluster_size); s->cow_bitmap = bitmap_new(length); } s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS; - s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov); + s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov); - end = s->bdev_length / BDRV_SECTOR_SIZE; s->buf = qemu_try_blockalign(bs, s->buf_size); if (s->buf == NULL) { ret = -ENOMEM; @@ -563,45 +671,18 @@ static void coroutine_fn mirror_run(void *opaque) mirror_free_init(s); - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); if (!s->is_none_mode) { - /* First part, loop on the sectors and initialize the dirty bitmap. */ - BlockDriverState *base = s->base; - bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target); - - for (sector_num = 0; sector_num < end; ) { - /* Just to make sure we are not exceeding int limit. */ - int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, - end - sector_num); - int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - - if (now - last_pause_ns > SLICE_TIME) { - last_pause_ns = now; - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); - } - - if (block_job_is_cancelled(&s->common)) { - goto immediate_exit; - } - - ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); - - if (ret < 0) { - goto immediate_exit; - } - - assert(n > 0); - if (ret == 1 || mark_all_dirty) { - bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); - } - sector_num += n; + ret = mirror_dirty_init(s); + if (ret < 0 || block_job_is_cancelled(&s->common)) { + goto immediate_exit; } } bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); for (;;) { uint64_t delay_ns = 0; - int64_t cnt; + int64_t cnt, delta; bool should_complete; if (s->ret < 0) { @@ -609,6 +690,8 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } + block_job_pause_point(&s->common); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); /* s->common.offset contains the number of bytes already processed so * far, cnt is the number of dirty sectors remaining and @@ -622,9 +705,10 @@ static void coroutine_fn mirror_run(void *opaque) * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. 
*/ - if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME && + delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns; + if (delta < SLICE_TIME && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { - if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || + if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); mirror_wait_for_io(s); @@ -637,7 +721,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = false; if (s->in_flight == 0 && cnt == 0) { trace_mirror_before_flush(s); - ret = bdrv_flush(s->target); + ret = blk_flush(s->target); if (ret < 0) { if (mirror_error_action(s, false, -ret) == BLOCK_ERROR_ACTION_REPORT) { @@ -692,7 +776,7 @@ static void coroutine_fn mirror_run(void *opaque) s->common.cancelled = false; break; } - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); } immediate_exit: @@ -710,21 +794,12 @@ immediate_exit: g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); - if (s->target->blk) { - blk_iostatus_disable(s->target->blk); - } data = g_malloc(sizeof(*data)); data->ret = ret; /* Before we switch to target in mirror_exit, make sure data doesn't * change. */ - bdrv_drained_begin(s->common.bs); - if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) { - /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the - * above bdrv_drained_end isn't enough to quiesce it. This is ugly, we - * need a block layer API change to achieve this. */ - aio_disable_external(iohandler_get_aio_context()); - } + bdrv_drained_begin(bs); block_job_defer_to_main_loop(&s->common, mirror_exit, data); } @@ -739,32 +814,31 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static void mirror_iostatus_reset(BlockJob *job) -{ - MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - - if (s->target->blk) { - blk_iostatus_reset(s->target->blk); - } -} - static void mirror_complete(BlockJob *job, Error **errp) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - Error *local_err = NULL; - int ret; + BlockDriverState *src, *target; + + src = blk_bs(job->blk); + target = blk_bs(s->target); - ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - return; - } if (!s->synced) { - error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id); + error_setg(errp, "The active block job '%s' cannot be completed", + job->id); return; } - /* check the target bs is not blocked and block all operations on it */ + if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) { + int ret; + + assert(!target->backing); + ret = bdrv_open_backing_file(target, NULL, "backing", errp); + if (ret < 0) { + return; + } + } + + /* block all operations on to_replace bs */ if (s->replaces) { AioContext *replace_aio_context; @@ -785,31 +859,57 @@ static void mirror_complete(BlockJob *job, Error **errp) aio_context_release(replace_aio_context); } + if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) { + BlockDriverState *backing = s->is_none_mode ? 
src : s->base; + if (backing_bs(target) != backing) { + bdrv_set_backing_hd(target, backing); + } + } + s->should_complete = true; block_job_enter(&s->common); } +/* There is no matching mirror_resume() because mirror_run() will begin + * iterating again when the job is resumed. + */ +static void coroutine_fn mirror_pause(BlockJob *job) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + + mirror_drain(s); +} + +static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + + blk_set_aio_context(s->target, new_context); +} + static const BlockJobDriver mirror_job_driver = { - .instance_size = sizeof(MirrorBlockJob), - .job_type = BLOCK_JOB_TYPE_MIRROR, - .set_speed = mirror_set_speed, - .iostatus_reset= mirror_iostatus_reset, - .complete = mirror_complete, + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_MIRROR, + .set_speed = mirror_set_speed, + .complete = mirror_complete, + .pause = mirror_pause, + .attached_aio_context = mirror_attached_aio_context, }; static const BlockJobDriver commit_active_job_driver = { - .instance_size = sizeof(MirrorBlockJob), - .job_type = BLOCK_JOB_TYPE_COMMIT, - .set_speed = mirror_set_speed, - .iostatus_reset - = mirror_iostatus_reset, - .complete = mirror_complete, + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_COMMIT, + .set_speed = mirror_set_speed, + .complete = mirror_complete, + .pause = mirror_pause, + .attached_aio_context = mirror_attached_aio_context, }; -static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, - const char *replaces, +static void mirror_start_job(const char *job_id, BlockDriverState *bs, + BlockDriverState *target, const char *replaces, int64_t speed, uint32_t granularity, int64_t buf_size, + BlockMirrorBackingMode backing_mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, bool unmap, @@ -819,7 +919,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, bool is_none_mode, BlockDriverState *base) { MirrorBlockJob *s; - BlockDriverState *replaced_bs; if (granularity == 0) { granularity = bdrv_get_default_bitmap_granularity(target); @@ -827,13 +926,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, assert ((granularity & (granularity - 1)) == 0); - if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || - on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); - return; - } - if (buf_size < 0) { error_setg(errp, "Invalid parameter 'buf-size'"); return; @@ -843,31 +935,19 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, buf_size = DEFAULT_MIRROR_BUF_SIZE; } - /* We can't support this case as long as the block layer can't handle - * multiple BlockBackends per BlockDriverState. 
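The three BlockMirrorBackingMode values introduced in this hunk decide what happens to the target's backing chain when the job completes. A hedged sketch of the dispatch, assembled from the mirror_complete() pieces visible above (error handling trimmed; only the MIRROR_LEAVE_BACKING_CHAIN caller, active commit, is visible on this page):

/* Sketch: backing-chain handling on mirror completion (fragment;
 * s, src, target and errp as in mirror_complete() above). */
BlockDriverState *backing;
switch (s->backing_mode) {
case MIRROR_OPEN_BACKING_CHAIN:
    /* Open the target's own backing file from its image header. */
    ret = bdrv_open_backing_file(target, NULL, "backing", errp);
    break;
case MIRROR_SOURCE_BACKING_CHAIN:
    /* Reuse the source's chain: the whole source for sync=none,
     * otherwise everything below s->base. */
    backing = s->is_none_mode ? src : s->base;
    if (backing_bs(target) != backing) {
        bdrv_set_backing_hd(target, backing);
    }
    break;
case MIRROR_LEAVE_BACKING_CHAIN:
    /* Nothing to do; active commit passes this mode. */
    break;
}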
*/ - if (replaces) { - replaced_bs = bdrv_lookup_bs(replaces, replaces, errp); - if (replaced_bs == NULL) { - return; - } - } else { - replaced_bs = bs; - } - if (replaced_bs->blk && target->blk) { - error_setg(errp, "Can't create node with two BlockBackends"); - return; - } - - s = block_job_create(driver, bs, speed, cb, opaque, errp); + s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp); if (!s) { return; } + s->target = blk_new(); + blk_insert_bs(s->target, target); + s->replaces = g_strdup(replaces); s->on_source_error = on_source_error; s->on_target_error = on_target_error; - s->target = target; s->is_none_mode = is_none_mode; + s->backing_mode = backing_mode; s->base = base; s->granularity = granularity; s->buf_size = ROUND_UP(buf_size, granularity); @@ -876,25 +956,23 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { g_free(s->replaces); + blk_unref(s->target); block_job_unref(&s->common); return; } - bdrv_op_block_all(s->target, s->common.blocker); + bdrv_op_block_all(target, s->common.blocker); - if (s->target->blk) { - blk_set_on_error(s->target->blk, on_target_error, on_target_error); - blk_iostatus_enable(s->target->blk); - } - s->common.co = qemu_coroutine_create(mirror_run); + s->common.co = qemu_coroutine_create(mirror_run, s); trace_mirror_start(bs, s, s->common.co, opaque); - qemu_coroutine_enter(s->common.co, s); + qemu_coroutine_enter(s->common.co); } -void mirror_start(BlockDriverState *bs, BlockDriverState *target, - const char *replaces, +void mirror_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *target, const char *replaces, int64_t speed, uint32_t granularity, int64_t buf_size, - MirrorSyncMode mode, BlockdevOnError on_source_error, + MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, + BlockdevOnError on_source_error, BlockdevOnError on_target_error, bool unmap, BlockCompletionFunc *cb, @@ -909,14 +987,14 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; base = mode == MIRROR_SYNC_MODE_TOP ? 
backing_bs(bs) : NULL; - mirror_start_job(bs, target, replaces, - speed, granularity, buf_size, + mirror_start_job(job_id, bs, target, replaces, + speed, granularity, buf_size, backing_mode, on_source_error, on_target_error, unmap, cb, opaque, errp, &mirror_job_driver, is_none_mode, base); } -void commit_active_start(BlockDriverState *bs, BlockDriverState *base, - int64_t speed, +void commit_active_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *base, int64_t speed, BlockdevOnError on_error, BlockCompletionFunc *cb, void *opaque, Error **errp) @@ -957,8 +1035,8 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base, } } - bdrv_ref(base); - mirror_start_job(bs, base, NULL, speed, 0, 0, + mirror_start_job(job_id, bs, base, NULL, speed, 0, 0, + MIRROR_LEAVE_BACKING_CHAIN, on_error, on_error, false, cb, opaque, &local_err, &commit_active_job_driver, false, base); if (local_err) { diff --git a/block/nbd-client.c b/block/nbd-client.c index 878e879ac..2cf3237ef 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -38,7 +38,7 @@ static void nbd_recv_coroutines_enter_all(NbdClientSession *s) for (i = 0; i < MAX_NBD_REQUESTS; i++) { if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); + qemu_coroutine_enter(s->recv_coroutine[i]); } } } @@ -99,7 +99,7 @@ static void nbd_reply_ready(void *opaque) } if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); + qemu_coroutine_enter(s->recv_coroutine[i]); return; } @@ -111,12 +111,12 @@ static void nbd_restart_write(void *opaque) { BlockDriverState *bs = opaque; - qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine, NULL); + qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine); } static int nbd_co_send_request(BlockDriverState *bs, struct nbd_request *request, - QEMUIOVector *qiov, int offset) + QEMUIOVector *qiov) { NbdClientSession *s = nbd_get_client_session(bs); AioContext *aio_context; @@ -149,8 +149,8 @@ static int nbd_co_send_request(BlockDriverState *bs, qio_channel_set_cork(s->ioc, true); rc = nbd_send_request(s->ioc, request); if (rc >= 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, - offset, request->len, 0); + ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, + false); if (ret != request->len) { rc = -EIO; } @@ -167,8 +167,9 @@ static int nbd_co_send_request(BlockDriverState *bs, } static void nbd_co_receive_reply(NbdClientSession *s, - struct nbd_request *request, struct nbd_reply *reply, - QEMUIOVector *qiov, int offset) + struct nbd_request *request, + struct nbd_reply *reply, + QEMUIOVector *qiov) { int ret; @@ -181,8 +182,8 @@ static void nbd_co_receive_reply(NbdClientSession *s, reply->error = EIO; } else { if (qiov && reply->error == 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, - offset, request->len, 1); + ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, + true); if (ret != request->len) { reply->error = EIO; } @@ -217,97 +218,62 @@ static void nbd_coroutine_end(NbdClientSession *s, } } -static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) +int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_READ }; + struct nbd_request request = { + .type = NBD_CMD_READ, + .from = offset, + .len = bytes, + }; struct nbd_reply reply; ssize_t ret; - 
request.from = sector_num * 512; - request.len = nb_sectors * 512; + assert(bytes <= NBD_MAX_BUFFER_SIZE); + assert(!flags); nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); + ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, qiov, offset); + nbd_co_receive_reply(client, &request, &reply, qiov); } nbd_coroutine_end(client, &request); return -reply.error; - } -static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset, int *flags) +int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_WRITE }; + struct nbd_request request = { + .type = NBD_CMD_WRITE, + .from = offset, + .len = bytes, + }; struct nbd_reply reply; ssize_t ret; - if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) { - *flags &= ~BDRV_REQ_FUA; + if (flags & BDRV_REQ_FUA) { + assert(client->nbdflags & NBD_FLAG_SEND_FUA); request.type |= NBD_CMD_FLAG_FUA; } - request.from = sector_num * 512; - request.len = nb_sectors * 512; + assert(bytes <= NBD_MAX_BUFFER_SIZE); nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, qiov, offset); + ret = nbd_co_send_request(bs, &request, qiov); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); + nbd_co_receive_reply(client, &request, &reply, NULL); } nbd_coroutine_end(client, &request); return -reply.error; } -/* qemu-nbd has a limit of slightly less than 1M per request. Try to - * remain aligned to 4K. */ -#define NBD_MAX_SECTORS 2040 - -int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); -} - -int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int *flags) -{ - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset, - flags); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset, flags); -} - int nbd_client_co_flush(BlockDriverState *bs) { NbdClientSession *client = nbd_get_client_session(bs); @@ -323,36 +289,37 @@ int nbd_client_co_flush(BlockDriverState *bs) request.len = 0; nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); + ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); + nbd_co_receive_reply(client, &request, &reply, NULL); } nbd_coroutine_end(client, &request); return -reply.error; } -int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { NbdClientSession *client = nbd_get_client_session(bs); - struct 
nbd_request request = { .type = NBD_CMD_TRIM }; + struct nbd_request request = { + .type = NBD_CMD_TRIM, + .from = offset, + .len = count, + }; struct nbd_reply reply; ssize_t ret; if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { return 0; } - request.from = sector_num * 512; - request.len = nb_sectors * 512; nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); + ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); + nbd_co_receive_reply(client, &request, &reply, NULL); } nbd_coroutine_end(client, &request); return -reply.error; @@ -414,6 +381,9 @@ int nbd_client_init(BlockDriverState *bs, logout("Failed to negotiate with the NBD server\n"); return ret; } + if (client->nbdflags & NBD_FLAG_SEND_FUA) { + bs->supported_write_flags = BDRV_REQ_FUA; + } qemu_co_mutex_init(&client->send_mutex); qemu_co_mutex_init(&client->free_sema); diff --git a/block/nbd-client.h b/block/nbd-client.h index bc7aec079..044aca453 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -20,7 +20,7 @@ typedef struct NbdClientSession { QIOChannelSocket *sioc; /* The master data channel */ QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ - uint32_t nbdflags; + uint16_t nbdflags; off_t size; CoMutex send_mutex; @@ -44,13 +44,12 @@ int nbd_client_init(BlockDriverState *bs, Error **errp); void nbd_client_close(BlockDriverState *bs); -int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors); +int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count); int nbd_client_co_flush(BlockDriverState *bs); -int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int *flags); -int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov); +int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags); +int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags); void nbd_client_detach_aio_context(BlockDriverState *bs); void nbd_client_attach_aio_context(BlockDriverState *bs, diff --git a/block/nbd.c b/block/nbd.c index f7ea3b360..6bc06d619 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -42,6 +42,9 @@ typedef struct BDRVNBDState { NbdClientSession client; + + /* For nbd_refresh_filename() */ + char *path, *host, *port, *export, *tlscredsid; } BDRVNBDState; static int nbd_parse_uri(const char *filename, QDict *options) @@ -188,13 +191,15 @@ out: g_free(file); } -static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export, - Error **errp) +static SocketAddress *nbd_config(BDRVNBDState *s, QemuOpts *opts, Error **errp) { SocketAddress *saddr; - if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) { - if (qdict_haskey(options, "path")) { + s->path = g_strdup(qemu_opt_get(opts, "path")); + s->host = g_strdup(qemu_opt_get(opts, "host")); + + if (!s->path == !s->host) { + if (s->path) { error_setg(errp, "path and host may not be used at the same time."); } else { error_setg(errp, "one of path and host must be specified."); @@ -204,32 +209,28 @@ static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export, saddr = g_new0(SocketAddress, 1); - if (qdict_haskey(options, "path")) { + if (s->path) { UnixSocketAddress *q_unix; saddr->type = SOCKET_ADDRESS_KIND_UNIX; q_unix = saddr->u.q_unix.data = 
g_new0(UnixSocketAddress, 1); - q_unix->path = g_strdup(qdict_get_str(options, "path")); - qdict_del(options, "path"); + q_unix->path = g_strdup(s->path); } else { InetSocketAddress *inet; + + s->port = g_strdup(qemu_opt_get(opts, "port")); + saddr->type = SOCKET_ADDRESS_KIND_INET; inet = saddr->u.inet.data = g_new0(InetSocketAddress, 1); - inet->host = g_strdup(qdict_get_str(options, "host")); - if (!qdict_get_try_str(options, "port")) { + inet->host = g_strdup(s->host); + inet->port = g_strdup(s->port); + if (!inet->port) { inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT); - } else { - inet->port = g_strdup(qdict_get_str(options, "port")); } - qdict_del(options, "host"); - qdict_del(options, "port"); } s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX; - *export = g_strdup(qdict_get_try_str(options, "export")); - if (*export) { - qdict_del(options, "export"); - } + s->export = g_strdup(qemu_opt_get(opts, "export")); return saddr; } @@ -292,28 +293,66 @@ static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp) } +static QemuOptsList nbd_runtime_opts = { + .name = "nbd", + .head = QTAILQ_HEAD_INITIALIZER(nbd_runtime_opts.head), + .desc = { + { + .name = "host", + .type = QEMU_OPT_STRING, + .help = "TCP host to connect to", + }, + { + .name = "port", + .type = QEMU_OPT_STRING, + .help = "TCP port to connect to", + }, + { + .name = "path", + .type = QEMU_OPT_STRING, + .help = "Unix socket path to connect to", + }, + { + .name = "export", + .type = QEMU_OPT_STRING, + .help = "Name of the NBD export to open", + }, + { + .name = "tls-creds", + .type = QEMU_OPT_STRING, + .help = "ID of the TLS credentials to use", + }, + }, +}; + static int nbd_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVNBDState *s = bs->opaque; - char *export = NULL; + QemuOpts *opts = NULL; + Error *local_err = NULL; QIOChannelSocket *sioc = NULL; - SocketAddress *saddr; - const char *tlscredsid; + SocketAddress *saddr = NULL; QCryptoTLSCreds *tlscreds = NULL; const char *hostname = NULL; int ret = -EINVAL; + opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto error; + } + /* Pop the config into our state object. Exit if invalid. 
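The conversion of nbd_config() from raw QDict digging to QemuOpts follows the usual block-driver pattern: declare a QemuOptsList, absorb the recognized keys out of the options QDict, then read them back through typed accessors. A minimal sketch of the idiom as used here (nbd_runtime_opts is the option list defined in this hunk; s, options, local_err and errp are the surrounding function's variables):

/* Sketch: the QemuOpts parsing idiom used by nbd_open(). */
QemuOpts *opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &local_err);  /* consumes known keys */
if (local_err) {
    error_propagate(errp, local_err);
    goto error;
}
s->host = g_strdup(qemu_opt_get(opts, "host"));     /* typed accessor */
s->export = g_strdup(qemu_opt_get(opts, "export"));
/* ... connect using the parsed values ... */
qemu_opts_del(opts);                                /* frees the parsed copy */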
*/ - saddr = nbd_config(s, options, &export, errp); + saddr = nbd_config(s, opts, errp); if (!saddr) { goto error; } - tlscredsid = g_strdup(qdict_get_try_str(options, "tls-creds")); - if (tlscredsid) { - qdict_del(options, "tls-creds"); - tlscreds = nbd_get_tls_creds(tlscredsid, errp); + s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds")); + if (s->tlscredsid) { + tlscreds = nbd_get_tls_creds(s->tlscredsid, errp); if (!tlscreds) { goto error; } @@ -335,7 +374,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, } /* NBD handshake */ - ret = nbd_client_init(bs, sioc, export, + ret = nbd_client_init(bs, sioc, s->export, tlscreds, hostname, errp); error: if (sioc) { @@ -344,42 +383,18 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, if (tlscreds) { object_unref(OBJECT(tlscreds)); } - qapi_free_SocketAddress(saddr); - g_free(export); - return ret; -} - -static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov); -} - -static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int flags) -{ - int ret; - - ret = nbd_client_co_writev(bs, sector_num, nb_sectors, qiov, &flags); if (ret < 0) { - return ret; - } - - /* The flag wasn't sent to the server, so we need to emulate it with an - * explicit flush */ - if (flags & BDRV_REQ_FUA) { - ret = nbd_client_co_flush(bs); + g_free(s->path); + g_free(s->host); + g_free(s->port); + g_free(s->export); + g_free(s->tlscredsid); } - + qapi_free_SocketAddress(saddr); + qemu_opts_del(opts); return ret; } -static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - return nbd_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); -} - static int nbd_co_flush(BlockDriverState *bs) { return nbd_client_co_flush(bs); @@ -387,19 +402,21 @@ static int nbd_co_flush(BlockDriverState *bs) static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) { - bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS; - bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS; -} - -static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - return nbd_client_co_discard(bs, sector_num, nb_sectors); + bs->bl.max_pdiscard = NBD_MAX_BUFFER_SIZE; + bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE; } static void nbd_close(BlockDriverState *bs) { + BDRVNBDState *s = bs->opaque; + nbd_client_close(bs); + + g_free(s->path); + g_free(s->host); + g_free(s->port); + g_free(s->export); + g_free(s->tlscredsid); } static int64_t nbd_getlength(BlockDriverState *bs) @@ -422,48 +439,45 @@ static void nbd_attach_aio_context(BlockDriverState *bs, static void nbd_refresh_filename(BlockDriverState *bs, QDict *options) { + BDRVNBDState *s = bs->opaque; QDict *opts = qdict_new(); - const char *path = qdict_get_try_str(options, "path"); - const char *host = qdict_get_try_str(options, "host"); - const char *port = qdict_get_try_str(options, "port"); - const char *export = qdict_get_try_str(options, "export"); - const char *tlscreds = qdict_get_try_str(options, "tls-creds"); qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd"))); - if (path && export) { + if (s->path && s->export) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd+unix:///%s?socket=%s", export, path); - } else if (path && !export) { + "nbd+unix:///%s?socket=%s", s->export, s->path); + } else if (s->path && !s->export) { 
snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd+unix://?socket=%s", path); - } else if (!path && export && port) { + "nbd+unix://?socket=%s", s->path); + } else if (!s->path && s->export && s->port) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s:%s/%s", host, port, export); - } else if (!path && export && !port) { + "nbd://%s:%s/%s", s->host, s->port, s->export); + } else if (!s->path && s->export && !s->port) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s/%s", host, export); - } else if (!path && !export && port) { + "nbd://%s/%s", s->host, s->export); + } else if (!s->path && !s->export && s->port) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s:%s", host, port); - } else if (!path && !export && !port) { + "nbd://%s:%s", s->host, s->port); + } else if (!s->path && !s->export && !s->port) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s", host); + "nbd://%s", s->host); } - if (path) { - qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(path))); - } else if (port) { - qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); - qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(port))); + if (s->path) { + qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(s->path))); + } else if (s->port) { + qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(s->host))); + qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(s->port))); } else { - qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); + qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(s->host))); } - if (export) { - qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export))); + if (s->export) { + qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(s->export))); } - if (tlscreds) { - qdict_put_obj(opts, "tls-creds", QOBJECT(qstring_from_str(tlscreds))); + if (s->tlscredsid) { + qdict_put_obj(opts, "tls-creds", + QOBJECT(qstring_from_str(s->tlscredsid))); } bs->full_open_options = opts; @@ -475,13 +489,11 @@ static BlockDriver bdrv_nbd = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, + .bdrv_co_preadv = nbd_client_co_preadv, + .bdrv_co_pwritev = nbd_client_co_pwritev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, + .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, @@ -495,13 +507,11 @@ static BlockDriver bdrv_nbd_tcp = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, + .bdrv_co_preadv = nbd_client_co_preadv, + .bdrv_co_pwritev = nbd_client_co_pwritev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, + .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, @@ -515,13 +525,11 @@ static BlockDriver bdrv_nbd_unix = { .instance_size = sizeof(BDRVNBDState), .bdrv_parse_filename = nbd_parse_filename, .bdrv_file_open = nbd_open, - .bdrv_co_readv = 
nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, + .bdrv_co_preadv = nbd_client_co_preadv, + .bdrv_co_pwritev = nbd_client_co_pwritev, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, + .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_detach_aio_context, diff --git a/block/nfs.c b/block/nfs.c index 9f51cc3f1..8602a4421 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -1,7 +1,7 @@ /* * QEMU Block driver for native access to files on NFS shares * - * Copyright (c) 2014 Peter Lieven <pl@kamp.de> + * Copyright (c) 2014-2016 Peter Lieven <pl@kamp.de> * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -38,6 +38,7 @@ #include <nfsc/libnfs.h> #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576 +#define QEMU_NFS_MAX_PAGECACHE_SIZE (8388608 / NFS_BLKSIZE) #define QEMU_NFS_MAX_DEBUG_LEVEL 2 typedef struct NFSClient { @@ -47,6 +48,7 @@ typedef struct NFSClient { bool has_zero_init; AioContext *aio_context; blkcnt_t st_blocks; + bool cache_used; } NFSClient; typedef struct NFSRPC { @@ -102,7 +104,7 @@ static void nfs_co_generic_bh_cb(void *opaque) NFSRPC *task = opaque; task->complete = 1; qemu_bh_delete(task->bh); - qemu_coroutine_enter(task->co, NULL); + qemu_coroutine_enter(task->co); } static void @@ -278,7 +280,7 @@ static void nfs_file_close(BlockDriverState *bs) } static int64_t nfs_client_open(NFSClient *client, const char *filename, - int flags, Error **errp) + int flags, Error **errp, int open_flags) { int ret = -EINVAL, i; struct stat st; @@ -330,12 +332,38 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename, nfs_set_tcp_syncnt(client->context, val); #ifdef LIBNFS_FEATURE_READAHEAD } else if (!strcmp(qp->p[i].name, "readahead")) { + if (open_flags & BDRV_O_NOCACHE) { + error_setg(errp, "Cannot enable NFS readahead " + "if cache.direct = on"); + goto fail; + } if (val > QEMU_NFS_MAX_READAHEAD_SIZE) { error_report("NFS Warning: Truncating NFS readahead" " size to %d", QEMU_NFS_MAX_READAHEAD_SIZE); val = QEMU_NFS_MAX_READAHEAD_SIZE; } nfs_set_readahead(client->context, val); +#ifdef LIBNFS_FEATURE_PAGECACHE + nfs_set_pagecache_ttl(client->context, 0); +#endif + client->cache_used = true; +#endif +#ifdef LIBNFS_FEATURE_PAGECACHE + nfs_set_pagecache_ttl(client->context, 0); + } else if (!strcmp(qp->p[i].name, "pagecache")) { + if (open_flags & BDRV_O_NOCACHE) { + error_setg(errp, "Cannot enable NFS pagecache " + "if cache.direct = on"); + goto fail; + } + if (val > QEMU_NFS_MAX_PAGECACHE_SIZE) { + error_report("NFS Warning: Truncating NFS pagecache" + " size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE); + val = QEMU_NFS_MAX_PAGECACHE_SIZE; + } + nfs_set_pagecache(client->context, val); + nfs_set_pagecache_ttl(client->context, 0); + client->cache_used = true; #endif #ifdef LIBNFS_FEATURE_DEBUG } else if (!strcmp(qp->p[i].name, "debug")) { @@ -418,7 +446,7 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, } ret = nfs_client_open(client, qemu_opt_get(opts, "filename"), (flags & BDRV_O_RDWR) ? 
O_RDWR : O_RDONLY, - errp); + errp, bs->open_flags); if (ret < 0) { goto out; } @@ -454,7 +482,7 @@ static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp) total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), BDRV_SECTOR_SIZE); - ret = nfs_client_open(client, url, O_CREAT, errp); + ret = nfs_client_open(client, url, O_CREAT, errp, 0); if (ret < 0) { goto out; } @@ -516,6 +544,12 @@ static int nfs_reopen_prepare(BDRVReopenState *state, return -EACCES; } + if ((state->flags & BDRV_O_NOCACHE) && client->cache_used) { + error_setg(errp, "Cannot disable cache if libnfs readahead or" + " pagecache is enabled"); + return -EINVAL; + } + /* Update cache for read-only reopens */ if (!(state->flags & BDRV_O_RDWR)) { ret = nfs_fstat(client->context, client->fh, &st); @@ -530,6 +564,15 @@ static int nfs_reopen_prepare(BDRVReopenState *state, return 0; } +#ifdef LIBNFS_FEATURE_PAGECACHE +static void nfs_invalidate_cache(BlockDriverState *bs, + Error **errp) +{ + NFSClient *client = bs->opaque; + nfs_pagecache_invalidate(client->context, client->fh); +} +#endif + static BlockDriver bdrv_nfs = { .format_name = "nfs", .protocol_name = "nfs", @@ -553,6 +596,10 @@ static BlockDriver bdrv_nfs = { .bdrv_detach_aio_context = nfs_detach_aio_context, .bdrv_attach_aio_context = nfs_attach_aio_context, + +#ifdef LIBNFS_FEATURE_PAGECACHE + .bdrv_invalidate_cache = nfs_invalidate_cache, +#endif }; static void nfs_block_init(void) diff --git a/block/null.c b/block/null.c index 396500bab..b511010ba 100644 --- a/block/null.c +++ b/block/null.c @@ -12,6 +12,8 @@ #include "qemu/osdep.h" #include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qstring.h" #include "block/block_int.h" #define NULL_OPT_LATENCY "latency-ns" @@ -223,6 +225,20 @@ static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs, } } +static void null_refresh_filename(BlockDriverState *bs, QDict *opts) +{ + QINCREF(opts); + qdict_del(opts, "filename"); + + if (!qdict_size(opts)) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), "%s://", + bs->drv->format_name); + } + + qdict_put(opts, "driver", qstring_from_str(bs->drv->format_name)); + bs->full_open_options = opts; +} + static BlockDriver bdrv_null_co = { .format_name = "null-co", .protocol_name = "null-co", @@ -238,6 +254,8 @@ static BlockDriver bdrv_null_co = { .bdrv_reopen_prepare = null_reopen_prepare, .bdrv_co_get_block_status = null_co_get_block_status, + + .bdrv_refresh_filename = null_refresh_filename, }; static BlockDriver bdrv_null_aio = { @@ -255,6 +273,8 @@ static BlockDriver bdrv_null_aio = { .bdrv_reopen_prepare = null_reopen_prepare, .bdrv_co_get_block_status = null_co_get_block_status, + + .bdrv_refresh_filename = null_refresh_filename, }; static void bdrv_null_init(void) diff --git a/block/parallels.c b/block/parallels.c index 324ed43ac..2ccefa7d8 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -33,6 +33,7 @@ #include "block/block_int.h" #include "sysemu/block-backend.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "qemu/bitmap.h" #include "qapi/util.h" @@ -42,6 +43,7 @@ #define HEADER_MAGIC2 "WithouFreSpacExt" #define HEADER_VERSION 2 #define HEADER_INUSE_MAGIC (0x746F6E59) +#define MAX_PARALLELS_IMAGE_FACTOR (1ull << 32) #define DEFAULT_CLUSTER_SIZE 1048576 /* 1 MiB */ @@ -203,13 +205,15 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, return -EINVAL; } - to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx; + to_allocate = 
DIV_ROUND_UP(sector_num + *pnum, s->tracks) - idx; space = to_allocate * s->tracks; if (s->data_end + space > bdrv_getlength(bs->file->bs) >> BDRV_SECTOR_BITS) { int ret; space += s->prealloc_size; if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) { - ret = bdrv_write_zeroes(bs->file->bs, s->data_end, space, 0); + ret = bdrv_pwrite_zeroes(bs->file, + s->data_end << BDRV_SECTOR_BITS, + space << BDRV_SECTOR_BITS, 0); } else { ret = bdrv_truncate(bs->file->bs, (s->data_end + space) << BDRV_SECTOR_BITS); @@ -247,7 +251,7 @@ static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) if (off + to_write > s->header_size) { to_write = s->header_size - off; } - ret = bdrv_pwrite(bs->file->bs, off, (uint8_t *)s->header + off, + ret = bdrv_pwrite(bs->file, off, (uint8_t *)s->header + off, to_write); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); @@ -308,7 +312,7 @@ static coroutine_fn int parallels_co_writev(BlockDriverState *bs, qemu_iovec_reset(&hd_qiov); qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); - ret = bdrv_co_writev(bs->file->bs, position, n, &hd_qiov); + ret = bdrv_co_writev(bs->file, position, n, &hd_qiov); if (ret < 0) { break; } @@ -348,7 +352,7 @@ static coroutine_fn int parallels_co_readv(BlockDriverState *bs, qemu_iovec_reset(&hd_qiov); qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); - ret = bdrv_co_readv(bs->file->bs, position, n, &hd_qiov); + ret = bdrv_co_readv(bs->file, position, n, &hd_qiov); if (ret < 0) { break; } @@ -429,7 +433,7 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, } if (flush_bat) { - ret = bdrv_pwrite_sync(bs->file->bs, 0, s->header, s->header_size); + ret = bdrv_pwrite_sync(bs->file, 0, s->header, s->header_size); if (ret < 0) { res->check_errors++; return ret; @@ -472,6 +476,10 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) BDRV_SECTOR_SIZE); cl_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, DEFAULT_CLUSTER_SIZE), BDRV_SECTOR_SIZE); + if (total_size >= MAX_PARALLELS_IMAGE_FACTOR * cl_size) { + error_propagate(errp, local_err); + return -E2BIG; + } ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { @@ -512,11 +520,12 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) memset(tmp, 0, sizeof(tmp)); memcpy(tmp, &header, sizeof(header)); - ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); + ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE, 0); if (ret < 0) { goto exit; } - ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0); + ret = blk_pwrite_zeroes(file, BDRV_SECTOR_SIZE, + (bat_sectors - 1) << BDRV_SECTOR_BITS, 0); if (ret < 0) { goto exit; } @@ -559,7 +568,7 @@ static int parallels_update_header(BlockDriverState *bs) if (size > s->header_size) { size = s->header_size; } - return bdrv_pwrite_sync(bs->file->bs, 0, s->header, size); + return bdrv_pwrite_sync(bs->file, 0, s->header, size); } static int parallels_open(BlockDriverState *bs, QDict *options, int flags, @@ -572,7 +581,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, Error *local_err = NULL; char *buf; - ret = bdrv_pread(bs->file->bs, 0, &ph, sizeof(ph)); + ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph)); if (ret < 0) { goto fail; } @@ -627,7 +636,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, s->header_size = size; } - ret = bdrv_pread(bs->file->bs, 0, s->header, s->header_size); + ret = bdrv_pread(bs->file, 0, s->header, s->header_size); if (ret < 0) { goto fail; } diff --git 
a/block/qapi.c b/block/qapi.c index c5f6ba643..6f947e3e6 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -67,10 +67,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, info->backing_file_depth = bdrv_get_backing_file_depth(bs); info->detect_zeroes = bs->detect_zeroes; - if (bs->throttle_state) { + if (blk && blk_get_public(blk)->throttle_state) { ThrottleConfig cfg; - throttle_group_get_config(bs, &cfg); + throttle_group_get_config(blk, &cfg); info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; @@ -118,7 +118,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, info->iops_size = cfg.op_size; info->has_group = true; - info->group = g_strdup(throttle_group_get_name(bs)); + info->group = g_strdup(throttle_group_get_name(blk)); } info->write_threshold = bdrv_write_threshold_get(bs); @@ -690,16 +690,15 @@ static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, ImageInfoSpecific *info_spec) { - QmpOutputVisitor *ov = qmp_output_visitor_new(); QObject *obj, *data; + Visitor *v = qmp_output_visitor_new(&obj); - visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), NULL, &info_spec, - &error_abort); - obj = qmp_output_get_qobject(ov); + visit_type_ImageInfoSpecific(v, NULL, &info_spec, &error_abort); + visit_complete(v, &obj); assert(qobject_type(obj) == QTYPE_QDICT); data = qdict_get(qobject_to_qdict(obj), "data"); dump_qobject(func_fprintf, f, 1, data); - qmp_output_visitor_cleanup(ov); + visit_free(v); } void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, diff --git a/block/qcow.c b/block/qcow.c index 60ddb12ec..6f9b2e2d2 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -28,6 +28,7 @@ #include "block/block_int.h" #include "sysemu/block-backend.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include <zlib.h> #include "qapi/qmp/qerror.h" #include "crypto/cipher.h" @@ -104,7 +105,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, int ret; QCowHeader header; - ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { goto fail; } @@ -161,13 +162,19 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, if (s->crypt_method_header) { if (bdrv_uses_whitelist() && s->crypt_method_header == QCOW_CRYPT_AES) { - error_report("qcow built-in AES encryption is deprecated"); - error_printf("Support for it will be removed in a future release.\n" - "You can use 'qemu-img convert' to switch to an\n" - "unencrypted qcow image, or a LUKS raw image.\n"); + error_setg(errp, + "Use of AES-CBC encrypted qcow images is no longer " + "supported in system emulators"); + error_append_hint(errp, + "You can use 'qemu-img convert' to convert your " + "image to an alternative supported format, such " + "as unencrypted qcow, or raw with the LUKS " + "format instead.\n"); + ret = -ENOSYS; + goto fail; } - bs->encrypted = 1; + bs->encrypted = true; } s->cluster_bits = header.cluster_bits; s->cluster_size = 1 << s->cluster_bits; @@ -201,7 +208,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table, + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { goto fail; @@ -232,7 +239,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; 
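Among the hunks above, the block/qapi.c change tracks the QAPI visitor API rework: the output visitor is now created with a pointer to the result QObject and finished with visit_complete(). The resulting idiom, as a minimal sketch taken from bdrv_image_info_specific_dump():

/* Sketch: new QAPI output-visitor idiom. */
QObject *obj;
Visitor *v = qmp_output_visitor_new(&obj);
visit_type_ImageInfoSpecific(v, NULL, &info_spec, &error_abort);
visit_complete(v, &obj);   /* materializes the built QObject into obj */
/* ... consume obj ... */
visit_free(v);             /* replaces qmp_output_visitor_cleanup() */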
} - ret = bdrv_pread(bs->file->bs, header.backing_file_offset, + ret = bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len); if (ret < 0) { goto fail; @@ -383,7 +390,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, /* update the L1 entry */ s->l1_table[l1_index] = l2_offset; tmp = cpu_to_be64(l2_offset); - if (bdrv_pwrite_sync(bs->file->bs, + if (bdrv_pwrite_sync(bs->file, s->l1_table_offset + l1_index * sizeof(tmp), &tmp, sizeof(tmp)) < 0) return 0; @@ -413,11 +420,11 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, l2_table = s->l2_cache + (min_index << s->l2_bits); if (new_l2_table) { memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); - if (bdrv_pwrite_sync(bs->file->bs, l2_offset, l2_table, + if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) < 0) return 0; } else { - if (bdrv_pread(bs->file->bs, l2_offset, l2_table, + if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != s->l2_size * sizeof(uint64_t)) return 0; @@ -443,7 +450,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, cluster_offset = (cluster_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); /* write the cluster content */ - if (bdrv_pwrite(bs->file->bs, cluster_offset, s->cluster_cache, + if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != s->cluster_size) return -1; @@ -473,7 +480,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, errno = EIO; return -1; } - if (bdrv_pwrite(bs->file->bs, + if (bdrv_pwrite(bs->file, cluster_offset + i * 512, s->cluster_data, 512) != 512) return -1; @@ -488,7 +495,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, /* update L2 table */ tmp = cpu_to_be64(cluster_offset); l2_table[l2_index] = tmp; - if (bdrv_pwrite_sync(bs->file->bs, l2_offset + l2_index * sizeof(tmp), + if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) < 0) return 0; } @@ -558,7 +565,7 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) if (s->cluster_cache_offset != coffset) { csize = cluster_offset >> (63 - s->cluster_bits); csize &= (s->cluster_size - 1); - ret = bdrv_pread(bs->file->bs, coffset, s->cluster_data, csize); + ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); if (ret != csize) return -1; if (decompress_buffer(s->cluster_cache, s->cluster_size, @@ -612,8 +619,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing->bs, sector_num, - n, &hd_qiov); + ret = bdrv_co_readv(bs->backing, sector_num, n, &hd_qiov); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; @@ -637,7 +643,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file->bs, + ret = bdrv_co_readv(bs->file, (cluster_offset >> 9) + index_in_cluster, n, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -739,7 +745,7 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file->bs, + ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + index_in_cluster, n, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -853,24 +859,24 @@ static int 
qcow_create(const char *filename, QemuOpts *opts, Error **errp) } /* write all the data */ - ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header)); + ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header), 0); if (ret != sizeof(header)) { goto exit; } if (backing_file) { ret = blk_pwrite(qcow_blk, sizeof(header), - backing_file, backing_filename_len); + backing_file, backing_filename_len, 0); if (ret != backing_filename_len) { goto exit; } } tmp = g_malloc0(BDRV_SECTOR_SIZE); - for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ - BDRV_SECTOR_SIZE); i++) { - ret = blk_pwrite(qcow_blk, header_size + - BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); + for (i = 0; i < DIV_ROUND_UP(sizeof(uint64_t) * l1_size, BDRV_SECTOR_SIZE); + i++) { + ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i, + tmp, BDRV_SECTOR_SIZE, 0); if (ret != BDRV_SECTOR_SIZE) { g_free(tmp); goto exit; @@ -893,7 +899,7 @@ static int qcow_make_empty(BlockDriverState *bs) int ret; memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, s->l1_table, + if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) return -1; ret = bdrv_truncate(bs->file->bs, s->l1_table_offset + l1_length); @@ -907,6 +913,49 @@ static int qcow_make_empty(BlockDriverState *bs) return 0; } +typedef struct QcowWriteCo { + BlockDriverState *bs; + int64_t sector_num; + const uint8_t *buf; + int nb_sectors; + int ret; +} QcowWriteCo; + +static void qcow_write_co_entry(void *opaque) +{ + QcowWriteCo *co = opaque; + QEMUIOVector qiov; + + struct iovec iov = (struct iovec) { + .iov_base = (uint8_t*) co->buf, + .iov_len = co->nb_sectors * BDRV_SECTOR_SIZE, + }; + qemu_iovec_init_external(&qiov, &iov, 1); + + co->ret = qcow_co_writev(co->bs, co->sector_num, co->nb_sectors, &qiov); +} + +/* Wrapper for non-coroutine contexts */ +static int qcow_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + Coroutine *co; + AioContext *aio_context = bdrv_get_aio_context(bs); + QcowWriteCo data = { + .bs = bs, + .sector_num = sector_num, + .buf = buf, + .nb_sectors = nb_sectors, + .ret = -EINPROGRESS, + }; + co = qemu_coroutine_create(qcow_write_co_entry, &data); + qemu_coroutine_enter(co); + while (data.ret == -EINPROGRESS) { + aio_poll(aio_context, true); + } + return data.ret; +} + /* XXX: put compressed sectors first, then all the cluster aligned tables to avoid losing bytes in alignment */ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, @@ -934,7 +983,7 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, return ret; } - out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + out_buf = g_malloc(s->cluster_size); /* best compression, small window, no zlib header */ memset(&strm, 0, sizeof(strm)); @@ -963,7 +1012,7 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, if (ret != Z_STREAM_END || out_len >= s->cluster_size) { /* could not compress: write normal cluster */ - ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + ret = qcow_write(bs, sector_num, buf, s->cluster_sectors); if (ret < 0) { goto fail; } @@ -976,7 +1025,7 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, } cluster_offset &= s->cluster_offset_mask; - ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len); + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); if (ret < 0) { goto fail; } diff --git a/block/qcow2-cache.c 
b/block/qcow2-cache.c index 0fe8edae4..6eaefeddc 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -24,11 +24,6 @@ /* Needed for CONFIG_MADVISE */ #include "qemu/osdep.h" - -#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE) -#include <sys/mman.h> -#endif - #include "block/block_int.h" #include "qemu-common.h" #include "qcow2.h" @@ -215,7 +210,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); } - ret = bdrv_pwrite(bs->file->bs, c->entries[i].offset, + ret = bdrv_pwrite(bs->file, c->entries[i].offset, qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); if (ret < 0) { return ret; @@ -226,7 +221,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) return 0; } -int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c) { BDRVQcow2State *s = bs->opaque; int result = 0; @@ -242,8 +237,15 @@ int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) } } + return result; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ + int result = qcow2_cache_write(bs, c); + if (result == 0) { - ret = bdrv_flush(bs->file->bs); + int ret = bdrv_flush(bs->file->bs); if (ret < 0) { result = ret; } @@ -355,7 +357,7 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); } - ret = bdrv_pread(bs->file->bs, offset, + ret = bdrv_pread(bs->file, offset, qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); if (ret < 0) { diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 31ecc1030..f94183529 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -29,6 +29,7 @@ #include "qemu-common.h" #include "block/block_int.h" #include "block/qcow2.h" +#include "qemu/bswap.h" #include "trace.h" int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, @@ -64,7 +65,8 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, } } - if (new_l1_size > INT_MAX / sizeof(uint64_t)) { + QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX); + if (new_l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { return -EFBIG; } @@ -107,7 +109,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); for(i = 0; i < s->l1_size; i++) new_l1_table[i] = cpu_to_be64(new_l1_table[i]); - ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset, + ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); if (ret < 0) goto fail; @@ -116,9 +118,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, /* set new table */ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); - cpu_to_be32w((uint32_t*)data, new_l1_size); + stl_be_p(data, new_l1_size); stq_be_p(data + 4, new_l1_table_offset); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size), + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data, sizeof(data)); if (ret < 0) { goto fail; @@ -153,11 +155,9 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset, uint64_t **l2_table) { BDRVQcow2State *s = bs->opaque; - int ret; - - ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); - return ret; + return qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void **)l2_table); } /* @@ -186,7 +186,7 @@ int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) } BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); - ret = bdrv_pwrite_sync(bs->file->bs, + ret = 
bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, buf, sizeof(buf)); if (ret < 0) { @@ -389,22 +389,18 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, return 0; } -static int coroutine_fn copy_sectors(BlockDriverState *bs, - uint64_t start_sect, - uint64_t cluster_offset, - int n_start, int n_end) +static int coroutine_fn do_perform_cow(BlockDriverState *bs, + uint64_t src_cluster_offset, + uint64_t cluster_offset, + int offset_in_cluster, + int bytes) { BDRVQcow2State *s = bs->opaque; QEMUIOVector qiov; struct iovec iov; - int n, ret; - - n = n_end - n_start; - if (n <= 0) { - return 0; - } + int ret; - iov.iov_len = n * BDRV_SECTOR_SIZE; + iov.iov_len = bytes; iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); if (iov.iov_base == NULL) { return -ENOMEM; @@ -423,17 +419,21 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, * interface. This avoids double I/O throttling and request tracking, * which can lead to deadlock when block layer copy-on-read is enabled. */ - ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); + ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster, + bytes, &qiov, 0); if (ret < 0) { goto out; } if (bs->encrypted) { Error *err = NULL; + int64_t sector = (cluster_offset + offset_in_cluster) + >> BDRV_SECTOR_BITS; assert(s->cipher); - if (qcow2_encrypt_sectors(s, start_sect + n_start, - iov.iov_base, iov.iov_base, n, - true, &err) < 0) { + assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); + assert((bytes & ~BDRV_SECTOR_MASK) == 0); + if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base, + bytes >> BDRV_SECTOR_BITS, true, &err) < 0) { ret = -EIO; error_free(err); goto out; @@ -441,14 +441,14 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, } ret = qcow2_pre_write_overlap_check(bs, 0, - cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE); + cluster_offset + offset_in_cluster, bytes); if (ret < 0) { goto out; } BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); - ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n, - &qiov); + ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster, + bytes, &qiov, 0); if (ret < 0) { goto out; } @@ -463,47 +463,43 @@ out: /* * get_cluster_offset * - * For a given offset of the disk image, find the cluster offset in - * qcow2 file. The offset is stored in *cluster_offset. + * For a given offset of the virtual disk, find the cluster type and offset in + * the qcow2 file. The offset is stored in *cluster_offset. * - * on entry, *num is the number of contiguous sectors we'd like to - * access following offset. + * On entry, *bytes is the maximum number of contiguous bytes starting at + * offset that we are interested in. * - * on exit, *num is the number of contiguous sectors we can read. + * On exit, *bytes is the number of bytes starting at offset that have the same + * cluster type and (if applicable) are stored contiguously in the image file. + * Compressed clusters are always returned one by one. * * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error * cases. 
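The sector-to-byte conversion in qcow2_get_cluster_offset() (next hunk) is easiest to check as plain arithmetic; a sketch of the entry-side accounting, with names as in the code:

/* Sketch: byte accounting at entry to qcow2_get_cluster_offset().
 * bytes_needed = caller's request plus the offset inside the first
 * cluster; bytes_available = distance to the end of the current L2
 * slice, measured from the start of that first cluster. */
offset_in_cluster = offset_into_cluster(s, offset);
bytes_needed = (uint64_t)*bytes + offset_in_cluster;
bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
                  + offset_in_cluster;
if (bytes_needed > bytes_available) {
    bytes_needed = bytes_available;   /* clamp to this L2 table */
}
/* On the way out: *bytes = bytes_available - offset_in_cluster; */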
*/ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *cluster_offset) + unsigned int *bytes, uint64_t *cluster_offset) { BDRVQcow2State *s = bs->opaque; unsigned int l2_index; uint64_t l1_index, l2_offset, *l2_table; int l1_bits, c; - unsigned int index_in_cluster, nb_clusters; - uint64_t nb_available, nb_needed; + unsigned int offset_in_cluster; + uint64_t bytes_available, bytes_needed, nb_clusters; int ret; - index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); - nb_needed = *num + index_in_cluster; + offset_in_cluster = offset_into_cluster(s, offset); + bytes_needed = (uint64_t) *bytes + offset_in_cluster; l1_bits = s->l2_bits + s->cluster_bits; - /* compute how many bytes there are between the offset and - * the end of the l1 entry - */ - - nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); - - /* compute the number of available sectors */ + /* compute how many bytes there are between the start of the cluster + * containing offset and the end of the l1 entry */ + bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)) + + offset_in_cluster; - nb_available = (nb_available >> 9) + index_in_cluster; - - if (nb_needed > nb_available) { - nb_needed = nb_available; + if (bytes_needed > bytes_available) { + bytes_needed = bytes_available; } - assert(nb_needed <= INT_MAX); *cluster_offset = 0; @@ -540,8 +536,11 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); *cluster_offset = be64_to_cpu(l2_table[l2_index]); - /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */ - nb_clusters = size_to_clusters(s, nb_needed << 9); + nb_clusters = size_to_clusters(s, bytes_needed); + /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned + * integers; the minimum cluster size is 512, so this assertion is always + * true */ + assert(nb_clusters <= INT_MAX); ret = qcow2_get_cluster_type(*cluster_offset); switch (ret) { @@ -588,13 +587,18 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - nb_available = (c * s->cluster_sectors); + bytes_available = (int64_t)c * s->cluster_size; out: - if (nb_available > nb_needed) - nb_available = nb_needed; + if (bytes_available > bytes_needed) { + bytes_available = bytes_needed; + } - *num = nb_available - index_in_cluster; + /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster; + * subtracting offset_in_cluster will therefore definitely yield something + * not exceeding UINT_MAX */ + assert(bytes_available - offset_in_cluster <= UINT_MAX); + *bytes = bytes_available - offset_in_cluster; return ret; @@ -740,14 +744,12 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) BDRVQcow2State *s = bs->opaque; int ret; - if (r->nb_sectors == 0) { + if (r->nb_bytes == 0) { return 0; } qemu_co_mutex_unlock(&s->lock); - ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, - r->offset / BDRV_SECTOR_SIZE, - r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); + ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes); qemu_co_mutex_lock(&s->lock); if (ret < 0) { @@ -809,13 +811,14 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) assert(l2_index + m->nb_clusters <= s->l2_size); for (i = 0; i < m->nb_clusters; i++) { /* if two concurrent writes happen to the same unallocated cluster - * each write allocates separate cluster and 
writes data concurrently. - * The first one to complete updates l2 table with pointer to its - * cluster the second one has to do RMW (which is done above by - * copy_sectors()), update l2 table with its cluster pointer and free - * old cluster. This is what this loop does */ - if(l2_table[l2_index + i] != 0) + * each write allocates separate cluster and writes data concurrently. + * The first one to complete updates l2 table with pointer to its + * cluster the second one has to do RMW (which is done above by + * perform_cow()), update l2 table with its cluster pointer and free + * old cluster. This is what this loop does */ + if (l2_table[l2_index + i] != 0) { old_cluster[j++] = l2_table[l2_index + i]; + } l2_table[l2_index + i] = cpu_to_be64((cluster_offset + (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); @@ -1197,25 +1200,20 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, /* * Save info needed for meta data update. * - * requested_sectors: Number of sectors from the start of the first + * requested_bytes: Number of bytes from the start of the first * newly allocated cluster to the end of the (possibly shortened * before) write request. * - * avail_sectors: Number of sectors from the start of the first + * avail_bytes: Number of bytes from the start of the first * newly allocated to the end of the last newly allocated cluster. * - * nb_sectors: The number of sectors from the start of the first + * nb_bytes: The number of bytes from the start of the first * newly allocated cluster to the end of the area that the write * request actually writes to (excluding COW at the end) */ - int requested_sectors = - (*bytes + offset_into_cluster(s, guest_offset)) - >> BDRV_SECTOR_BITS; - int avail_sectors = nb_clusters - << (s->cluster_bits - BDRV_SECTOR_BITS); - int alloc_n_start = offset_into_cluster(s, guest_offset) - >> BDRV_SECTOR_BITS; - int nb_sectors = MIN(requested_sectors, avail_sectors); + uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset); + int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits); + int nb_bytes = MIN(requested_bytes, avail_bytes); QCowL2Meta *old_m = *m; *m = g_malloc0(sizeof(**m)); @@ -1226,23 +1224,21 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, .alloc_offset = alloc_cluster_offset, .offset = start_of_cluster(s, guest_offset), .nb_clusters = nb_clusters, - .nb_available = nb_sectors, .cow_start = { .offset = 0, - .nb_sectors = alloc_n_start, + .nb_bytes = offset_into_cluster(s, guest_offset), }, .cow_end = { - .offset = nb_sectors * BDRV_SECTOR_SIZE, - .nb_sectors = avail_sectors - nb_sectors, + .offset = nb_bytes, + .nb_bytes = avail_bytes - nb_bytes, }, }; qemu_co_queue_init(&(*m)->dependent_requests); QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); - *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) - - offset_into_cluster(s, guest_offset)); + *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset)); assert(*bytes != 0); return 1; @@ -1274,7 +1270,8 @@ fail: * Return 0 on success and -errno in error cases */ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *host_offset, QCowL2Meta **m) + unsigned int *bytes, uint64_t *host_offset, + QCowL2Meta **m) { BDRVQcow2State *s = bs->opaque; uint64_t start, remaining; @@ -1282,13 +1279,11 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, uint64_t cur_bytes; int ret; - 
trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num); - - assert((offset & ~BDRV_SECTOR_MASK) == 0); + trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes); again: start = offset; - remaining = (uint64_t)*num << BDRV_SECTOR_BITS; + remaining = *bytes; cluster_offset = 0; *host_offset = 0; cur_bytes = 0; @@ -1374,8 +1369,8 @@ again: } } - *num -= remaining >> BDRV_SECTOR_BITS; - assert(*num > 0); + *bytes -= remaining; + assert(*bytes > 0); assert(*host_offset != 0); return 0; @@ -1420,7 +1415,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) sector_offset = coffset & 511; csize = nb_csectors * 512 - sector_offset; BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); - ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data, + ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); if (ret < 0) { return ret; @@ -1689,7 +1684,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, (void **)&l2_table); } else { /* load inactive L2 tables from disk */ - ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE, + ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE, (void *)l2_table, s->cluster_sectors); } if (ret < 0) { @@ -1764,8 +1759,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE, - s->cluster_sectors, 0); + ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0); if (ret < 0) { if (!preallocated) { qcow2_free_clusters(bs, offset, s->cluster_size, @@ -1797,7 +1791,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - ret = bdrv_write(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE, + ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE, (void *)l2_table, s->cluster_sectors); if (ret < 0) { goto fail; @@ -1867,12 +1861,12 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs, } for (i = 0; i < s->nb_snapshots; i++) { - int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) + - BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE; + int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size * + sizeof(uint64_t), BDRV_SECTOR_SIZE); l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); - ret = bdrv_read(bs->file->bs, + ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors); if (ret < 0) { diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index ca6094ff5..cbfb3fe06 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -28,6 +28,7 @@ #include "block/block_int.h" #include "block/qcow2.h" #include "qemu/range.h" +#include "qemu/bswap.h" static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, @@ -103,7 +104,7 @@ int qcow2_refcount_init(BlockDriverState *bs) goto fail; } BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); - ret = bdrv_pread(bs->file->bs, s->refcount_table_offset, + ret = bdrv_pread(bs->file, s->refcount_table_offset, s->refcount_table, refcount_table_size2); if (ret < 0) { goto fail; @@ -217,13 +218,10 @@ static int load_refcount_block(BlockDriverState *bs, void **refcount_block) { BDRVQcow2State *s = bs->opaque; - int ret; BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); - ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, - refcount_block); - - return ret; + return qcow2_cache_get(bs, s->refcount_block_cache, 
refcount_block_offset, + refcount_block); } /* @@ -433,7 +431,7 @@ static int alloc_refcount_block(BlockDriverState *bs, if (refcount_table_index < s->refcount_table_size) { uint64_t data64 = cpu_to_be64(new_block); BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); - ret = bdrv_pwrite_sync(bs->file->bs, + ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), &data64, sizeof(data64)); if (ret < 0) { @@ -489,14 +487,12 @@ static int alloc_refcount_block(BlockDriverState *bs, uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); blocks_clusters = 1 + - ((table_clusters + s->refcount_block_size - 1) - / s->refcount_block_size); + DIV_ROUND_UP(table_clusters, s->refcount_block_size); uint64_t meta_clusters = table_clusters + blocks_clusters; last_table_size = table_size; table_size = next_refcount_table_size(s, blocks_used + - ((meta_clusters + s->refcount_block_size - 1) - / s->refcount_block_size)); + DIV_ROUND_UP(meta_clusters, s->refcount_block_size)); } while (last_table_size != table_size); @@ -537,7 +533,7 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Write refcount blocks to disk */ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); - ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks, + ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, blocks_clusters * s->cluster_size); g_free(new_blocks); new_blocks = NULL; @@ -551,7 +547,7 @@ static int alloc_refcount_block(BlockDriverState *bs, } BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); - ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table, + ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, table_size * sizeof(uint64_t)); if (ret < 0) { goto fail_table; @@ -566,10 +562,10 @@ static int alloc_refcount_block(BlockDriverState *bs, uint64_t d64; uint32_t d32; } data; - cpu_to_be64w(&data.d64, table_offset); - cpu_to_be32w(&data.d32, table_clusters); + data.d64 = cpu_to_be64(table_offset); + data.d32 = cpu_to_be32(table_clusters); BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); - ret = bdrv_pwrite_sync(bs->file->bs, + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), &data, sizeof(data)); if (ret < 0) { @@ -619,9 +615,7 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) /* Discard is optional, ignore the return value */ if (ret >= 0) { - bdrv_discard(bs->file->bs, - d->offset >> BDRV_SECTOR_BITS, - d->bytes >> BDRV_SECTOR_BITS); + bdrv_pdiscard(bs->file->bs, d->offset, d->bytes); } g_free(d); @@ -1074,7 +1068,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } l1_allocated = true; - ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); if (ret < 0) { goto fail; } @@ -1227,7 +1221,7 @@ fail: cpu_to_be64s(&l1_table[i]); } - ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset, + ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2); for (i = 0; i < l1_size; i++) { @@ -1386,7 +1380,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, l2_size = s->l2_size * sizeof(uint64_t); l2_table = g_malloc(l2_size); - ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size); + ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size); if (ret < 0) { fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); res->check_errors++; @@ -1518,7 +1512,7 @@ static int check_refcounts_l1(BlockDriverState *bs, res->check_errors++; goto fail; 
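Several hunks in this file (and in qed-check.c and qed.c further down) replace open-coded round-up division with QEMU's DIV_ROUND_UP macro from qemu/osdep.h; the two spellings compute the same value:

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* e.g. from alloc_refcount_block() above: */
    blocks_clusters = 1 + DIV_ROUND_UP(table_clusters, s->refcount_block_size);
    /* previously: 1 + ((table_clusters + s->refcount_block_size - 1)
     *                  / s->refcount_block_size) */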
} - ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); if (ret < 0) { fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); res->check_errors++; @@ -1616,7 +1610,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, } } - ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, + ret = bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)); if (ret < 0) { fprintf(stderr, "ERROR: Could not read L2 table: %s\n", @@ -1668,7 +1662,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, goto fail; } - ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table, + ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size); if (ret < 0) { fprintf(stderr, "ERROR: Could not write L2 table: %s\n", @@ -2102,7 +2096,7 @@ write_refblocks: on_disk_refblock = (void *)((char *) *refcount_table + refblock_index * s->cluster_size); - ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE, + ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE, on_disk_refblock, s->cluster_sectors); if (ret < 0) { fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); @@ -2151,7 +2145,7 @@ write_refblocks: } assert(reftable_size < INT_MAX / sizeof(uint64_t)); - ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable, + ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, reftable_size * sizeof(uint64_t)); if (ret < 0) { fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); @@ -2159,12 +2153,11 @@ write_refblocks: } /* Enter new reftable into the image header */ - cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset, - reftable_offset); - cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters, - size_to_clusters(s, reftable_size * sizeof(uint64_t))); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, - refcount_table_offset), + reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset); + reftable_offset_and_clusters.reftable_clusters = + cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t))); + ret = bdrv_pwrite_sync(bs->file, + offsetof(QCowHeader, refcount_table_offset), &reftable_offset_and_clusters, sizeof(reftable_offset_and_clusters)); if (ret < 0) { @@ -2411,7 +2404,7 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, return -ENOMEM; } - ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2); + ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); if (ret < 0) { g_free(l1); return ret; @@ -2564,7 +2557,7 @@ static int flush_refblock(BlockDriverState *bs, uint64_t **reftable, return ret; } - ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size); + ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to write refblock"); return ret; @@ -2834,7 +2827,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, cpu_to_be64s(&new_reftable[i]); } - ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable, + ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable, new_reftable_size * sizeof(uint64_t)); for (i = 0; i < new_reftable_size; i++) { diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 5f4a17e47..032424322 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -26,6 +26,7 @@ #include "qapi/error.h" #include "block/block_int.h" #include "block/qcow2.h" +#include "qemu/bswap.h" #include 
"qemu/error-report.h" #include "qemu/cutils.h" @@ -66,7 +67,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) for(i = 0; i < s->nb_snapshots; i++) { /* Read statically sized part of the snapshot header */ offset = align_offset(offset, 8); - ret = bdrv_pread(bs->file->bs, offset, &h, sizeof(h)); + ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); if (ret < 0) { goto fail; } @@ -85,7 +86,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) name_size = be16_to_cpu(h.name_size); /* Read extra data */ - ret = bdrv_pread(bs->file->bs, offset, &extra, + ret = bdrv_pread(bs->file, offset, &extra, MIN(sizeof(extra), extra_data_size)); if (ret < 0) { goto fail; @@ -104,7 +105,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) /* Read snapshot ID */ sn->id_str = g_malloc(id_str_size + 1); - ret = bdrv_pread(bs->file->bs, offset, sn->id_str, id_str_size); + ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); if (ret < 0) { goto fail; } @@ -113,7 +114,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) /* Read snapshot name */ sn->name = g_malloc(name_size + 1); - ret = bdrv_pread(bs->file->bs, offset, sn->name, name_size); + ret = bdrv_pread(bs->file, offset, sn->name, name_size); if (ret < 0) { goto fail; } @@ -216,25 +217,25 @@ static int qcow2_write_snapshots(BlockDriverState *bs) h.name_size = cpu_to_be16(name_size); offset = align_offset(offset, 8); - ret = bdrv_pwrite(bs->file->bs, offset, &h, sizeof(h)); + ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); if (ret < 0) { goto fail; } offset += sizeof(h); - ret = bdrv_pwrite(bs->file->bs, offset, &extra, sizeof(extra)); + ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra)); if (ret < 0) { goto fail; } offset += sizeof(extra); - ret = bdrv_pwrite(bs->file->bs, offset, sn->id_str, id_str_size); + ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); if (ret < 0) { goto fail; } offset += id_str_size; - ret = bdrv_pwrite(bs->file->bs, offset, sn->name, name_size); + ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); if (ret < 0) { goto fail; } @@ -256,7 +257,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); header_data.snapshots_offset = cpu_to_be64(snapshots_offset); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, nb_snapshots), + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), &header_data, sizeof(header_data)); if (ret < 0) { goto fail; @@ -398,7 +399,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto fail; } - ret = bdrv_pwrite(bs->file->bs, sn->l1_table_offset, l1_table, + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { goto fail; @@ -511,7 +512,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; } - ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, + ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); if (ret < 0) { goto fail; @@ -529,7 +530,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; } - ret = bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, sn_l1_table, + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, cur_l1_bytes); if (ret < 0) { goto fail; @@ -715,7 +716,7 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, return -ENOMEM; } - ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, + ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); if (ret < 0) { error_setg(errp, "Failed 
to read l1 table for snapshot"); diff --git a/block/qcow2.c b/block/qcow2.c index 470734be9..91ef4dfef 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -36,6 +36,7 @@ #include "trace.h" #include "qemu/option_int.h" #include "qemu/cutils.h" +#include "qemu/bswap.h" /* Differences with QCOW: @@ -106,7 +107,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("attempting to read extended header in offset %lu\n", offset); #endif - ret = bdrv_pread(bs->file->bs, offset, &ext, sizeof(ext)); + ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext)); if (ret < 0) { error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " "pread fail from offset %" PRIu64, offset); @@ -134,7 +135,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, sizeof(bs->backing_format)); return 2; } - ret = bdrv_pread(bs->file->bs, offset, bs->backing_format, ext.len); + ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " "Could not read format name"); @@ -150,7 +151,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, case QCOW2_EXT_MAGIC_FEATURE_TABLE: if (p_feature_table != NULL) { void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); - ret = bdrv_pread(bs->file->bs, offset , feature_table, ext.len); + ret = bdrv_pread(bs->file, offset , feature_table, ext.len); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " "Could not read table"); @@ -171,7 +172,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, uext->len = ext.len; QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); - ret = bdrv_pread(bs->file->bs, offset , uext->data, uext->len); + ret = bdrv_pread(bs->file, offset , uext->data, uext->len); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: unknown extension: " "Could not read data"); @@ -248,7 +249,7 @@ int qcow2_mark_dirty(BlockDriverState *bs) } val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); - ret = bdrv_pwrite(bs->file->bs, offsetof(QCowHeader, incompatible_features), + ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), &val, sizeof(val)); if (ret < 0) { return ret; @@ -816,7 +817,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, uint64_t ext_end; uint64_t l1_vm_state_index; - ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read qcow2 header"); goto fail; @@ -891,7 +892,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (header.header_length > sizeof(header)) { s->unknown_header_fields_size = header.header_length - sizeof(header); s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); - ret = bdrv_pread(bs->file->bs, sizeof(header), s->unknown_header_fields, + ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, s->unknown_header_fields_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read unknown qcow2 header " @@ -967,13 +968,19 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (s->crypt_method_header) { if (bdrv_uses_whitelist() && s->crypt_method_header == QCOW_CRYPT_AES) { - error_report("qcow2 built-in AES encryption is deprecated"); - error_printf("Support for it will be removed in a future release.\n" - "You can use 'qemu-img convert' to switch to an\n" - 
"unencrypted qcow2 image, or a LUKS raw image.\n"); + error_setg(errp, + "Use of AES-CBC encrypted qcow2 images is no longer " + "supported in system emulators"); + error_append_hint(errp, + "You can use 'qemu-img convert' to convert your " + "image to an alternative supported format, such " + "as unencrypted qcow2, or raw with the LUKS " + "format instead.\n"); + ret = -ENOSYS; + goto fail; } - bs->encrypted = 1; + bs->encrypted = true; } s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ @@ -1059,7 +1066,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, ret = -ENOMEM; goto fail; } - ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table, + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read L1 table"); @@ -1115,7 +1122,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } - ret = bdrv_pread(bs->file->bs, header.backing_file_offset, + ret = bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read backing file name"); @@ -1192,7 +1199,11 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVQcow2State *s = bs->opaque; - bs->bl.write_zeroes_alignment = s->cluster_sectors; + if (bs->encrypted) { + /* Encryption works on a sector granularity */ + bs->bl.request_alignment = BDRV_SECTOR_SIZE; + } + bs->bl.pwrite_zeroes_alignment = s->cluster_size; } static int qcow2_set_key(BlockDriverState *bs, const char *key) @@ -1330,16 +1341,20 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, BDRVQcow2State *s = bs->opaque; uint64_t cluster_offset; int index_in_cluster, ret; + unsigned int bytes; int64_t status = 0; - *pnum = nb_sectors; + bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE); qemu_co_mutex_lock(&s->lock); - ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); + ret = qcow2_get_cluster_offset(bs, sector_num << 9, &bytes, + &cluster_offset); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { return ret; } + *pnum = bytes >> BDRV_SECTOR_BITS; + if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && !s->cipher) { index_in_cluster = sector_num & (s->cluster_sectors - 1); @@ -1357,28 +1372,34 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, /* handle reading after the end of the backing file */ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t sector_num, int nb_sectors) + int64_t offset, int bytes) { + uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE; int n1; - if ((sector_num + nb_sectors) <= bs->total_sectors) - return nb_sectors; - if (sector_num >= bs->total_sectors) + + if ((offset + bytes) <= bs_size) { + return bytes; + } + + if (offset >= bs_size) { n1 = 0; - else - n1 = bs->total_sectors - sector_num; + } else { + n1 = bs_size - offset; + } - qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); + qemu_iovec_memset(qiov, n1, 0, bytes - n1); return n1; } -static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, - int remaining_sectors, QEMUIOVector *qiov) +static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { BDRVQcow2State *s = bs->opaque; - int index_in_cluster, n1; + int offset_in_cluster, n1; int ret; - int cur_nr_sectors; /* number of sectors in current iteration 
*/ + unsigned int cur_bytes; /* number of bytes in current iteration */ uint64_t cluster_offset = 0; uint64_t bytes_done = 0; QEMUIOVector hd_qiov; @@ -1388,26 +1409,24 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, qemu_co_mutex_lock(&s->lock); - while (remaining_sectors != 0) { + while (bytes != 0) { /* prepare next request */ - cur_nr_sectors = remaining_sectors; + cur_bytes = MIN(bytes, INT_MAX); if (s->cipher) { - cur_nr_sectors = MIN(cur_nr_sectors, - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + cur_bytes = MIN(cur_bytes, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); } - ret = qcow2_get_cluster_offset(bs, sector_num << 9, - &cur_nr_sectors, &cluster_offset); + ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset); if (ret < 0) { goto fail; } - index_in_cluster = sector_num & (s->cluster_sectors - 1); + offset_in_cluster = offset_into_cluster(s, offset); qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - cur_nr_sectors * 512); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes); switch (ret) { case QCOW2_CLUSTER_UNALLOCATED: @@ -1415,18 +1434,17 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, if (bs->backing) { /* read from the base image */ n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov, - sector_num, cur_nr_sectors); + offset, cur_bytes); if (n1 > 0) { QEMUIOVector local_qiov; qemu_iovec_init(&local_qiov, hd_qiov.niov); - qemu_iovec_concat(&local_qiov, &hd_qiov, 0, - n1 * BDRV_SECTOR_SIZE); + qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1); BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing->bs, sector_num, - n1, &local_qiov); + ret = bdrv_co_preadv(bs->backing, offset, n1, + &local_qiov, 0); qemu_co_mutex_lock(&s->lock); qemu_iovec_destroy(&local_qiov); @@ -1437,12 +1455,12 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, } } else { /* Note: in this case, no need to wait */ - qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes); } break; case QCOW2_CLUSTER_ZERO: - qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes); break; case QCOW2_CLUSTER_COMPRESSED: @@ -1453,8 +1471,8 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, } qemu_iovec_from_buf(&hd_qiov, 0, - s->cluster_cache + index_in_cluster * 512, - 512 * cur_nr_sectors); + s->cluster_cache + offset_in_cluster, + cur_bytes); break; case QCOW2_CLUSTER_NORMAL: @@ -1481,34 +1499,34 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, } } - assert(cur_nr_sectors <= - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cluster_data, - 512 * cur_nr_sectors); + qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes); } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file->bs, - (cluster_offset >> 9) + index_in_cluster, - cur_nr_sectors, &hd_qiov); + ret = bdrv_co_preadv(bs->file, + cluster_offset + offset_in_cluster, + cur_bytes, &hd_qiov, 0); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; } if (bs->encrypted) { assert(s->cipher); + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0); Error *err = NULL; - if 
(qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, false, - &err) < 0) { + if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS, + cluster_data, cluster_data, + cur_bytes >> BDRV_SECTOR_BITS, + false, &err) < 0) { error_free(err); ret = -EIO; goto fail; } - qemu_iovec_from_buf(qiov, bytes_done, - cluster_data, 512 * cur_nr_sectors); + qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes); } break; @@ -1518,9 +1536,9 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, goto fail; } - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; + bytes -= cur_bytes; + offset += cur_bytes; + bytes_done += cur_bytes; } ret = 0; @@ -1533,23 +1551,21 @@ fail: return ret; } -static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, - int64_t sector_num, - int remaining_sectors, - QEMUIOVector *qiov) +static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { BDRVQcow2State *s = bs->opaque; - int index_in_cluster; + int offset_in_cluster; int ret; - int cur_nr_sectors; /* number of sectors in current iteration */ + unsigned int cur_bytes; /* number of sectors in current iteration */ uint64_t cluster_offset; QEMUIOVector hd_qiov; uint64_t bytes_done = 0; uint8_t *cluster_data = NULL; QCowL2Meta *l2meta = NULL; - trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, - remaining_sectors); + trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes); qemu_iovec_init(&hd_qiov, qiov->niov); @@ -1557,22 +1573,21 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); - while (remaining_sectors != 0) { + while (bytes != 0) { l2meta = NULL; trace_qcow2_writev_start_part(qemu_coroutine_self()); - index_in_cluster = sector_num & (s->cluster_sectors - 1); - cur_nr_sectors = remaining_sectors; - if (bs->encrypted && - cur_nr_sectors > - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { - cur_nr_sectors = - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster; + offset_in_cluster = offset_into_cluster(s, offset); + cur_bytes = MIN(bytes, INT_MAX); + if (bs->encrypted) { + cur_bytes = MIN(cur_bytes, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + - offset_in_cluster); } - ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, - &cur_nr_sectors, &cluster_offset, &l2meta); + ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes, + &cluster_offset, &l2meta); if (ret < 0) { goto fail; } @@ -1580,8 +1595,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, assert((cluster_offset & 511) == 0); qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - cur_nr_sectors * 512); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes); if (bs->encrypted) { Error *err = NULL; @@ -1600,8 +1614,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); - if (qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, + if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS, + cluster_data, cluster_data, + cur_bytes >>BDRV_SECTOR_BITS, true, &err) < 0) { error_free(err); ret = -EIO; @@ -1609,13 +1624,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, } qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cluster_data, - cur_nr_sectors * 512); + 
qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes); } ret = qcow2_pre_write_overlap_check(bs, 0, - cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE, - cur_nr_sectors * BDRV_SECTOR_SIZE); + cluster_offset + offset_in_cluster, cur_bytes); if (ret < 0) { goto fail; } @@ -1623,10 +1636,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, qemu_co_mutex_unlock(&s->lock); BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), - (cluster_offset >> 9) + index_in_cluster); - ret = bdrv_co_writev(bs->file->bs, - (cluster_offset >> 9) + index_in_cluster, - cur_nr_sectors, &hd_qiov); + cluster_offset + offset_in_cluster); + ret = bdrv_co_pwritev(bs->file, + cluster_offset + offset_in_cluster, + cur_bytes, &hd_qiov, 0); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; @@ -1652,10 +1665,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, l2meta = next; } - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; - trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); + bytes -= cur_bytes; + offset += cur_bytes; + bytes_done += cur_bytes; + trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes); } ret = 0; @@ -1757,13 +1770,6 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) qcow2_close(bs); - bdrv_invalidate_cache(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - bs->drv = NULL; - return; - } - memset(s, 0, sizeof(BDRVQcow2State)); options = qdict_clone_shallow(bs->options); @@ -1970,7 +1976,7 @@ int qcow2_update_header(BlockDriverState *bs) } /* Write the new header */ - ret = bdrv_pwrite(bs->file->bs, 0, header, s->cluster_size); + ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); if (ret < 0) { goto fail; } @@ -2004,19 +2010,19 @@ static int qcow2_change_backing_file(BlockDriverState *bs, static int preallocate(BlockDriverState *bs) { - uint64_t nb_sectors; + uint64_t bytes; uint64_t offset; uint64_t host_offset = 0; - int num; + unsigned int cur_bytes; int ret; QCowL2Meta *meta; - nb_sectors = bdrv_nb_sectors(bs); + bytes = bdrv_getlength(bs); offset = 0; - while (nb_sectors) { - num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS); - ret = qcow2_alloc_cluster_offset(bs, offset, &num, + while (bytes) { + cur_bytes = MIN(bytes, INT_MAX); + ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes, &host_offset, &meta); if (ret < 0) { return ret; @@ -2042,8 +2048,8 @@ static int preallocate(BlockDriverState *bs) /* TODO Preallocate data if requested */ - nb_sectors -= num; - offset += num << BDRV_SECTOR_BITS; + bytes -= cur_bytes; + offset += cur_bytes; } /* @@ -2052,11 +2058,9 @@ static int preallocate(BlockDriverState *bs) * EOF). Extend the image to the last allocated sector. 
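The preallocate() code that follows implements the trick this comment describes: metadata preallocation allocates clusters without growing the image file, so writing a single zero byte at the last allocated offset forces the file out to its full size. As a standalone sketch (extend_file_to is a hypothetical helper name, assuming QEMU's byte-based bdrv_pwrite() on a BdrvChild):

    static int extend_file_to(BdrvChild *file, uint64_t end)
    {
        uint8_t data = 0;
        /* writing one byte at end - 1 grows the file to `end` bytes */
        return bdrv_pwrite(file, end - 1, &data, 1);
    }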
*/ if (host_offset != 0) { - uint8_t buf[BDRV_SECTOR_SIZE]; - memset(buf, 0, BDRV_SECTOR_SIZE); - ret = bdrv_write(bs->file->bs, - (host_offset >> BDRV_SECTOR_BITS) + num - 1, - buf, 1); + uint8_t data = 0; + ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1, + &data, 1); if (ret < 0) { return ret; } @@ -2207,7 +2211,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); } - ret = blk_pwrite(blk, 0, header, cluster_size); + ret = blk_pwrite(blk, 0, header, cluster_size, 0); g_free(header); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write qcow2 header"); @@ -2217,7 +2221,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Write a refcount table with one refcount block */ refcount_table = g_malloc0(2 * cluster_size); refcount_table[0] = cpu_to_be64(2 * cluster_size); - ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size); + ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0); g_free(refcount_table); if (ret < 0) { @@ -2400,9 +2404,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, cluster_size, prealloc, opts, version, refcount_order, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); finish: g_free(backing_file); @@ -2411,35 +2413,81 @@ finish: return ret; } -static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) + +static bool is_zero_sectors(BlockDriverState *bs, int64_t start, + uint32_t count) +{ + int nr; + BlockDriverState *file; + int64_t res; + + if (!count) { + return true; + } + res = bdrv_get_block_status_above(bs, NULL, start, count, + &nr, &file); + return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count; +} + +static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { int ret; BDRVQcow2State *s = bs->opaque; - /* Emulate misaligned zero writes */ - if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { - return -ENOTSUP; + uint32_t head = offset % s->cluster_size; + uint32_t tail = (offset + count) % s->cluster_size; + + trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count); + + if (head || tail) { + int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS; + uint64_t off; + unsigned int nr; + + assert(head + count <= s->cluster_size); + + /* check whether remainder of cluster already reads as zero */ + if (!(is_zero_sectors(bs, cl_start, + DIV_ROUND_UP(head, BDRV_SECTOR_SIZE)) && + is_zero_sectors(bs, (offset + count) >> BDRV_SECTOR_BITS, + DIV_ROUND_UP(-tail & (s->cluster_size - 1), + BDRV_SECTOR_SIZE)))) { + return -ENOTSUP; + } + + qemu_co_mutex_lock(&s->lock); + /* We can have new write after previous check */ + offset = cl_start << BDRV_SECTOR_BITS; + count = s->cluster_size; + nr = s->cluster_size; + ret = qcow2_get_cluster_offset(bs, offset, &nr, &off); + if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) { + qemu_co_mutex_unlock(&s->lock); + return -ENOTSUP; + } + } else { + qemu_co_mutex_lock(&s->lock); } + trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count); + /* Whatever is left can use real zero clusters */ - qemu_co_mutex_lock(&s->lock); - ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors); + ret = qcow2_zero_clusters(bs, offset, count >> 
BDRV_SECTOR_BITS); qemu_co_mutex_unlock(&s->lock); return ret; } -static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, + int64_t offset, int count) { int ret; BDRVQcow2State *s = bs->opaque; qemu_co_mutex_lock(&s->lock); - ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors, QCOW2_DISCARD_REQUEST, false); + ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS, + QCOW2_DISCARD_REQUEST, false); qemu_co_mutex_unlock(&s->lock); return ret; } @@ -2475,7 +2523,7 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset) /* write updated header.size */ offset = cpu_to_be64(offset); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, size), + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), &offset, sizeof(uint64_t)); if (ret < 0) { return ret; @@ -2485,6 +2533,51 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset) return 0; } +typedef struct Qcow2WriteCo { + BlockDriverState *bs; + int64_t sector_num; + const uint8_t *buf; + int nb_sectors; + int ret; +} Qcow2WriteCo; + +static void qcow2_write_co_entry(void *opaque) +{ + Qcow2WriteCo *co = opaque; + QEMUIOVector qiov; + uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE; + uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE; + + struct iovec iov = (struct iovec) { + .iov_base = (uint8_t*) co->buf, + .iov_len = bytes, + }; + qemu_iovec_init_external(&qiov, &iov, 1); + + co->ret = qcow2_co_pwritev(co->bs, offset, bytes, &qiov, 0); +} + +/* Wrapper for non-coroutine contexts */ +static int qcow2_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + Coroutine *co; + AioContext *aio_context = bdrv_get_aio_context(bs); + Qcow2WriteCo data = { + .bs = bs, + .sector_num = sector_num, + .buf = buf, + .nb_sectors = nb_sectors, + .ret = -EINPROGRESS, + }; + co = qemu_coroutine_create(qcow2_write_co_entry, &data); + qemu_coroutine_enter(co); + while (data.ret == -EINPROGRESS) { + aio_poll(aio_context, true); + } + return data.ret; +} + /* XXX: put compressed sectors first, then all the cluster aligned tables to avoid losing bytes in alignment */ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, @@ -2519,7 +2612,7 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, return ret; } - out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + out_buf = g_malloc(s->cluster_size); /* best compression, small window, no zlib header */ memset(&strm, 0, sizeof(strm)); @@ -2548,7 +2641,7 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, if (ret != Z_STREAM_END || out_len >= s->cluster_size) { /* could not compress: write normal cluster */ - ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + ret = qcow2_write(bs, sector_num, buf, s->cluster_sectors); if (ret < 0) { goto fail; } @@ -2567,7 +2660,7 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); - ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len); + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); if (ret < 0) { goto fail; } @@ -2616,8 +2709,8 @@ static int make_completely_empty(BlockDriverState *bs) /* After this call, neither the in-memory nor the on-disk refcount * information accurately describe the actual references */ - ret = bdrv_write_zeroes(bs->file->bs, 
s->l1_table_offset / BDRV_SECTOR_SIZE, - l1_clusters * s->cluster_sectors, 0); + ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset, + l1_clusters * s->cluster_size, 0); if (ret < 0) { goto fail_broken_refcounts; } @@ -2630,9 +2723,8 @@ static int make_completely_empty(BlockDriverState *bs) * overwrite parts of the existing refcount and L1 table, which is not * an issue because the dirty flag is set, complete data loss is in fact * desired and partial data loss is consequently fine as well */ - ret = bdrv_write_zeroes(bs->file->bs, s->cluster_size / BDRV_SECTOR_SIZE, - (2 + l1_clusters) * s->cluster_size / - BDRV_SECTOR_SIZE, 0); + ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size, + (2 + l1_clusters) * s->cluster_size, 0); /* This call (even if it failed overall) may have overwritten on-disk * refcount structures; in that case, the in-memory refcount information * will probably differ from the on-disk information which makes the BDS @@ -2647,10 +2739,10 @@ static int make_completely_empty(BlockDriverState *bs) /* "Create" an empty reftable (one cluster) directly after the image * header and an empty L1 table three clusters after the image header; * the cluster between those two will be used as the first refblock */ - cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size); - cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size); - cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_table_offset), + l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size); + l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size); + l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset), &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); if (ret < 0) { goto fail_broken_refcounts; @@ -2681,7 +2773,7 @@ static int make_completely_empty(BlockDriverState *bs) /* Enter the first refblock into the reftable */ rt_entry = cpu_to_be64(2 * s->cluster_size); - ret = bdrv_pwrite_sync(bs->file->bs, s->cluster_size, + ret = bdrv_pwrite_sync(bs->file, s->cluster_size, &rt_entry, sizeof(rt_entry)); if (ret < 0) { goto fail_broken_refcounts; @@ -2774,14 +2866,14 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) int ret; qemu_co_mutex_lock(&s->lock); - ret = qcow2_cache_flush(bs, s->l2_table_cache); + ret = qcow2_cache_write(bs, s->l2_table_cache); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); return ret; } if (qcow2_need_accurate_refcounts(s)) { - ret = qcow2_cache_flush(bs, s->refcount_block_cache); + ret = qcow2_cache_write(bs, s->refcount_block_cache); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); return ret; @@ -2861,36 +2953,20 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { BDRVQcow2State *s = bs->opaque; - int64_t total_sectors = bs->total_sectors; - bool zero_beyond_eof = bs->zero_beyond_eof; - int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); - bs->zero_beyond_eof = false; - ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); - bs->zero_beyond_eof = zero_beyond_eof; - - /* bdrv_co_do_writev will have increased the total_sectors value to include - * the VM state - the VM state is however not an actual part of the block - * device, therefore, we need to restore the old value. 
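The qcow2_co_flush_to_os() hunk above switches from qcow2_cache_flush() to the new qcow2_cache_write() (declared in the qcow2.h hunk further down): flushing caches to the OS only needs the dirty table entries written to the image file, not an fsync. A plausible reading of the relationship between the two calls, assuming qcow2_cache_write() writes dirty entries back without syncing:

    int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
    {
        int ret = qcow2_cache_write(bs, c);   /* write back dirty tables */
        if (ret == 0) {
            ret = bdrv_flush(bs->file->bs);   /* then sync the image file */
        }
        return ret;
    }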
*/ - bs->total_sectors = total_sectors; - - return ret; + return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos, + qiov->size, qiov, 0); } -static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) +static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) { BDRVQcow2State *s = bs->opaque; - bool zero_beyond_eof = bs->zero_beyond_eof; - int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); - bs->zero_beyond_eof = false; - ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); - bs->zero_beyond_eof = zero_beyond_eof; - - return ret; + return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos, + qiov->size, qiov, 0); } /* @@ -3329,12 +3405,12 @@ BlockDriver bdrv_qcow2 = { .bdrv_co_get_block_status = qcow2_co_get_block_status, .bdrv_set_key = qcow2_set_key, - .bdrv_co_readv = qcow2_co_readv, - .bdrv_co_writev = qcow2_co_writev, + .bdrv_co_preadv = qcow2_co_preadv, + .bdrv_co_pwritev = qcow2_co_pwritev, .bdrv_co_flush_to_os = qcow2_co_flush_to_os, - .bdrv_co_write_zeroes = qcow2_co_write_zeroes, - .bdrv_co_discard = qcow2_co_discard, + .bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes, + .bdrv_co_pdiscard = qcow2_co_pdiscard, .bdrv_truncate = qcow2_truncate, .bdrv_write_compressed = qcow2_write_compressed, .bdrv_make_empty = qcow2_make_empty, diff --git a/block/qcow2.h b/block/qcow2.h index a063a3c1a..b36a7bf8a 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -302,8 +302,8 @@ typedef struct Qcow2COWRegion { */ uint64_t offset; - /** Number of sectors to copy */ - int nb_sectors; + /** Number of bytes to copy */ + int nb_bytes; } Qcow2COWRegion; /** @@ -318,12 +318,6 @@ typedef struct QCowL2Meta /** Host offset of the first newly allocated cluster */ uint64_t alloc_offset; - /** - * Number of sectors from the start of the first allocated cluster to - * the end of the (possibly shortened) request - */ - int nb_available; - /** Number of newly allocated clusters */ int nb_clusters; @@ -471,8 +465,7 @@ static inline uint64_t l2meta_cow_start(QCowL2Meta *m) static inline uint64_t l2meta_cow_end(QCowL2Meta *m) { - return m->offset + m->cow_end.offset - + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); + return m->offset + m->cow_end.offset + m->cow_end.nb_bytes; } static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2) @@ -544,9 +537,10 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, int nb_sectors, bool enc, Error **errp); int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *cluster_offset); + unsigned int *bytes, uint64_t *cluster_offset); int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *host_offset, QCowL2Meta **m); + unsigned int *bytes, uint64_t *host_offset, + QCowL2Meta **m); uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size); @@ -583,6 +577,7 @@ int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, void *table); int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c); int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); void qcow2_cache_depends_on_flush(Qcow2Cache *c); diff --git a/block/qed-check.c b/block/qed-check.c index 622f30897..dcd4f036b 100644 --- a/block/qed-check.c +++ b/block/qed-check.c @@ -234,8 +234,7 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult 
*result, bool fix) } check.result->bfi.total_clusters = - (s->header.image_size + s->header.cluster_size - 1) / - s->header.cluster_size; + DIV_ROUND_UP(s->header.image_size, s->header.cluster_size); ret = qed_check_l1_table(&check, s->l1_table); if (ret == 0) { /* Only check for leaks if entire image was scanned successfully */ diff --git a/block/qed-table.c b/block/qed-table.c index 802945f5e..1a731dff5 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -16,6 +16,7 @@ #include "trace.h" #include "qemu/sockets.h" /* for EINPROGRESS on Windows */ #include "qed.h" +#include "qemu/bswap.h" typedef struct { GenericCB gencb; @@ -64,7 +65,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); - bdrv_aio_readv(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, qiov, + bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, qiov->size / BDRV_SECTOR_SIZE, qed_read_table_cb, read_table_cb); } @@ -153,7 +154,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, /* Adjust for offset into table */ offset += start * sizeof(uint64_t); - bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, &write_table_cb->qiov, write_table_cb->qiov.size / BDRV_SECTOR_SIZE, qed_write_table_cb, write_table_cb); diff --git a/block/qed.c b/block/qed.c index 0af52741d..426f3cb44 100644 --- a/block/qed.c +++ b/block/qed.c @@ -15,6 +15,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu/timer.h" +#include "qemu/bswap.h" #include "trace.h" #include "qed.h" #include "qapi/qmp/qerror.h" @@ -85,7 +86,7 @@ int qed_write_header_sync(BDRVQEDState *s) int ret; qed_header_cpu_to_le(&s->header, &le); - ret = bdrv_pwrite(s->bs->file->bs, 0, &le, sizeof(le)); + ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); if (ret != sizeof(le)) { return ret; } @@ -122,7 +123,7 @@ static void qed_write_header_read_cb(void *opaque, int ret) /* Update header */ qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); - bdrv_aio_writev(s->bs->file->bs, 0, &write_header_cb->qiov, + bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, write_header_cb->nsectors, qed_write_header_cb, write_header_cb); } @@ -142,8 +143,7 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, * them, and write back. */ - int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / - BDRV_SECTOR_SIZE; + int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE); size_t len = nsectors * BDRV_SECTOR_SIZE; QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), cb, opaque); @@ -155,7 +155,7 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, write_header_cb->iov.iov_len = len; qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); - bdrv_aio_readv(s->bs->file->bs, 0, &write_header_cb->qiov, nsectors, + bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, qed_write_header_read_cb, write_header_cb); } @@ -218,7 +218,7 @@ static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, * * The string is NUL-terminated. 
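The qed_read_string() signature change just below (BlockDriverState * to BdrvChild *) follows the pattern running through this whole import: the byte- and sector-based helpers now take the BdrvChild, so callers pass bs->file straight through instead of dereferencing bs->file->bs at every call site. A minimal illustration (read_le_header is a hypothetical name):

    static int read_le_header(BlockDriverState *bs, QEDHeader *le)
    {
        /* before: bdrv_pread(bs->file->bs, 0, le, sizeof(*le)); */
        return bdrv_pread(bs->file, 0, le, sizeof(*le));
    }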
*/ -static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, +static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n, char *buf, size_t buflen) { int ret; @@ -389,7 +389,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->bs = bs; QSIMPLEQ_INIT(&s->allocating_write_reqs); - ret = bdrv_pread(bs->file->bs, 0, &le_header, sizeof(le_header)); + ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); if (ret < 0) { return ret; } @@ -446,7 +446,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } - ret = qed_read_string(bs->file->bs, s->header.backing_filename_offset, + ret = qed_read_string(bs->file, s->header.backing_filename_offset, s->header.backing_filename_size, bs->backing_file, sizeof(bs->backing_file)); if (ret < 0) { @@ -517,7 +517,7 @@ static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVQEDState *s = bs->opaque; - bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; + bs->bl.pwrite_zeroes_alignment = s->header.cluster_size; } /* We have nothing to do for QED reopen, stubs just return @@ -601,18 +601,18 @@ static int qed_create(const char *filename, uint32_t cluster_size, } qed_header_cpu_to_le(&header, &le_header); - ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header)); + ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0); if (ret < 0) { goto out; } ret = blk_pwrite(blk, sizeof(le_header), backing_file, - header.backing_filename_size); + header.backing_filename_size, 0); if (ret < 0) { goto out; } l1_table = g_malloc0(l1_size); - ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size); + ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0); if (ret < 0) { goto out; } @@ -708,7 +708,7 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l } if (cb->co) { - qemu_coroutine_enter(cb->co, NULL); + qemu_coroutine_enter(cb->co); } } @@ -800,7 +800,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, qemu_iovec_concat(*backing_qiov, qiov, 0, size); BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); - bdrv_aio_readv(s->bs->backing->bs, pos / BDRV_SECTOR_SIZE, + bdrv_aio_readv(s->bs->backing, pos / BDRV_SECTOR_SIZE, *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque); } @@ -837,7 +837,7 @@ static void qed_copy_from_backing_file_write(void *opaque, int ret) } BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); - bdrv_aio_writev(s->bs->file->bs, copy_cb->offset / BDRV_SECTOR_SIZE, + bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, qed_copy_from_backing_file_cb, copy_cb); } @@ -1087,7 +1087,7 @@ static void qed_aio_write_main(void *opaque, int ret) } BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); - bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, next_fn, acb); } @@ -1319,7 +1319,7 @@ static void qed_aio_read_data(void *opaque, int ret, } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - bdrv_aio_readv(bs->file->bs, offset / BDRV_SECTOR_SIZE, + bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, qed_aio_next_io, acb); return; @@ -1418,21 +1418,21 @@ typedef struct { bool done; } QEDWriteZeroesCB; -static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, 
int ret) { QEDWriteZeroesCB *cb = opaque; cb->done = true; cb->ret = ret; if (cb->co) { - qemu_coroutine_enter(cb->co, NULL); + qemu_coroutine_enter(cb->co); } } -static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int count, + BdrvRequestFlags flags) { BlockAIOCB *blockacb; BDRVQEDState *s = bs->opaque; @@ -1440,25 +1440,22 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, QEMUIOVector qiov; struct iovec iov; - /* Refuse if there are untouched backing file sectors */ - if (bs->backing) { - if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } - if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } + /* Fall back if the request is not aligned */ + if (qed_offset_into_cluster(s, offset) || + qed_offset_into_cluster(s, count)) { + return -ENOTSUP; } /* Zero writes start without an I/O buffer. If a buffer becomes necessary * then it will be allocated during request processing. */ - iov.iov_base = NULL, - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, + iov.iov_base = NULL; + iov.iov_len = count; qemu_iovec_init_external(&qiov, &iov, 1); - blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, - qed_co_write_zeroes_cb, &cb, + blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov, + count >> BDRV_SECTOR_BITS, + qed_co_pwrite_zeroes_cb, &cb, QED_AIOCB_WRITE | QED_AIOCB_ZERO); if (!blockacb) { return -EIO; @@ -1578,7 +1575,7 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs, } /* Write new header */ - ret = bdrv_pwrite_sync(bs->file->bs, 0, buffer, buffer_len); + ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len); g_free(buffer); if (ret == 0) { memcpy(&s->header, &new_header, sizeof(new_header)); @@ -1594,12 +1591,6 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) bdrv_qed_close(bs); - bdrv_invalidate_cache(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - memset(s, 0, sizeof(BDRVQEDState)); ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); if (local_err) { @@ -1669,7 +1660,7 @@ static BlockDriver bdrv_qed = { .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, - .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, + .bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes, .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, .bdrv_get_info = bdrv_qed_get_info, diff --git a/block/quorum.c b/block/quorum.c index da15465a9..9cf876fb3 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -14,6 +14,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "block/block_int.h" #include "qapi/qmp/qbool.h" #include "qapi/qmp/qdict.h" @@ -67,6 +68,9 @@ typedef struct QuorumVotes { typedef struct BDRVQuorumState { BdrvChild **children; /* children BlockDriverStates */ int num_children; /* children count */ + unsigned next_child_index; /* the index of the next child that should + * be added + */ int threshold; /* if less than threshold children reads gave the * same result a quorum error occurs. 
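The bdrv_qed_co_pwrite_zeroes() conversion above keeps QED's AIO-to-coroutine bridge: the coroutine issues the AIO request, yields, and the completion callback re-enters it with the now single-argument qemu_coroutine_enter(). The yield side is unchanged context not shown in the hunk, so the sketch below is an approximation built from the visible callback and setup call:

    QEDWriteZeroesCB cb = { .done = false };
    BlockAIOCB *blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS,
                                         &qiov, count >> BDRV_SECTOR_BITS,
                                         qed_co_pwrite_zeroes_cb, &cb,
                                         QED_AIOCB_WRITE | QED_AIOCB_ZERO);
    if (!blockacb) {
        return -EIO;
    }
    if (!cb.done) {
        cb.co = qemu_coroutine_self();   /* qed_co_pwrite_zeroes_cb() re-enters us */
        qemu_coroutine_yield();
    }
    return cb.ret;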
*/ @@ -379,7 +383,7 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, continue; } QLIST_FOREACH(item, &version->items, next) { - bdrv_aio_writev(s->children[item->index]->bs, acb->sector_num, + bdrv_aio_writev(s->children[item->index], acb->sector_num, acb->qiov, acb->nb_sectors, quorum_rewrite_aio_cb, acb); } @@ -656,7 +660,7 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) } for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num, + acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i], acb->sector_num, &acb->qcrs[i].qiov, acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]); } @@ -674,7 +678,7 @@ static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb) qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, acb->qcrs[acb->child_iter].buf); acb->qcrs[acb->child_iter].aiocb = - bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num, + bdrv_aio_readv(s->children[acb->child_iter], acb->sector_num, &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, quorum_aio_cb, &acb->qcrs[acb->child_iter]); @@ -715,7 +719,7 @@ static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs, int i; for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i]->bs, sector_num, + acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i], sector_num, qiov, nb_sectors, &quorum_aio_cb, &acb->qcrs[i]); } @@ -747,21 +751,6 @@ static int64_t quorum_getlength(BlockDriverState *bs) return result; } -static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) -{ - BDRVQuorumState *s = bs->opaque; - Error *local_err = NULL; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_invalidate_cache(s->children[i]->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - } -} - static coroutine_fn int quorum_co_flush(BlockDriverState *bs) { BDRVQuorumState *s = bs->opaque; @@ -898,9 +887,9 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto exit; } - if (s->num_children < 2) { + if (s->num_children < 1) { error_setg(&local_err, - "Number of provided children must be greater than 1"); + "Number of provided children must be 1 or more"); ret = -EINVAL; goto exit; } @@ -964,6 +953,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, opened[i] = true; } + s->next_child_index = s->num_children; g_free(opened); goto exit; @@ -981,9 +971,7 @@ close_exit: exit: qemu_opts_del(opts); /* propagate error */ - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); return ret; } @@ -999,25 +987,70 @@ static void quorum_close(BlockDriverState *bs) g_free(s->children); } -static void quorum_detach_aio_context(BlockDriverState *bs) +static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, + Error **errp) { BDRVQuorumState *s = bs->opaque; - int i; + BdrvChild *child; + char indexstr[32]; + int ret; - for (i = 0; i < s->num_children; i++) { - bdrv_detach_aio_context(s->children[i]->bs); + assert(s->num_children <= INT_MAX / sizeof(BdrvChild *)); + if (s->num_children == INT_MAX / sizeof(BdrvChild *) || + s->next_child_index == UINT_MAX) { + error_setg(errp, "Too many children"); + return; } + + ret = snprintf(indexstr, 32, "children.%u", s->next_child_index); + if (ret < 0 || ret >= 32) { + error_setg(errp, "cannot generate child name"); + return; + } + s->next_child_index++; + + bdrv_drained_begin(bs); + + /* We can safely add the child now */ + 
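    /* Note the ordering in quorum_add_child(): quorum takes its own
     * reference on child_bs (the matching bdrv_unref_child() in
     * quorum_del_child() drops it again), attaches the child, and only
     * resizes s->children while the bdrv_drained_begin()/bdrv_drained_end()
     * pair keeps in-flight requests quiesced. */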
bdrv_ref(child_bs); + child = bdrv_attach_child(bs, child_bs, indexstr, &child_format); + s->children = g_renew(BdrvChild *, s->children, s->num_children + 1); + s->children[s->num_children++] = child; + + bdrv_drained_end(bs); } -static void quorum_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) +static void quorum_del_child(BlockDriverState *bs, BdrvChild *child, + Error **errp) { BDRVQuorumState *s = bs->opaque; int i; for (i = 0; i < s->num_children; i++) { - bdrv_attach_aio_context(s->children[i]->bs, new_context); + if (s->children[i] == child) { + break; + } } + + /* we have checked it in bdrv_del_child() */ + assert(i < s->num_children); + + if (s->num_children <= s->threshold) { + error_setg(errp, + "The number of children cannot be lower than the vote threshold %d", + s->threshold); + return; + } + + bdrv_drained_begin(bs); + + /* We can safely remove this child now */ + memmove(&s->children[i], &s->children[i + 1], + (s->num_children - i - 1) * sizeof(BdrvChild *)); + s->children = g_renew(BdrvChild *, s->children, --s->num_children); + bdrv_unref_child(bs, child); + + bdrv_drained_end(bs); } static void quorum_refresh_filename(BlockDriverState *bs, QDict *options) @@ -1070,10 +1103,9 @@ static BlockDriver bdrv_quorum = { .bdrv_aio_readv = quorum_aio_readv, .bdrv_aio_writev = quorum_aio_writev, - .bdrv_invalidate_cache = quorum_invalidate_cache, - .bdrv_detach_aio_context = quorum_detach_aio_context, - .bdrv_attach_aio_context = quorum_attach_aio_context, + .bdrv_add_child = quorum_add_child, + .bdrv_del_child = quorum_del_child, .is_filter = true, .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, diff --git a/block/raw-aio.h b/block/raw-aio.h deleted file mode 100644 index 811e37501..000000000 --- a/block/raw-aio.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Declarations for AIO in the raw protocol - * - * Copyright IBM, Corp. 2008 - * - * Authors: - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. 
- */ -#ifndef QEMU_RAW_AIO_H -#define QEMU_RAW_AIO_H - -#include "qemu/iov.h" - -/* AIO request types */ -#define QEMU_AIO_READ 0x0001 -#define QEMU_AIO_WRITE 0x0002 -#define QEMU_AIO_IOCTL 0x0004 -#define QEMU_AIO_FLUSH 0x0008 -#define QEMU_AIO_DISCARD 0x0010 -#define QEMU_AIO_WRITE_ZEROES 0x0020 -#define QEMU_AIO_TYPE_MASK \ - (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ - QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) - -/* AIO flags */ -#define QEMU_AIO_MISALIGNED 0x1000 -#define QEMU_AIO_BLKDEV 0x2000 - - -/* linux-aio.c - Linux native implementation */ -#ifdef CONFIG_LINUX_AIO -void *laio_init(void); -void laio_cleanup(void *s); -BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type); -void laio_detach_aio_context(void *s, AioContext *old_context); -void laio_attach_aio_context(void *s, AioContext *new_context); -void laio_io_plug(BlockDriverState *bs, void *aio_ctx); -void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug); -#endif - -#ifdef _WIN32 -typedef struct QEMUWin32AIOState QEMUWin32AIOState; -QEMUWin32AIOState *win32_aio_init(void); -void win32_aio_cleanup(QEMUWin32AIOState *aio); -int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile); -BlockAIOCB *win32_aio_submit(BlockDriverState *bs, - QEMUWin32AIOState *aio, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type); -void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, - AioContext *old_context); -void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, - AioContext *new_context); -#endif - -#endif /* QEMU_RAW_AIO_H */ diff --git a/block/raw-posix.c b/block/raw-posix.c index 906d5c941..6ed754739 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -32,7 +32,7 @@ #include "trace.h" #include "block/thread-pool.h" #include "qemu/iov.h" -#include "raw-aio.h" +#include "block/raw-aio.h" #include "qapi/util.h" #include "qapi/qmp/qstring.h" @@ -137,10 +137,6 @@ typedef struct BDRVRawState { int open_flags; size_t buf_align; -#ifdef CONFIG_LINUX_AIO - int use_aio; - void *aio_ctx; -#endif #ifdef CONFIG_XFS bool is_xfs:1; #endif @@ -154,9 +150,6 @@ typedef struct BDRVRawState { typedef struct BDRVRawReopenState { int fd; int open_flags; -#ifdef CONFIG_LINUX_AIO - int use_aio; -#endif } BDRVRawReopenState; static int fd_open(BlockDriverState *bs); @@ -302,22 +295,22 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) /* For SCSI generic devices the alignment is not really used. With buffered I/O, we don't have any restrictions. */ if (bdrv_is_sg(bs) || !s->needs_alignment) { - bs->request_alignment = 1; + bs->bl.request_alignment = 1; s->buf_align = 1; return; } - bs->request_alignment = 0; + bs->bl.request_alignment = 0; s->buf_align = 0; /* Let's try to use the logical blocksize for the alignment. 
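When neither the logical block size nor the XFS DIOINFO query yields an alignment, raw_probe_alignment() (continued below) falls back to trial reads at growing power-of-two alignments. A standalone sketch of that fallback, assuming Linux and an fd opened with O_DIRECT; probe_direct_alignment() is a hypothetical helper, the driver itself uses raw_is_io_aligned() and qemu_memalign():

    /* O_DIRECT reads typically fail with EINVAL unless buffer, offset
     * and length all meet the device's requirement, so the first size
     * that reads successfully is a working request alignment. */
    #include <stdlib.h>
    #include <unistd.h>

    static size_t probe_direct_alignment(int fd, size_t max_align)
    {
        void *buf;
        size_t align;

        if (posix_memalign(&buf, max_align, max_align) != 0) {
            return 0;
        }
        for (align = 512; align <= max_align; align <<= 1) {
            if (pread(fd, buf, align, 0) >= 0) {
                free(buf);
                return align;   /* smallest alignment the kernel accepted */
            }
        }
        free(buf);
        return 0;               /* nothing worked; caller reports an error */
    }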
*/ - if (probe_logical_blocksize(fd, &bs->request_alignment) < 0) { - bs->request_alignment = 0; + if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) { + bs->bl.request_alignment = 0; } #ifdef CONFIG_XFS if (s->is_xfs) { struct dioattr da; if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { - bs->request_alignment = da.d_miniosz; + bs->bl.request_alignment = da.d_miniosz; /* The kernel returns wrong information for d_mem */ /* s->buf_align = da.d_mem; */ } @@ -337,21 +330,21 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) qemu_vfree(buf); } - if (!bs->request_alignment) { + if (!bs->bl.request_alignment) { size_t align; buf = qemu_memalign(s->buf_align, max_align); for (align = 512; align <= max_align; align <<= 1) { if (raw_is_io_aligned(fd, buf, align)) { - bs->request_alignment = align; + bs->bl.request_alignment = align; break; } } qemu_vfree(buf); } - if (!s->buf_align || !bs->request_alignment) { - error_setg(errp, "Could not find working O_DIRECT alignment. " - "Try cache.direct=off."); + if (!s->buf_align || !bs->bl.request_alignment) { + error_setg(errp, "Could not find working O_DIRECT alignment"); + error_append_hint(errp, "Try cache.direct=off\n"); } } @@ -374,58 +367,15 @@ static void raw_parse_flags(int bdrv_flags, int *open_flags) } } -static void raw_detach_aio_context(BlockDriverState *bs) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - - if (s->use_aio) { - laio_detach_aio_context(s->aio_ctx, bdrv_get_aio_context(bs)); - } -#endif -} - -static void raw_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - - if (s->use_aio) { - laio_attach_aio_context(s->aio_ctx, new_context); - } -#endif -} - #ifdef CONFIG_LINUX_AIO -static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags) +static bool raw_use_aio(int bdrv_flags) { - int ret = -1; - assert(aio_ctx != NULL); - assert(use_aio != NULL); /* * Currently Linux do AIO only for files opened with O_DIRECT * specified so check NOCACHE flag too */ - if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == - (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { - - /* if non-NULL, laio_init() has already been run */ - if (*aio_ctx == NULL) { - *aio_ctx = laio_init(); - if (!*aio_ctx) { - goto error; - } - } - *use_aio = 1; - } else { - *use_aio = 0; - } - - ret = 0; - -error: - return ret; + return (bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == + (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO); } #endif @@ -494,13 +444,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->fd = fd; #ifdef CONFIG_LINUX_AIO - if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) { - qemu_close(fd); - ret = -errno; - error_setg_errno(errp, -ret, "Could not set AIO state"); - goto fail; - } - if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) { + if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) { error_setg(errp, "aio=native was specified, but it requires " "cache.direct=on, which was not specified."); ret = -EINVAL; @@ -517,6 +461,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->has_discard = true; s->has_write_zeroes = true; + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { s->needs_alignment = true; } @@ -566,8 +511,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif - raw_attach_aio_context(bs, bdrv_get_aio_context(bs)); - ret = 0; fail: if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { @@ -581,15 
+524,9 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVRawState *s = bs->opaque; - Error *local_err = NULL; - int ret; s->type = FTYPE_FILE; - ret = raw_open_common(bs, options, flags, 0, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - return ret; + return raw_open_common(bs, options, flags, 0, errp); } static int raw_reopen_prepare(BDRVReopenState *state, @@ -608,18 +545,6 @@ static int raw_reopen_prepare(BDRVReopenState *state, state->opaque = g_new0(BDRVRawReopenState, 1); raw_s = state->opaque; -#ifdef CONFIG_LINUX_AIO - raw_s->use_aio = s->use_aio; - - /* we can use s->aio_ctx instead of a copy, because the use_aio flag is - * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio() - * won't override aio_ctx if aio_ctx is non-NULL */ - if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) { - error_setg(errp, "Could not set AIO state"); - return -1; - } -#endif - if (s->type == FTYPE_CD) { raw_s->open_flags |= O_NONBLOCK; } @@ -644,15 +569,7 @@ static int raw_reopen_prepare(BDRVReopenState *state, if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { /* dup the original fd */ - /* TODO: use qemu fcntl wrapper */ -#ifdef F_DUPFD_CLOEXEC - raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0); -#else - raw_s->fd = dup(s->fd); - if (raw_s->fd != -1) { - qemu_set_cloexec(raw_s->fd); - } -#endif + raw_s->fd = qemu_dup(s->fd); if (raw_s->fd >= 0) { ret = fcntl_setfl(raw_s->fd, raw_s->open_flags); if (ret) { @@ -702,9 +619,6 @@ static void raw_reopen_commit(BDRVReopenState *state) qemu_close(s->fd); s->fd = raw_s->fd; -#ifdef CONFIG_LINUX_AIO - s->use_aio = raw_s->use_aio; -#endif g_free(state->opaque); state->opaque = NULL; @@ -728,9 +642,33 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +static int hdev_get_max_transfer_length(int fd) +{ +#ifdef BLKSECTGET + int max_sectors = 0; + if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) { + return max_sectors; + } else { + return -errno; + } +#else + return -ENOSYS; +#endif +} + static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVRawState *s = bs->opaque; + struct stat st; + + if (!fstat(s->fd, &st)) { + if (S_ISBLK(st.st_mode)) { + int ret = hdev_get_max_transfer_length(s->fd); + if (ret > 0 && ret <= BDRV_REQUEST_MAX_SECTORS) { + bs->bl.max_transfer = pow2floor(ret << BDRV_SECTOR_BITS); + } + } + } raw_probe_alignment(bs, s->fd, errp); bs->bl.min_mem_alignment = s->buf_align; @@ -1251,8 +1189,8 @@ static int aio_worker(void *arg) } static int paio_submit_co(BlockDriverState *bs, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - int type) + int64_t offset, QEMUIOVector *qiov, + int count, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); ThreadPool *pool; @@ -1261,22 +1199,22 @@ static int paio_submit_co(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; - acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + acb->aio_nbytes = count; + acb->aio_offset = offset; if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; - assert(qiov->size == acb->aio_nbytes); + assert(qiov->size == count); } - trace_paio_submit_co(sector_num, nb_sectors, type); + trace_paio_submit_co(offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_co(pool, aio_worker, acb); } static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, - int64_t sector_num, 
QEMUIOVector *qiov, int nb_sectors, + int64_t offset, QEMUIOVector *qiov, int count, BlockCompletionFunc *cb, void *opaque, int type) { RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); @@ -1286,8 +1224,8 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; - acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; - acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + acb->aio_nbytes = count; + acb->aio_offset = offset; if (qiov) { acb->aio_iov = qiov->iov; @@ -1295,19 +1233,18 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, assert(qiov->size == acb->aio_nbytes); } - trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); + trace_paio_submit(acb, opaque, offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } -static BlockAIOCB *raw_aio_submit(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type) +static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int type) { BDRVRawState *s = bs->opaque; if (fd_open(bs) < 0) - return NULL; + return -EIO; /* * Check if the underlying device requires requests to be aligned, @@ -1319,61 +1256,50 @@ static BlockAIOCB *raw_aio_submit(BlockDriverState *bs, if (!bdrv_qiov_is_aligned(bs, qiov)) { type |= QEMU_AIO_MISALIGNED; #ifdef CONFIG_LINUX_AIO - } else if (s->use_aio) { - return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, - nb_sectors, cb, opaque, type); + } else if (bs->open_flags & BDRV_O_NATIVE_AIO) { + LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); + assert(qiov->size == bytes); + return laio_co_submit(bs, aio, s->fd, offset, qiov, type); #endif } } - return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors, - cb, opaque, type); + return paio_submit_co(bs, s->fd, offset, qiov, bytes, type); } -static void raw_aio_plug(BlockDriverState *bs) +static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_plug(bs, s->aio_ctx); - } -#endif + return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ); } -static void raw_aio_unplug(BlockDriverState *bs) +static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_unplug(bs, s->aio_ctx, true); - } -#endif + assert(flags == 0); + return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); } -static void raw_aio_flush_io_queue(BlockDriverState *bs) +static void raw_aio_plug(BlockDriverState *bs) { #ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_unplug(bs, s->aio_ctx, false); + if (bs->open_flags & BDRV_O_NATIVE_AIO) { + LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); + laio_io_plug(bs, aio); } #endif } -static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return raw_aio_submit(bs, sector_num, qiov, nb_sectors, - cb, opaque, QEMU_AIO_READ); -} - -static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static void 
raw_aio_unplug(BlockDriverState *bs) { - return raw_aio_submit(bs, sector_num, qiov, nb_sectors, - cb, opaque, QEMU_AIO_WRITE); +#ifdef CONFIG_LINUX_AIO + if (bs->open_flags & BDRV_O_NATIVE_AIO) { + LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); + laio_io_unplug(bs, aio); + } +#endif } static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, @@ -1391,13 +1317,6 @@ static void raw_close(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; - raw_detach_aio_context(bs); - -#ifdef CONFIG_LINUX_AIO - if (s->use_aio) { - laio_cleanup(s->aio_ctx); - } -#endif if (s->fd >= 0) { qemu_close(s->fd); s->fd = -1; @@ -1867,27 +1786,27 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, return ret | BDRV_BLOCK_OFFSET_VALID | start; } -static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, +static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs, + int64_t offset, int count, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; - return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit(bs, s->fd, offset, NULL, count, cb, opaque, QEMU_AIO_DISCARD); } -static int coroutine_fn raw_co_write_zeroes( - BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +static int coroutine_fn raw_co_pwrite_zeroes( + BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_WRITE_ZEROES); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_DISCARD); } return -ENOTSUP; @@ -1940,16 +1859,15 @@ BlockDriver bdrv_file = { .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_get_block_status = raw_co_get_block_status, - .bdrv_co_write_zeroes = raw_co_write_zeroes, + .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, - .bdrv_aio_discard = raw_aio_discard, + .bdrv_aio_pdiscard = raw_aio_pdiscard, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1957,9 +1875,6 @@ BlockDriver bdrv_file = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - .create_opts = &raw_create_opts, }; @@ -2225,9 +2140,7 @@ hdev_open_Mac_error: ret = raw_open_common(bs, options, flags, 0, &local_err); if (ret < 0) { - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); #if defined(__APPLE__) && defined(__MACH__) if (*bsd_path) { filename = bsd_path; @@ -2290,8 +2203,8 @@ static int fd_open(BlockDriverState *bs) return -EIO; } -static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, +static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs, + int64_t offset, int count, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; @@ -2299,12 +2212,12 @@ static 
coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, if (fd_open(bs) < 0) { return NULL; } - return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit(bs, s->fd, offset, NULL, count, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } -static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, BdrvRequestFlags flags) { BDRVRawState *s = bs->opaque; int rc; @@ -2314,10 +2227,10 @@ static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, return rc; } if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + return paio_submit_co(bs, s->fd, offset, NULL, count, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } return -ENOTSUP; @@ -2389,16 +2302,15 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_opts = &raw_create_opts, - .bdrv_co_write_zeroes = hdev_co_write_zeroes, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, - .bdrv_aio_discard = hdev_aio_discard, + .bdrv_aio_pdiscard = hdev_aio_pdiscard, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2408,9 +2320,6 @@ static BlockDriver bdrv_host_device = { .bdrv_probe_blocksizes = hdev_probe_blocksizes, .bdrv_probe_geometry = hdev_probe_geometry, - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - /* generic scsi device */ #ifdef __linux__ .bdrv_aio_ioctl = hdev_aio_ioctl, @@ -2433,17 +2342,11 @@ static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVRawState *s = bs->opaque; - Error *local_err = NULL; - int ret; s->type = FTYPE_CD; /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ - ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - return ret; + return raw_open_common(bs, options, flags, O_NONBLOCK, errp); } static int cdrom_probe_device(const char *filename) @@ -2522,13 +2425,13 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_create = hdev_create, .create_opts = &raw_create_opts, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2536,9 +2439,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - /* removable device support */ .bdrv_is_inserted = cdrom_is_inserted, .bdrv_eject = cdrom_eject, @@ -2561,9 +2461,7 @@ static 
int cdrom_open(BlockDriverState *bs, QDict *options, int flags, ret = raw_open_common(bs, options, flags, 0, &local_err); if (ret) { - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); return ret; } @@ -2658,13 +2556,12 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_create = hdev_create, .create_opts = &raw_create_opts, - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, .bdrv_io_plug = raw_aio_plug, .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2672,9 +2569,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - /* removable device support */ .bdrv_is_inserted = cdrom_is_inserted, .bdrv_eject = cdrom_eject, diff --git a/block/raw-win32.c b/block/raw-win32.c index fd2389153..56f45fea9 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -27,7 +27,7 @@ #include "qemu/timer.h" #include "block/block_int.h" #include "qemu/module.h" -#include "raw-aio.h" +#include "block/raw-aio.h" #include "trace.h" #include "block/thread-pool.h" #include "qemu/iov.h" @@ -142,7 +142,7 @@ static int aio_worker(void *arg) } static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + int64_t offset, QEMUIOVector *qiov, int count, BlockCompletionFunc *cb, void *opaque, int type) { RawWin32AIOData *acb = g_new(RawWin32AIOData, 1); @@ -155,11 +155,12 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; + assert(qiov->size == count); } - acb->aio_nbytes = nb_sectors * 512; - acb->aio_offset = sector_num * 512; + acb->aio_nbytes = count; + acb->aio_offset = offset; - trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); + trace_paio_submit(acb, opaque, offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } @@ -222,7 +223,7 @@ static void raw_attach_aio_context(BlockDriverState *bs, } } -static void raw_probe_alignment(BlockDriverState *bs) +static void raw_probe_alignment(BlockDriverState *bs, Error **errp) { BDRVRawState *s = bs->opaque; DWORD sectorsPerCluster, freeClusters, totalClusters, count; @@ -230,14 +231,14 @@ static void raw_probe_alignment(BlockDriverState *bs) BOOL status; if (s->type == FTYPE_CD) { - bs->request_alignment = 2048; + bs->bl.request_alignment = 2048; return; } if (s->type == FTYPE_HARDDISK) { status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, NULL, 0, &dg, sizeof(dg), &count, NULL); if (status != 0) { - bs->request_alignment = dg.Geometry.BytesPerSector; + bs->bl.request_alignment = dg.Geometry.BytesPerSector; return; } /* try GetDiskFreeSpace too */ @@ -247,7 +248,7 @@ static void raw_probe_alignment(BlockDriverState *bs) GetDiskFreeSpace(s->drive_path, &sectorsPerCluster, &dg.Geometry.BytesPerSector, &freeClusters, &totalClusters); - bs->request_alignment = dg.Geometry.BytesPerSector; + bs->bl.request_alignment = dg.Geometry.BytesPerSector; } } @@ -365,7 +366,6 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
win32_aio_attach_aio_context(s->aio, bdrv_get_aio_context(bs)); } - raw_probe_alignment(bs); ret = 0; fail: qemu_opts_del(opts); @@ -379,9 +379,10 @@ static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, BDRVRawState *s = bs->opaque; if (s->aio) { return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, opaque, QEMU_AIO_READ); + nb_sectors, cb, opaque, QEMU_AIO_READ); } else { - return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, + return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, QEMU_AIO_READ); } } @@ -393,9 +394,10 @@ static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, BDRVRawState *s = bs->opaque; if (s->aio) { return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, opaque, QEMU_AIO_WRITE); + nb_sectors, cb, opaque, QEMU_AIO_WRITE); } else { - return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, + return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, QEMU_AIO_WRITE); } } @@ -550,6 +552,7 @@ BlockDriver bdrv_file = { .bdrv_needs_filename = true, .bdrv_parse_filename = raw_parse_filename, .bdrv_file_open = raw_open, + .bdrv_refresh_limits = raw_probe_alignment, .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, diff --git a/block/raw_bsd.c b/block/raw_bsd.c index a6cc7e991..588d4080f 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -1,6 +1,6 @@ /* BlockDriver implementation for "raw" * - * Copyright (C) 2010, 2013, Red Hat, Inc. + * Copyright (C) 2010-2016 Red Hat, Inc. * Copyright (C) 2010, Blue Swirl <blauwirbel@gmail.com> * Copyright (C) 2009, Anthony Liguori <aliguori@us.ibm.com> * @@ -50,33 +50,30 @@ static int raw_reopen_prepare(BDRVReopenState *reopen_state, return 0; } -static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - return bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov); + return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } -static int coroutine_fn -raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov, int flags) +static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { void *buf = NULL; BlockDriver *drv; QEMUIOVector local_qiov; int ret; - if (bs->probed && sector_num == 0) { - /* As long as these conditions are true, we can't get partial writes to - * the probe buffer and can just directly check the request. */ + if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) { + /* Handling partial writes would be a pain - so we just + * require that guests have 512-byte request alignment if + * probing occurred */ QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512); QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512); - - if (nb_sectors == 0) { - /* qemu_iovec_to_buf() would fail, but we want to return success - * instead of -EINVAL in this case. 
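The assert that follows relies on the 512-byte request alignment installed in raw_refresh_limits() further down: once all requests are sector-aligned, any write overlapping the probe window necessarily starts at offset 0 and covers at least the full probe buffer. A small sanity sketch of that invariant (hypothetical helper, not driver code):

    #include <assert.h>
    #include <stdint.h>

    /* With 512-byte aligned requests, overlapping [0, 512) implies the
     * write starts at 0 and spans at least the whole probe buffer. */
    static void check_probe_overlap(uint64_t offset, uint64_t bytes)
    {
        assert(offset % 512 == 0 && bytes % 512 == 0);
        if (offset < 512 && bytes > 0) {
            assert(offset == 0 && bytes >= 512);
        }
    }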
*/ - return 0; - } + assert(offset == 0 && bytes >= BLOCK_PROBE_BUF_SIZE); buf = qemu_try_blockalign(bs->file->bs, 512); if (!buf) { @@ -105,8 +102,7 @@ raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, qiov, flags); + ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); fail: if (qiov == &local_qiov) { @@ -116,13 +112,6 @@ fail: return ret; } -static int coroutine_fn -raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) -{ - return raw_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); -} - static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, @@ -134,17 +123,17 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, (sector_num << BDRV_SECTOR_BITS); } -static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int count, + BdrvRequestFlags flags) { - return bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags); + return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags); } -static int coroutine_fn raw_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, + int64_t offset, int count) { - return bdrv_co_discard(bs->file->bs, sector_num, nb_sectors); + return bdrv_co_pdiscard(bs->file->bs, offset, count); } static int64_t raw_getlength(BlockDriverState *bs) @@ -159,7 +148,12 @@ static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { - bs->bl = bs->file->bs->bl; + if (bs->probed) { + /* To make it easier to protect the first sector, any probed + * image is restricted to read-modify-write on sub-sector + * operations. 
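Restricting probed images to BDRV_SECTOR_SIZE alignment (set just below) means the generic block layer widens sub-sector guest requests to sector boundaries and serves them read-modify-write. The rounding involved, as a minimal hypothetical sketch:

    #include <stdint.h>

    #define RA 512u   /* request_alignment, hypothetical fixed value */

    /* Round a byte range out to aligned head/tail, the shape the block
     * layer uses for read-modify-write of unaligned requests. */
    static void widen_to_alignment(uint64_t offset, uint64_t bytes,
                                   uint64_t *head, uint64_t *tail)
    {
        *head = offset - (offset % RA);                 /* round down */
        *tail = ((offset + bytes + RA - 1) / RA) * RA;  /* round up   */
    }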
*/ + bs->bl.request_alignment = BDRV_SECTOR_SIZE; + } } static int raw_truncate(BlockDriverState *bs, int64_t offset) @@ -197,20 +191,17 @@ static int raw_has_zero_init(BlockDriverState *bs) static int raw_create(const char *filename, QemuOpts *opts, Error **errp) { - Error *local_err = NULL; - int ret; - - ret = bdrv_create_file(filename, opts, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - return ret; + return bdrv_create_file(filename, opts, errp); } static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { bs->sg = bs->file->bs->sg; + bs->supported_write_flags = BDRV_REQ_FUA & + bs->file->bs->supported_write_flags; + bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags; if (bs->probed && !bdrv_is_read_only(bs)) { fprintf(stderr, @@ -255,12 +246,10 @@ BlockDriver bdrv_raw = { .bdrv_open = &raw_open, .bdrv_close = &raw_close, .bdrv_create = &raw_create, - .bdrv_co_readv = &raw_co_readv, - .bdrv_co_writev = &raw_co_writev, - .bdrv_co_writev_flags = &raw_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, - .bdrv_co_write_zeroes = &raw_co_write_zeroes, - .bdrv_co_discard = &raw_co_discard, + .bdrv_co_preadv = &raw_co_preadv, + .bdrv_co_pwritev = &raw_co_pwritev, + .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, + .bdrv_co_pdiscard = &raw_co_pdiscard, .bdrv_co_get_block_status = &raw_co_get_block_status, .bdrv_truncate = &raw_truncate, .bdrv_getlength = &raw_getlength, diff --git a/block/rbd.c b/block/rbd.c index 5bc5b3253..0106fea45 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -290,7 +290,8 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf, if (only_read_conf_file) { ret = rados_conf_read_file(cluster, value); if (ret < 0) { - error_setg(errp, "error reading conf file %s", value); + error_setg_errno(errp, -ret, "error reading conf file %s", + value); break; } } @@ -299,7 +300,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf, } else if (!only_read_conf_file) { ret = rados_conf_set(cluster, name, value); if (ret < 0) { - error_setg(errp, "invalid conf option %s", name); + error_setg_errno(errp, -ret, "invalid conf option %s", name); ret = -EINVAL; break; } @@ -354,9 +355,10 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); - if (rados_create(&cluster, clientname) < 0) { - error_setg(errp, "error initializing"); - return -EIO; + ret = rados_create(&cluster, clientname); + if (ret < 0) { + error_setg_errno(errp, -ret, "error initializing"); + return ret; } if (strstr(conf, "conf=") == NULL) { @@ -381,21 +383,27 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) return -EIO; } - if (rados_connect(cluster) < 0) { - error_setg(errp, "error connecting"); + ret = rados_connect(cluster); + if (ret < 0) { + error_setg_errno(errp, -ret, "error connecting"); rados_shutdown(cluster); - return -EIO; + return ret; } - if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) { - error_setg(errp, "error opening pool %s", pool); + ret = rados_ioctx_create(cluster, pool, &io_ctx); + if (ret < 0) { + error_setg_errno(errp, -ret, "error opening pool %s", pool); rados_shutdown(cluster); - return -EIO; + return ret; } ret = rbd_create(io_ctx, name, bytes, &obj_order); rados_ioctx_destroy(io_ctx); rados_shutdown(cluster); + if (ret < 0) { + error_setg_errno(errp, -ret, "error rbd create"); + return ret; + } return ret; } @@ -500,7 +508,7 @@ static int 
qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, clientname = qemu_rbd_parse_clientname(conf, clientname_buf); r = rados_create(&s->cluster, clientname); if (r < 0) { - error_setg(errp, "error initializing"); + error_setg_errno(errp, -r, "error initializing"); goto failed_opts; } @@ -546,19 +554,19 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, r = rados_connect(s->cluster); if (r < 0) { - error_setg(errp, "error connecting"); + error_setg_errno(errp, -r, "error connecting"); goto failed_shutdown; } r = rados_ioctx_create(s->cluster, pool, &s->io_ctx); if (r < 0) { - error_setg(errp, "error opening pool %s", pool); + error_setg_errno(errp, -r, "error opening pool %s", pool); goto failed_shutdown; } r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); if (r < 0) { - error_setg(errp, "error reading header from %s", s->name); + error_setg_errno(errp, -r, "error reading header from %s", s->name); goto failed_open; } @@ -641,9 +649,9 @@ static int rbd_aio_flush_wrapper(rbd_image_t image, } static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, - int64_t sector_num, + int64_t off, QEMUIOVector *qiov, - int nb_sectors, + int64_t size, BlockCompletionFunc *cb, void *opaque, RBDAIOCmd cmd) @@ -651,7 +659,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, RBDAIOCB *acb; RADOSCB *rcb = NULL; rbd_completion_t c; - int64_t off, size; char *buf; int r; @@ -660,6 +667,7 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque); acb->cmd = cmd; acb->qiov = qiov; + assert(!qiov || qiov->size == size); if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { acb->bounce = NULL; } else { @@ -679,9 +687,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, buf = acb->bounce; - off = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - rcb = g_new(RADOSCB, 1); rcb->acb = acb; rcb->buf = buf; @@ -731,7 +736,8 @@ static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { - return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, RBD_AIO_READ); } @@ -742,7 +748,8 @@ static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { - return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov, + nb_sectors << BDRV_SECTOR_BITS, cb, opaque, RBD_AIO_WRITE); } @@ -875,10 +882,8 @@ static int qemu_rbd_snap_rollback(BlockDriverState *bs, const char *snapshot_name) { BDRVRBDState *s = bs->opaque; - int r; - r = rbd_snap_rollback(s->image, snapshot_name); - return r; + return rbd_snap_rollback(s->image, snapshot_name); } static int qemu_rbd_snap_list(BlockDriverState *bs, @@ -925,13 +930,13 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, } #ifdef LIBRBD_SUPPORTS_DISCARD -static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs, + int64_t offset, + int count, + BlockCompletionFunc *cb, + void *opaque) { - return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque, + return rbd_start_aio(bs, offset, NULL, count, cb, opaque, RBD_AIO_DISCARD); } #endif @@ -995,7 +1000,7 @@ static BlockDriver bdrv_rbd = { #endif #ifdef LIBRBD_SUPPORTS_DISCARD - .bdrv_aio_discard = 
qemu_rbd_aio_discard, + .bdrv_aio_pdiscard = qemu_rbd_aio_pdiscard, #endif .bdrv_snapshot_create = qemu_rbd_snap_create, diff --git a/block/sheepdog.c b/block/sheepdog.c index 33e0a3382..66e1cb2b2 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode) #undef DPRINTF #ifdef DEBUG_SDOG -#define DPRINTF(fmt, args...) \ - do { \ - fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ - } while (0) +#define DEBUG_SDOG_PRINT 1 #else -#define DPRINTF(fmt, args...) +#define DEBUG_SDOG_PRINT 0 #endif +#define DPRINTF(fmt, args...) \ + do { \ + if (DEBUG_SDOG_PRINT) { \ + fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \ + } \ + } while (0) typedef struct SheepdogAIOCB SheepdogAIOCB; @@ -492,7 +495,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) { - qemu_coroutine_enter(acb->coroutine, NULL); + qemu_coroutine_enter(acb->coroutine); qemu_aio_unref(acb); } @@ -633,7 +636,7 @@ static void restart_co_req(void *opaque) { Coroutine *co = opaque; - qemu_coroutine_enter(co, NULL); + qemu_coroutine_enter(co); } typedef struct SheepdogReqCo { @@ -723,8 +726,8 @@ static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, if (qemu_in_coroutine()) { do_co_req(&srco); } else { - co = qemu_coroutine_create(do_co_req); - qemu_coroutine_enter(co, &srco); + co = qemu_coroutine_create(do_co_req, &srco); + qemu_coroutine_enter(co); while (!srco.finished) { aio_poll(aio_context, true); } @@ -922,17 +925,17 @@ static void co_read_response(void *opaque) BDRVSheepdogState *s = opaque; if (!s->co_recv) { - s->co_recv = qemu_coroutine_create(aio_read_response); + s->co_recv = qemu_coroutine_create(aio_read_response, opaque); } - qemu_coroutine_enter(s->co_recv, opaque); + qemu_coroutine_enter(s->co_recv); } static void co_write_request(void *opaque) { BDRVSheepdogState *s = opaque; - qemu_coroutine_enter(s->co_send, NULL); + qemu_coroutine_enter(s->co_send); } /* @@ -1678,7 +1681,7 @@ static int sd_prealloc(const char *filename, Error **errp) if (ret < 0) { goto out; } - ret = blk_pwrite(blk, idx * buf_size, buf, buf_size); + ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0); if (ret < 0) { goto out; } @@ -2781,17 +2784,24 @@ static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, return ret; } -static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, - int64_t pos, int size) +static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) { BDRVSheepdogState *s = bs->opaque; + void *buf; + int ret; - return do_load_save_vmstate(s, data, pos, size, 1); + buf = qemu_blockalign(bs, qiov->size); + ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1); + qemu_iovec_from_buf(qiov, 0, buf, qiov->size); + qemu_vfree(buf); + + return ret; } -static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset, + int count) { SheepdogAIOCB *acb; BDRVSheepdogState *s = bs->opaque; @@ -2801,7 +2811,7 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, uint32_t zero = 0; if (!s->discard_supported) { - return 0; + return 0; } memset(&discard_iov, 0, sizeof(discard_iov)); @@ -2810,7 +2820,10 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, iov.iov_len = sizeof(zero); discard_iov.iov = &iov; discard_iov.niov = 1; - 
acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors); + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((count & (BDRV_SECTOR_SIZE - 1)) == 0); + acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS, + count >> BDRV_SECTOR_BITS); acb->aiocb_type = AIOCB_DISCARD_OBJ; acb->aio_done_func = sd_finish_aiocb; @@ -2944,7 +2957,7 @@ static BlockDriver bdrv_sheepdog = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, + .bdrv_co_pdiscard = sd_co_pdiscard, .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, @@ -2980,7 +2993,7 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, + .bdrv_co_pdiscard = sd_co_pdiscard, .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, @@ -3016,7 +3029,7 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, + .bdrv_co_pdiscard = sd_co_pdiscard, .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, diff --git a/block/snapshot.c b/block/snapshot.c index e9d721df6..bf5c2ca5e 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -358,9 +358,7 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err); } - if (local_err) { - error_propagate(errp, local_err); - } + error_propagate(errp, local_err); return ret; } @@ -373,9 +371,10 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) { bool ok = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (ok && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -383,8 +382,12 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) ok = bdrv_can_snapshot(bs); } aio_context_release(ctx); + if (!ok) { + goto fail; + } } +fail: *first_bad_bs = bs; return ok; } @@ -393,10 +396,11 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, Error **err) { int ret = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; QEMUSnapshotInfo sn1, *snapshot = &sn1; - while (ret == 0 && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -405,8 +409,12 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err); } aio_context_release(ctx); + if (ret < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return ret; } @@ -415,9 +423,10 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs) { int err = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (err == 0 && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -425,8 +434,12 @@ int bdrv_all_goto_snapshot(const char *name, 
BlockDriverState **first_bad_bs) err = bdrv_snapshot_goto(bs, name); } aio_context_release(ctx); + if (err < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return err; } @@ -435,9 +448,10 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) { QEMUSnapshotInfo sn; int err = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (err == 0 && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -445,8 +459,12 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) err = bdrv_snapshot_find(bs, &sn, name); } aio_context_release(ctx); + if (err < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return err; } @@ -457,9 +475,10 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState **first_bad_bs) { int err = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (err == 0 && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); @@ -471,23 +490,32 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, err = bdrv_snapshot_create(bs, sn); } aio_context_release(ctx); + if (err < 0) { + goto fail; + } } +fail: *first_bad_bs = bs; return err; } BlockDriverState *bdrv_all_find_vmstate_bs(void) { - bool not_found = true; - BlockDriverState *bs = NULL; + BlockDriverState *bs; + BdrvNextIterator it; - while (not_found && (bs = bdrv_next(bs))) { + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *ctx = bdrv_get_aio_context(bs); + bool found; aio_context_acquire(ctx); - not_found = !bdrv_can_snapshot(bs); + found = bdrv_can_snapshot(bs); aio_context_release(ctx); + + if (found) { + break; + } } return bs; } diff --git a/block/ssh.c b/block/ssh.c index 06928ed93..5ce12b633 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -508,36 +508,73 @@ static int authenticate(BDRVSSHState *s, const char *user, Error **errp) return ret; } +static QemuOptsList ssh_runtime_opts = { + .name = "ssh", + .head = QTAILQ_HEAD_INITIALIZER(ssh_runtime_opts.head), + .desc = { + { + .name = "host", + .type = QEMU_OPT_STRING, + .help = "Host to connect to", + }, + { + .name = "port", + .type = QEMU_OPT_NUMBER, + .help = "Port to connect to", + }, + { + .name = "path", + .type = QEMU_OPT_STRING, + .help = "Path of the image on the host", + }, + { + .name = "user", + .type = QEMU_OPT_STRING, + .help = "User as which to connect", + }, + { + .name = "host_key_check", + .type = QEMU_OPT_STRING, + .help = "Defines how and what to check the host key against", + }, + }, +}; + static int connect_to_ssh(BDRVSSHState *s, QDict *options, int ssh_flags, int creat_mode, Error **errp) { int r, ret; + QemuOpts *opts = NULL; + Error *local_err = NULL; const char *host, *user, *path, *host_key_check; int port; - if (!qdict_haskey(options, "host")) { + opts = qemu_opts_create(&ssh_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { ret = -EINVAL; - error_setg(errp, "No hostname was specified"); + error_propagate(errp, local_err); goto err; } - host = qdict_get_str(options, "host"); - if (qdict_haskey(options, "port")) { - port = qdict_get_int(options, "port"); - } else { - port = 22; + host = qemu_opt_get(opts, "host"); + if (!host) { + ret = -EINVAL; + error_setg(errp, "No hostname was specified"); + goto err; } - if (!qdict_haskey(options, "path")) { 
+ port = qemu_opt_get_number(opts, "port", 22); + + path = qemu_opt_get(opts, "path"); + if (!path) { ret = -EINVAL; error_setg(errp, "No path was specified"); goto err; } - path = qdict_get_str(options, "path"); - if (qdict_haskey(options, "user")) { - user = qdict_get_str(options, "user"); - } else { + user = qemu_opt_get(opts, "user"); + if (!user) { user = g_get_user_name(); if (!user) { error_setg_errno(errp, errno, "Can't get user name"); @@ -546,9 +583,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, } } - if (qdict_haskey(options, "host_key_check")) { - host_key_check = qdict_get_str(options, "host_key_check"); - } else { + host_key_check = qemu_opt_get(opts, "host_key_check"); + if (!host_key_check) { host_key_check = "yes"; } @@ -612,21 +648,14 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, goto err; } + qemu_opts_del(opts); + r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs); if (r < 0) { sftp_error_setg(errp, s, "failed to read file attributes"); return -EINVAL; } - /* Delete the options we've used; any not deleted will cause the - * block layer to give an error about unused options. - */ - qdict_del(options, "host"); - qdict_del(options, "port"); - qdict_del(options, "user"); - qdict_del(options, "path"); - qdict_del(options, "host_key_check"); - return 0; err: @@ -646,6 +675,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, } s->session = NULL; + qemu_opts_del(opts); + return ret; } @@ -777,7 +808,7 @@ static void restart_coroutine(void *opaque) DPRINTF("co=%p", co); - qemu_coroutine_enter(co, NULL); + qemu_coroutine_enter(co); } static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs) diff --git a/block/stream.c b/block/stream.c index 332b9a183..31874817c 100644 --- a/block/stream.c +++ b/block/stream.c @@ -39,7 +39,7 @@ typedef struct StreamBlockJob { char *backing_file_str; } StreamBlockJob; -static int coroutine_fn stream_populate(BlockDriverState *bs, +static int coroutine_fn stream_populate(BlockBackend *blk, int64_t sector_num, int nb_sectors, void *buf) { @@ -52,7 +52,8 @@ static int coroutine_fn stream_populate(BlockDriverState *bs, qemu_iovec_init_external(&qiov, &iov, 1); /* Copy-on-read the unallocated clusters */ - return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov); + return blk_co_preadv(blk, sector_num * BDRV_SECTOR_SIZE, qiov.size, &qiov, + BDRV_REQ_COPY_ON_READ); } typedef struct { @@ -64,6 +65,7 @@ static void stream_complete(BlockJob *job, void *opaque) { StreamBlockJob *s = container_of(job, StreamBlockJob, common); StreamCompleteData *data = opaque; + BlockDriverState *bs = blk_bs(job->blk); BlockDriverState *base = s->base; if (!block_job_is_cancelled(&s->common) && data->reached_end && @@ -75,8 +77,8 @@ static void stream_complete(BlockJob *job, void *opaque) base_fmt = base->drv->format_name; } } - data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt); - bdrv_set_backing_hd(job->bs, base); + data->ret = bdrv_change_backing_file(bs, base_id, base_fmt); + bdrv_set_backing_hd(bs, base); } g_free(s->backing_file_str); @@ -88,10 +90,12 @@ static void coroutine_fn stream_run(void *opaque) { StreamBlockJob *s = opaque; StreamCompleteData *data; - BlockDriverState *bs = s->common.bs; + BlockBackend *blk = s->common.blk; + BlockDriverState *bs = blk_bs(blk); BlockDriverState *base = s->base; int64_t sector_num = 0; int64_t end = -1; + uint64_t delay_ns = 0; int error = 0; int ret = 0; int n = 0; @@ -120,10 +124,8 @@ static void coroutine_fn stream_run(void *opaque) } for 
(sector_num = 0; sector_num < end; sector_num += n) { - uint64_t delay_ns = 0; bool copy; -wait: /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ @@ -153,18 +155,11 @@ wait: } trace_stream_one_iteration(s, sector_num, n, ret); if (copy) { - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, n); - if (delay_ns > 0) { - goto wait; - } - } - ret = stream_populate(bs, sector_num, n, buf); + ret = stream_populate(blk, sector_num, n, buf); } if (ret < 0) { BlockErrorAction action = - block_job_error_action(&s->common, s->common.bs, s->on_error, - true, -ret); + block_job_error_action(&s->common, s->on_error, true, -ret); if (action == BLOCK_ERROR_ACTION_STOP) { n = 0; continue; @@ -180,6 +175,9 @@ wait: /* Publish progress */ s->common.offset += n * BDRV_SECTOR_SIZE; + if (copy && s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, n); + } } if (!base) { @@ -216,22 +214,15 @@ static const BlockJobDriver stream_job_driver = { .set_speed = stream_set_speed, }; -void stream_start(BlockDriverState *bs, BlockDriverState *base, - const char *backing_file_str, int64_t speed, - BlockdevOnError on_error, - BlockCompletionFunc *cb, - void *opaque, Error **errp) +void stream_start(const char *job_id, BlockDriverState *bs, + BlockDriverState *base, const char *backing_file_str, + int64_t speed, BlockdevOnError on_error, + BlockCompletionFunc *cb, void *opaque, Error **errp) { StreamBlockJob *s; - if ((on_error == BLOCKDEV_ON_ERROR_STOP || - on_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-error"); - return; - } - - s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp); + s = block_job_create(job_id, &stream_job_driver, bs, speed, + cb, opaque, errp); if (!s) { return; } @@ -240,7 +231,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, s->backing_file_str = g_strdup(backing_file_str); s->on_error = on_error; - s->common.co = qemu_coroutine_create(stream_run); + s->common.co = qemu_coroutine_create(stream_run, s); trace_stream_start(bs, base, s, s->common.co, opaque); - qemu_coroutine_enter(s->common.co, s); + qemu_coroutine_enter(s->common.co); } diff --git a/block/throttle-groups.c b/block/throttle-groups.c index 4920e0949..59545e287 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -23,13 +23,14 @@ */ #include "qemu/osdep.h" +#include "sysemu/block-backend.h" #include "block/throttle-groups.h" #include "qemu/queue.h" #include "qemu/thread.h" #include "sysemu/qtest.h" /* The ThrottleGroup structure (with its ThrottleState) is shared - * among different BlockDriverState and it's independent from + * among different BlockBackends and it's independent from * AioContext, so in order to use it from different threads it needs * its own locking. * @@ -39,26 +40,26 @@ * The whole ThrottleGroup structure is private and invisible to * outside users, that only use it through its ThrottleState. * - * In addition to the ThrottleGroup structure, BlockDriverState has + * In addition to the ThrottleGroup structure, BlockBackendPublic has * fields that need to be accessed by other members of the group and - * therefore also need to be protected by this lock. Once a BDS is - * registered in a group those fields can be accessed by other threads - * any time. + * therefore also need to be protected by this lock. 
Once a + * BlockBackend is registered in a group those fields can be accessed + * by other threads any time. * * Again, all this is handled internally and is mostly transparent to * the outside. The 'throttle_timers' field however has an additional * constraint because it may be temporarily invalid (see for example * bdrv_set_aio_context()). Therefore in this file a thread will - * access some other BDS's timers only after verifying that that BDS - * has throttled requests in the queue. + * access some other BlockBackend's timers only after verifying that + * that BlockBackend has throttled requests in the queue. */ typedef struct ThrottleGroup { char *name; /* This is constant during the lifetime of the group */ QemuMutex lock; /* This lock protects the following four fields */ ThrottleState ts; - QLIST_HEAD(, BlockDriverState) head; - BlockDriverState *tokens[2]; + QLIST_HEAD(, BlockBackendPublic) head; + BlockBackend *tokens[2]; bool any_timer_armed[2]; /* These two are protected by the global throttle_groups_lock */ @@ -132,93 +133,98 @@ void throttle_group_unref(ThrottleState *ts) qemu_mutex_unlock(&throttle_groups_lock); } -/* Get the name from a BlockDriverState's ThrottleGroup. The name (and - * the pointer) is guaranteed to remain constant during the lifetime - * of the group. +/* Get the name from a BlockBackend's ThrottleGroup. The name (and the pointer) + * is guaranteed to remain constant during the lifetime of the group. * - * @bs: a BlockDriverState that is member of a throttling group + * @blk: a BlockBackend that is member of a throttling group * @ret: the name of the group. */ -const char *throttle_group_get_name(BlockDriverState *bs) +const char *throttle_group_get_name(BlockBackend *blk) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); return tg->name; } -/* Return the next BlockDriverState in the round-robin sequence, - * simulating a circular list. +/* Return the next BlockBackend in the round-robin sequence, simulating a + * circular list. * * This assumes that tg->lock is held. * - * @bs: the current BlockDriverState - * @ret: the next BlockDriverState in the sequence + * @blk: the current BlockBackend + * @ret: the next BlockBackend in the sequence */ -static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs) +static BlockBackend *throttle_group_next_blk(BlockBackend *blk) { - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - BlockDriverState *next = QLIST_NEXT(bs, round_robin); + BlockBackendPublic *next = QLIST_NEXT(blkp, round_robin); if (!next) { - return QLIST_FIRST(&tg->head); + next = QLIST_FIRST(&tg->head); } - return next; + return blk_by_public(next); } -/* Return the next BlockDriverState in the round-robin sequence with - * pending I/O requests. +/* Return the next BlockBackend in the round-robin sequence with pending I/O + * requests. * * This assumes that tg->lock is held. * - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @is_write: the type of operation (read/write) - * @ret: the next BlockDriverState with pending requests, or bs - * if there is none. + * @ret: the next BlockBackend with pending requests, or blk if there is + * none. 
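next_throttle_token(), documented above, walks the group's circular member list starting just past the current token until it finds a member with pending requests, and falls back to the caller when the scan wraps with nothing queued. The same idea on a bare circular list, with hypothetical types:

    typedef struct Member {
        struct Member *next;   /* circular, singly linked */
        int pending;           /* queued requests of the given type */
    } Member;

    static Member *next_token(Member *current, Member *start)
    {
        Member *m = start->next;

        while (m != start && !m->pending) {
            m = m->next;
        }
        /* Wrapped around with nothing pending: favor the caller, whose
         * request is about to be queued anyway. */
        return (m == start && !m->pending) ? current : m;
    }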
*/ -static BlockDriverState *next_throttle_token(BlockDriverState *bs, - bool is_write) +static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - BlockDriverState *token, *start; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); + BlockBackend *token, *start; start = token = tg->tokens[is_write]; /* get next bs round in round robin style */ - token = throttle_group_next_bs(token); - while (token != start && !token->pending_reqs[is_write]) { - token = throttle_group_next_bs(token); + token = throttle_group_next_blk(token); + while (token != start && !blkp->pending_reqs[is_write]) { + token = throttle_group_next_blk(token); } /* If no IO are queued for scheduling on the next round robin token * then decide the token is the current bs because chances are * the current bs get the current request queued. */ - if (token == start && !token->pending_reqs[is_write]) { - token = bs; + if (token == start && !blkp->pending_reqs[is_write]) { + token = blk; } return token; } -/* Check if the next I/O request for a BlockDriverState needs to be - * throttled or not. If there's no timer set in this group, set one - * and update the token accordingly. +/* Check if the next I/O request for a BlockBackend needs to be throttled or + * not. If there's no timer set in this group, set one and update the token + * accordingly. * * This assumes that tg->lock is held. * - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @is_write: the type of operation (read/write) * @ret: whether the I/O request needs to be throttled or not */ -static bool throttle_group_schedule_timer(BlockDriverState *bs, - bool is_write) +static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write) { - ThrottleState *ts = bs->throttle_state; - ThrottleTimers *tt = &bs->throttle_timers; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; + ThrottleTimers *tt = &blkp->throttle_timers; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool must_wait; + if (blkp->io_limits_disabled) { + return false; + } + /* Check if any of the timers in this group is already armed */ if (tg->any_timer_armed[is_write]) { return true; @@ -226,9 +232,9 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs, must_wait = throttle_schedule_timer(ts, tt, is_write); - /* If a timer just got armed, set bs as the current token */ + /* If a timer just got armed, set blk as the current token */ if (must_wait) { - tg->tokens[is_write] = bs; + tg->tokens[is_write] = blk; tg->any_timer_armed[is_write] = true; } @@ -239,18 +245,19 @@ static bool throttle_group_schedule_timer(BlockDriverState *bs, * * This assumes that tg->lock is held. 
* - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @is_write: the type of operation (read/write) */ -static void schedule_next_request(BlockDriverState *bs, bool is_write) +static void schedule_next_request(BlockBackend *blk, bool is_write) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); bool must_wait; - BlockDriverState *token; + BlockBackend *token; /* Check if there's any pending request to schedule next */ - token = next_throttle_token(bs, is_write); - if (!token->pending_reqs[is_write]) { + token = next_throttle_token(blk, is_write); + if (!blkp->pending_reqs[is_write]) { return; } @@ -259,12 +266,12 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write) /* If it doesn't have to wait, queue it for immediate execution */ if (!must_wait) { - /* Give preference to requests from the current bs */ + /* Give preference to requests from the current blk */ if (qemu_in_coroutine() && - qemu_co_queue_next(&bs->throttled_reqs[is_write])) { - token = bs; + qemu_co_queue_next(&blkp->throttled_reqs[is_write])) { + token = blk; } else { - ThrottleTimers *tt = &token->throttle_timers; + ThrottleTimers *tt = &blkp->throttle_timers; int64_t now = qemu_clock_get_ns(tt->clock_type); timer_mod(tt->timers[is_write], now + 1); tg->any_timer_armed[is_write] = true; @@ -277,53 +284,67 @@ static void schedule_next_request(BlockDriverState *bs, bool is_write) * if necessary, and schedule the next request using a round robin * algorithm. * - * @bs: the current BlockDriverState + * @blk: the current BlockBackend * @bytes: the number of bytes for this I/O * @is_write: the type of operation (read/write) */ -void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, +void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, unsigned int bytes, bool is_write) { bool must_wait; - BlockDriverState *token; + BlockBackend *token; - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); qemu_mutex_lock(&tg->lock); /* First we check if this I/O has to be throttled. 
*/ - token = next_throttle_token(bs, is_write); + token = next_throttle_token(blk, is_write); must_wait = throttle_group_schedule_timer(token, is_write); /* Wait if there's a timer set or queued requests of this type */ - if (must_wait || bs->pending_reqs[is_write]) { - bs->pending_reqs[is_write]++; + if (must_wait || blkp->pending_reqs[is_write]) { + blkp->pending_reqs[is_write]++; qemu_mutex_unlock(&tg->lock); - qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + qemu_co_queue_wait(&blkp->throttled_reqs[is_write]); qemu_mutex_lock(&tg->lock); - bs->pending_reqs[is_write]--; + blkp->pending_reqs[is_write]--; } /* The I/O will be executed, so do the accounting */ - throttle_account(bs->throttle_state, is_write, bytes); + throttle_account(blkp->throttle_state, is_write, bytes); /* Schedule the next request */ - schedule_next_request(bs, is_write); + schedule_next_request(blk, is_write); qemu_mutex_unlock(&tg->lock); } +void throttle_group_restart_blk(BlockBackend *blk) +{ + BlockBackendPublic *blkp = blk_get_public(blk); + int i; + + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&blkp->throttled_reqs[i])) { + ; + } + } +} + /* Update the throttle configuration for a particular group. Similar * to throttle_config(), but guarantees atomicity within the * throttling group. * - * @bs: a BlockDriverState that is member of the group + * @blk: a BlockBackend that is a member of the group * @cfg: the configuration to set */ -void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) +void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg) { - ThrottleTimers *tt = &bs->throttle_timers; - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleTimers *tt = &blkp->throttle_timers; + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); qemu_mutex_lock(&tg->lock); /* throttle_config() cancels the timers */ @@ -335,18 +356,22 @@ void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) } throttle_config(ts, tt, cfg); qemu_mutex_unlock(&tg->lock); + + qemu_co_enter_next(&blkp->throttled_reqs[0]); + qemu_co_enter_next(&blkp->throttled_reqs[1]); } /* Get the throttle configuration from a particular group. Similar to * throttle_get_config(), but guarantees atomicity within the * throttling group. * - * @bs: a BlockDriverState that is member of the group + * @blk: a BlockBackend that is a member of the group * @cfg: the configuration will be written here */ -void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) +void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg) { - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); qemu_mutex_lock(&tg->lock); throttle_get_config(ts, cfg); @@ -356,12 +381,13 @@ void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) /* ThrottleTimers callback. This wakes up a request that was waiting * because it had been throttled. 
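The new throttle_group_restart_blk() above drains both throttled-request queues by re-entering queued coroutines until none remain, and throttle_group_config() now wakes one waiter per queue so that pending requests re-evaluate the updated limits. A toy model of the drain loop, with a plain counter standing in for QEMU's CoQueue:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for a coroutine queue: enter_next() resumes one waiter
 * and reports whether there was one. Not QEMU's CoQueue. */
typedef struct {
    int waiters;
} CoQueueModel;

static bool enter_next(CoQueueModel *q)
{
    if (q->waiters == 0) {
        return false;
    }
    q->waiters--;   /* one queued request wakes up and re-checks limits */
    return true;
}

int main(void)
{
    CoQueueModel throttled_reqs[2] = { { 3 }, { 1 } }; /* [read, write] */

    for (int i = 0; i < 2; i++) {
        while (enter_next(&throttled_reqs[i])) {
            ;
        }
    }
    printf("remaining: %d reads, %d writes\n",
           throttled_reqs[0].waiters, throttled_reqs[1].waiters);
    return 0;
}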
* - * @bs: the BlockDriverState whose request had been throttled + * @blk: the BlockBackend whose request had been throttled * @is_write: the type of operation (read/write) */ -static void timer_cb(BlockDriverState *bs, bool is_write) +static void timer_cb(BlockBackend *blk, bool is_write) { - ThrottleState *ts = bs->throttle_state; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool empty_queue; @@ -371,13 +397,13 @@ static void timer_cb(BlockDriverState *bs, bool is_write) qemu_mutex_unlock(&tg->lock); /* Run the request that was waiting for this timer */ - empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]); + empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]); /* If the request queue was empty then we have to take care of * scheduling the next one */ if (empty_queue) { qemu_mutex_lock(&tg->lock); - schedule_next_request(bs, is_write); + schedule_next_request(blk, is_write); qemu_mutex_unlock(&tg->lock); } } @@ -392,17 +418,17 @@ static void write_timer_cb(void *opaque) timer_cb(opaque, true); } -/* Register a BlockDriverState in the throttling group, also - * initializing its timers and updating its throttle_state pointer to - * point to it. If a throttling group with that name does not exist - * yet, it will be created. +/* Register a BlockBackend in the throttling group, also initializing its + * timers and updating its throttle_state pointer to point to it. If a + * throttling group with that name does not exist yet, it will be created. * - * @bs: the BlockDriverState to insert + * @blk: the BlockBackend to insert * @groupname: the name of the group */ -void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) +void throttle_group_register_blk(BlockBackend *blk, const char *groupname) { int i; + BlockBackendPublic *blkp = blk_get_public(blk); ThrottleState *ts = throttle_group_incref(groupname); ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); int clock_type = QEMU_CLOCK_REALTIME; @@ -412,67 +438,67 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) clock_type = QEMU_CLOCK_VIRTUAL; } - bs->throttle_state = ts; + blkp->throttle_state = ts; qemu_mutex_lock(&tg->lock); - /* If the ThrottleGroup is new set this BlockDriverState as the token */ + /* If the ThrottleGroup is new set this BlockBackend as the token */ for (i = 0; i < 2; i++) { if (!tg->tokens[i]) { - tg->tokens[i] = bs; + tg->tokens[i] = blk; } } - QLIST_INSERT_HEAD(&tg->head, bs, round_robin); + QLIST_INSERT_HEAD(&tg->head, blkp, round_robin); - throttle_timers_init(&bs->throttle_timers, - bdrv_get_aio_context(bs), + throttle_timers_init(&blkp->throttle_timers, + blk_get_aio_context(blk), clock_type, read_timer_cb, write_timer_cb, - bs); + blk); qemu_mutex_unlock(&tg->lock); } -/* Unregister a BlockDriverState from its group, removing it from the - * list, destroying the timers and setting the throttle_state pointer - * to NULL. +/* Unregister a BlockBackend from its group, removing it from the list, + * destroying the timers and setting the throttle_state pointer to NULL. * - * The BlockDriverState must not have pending throttled requests, so - * the caller has to drain them first. + * The BlockBackend must not have pending throttled requests, so the caller has + * to drain them first. * * The group will be destroyed if it's empty after this operation. 
* - * @bs: the BlockDriverState to remove + * @blk: the BlockBackend to remove */ -void throttle_group_unregister_bs(BlockDriverState *bs) +void throttle_group_unregister_blk(BlockBackend *blk) { - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); int i; - assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0); - assert(qemu_co_queue_empty(&bs->throttled_reqs[0])); - assert(qemu_co_queue_empty(&bs->throttled_reqs[1])); + assert(blkp->pending_reqs[0] == 0 && blkp->pending_reqs[1] == 0); + assert(qemu_co_queue_empty(&blkp->throttled_reqs[0])); + assert(qemu_co_queue_empty(&blkp->throttled_reqs[1])); qemu_mutex_lock(&tg->lock); for (i = 0; i < 2; i++) { - if (tg->tokens[i] == bs) { - BlockDriverState *token = throttle_group_next_bs(bs); - /* Take care of the case where this is the last bs in the group */ - if (token == bs) { + if (tg->tokens[i] == blk) { + BlockBackend *token = throttle_group_next_blk(blk); + /* Take care of the case where this is the last blk in the group */ + if (token == blk) { token = NULL; } tg->tokens[i] = token; } } - /* remove the current bs from the list */ - QLIST_REMOVE(bs, round_robin); - throttle_timers_destroy(&bs->throttle_timers); + /* remove the current blk from the list */ + QLIST_REMOVE(blkp, round_robin); + throttle_timers_destroy(&blkp->throttle_timers); qemu_mutex_unlock(&tg->lock); throttle_group_unref(&tg->ts); - bs->throttle_state = NULL; + blkp->throttle_state = NULL; } static void throttle_groups_init(void) diff --git a/block/trace-events b/block/trace-events new file mode 100644 index 000000000..05fa13c89 --- /dev/null +++ b/block/trace-events @@ -0,0 +1,116 @@ +# See docs/tracing.txt for syntax documentation. 
+
+# block.c
+bdrv_open_common(void *bs, const char *filename, int flags, const char *format_name) "bs %p filename \"%s\" flags %#x format_name \"%s\""
+bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
+
+# block/block-backend.c
+blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
+blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
+
+# block/io.c
+bdrv_aio_pdiscard(void *bs, int64_t offset, int count, void *opaque) "bs %p offset %"PRId64" count %d opaque %p"
+bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
+bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x"
+bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %u"
+
+# block/stream.c
+stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
+stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p"
+
+# block/commit.c
+commit_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
+commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) "bs %p base %p top %p s %p co %p opaque %p"
+
+# block/mirror.c
+mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
+mirror_restart_iter(void *s, int64_t cnt) "s %p dirty count %"PRId64
+mirror_before_flush(void *s) "s %p"
+mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
+mirror_before_sleep(void *s, int64_t cnt, int synced, uint64_t delay_ns) "s %p dirty count %"PRId64" synced %d delay %"PRIu64"ns"
+mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
+mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors, int ret) "s %p sector_num %"PRId64" nb_sectors %d ret %d"
+mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d"
+mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d"
+mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
+mirror_break_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
+
+# block/backup.c
+backup_do_cow_enter(void *job, int64_t start, int64_t sector_num, int nb_sectors) "job %p start %"PRId64" sector_num %"PRId64" nb_sectors %d"
+backup_do_cow_return(void *job, int64_t sector_num, int nb_sectors, int ret) "job %p sector_num %"PRId64" nb_sectors %d ret %d"
+backup_do_cow_skip(void *job, int64_t start) "job %p start %"PRId64
+backup_do_cow_process(void *job, int64_t start) "job %p start %"PRId64
+backup_do_cow_read_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d"
+backup_do_cow_write_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d"
+
+# blockdev.c
+qmp_block_job_cancel(void *job) "job %p"
+qmp_block_job_pause(void *job) "job %p"
+qmp_block_job_resume(void *job) "job %p"
+qmp_block_job_complete(void *job) "job %p"
+block_job_cb(void *bs, void *job, int ret) "bs %p job %p ret %d"
+qmp_block_stream(void *bs, void *job) "bs %p job %p"
+
+# block/raw-win32.c
+# block/raw-posix.c
+paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d"
+paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
+
+# block/qcow2.c
+qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
+qcow2_writev_done_req(void *co, int ret) "co %p ret %d"
+qcow2_writev_start_part(void *co) "co %p"
+qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d"
+qcow2_writev_data(void *co, uint64_t offset) "co %p offset %" PRIx64
+qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
+qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset %" PRIx64 " count %d"
+
+# block/qcow2-cluster.c
+qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"
+qcow2_handle_copied(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
+qcow2_handle_alloc(void *co, uint64_t guest_offset, uint64_t host_offset, uint64_t bytes) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " bytes %" PRIx64
+qcow2_do_alloc_clusters_offset(void *co, uint64_t guest_offset, uint64_t host_offset, int nb_clusters) "co %p guest_offset %" PRIx64 " host_offset %" PRIx64 " nb_clusters %d"
+qcow2_cluster_alloc_phys(void *co) "co %p"
+qcow2_cluster_link_l2(void *co, int nb_clusters) "co %p nb_clusters %d"
+
+qcow2_l2_allocate(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_get_empty(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_write_l2(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_write_l1(void *bs, int l1_index) "bs %p l1_index %d"
+qcow2_l2_allocate_done(void *bs, int l1_index, int ret) "bs %p l1_index %d ret %d"
+
+# block/qcow2-cache.c
+qcow2_cache_get(void *co, int c, uint64_t offset, bool read_from_disk) "co %p is_l2_cache %d offset %" PRIx64 " read_from_disk %d"
+qcow2_cache_get_replace_entry(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+qcow2_cache_get_read(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+qcow2_cache_get_done(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+qcow2_cache_flush(void *co, int c) "co %p is_l2_cache %d"
+qcow2_cache_entry_flush(void *co, int c, int i) "co %p is_l2_cache %d index %d"
+
+# block/qed-l2-cache.c
+qed_alloc_l2_cache_entry(void *l2_cache, void *entry) "l2_cache %p entry %p"
+qed_unref_l2_cache_entry(void *entry, int ref) "entry %p ref %d"
+qed_find_l2_cache_entry(void *l2_cache, void *entry, uint64_t offset, int ref) "l2_cache %p entry %p offset %"PRIu64" ref %d"
+
+# block/qed-table.c
+qed_read_table(void *s, uint64_t offset, void *table) "s %p offset %"PRIu64" table %p"
+qed_read_table_cb(void *s, void *table, int ret) "s %p table %p ret %d"
+qed_write_table(void *s, uint64_t offset, void *table, unsigned int index, unsigned int n) "s %p offset 
%"PRIu64" table %p index %u n %u" +qed_write_table_cb(void *s, void *table, int flush, int ret) "s %p table %p flush %d ret %d" + +# block/qed.c +qed_need_check_timer_cb(void *s) "s %p" +qed_start_need_check_timer(void *s) "s %p" +qed_cancel_need_check_timer(void *s) "s %p" +qed_aio_complete(void *s, void *acb, int ret) "s %p acb %p ret %d" +qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void *opaque, int flags) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p flags %#x" +qed_aio_next_io(void *s, void *acb, int ret, uint64_t cur_pos) "s %p acb %p ret %d cur_pos %"PRIu64 +qed_aio_read_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" +qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" +qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 +qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 +qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" diff --git a/block/vdi.c b/block/vdi.c index 75d4819ed..8a1cf9792 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -54,6 +54,7 @@ #include "block/block_int.h" #include "sysemu/block-backend.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "migration/migration.h" #include "qemu/coroutine.h" #include "qemu/cutils.h" @@ -402,7 +403,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, logout("\n"); - ret = bdrv_read(bs->file->bs, 0, (uint8_t *)&header, 1); + ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1); if (ret < 0) { goto fail; } @@ -499,7 +500,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_read(bs->file->bs, s->bmap_sector, (uint8_t *)s->bmap, + ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size); if (ret < 0) { goto fail_free_bmap; @@ -557,98 +558,109 @@ static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; } -static int vdi_co_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors) +static int coroutine_fn +vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVdiState *s = bs->opaque; + QEMUIOVector local_qiov; uint32_t bmap_entry; uint32_t block_index; - uint32_t sector_in_block; - uint32_t n_sectors; + uint32_t offset_in_block; + uint32_t n_bytes; + uint64_t bytes_done = 0; int ret = 0; logout("\n"); - while (ret >= 0 && nb_sectors > 0) { - block_index = sector_num / s->block_sectors; - sector_in_block = sector_num % s->block_sectors; - n_sectors = s->block_sectors - sector_in_block; - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } + qemu_iovec_init(&local_qiov, qiov->niov); + + while (ret >= 0 && bytes > 0) { + block_index = offset / s->block_size; + offset_in_block = offset % s->block_size; + n_bytes = MIN(bytes, s->block_size - offset_in_block); - logout("will read %u sectors starting at sector %" PRIu64 "\n", - n_sectors, sector_num); + logout("will read %u bytes starting at offset %" PRIu64 "\n", + n_bytes, offset); /* prepare next AIO request */ bmap_entry = le32_to_cpu(s->bmap[block_index]); if (!VDI_IS_ALLOCATED(bmap_entry)) { /* Block not allocated, return zeros, 
no need to wait. */ - memset(buf, 0, n_sectors * SECTOR_SIZE); + qemu_iovec_memset(qiov, bytes_done, 0, n_bytes); ret = 0; } else { - uint64_t offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors + - sector_in_block; - ret = bdrv_read(bs->file->bs, offset, buf, n_sectors); + uint64_t data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + offset_in_block; + + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_preadv(bs->file, data_offset, n_bytes, + &local_qiov, 0); } - logout("%u sectors read\n", n_sectors); + logout("%u bytes read\n", n_bytes); - nb_sectors -= n_sectors; - sector_num += n_sectors; - buf += n_sectors * SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } + qemu_iovec_destroy(&local_qiov); + return ret; } -static int vdi_co_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVdiState *s = bs->opaque; + QEMUIOVector local_qiov; uint32_t bmap_entry; uint32_t block_index; - uint32_t sector_in_block; - uint32_t n_sectors; + uint32_t offset_in_block; + uint32_t n_bytes; uint32_t bmap_first = VDI_UNALLOCATED; uint32_t bmap_last = VDI_UNALLOCATED; uint8_t *block = NULL; + uint64_t bytes_done = 0; int ret = 0; logout("\n"); - while (ret >= 0 && nb_sectors > 0) { - block_index = sector_num / s->block_sectors; - sector_in_block = sector_num % s->block_sectors; - n_sectors = s->block_sectors - sector_in_block; - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } + qemu_iovec_init(&local_qiov, qiov->niov); + + while (ret >= 0 && bytes > 0) { + block_index = offset / s->block_size; + offset_in_block = offset % s->block_size; + n_bytes = MIN(bytes, s->block_size - offset_in_block); - logout("will write %u sectors starting at sector %" PRIu64 "\n", - n_sectors, sector_num); + logout("will write %u bytes starting at offset %" PRIu64 "\n", + n_bytes, offset); /* prepare next AIO request */ bmap_entry = le32_to_cpu(s->bmap[block_index]); if (!VDI_IS_ALLOCATED(bmap_entry)) { /* Allocate new block and write to it. */ - uint64_t offset; + uint64_t data_offset; bmap_entry = s->header.blocks_allocated; s->bmap[block_index] = cpu_to_le32(bmap_entry); s->header.blocks_allocated++; - offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors; + data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size; if (block == NULL) { block = g_malloc(s->block_size); bmap_first = block_index; } bmap_last = block_index; /* Copy data to be written to new block and zero unused parts. 
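The byte-based rewrite below pads a freshly allocated block the same way the sector-based code did: zero the bytes before the written range, copy the payload (via qemu_iovec_to_buf() rather than a memcpy() from a flat buffer), and zero the tail. A standalone model of the padding arithmetic, with a toy block size and memcpy() standing in for the iovec copy:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    enum { BLOCK_SIZE = 16 };                 /* illustrative, not 1 MiB */
    uint8_t block[BLOCK_SIZE];
    const uint8_t payload[3] = { 0xaa, 0xbb, 0xcc };
    uint32_t offset_in_block = 5;
    uint32_t n_bytes = sizeof(payload);

    /* zero the head, copy the guest data, zero the tail */
    memset(block, 0, offset_in_block);
    memcpy(block + offset_in_block, payload, n_bytes);
    memset(block + offset_in_block + n_bytes, 0,
           BLOCK_SIZE - n_bytes - offset_in_block);

    for (int i = 0; i < BLOCK_SIZE; i++) {
        printf("%02x ", block[i]);
    }
    printf("\n");
    return 0;
}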
*/ - memset(block, 0, sector_in_block * SECTOR_SIZE); - memcpy(block + sector_in_block * SECTOR_SIZE, - buf, n_sectors * SECTOR_SIZE); - memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0, - (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE); + memset(block, 0, offset_in_block); + qemu_iovec_to_buf(qiov, bytes_done, block + offset_in_block, + n_bytes); + memset(block + offset_in_block + n_bytes, 0, + s->block_size - n_bytes - offset_in_block); /* Note that this coroutine does not yield anywhere from reading the * bmap entry until here, so in regards to all the coroutines trying @@ -658,12 +670,12 @@ static int vdi_co_write(BlockDriverState *bs, * acquire the lock and thus the padded cluster is written before * the other coroutines can write to the affected area. */ qemu_co_mutex_lock(&s->write_lock); - ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors); + ret = bdrv_pwrite(bs->file, data_offset, block, s->block_size); qemu_co_mutex_unlock(&s->write_lock); } else { - uint64_t offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors + - sector_in_block; + uint64_t data_offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + offset_in_block; qemu_co_mutex_lock(&s->write_lock); /* This lock is only used to make sure the following write operation * is executed after the write issued by the coroutine allocating @@ -674,16 +686,23 @@ static int vdi_co_write(BlockDriverState *bs, * that that write operation has returned (there may be other writes * in flight, but they do not concern this very operation). */ qemu_co_mutex_unlock(&s->write_lock); - ret = bdrv_write(bs->file->bs, offset, buf, n_sectors); + + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_pwritev(bs->file, data_offset, n_bytes, + &local_qiov, 0); } - nb_sectors -= n_sectors; - sector_num += n_sectors; - buf += n_sectors * SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; - logout("%u sectors written\n", n_sectors); + logout("%u bytes written\n", n_bytes); } + qemu_iovec_destroy(&local_qiov); + logout("finished data write\n"); if (ret < 0) { return ret; @@ -694,12 +713,13 @@ static int vdi_co_write(BlockDriverState *bs, VdiHeader *header = (VdiHeader *) block; uint8_t *base; uint64_t offset; + uint32_t n_sectors; logout("now writing modified header\n"); assert(VDI_IS_ALLOCATED(bmap_first)); *header = s->header; vdi_header_to_le(header); - ret = bdrv_write(bs->file->bs, 0, block, 1); + ret = bdrv_write(bs->file, 0, block, 1); g_free(block); block = NULL; @@ -717,7 +737,7 @@ static int vdi_co_write(BlockDriverState *bs, base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE; logout("will write %u block map sectors starting from entry %u\n", n_sectors, bmap_first); - ret = bdrv_write(bs->file->bs, offset, base, n_sectors); + ret = bdrv_write(bs->file, offset, base, n_sectors); } return ret; @@ -808,7 +828,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) vdi_header_print(&header); #endif vdi_header_to_le(&header); - ret = blk_pwrite(blk, offset, &header, sizeof(header)); + ret = blk_pwrite(blk, offset, &header, sizeof(header), 0); if (ret < 0) { error_setg(errp, "Error writing header to %s", filename); goto exit; @@ -829,7 +849,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) bmap[i] = VDI_UNALLOCATED; } } - ret = blk_pwrite(blk, offset, bmap, bmap_size); + ret = blk_pwrite(blk, offset, bmap, 
bmap_size, 0); if (ret < 0) { error_setg(errp, "Error writing bmap to %s", filename); goto exit; @@ -903,9 +923,9 @@ static BlockDriver bdrv_vdi = { .bdrv_co_get_block_status = vdi_co_get_block_status, .bdrv_make_empty = vdi_make_empty, - .bdrv_read = vdi_co_read, + .bdrv_co_preadv = vdi_co_preadv, #if defined(CONFIG_VDI_WRITE) - .bdrv_write = vdi_co_write, + .bdrv_co_pwritev = vdi_co_pwritev, #endif .bdrv_get_info = vdi_get_info, diff --git a/block/vhdx-endian.c b/block/vhdx-endian.c index da33cd38e..c306b90d5 100644 --- a/block/vhdx-endian.c +++ b/block/vhdx-endian.c @@ -18,6 +18,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" +#include "qemu/bswap.h" #include "block/vhdx.h" #include <uuid/uuid.h> diff --git a/block/vhdx-log.c b/block/vhdx-log.c index 7ea7187fc..02eb10431 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -23,6 +23,7 @@ #include "block/block_int.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "block/vhdx.h" @@ -83,7 +84,7 @@ static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, offset = log->offset + read; - ret = bdrv_pread(bs->file->bs, offset, hdr, sizeof(VHDXLogEntryHeader)); + ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader)); if (ret < 0) { goto exit; } @@ -143,7 +144,7 @@ static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log, } offset = log->offset + read; - ret = bdrv_pread(bs->file->bs, offset, buffer, VHDX_LOG_SECTOR_SIZE); + ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; } @@ -193,7 +194,7 @@ static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, /* full */ break; } - ret = bdrv_pwrite(bs->file->bs, offset, buffer_tmp, + ret = bdrv_pwrite(bs->file, offset, buffer_tmp, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; @@ -465,7 +466,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, /* count is only > 1 if we are writing zeroes */ for (i = 0; i < count; i++) { - ret = bdrv_pwrite_sync(bs->file->bs, file_offset, buffer, + ret = bdrv_pwrite_sync(bs->file, file_offset, buffer, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; @@ -944,7 +945,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, if (i == 0 && leading_length) { /* partial sector at the front of the buffer */ - ret = bdrv_pread(bs->file->bs, file_offset, merged_sector, + ret = bdrv_pread(bs->file, file_offset, merged_sector, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; @@ -954,7 +955,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, sector_write = merged_sector; } else if (i == sectors - 1 && trailing_length) { /* partial sector at the end of the buffer */ - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, file_offset, merged_sector + trailing_length, VHDX_LOG_SECTOR_SIZE - trailing_length); diff --git a/block/vhdx.c b/block/vhdx.c index 2b7b33240..75ef2b1c2 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -22,11 +22,11 @@ #include "sysemu/block-backend.h" #include "qemu/module.h" #include "qemu/crc32c.h" +#include "qemu/bswap.h" #include "block/vhdx.h" #include "migration/migration.h" #include <uuid/uuid.h> -#include <glib.h> /* Options for VHDX creation */ @@ -298,9 +298,10 @@ static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename) * and then update the header checksum. 
Header is converted to proper * endianness before being written to the specified file offset */ -static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, +static int vhdx_write_header(BdrvChild *file, VHDXHeader *hdr, uint64_t offset, bool read) { + BlockDriverState *bs_file = file->bs; uint8_t *buffer = NULL; int ret; VHDXHeader *header_le; @@ -315,7 +316,7 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE); if (read) { /* if true, we can't assume the extra reserved bytes are 0 */ - ret = bdrv_pread(bs_file, offset, buffer, VHDX_HEADER_SIZE); + ret = bdrv_pread(file, offset, buffer, VHDX_HEADER_SIZE); if (ret < 0) { goto exit; } @@ -329,7 +330,7 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, vhdx_header_le_export(hdr, header_le); vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, offsetof(VHDXHeader, checksum)); - ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader)); + ret = bdrv_pwrite_sync(file, offset, header_le, sizeof(VHDXHeader)); exit: qemu_vfree(buffer); @@ -378,7 +379,7 @@ static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s, inactive_header->log_guid = *log_guid; } - ret = vhdx_write_header(bs->file->bs, inactive_header, header_offset, true); + ret = vhdx_write_header(bs->file, inactive_header, header_offset, true); if (ret < 0) { goto exit; } @@ -430,7 +431,7 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, /* We have to read the whole VHDX_HEADER_SIZE instead of * sizeof(VHDXHeader), because the checksum is over the whole * region */ - ret = bdrv_pread(bs->file->bs, VHDX_HEADER1_OFFSET, buffer, + ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE); if (ret < 0) { goto fail; @@ -447,7 +448,7 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, } } - ret = bdrv_pread(bs->file->bs, VHDX_HEADER2_OFFSET, buffer, + ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE); if (ret < 0) { goto fail; @@ -521,7 +522,7 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) * whole block */ buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE); - ret = bdrv_pread(bs->file->bs, VHDX_REGION_TABLE_OFFSET, buffer, + ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer, VHDX_HEADER_BLOCK_SIZE); if (ret < 0) { goto fail; @@ -634,7 +635,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE); - ret = bdrv_pread(bs->file->bs, s->metadata_rt.file_offset, buffer, + ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer, VHDX_METADATA_TABLE_MAX_SIZE); if (ret < 0) { goto exit; @@ -737,7 +738,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) goto exit; } - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, s->metadata_entries.file_parameters_entry.offset + s->metadata_rt.file_offset, &s->params, @@ -772,7 +773,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) /* determine virtual disk size, logical sector size, * and phys sector size */ - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, s->metadata_entries.virtual_disk_size_entry.offset + s->metadata_rt.file_offset, &s->virtual_disk_size, @@ -780,7 +781,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) if (ret < 0) { goto exit; } - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, 
s->metadata_entries.logical_sector_size_entry.offset + s->metadata_rt.file_offset, &s->logical_sector_size, @@ -788,7 +789,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) if (ret < 0) { goto exit; } - ret = bdrv_pread(bs->file->bs, + ret = bdrv_pread(bs->file, s->metadata_entries.phys_sector_size_entry.offset + s->metadata_rt.file_offset, &s->physical_sector_size, @@ -905,7 +906,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, QLIST_INIT(&s->regions); /* validate the file signature */ - ret = bdrv_pread(bs->file->bs, 0, &signature, sizeof(uint64_t)); + ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t)); if (ret < 0) { goto fail; } @@ -964,7 +965,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_pread(bs->file->bs, s->bat_offset, s->bat, s->bat_rt.length); + ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); if (ret < 0) { goto fail; } @@ -1117,7 +1118,7 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, break; case PAYLOAD_BLOCK_FULLY_PRESENT: qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file->bs, + ret = bdrv_co_readv(bs->file, sinfo.file_offset >> BDRV_SECTOR_BITS, sinfo.sectors_avail, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -1326,7 +1327,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, } /* block exists, so we can just overwrite it */ qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file->bs, + ret = bdrv_co_writev(bs->file, sinfo.file_offset >> BDRV_SECTOR_BITS, sectors_to_write, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -1387,9 +1388,11 @@ exit: * There are 2 headers, and the highest sequence number will represent * the active header */ -static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size, +static int vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size, uint32_t log_size) { + BlockDriverState *bs = blk_bs(blk); + BdrvChild *child; int ret = 0; VHDXHeader *hdr = NULL; @@ -1404,12 +1407,18 @@ static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size, vhdx_guid_generate(&hdr->file_write_guid); vhdx_guid_generate(&hdr->data_write_guid); - ret = vhdx_write_header(bs, hdr, VHDX_HEADER1_OFFSET, false); + /* XXX Ugly way to get blk->root, but that's a feature, not a bug. This + * hack makes it obvious that vhdx_write_header() bypasses the BlockBackend + * here, which it really shouldn't be doing. */ + child = QLIST_FIRST(&bs->parents); + assert(!QLIST_NEXT(child, next_parent)); + + ret = vhdx_write_header(child, hdr, VHDX_HEADER1_OFFSET, false); if (ret < 0) { goto exit; } hdr->sequence_number++; - ret = vhdx_write_header(bs, hdr, VHDX_HEADER2_OFFSET, false); + ret = vhdx_write_header(child, hdr, VHDX_HEADER2_OFFSET, false); if (ret < 0) { goto exit; } @@ -1442,7 +1451,7 @@ exit: * The first 64KB of the Metadata section is reserved for the metadata * header and entries; beyond that, the metadata items themselves reside. 
*/ -static int vhdx_create_new_metadata(BlockDriverState *bs, +static int vhdx_create_new_metadata(BlockBackend *blk, uint64_t image_size, uint32_t block_size, uint32_t sector_size, @@ -1538,13 +1547,13 @@ static int vhdx_create_new_metadata(BlockDriverState *bs, VHDX_META_FLAGS_IS_VIRTUAL_DISK; vhdx_metadata_entry_le_export(&md_table_entry[4]); - ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE); + ret = blk_pwrite(blk, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE, 0); if (ret < 0) { goto exit; } - ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer, - VHDX_METADATA_ENTRY_BUFFER_SIZE); + ret = blk_pwrite(blk, metadata_offset + (64 * KiB), entry_buffer, + VHDX_METADATA_ENTRY_BUFFER_SIZE, 0); if (ret < 0) { goto exit; } @@ -1564,7 +1573,7 @@ exit: * Fixed images: default state of the BAT is fully populated, with * file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT. */ -static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, +static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s, uint64_t image_size, VHDXImageType type, bool use_zero_blocks, uint64_t file_offset, uint32_t length) @@ -1588,12 +1597,12 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, if (type == VHDX_TYPE_DYNAMIC) { /* All zeroes, so we can just extend the file - the end of the BAT * is the furthest thing we have written yet */ - ret = bdrv_truncate(bs, data_file_offset); + ret = blk_truncate(blk, data_file_offset); if (ret < 0) { goto exit; } } else if (type == VHDX_TYPE_FIXED) { - ret = bdrv_truncate(bs, data_file_offset + image_size); + ret = blk_truncate(blk, data_file_offset + image_size); if (ret < 0) { goto exit; } @@ -1604,7 +1613,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, if (type == VHDX_TYPE_FIXED || use_zero_blocks || - bdrv_has_zero_init(bs) == 0) { + bdrv_has_zero_init(blk_bs(blk)) == 0) { /* for a fixed file, the default BAT entry is not zero */ s->bat = g_try_malloc0(length); if (length && s->bat == NULL) { @@ -1620,12 +1629,12 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, sinfo.file_offset = data_file_offset + (sector_num << s->logical_sector_size_bits); sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB); - vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused, + vhdx_update_bat_table_entry(blk_bs(blk), s, &sinfo, &unused, &unused, block_state); cpu_to_le64s(&s->bat[sinfo.bat_idx]); sector_num += s->sectors_per_block; } - ret = bdrv_pwrite(bs, file_offset, s->bat, length); + ret = blk_pwrite(blk, file_offset, s->bat, length, 0); if (ret < 0) { goto exit; } @@ -1645,7 +1654,7 @@ exit: * to create the BAT itself, we will also cause the BAT to be * created. 
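vhdx_create_bat() below now truncates through the BlockBackend (blk_truncate()) rather than the BDS, but the sizing rule is unchanged. Assuming the meaning of data_file_offset from the hunk, a sketch of how far the image file must be extended for each image type:

#include <stdint.h>
#include <stdio.h>

typedef enum { VHDX_TYPE_DYNAMIC, VHDX_TYPE_FIXED } VHDXImageType;

/* A dynamic image only needs the metadata area up to the start of data;
 * a fixed image is preallocated to hold every payload block. */
static uint64_t required_file_size(VHDXImageType type,
                                   uint64_t data_file_offset,
                                   uint64_t image_size)
{
    if (type == VHDX_TYPE_FIXED) {
        return data_file_offset + image_size;
    }
    return data_file_offset;
}

int main(void)
{
    uint64_t data_off = 4ULL << 20, image = 1ULL << 30;

    printf("dynamic: %llu bytes\n",
           (unsigned long long)required_file_size(VHDX_TYPE_DYNAMIC,
                                                  data_off, image));
    printf("fixed:   %llu bytes\n",
           (unsigned long long)required_file_size(VHDX_TYPE_FIXED,
                                                  data_off, image));
    return 0;
}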
*/ -static int vhdx_create_new_region_table(BlockDriverState *bs, +static int vhdx_create_new_region_table(BlockBackend *blk, uint64_t image_size, uint32_t block_size, uint32_t sector_size, @@ -1720,21 +1729,21 @@ static int vhdx_create_new_region_table(BlockDriverState *bs, /* The region table gives us the data we need to create the BAT, * so do that now */ - ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, + ret = vhdx_create_bat(blk, s, image_size, type, use_zero_blocks, bat_file_offset, bat_length); if (ret < 0) { goto exit; } /* Now write out the region headers to disk */ - ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer, - VHDX_HEADER_BLOCK_SIZE); + ret = blk_pwrite(blk, VHDX_REGION_TABLE_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE, 0); if (ret < 0) { goto exit; } - ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer, - VHDX_HEADER_BLOCK_SIZE); + ret = blk_pwrite(blk, VHDX_REGION_TABLE2_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE, 0); if (ret < 0) { goto exit; } @@ -1856,13 +1865,14 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, &creator_items, NULL); signature = cpu_to_le64(VHDX_FILE_SIGNATURE); - ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature), + 0); if (ret < 0) { goto delete_and_exit; } if (creator) { ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature), - creator, creator_items * sizeof(gunichar2)); + creator, creator_items * sizeof(gunichar2), 0); if (ret < 0) { goto delete_and_exit; } @@ -1870,13 +1880,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) /* Creates (B),(C) */ - ret = vhdx_create_new_headers(blk_bs(blk), image_size, log_size); + ret = vhdx_create_new_headers(blk, image_size, log_size); if (ret < 0) { goto delete_and_exit; } /* Creates (D),(E),(G) explicitly. 
(F) created as by-product */ - ret = vhdx_create_new_region_table(blk_bs(blk), image_size, block_size, 512, + ret = vhdx_create_new_region_table(blk, image_size, block_size, 512, log_size, use_zero_blocks, image_type, &metadata_offset); if (ret < 0) { @@ -1884,7 +1894,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) } /* Creates (H) */ - ret = vhdx_create_new_metadata(blk_bs(blk), image_size, block_size, 512, + ret = vhdx_create_new_metadata(blk, image_size, block_size, 512, metadata_offset, image_type); if (ret < 0) { goto delete_and_exit; diff --git a/block/vmdk.c b/block/vmdk.c index 45f9d3c5b..46d474e44 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -30,10 +30,10 @@ #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "migration/migration.h" #include "qemu/cutils.h" #include <zlib.h> -#include <glib.h> #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') @@ -252,7 +252,7 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) int ret; desc = g_malloc0(DESC_SIZE); - ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { g_free(desc); return 0; @@ -286,7 +286,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) desc = g_malloc0(DESC_SIZE); tmp_desc = g_malloc0(DESC_SIZE); - ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { goto out; } @@ -306,7 +306,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) pstrcat(desc, DESC_SIZE, tmp_desc); } - ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE); out: g_free(desc); @@ -350,7 +350,7 @@ static int vmdk_parent_open(BlockDriverState *bs) int ret; desc = g_malloc0(DESC_SIZE + 1); - ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { goto out; } @@ -454,7 +454,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, return -ENOMEM; } - ret = bdrv_pread(extent->file->bs, + ret = bdrv_pread(extent->file, extent->l1_table_offset, extent->l1_table, l1_size); @@ -474,7 +474,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, ret = -ENOMEM; goto fail_l1; } - ret = bdrv_pread(extent->file->bs, + ret = bdrv_pread(extent->file, extent->l1_backup_table_offset, extent->l1_backup_table, l1_size); @@ -508,7 +508,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs, VMDK3Header header; VmdkExtent *extent; - ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); + ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read header from file '%s'", @@ -538,14 +538,13 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs, static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, QDict *options, Error **errp); -static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, - Error **errp) +static char *vmdk_read_desc(BdrvChild *file, uint64_t desc_offset, Error **errp) { int64_t size; char *buf; int ret; - size = bdrv_getlength(file); + size = bdrv_getlength(file->bs); if (size < 0) { error_setg_errno(errp, -size, "Could not access file"); return 
NULL; @@ -586,7 +585,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, int64_t l1_backup_offset = 0; bool compressed; - ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); + ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read header from file '%s'", @@ -596,7 +595,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, if (header.capacity == 0) { uint64_t desc_offset = le64_to_cpu(header.desc_offset); if (desc_offset) { - char *buf = vmdk_read_desc(file->bs, desc_offset << 9, errp); + char *buf = vmdk_read_desc(file, desc_offset << 9, errp); if (!buf) { return -EINVAL; } @@ -636,7 +635,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, } QEMU_PACKED eos_marker; } QEMU_PACKED footer; - ret = bdrv_pread(file->bs, + ret = bdrv_pread(file, bs->file->bs->total_sectors * 512 - 1536, &footer, sizeof(footer)); if (ret < 0) { @@ -874,7 +873,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, extent->flat_start_offset = flat_offset << 9; } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ - char *buf = vmdk_read_desc(extent_file->bs, 0, errp); + char *buf = vmdk_read_desc(extent_file, 0, errp); if (!buf) { ret = -EINVAL; } else { @@ -943,7 +942,7 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, BDRVVmdkState *s = bs->opaque; uint32_t magic; - buf = vmdk_read_desc(bs->file->bs, 0, errp); + buf = vmdk_read_desc(bs->file, 0, errp); if (!buf) { return -EINVAL; } @@ -997,9 +996,9 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) for (i = 0; i < s->num_extents; i++) { if (!s->extents[i].flat) { - bs->bl.write_zeroes_alignment = - MAX(bs->bl.write_zeroes_alignment, - s->extents[i].cluster_sectors); + bs->bl.pwrite_zeroes_alignment = + MAX(bs->bl.pwrite_zeroes_alignment, + s->extents[i].cluster_sectors << BDRV_SECTOR_BITS); } } } @@ -1016,27 +1015,26 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) */ static int get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, - uint64_t cluster_sector_num, - uint64_t sector_num, - uint64_t skip_start_sector, - uint64_t skip_end_sector) + uint64_t cluster_offset, + uint64_t offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; uint8_t *whole_grain; /* For COW, align request sector_num to cluster start */ - sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors); cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; + offset = QEMU_ALIGN_DOWN(offset, cluster_bytes); whole_grain = qemu_blockalign(bs, cluster_bytes); if (!bs->backing) { - memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS); - memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, - cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); + memset(whole_grain, 0, skip_start_bytes); + memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes); } - assert(skip_end_sector <= extent->cluster_sectors); + assert(skip_end_bytes <= cluster_bytes); /* we will be here if it's first write on non-exist grain(cluster). 
* try to read from parent image, if exist */ if (bs->backing && !vmdk_is_cid_valid(bs)) { @@ -1045,42 +1043,43 @@ static int get_whole_cluster(BlockDriverState *bs, } /* Read backing data before skip range */ - if (skip_start_sector > 0) { + if (skip_start_bytes > 0) { if (bs->backing) { - ret = bdrv_read(bs->backing->bs, sector_num, - whole_grain, skip_start_sector); + ret = bdrv_pread(bs->backing, offset, whole_grain, + skip_start_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } - ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain, - skip_start_sector); + ret = bdrv_pwrite(extent->file, cluster_offset, whole_grain, + skip_start_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } /* Read backing data after skip range */ - if (skip_end_sector < extent->cluster_sectors) { + if (skip_end_bytes < cluster_bytes) { if (bs->backing) { - ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector, - whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), - extent->cluster_sectors - skip_end_sector); + ret = bdrv_pread(bs->backing, offset + skip_end_bytes, + whole_grain + skip_end_bytes, + cluster_bytes - skip_end_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } - ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector, - whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), - extent->cluster_sectors - skip_end_sector); + ret = bdrv_pwrite(extent->file, cluster_offset + skip_end_bytes, + whole_grain + skip_end_bytes, + cluster_bytes - skip_end_bytes); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } + ret = VMDK_OK; exit: qemu_vfree(whole_grain); return ret; @@ -1091,8 +1090,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, { offset = cpu_to_le32(offset); /* update L2 table */ - if (bdrv_pwrite_sync( - extent->file->bs, + if (bdrv_pwrite_sync(extent->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { @@ -1101,8 +1099,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; - if (bdrv_pwrite_sync( - extent->file->bs, + if (bdrv_pwrite_sync(extent->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { @@ -1142,8 +1139,8 @@ static int get_cluster_offset(BlockDriverState *bs, uint64_t offset, bool allocate, uint64_t *cluster_offset, - uint64_t skip_start_sector, - uint64_t skip_end_sector) + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1191,8 +1188,7 @@ static int get_cluster_offset(BlockDriverState *bs, } } l2_table = extent->l2_cache + (min_index * extent->l2_size); - if (bdrv_pread( - extent->file->bs, + if (bdrv_pread(extent->file, (int64_t)l2_offset * 512, l2_table, extent->l2_size * sizeof(uint32_t) @@ -1206,13 +1202,6 @@ static int get_cluster_offset(BlockDriverState *bs, l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; cluster_sector = le32_to_cpu(l2_table[l2_index]); - if (m_data) { - m_data->valid = 1; - m_data->l1_index = l1_index; - m_data->l2_index = l2_index; - m_data->l2_offset = l2_offset; - m_data->l2_cache_entry = &l2_table[l2_index]; - } if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { zeroed = true; } @@ -1230,13 +1219,18 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space 
on host disk * or inappropriate VM shutdown. */ - ret = get_whole_cluster(bs, extent, - cluster_sector, - offset >> BDRV_SECTOR_BITS, - skip_start_sector, skip_end_sector); + ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; } + if (m_data) { + m_data->valid = 1; + m_data->l1_index = l1_index; + m_data->l2_index = l2_index; + m_data->l2_offset = l2_offset; + m_data->l2_cache_entry = &l2_table[l2_index]; + } } *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; @@ -1259,15 +1253,24 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ + uint64_t extent_begin_offset, extent_relative_offset; + uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + + extent_begin_offset = + (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; + extent_relative_offset = offset - extent_begin_offset; + return extent_relative_offset % cluster_size; +} + static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { - uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num; - - extent_begin_sector = extent->end_sector - extent->sectors; - extent_relative_sector_num = sector_num - extent_begin_sector; - index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; - return index_in_cluster; + uint64_t offset; + offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE); + return offset / BDRV_SECTOR_SIZE; } static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, @@ -1319,38 +1322,57 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, } static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, - int64_t offset_in_cluster, const uint8_t *buf, - int nb_sectors, int64_t sector_num) + int64_t offset_in_cluster, QEMUIOVector *qiov, + uint64_t qiov_offset, uint64_t n_bytes, + uint64_t offset) { int ret; VmdkGrainMarker *data = NULL; uLongf buf_len; - const uint8_t *write_buf = buf; - int write_len = nb_sectors * 512; + QEMUIOVector local_qiov; + struct iovec iov; int64_t write_offset; int64_t write_end_sector; if (extent->compressed) { + void *compressed_data; + if (!extent->has_marker) { ret = -EINVAL; goto out; } buf_len = (extent->cluster_sectors << 9) * 2; data = g_malloc(buf_len + sizeof(VmdkGrainMarker)); - if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK || - buf_len == 0) { + + compressed_data = g_malloc(n_bytes); + qemu_iovec_to_buf(qiov, qiov_offset, compressed_data, n_bytes); + ret = compress(data->data, &buf_len, compressed_data, n_bytes); + g_free(compressed_data); + + if (ret != Z_OK || buf_len == 0) { ret = -EINVAL; goto out; } - data->lba = sector_num; + + data->lba = offset >> BDRV_SECTOR_BITS; data->size = buf_len; - write_buf = (uint8_t *)data; - write_len = buf_len + sizeof(VmdkGrainMarker); + + n_bytes = buf_len + sizeof(VmdkGrainMarker); + iov = (struct iovec) { + .iov_base = data, + .iov_len = n_bytes, + }; + qemu_iovec_init_external(&local_qiov, &iov, 1); + } else { + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes); } + write_offset = cluster_offset + offset_in_cluster, - ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len); + ret = bdrv_co_pwritev(extent->file, write_offset, n_bytes, + &local_qiov, 0); - write_end_sector = 
DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE); + write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE); if (extent->compressed) { extent->next_cluster_sector = write_end_sector; @@ -1359,19 +1381,21 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, write_end_sector); } - if (ret != write_len) { - ret = ret < 0 ? ret : -EIO; + if (ret < 0) { goto out; } ret = 0; out: g_free(data); + if (!extent->compressed) { + qemu_iovec_destroy(&local_qiov); + } return ret; } static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, - int64_t offset_in_cluster, uint8_t *buf, - int nb_sectors) + int64_t offset_in_cluster, QEMUIOVector *qiov, + int bytes) { int ret; int cluster_bytes, buf_bytes; @@ -1383,21 +1407,20 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, if (!extent->compressed) { - ret = bdrv_pread(extent->file->bs, - cluster_offset + offset_in_cluster, - buf, nb_sectors * 512); - if (ret == nb_sectors * 512) { - return 0; - } else { - return -EIO; + ret = bdrv_co_preadv(extent->file, + cluster_offset + offset_in_cluster, bytes, + qiov, 0); + if (ret < 0) { + return ret; } + return 0; } cluster_bytes = extent->cluster_sectors * 512; /* Read two clusters in case GrainMarker + compressed data > one cluster */ buf_bytes = cluster_bytes * 2; cluster_buf = g_malloc(buf_bytes); uncomp_buf = g_malloc(cluster_bytes); - ret = bdrv_pread(extent->file->bs, + ret = bdrv_pread(extent->file, cluster_offset, cluster_buf, buf_bytes); if (ret < 0) { @@ -1422,11 +1445,11 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, } if (offset_in_cluster < 0 || - offset_in_cluster + nb_sectors * 512 > buf_len) { + offset_in_cluster + bytes > buf_len) { ret = -EINVAL; goto out; } - memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512); + qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes); ret = 0; out: @@ -1435,64 +1458,73 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, return ret; } -static int vmdk_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVmdkState *s = bs->opaque; int ret; - uint64_t n, index_in_cluster; + uint64_t n_bytes, offset_in_cluster; VmdkExtent *extent = NULL; + QEMUIOVector local_qiov; uint64_t cluster_offset; + uint64_t bytes_done = 0; - while (nb_sectors > 0) { - extent = find_extent(s, sector_num, extent); + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_co_mutex_lock(&s->lock); + + while (bytes > 0) { + extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent); if (!extent) { - return -EIO; + ret = -EIO; + goto fail; } ret = get_cluster_offset(bs, extent, NULL, - sector_num << 9, false, &cluster_offset, - 0, 0); - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } + offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); + + n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { if (!vmdk_is_cid_valid(bs)) { - return -EINVAL; + ret = -EINVAL; + goto fail; } - ret = bdrv_read(bs->backing->bs, sector_num, buf, n); + + qemu_iovec_reset(&local_qiov); 
+ qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_preadv(bs->backing, offset, n_bytes, + &local_qiov, 0); if (ret < 0) { - return ret; + goto fail; } } else { - memset(buf, 0, 512 * n); + qemu_iovec_memset(qiov, bytes_done, 0, n_bytes); } } else { - ret = vmdk_read_extent(extent, - cluster_offset, index_in_cluster * 512, - buf, n); + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster, + &local_qiov, n_bytes); if (ret) { - return ret; + goto fail; } } - nb_sectors -= n; - sector_num += n; - buf += n * 512; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } - return 0; -} -static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVmdkState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vmdk_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: qemu_co_mutex_unlock(&s->lock); + qemu_iovec_destroy(&local_qiov); + return ret; } @@ -1506,38 +1538,38 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, * * Returns: error code with 0 for success. */ -static int vmdk_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors, - bool zeroed, bool zero_dry_run) +static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + bool zeroed, bool zero_dry_run) { BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; int ret; - int64_t index_in_cluster, n; + int64_t offset_in_cluster, n_bytes; uint64_t cluster_offset; + uint64_t bytes_done = 0; VmdkMetaData m_data; - if (sector_num > bs->total_sectors) { - error_report("Wrong offset: sector_num=0x%" PRIx64 + if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { + error_report("Wrong offset: offset=0x%" PRIx64 " total_sectors=0x%" PRIx64, - sector_num, bs->total_sectors); + offset, bs->total_sectors); return -EIO; } - while (nb_sectors > 0) { - extent = find_extent(s, sector_num, extent); + while (bytes > 0) { + extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent); if (!extent) { return -EIO; } - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); + n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); + + ret = get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), - &cluster_offset, - index_in_cluster, index_in_cluster + n); + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1546,7 +1578,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return -EIO; } else { /* allocate */ - ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + ret = get_cluster_offset(bs, extent, &m_data, offset, true, &cluster_offset, 0, 0); } } @@ -1556,9 +1588,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (zeroed) { /* Do zeroed write, buf is ignored */ if (extent->has_zero_grain && - index_in_cluster == 0 && - n >= extent->cluster_sectors) { - n = extent->cluster_sectors; + offset_in_cluster == 0 && + n_bytes >= extent->cluster_sectors * 
BDRV_SECTOR_SIZE) { + n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE; if (!zero_dry_run) { /* update L2 tables */ if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED) @@ -1570,9 +1602,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return -ENOTSUP; } } else { - ret = vmdk_write_extent(extent, - cluster_offset, index_in_cluster * 512, - buf, n, sector_num); + ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster, + qiov, bytes_done, n_bytes, offset); if (ret) { return ret; } @@ -1585,9 +1616,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, } } } - nb_sectors -= n; - sector_num += n; - buf += n * 512; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; /* update CID on the first write every time the virtual disk is * opened */ @@ -1602,43 +1633,84 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return 0; } -static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { int ret; BDRVVmdkState *s = bs->opaque; qemu_co_mutex_lock(&s->lock); - ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false); qemu_co_mutex_unlock(&s->lock); return ret; } +typedef struct VmdkWriteCompressedCo { + BlockDriverState *bs; + int64_t sector_num; + const uint8_t *buf; + int nb_sectors; + int ret; +} VmdkWriteCompressedCo; + +static void vmdk_co_write_compressed(void *opaque) +{ + VmdkWriteCompressedCo *co = opaque; + QEMUIOVector local_qiov; + uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE; + uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE; + + struct iovec iov = (struct iovec) { + .iov_base = (uint8_t*) co->buf, + .iov_len = bytes, + }; + qemu_iovec_init_external(&local_qiov, &iov, 1); + + co->ret = vmdk_pwritev(co->bs, offset, bytes, &local_qiov, false, false); +} + static int vmdk_write_compressed(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { BDRVVmdkState *s = bs->opaque; + if (s->num_extents == 1 && s->extents[0].compressed) { - return vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + Coroutine *co; + AioContext *aio_context = bdrv_get_aio_context(bs); + VmdkWriteCompressedCo data = { + .bs = bs, + .sector_num = sector_num, + .buf = buf, + .nb_sectors = nb_sectors, + .ret = -EINPROGRESS, + }; + co = qemu_coroutine_create(vmdk_co_write_compressed, &data); + qemu_coroutine_enter(co); + while (data.ret == -EINPROGRESS) { + aio_poll(aio_context, true); + } + return data.ret; } else { return -ENOTSUP; } } -static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) +static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, + int bytes, + BdrvRequestFlags flags) { int ret; BDRVVmdkState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); /* write zeroes could fail if sectors not aligned to cluster, test it with * dry_run == true before really updating image */ - ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true); + ret = vmdk_pwritev(bs, offset, bytes, NULL, true, true); if (!ret) { - ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false); + ret = vmdk_pwritev(bs, offset, bytes, NULL, true, false); } qemu_co_mutex_unlock(&s->lock); return ret; @@ -1728,12 +1800,12 @@ static int vmdk_create_extent(const 
char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = blk_pwrite(blk, 0, &magic, sizeof(magic)); + ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; } - ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header)); + ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1753,7 +1825,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, gd_buf[i] = cpu_to_le32(tmp); } ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + gd_buf, gd_buf_size, 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1765,7 +1837,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, gd_buf[i] = cpu_to_le32(tmp); } ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + gd_buf, gd_buf_size, 0); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1829,8 +1901,8 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size = 0, filesize; char *adapter_type = NULL; char *backing_file = NULL; + char *hw_version = NULL; char *fmt = NULL; - int flags = 0; int ret = 0; bool flat, split, compress; GString *ext_desc_lines; @@ -1861,7 +1933,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) "# The Disk Data Base\n" "#DDB\n" "\n" - "ddb.virtualHWVersion = \"%d\"\n" + "ddb.virtualHWVersion = \"%s\"\n" "ddb.geometry.cylinders = \"%" PRId64 "\"\n" "ddb.geometry.heads = \"%" PRIu32 "\"\n" "ddb.geometry.sectors = \"63\"\n" @@ -1878,8 +1950,20 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) BDRV_SECTOR_SIZE); adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE); backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION); if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) { - flags |= BLOCK_FLAG_COMPAT6; + if (strcmp(hw_version, "undefined")) { + error_setg(errp, + "compat6 cannot be enabled with hwversion set"); + ret = -EINVAL; + goto exit; + } + g_free(hw_version); + hw_version = g_strdup("6"); + } + if (strcmp(hw_version, "undefined") == 0) { + g_free(hw_version); + hw_version = g_strdup("4"); } fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) { @@ -2001,7 +2085,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) fmt, parent_desc_line, ext_desc_lines->str, - (flags & BLOCK_FLAG_COMPAT6 ? 
6 : 4), + hw_version, total_size / (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), number_heads, @@ -2028,7 +2112,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) blk_set_allow_write_beyond_eof(new_blk, true); - ret = blk_pwrite(new_blk, desc_offset, desc, desc_len); + ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write description"); goto exit; @@ -2047,6 +2131,7 @@ exit: } g_free(adapter_type); g_free(backing_file); + g_free(hw_version); g_free(fmt); g_free(desc); g_free(path); @@ -2250,27 +2335,6 @@ static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } -static void vmdk_detach_aio_context(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_extents; i++) { - bdrv_detach_aio_context(s->extents[i].file->bs); - } -} - -static void vmdk_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVVmdkState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_extents; i++) { - bdrv_attach_aio_context(s->extents[i].file->bs, new_context); - } -} - static QemuOptsList vmdk_create_opts = { .name = "vmdk-create-opts", .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head), @@ -2298,6 +2362,12 @@ static QemuOptsList vmdk_create_opts = { .def_value_str = "off" }, { + .name = BLOCK_OPT_HWVERSION, + .type = QEMU_OPT_STRING, + .help = "VMDK hardware version", + .def_value_str = "undefined" + }, + { .name = BLOCK_OPT_SUBFMT, .type = QEMU_OPT_STRING, .help = @@ -2321,10 +2391,10 @@ static BlockDriver bdrv_vmdk = { .bdrv_open = vmdk_open, .bdrv_check = vmdk_check, .bdrv_reopen_prepare = vmdk_reopen_prepare, - .bdrv_read = vmdk_co_read, - .bdrv_write = vmdk_co_write, + .bdrv_co_preadv = vmdk_co_preadv, + .bdrv_co_pwritev = vmdk_co_pwritev, .bdrv_write_compressed = vmdk_write_compressed, - .bdrv_co_write_zeroes = vmdk_co_write_zeroes, + .bdrv_co_pwrite_zeroes = vmdk_co_pwrite_zeroes, .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, @@ -2334,8 +2404,6 @@ static BlockDriver bdrv_vmdk = { .bdrv_get_specific_info = vmdk_get_specific_info, .bdrv_refresh_limits = vmdk_refresh_limits, .bdrv_get_info = vmdk_get_info, - .bdrv_detach_aio_context = vmdk_detach_aio_context, - .bdrv_attach_aio_context = vmdk_attach_aio_context, .supports_backing = true, .create_opts = &vmdk_create_opts, diff --git a/block/vpc.c b/block/vpc.c index 3e2ea698d..43707ed22 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -29,6 +29,7 @@ #include "sysemu/block-backend.h" #include "qemu/module.h" #include "migration/migration.h" +#include "qemu/bswap.h" #if defined(CONFIG_UUID) #include <uuid/uuid.h> #endif @@ -236,7 +237,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE); + ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE); if (ret < 0) { error_setg(errp, "Unable to read VHD header"); goto fail; @@ -256,7 +257,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, } /* If a fixed disk, the footer is found only at the end of the file */ - ret = bdrv_pread(bs->file->bs, offset-HEADER_SIZE, s->footer_buf, + ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, HEADER_SIZE); if (ret < 0) { goto fail; @@ -327,7 +328,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, } if (disk_type == VHD_DYNAMIC) { - ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf, 
+ ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE); if (ret < 0) { error_setg(errp, "Error reading dynamic VHD header"); @@ -384,7 +385,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); - ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable, + ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable, pagetable_size); if (ret < 0) { error_setg(errp, "Error reading pagetable"); @@ -454,22 +455,21 @@ static int vpc_reopen_prepare(BDRVReopenState *state, * The parameter write must be 1 if the offset will be used for a write * operation (the block bitmaps is updated then), 0 otherwise. */ -static inline int64_t get_sector_offset(BlockDriverState *bs, - int64_t sector_num, int write) +static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset, + bool write) { BDRVVPCState *s = bs->opaque; - uint64_t offset = sector_num * 512; uint64_t bitmap_offset, block_offset; - uint32_t pagetable_index, pageentry_index; + uint32_t pagetable_index, offset_in_block; pagetable_index = offset / s->block_size; - pageentry_index = (offset % s->block_size) / 512; + offset_in_block = offset % s->block_size; if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff) return -1; /* not allocated */ bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index]; - block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index); + block_offset = bitmap_offset + s->bitmap_size + offset_in_block; /* We must ensure that we don't write to any sectors which are marked as unused in the bitmap. We get away with setting all bits in the block @@ -481,12 +481,18 @@ static inline int64_t get_sector_offset(BlockDriverState *bs, s->last_bitmap_offset = bitmap_offset; memset(bitmap, 0xff, s->bitmap_size); - bdrv_pwrite_sync(bs->file->bs, bitmap_offset, bitmap, s->bitmap_size); + bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size); } return block_offset; } +static inline int64_t get_sector_offset(BlockDriverState *bs, + int64_t sector_num, bool write) +{ + return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write); +} + /* * Writes the footer to the end of the image file. 
This is needed when the * file grows as it overwrites the old footer @@ -499,7 +505,7 @@ static int rewrite_footer(BlockDriverState* bs) BDRVVPCState *s = bs->opaque; int64_t offset = s->free_data_block_offset; - ret = bdrv_pwrite_sync(bs->file->bs, offset, s->footer_buf, HEADER_SIZE); + ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE); if (ret < 0) return ret; @@ -513,7 +519,7 @@ static int rewrite_footer(BlockDriverState* bs) * * Returns the sectors' offset in the image file on success and < 0 on error */ -static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) +static int64_t alloc_block(BlockDriverState* bs, int64_t offset) { BDRVVPCState *s = bs->opaque; int64_t bat_offset; @@ -522,19 +528,18 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) uint8_t bitmap[s->bitmap_size]; /* Check if sector_num is valid */ - if ((sector_num < 0) || (sector_num > bs->total_sectors)) - return -1; + if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) { + return -EINVAL; + } /* Write entry into in-memory BAT */ - index = (sector_num * 512) / s->block_size; - if (s->pagetable[index] != 0xFFFFFFFF) - return -1; - + index = offset / s->block_size; + assert(s->pagetable[index] == 0xFFFFFFFF); s->pagetable[index] = s->free_data_block_offset / 512; /* Initialize the block's bitmap */ memset(bitmap, 0xff, s->bitmap_size); - ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap, + ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap, s->bitmap_size); if (ret < 0) { return ret; @@ -549,15 +554,15 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) /* Write BAT entry to disk */ bat_offset = s->bat_offset + (4 * index); bat_value = cpu_to_be32(s->pagetable[index]); - ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4); + ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4); if (ret < 0) goto fail; - return get_sector_offset(bs, sector_num, 0); + return get_image_offset(bs, offset, false); fail: s->free_data_block_offset -= (s->block_size + s->bitmap_size); - return -1; + return ret; } static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) @@ -573,104 +578,105 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } -static int vpc_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVPCState *s = bs->opaque; int ret; - int64_t offset; - int64_t sectors, sectors_per_block; + int64_t image_offset; + int64_t n_bytes; + int64_t bytes_done = 0; VHDFooter *footer = (VHDFooter *) s->footer_buf; + QEMUIOVector local_qiov; if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors); + return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0); } - while (nb_sectors > 0) { - offset = get_sector_offset(bs, sector_num, 0); - sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; - sectors = sectors_per_block - (sector_num % sectors_per_block); - if (sectors > nb_sectors) { - sectors = nb_sectors; - } + qemu_co_mutex_lock(&s->lock); + qemu_iovec_init(&local_qiov, qiov->niov); + + while (bytes > 0) { + image_offset = get_image_offset(bs, offset, false); + n_bytes = MIN(bytes, s->block_size - (offset % s->block_size)); - if (offset == -1) { - memset(buf, 0, sectors * BDRV_SECTOR_SIZE); + if (image_offset == -1) { + qemu_iovec_memset(qiov, bytes_done, 
0, n_bytes); } else { - ret = bdrv_pread(bs->file->bs, offset, buf, - sectors * BDRV_SECTOR_SIZE); - if (ret != sectors * BDRV_SECTOR_SIZE) { - return -1; + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_preadv(bs->file, image_offset, n_bytes, + &local_qiov, 0); + if (ret < 0) { + goto fail; } } - nb_sectors -= sectors; - sector_num += sectors; - buf += sectors * BDRV_SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } - return 0; -} -static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVPCState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vpc_read(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: + qemu_iovec_destroy(&local_qiov); qemu_co_mutex_unlock(&s->lock); + return ret; } -static int vpc_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn +vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVVPCState *s = bs->opaque; - int64_t offset; - int64_t sectors, sectors_per_block; + int64_t image_offset; + int64_t n_bytes; + int64_t bytes_done = 0; int ret; VHDFooter *footer = (VHDFooter *) s->footer_buf; + QEMUIOVector local_qiov; if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors); + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0); } - while (nb_sectors > 0) { - offset = get_sector_offset(bs, sector_num, 1); - sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; - sectors = sectors_per_block - (sector_num % sectors_per_block); - if (sectors > nb_sectors) { - sectors = nb_sectors; - } + qemu_co_mutex_lock(&s->lock); + qemu_iovec_init(&local_qiov, qiov->niov); + + while (bytes > 0) { + image_offset = get_image_offset(bs, offset, true); + n_bytes = MIN(bytes, s->block_size - (offset % s->block_size)); - if (offset == -1) { - offset = alloc_block(bs, sector_num); - if (offset < 0) - return -1; + if (image_offset == -1) { + image_offset = alloc_block(bs, offset); + if (image_offset < 0) { + ret = image_offset; + goto fail; + } } - ret = bdrv_pwrite(bs->file->bs, offset, buf, - sectors * BDRV_SECTOR_SIZE); - if (ret != sectors * BDRV_SECTOR_SIZE) { - return -1; + qemu_iovec_reset(&local_qiov); + qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); + + ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes, + &local_qiov, 0); + if (ret < 0) { + goto fail; } - nb_sectors -= sectors; - sector_num += sectors; - buf += sectors * BDRV_SECTOR_SIZE; + bytes -= n_bytes; + offset += n_bytes; + bytes_done += n_bytes; } - return 0; -} - -static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVPCState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vpc_write(bs, sector_num, buf, nb_sectors); + ret = 0; +fail: + qemu_iovec_destroy(&local_qiov); qemu_co_mutex_unlock(&s->lock); + return ret; } @@ -783,13 +789,13 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, block_size = 0x200000; num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); - ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0); if (ret < 0) { goto fail; } offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); - ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 
0); if (ret < 0) { goto fail; } @@ -799,7 +805,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, memset(buf, 0xFF, 512); for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { - ret = blk_pwrite(blk, offset, buf, 512); + ret = blk_pwrite(blk, offset, buf, 512, 0); if (ret < 0) { goto fail; } @@ -826,7 +832,7 @@ static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, /* Write the header */ offset = 512; - ret = blk_pwrite(blk, offset, buf, 1024); + ret = blk_pwrite(blk, offset, buf, 1024, 0); if (ret < 0) { goto fail; } @@ -848,7 +854,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, return ret; } - ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE); + ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0); if (ret < 0) { return ret; } @@ -1056,8 +1062,8 @@ static BlockDriver bdrv_vpc = { .bdrv_reopen_prepare = vpc_reopen_prepare, .bdrv_create = vpc_create, - .bdrv_read = vpc_co_read, - .bdrv_write = vpc_co_write, + .bdrv_co_preadv = vpc_co_preadv, + .bdrv_co_pwritev = vpc_co_pwritev, .bdrv_co_get_block_status = vpc_co_get_block_status, .bdrv_get_info = vpc_get_info, diff --git a/block/vvfat.c b/block/vvfat.c index 183fc4f04..ba2620f3c 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -27,6 +27,7 @@ #include "qapi/error.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qemu/bswap.h" #include "migration/migration.h" #include "qapi/qmp/qint.h" #include "qapi/qmp/qbool.h" @@ -113,15 +114,12 @@ static inline int array_ensure_allocated(array_t* array, int index) static inline void* array_get_next(array_t* array) { unsigned int next = array->next; - void* result; if (array_ensure_allocated(array, next) < 0) return NULL; array->next = next + 1; - result = array_get(array, next); - - return result; + return array_get(array, next); } static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) { @@ -343,9 +341,8 @@ typedef struct BDRVVVFATState { unsigned int current_cluster; /* write support */ - BlockDriverState* write_target; char* qcow_filename; - BlockDriverState* qcow; + BdrvChild* qcow; void* fat2; char* used_clusters; array_t commits; @@ -983,7 +980,7 @@ static int init_directories(BDRVVVFATState* s, static BDRVVVFATState *vvv = NULL; #endif -static int enable_write_target(BDRVVVFATState *s, Error **errp); +static int enable_write_target(BlockDriverState *bs, Error **errp); static int is_consistent(BDRVVVFATState *s); static QemuOptsList runtime_opts = { @@ -1160,8 +1157,8 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, s->current_cluster=0xffffffff; /* read only is the default for safety */ - bs->read_only = 1; - s->qcow = s->write_target = NULL; + bs->read_only = true; + s->qcow = NULL; s->qcow_filename = NULL; s->fat2 = NULL; s->downcase_short_names = 1; @@ -1172,11 +1169,11 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); if (qemu_opt_get_bool(opts, "rw", false)) { - ret = enable_write_target(s, errp); + ret = enable_write_target(bs, errp); if (ret < 0) { goto fail; } - bs->read_only = 0; + bs->read_only = false; } bs->total_sectors = cyls * heads * secs; @@ -1210,6 +1207,11 @@ fail: return ret; } +static void vvfat_refresh_limits(BlockDriverState *bs, Error **errp) +{ + bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */ +} + static inline void vvfat_close_current_file(BDRVVVFATState *s) { if(s->current_mapping) { @@ -1388,9 
+1390,10 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num, return -1; if (s->qcow) { int n; - if (bdrv_is_allocated(s->qcow, sector_num, nb_sectors-i, &n)) { -DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n)); - if (bdrv_read(s->qcow, sector_num, buf + i*0x200, n)) { + if (bdrv_is_allocated(s->qcow->bs, sector_num, nb_sectors-i, &n)) { + DLOG(fprintf(stderr, "sectors %d+%d allocated\n", + (int)sector_num, n)); + if (bdrv_read(s->qcow, sector_num, buf + i * 0x200, n)) { return -1; } i += n - 1; @@ -1421,14 +1424,31 @@ DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num)); return 0; } -static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn +vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { int ret; BDRVVVFATState *s = bs->opaque; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + void *buf; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + buf = g_try_malloc(bytes); + if (bytes && buf == NULL) { + return -ENOMEM; + } + qemu_co_mutex_lock(&s->lock); ret = vvfat_read(bs, sector_num, buf, nb_sectors); qemu_co_mutex_unlock(&s->lock); + + qemu_iovec_from_buf(qiov, 0, buf, bytes); + g_free(buf); + return ret; } @@ -1649,12 +1669,15 @@ static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num) int was_modified = 0; int i, dummy; - if (s->qcow == NULL) - return 0; + if (s->qcow == NULL) { + return 0; + } - for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) - was_modified = bdrv_is_allocated(s->qcow, - cluster2sector(s, cluster_num) + i, 1, &dummy); + for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) { + was_modified = bdrv_is_allocated(s->qcow->bs, + cluster2sector(s, cluster_num) + i, + 1, &dummy); + } return was_modified; } @@ -1803,11 +1826,16 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, vvfat_close_current_file(s); for (i = 0; i < s->sectors_per_cluster; i++) { - if (!bdrv_is_allocated(s->qcow, offset + i, 1, &dummy)) { - if (vvfat_read(s->bs, offset, s->cluster_buffer, 1)) { + int res; + + res = bdrv_is_allocated(s->qcow->bs, offset + i, 1, &dummy); + if (!res) { + res = vvfat_read(s->bs, offset, s->cluster_buffer, 1); + if (res) { return -1; } - if (bdrv_write(s->qcow, offset, s->cluster_buffer, 1)) { + res = bdrv_write(s->qcow, offset, s->cluster_buffer, 1); + if (res) { return -2; } } @@ -1941,8 +1969,7 @@ DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i)) /* check file size with FAT */ cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2); if (cluster_count != - (le32_to_cpu(direntries[i].size) + s->cluster_size - - 1) / s->cluster_size) { + DIV_ROUND_UP(le32_to_cpu(direntries[i].size), s->cluster_size)) { DLOG(fprintf(stderr, "Cluster count mismatch\n")); goto fail; } @@ -2764,8 +2791,8 @@ static int do_commit(BDRVVVFATState* s) return ret; } - if (s->qcow->drv->bdrv_make_empty) { - s->qcow->drv->bdrv_make_empty(s->qcow); + if (s->qcow->bs->drv->bdrv_make_empty) { + s->qcow->bs->drv->bdrv_make_empty(s->qcow->bs); } memset(s->used_clusters, 0, sector2cluster(s, s->sector_count)); @@ -2880,14 +2907,31 @@ DLOG(checkpoint()); return 0; } -static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int coroutine_fn 
+vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { int ret; BDRVVVFATState *s = bs->opaque; + uint64_t sector_num = offset >> BDRV_SECTOR_BITS; + int nb_sectors = bytes >> BDRV_SECTOR_BITS; + void *buf; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + buf = g_try_malloc(bytes); + if (bytes && buf == NULL) { + return -ENOMEM; + } + qemu_iovec_to_buf(qiov, 0, buf, bytes); + qemu_co_mutex_lock(&s->lock); ret = vvfat_write(bs, sector_num, buf, nb_sectors); qemu_co_mutex_unlock(&s->lock); + + g_free(buf); + return ret; } @@ -2904,26 +2948,39 @@ static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, return BDRV_BLOCK_DATA; } -static int write_target_commit(BlockDriverState *bs, int64_t sector_num, - const uint8_t* buffer, int nb_sectors) { +static int coroutine_fn +write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); return try_commit(s); } static void write_target_close(BlockDriverState *bs) { BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - bdrv_unref(s->qcow); + bdrv_unref_child(s->bs, s->qcow); g_free(s->qcow_filename); } static BlockDriver vvfat_write_target = { .format_name = "vvfat_write_target", - .bdrv_write = write_target_commit, + .bdrv_co_pwritev = write_target_commit, .bdrv_close = write_target_close, }; -static int enable_write_target(BDRVVVFATState *s, Error **errp) +static void vvfat_qcow_options(int *child_flags, QDict *child_options, + int parent_flags, QDict *parent_options) { + *child_flags = BDRV_O_RDWR | BDRV_O_NO_FLUSH; +} + +static const BdrvChildRole child_vvfat_qcow = { + .inherit_options = vvfat_qcow_options, +}; + +static int enable_write_target(BlockDriverState *bs, Error **errp) +{ + BDRVVVFATState *s = bs->opaque; BlockDriver *bdrv_qcow = NULL; BlockDriverState *backing; QemuOpts *opts = NULL; @@ -2960,12 +3017,13 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp) goto err; } - s->qcow = NULL; options = qdict_new(); - qdict_put(options, "driver", qstring_from_str("qcow")); - ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options, - BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp); - if (ret < 0) { + qdict_put(options, "write-target.driver", qstring_from_str("qcow")); + s->qcow = bdrv_open_child(s->qcow_filename, options, "write-target", bs, + &child_vvfat_qcow, false, errp); + QDECREF(options); + if (!s->qcow) { + ret = -EINVAL; goto err; } @@ -3012,10 +3070,11 @@ static BlockDriver bdrv_vvfat = { .bdrv_parse_filename = vvfat_parse_filename, .bdrv_file_open = vvfat_open, + .bdrv_refresh_limits = vvfat_refresh_limits, .bdrv_close = vvfat_close, - .bdrv_read = vvfat_co_read, - .bdrv_write = vvfat_co_write, + .bdrv_co_preadv = vvfat_co_preadv, + .bdrv_co_pwritev = vvfat_co_pwritev, .bdrv_co_get_block_status = vvfat_co_get_block_status, }; diff --git a/block/win32-aio.c b/block/win32-aio.c index 2d509a9a7..95e3ab154 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -27,7 +27,7 @@ #include "block/block_int.h" #include "qemu/module.h" #include "block/aio.h" -#include "raw-aio.h" +#include "block/raw-aio.h" #include "qemu/event_notifier.h" #include <windows.h>
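
The vmdk and vpc conversions above share one pattern: the sector-based .bdrv_read/.bdrv_write callbacks become byte-granularity .bdrv_co_preadv/.bdrv_co_pwritev coroutines that walk a request one cluster at a time and point a local QEMUIOVector at the matching slice of the caller's buffer instead of copying it. Below is a minimal sketch of that loop against QEMU's 2.7-era internal API; sketch_co_preadv, the fixed cluster_bytes value, and the direct bs->file pass-through are illustrative placeholders, not code from this patch.

    #include "qemu/osdep.h"
    #include "block/block_int.h"

    static int coroutine_fn
    sketch_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                     QEMUIOVector *qiov, int flags)
    {
        QEMUIOVector local_qiov;
        uint64_t bytes_done = 0;
        const uint64_t cluster_bytes = 64 * 1024;   /* placeholder size */
        int ret = 0;

        qemu_iovec_init(&local_qiov, qiov->niov);
        while (bytes > 0) {
            /* Clamp this iteration to the end of the current cluster. */
            uint64_t n_bytes = MIN(bytes,
                                   cluster_bytes - offset % cluster_bytes);

            /* Reference the next n_bytes of the guest buffer; no copy. */
            qemu_iovec_reset(&local_qiov);
            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);

            ret = bdrv_co_preadv(bs->file, offset, n_bytes, &local_qiov, 0);
            if (ret < 0) {
                break;
            }
            bytes -= n_bytes;
            offset += n_bytes;
            bytes_done += n_bytes;
        }
        qemu_iovec_destroy(&local_qiov);
        return ret;
    }

Concatenating into a reusable local_qiov keeps the hot path allocation-free after the initial qemu_iovec_init, which is why the converted drivers initialize it once per request rather than per cluster.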
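vmdk_write_compressed above keeps its synchronous prototype even though the write path is now coroutine-only, so it packs the arguments into a struct, starts a coroutine, and pumps the AioContext until a sentinel return value is overwritten. A stripped-down sketch of that bridge follows, assuming the same QEMU-internal headers as the previous sketch; SyncCoData, sketch_entry, and sketch_sync_call are hypothetical names, and a real entry point would perform coroutine I/O rather than return immediately.

    /* -EINPROGRESS doubles as the "coroutine not finished yet" sentinel. */
    typedef struct SyncCoData {
        BlockDriverState *bs;
        int ret;
    } SyncCoData;

    static void coroutine_fn sketch_entry(void *opaque)
    {
        SyncCoData *data = opaque;
        /* A real implementation would issue coroutine I/O here. */
        data->ret = 0;
    }

    static int sketch_sync_call(BlockDriverState *bs)
    {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        SyncCoData data = { .bs = bs, .ret = -EINPROGRESS };
        Coroutine *co = qemu_coroutine_create(sketch_entry, &data);

        qemu_coroutine_enter(co);
        while (data.ret == -EINPROGRESS) {
            aio_poll(aio_context, true);  /* drive pending I/O to completion */
        }
        return data.ret;
    }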
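vvfat goes the other way: its internals stay sector-based, so the vectored callbacks above flatten the request into a bounce buffer, and vvfat_refresh_limits pins bs->bl.request_alignment to BDRV_SECTOR_SIZE so the driver is only ever offered whole sectors. The shape is sketched below under that alignment assumption; legacy_sector_write is a hypothetical stand-in for the driver's existing sector-based helper.

    /* Hypothetical existing sector-based write path. */
    static int legacy_sector_write(BlockDriverState *bs, int64_t sector_num,
                                   const uint8_t *buf, int nb_sectors);

    static int coroutine_fn
    sketch_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                      QEMUIOVector *qiov, int flags)
    {
        void *buf;
        int ret;

        /* Guaranteed by the block layer once request_alignment is set. */
        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

        buf = g_try_malloc(bytes);
        if (bytes && buf == NULL) {
            return -ENOMEM;
        }
        /* Flatten the scatter/gather list, call the legacy path, free. */
        qemu_iovec_to_buf(qiov, 0, buf, bytes);
        ret = legacy_sector_write(bs, offset >> BDRV_SECTOR_BITS, buf,
                                  bytes >> BDRV_SECTOR_BITS);
        g_free(buf);
        return ret;
    }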