author     Yonghee Han <onstudy@samsung.com>    2016-07-27 16:40:17 +0900
committer  Yonghee Han <onstudy@samsung.com>    2016-07-27 00:53:56 -0700
commit     3158f4a51894e46ecb593bffbfd12824e1d6534a (patch)
tree       2bef7f0238e687c5de65f48b5995ee124a95d157 /block
parent     a3b133b0ea0696e42fd876b9a803e28bc6ef5299 (diff)
Imported Upstream version 2.4.1 (upstream/2.4.1)
Change-Id: I0b584f569cb0e0f4eac13cdb79e110c2dbc34bfc
Diffstat (limited to 'block')
-rw-r--r--  block/Makefile.objs        8
-rw-r--r--  block/backup.c           160
-rw-r--r--  block/blkdebug.c          20
-rw-r--r--  block/blkverify.c          4
-rw-r--r--  block/block-backend.c     16
-rw-r--r--  block/commit.c             3
-rw-r--r--  block/curl.c              16
-rw-r--r--  block/dmg.c                1
-rw-r--r--  block/io.c              2610
-rw-r--r--  block/iscsi.c            169
-rw-r--r--  block/mirror.c           117
-rw-r--r--  block/nfs.c                2
-rw-r--r--  block/null.c              66
-rw-r--r--  block/parallels.c        679
-rw-r--r--  block/qapi.c              65
-rw-r--r--  block/qcow.c             119
-rw-r--r--  block/qcow2-cache.c      174
-rw-r--r--  block/qcow2-cluster.c    137
-rw-r--r--  block/qcow2-refcount.c    73
-rw-r--r--  block/qcow2-snapshot.c     7
-rw-r--r--  block/qcow2.c            129
-rw-r--r--  block/qcow2.h             26
-rw-r--r--  block/qed.c                8
-rw-r--r--  block/quorum.c           102
-rw-r--r--  block/raw-posix.c        115
-rw-r--r--  block/raw-win32.c          1
-rw-r--r--  block/rbd.c               66
-rw-r--r--  block/sheepdog.c         155
-rw-r--r--  block/snapshot.c          19
-rw-r--r--  block/ssh.c                4
-rw-r--r--  block/stream.c             5
-rw-r--r--  block/throttle-groups.c  501
-rw-r--r--  block/vdi.c                6
-rw-r--r--  block/vhdx-log.c           1
-rw-r--r--  block/vhdx.c              10
-rw-r--r--  block/vmdk.c              83
-rw-r--r--  block/vpc.c                6
-rw-r--r--  block/vvfat.c             37
38 files changed, 4854 insertions, 866 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index db2933e46..58ef2ef3f 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,15 +1,16 @@
-block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
+block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o
block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-obj-y += qed-check.o
block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
-block-obj-$(CONFIG_QUORUM) += quorum.o
+block-obj-y += quorum.o
block-obj-y += parallels.o blkdebug.o blkverify.o
block-obj-y += block-backend.o snapshot.o qapi.o
block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
block-obj-$(CONFIG_POSIX) += raw-posix.o
block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
-block-obj-y += null.o mirror.o
+block-obj-y += null.o mirror.o io.o
+block-obj-y += throttle-groups.o
block-obj-y += nbd.o nbd-client.o sheepdog.o
block-obj-$(CONFIG_LIBISCSI) += iscsi.o
@@ -37,6 +38,7 @@ gluster.o-libs := $(GLUSTERFS_LIBS)
ssh.o-cflags := $(LIBSSH2_CFLAGS)
ssh.o-libs := $(LIBSSH2_LIBS)
archipelago.o-libs := $(ARCHIPELAGO_LIBS)
+block-obj-m += dmg.o
dmg.o-libs := $(BZIP2_LIBS)
qcow.o-libs := -lz
linux-aio.o-libs := -laio
diff --git a/block/backup.c b/block/backup.c
index 1c535b1ab..965654d52 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -19,6 +19,7 @@
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob.h"
+#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#define BACKUP_CLUSTER_BITS 16
@@ -37,6 +38,8 @@ typedef struct CowRequest {
typedef struct BackupBlockJob {
BlockJob common;
BlockDriverState *target;
+ /* bitmap for sync=incremental */
+ BdrvDirtyBitmap *sync_bitmap;
MirrorSyncMode sync_mode;
RateLimit limit;
BlockdevOnError on_source_error;
@@ -195,7 +198,7 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
BackupBlockJob *s = container_of(job, BackupBlockJob, common);
if (speed < 0) {
- error_set(errp, QERR_INVALID_PARAMETER, "speed");
+ error_setg(errp, QERR_INVALID_PARAMETER, "speed");
return;
}
ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
@@ -242,6 +245,91 @@ static void backup_complete(BlockJob *job, void *opaque)
g_free(data);
}
+static bool coroutine_fn yield_and_check(BackupBlockJob *job)
+{
+ if (block_job_is_cancelled(&job->common)) {
+ return true;
+ }
+
+ /* we need to yield so that bdrv_drain_all() returns.
+ * (without, VM does not reboot)
+ */
+ if (job->common.speed) {
+ uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
+ job->sectors_read);
+ job->sectors_read = 0;
+ block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
+ } else {
+ block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
+ }
+
+ if (block_job_is_cancelled(&job->common)) {
+ return true;
+ }
+
+ return false;
+}
+
+static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
+{
+ bool error_is_read;
+ int ret = 0;
+ int clusters_per_iter;
+ uint32_t granularity;
+ int64_t sector;
+ int64_t cluster;
+ int64_t end;
+ int64_t last_cluster = -1;
+ BlockDriverState *bs = job->common.bs;
+ HBitmapIter hbi;
+
+ granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
+ clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1);
+ bdrv_dirty_iter_init(job->sync_bitmap, &hbi);
+
+ /* Find the next dirty sector(s) */
+ while ((sector = hbitmap_iter_next(&hbi)) != -1) {
+ cluster = sector / BACKUP_SECTORS_PER_CLUSTER;
+
+ /* Fake progress updates for any clusters we skipped */
+ if (cluster != last_cluster + 1) {
+ job->common.offset += ((cluster - last_cluster - 1) *
+ BACKUP_CLUSTER_SIZE);
+ }
+
+ for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
+ do {
+ if (yield_and_check(job)) {
+ return ret;
+ }
+ ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
+ BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+ if ((ret < 0) &&
+ backup_error_action(job, error_is_read, -ret) ==
+ BLOCK_ERROR_ACTION_REPORT) {
+ return ret;
+ }
+ } while (ret < 0);
+ }
+
+ /* If the bitmap granularity is smaller than the backup granularity,
+ * we need to advance the iterator pointer to the next cluster. */
+ if (granularity < BACKUP_CLUSTER_SIZE) {
+ bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER);
+ }
+
+ last_cluster = cluster - 1;
+ }
+
+ /* Play some final catchup with the progress meter */
+ end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
+ if (last_cluster + 1 < end) {
+ job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE);
+ }
+
+ return ret;
+}
+
static void coroutine_fn backup_run(void *opaque)
{
BackupBlockJob *job = opaque;
@@ -259,8 +347,7 @@ static void coroutine_fn backup_run(void *opaque)
qemu_co_rwlock_init(&job->flush_rwlock);
start = 0;
- end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE,
- BACKUP_SECTORS_PER_CLUSTER);
+ end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
job->bitmap = hbitmap_alloc(end, 0);
@@ -278,28 +365,13 @@ static void coroutine_fn backup_run(void *opaque)
qemu_coroutine_yield();
job->common.busy = true;
}
+ } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
+ ret = backup_run_incremental(job);
} else {
/* Both FULL and TOP SYNC_MODE's require copying.. */
for (; start < end; start++) {
bool error_is_read;
-
- if (block_job_is_cancelled(&job->common)) {
- break;
- }
-
- /* we need to yield so that qemu_aio_flush() returns.
- * (without, VM does not reboot)
- */
- if (job->common.speed) {
- uint64_t delay_ns = ratelimit_calculate_delay(
- &job->limit, job->sectors_read);
- job->sectors_read = 0;
- block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
- } else {
- block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
- }
-
- if (block_job_is_cancelled(&job->common)) {
+ if (yield_and_check(job)) {
break;
}
@@ -357,6 +429,18 @@ static void coroutine_fn backup_run(void *opaque)
qemu_co_rwlock_wrlock(&job->flush_rwlock);
qemu_co_rwlock_unlock(&job->flush_rwlock);
+ if (job->sync_bitmap) {
+ BdrvDirtyBitmap *bm;
+ if (ret < 0 || block_job_is_cancelled(&job->common)) {
+ /* Merge the successor back into the parent, delete nothing. */
+ bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
+ assert(bm);
+ } else {
+ /* Everything is fine, delete this bitmap and install the backup. */
+ bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
+ assert(bm);
+ }
+ }
hbitmap_free(job->bitmap);
bdrv_iostatus_disable(target);
@@ -369,6 +453,7 @@ static void coroutine_fn backup_run(void *opaque)
void backup_start(BlockDriverState *bs, BlockDriverState *target,
int64_t speed, MirrorSyncMode sync_mode,
+ BdrvDirtyBitmap *sync_bitmap,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockCompletionFunc *cb, void *opaque,
@@ -388,7 +473,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
!bdrv_iostatus_is_enabled(bs)) {
- error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
+ error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
return;
}
@@ -412,17 +497,36 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
return;
}
+ if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
+ if (!sync_bitmap) {
+ error_setg(errp, "must provide a valid bitmap name for "
+ "\"incremental\" sync mode");
+ return;
+ }
+
+ /* Create a new bitmap, and freeze/disable this one. */
+ if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
+ return;
+ }
+ } else if (sync_bitmap) {
+ error_setg(errp,
+ "a sync_bitmap was provided to backup_run, "
+ "but received an incompatible sync_mode (%s)",
+ MirrorSyncMode_lookup[sync_mode]);
+ return;
+ }
+
len = bdrv_getlength(bs);
if (len < 0) {
error_setg_errno(errp, -len, "unable to get length for '%s'",
bdrv_get_device_name(bs));
- return;
+ goto error;
}
BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
cb, opaque, errp);
if (!job) {
- return;
+ goto error;
}
bdrv_op_block_all(target, job->common.blocker);
@@ -431,7 +535,15 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
job->on_target_error = on_target_error;
job->target = target;
job->sync_mode = sync_mode;
+ job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
+ sync_bitmap : NULL;
job->common.len = len;
job->common.co = qemu_coroutine_create(backup_run);
qemu_coroutine_enter(job->common.co, job);
+ return;
+
+ error:
+ if (sync_bitmap) {
+ bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
+ }
}
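
[Editor's note] The backup.c hunks above add an "incremental" sync mode driven by a named dirty bitmap, with backup_start() freezing the bitmap behind a successor and backup_run() either reclaiming or abdicating it at the end. Below is a minimal, hypothetical caller sketch, not part of this commit: it assumes bdrv_find_dirty_bitmap() is available in this tree, and the helper name start_incremental_backup() is purely illustrative.

/* Illustrative only -- not part of this commit (assumes block_int.h context). */
static void start_incremental_backup(BlockDriverState *bs,
                                     BlockDriverState *target,
                                     const char *bitmap_name,
                                     Error **errp)
{
    BdrvDirtyBitmap *bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name);

    if (!bitmap) {
        error_setg(errp, "dirty bitmap '%s' not found", bitmap_name);
        return;
    }

    /* backup_start() creates a frozen successor for the bitmap; on failure
     * or cancellation the successor is merged back (see backup_run above). */
    backup_start(bs, target, 0 /* speed */, MIRROR_SYNC_MODE_INCREMENTAL,
                 bitmap, BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
                 NULL /* cb */, NULL /* opaque */, errp);
}
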
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 63611e0a3..bc247f46f 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -216,10 +216,9 @@ static int get_event_by_name(const char *name, BlkDebugEvent *event)
struct add_rule_data {
BDRVBlkdebugState *s;
int action;
- Error **errp;
};
-static int add_rule(QemuOpts *opts, void *opaque)
+static int add_rule(void *opaque, QemuOpts *opts, Error **errp)
{
struct add_rule_data *d = opaque;
BDRVBlkdebugState *s = d->s;
@@ -230,10 +229,10 @@ static int add_rule(QemuOpts *opts, void *opaque)
/* Find the right event for the rule */
event_name = qemu_opt_get(opts, "event");
if (!event_name) {
- error_setg(d->errp, "Missing event name for rule");
+ error_setg(errp, "Missing event name for rule");
return -1;
} else if (get_event_by_name(event_name, &event) < 0) {
- error_setg(d->errp, "Invalid event name \"%s\"", event_name);
+ error_setg(errp, "Invalid event name \"%s\"", event_name);
return -1;
}
@@ -319,8 +318,7 @@ static int read_config(BDRVBlkdebugState *s, const char *filename,
d.s = s;
d.action = ACTION_INJECT_ERROR;
- d.errp = &local_err;
- qemu_opts_foreach(&inject_error_opts, add_rule, &d, 1);
+ qemu_opts_foreach(&inject_error_opts, add_rule, &d, &local_err);
if (local_err) {
error_propagate(errp, local_err);
ret = -EINVAL;
@@ -328,7 +326,7 @@ static int read_config(BDRVBlkdebugState *s, const char *filename,
}
d.action = ACTION_SET_STATE;
- qemu_opts_foreach(&set_state_opts, add_rule, &d, 1);
+ qemu_opts_foreach(&set_state_opts, add_rule, &d, &local_err);
if (local_err) {
error_propagate(errp, local_err);
ret = -EINVAL;
@@ -431,7 +429,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
/* Open the backing file */
assert(bs->file == NULL);
ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image",
- flags | BDRV_O_PROTOCOL, false, &local_err);
+ bs, &child_file, false, &local_err);
if (ret < 0) {
error_propagate(errp, local_err);
goto out;
@@ -721,6 +719,11 @@ static int64_t blkdebug_getlength(BlockDriverState *bs)
return bdrv_getlength(bs->file);
}
+static int blkdebug_truncate(BlockDriverState *bs, int64_t offset)
+{
+ return bdrv_truncate(bs->file, offset);
+}
+
static void blkdebug_refresh_filename(BlockDriverState *bs)
{
QDict *opts;
@@ -779,6 +782,7 @@ static BlockDriver bdrv_blkdebug = {
.bdrv_file_open = blkdebug_open,
.bdrv_close = blkdebug_close,
.bdrv_getlength = blkdebug_getlength,
+ .bdrv_truncate = blkdebug_truncate,
.bdrv_refresh_filename = blkdebug_refresh_filename,
.bdrv_aio_readv = blkdebug_aio_readv,
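
[Editor's note] The add_rule() change above follows the reworked qemu_opts_foreach() callback signature, which passes an Error ** directly instead of smuggling it through the opaque struct. A small sketch of that pattern under the same assumption; count_events() and the count variable are illustrative and not part of this commit.

/* Illustrative only -- not part of this commit. */
static int count_events(void *opaque, QemuOpts *opts, Error **errp)
{
    int *count = opaque;

    if (!qemu_opt_get(opts, "event")) {
        error_setg(errp, "Missing event name for rule");
        return -1;          /* a non-zero return value stops the iteration */
    }
    (*count)++;
    return 0;
}

/* ... qemu_opts_foreach(&inject_error_opts, count_events, &count, errp); */
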
diff --git a/block/blkverify.c b/block/blkverify.c
index 438dff8bc..d277e6322 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -125,7 +125,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
/* Open the raw file */
assert(bs->file == NULL);
ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options,
- "raw", flags | BDRV_O_PROTOCOL, false, &local_err);
+ "raw", bs, &child_file, false, &local_err);
if (ret < 0) {
error_propagate(errp, local_err);
goto fail;
@@ -134,7 +134,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
/* Open the test file */
assert(s->test_file == NULL);
ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options,
- "test", flags, false, &local_err);
+ "test", bs, &child_format, false, &local_err);
if (ret < 0) {
error_propagate(errp, local_err);
s->test_file = NULL;
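
[Editor's note] Together with the blkdebug hunk above, this switches child opening from raw flags (BDRV_O_PROTOCOL) to the parent-plus-child-role interface. A minimal sketch of the two call shapes, mirroring the hunks; the filename variables are illustrative.

/* Illustrative only -- mirrors the calls above, not new code.
 * child_file names a protocol-level child, child_format a format-level one. */
ret = bdrv_open_image(&bs->file, raw_filename, options, "raw",
                      bs, &child_file, false, &local_err);
ret = bdrv_open_image(&s->test_file, test_filename, options, "test",
                      bs, &child_format, false, &local_err);
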
diff --git a/block/block-backend.c b/block/block-backend.c
index 48b6e4c05..aee8a1202 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -515,6 +515,17 @@ int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
return bdrv_write(blk->bs, sector_num, buf, nb_sectors);
}
+int blk_write_zeroes(BlockBackend *blk, int64_t sector_num,
+ int nb_sectors, BdrvRequestFlags flags)
+{
+ int ret = blk_check_request(blk, sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags);
+}
+
static void error_callback_bh(void *opaque)
{
struct BlockBackendAIOCB *acb = opaque;
@@ -689,6 +700,11 @@ int blk_flush_all(void)
return bdrv_flush_all();
}
+void blk_drain(BlockBackend *blk)
+{
+ bdrv_drain(blk->bs);
+}
+
void blk_drain_all(void)
{
bdrv_drain_all();
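
[Editor's note] block-backend.c gains blk_write_zeroes() and blk_drain() wrappers around the corresponding bdrv_* calls. A hypothetical usage sketch; the helper name and the 2048-sector size are illustrative, not from this commit.

/* Illustrative only -- not part of this commit. */
static int blk_zero_first_mib(BlockBackend *blk)
{
    /* Quiesce just this backend (blk_drain), not every device
     * (blk_drain_all), then zero the first 2048 sectors (1 MiB). */
    blk_drain(blk);
    return blk_write_zeroes(blk, 0, 2048, BDRV_REQ_MAY_UNMAP);
}
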
diff --git a/block/commit.c b/block/commit.c
index cfa2bbebc..7312a5bdc 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -15,6 +15,7 @@
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
+#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
enum {
@@ -186,7 +187,7 @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
CommitBlockJob *s = container_of(job, CommitBlockJob, common);
if (speed < 0) {
- error_set(errp, QERR_INVALID_PARAMETER, "speed");
+ error_setg(errp, QERR_INVALID_PARAMETER, "speed");
return;
}
ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
diff --git a/block/curl.c b/block/curl.c
index bbee3ca17..032cc8ae2 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -22,8 +22,10 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
+#include "qemu/error-report.h"
#include "block/block_int.h"
#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qstring.h"
#include <curl/curl.h>
// #define DEBUG_CURL
@@ -297,6 +299,18 @@ static void curl_multi_check_completion(BDRVCURLState *s)
/* ACBs for successful messages get completed in curl_read_cb */
if (msg->data.result != CURLE_OK) {
int i;
+ static int errcount = 100;
+
+ /* Don't lose the original error message from curl, since
+ * it contains extra data.
+ */
+ if (errcount > 0) {
+ error_report("curl: %s", state->errmsg);
+ if (--errcount == 0) {
+ error_report("curl: further errors suppressed");
+ }
+ }
+
for (i = 0; i < CURL_NUM_ACB; i++) {
CURLAIOCB *acb = state->acb[i];
@@ -304,7 +318,7 @@ static void curl_multi_check_completion(BDRVCURLState *s)
continue;
}
- acb->common.cb(acb->common.opaque, -EIO);
+ acb->common.cb(acb->common.opaque, -EPROTO);
qemu_aio_unref(acb);
state->acb[i] = NULL;
}
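
[Editor's note] The curl change reports the driver's saved CURL error string for the first 100 failures and then goes quiet, and completes failed ACBs with -EPROTO instead of -EIO. The suppression pattern in isolation, as a small sketch; the function name is illustrative and not part of this commit.

/* Illustrative only -- not part of this commit. */
static void report_curl_error(const char *errmsg)
{
    static int errcount = 100;      /* shared budget, as in the hunk above */

    if (errcount > 0) {
        error_report("curl: %s", errmsg);
        if (--errcount == 0) {
            error_report("curl: further errors suppressed");
        }
    }
}
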
diff --git a/block/dmg.c b/block/dmg.c
index 825c49d59..9f2528169 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -24,6 +24,7 @@
#include "qemu-common.h"
#include "block/block_int.h"
#include "qemu/bswap.h"
+#include "qemu/error-report.h"
#include "qemu/module.h"
#include <zlib.h>
#ifdef CONFIG_BZIP2
diff --git a/block/io.c b/block/io.c
new file mode 100644
index 000000000..d4bc83b33
--- /dev/null
+++ b/block/io.c
@@ -0,0 +1,2610 @@
+/*
+ * Block layer I/O functions
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "trace.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+#include "block/throttle-groups.h"
+#include "qemu/error-report.h"
+
+#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
+
+static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
+static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write);
+static void coroutine_fn bdrv_co_do_rw(void *opaque);
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
+
+/* throttling disk I/O limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+ ThrottleConfig *cfg)
+{
+ int i;
+
+ throttle_group_config(bs, cfg);
+
+ for (i = 0; i < 2; i++) {
+ qemu_co_enter_next(&bs->throttled_reqs[i]);
+ }
+}
+
+/* this function drains all the throttled IOs */
+static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
+{
+ bool drained = false;
+ bool enabled = bs->io_limits_enabled;
+ int i;
+
+ bs->io_limits_enabled = false;
+
+ for (i = 0; i < 2; i++) {
+ while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
+ drained = true;
+ }
+ }
+
+ bs->io_limits_enabled = enabled;
+
+ return drained;
+}
+
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+ bs->io_limits_enabled = false;
+ bdrv_start_throttled_reqs(bs);
+ throttle_group_unregister_bs(bs);
+}
+
+/* should be called before bdrv_set_io_limits if a limit is set */
+void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
+{
+ assert(!bs->io_limits_enabled);
+ throttle_group_register_bs(bs, group);
+ bs->io_limits_enabled = true;
+}
+
+void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
+{
+ /* this bs is not part of any group */
+ if (!bs->throttle_state) {
+ return;
+ }
+
+    /* this bs is a part of the same group as the one we want */
+ if (!g_strcmp0(throttle_group_get_name(bs), group)) {
+ return;
+ }
+
+    /* need to change the group this bs belongs to */
+ bdrv_io_limits_disable(bs);
+ bdrv_io_limits_enable(bs, group);
+}
+
+void bdrv_setup_io_funcs(BlockDriver *bdrv)
+{
+ /* Block drivers without coroutine functions need emulation */
+ if (!bdrv->bdrv_co_readv) {
+ bdrv->bdrv_co_readv = bdrv_co_readv_em;
+ bdrv->bdrv_co_writev = bdrv_co_writev_em;
+
+        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
+ * the block driver lacks aio we need to emulate that too.
+ */
+ if (!bdrv->bdrv_aio_readv) {
+ /* add AIO emulation layer */
+ bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
+ bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
+ }
+ }
+}
+
+void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+ BlockDriver *drv = bs->drv;
+ Error *local_err = NULL;
+
+ memset(&bs->bl, 0, sizeof(bs->bl));
+
+ if (!drv) {
+ return;
+ }
+
+ /* Take some limits from the children as a default */
+ if (bs->file) {
+ bdrv_refresh_limits(bs->file, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
+ bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
+ bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment;
+ bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
+ } else {
+ bs->bl.min_mem_alignment = 512;
+ bs->bl.opt_mem_alignment = getpagesize();
+ }
+
+ if (bs->backing_hd) {
+ bdrv_refresh_limits(bs->backing_hd, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ bs->bl.opt_transfer_length =
+ MAX(bs->bl.opt_transfer_length,
+ bs->backing_hd->bl.opt_transfer_length);
+ bs->bl.max_transfer_length =
+ MIN_NON_ZERO(bs->bl.max_transfer_length,
+ bs->backing_hd->bl.max_transfer_length);
+ bs->bl.opt_mem_alignment =
+ MAX(bs->bl.opt_mem_alignment,
+ bs->backing_hd->bl.opt_mem_alignment);
+ bs->bl.min_mem_alignment =
+ MAX(bs->bl.min_mem_alignment,
+ bs->backing_hd->bl.min_mem_alignment);
+ }
+
+ /* Then let the driver override it */
+ if (drv->bdrv_refresh_limits) {
+ drv->bdrv_refresh_limits(bs, errp);
+ }
+}
+
+/**
+ * The copy-on-read flag is actually a reference count so multiple users may
+ * use the feature without worrying about clobbering its previous state.
+ * Copy-on-read stays enabled until all users have called to disable it.
+ */
+void bdrv_enable_copy_on_read(BlockDriverState *bs)
+{
+ bs->copy_on_read++;
+}
+
+void bdrv_disable_copy_on_read(BlockDriverState *bs)
+{
+ assert(bs->copy_on_read > 0);
+ bs->copy_on_read--;
+}
+
+/* Check if any requests are in-flight (including throttled requests) */
+static bool bdrv_requests_pending(BlockDriverState *bs)
+{
+ if (!QLIST_EMPTY(&bs->tracked_requests)) {
+ return true;
+ }
+ if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
+ return true;
+ }
+ if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
+ return true;
+ }
+ if (bs->file && bdrv_requests_pending(bs->file)) {
+ return true;
+ }
+ if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Wait for pending requests to complete on a single BlockDriverState subtree
+ *
+ * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
+ * AioContext.
+ *
+ * Only this BlockDriverState's AioContext is run, so in-flight requests must
+ * not depend on events in other AioContexts. In that case, use
+ * bdrv_drain_all() instead.
+ */
+void bdrv_drain(BlockDriverState *bs)
+{
+ bool busy = true;
+
+ while (busy) {
+ /* Keep iterating */
+ bdrv_flush_io_queue(bs);
+ busy = bdrv_requests_pending(bs);
+ busy |= aio_poll(bdrv_get_aio_context(bs), busy);
+ }
+}
+
+/*
+ * Wait for pending requests to complete across all BlockDriverStates
+ *
+ * This function does not flush data to disk, use bdrv_flush_all() for that
+ * after calling this function.
+ */
+void bdrv_drain_all(void)
+{
+ /* Always run first iteration so any pending completion BHs run */
+ bool busy = true;
+ BlockDriverState *bs = NULL;
+ GSList *aio_ctxs = NULL, *ctx;
+
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ aio_context_acquire(aio_context);
+ if (bs->job) {
+ block_job_pause(bs->job);
+ }
+ aio_context_release(aio_context);
+
+ if (!g_slist_find(aio_ctxs, aio_context)) {
+ aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
+ }
+ }
+
+ /* Note that completion of an asynchronous I/O operation can trigger any
+ * number of other I/O operations on other devices---for example a
+ * coroutine can submit an I/O request to another device in response to
+ * request completion. Therefore we must keep looping until there was no
+ * more activity rather than simply draining each device independently.
+ */
+ while (busy) {
+ busy = false;
+
+ for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
+ AioContext *aio_context = ctx->data;
+ bs = NULL;
+
+ aio_context_acquire(aio_context);
+ while ((bs = bdrv_next(bs))) {
+ if (aio_context == bdrv_get_aio_context(bs)) {
+ bdrv_flush_io_queue(bs);
+ if (bdrv_requests_pending(bs)) {
+ busy = true;
+ aio_poll(aio_context, busy);
+ }
+ }
+ }
+ busy |= aio_poll(aio_context, false);
+ aio_context_release(aio_context);
+ }
+ }
+
+ bs = NULL;
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ aio_context_acquire(aio_context);
+ if (bs->job) {
+ block_job_resume(bs->job);
+ }
+ aio_context_release(aio_context);
+ }
+ g_slist_free(aio_ctxs);
+}
+
+/**
+ * Remove an active request from the tracked requests list
+ *
+ * This function should be called when a tracked request is completing.
+ */
+static void tracked_request_end(BdrvTrackedRequest *req)
+{
+ if (req->serialising) {
+ req->bs->serialising_in_flight--;
+ }
+
+ QLIST_REMOVE(req, list);
+ qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+/**
+ * Add an active request to the tracked requests list
+ */
+static void tracked_request_begin(BdrvTrackedRequest *req,
+ BlockDriverState *bs,
+ int64_t offset,
+ unsigned int bytes, bool is_write)
+{
+ *req = (BdrvTrackedRequest){
+ .bs = bs,
+ .offset = offset,
+ .bytes = bytes,
+ .is_write = is_write,
+ .co = qemu_coroutine_self(),
+ .serialising = false,
+ .overlap_offset = offset,
+ .overlap_bytes = bytes,
+ };
+
+ qemu_co_queue_init(&req->wait_queue);
+
+ QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
+}
+
+static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
+{
+ int64_t overlap_offset = req->offset & ~(align - 1);
+ unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
+ - overlap_offset;
+
+ if (!req->serialising) {
+ req->bs->serialising_in_flight++;
+ req->serialising = true;
+ }
+
+ req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
+ req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
+}
+
+/**
+ * Round a region to cluster boundaries
+ */
+void bdrv_round_to_clusters(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ int64_t *cluster_sector_num,
+ int *cluster_nb_sectors)
+{
+ BlockDriverInfo bdi;
+
+ if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
+ *cluster_sector_num = sector_num;
+ *cluster_nb_sectors = nb_sectors;
+ } else {
+ int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
+ *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
+ *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
+ nb_sectors, c);
+ }
+}
+
+static int bdrv_get_cluster_size(BlockDriverState *bs)
+{
+ BlockDriverInfo bdi;
+ int ret;
+
+ ret = bdrv_get_info(bs, &bdi);
+ if (ret < 0 || bdi.cluster_size == 0) {
+ return bs->request_alignment;
+ } else {
+ return bdi.cluster_size;
+ }
+}
+
+static bool tracked_request_overlaps(BdrvTrackedRequest *req,
+ int64_t offset, unsigned int bytes)
+{
+ /* aaaa bbbb */
+ if (offset >= req->overlap_offset + req->overlap_bytes) {
+ return false;
+ }
+ /* bbbb aaaa */
+ if (req->overlap_offset >= offset + bytes) {
+ return false;
+ }
+ return true;
+}
+
+static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
+{
+ BlockDriverState *bs = self->bs;
+ BdrvTrackedRequest *req;
+ bool retry;
+ bool waited = false;
+
+ if (!bs->serialising_in_flight) {
+ return false;
+ }
+
+ do {
+ retry = false;
+ QLIST_FOREACH(req, &bs->tracked_requests, list) {
+ if (req == self || (!req->serialising && !self->serialising)) {
+ continue;
+ }
+ if (tracked_request_overlaps(req, self->overlap_offset,
+ self->overlap_bytes))
+ {
+ /* Hitting this means there was a reentrant request, for
+ * example, a block driver issuing nested requests. This must
+ * never happen since it means deadlock.
+ */
+ assert(qemu_coroutine_self() != req->co);
+
+ /* If the request is already (indirectly) waiting for us, or
+ * will wait for us as soon as it wakes up, then just go on
+ * (instead of producing a deadlock in the former case). */
+ if (!req->waiting_for) {
+ self->waiting_for = req;
+ qemu_co_queue_wait(&req->wait_queue);
+ self->waiting_for = NULL;
+ retry = true;
+ waited = true;
+ break;
+ }
+ }
+ }
+ } while (retry);
+
+ return waited;
+}
+
+static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
+ size_t size)
+{
+ if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
+ return -EIO;
+ }
+
+ if (!bdrv_is_inserted(bs)) {
+ return -ENOMEDIUM;
+ }
+
+ if (offset < 0) {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EIO;
+ }
+
+ return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
+ nb_sectors * BDRV_SECTOR_SIZE);
+}
+
+typedef struct RwCo {
+ BlockDriverState *bs;
+ int64_t offset;
+ QEMUIOVector *qiov;
+ bool is_write;
+ int ret;
+ BdrvRequestFlags flags;
+} RwCo;
+
+static void coroutine_fn bdrv_rw_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ if (!rwco->is_write) {
+ rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
+ rwco->qiov->size, rwco->qiov,
+ rwco->flags);
+ } else {
+ rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
+ rwco->qiov->size, rwco->qiov,
+ rwco->flags);
+ }
+}
+
+/*
+ * Process a vectored synchronous request using coroutines
+ */
+static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
+ QEMUIOVector *qiov, bool is_write,
+ BdrvRequestFlags flags)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .offset = offset,
+ .qiov = qiov,
+ .is_write = is_write,
+ .ret = NOT_DONE,
+ .flags = flags,
+ };
+
+ /**
+ * In sync call context, when the vcpu is blocked, this throttling timer
+ * will not fire; so the I/O throttling function has to be disabled here
+ * if it has been enabled.
+ */
+ if (bs->io_limits_enabled) {
+ fprintf(stderr, "Disabling I/O throttling on '%s' due "
+ "to synchronous I/O.\n", bdrv_get_device_name(bs));
+ bdrv_io_limits_disable(bs);
+ }
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_rw_co_entry(&rwco);
+ } else {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ co = qemu_coroutine_create(bdrv_rw_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ aio_poll(aio_context, true);
+ }
+ }
+ return rwco.ret;
+}
+
+/*
+ * Process a synchronous request using coroutines
+ */
+static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
+ int nb_sectors, bool is_write, BdrvRequestFlags flags)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+ };
+
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EINVAL;
+ }
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
+ &qiov, is_write, flags);
+}
+
+/* return < 0 if error. See bdrv_write() for the return codes */
+int bdrv_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
+}
+
+/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
+int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ bool enabled;
+ int ret;
+
+ enabled = bs->io_limits_enabled;
+ bs->io_limits_enabled = false;
+ ret = bdrv_read(bs, sector_num, buf, nb_sectors);
+ bs->io_limits_enabled = enabled;
+ return ret;
+}
+
+/* Return < 0 if error. Important errors are:
+ -EIO generic I/O error (may happen for all errors)
+ -ENOMEDIUM No media inserted.
+ -EINVAL Invalid sector number or nb_sectors
+ -EACCES Trying to write a read-only device
+*/
+int bdrv_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
+}
+
+int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, BdrvRequestFlags flags)
+{
+ return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
+ BDRV_REQ_ZERO_WRITE | flags);
+}
+
+/*
+ * Completely zero out a block device with the help of bdrv_write_zeroes.
+ * The operation is sped up by checking the block status and only writing
+ * zeroes to the device if they currently do not return zeroes. Optional
+ * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
+ *
+ * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
+ */
+int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
+{
+ int64_t target_sectors, ret, nb_sectors, sector_num = 0;
+ int n;
+
+ target_sectors = bdrv_nb_sectors(bs);
+ if (target_sectors < 0) {
+ return target_sectors;
+ }
+
+ for (;;) {
+ nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
+ if (nb_sectors <= 0) {
+ return 0;
+ }
+ ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
+ if (ret < 0) {
+ error_report("error getting block status at sector %" PRId64 ": %s",
+ sector_num, strerror(-ret));
+ return ret;
+ }
+ if (ret & BDRV_BLOCK_ZERO) {
+ sector_num += n;
+ continue;
+ }
+ ret = bdrv_write_zeroes(bs, sector_num, n, flags);
+ if (ret < 0) {
+ error_report("error writing zeroes at sector %" PRId64 ": %s",
+ sector_num, strerror(-ret));
+ return ret;
+ }
+ sector_num += n;
+ }
+}
+
+int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = bytes,
+ };
+ int ret;
+
+ if (bytes < 0) {
+ return -EINVAL;
+ }
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bytes;
+}
+
+int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
+{
+ int ret;
+
+ ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return qiov->size;
+}
+
+int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
+ const void *buf, int bytes)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = bytes,
+ };
+
+ if (bytes < 0) {
+ return -EINVAL;
+ }
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_pwritev(bs, offset, &qiov);
+}
+
+/*
+ * Writes to the file and ensures that no writes are reordered across this
+ * request (acts as a barrier)
+ *
+ * Returns 0 on success, -errno in error cases.
+ */
+int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
+ const void *buf, int count)
+{
+ int ret;
+
+ ret = bdrv_pwrite(bs, offset, buf, count);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* No flush needed for cache modes that already do it */
+ if (bs->enable_write_cache) {
+ bdrv_flush(bs);
+ }
+
+ return 0;
+}
+
+static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ /* Perform I/O through a temporary buffer so that users who scribble over
+ * their read buffer while the operation is in progress do not end up
+ * modifying the image file. This is critical for zero-copy guest I/O
+ * where anything might happen inside guest memory.
+ */
+ void *bounce_buffer;
+
+ BlockDriver *drv = bs->drv;
+ struct iovec iov;
+ QEMUIOVector bounce_qiov;
+ int64_t cluster_sector_num;
+ int cluster_nb_sectors;
+ size_t skip_bytes;
+ int ret;
+
+ /* Cover entire cluster so no additional backing file I/O is required when
+ * allocating cluster in the image file.
+ */
+ bdrv_round_to_clusters(bs, sector_num, nb_sectors,
+ &cluster_sector_num, &cluster_nb_sectors);
+
+ trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
+ cluster_sector_num, cluster_nb_sectors);
+
+ iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
+ iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
+ if (bounce_buffer == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+ ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ if (ret < 0) {
+ goto err;
+ }
+
+ if (drv->bdrv_co_write_zeroes &&
+ buffer_is_zero(bounce_buffer, iov.iov_len)) {
+ ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
+ cluster_nb_sectors, 0);
+ } else {
+ /* This does not change the data on the disk, it is not necessary
+ * to flush even in cache=writethrough mode.
+ */
+ ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ }
+
+ if (ret < 0) {
+ /* It might be okay to ignore write errors for guest requests. If this
+ * is a deliberate copy-on-read then we don't want to ignore the error.
+ * Simply report it in all cases.
+ */
+ goto err;
+ }
+
+ skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
+ qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
+ nb_sectors * BDRV_SECTOR_SIZE);
+
+err:
+ qemu_vfree(bounce_buffer);
+ return ret;
+}
+
+/*
+ * Forwards an already correctly aligned request to the BlockDriver. This
+ * handles copy on read and zeroing after EOF; any other features must be
+ * implemented by the caller.
+ */
+static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
+ BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+ int64_t align, QEMUIOVector *qiov, int flags)
+{
+ BlockDriver *drv = bs->drv;
+ int ret;
+
+ int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+ unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+
+ assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert(!qiov || bytes == qiov->size);
+
+ /* Handle Copy on Read and associated serialisation */
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ /* If we touch the same cluster it counts as an overlap. This
+ * guarantees that allocating writes will be serialized and not race
+ * with each other for the same cluster. For example, in copy-on-read
+ * it ensures that the CoR read and write operations are atomic and
+ * guest writes cannot interleave between them. */
+ mark_request_serialising(req, bdrv_get_cluster_size(bs));
+ }
+
+ wait_serialising_requests(req);
+
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ int pnum;
+
+ ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (!ret || pnum != nb_sectors) {
+ ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
+ goto out;
+ }
+ }
+
+ /* Forward the request to the BlockDriver */
+ if (!bs->zero_beyond_eof) {
+ ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+ } else {
+ /* Read zeros after EOF */
+ int64_t total_sectors, max_nb_sectors;
+
+ total_sectors = bdrv_nb_sectors(bs);
+ if (total_sectors < 0) {
+ ret = total_sectors;
+ goto out;
+ }
+
+ max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
+ align >> BDRV_SECTOR_BITS);
+ if (nb_sectors < max_nb_sectors) {
+ ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+ } else if (max_nb_sectors > 0) {
+ QEMUIOVector local_qiov;
+
+ qemu_iovec_init(&local_qiov, qiov->niov);
+ qemu_iovec_concat(&local_qiov, qiov, 0,
+ max_nb_sectors * BDRV_SECTOR_SIZE);
+
+ ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
+ &local_qiov);
+
+ qemu_iovec_destroy(&local_qiov);
+ } else {
+ ret = 0;
+ }
+
+ /* Reading beyond end of file is supposed to produce zeroes */
+ if (ret == 0 && total_sectors < sector_num + nb_sectors) {
+ uint64_t offset = MAX(0, total_sectors - sector_num);
+ uint64_t bytes = (sector_num + nb_sectors - offset) *
+ BDRV_SECTOR_SIZE;
+ qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
+ }
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Handle a read request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BlockDriver *drv = bs->drv;
+ BdrvTrackedRequest req;
+
+ /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+ uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+ uint8_t *head_buf = NULL;
+ uint8_t *tail_buf = NULL;
+ QEMUIOVector local_qiov;
+ bool use_local_qiov = false;
+ int ret;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+
+ ret = bdrv_check_byte_request(bs, offset, bytes);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (bs->copy_on_read) {
+ flags |= BDRV_REQ_COPY_ON_READ;
+ }
+
+ /* throttling disk I/O */
+ if (bs->io_limits_enabled) {
+ throttle_group_co_io_limits_intercept(bs, bytes, false);
+ }
+
+ /* Align read if necessary by padding qiov */
+ if (offset & (align - 1)) {
+ head_buf = qemu_blockalign(bs, align);
+ qemu_iovec_init(&local_qiov, qiov->niov + 2);
+ qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+
+ bytes += offset & (align - 1);
+ offset = offset & ~(align - 1);
+ }
+
+ if ((offset + bytes) & (align - 1)) {
+ if (!use_local_qiov) {
+ qemu_iovec_init(&local_qiov, qiov->niov + 1);
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+ }
+ tail_buf = qemu_blockalign(bs, align);
+ qemu_iovec_add(&local_qiov, tail_buf,
+ align - ((offset + bytes) & (align - 1)));
+
+ bytes = ROUND_UP(bytes, align);
+ }
+
+ tracked_request_begin(&req, bs, offset, bytes, false);
+ ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
+ use_local_qiov ? &local_qiov : qiov,
+ flags);
+ tracked_request_end(&req);
+
+ if (use_local_qiov) {
+ qemu_iovec_destroy(&local_qiov);
+ qemu_vfree(head_buf);
+ qemu_vfree(tail_buf);
+ }
+
+ return ret;
+}
+
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EINVAL;
+ }
+
+ return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_readv(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+ BDRV_REQ_COPY_ON_READ);
+}
+
+#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
+
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+{
+ BlockDriver *drv = bs->drv;
+ QEMUIOVector qiov;
+ struct iovec iov = {0};
+ int ret = 0;
+
+ int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
+ BDRV_REQUEST_MAX_SECTORS);
+
+ while (nb_sectors > 0 && !ret) {
+ int num = nb_sectors;
+
+ /* Align request. Block drivers can expect the "bulk" of the request
+ * to be aligned.
+ */
+ if (bs->bl.write_zeroes_alignment
+ && num > bs->bl.write_zeroes_alignment) {
+ if (sector_num % bs->bl.write_zeroes_alignment != 0) {
+ /* Make a small request up to the first aligned sector. */
+ num = bs->bl.write_zeroes_alignment;
+ num -= sector_num % bs->bl.write_zeroes_alignment;
+ } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
+ /* Shorten the request to the last aligned sector. num cannot
+ * underflow because num > bs->bl.write_zeroes_alignment.
+ */
+ num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
+ }
+ }
+
+ /* limit request size */
+ if (num > max_write_zeroes) {
+ num = max_write_zeroes;
+ }
+
+ ret = -ENOTSUP;
+ /* First try the efficient write zeroes operation */
+ if (drv->bdrv_co_write_zeroes) {
+ ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
+ }
+
+ if (ret == -ENOTSUP) {
+ /* Fall back to bounce buffer if write zeroes is unsupported */
+ int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
+ MAX_WRITE_ZEROES_BOUNCE_BUFFER);
+ num = MIN(num, max_xfer_len);
+ iov.iov_len = num * BDRV_SECTOR_SIZE;
+ if (iov.iov_base == NULL) {
+ iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
+ if (iov.iov_base == NULL) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
+ }
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
+
+ /* Keep bounce buffer around if it is big enough for all
+             * future requests.
+ */
+ if (num < max_xfer_len) {
+ qemu_vfree(iov.iov_base);
+ iov.iov_base = NULL;
+ }
+ }
+
+ sector_num += num;
+ nb_sectors -= num;
+ }
+
+fail:
+ qemu_vfree(iov.iov_base);
+ return ret;
+}
+
+/*
+ * Forwards an already correctly aligned write request to the BlockDriver.
+ */
+static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
+ BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+ QEMUIOVector *qiov, int flags)
+{
+ BlockDriver *drv = bs->drv;
+ bool waited;
+ int ret;
+
+ int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+ unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+
+ assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert(!qiov || bytes == qiov->size);
+
+ waited = wait_serialising_requests(req);
+ assert(!waited || !req->serialising);
+ assert(req->overlap_offset <= offset);
+ assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
+
+ ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
+
+ if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
+ !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
+ qemu_iovec_is_zero(qiov)) {
+ flags |= BDRV_REQ_ZERO_WRITE;
+ if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
+ flags |= BDRV_REQ_MAY_UNMAP;
+ }
+ }
+
+ if (ret < 0) {
+ /* Do nothing, write notifier decided to fail this request */
+ } else if (flags & BDRV_REQ_ZERO_WRITE) {
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
+ ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
+ } else {
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
+ ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
+
+ if (ret == 0 && !bs->enable_write_cache) {
+ ret = bdrv_co_flush(bs);
+ }
+
+ bdrv_set_dirty(bs, sector_num, nb_sectors);
+
+ block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
+
+ if (ret >= 0) {
+ bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
+ }
+
+ return ret;
+}
+
+static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
+ int64_t offset,
+ unsigned int bytes,
+ BdrvRequestFlags flags,
+ BdrvTrackedRequest *req)
+{
+ uint8_t *buf = NULL;
+ QEMUIOVector local_qiov;
+ struct iovec iov;
+ uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+ unsigned int head_padding_bytes, tail_padding_bytes;
+ int ret = 0;
+
+ head_padding_bytes = offset & (align - 1);
+ tail_padding_bytes = align - ((offset + bytes) & (align - 1));
+
+
+ assert(flags & BDRV_REQ_ZERO_WRITE);
+ if (head_padding_bytes || tail_padding_bytes) {
+ buf = qemu_blockalign(bs, align);
+ iov = (struct iovec) {
+ .iov_base = buf,
+ .iov_len = align,
+ };
+ qemu_iovec_init_external(&local_qiov, &iov, 1);
+ }
+ if (head_padding_bytes) {
+ uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
+
+ /* RMW the unaligned part before head. */
+ mark_request_serialising(req, align);
+ wait_serialising_requests(req);
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
+ ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
+ align, &local_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+
+ memset(buf + head_padding_bytes, 0, zero_bytes);
+ ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
+ &local_qiov,
+ flags & ~BDRV_REQ_ZERO_WRITE);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += zero_bytes;
+ bytes -= zero_bytes;
+ }
+
+ assert(!bytes || (offset & (align - 1)) == 0);
+ if (bytes >= align) {
+ /* Write the aligned part in the middle. */
+ uint64_t aligned_bytes = bytes & ~(align - 1);
+ ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
+ NULL, flags);
+ if (ret < 0) {
+ goto fail;
+ }
+ bytes -= aligned_bytes;
+ offset += aligned_bytes;
+ }
+
+ assert(!bytes || (offset & (align - 1)) == 0);
+ if (bytes) {
+ assert(align == tail_padding_bytes + bytes);
+ /* RMW the unaligned part after tail. */
+ mark_request_serialising(req, align);
+ wait_serialising_requests(req);
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+ ret = bdrv_aligned_preadv(bs, req, offset, align,
+ align, &local_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+
+ memset(buf, 0, bytes);
+ ret = bdrv_aligned_pwritev(bs, req, offset, align,
+ &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
+ }
+fail:
+ qemu_vfree(buf);
+ return ret;
+
+}
+
+/*
+ * Handle a write request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BdrvTrackedRequest req;
+ /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+ uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+ uint8_t *head_buf = NULL;
+ uint8_t *tail_buf = NULL;
+ QEMUIOVector local_qiov;
+ bool use_local_qiov = false;
+ int ret;
+
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ }
+ if (bs->read_only) {
+ return -EPERM;
+ }
+
+ ret = bdrv_check_byte_request(bs, offset, bytes);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* throttling disk I/O */
+ if (bs->io_limits_enabled) {
+ throttle_group_co_io_limits_intercept(bs, bytes, true);
+ }
+
+ /*
+ * Align write if necessary by performing a read-modify-write cycle.
+ * Pad qiov with the read parts and be sure to have a tracked request not
+ * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
+ */
+ tracked_request_begin(&req, bs, offset, bytes, true);
+
+ if (!qiov) {
+ ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
+ goto out;
+ }
+
+ if (offset & (align - 1)) {
+ QEMUIOVector head_qiov;
+ struct iovec head_iov;
+
+ mark_request_serialising(&req, align);
+ wait_serialising_requests(&req);
+
+ head_buf = qemu_blockalign(bs, align);
+ head_iov = (struct iovec) {
+ .iov_base = head_buf,
+ .iov_len = align,
+ };
+ qemu_iovec_init_external(&head_qiov, &head_iov, 1);
+
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
+ ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
+ align, &head_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+
+ qemu_iovec_init(&local_qiov, qiov->niov + 2);
+ qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+
+ bytes += offset & (align - 1);
+ offset = offset & ~(align - 1);
+ }
+
+ if ((offset + bytes) & (align - 1)) {
+ QEMUIOVector tail_qiov;
+ struct iovec tail_iov;
+ size_t tail_bytes;
+ bool waited;
+
+ mark_request_serialising(&req, align);
+ waited = wait_serialising_requests(&req);
+ assert(!waited || !use_local_qiov);
+
+ tail_buf = qemu_blockalign(bs, align);
+ tail_iov = (struct iovec) {
+ .iov_base = tail_buf,
+ .iov_len = align,
+ };
+ qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
+
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+ ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
+ align, &tail_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+
+ if (!use_local_qiov) {
+ qemu_iovec_init(&local_qiov, qiov->niov + 1);
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+ }
+
+ tail_bytes = (offset + bytes) & (align - 1);
+ qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
+
+ bytes = ROUND_UP(bytes, align);
+ }
+
+ ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
+ use_local_qiov ? &local_qiov : qiov,
+ flags);
+
+fail:
+
+ if (use_local_qiov) {
+ qemu_iovec_destroy(&local_qiov);
+ }
+ qemu_vfree(head_buf);
+ qemu_vfree(tail_buf);
+out:
+ tracked_request_end(&req);
+ return ret;
+}
+
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EINVAL;
+ }
+
+ return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_writev(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BdrvRequestFlags flags)
+{
+ trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
+
+ if (!(bs->open_flags & BDRV_O_UNMAP)) {
+ flags &= ~BDRV_REQ_MAY_UNMAP;
+ }
+
+ return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
+ BDRV_REQ_ZERO_WRITE | flags);
+}
+
+int bdrv_flush_all(void)
+{
+ BlockDriverState *bs = NULL;
+ int result = 0;
+
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+ int ret;
+
+ aio_context_acquire(aio_context);
+ ret = bdrv_flush(bs);
+ if (ret < 0 && !result) {
+ result = ret;
+ }
+ aio_context_release(aio_context);
+ }
+
+ return result;
+}
+
+typedef struct BdrvCoGetBlockStatusData {
+ BlockDriverState *bs;
+ BlockDriverState *base;
+ int64_t sector_num;
+ int nb_sectors;
+ int *pnum;
+ int64_t ret;
+ bool done;
+} BdrvCoGetBlockStatusData;
+
+/*
+ * Returns the allocation status of the specified sectors.
+ * Drivers not implementing the functionality are assumed to not support
+ * backing files, hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */
+static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ int64_t total_sectors;
+ int64_t n;
+ int64_t ret, ret2;
+
+ total_sectors = bdrv_nb_sectors(bs);
+ if (total_sectors < 0) {
+ return total_sectors;
+ }
+
+ if (sector_num >= total_sectors) {
+ *pnum = 0;
+ return 0;
+ }
+
+ n = total_sectors - sector_num;
+ if (n < nb_sectors) {
+ nb_sectors = n;
+ }
+
+ if (!bs->drv->bdrv_co_get_block_status) {
+ *pnum = nb_sectors;
+ ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
+ if (bs->drv->protocol_name) {
+ ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
+ }
+ return ret;
+ }
+
+ ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
+ if (ret < 0) {
+ *pnum = 0;
+ return ret;
+ }
+
+ if (ret & BDRV_BLOCK_RAW) {
+ assert(ret & BDRV_BLOCK_OFFSET_VALID);
+ return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ *pnum, pnum);
+ }
+
+ if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
+ ret |= BDRV_BLOCK_ALLOCATED;
+ } else {
+ if (bdrv_unallocated_blocks_are_zero(bs)) {
+ ret |= BDRV_BLOCK_ZERO;
+ } else if (bs->backing_hd) {
+ BlockDriverState *bs2 = bs->backing_hd;
+ int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
+ if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
+ ret |= BDRV_BLOCK_ZERO;
+ }
+ }
+ }
+
+ if (bs->file &&
+ (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
+ (ret & BDRV_BLOCK_OFFSET_VALID)) {
+ int file_pnum;
+
+ ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ *pnum, &file_pnum);
+ if (ret2 >= 0) {
+ /* Ignore errors. This is just providing extra information, it
+ * is useful but not necessary.
+ */
+ if (!file_pnum) {
+ /* !file_pnum indicates an offset at or beyond the EOF; it is
+ * perfectly valid for the format block driver to point to such
+ * offsets, so catch it and mark everything as zero */
+ ret |= BDRV_BLOCK_ZERO;
+ } else {
+ /* Limit request to the range reported by the protocol driver */
+ *pnum = file_pnum;
+ ret |= (ret2 & BDRV_BLOCK_ZERO);
+ }
+ }
+ }
+
+ return ret;
+}
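As a minimal caller sketch (illustrative only, not part of this diff), the status bits and *pnum returned above can be consumed in a loop to map out an image's extents. It assumes the block-layer declarations used in this file plus <stdio.h>/<inttypes.h>, and an already opened BlockDriverState:

static void dump_extents(BlockDriverState *bs)
{
    int64_t total = bdrv_nb_sectors(bs);
    int64_t sector = 0;

    while (sector < total) {
        int pnum;
        int64_t status = bdrv_get_block_status(bs, sector,
                                               MIN(total - sector,
                                                   BDRV_REQUEST_MAX_SECTORS),
                                               &pnum);
        if (status < 0 || pnum == 0) {
            break;  /* error, or sector at/past the end of the image */
        }
        printf("%" PRId64 " +%d %s%s\n", sector, pnum,
               status & BDRV_BLOCK_ALLOCATED ? "allocated " : "unallocated ",
               status & BDRV_BLOCK_ZERO ? "zero" : "data");
        sector += pnum;   /* pnum sectors share the reported state */
    }
}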
+
+static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors,
+ int *pnum)
+{
+ BlockDriverState *p;
+ int64_t ret = 0;
+
+ assert(bs != base);
+ for (p = bs; p != base; p = p->backing_hd) {
+ ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum);
+ if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
+ break;
+ }
+ /* [sector_num, pnum] unallocated on this layer, which could be only
+ * the first part of [sector_num, nb_sectors]. */
+ nb_sectors = MIN(nb_sectors, *pnum);
+ }
+ return ret;
+}
+
+/* Coroutine wrapper for bdrv_get_block_status_above() */
+static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
+{
+ BdrvCoGetBlockStatusData *data = opaque;
+
+ data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
+ data->sector_num,
+ data->nb_sectors,
+ data->pnum);
+ data->done = true;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_get_block_status_above().
+ *
+ * See bdrv_co_get_block_status_above() for details.
+ */
+int64_t bdrv_get_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ Coroutine *co;
+ BdrvCoGetBlockStatusData data = {
+ .bs = bs,
+ .base = base,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .pnum = pnum,
+ .done = false,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_get_block_status_above_co_entry(&data);
+ } else {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
+ qemu_coroutine_enter(co, &data);
+ while (!data.done) {
+ aio_poll(aio_context, true);
+ }
+ }
+ return data.ret;
+}
+
+int64_t bdrv_get_block_status(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ return bdrv_get_block_status_above(bs, bs->backing_hd,
+ sector_num, nb_sectors, pnum);
+}
+
+int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
+ if (ret < 0) {
+ return ret;
+ }
+ return !!(ret & BDRV_BLOCK_ALLOCATED);
+}
+
+/*
+ * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
+ *
+ * Return true if the given sector is allocated in any image between
+ * BASE and TOP (inclusive). BASE can be NULL to check if the given
+ * sector is allocated in any image of the chain. Return false otherwise.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ */
+int bdrv_is_allocated_above(BlockDriverState *top,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ BlockDriverState *intermediate;
+ int ret, n = nb_sectors;
+
+ intermediate = top;
+ while (intermediate && intermediate != base) {
+ int pnum_inter;
+ ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
+ &pnum_inter);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ *pnum = pnum_inter;
+ return 1;
+ }
+
+ /*
+ * [sector_num, nb_sectors] is unallocated on top but intermediate
+ * might have
+ *
+ * [sector_num+x, nb_sectors] allocated.
+ */
+ if (n > pnum_inter &&
+ (intermediate == top ||
+ sector_num + pnum_inter < intermediate->total_sectors)) {
+ n = pnum_inter;
+ }
+
+ intermediate = intermediate->backing_hd;
+ }
+
+ *pnum = n;
+ return 0;
+}
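For illustration only (not part of the diff), a sparse-copy style caller could use the helper above to skip holes; passing base == NULL checks the whole backing chain as described in the comment. <limits.h> is assumed for INT_MAX:

static int walk_allocated(BlockDriverState *top, int64_t nb_sectors)
{
    int64_t sector = 0;

    while (sector < nb_sectors) {
        int n;
        int ret = bdrv_is_allocated_above(top, NULL, sector,
                                          MIN(nb_sectors - sector, INT_MAX),
                                          &n);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            /* [sector, sector + n) is allocated in some layer of the
             * chain: read it through 'top' and copy it out here. */
        }
        sector += n;   /* either way, n sectors share this state */
    }
    return 0;
}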
+
+int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BlockDriver *drv = bs->drv;
+ int ret;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (!drv->bdrv_write_compressed) {
+ return -ENOTSUP;
+ }
+ ret = bdrv_check_request(bs, sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ assert(QLIST_EMPTY(&bs->dirty_bitmaps));
+
+ return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
+}
+
+int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
+ int64_t pos, int size)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = size,
+ };
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_writev_vmstate(bs, &qiov, pos);
+}
+
+int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ } else if (drv->bdrv_save_vmstate) {
+ return drv->bdrv_save_vmstate(bs, qiov, pos);
+ } else if (bs->file) {
+ return bdrv_writev_vmstate(bs->file, qiov, pos);
+ }
+
+ return -ENOTSUP;
+}
+
+int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (drv->bdrv_load_vmstate)
+ return drv->bdrv_load_vmstate(bs, buf, pos, size);
+ if (bs->file)
+ return bdrv_load_vmstate(bs->file, buf, pos, size);
+ return -ENOTSUP;
+}
+
+/**************************************************************/
+/* async I/Os */
+
+BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
+ cb, opaque, false);
+}
+
+BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
+ cb, opaque, true);
+}
+
+BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
+ BDRV_REQ_ZERO_WRITE | flags,
+ cb, opaque, true);
+}
+
+
+typedef struct MultiwriteCB {
+ int error;
+ int num_requests;
+ int num_callbacks;
+ struct {
+ BlockCompletionFunc *cb;
+ void *opaque;
+ QEMUIOVector *free_qiov;
+ } callbacks[];
+} MultiwriteCB;
+
+static void multiwrite_user_cb(MultiwriteCB *mcb)
+{
+ int i;
+
+ for (i = 0; i < mcb->num_callbacks; i++) {
+ mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
+ if (mcb->callbacks[i].free_qiov) {
+ qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
+ }
+ g_free(mcb->callbacks[i].free_qiov);
+ }
+}
+
+static void multiwrite_cb(void *opaque, int ret)
+{
+ MultiwriteCB *mcb = opaque;
+
+ trace_multiwrite_cb(mcb, ret);
+
+ if (ret < 0 && !mcb->error) {
+ mcb->error = ret;
+ }
+
+ mcb->num_requests--;
+ if (mcb->num_requests == 0) {
+ multiwrite_user_cb(mcb);
+ g_free(mcb);
+ }
+}
+
+static int multiwrite_req_compare(const void *a, const void *b)
+{
+ const BlockRequest *req1 = a, *req2 = b;
+
+ /*
+ * Note that we can't simply subtract req2->sector from req1->sector
+ * here as that could overflow the return value.
+ */
+ if (req1->sector > req2->sector) {
+ return 1;
+ } else if (req1->sector < req2->sector) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Takes a bunch of requests and tries to merge them. Returns the number of
+ * requests that remain after merging.
+ */
+static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
+ int num_reqs, MultiwriteCB *mcb)
+{
+ int i, outidx;
+
+ // Sort requests by start sector
+ qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
+
+ // Check if adjacent requests touch the same clusters. If so, combine them,
+ // filling up gaps with zero sectors.
+ outidx = 0;
+ for (i = 1; i < num_reqs; i++) {
+ int merge = 0;
+ int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
+
+ // Handle exactly sequential writes and overlapping writes.
+ if (reqs[i].sector <= oldreq_last) {
+ merge = 1;
+ }
+
+ if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
+ merge = 0;
+ }
+
+ if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
+ reqs[i].nb_sectors > bs->bl.max_transfer_length) {
+ merge = 0;
+ }
+
+ if (merge) {
+ size_t size;
+ QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
+ qemu_iovec_init(qiov,
+ reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
+
+ // Add the first request to the merged one. If the requests are
+ // overlapping, drop the last sectors of the first request.
+ size = (reqs[i].sector - reqs[outidx].sector) << 9;
+ qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
+
+ // We shouldn't need to add any zeros between the two requests
+ assert (reqs[i].sector <= oldreq_last);
+
+ // Add the second request
+ qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
+
+ // Add tail of first request, if necessary
+ if (qiov->size < reqs[outidx].qiov->size) {
+ qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
+ reqs[outidx].qiov->size - qiov->size);
+ }
+
+ reqs[outidx].nb_sectors = qiov->size >> 9;
+ reqs[outidx].qiov = qiov;
+
+ mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
+ } else {
+ outidx++;
+ reqs[outidx].sector = reqs[i].sector;
+ reqs[outidx].nb_sectors = reqs[i].nb_sectors;
+ reqs[outidx].qiov = reqs[i].qiov;
+ }
+ }
+
+ block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+
+ return outidx + 1;
+}
+
+/*
+ * Submit multiple AIO write requests at once.
+ *
+ * On success, the function returns 0 and all requests in the reqs array have
+ * been submitted. In the error case this function returns -1, and any of the
+ * requests may or may not have been submitted yet. In particular, this means that
+ * the callback will be called for some of the requests and not for others. The
+ * caller must check the error field of the BlockRequest to wait for the right
+ * callbacks (if error != 0, no callback will be called).
+ *
+ * The implementation may modify the contents of the reqs array, e.g. to merge
+ * requests. However, the fields opaque and error are left unmodified as they
+ * are used to signal failure for a single request to the caller.
+ */
+int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
+{
+ MultiwriteCB *mcb;
+ int i;
+
+ /* don't submit writes if we don't have a medium */
+ if (bs->drv == NULL) {
+ for (i = 0; i < num_reqs; i++) {
+ reqs[i].error = -ENOMEDIUM;
+ }
+ return -1;
+ }
+
+ if (num_reqs == 0) {
+ return 0;
+ }
+
+ // Create MultiwriteCB structure
+ mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
+ mcb->num_requests = 0;
+ mcb->num_callbacks = num_reqs;
+
+ for (i = 0; i < num_reqs; i++) {
+ mcb->callbacks[i].cb = reqs[i].cb;
+ mcb->callbacks[i].opaque = reqs[i].opaque;
+ }
+
+ // Check for mergable requests
+ num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
+
+ trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
+
+ /* Run the aio requests. */
+ mcb->num_requests = num_reqs;
+ for (i = 0; i < num_reqs; i++) {
+ bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
+ reqs[i].nb_sectors, reqs[i].flags,
+ multiwrite_cb, mcb,
+ true);
+ }
+
+ return 0;
+}
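A hedged usage sketch (not part of the diff): the caller fills a BlockRequest array and submits it in one call, letting multiwrite_merge() above combine adjacent writes. The two initialised QEMUIOVectors (sized in whole sectors) and the 'write_done' completion callback are assumptions supplied by the caller:

static int submit_pair(BlockDriverState *bs,
                       QEMUIOVector *qiov0, QEMUIOVector *qiov1,
                       BlockCompletionFunc *write_done, void *opaque)
{
    BlockRequest reqs[2] = {
        {
            .sector     = 0,
            .nb_sectors = qiov0->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov0,
            .cb         = write_done,
            .opaque     = opaque,
        }, {
            /* starts right after the first request, so the two are
             * candidates for merging into a single write */
            .sector     = qiov0->size >> BDRV_SECTOR_BITS,
            .nb_sectors = qiov1->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov1,
            .cb         = write_done,
            .opaque     = opaque,
        },
    };

    return bdrv_aio_multiwrite(bs, reqs, 2);
}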
+
+void bdrv_aio_cancel(BlockAIOCB *acb)
+{
+ qemu_aio_ref(acb);
+ bdrv_aio_cancel_async(acb);
+ while (acb->refcnt > 1) {
+ if (acb->aiocb_info->get_aio_context) {
+ aio_poll(acb->aiocb_info->get_aio_context(acb), true);
+ } else if (acb->bs) {
+ aio_poll(bdrv_get_aio_context(acb->bs), true);
+ } else {
+ abort();
+ }
+ }
+ qemu_aio_unref(acb);
+}
+
+/* Async version of aio cancel. The caller is not blocked if the acb implements
+ * cancel_async; otherwise we do nothing and let the request complete normally.
+ * In either case the completion callback must be called. */
+void bdrv_aio_cancel_async(BlockAIOCB *acb)
+{
+ if (acb->aiocb_info->cancel_async) {
+ acb->aiocb_info->cancel_async(acb);
+ }
+}
+
+/**************************************************************/
+/* async block device emulation */
+
+typedef struct BlockAIOCBSync {
+ BlockAIOCB common;
+ QEMUBH *bh;
+ int ret;
+ /* vector translation state */
+ QEMUIOVector *qiov;
+ uint8_t *bounce;
+ int is_write;
+} BlockAIOCBSync;
+
+static const AIOCBInfo bdrv_em_aiocb_info = {
+ .aiocb_size = sizeof(BlockAIOCBSync),
+};
+
+static void bdrv_aio_bh_cb(void *opaque)
+{
+ BlockAIOCBSync *acb = opaque;
+
+ if (!acb->is_write && acb->ret >= 0) {
+ qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+ }
+ qemu_vfree(acb->bounce);
+ acb->common.cb(acb->common.opaque, acb->ret);
+ qemu_bh_delete(acb->bh);
+ acb->bh = NULL;
+ qemu_aio_unref(acb);
+}
+
+static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ int is_write)
+
+{
+ BlockAIOCBSync *acb;
+
+ acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
+ acb->is_write = is_write;
+ acb->qiov = qiov;
+ acb->bounce = qemu_try_blockalign(bs, qiov->size);
+ acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
+
+ if (acb->bounce == NULL) {
+ acb->ret = -ENOMEM;
+ } else if (is_write) {
+ qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
+ acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
+ } else {
+ acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
+ }
+
+ qemu_bh_schedule(acb->bh);
+
+ return &acb->common;
+}
+
+static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+
+typedef struct BlockAIOCBCoroutine {
+ BlockAIOCB common;
+ BlockRequest req;
+ bool is_write;
+ bool need_bh;
+ bool *done;
+ QEMUBH* bh;
+} BlockAIOCBCoroutine;
+
+static const AIOCBInfo bdrv_em_co_aiocb_info = {
+ .aiocb_size = sizeof(BlockAIOCBCoroutine),
+};
+
+static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
+{
+ if (!acb->need_bh) {
+ acb->common.cb(acb->common.opaque, acb->req.error);
+ qemu_aio_unref(acb);
+ }
+}
+
+static void bdrv_co_em_bh(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+
+ assert(!acb->need_bh);
+ qemu_bh_delete(acb->bh);
+ bdrv_co_complete(acb);
+}
+
+static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
+{
+ acb->need_bh = false;
+ if (acb->req.error != -EINPROGRESS) {
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
+ qemu_bh_schedule(acb->bh);
+ }
+}
+
+/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
+static void coroutine_fn bdrv_co_do_rw(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ if (!acb->is_write) {
+ acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
+ acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+ } else {
+ acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
+ acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+ }
+
+ bdrv_co_complete(acb);
+}
+
+static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write)
+{
+ Coroutine *co;
+ BlockAIOCBCoroutine *acb;
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->need_bh = true;
+ acb->req.error = -EINPROGRESS;
+ acb->req.sector = sector_num;
+ acb->req.nb_sectors = nb_sectors;
+ acb->req.qiov = qiov;
+ acb->req.flags = flags;
+ acb->is_write = is_write;
+
+ co = qemu_coroutine_create(bdrv_co_do_rw);
+ qemu_coroutine_enter(co, acb);
+
+ bdrv_co_maybe_schedule_bh(acb);
+ return &acb->common;
+}
+
+static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->req.error = bdrv_co_flush(bs);
+ bdrv_co_complete(acb);
+}
+
+BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_flush(bs, opaque);
+
+ Coroutine *co;
+ BlockAIOCBCoroutine *acb;
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->need_bh = true;
+ acb->req.error = -EINPROGRESS;
+
+ co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
+ qemu_coroutine_enter(co, acb);
+
+ bdrv_co_maybe_schedule_bh(acb);
+ return &acb->common;
+}
+
+static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
+ bdrv_co_complete(acb);
+}
+
+BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ Coroutine *co;
+ BlockAIOCBCoroutine *acb;
+
+ trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->need_bh = true;
+ acb->req.error = -EINPROGRESS;
+ acb->req.sector = sector_num;
+ acb->req.nb_sectors = nb_sectors;
+ co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
+ qemu_coroutine_enter(co, acb);
+
+ bdrv_co_maybe_schedule_bh(acb);
+ return &acb->common;
+}
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ BlockAIOCB *acb;
+
+ acb = g_slice_alloc(aiocb_info->aiocb_size);
+ acb->aiocb_info = aiocb_info;
+ acb->bs = bs;
+ acb->cb = cb;
+ acb->opaque = opaque;
+ acb->refcnt = 1;
+ return acb;
+}
+
+void qemu_aio_ref(void *p)
+{
+ BlockAIOCB *acb = p;
+ acb->refcnt++;
+}
+
+void qemu_aio_unref(void *p)
+{
+ BlockAIOCB *acb = p;
+ assert(acb->refcnt > 0);
+ if (--acb->refcnt == 0) {
+ g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+ }
+}
+
+/**************************************************************/
+/* Coroutine block device emulation */
+
+typedef struct CoroutineIOCompletion {
+ Coroutine *coroutine;
+ int ret;
+} CoroutineIOCompletion;
+
+static void bdrv_co_io_em_complete(void *opaque, int ret)
+{
+ CoroutineIOCompletion *co = opaque;
+
+ co->ret = ret;
+ qemu_coroutine_enter(co->coroutine, NULL);
+}
+
+static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *iov,
+ bool is_write)
+{
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+ BlockAIOCB *acb;
+
+ if (is_write) {
+ acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ } else {
+ acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ }
+
+ trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
+ if (!acb) {
+ return -EIO;
+ }
+ qemu_coroutine_yield();
+
+ return co.ret;
+}
+
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov)
+{
+ return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
+}
+
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov)
+{
+ return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
+}
+
+static void coroutine_fn bdrv_flush_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ rwco->ret = bdrv_co_flush(rwco->bs);
+}
+
+int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
+{
+ int ret;
+
+ if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
+ bdrv_is_sg(bs)) {
+ return 0;
+ }
+
+ /* Write back cached data to the OS even with cache=unsafe */
+ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
+ if (bs->drv->bdrv_co_flush_to_os) {
+ ret = bs->drv->bdrv_co_flush_to_os(bs);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ /* But don't actually force it to the disk with cache=unsafe */
+ if (bs->open_flags & BDRV_O_NO_FLUSH) {
+ goto flush_parent;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
+ if (bs->drv->bdrv_co_flush_to_disk) {
+ ret = bs->drv->bdrv_co_flush_to_disk(bs);
+ } else if (bs->drv->bdrv_aio_flush) {
+ BlockAIOCB *acb;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+
+ acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
+ if (acb == NULL) {
+ ret = -EIO;
+ } else {
+ qemu_coroutine_yield();
+ ret = co.ret;
+ }
+ } else {
+ /*
+ * Some block drivers always operate in either writethrough or unsafe
+ * mode and therefore don't support bdrv_flush. Usually qemu doesn't
+ * know how the server works (because the behaviour is hardcoded or
+ * depends on server-side configuration), so we can't ensure that
+ * everything is safe on disk. Returning an error doesn't work because
+ * that would break guests even if the server operates in writethrough
+ * mode.
+ *
+ * Let's hope the user knows what he's doing.
+ */
+ ret = 0;
+ }
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
+ * in the case of cache=unsafe, so there are no useless flushes.
+ */
+flush_parent:
+ return bdrv_co_flush(bs->file);
+}
+
+int bdrv_flush(BlockDriverState *bs)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .ret = NOT_DONE,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_flush_co_entry(&rwco);
+ } else {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ co = qemu_coroutine_create(bdrv_flush_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ aio_poll(aio_context, true);
+ }
+ }
+
+ return rwco.ret;
+}
+
+typedef struct DiscardCo {
+ BlockDriverState *bs;
+ int64_t sector_num;
+ int nb_sectors;
+ int ret;
+} DiscardCo;
+static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+{
+ DiscardCo *rwco = opaque;
+
+ rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+}
+
+int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ int max_discard, ret;
+
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ }
+
+ ret = bdrv_check_request(bs, sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ } else if (bs->read_only) {
+ return -EPERM;
+ }
+
+ /* Do nothing if disabled. */
+ if (!(bs->open_flags & BDRV_O_UNMAP)) {
+ return 0;
+ }
+
+ if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+ return 0;
+ }
+
+ bdrv_set_dirty(bs, sector_num, nb_sectors);
+
+ max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
+ while (nb_sectors > 0) {
+ int ret;
+ int num = nb_sectors;
+
+ /* align request */
+ if (bs->bl.discard_alignment &&
+ num >= bs->bl.discard_alignment &&
+ sector_num % bs->bl.discard_alignment) {
+ if (num > bs->bl.discard_alignment) {
+ num = bs->bl.discard_alignment;
+ }
+ num -= sector_num % bs->bl.discard_alignment;
+ }
+
+ /* limit request size */
+ if (num > max_discard) {
+ num = max_discard;
+ }
+
+ if (bs->drv->bdrv_co_discard) {
+ ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+ } else {
+ BlockAIOCB *acb;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+
+ acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
+ bdrv_co_io_em_complete, &co);
+ if (acb == NULL) {
+ return -EIO;
+ } else {
+ qemu_coroutine_yield();
+ ret = co.ret;
+ }
+ }
+ if (ret && ret != -ENOTSUP) {
+ return ret;
+ }
+
+ sector_num += num;
+ nb_sectors -= num;
+ }
+ return 0;
+}
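To make the alignment step above concrete: with bl.discard_alignment == 8 and a request starting at sector 5 for 32 sectors, the first iteration clamps num to 8 and then subtracts 5 % 8, so only 3 sectors are discarded; every following chunk then starts on an 8-sector boundary (still subject to max_discard).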
+
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+ Coroutine *co;
+ DiscardCo rwco = {
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .ret = NOT_DONE,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_discard_co_entry(&rwco);
+ } else {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ co = qemu_coroutine_create(bdrv_discard_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ aio_poll(aio_context, true);
+ }
+ }
+
+ return rwco.ret;
+}
+
+/* needed for generic scsi interface */
+
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_ioctl)
+ return drv->bdrv_ioctl(bs, req, buf);
+ return -ENOTSUP;
+}
+
+BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_aio_ioctl)
+ return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
+ return NULL;
+}
+
+void *qemu_blockalign(BlockDriverState *bs, size_t size)
+{
+ return qemu_memalign(bdrv_opt_mem_align(bs), size);
+}
+
+void *qemu_blockalign0(BlockDriverState *bs, size_t size)
+{
+ return memset(qemu_blockalign(bs, size), 0, size);
+}
+
+void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
+{
+ size_t align = bdrv_opt_mem_align(bs);
+
+ /* Ensure that NULL is never returned on success */
+ assert(align > 0);
+ if (size == 0) {
+ size = align;
+ }
+
+ return qemu_try_memalign(align, size);
+}
+
+void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
+{
+ void *mem = qemu_try_blockalign(bs, size);
+
+ if (mem) {
+ memset(mem, 0, size);
+ }
+
+ return mem;
+}
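An illustrative sketch (not part of the diff) of how a driver might use the helper above for an O_DIRECT-safe bounce buffer, assuming <errno.h> for -ENOMEM:

static int with_bounce_buffer(BlockDriverState *bs, size_t len)
{
    /* zeroed allocation aligned for direct I/O on 'bs' */
    void *buf = qemu_try_blockalign0(bs, len);

    if (buf == NULL) {
        return -ENOMEM;
    }
    /* ... fill or read into buf here ... */
    qemu_vfree(buf);
    return 0;
}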
+
+/*
+ * Check if all memory in this vector is sector aligned.
+ */
+bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
+{
+ int i;
+ size_t alignment = bdrv_min_mem_align(bs);
+
+ for (i = 0; i < qiov->niov; i++) {
+ if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
+ return false;
+ }
+ if (qiov->iov[i].iov_len % alignment) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void bdrv_add_before_write_notifier(BlockDriverState *bs,
+ NotifierWithReturn *notifier)
+{
+ notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
+}
+
+void bdrv_io_plug(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (drv && drv->bdrv_io_plug) {
+ drv->bdrv_io_plug(bs);
+ } else if (bs->file) {
+ bdrv_io_plug(bs->file);
+ }
+}
+
+void bdrv_io_unplug(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (drv && drv->bdrv_io_unplug) {
+ drv->bdrv_io_unplug(bs);
+ } else if (bs->file) {
+ bdrv_io_unplug(bs->file);
+ }
+}
+
+void bdrv_flush_io_queue(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (drv && drv->bdrv_flush_io_queue) {
+ drv->bdrv_flush_io_queue(bs);
+ } else if (bs->file) {
+ bdrv_flush_io_queue(bs->file);
+ }
+ bdrv_start_throttled_reqs(bs);
+}
diff --git a/block/iscsi.c b/block/iscsi.c
index be8af46ad..93f1ee4c6 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,7 +2,7 @@
* QEMU Block driver for iSCSI images
*
* Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
- * Copyright (c) 2012-2014 Peter Lieven <pl@kamp.de>
+ * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -38,6 +38,7 @@
#include "qemu/iov.h"
#include "sysemu/sysemu.h"
#include "qmp-commands.h"
+#include "qapi/qmp/qstring.h"
#include <iscsi/iscsi.h>
#include <iscsi/scsi-lowlevel.h>
@@ -57,9 +58,6 @@ typedef struct IscsiLun {
int events;
QEMUTimer *nop_timer;
QEMUTimer *event_timer;
- uint8_t lbpme;
- uint8_t lbprz;
- uint8_t has_write_same;
struct scsi_inquiry_logical_block_provisioning lbp;
struct scsi_inquiry_block_limits bl;
unsigned char *zeroblock;
@@ -67,6 +65,12 @@ typedef struct IscsiLun {
int cluster_sectors;
bool use_16_for_rw;
bool write_protected;
+ bool lbpme;
+ bool lbprz;
+ bool dpofua;
+ bool has_write_same;
+ bool force_next_flush;
+ bool request_timed_out;
} IscsiLun;
typedef struct IscsiTask {
@@ -79,6 +83,7 @@ typedef struct IscsiTask {
QEMUBH *bh;
IscsiLun *iscsilun;
QEMUTimer retry_timer;
+ bool force_next_flush;
} IscsiTask;
typedef struct IscsiAIOCB {
@@ -96,11 +101,12 @@ typedef struct IscsiAIOCB {
#endif
} IscsiAIOCB;
-#define EVENT_INTERVAL 250
+/* libiscsi uses time_t so it's enough to process events every second */
+#define EVENT_INTERVAL 1000
#define NOP_INTERVAL 5000
#define MAX_NOP_FAILURES 3
#define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times)
-static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048};
+static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768};
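Combined with exp_random() below, each retry draws an exponentially distributed delay whose mean (in milliseconds) is the table entry for that attempt, so the expected back-off grows from 8 ms on the first retry to 32768 ms on the seventh.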
/* this threshold is a trade-off knob to choose between
* the potential additional overhead of an extra GET_LBA_STATUS request
@@ -163,6 +169,19 @@ static inline unsigned exp_random(double mean)
return -mean * log((double)rand() / RAND_MAX);
}
+/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced
+ * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION
+ * macro was introduced in 1.11.0. So use the API_VERSION macro as
+ * a hint that the macros are defined and define them ourselves
+ * otherwise to keep the required libiscsi version at 1.9.0 */
+#if !defined(LIBISCSI_API_VERSION)
+#define QEMU_SCSI_STATUS_TASK_SET_FULL 0x28
+#define QEMU_SCSI_STATUS_TIMEOUT 0x0f000002
+#else
+#define QEMU_SCSI_STATUS_TASK_SET_FULL SCSI_STATUS_TASK_SET_FULL
+#define QEMU_SCSI_STATUS_TIMEOUT SCSI_STATUS_TIMEOUT
+#endif
+
static void
iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
void *command_data, void *opaque)
@@ -183,10 +202,19 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
iTask->do_retry = 1;
goto out;
}
- if (status == SCSI_STATUS_BUSY) {
+ if (status == SCSI_STATUS_BUSY ||
+ status == QEMU_SCSI_STATUS_TIMEOUT ||
+ status == QEMU_SCSI_STATUS_TASK_SET_FULL) {
unsigned retry_time =
exp_random(iscsi_retry_times[iTask->retries - 1]);
- error_report("iSCSI Busy (retry #%u in %u ms): %s",
+ if (status == QEMU_SCSI_STATUS_TIMEOUT) {
+ /* make sure the request is rescheduled AFTER the
+ * reconnect is initiated */
+ retry_time = EVENT_INTERVAL * 2;
+ iTask->iscsilun->request_timed_out = true;
+ }
+ error_report("iSCSI Busy/TaskSetFull/TimeOut"
+ " (retry #%u in %u ms): %s",
iTask->retries, retry_time,
iscsi_get_error(iscsi));
aio_timer_init(iTask->iscsilun->aio_context,
@@ -199,6 +227,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
}
}
error_report("iSCSI Failure: %s", iscsi_get_error(iscsi));
+ } else {
+ iTask->iscsilun->force_next_flush |= iTask->force_next_flush;
}
out:
@@ -268,20 +298,26 @@ iscsi_set_events(IscsiLun *iscsilun)
iscsilun);
iscsilun->events = ev;
}
-
- /* newer versions of libiscsi may return zero events. In this
- * case start a timer to ensure we are able to return to service
- * once this situation changes. */
- if (!ev) {
- timer_mod(iscsilun->event_timer,
- qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL);
- }
}
-static void iscsi_timed_set_events(void *opaque)
+static void iscsi_timed_check_events(void *opaque)
{
IscsiLun *iscsilun = opaque;
+
+ /* check for timed out requests */
+ iscsi_service(iscsilun->iscsi, 0);
+
+ if (iscsilun->request_timed_out) {
+ iscsilun->request_timed_out = false;
+ iscsi_reconnect(iscsilun->iscsi);
+ }
+
+ /* newer versions of libiscsi may return zero events. Ensure we are able
+ * to return to service once this situation changes. */
iscsi_set_events(iscsilun);
+
+ timer_mod(iscsilun->event_timer,
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL);
}
static void
@@ -369,6 +405,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
struct IscsiTask iTask;
uint64_t lba;
uint32_t num_sectors;
+ int fua;
if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
return -EINVAL;
@@ -384,15 +421,17 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
iscsi_co_init_iscsitask(iscsilun, &iTask);
retry:
+ fua = iscsilun->dpofua && !bs->enable_write_cache;
+ iTask.force_next_flush = !fua;
if (iscsilun->use_16_for_rw) {
iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
NULL, num_sectors * iscsilun->block_size,
- iscsilun->block_size, 0, 0, 0, 0, 0,
+ iscsilun->block_size, 0, 0, fua, 0, 0,
iscsi_co_generic_cb, &iTask);
} else {
iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba,
NULL, num_sectors * iscsilun->block_size,
- iscsilun->block_size, 0, 0, 0, 0, 0,
+ iscsilun->block_size, 0, 0, fua, 0, 0,
iscsi_co_generic_cb, &iTask);
}
if (iTask.task == NULL) {
@@ -460,7 +499,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
*pnum = nb_sectors;
/* LUN does not support logical block provisioning */
- if (iscsilun->lbpme == 0) {
+ if (!iscsilun->lbpme) {
goto out;
}
@@ -616,12 +655,12 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs)
IscsiLun *iscsilun = bs->opaque;
struct IscsiTask iTask;
- if (bs->sg) {
+ if (!iscsilun->force_next_flush) {
return 0;
}
+ iscsilun->force_next_flush = false;
iscsi_co_init_iscsitask(iscsilun, &iTask);
-
retry:
if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0,
0, iscsi_co_generic_cb, &iTask) == NULL) {
@@ -917,6 +956,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
}
iscsi_co_init_iscsitask(iscsilun, &iTask);
+ iTask.force_next_flush = true;
retry:
if (use_16_for_ws) {
iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba,
@@ -1080,16 +1120,37 @@ static char *parse_initiator_name(const char *target)
return iscsi_name;
}
+static int parse_timeout(const char *target)
+{
+ QemuOptsList *list;
+ QemuOpts *opts;
+ const char *timeout;
+
+ list = qemu_find_opts("iscsi");
+ if (list) {
+ opts = qemu_opts_find(list, target);
+ if (!opts) {
+ opts = QTAILQ_FIRST(&list->head);
+ }
+ if (opts) {
+ timeout = qemu_opt_get(opts, "timeout");
+ if (timeout) {
+ return atoi(timeout);
+ }
+ }
+ }
+
+ return 0;
+}
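The value is read from the "iscsi" option group, keyed by the target name when a matching id exists and falling back to the first group otherwise; a hypothetical invocation would be "-iscsi timeout=30" (global) or "-iscsi id=<target-iqn>,timeout=30" (per target), matching the "timeout" entry added to qemu_iscsi_opts further down.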
+
static void iscsi_nop_timed_event(void *opaque)
{
IscsiLun *iscsilun = opaque;
- if (iscsi_get_nops_in_flight(iscsilun->iscsi) > MAX_NOP_FAILURES) {
+ if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) {
error_report("iSCSI: NOP timeout. Reconnecting...");
- iscsi_reconnect(iscsilun->iscsi);
- }
-
- if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
+ iscsilun->request_timed_out = true;
+ } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
return;
}
@@ -1121,8 +1182,8 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
} else {
iscsilun->block_size = rc16->block_length;
iscsilun->num_blocks = rc16->returned_lba + 1;
- iscsilun->lbpme = rc16->lbpme;
- iscsilun->lbprz = rc16->lbprz;
+ iscsilun->lbpme = !!rc16->lbpme;
+ iscsilun->lbprz = !!rc16->lbprz;
iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff);
}
}
@@ -1153,6 +1214,10 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
if (task == NULL || task->status != SCSI_STATUS_GOOD) {
error_setg(errp, "iSCSI: failed to send readcapacity10 command.");
+ } else if (!iscsilun->block_size ||
+ iscsilun->block_size % BDRV_SECTOR_SIZE) {
+ error_setg(errp, "iSCSI: the target returned an invalid "
+ "block size of %d.", iscsilun->block_size);
}
if (task) {
scsi_free_scsi_task(task);
@@ -1247,17 +1312,21 @@ static void iscsi_attach_aio_context(BlockDriverState *bs,
timer_mod(iscsilun->nop_timer,
qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
- /* Prepare a timer for a delayed call to iscsi_set_events */
+ /* Set up a timer for periodic calls to iscsi_set_events and to
+ * scan for command timeout */
iscsilun->event_timer = aio_timer_new(iscsilun->aio_context,
QEMU_CLOCK_REALTIME, SCALE_MS,
- iscsi_timed_set_events, iscsilun);
+ iscsi_timed_check_events, iscsilun);
+ timer_mod(iscsilun->event_timer,
+ qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL);
}
-static bool iscsi_is_write_protected(IscsiLun *iscsilun)
+static void iscsi_modesense_sync(IscsiLun *iscsilun)
{
struct scsi_task *task;
struct scsi_mode_sense *ms = NULL;
- bool wrprotected = false;
+ iscsilun->write_protected = false;
+ iscsilun->dpofua = false;
task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun,
1, SCSI_MODESENSE_PC_CURRENT,
@@ -1278,13 +1347,13 @@ static bool iscsi_is_write_protected(IscsiLun *iscsilun)
iscsi_get_error(iscsilun->iscsi));
goto out;
}
- wrprotected = ms->device_specific_parameter & 0x80;
+ iscsilun->write_protected = ms->device_specific_parameter & 0x80;
+ iscsilun->dpofua = ms->device_specific_parameter & 0x10;
out:
if (task) {
scsi_free_scsi_task(task);
}
- return wrprotected;
}
/*
@@ -1304,14 +1373,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
QemuOpts *opts;
Error *local_err = NULL;
const char *filename;
- int i, ret = 0;
-
- if ((BDRV_SECTOR_SIZE % 512) != 0) {
- error_setg(errp, "iSCSI: Invalid BDRV_SECTOR_SIZE. "
- "BDRV_SECTOR_SIZE(%lld) is not a multiple "
- "of 512", BDRV_SECTOR_SIZE);
- return -EINVAL;
- }
+ int i, ret = 0, timeout = 0;
opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -1381,6 +1443,16 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
goto out;
}
+ /* timeout handling is broken in libiscsi before 1.15.0 */
+ timeout = parse_timeout(iscsi_url->target);
+#if defined(LIBISCSI_API_VERSION) && LIBISCSI_API_VERSION >= 20150621
+ iscsi_set_timeout(iscsi, timeout);
+#else
+ if (timeout) {
+ error_report("iSCSI: ignoring timeout value for libiscsi <1.15.0");
+ }
+#endif
+
if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) {
error_setg(errp, "iSCSI: Failed to connect to LUN : %s",
iscsi_get_error(iscsi));
@@ -1403,7 +1475,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
scsi_free_scsi_task(task);
task = NULL;
- iscsilun->write_protected = iscsi_is_write_protected(iscsilun);
+ iscsi_modesense_sync(iscsilun);
+
/* Check the write protect flag of the LUN if we want to write */
if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
iscsilun->write_protected) {
@@ -1481,7 +1554,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
iscsilun->block_size) >> BDRV_SECTOR_BITS;
- if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
+ if (iscsilun->lbprz) {
iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
if (iscsilun->allocationmap == NULL) {
ret = -ENOMEM;
@@ -1655,7 +1728,7 @@ out:
static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
IscsiLun *iscsilun = bs->opaque;
- bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz;
+ bdi->unallocated_blocks_are_zero = iscsilun->lbprz;
bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws;
bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE;
return 0;
@@ -1728,6 +1801,10 @@ static QemuOptsList qemu_iscsi_opts = {
.name = "initiator-name",
.type = QEMU_OPT_STRING,
.help = "Initiator iqn name to use when connecting",
+ },{
+ .name = "timeout",
+ .type = QEMU_OPT_NUMBER,
+ .help = "Request timeout in seconds (default 0 = no timeout)",
},
{ /* end of list */ }
},
diff --git a/block/mirror.c b/block/mirror.c
index bd079a439..b2fb4b9b1 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -14,11 +14,13 @@
#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
+#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"
#define SLICE_TIME 100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
+#define DEFAULT_MIRROR_BUF_SIZE (10 << 20)
/* The mirroring buffer is a list of granularity-sized chunks.
* Free chunks are organized in a list.
@@ -58,6 +60,7 @@ typedef struct MirrorBlockJob {
int sectors_in_flight;
int ret;
bool unmap;
+ bool waiting_for_io;
} MirrorBlockJob;
typedef struct MirrorOp {
@@ -112,11 +115,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
qemu_iovec_destroy(&op->qiov);
g_slice_free(MirrorOp, op);
- /* Enter coroutine when it is not sleeping. The coroutine sleeps to
- * rate-limit itself. The coroutine will eventually resume since there is
- * a sleep timeout so don't wake it early.
- */
- if (s->common.busy) {
+ if (s->waiting_for_io) {
qemu_coroutine_enter(s->common.co, NULL);
}
}
@@ -126,11 +125,9 @@ static void mirror_write_complete(void *opaque, int ret)
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
if (ret < 0) {
- BlockDriverState *source = s->common.bs;
BlockErrorAction action;
- bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num,
- op->nb_sectors);
+ bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
action = mirror_error_action(s, false, -ret);
if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
s->ret = ret;
@@ -144,11 +141,9 @@ static void mirror_read_complete(void *opaque, int ret)
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
if (ret < 0) {
- BlockDriverState *source = s->common.bs;
BlockErrorAction action;
- bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num,
- op->nb_sectors);
+ bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
action = mirror_error_action(s, true, -ret);
if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
s->ret = ret;
@@ -173,10 +168,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
s->sector_num = hbitmap_iter_next(&s->hbi);
if (s->sector_num < 0) {
- bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi);
+ bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
s->sector_num = hbitmap_iter_next(&s->hbi);
- trace_mirror_restart_iter(s,
- bdrv_get_dirty_count(source, s->dirty_bitmap));
+ trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
assert(s->sector_num >= 0);
}
@@ -206,7 +200,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
/* Wait for I/O to this cluster (from a previous iteration) to be done. */
while (test_bit(next_chunk, s->in_flight_bitmap)) {
trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+ s->waiting_for_io = true;
qemu_coroutine_yield();
+ s->waiting_for_io = false;
}
do {
@@ -242,7 +238,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
*/
while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
+ s->waiting_for_io = true;
qemu_coroutine_yield();
+ s->waiting_for_io = false;
}
if (s->buf_free_count < nb_chunks + added_chunks) {
trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
@@ -291,8 +289,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
next_sector += sectors_per_chunk;
}
- bdrv_reset_dirty_bitmap(source, s->dirty_bitmap, sector_num,
- nb_sectors);
+ bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);
/* Copy the dirty cluster. */
s->in_flight++;
@@ -337,7 +334,9 @@ static void mirror_free_init(MirrorBlockJob *s)
static void mirror_drain(MirrorBlockJob *s)
{
while (s->in_flight > 0) {
+ s->waiting_for_io = true;
qemu_coroutine_yield();
+ s->waiting_for_io = false;
}
}
@@ -392,7 +391,7 @@ static void coroutine_fn mirror_run(void *opaque)
MirrorBlockJob *s = opaque;
MirrorExitData *data;
BlockDriverState *bs = s->common.bs;
- int64_t sector_num, end, sectors_per_chunk, length;
+ int64_t sector_num, end, length;
uint64_t last_pause_ns;
BlockDriverInfo bdi;
char backing_filename[2]; /* we only need 2 characters because we are only
@@ -446,16 +445,28 @@ static void coroutine_fn mirror_run(void *opaque)
goto immediate_exit;
}
- sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
mirror_free_init(s);
+ last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
if (!s->is_none_mode) {
/* First part, loop on the sectors and initialize the dirty bitmap. */
BlockDriverState *base = s->base;
for (sector_num = 0; sector_num < end; ) {
- int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
- ret = bdrv_is_allocated_above(bs, base,
- sector_num, next - sector_num, &n);
+ /* Just to make sure we are not exceeding int limit. */
+ int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
+ end - sector_num);
+ int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+
+ if (now - last_pause_ns > SLICE_TIME) {
+ last_pause_ns = now;
+ block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
+ }
+
+ if (block_job_is_cancelled(&s->common)) {
+ goto immediate_exit;
+ }
+
+ ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
if (ret < 0) {
goto immediate_exit;
@@ -463,16 +474,13 @@ static void coroutine_fn mirror_run(void *opaque)
assert(n > 0);
if (ret == 1) {
- bdrv_set_dirty_bitmap(bs, s->dirty_bitmap, sector_num, n);
- sector_num = next;
- } else {
- sector_num += n;
+ bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
}
+ sector_num += n;
}
}
- bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi);
- last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
for (;;) {
uint64_t delay_ns = 0;
int64_t cnt;
@@ -483,7 +491,7 @@ static void coroutine_fn mirror_run(void *opaque)
goto immediate_exit;
}
- cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+ cnt = bdrv_get_dirty_count(s->dirty_bitmap);
/* s->common.offset contains the number of bytes already processed so
* far, cnt is the number of dirty sectors remaining and
* s->sectors_in_flight is the number of sectors currently being
@@ -492,7 +500,7 @@ static void coroutine_fn mirror_run(void *opaque)
(cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;
/* Note that even when no rate limit is applied we need to yield
- * periodically with no pending I/O so that qemu_aio_flush() returns.
+ * periodically with no pending I/O so that bdrv_drain_all() returns.
* We do so every SLICE_TIME nanoseconds, or when there is an error,
* or when the source is clean, whichever comes first.
*/
@@ -501,13 +509,12 @@ static void coroutine_fn mirror_run(void *opaque)
if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
(cnt == 0 && s->in_flight > 0)) {
trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
+ s->waiting_for_io = true;
qemu_coroutine_yield();
+ s->waiting_for_io = false;
continue;
} else if (cnt != 0) {
delay_ns = mirror_iteration(s);
- if (delay_ns == 0) {
- continue;
- }
}
}
@@ -533,7 +540,7 @@ static void coroutine_fn mirror_run(void *opaque)
should_complete = s->should_complete ||
block_job_is_cancelled(&s->common);
- cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+ cnt = bdrv_get_dirty_count(s->dirty_bitmap);
}
}
@@ -548,7 +555,7 @@ static void coroutine_fn mirror_run(void *opaque)
*/
trace_mirror_before_drain(s, cnt);
bdrv_drain(bs);
- cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+ cnt = bdrv_get_dirty_count(s->dirty_bitmap);
}
ret = 0;
@@ -599,7 +606,7 @@ static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
if (speed < 0) {
- error_set(errp, QERR_INVALID_PARAMETER, "speed");
+ error_setg(errp, QERR_INVALID_PARAMETER, "speed");
return;
}
ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
@@ -624,8 +631,8 @@ static void mirror_complete(BlockJob *job, Error **errp)
return;
}
if (!s->synced) {
- error_set(errp, QERR_BLOCK_JOB_NOT_READY,
- bdrv_get_device_name(job->bs));
+ error_setg(errp, QERR_BLOCK_JOB_NOT_READY,
+ bdrv_get_device_name(job->bs));
return;
}
@@ -651,7 +658,7 @@ static void mirror_complete(BlockJob *job, Error **errp)
}
s->should_complete = true;
- block_job_resume(job);
+ block_job_enter(&s->common);
}
static const BlockJobDriver mirror_job_driver = {
@@ -673,7 +680,7 @@ static const BlockJobDriver commit_active_job_driver = {
static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
const char *replaces,
- int64_t speed, int64_t granularity,
+ int64_t speed, uint32_t granularity,
int64_t buf_size,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
@@ -686,15 +693,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
MirrorBlockJob *s;
if (granularity == 0) {
- /* Choose the default granularity based on the target file's cluster
- * size, clamped between 4k and 64k. */
- BlockDriverInfo bdi;
- if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) {
- granularity = MAX(4096, bdi.cluster_size);
- granularity = MIN(65536, granularity);
- } else {
- granularity = 65536;
- }
+ granularity = bdrv_get_default_bitmap_granularity(target);
}
assert ((granularity & (granularity - 1)) == 0);
@@ -702,10 +701,18 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
!bdrv_iostatus_is_enabled(bs)) {
- error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
+ error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
return;
}
+ if (buf_size < 0) {
+ error_setg(errp, "Invalid parameter 'buf-size'");
+ return;
+ }
+
+ if (buf_size == 0) {
+ buf_size = DEFAULT_MIRROR_BUF_SIZE;
+ }
s = block_job_create(driver, bs, speed, cb, opaque, errp);
if (!s) {
@@ -719,11 +726,13 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
s->is_none_mode = is_none_mode;
s->base = base;
s->granularity = granularity;
- s->buf_size = MAX(buf_size, granularity);
+ s->buf_size = ROUND_UP(buf_size, granularity);
s->unmap = unmap;
- s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp);
+ s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
if (!s->dirty_bitmap) {
+ g_free(s->replaces);
+ block_job_release(bs);
return;
}
bdrv_set_enable_write_cache(s->target, true);
@@ -736,7 +745,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
const char *replaces,
- int64_t speed, int64_t granularity, int64_t buf_size,
+ int64_t speed, uint32_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
bool unmap,
@@ -746,6 +755,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
bool is_none_mode;
BlockDriverState *base;
+ if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
+ error_setg(errp, "Sync mode 'incremental' not supported");
+ return;
+ }
is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
mirror_start_job(bs, target, replaces,
diff --git a/block/nfs.c b/block/nfs.c
index c026ff688..02eb4e464 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -475,7 +475,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
aio_poll(client->aio_context, true);
}
- return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize);
+ return (task.ret < 0 ? task.ret : st.st_blocks * 512);
}
static int nfs_file_truncate(BlockDriverState *bs, int64_t offset)
diff --git a/block/null.c b/block/null.c
index ec2bd27a4..7d083233f 100644
--- a/block/null.c
+++ b/block/null.c
@@ -12,8 +12,11 @@
#include "block/block_int.h"
+#define NULL_OPT_LATENCY "latency-ns"
+
typedef struct {
int64_t length;
+ int64_t latency_ns;
} BDRVNullState;
static QemuOptsList runtime_opts = {
@@ -30,6 +33,12 @@ static QemuOptsList runtime_opts = {
.type = QEMU_OPT_SIZE,
.help = "size of the null block",
},
+ {
+ .name = NULL_OPT_LATENCY,
+ .type = QEMU_OPT_NUMBER,
+ .help = "nanoseconds (approximated) to wait "
+ "before completing request",
+ },
{ /* end of list */ }
},
};
@@ -39,13 +48,20 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags,
{
QemuOpts *opts;
BDRVNullState *s = bs->opaque;
+ int ret = 0;
opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &error_abort);
s->length =
qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30);
+ s->latency_ns =
+ qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0);
+ if (s->latency_ns < 0) {
+ error_setg(errp, "latency-ns is invalid");
+ ret = -EINVAL;
+ }
qemu_opts_del(opts);
- return 0;
+ return ret;
}
static void null_close(BlockDriverState *bs)
@@ -58,28 +74,40 @@ static int64_t null_getlength(BlockDriverState *bs)
return s->length;
}
+static coroutine_fn int null_co_common(BlockDriverState *bs)
+{
+ BDRVNullState *s = bs->opaque;
+
+ if (s->latency_ns) {
+ co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME,
+ s->latency_ns);
+ }
+ return 0;
+}
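With latency-ns=1000000, for example, every read, write and flush above completes after roughly one millisecond of simulated delay; opening the driver as, say, driver=null-co,latency-ns=1000000 (an illustrative option string, not taken from this diff) gives a simple way to exercise the block layer without real I/O.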
+
static coroutine_fn int null_co_readv(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
QEMUIOVector *qiov)
{
- return 0;
+ return null_co_common(bs);
}
static coroutine_fn int null_co_writev(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
QEMUIOVector *qiov)
{
- return 0;
+ return null_co_common(bs);
}
static coroutine_fn int null_co_flush(BlockDriverState *bs)
{
- return 0;
+ return null_co_common(bs);
}
typedef struct {
BlockAIOCB common;
QEMUBH *bh;
+ QEMUTimer timer;
} NullAIOCB;
static const AIOCBInfo null_aiocb_info = {
@@ -94,15 +122,33 @@ static void null_bh_cb(void *opaque)
qemu_aio_unref(acb);
}
+static void null_timer_cb(void *opaque)
+{
+ NullAIOCB *acb = opaque;
+ acb->common.cb(acb->common.opaque, 0);
+ timer_deinit(&acb->timer);
+ qemu_aio_unref(acb);
+}
+
static inline BlockAIOCB *null_aio_common(BlockDriverState *bs,
BlockCompletionFunc *cb,
void *opaque)
{
NullAIOCB *acb;
+ BDRVNullState *s = bs->opaque;
acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque);
- acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb);
- qemu_bh_schedule(acb->bh);
+ /* Only emulate latency after vcpu is running. */
+ if (s->latency_ns) {
+ aio_timer_init(bdrv_get_aio_context(bs), &acb->timer,
+ QEMU_CLOCK_REALTIME, SCALE_NS,
+ null_timer_cb, acb);
+ timer_mod_ns(&acb->timer,
+ qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns);
+ } else {
+ acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb);
+ qemu_bh_schedule(acb->bh);
+ }
return &acb->common;
}
@@ -131,6 +177,12 @@ static BlockAIOCB *null_aio_flush(BlockDriverState *bs,
return null_aio_common(bs, cb, opaque);
}
+static int null_reopen_prepare(BDRVReopenState *reopen_state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
static BlockDriver bdrv_null_co = {
.format_name = "null-co",
.protocol_name = "null-co",
@@ -143,6 +195,7 @@ static BlockDriver bdrv_null_co = {
.bdrv_co_readv = null_co_readv,
.bdrv_co_writev = null_co_writev,
.bdrv_co_flush_to_disk = null_co_flush,
+ .bdrv_reopen_prepare = null_reopen_prepare,
};
static BlockDriver bdrv_null_aio = {
@@ -157,6 +210,7 @@ static BlockDriver bdrv_null_aio = {
.bdrv_aio_readv = null_aio_readv,
.bdrv_aio_writev = null_aio_writev,
.bdrv_aio_flush = null_aio_flush,
+ .bdrv_reopen_prepare = null_reopen_prepare,
};
static void bdrv_null_init(void)
diff --git a/block/parallels.c b/block/parallels.c
index 4f9cd8dd2..046b56844 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -2,8 +2,12 @@
* Block driver for Parallels disk image format
*
* Copyright (c) 2007 Alex Beregszaszi
+ * Copyright (c) 2015 Denis V. Lunev <den@openvz.org>
*
- * This code is based on comparing different disk images created by Parallels.
+ * This code was originally based on comparing different disk images created
+ * by Parallels. Currently it is based on opened OpenVZ sources
+ * available at
+ * http://git.openvz.org/?p=ploop;a=summary
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -26,63 +30,539 @@
#include "qemu-common.h"
#include "block/block_int.h"
#include "qemu/module.h"
+#include "qemu/bitmap.h"
+#include "qapi/util.h"
/**************************************************************/
#define HEADER_MAGIC "WithoutFreeSpace"
#define HEADER_MAGIC2 "WithouFreSpacExt"
#define HEADER_VERSION 2
-#define HEADER_SIZE 64
+#define HEADER_INUSE_MAGIC (0x746F6E59)
+
+#define DEFAULT_CLUSTER_SIZE 1048576 /* 1 MiB */
+
// always little-endian
-struct parallels_header {
+typedef struct ParallelsHeader {
char magic[16]; // "WithoutFreeSpace"
uint32_t version;
uint32_t heads;
uint32_t cylinders;
uint32_t tracks;
- uint32_t catalog_entries;
+ uint32_t bat_entries;
uint64_t nb_sectors;
uint32_t inuse;
uint32_t data_off;
char padding[12];
-} QEMU_PACKED;
+} QEMU_PACKED ParallelsHeader;
+
+
+typedef enum ParallelsPreallocMode {
+ PRL_PREALLOC_MODE_FALLOCATE = 0,
+ PRL_PREALLOC_MODE_TRUNCATE = 1,
+ PRL_PREALLOC_MODE_MAX = 2,
+} ParallelsPreallocMode;
+
+static const char *prealloc_mode_lookup[] = {
+ "falloc",
+ "truncate",
+ NULL,
+};
+
typedef struct BDRVParallelsState {
+ /** Locking is conservative; the lock protects
+ * - extending the image file (truncate, fallocate)
+ * - any access to the block allocation table
+ */
CoMutex lock;
- uint32_t *catalog_bitmap;
- unsigned int catalog_size;
+ ParallelsHeader *header;
+ uint32_t header_size;
+ bool header_unclean;
+
+ unsigned long *bat_dirty_bmap;
+ unsigned int bat_dirty_block;
+
+ uint32_t *bat_bitmap;
+ unsigned int bat_size;
+
+ int64_t data_end;
+ uint64_t prealloc_size;
+ ParallelsPreallocMode prealloc_mode;
unsigned int tracks;
unsigned int off_multiplier;
} BDRVParallelsState;
-static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename)
+
+#define PARALLELS_OPT_PREALLOC_MODE "prealloc-mode"
+#define PARALLELS_OPT_PREALLOC_SIZE "prealloc-size"
+
+static QemuOptsList parallels_runtime_opts = {
+ .name = "parallels",
+ .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head),
+ .desc = {
+ {
+ .name = PARALLELS_OPT_PREALLOC_SIZE,
+ .type = QEMU_OPT_SIZE,
+ .help = "Preallocation size on image expansion",
+ .def_value_str = "128MiB",
+ },
+ {
+ .name = PARALLELS_OPT_PREALLOC_MODE,
+ .type = QEMU_OPT_STRING,
+ .help = "Preallocation mode on image expansion "
+ "(allowed values: falloc, truncate)",
+ .def_value_str = "falloc",
+ },
+ { /* end of list */ },
+ },
+};
+
+
+static int64_t bat2sect(BDRVParallelsState *s, uint32_t idx)
+{
+ return (uint64_t)le32_to_cpu(s->bat_bitmap[idx]) * s->off_multiplier;
+}
+
+static uint32_t bat_entry_off(uint32_t idx)
+{
+ return sizeof(ParallelsHeader) + sizeof(uint32_t) * idx;
+}
+
+static int64_t seek_to_sector(BDRVParallelsState *s, int64_t sector_num)
+{
+ uint32_t index, offset;
+
+ index = sector_num / s->tracks;
+ offset = sector_num % s->tracks;
+
+ /* not allocated */
+ if ((index >= s->bat_size) || (s->bat_bitmap[index] == 0)) {
+ return -1;
+ }
+ return bat2sect(s, index) + offset;
+}
+
+static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num,
+ int nb_sectors)
+{
+ int ret = s->tracks - sector_num % s->tracks;
+ return MIN(nb_sectors, ret);
+}
+
+static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ int64_t start_off = -2, prev_end_off = -2;
+
+ *pnum = 0;
+ while (nb_sectors > 0 || start_off == -2) {
+ int64_t offset = seek_to_sector(s, sector_num);
+ int to_end;
+
+ if (start_off == -2) {
+ start_off = offset;
+ prev_end_off = offset;
+ } else if (offset != prev_end_off) {
+ break;
+ }
+
+ to_end = cluster_remainder(s, sector_num, nb_sectors);
+ nb_sectors -= to_end;
+ sector_num += to_end;
+ *pnum += to_end;
+
+ if (offset > 0) {
+ prev_end_off += to_end;
+ }
+ }
+ return start_off;
+}
+
+static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, int *pnum)
{
- const struct parallels_header *ph = (const void *)buf;
+ BDRVParallelsState *s = bs->opaque;
+ uint32_t idx, to_allocate, i;
+ int64_t pos, space;
+
+ pos = block_status(s, sector_num, nb_sectors, pnum);
+ if (pos > 0) {
+ return pos;
+ }
+
+ idx = sector_num / s->tracks;
+ if (idx >= s->bat_size) {
+ return -EINVAL;
+ }
+
+ to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx;
+ space = to_allocate * s->tracks;
+ if (s->data_end + space > bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS) {
+ int ret;
+ space += s->prealloc_size;
+ if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
+ ret = bdrv_write_zeroes(bs->file, s->data_end, space, 0);
+ } else {
+ ret = bdrv_truncate(bs->file,
+ (s->data_end + space) << BDRV_SECTOR_BITS);
+ }
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ for (i = 0; i < to_allocate; i++) {
+ s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
+ s->data_end += s->tracks;
+ bitmap_set(s->bat_dirty_bmap,
+ bat_entry_off(idx) / s->bat_dirty_block, 1);
+ }
+
+ return bat2sect(s, idx) + sector_num % s->tracks;
+}
+
- if (buf_size < HEADER_SIZE)
+static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs)
+{
+ BDRVParallelsState *s = bs->opaque;
+ unsigned long size = DIV_ROUND_UP(s->header_size, s->bat_dirty_block);
+ unsigned long bit;
+
+ qemu_co_mutex_lock(&s->lock);
+
+ bit = find_first_bit(s->bat_dirty_bmap, size);
+ while (bit < size) {
+ uint32_t off = bit * s->bat_dirty_block;
+ uint32_t to_write = s->bat_dirty_block;
+ int ret;
+
+ if (off + to_write > s->header_size) {
+ to_write = s->header_size - off;
+ }
+ ret = bdrv_pwrite(bs->file, off, (uint8_t *)s->header + off, to_write);
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+ }
+ bit = find_next_bit(s->bat_dirty_bmap, size, bit + 1);
+ }
+ bitmap_zero(s->bat_dirty_bmap, size);
+
+ qemu_co_mutex_unlock(&s->lock);
+ return 0;
+}
+
+
+static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, int *pnum)
+{
+ BDRVParallelsState *s = bs->opaque;
+ int64_t offset;
+
+ qemu_co_mutex_lock(&s->lock);
+ offset = block_status(s, sector_num, nb_sectors, pnum);
+ qemu_co_mutex_unlock(&s->lock);
+
+ if (offset < 0) {
return 0;
+ }
+
+ return (offset << BDRV_SECTOR_BITS) |
+ BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+}
+
+static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVParallelsState *s = bs->opaque;
+ uint64_t bytes_done = 0;
+ QEMUIOVector hd_qiov;
+ int ret = 0;
+
+ qemu_iovec_init(&hd_qiov, qiov->niov);
+
+ while (nb_sectors > 0) {
+ int64_t position;
+ int n, nbytes;
+
+ qemu_co_mutex_lock(&s->lock);
+ position = allocate_clusters(bs, sector_num, nb_sectors, &n);
+ qemu_co_mutex_unlock(&s->lock);
+ if (position < 0) {
+ ret = (int)position;
+ break;
+ }
+
+ nbytes = n << BDRV_SECTOR_BITS;
+
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes);
+
+ ret = bdrv_co_writev(bs->file, position, n, &hd_qiov);
+ if (ret < 0) {
+ break;
+ }
+
+ nb_sectors -= n;
+ sector_num += n;
+ bytes_done += nbytes;
+ }
+
+ qemu_iovec_destroy(&hd_qiov);
+ return ret;
+}
+
+static coroutine_fn int parallels_co_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ BDRVParallelsState *s = bs->opaque;
+ uint64_t bytes_done = 0;
+ QEMUIOVector hd_qiov;
+ int ret = 0;
+
+ qemu_iovec_init(&hd_qiov, qiov->niov);
+
+ while (nb_sectors > 0) {
+ int64_t position;
+ int n, nbytes;
+
+ qemu_co_mutex_lock(&s->lock);
+ position = block_status(s, sector_num, nb_sectors, &n);
+ qemu_co_mutex_unlock(&s->lock);
+
+ nbytes = n << BDRV_SECTOR_BITS;
+
+ if (position < 0) {
+ qemu_iovec_memset(qiov, bytes_done, 0, nbytes);
+ } else {
+ qemu_iovec_reset(&hd_qiov);
+ qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes);
+
+ ret = bdrv_co_readv(bs->file, position, n, &hd_qiov);
+ if (ret < 0) {
+ break;
+ }
+ }
+
+ nb_sectors -= n;
+ sector_num += n;
+ bytes_done += nbytes;
+ }
+
+ qemu_iovec_destroy(&hd_qiov);
+ return ret;
+}
+
+
+static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+ BDRVParallelsState *s = bs->opaque;
+ int64_t size, prev_off, high_off;
+ int ret;
+ uint32_t i;
+ bool flush_bat = false;
+ int cluster_size = s->tracks << BDRV_SECTOR_BITS;
+
+ size = bdrv_getlength(bs->file);
+ if (size < 0) {
+ res->check_errors++;
+ return size;
+ }
+
+ if (s->header_unclean) {
+ fprintf(stderr, "%s image was not closed correctly\n",
+ fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
+ res->corruptions++;
+ if (fix & BDRV_FIX_ERRORS) {
+ /* parallels_close will do the job right */
+ res->corruptions_fixed++;
+ s->header_unclean = false;
+ }
+ }
+
+ res->bfi.total_clusters = s->bat_size;
+ res->bfi.compressed_clusters = 0; /* compression is not supported */
+
+ high_off = 0;
+ prev_off = 0;
+ for (i = 0; i < s->bat_size; i++) {
+ int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+ if (off == 0) {
+ prev_off = 0;
+ continue;
+ }
+
+ /* cluster outside the image */
+ if (off > size) {
+ fprintf(stderr, "%s cluster %u is outside image\n",
+ fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
+ res->corruptions++;
+ if (fix & BDRV_FIX_ERRORS) {
+ prev_off = 0;
+ s->bat_bitmap[i] = 0;
+ res->corruptions_fixed++;
+ flush_bat = true;
+ continue;
+ }
+ }
+
+ res->bfi.allocated_clusters++;
+ if (off > high_off) {
+ high_off = off;
+ }
+
+ if (prev_off != 0 && (prev_off + cluster_size) != off) {
+ res->bfi.fragmented_clusters++;
+ }
+ prev_off = off;
+ }
+
+ if (flush_bat) {
+ ret = bdrv_pwrite_sync(bs->file, 0, s->header, s->header_size);
+ if (ret < 0) {
+ res->check_errors++;
+ return ret;
+ }
+ }
+
+ res->image_end_offset = high_off + cluster_size;
+ if (size > res->image_end_offset) {
+ int64_t count;
+ count = DIV_ROUND_UP(size - res->image_end_offset, cluster_size);
+ fprintf(stderr, "%s space leaked at the end of the image %" PRId64 "\n",
+ fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR",
+ size - res->image_end_offset);
+ res->leaks += count;
+ if (fix & BDRV_FIX_LEAKS) {
+ ret = bdrv_truncate(bs->file, res->image_end_offset);
+ if (ret < 0) {
+ res->check_errors++;
+ return ret;
+ }
+ res->leaks_fixed += count;
+ }
+ }
+
+ return 0;
+}
+
+
+static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
+{
+ int64_t total_size, cl_size;
+ uint8_t tmp[BDRV_SECTOR_SIZE];
+ Error *local_err = NULL;
+ BlockDriverState *file;
+ uint32_t bat_entries, bat_sectors;
+ ParallelsHeader header;
+ int ret;
+
+ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+ BDRV_SECTOR_SIZE);
+ cl_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
+ DEFAULT_CLUSTER_SIZE), BDRV_SECTOR_SIZE);
+
+ ret = bdrv_create_file(filename, opts, &local_err);
+ if (ret < 0) {
+ error_propagate(errp, local_err);
+ return ret;
+ }
+
+ file = NULL;
+ ret = bdrv_open(&file, filename, NULL, NULL,
+ BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err);
+ if (ret < 0) {
+ error_propagate(errp, local_err);
+ return ret;
+ }
+ ret = bdrv_truncate(file, 0);
+ if (ret < 0) {
+ goto exit;
+ }
+
+ bat_entries = DIV_ROUND_UP(total_size, cl_size);
+ bat_sectors = DIV_ROUND_UP(bat_entry_off(bat_entries), cl_size);
+ bat_sectors = (bat_sectors * cl_size) >> BDRV_SECTOR_BITS;
+
+ memset(&header, 0, sizeof(header));
+ memcpy(header.magic, HEADER_MAGIC2, sizeof(header.magic));
+ header.version = cpu_to_le32(HEADER_VERSION);
+ /* geometry does not matter much; it is not used at the image level */
+ header.heads = cpu_to_le32(16);
+ header.cylinders = cpu_to_le32(total_size / BDRV_SECTOR_SIZE / 16 / 32);
+ header.tracks = cpu_to_le32(cl_size >> BDRV_SECTOR_BITS);
+ header.bat_entries = cpu_to_le32(bat_entries);
+ header.nb_sectors = cpu_to_le64(DIV_ROUND_UP(total_size, BDRV_SECTOR_SIZE));
+ header.data_off = cpu_to_le32(bat_sectors);
+
+ /* write all the data */
+ memset(tmp, 0, sizeof(tmp));
+ memcpy(tmp, &header, sizeof(header));
+
+ ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE);
+ if (ret < 0) {
+ goto exit;
+ }
+ ret = bdrv_write_zeroes(file, 1, bat_sectors - 1, 0);
+ if (ret < 0) {
+ goto exit;
+ }
+ ret = 0;
+
+done:
+ bdrv_unref(file);
+ return ret;
+
+exit:
+ error_setg_errno(errp, -ret, "Failed to create Parallels image");
+ goto done;
+}
+
+
+static int parallels_probe(const uint8_t *buf, int buf_size,
+ const char *filename)
+{
+ const ParallelsHeader *ph = (const void *)buf;
+
+ if (buf_size < sizeof(ParallelsHeader)) {
+ return 0;
+ }
if ((!memcmp(ph->magic, HEADER_MAGIC, 16) ||
- !memcmp(ph->magic, HEADER_MAGIC2, 16)) &&
- (le32_to_cpu(ph->version) == HEADER_VERSION))
+ !memcmp(ph->magic, HEADER_MAGIC2, 16)) &&
+ (le32_to_cpu(ph->version) == HEADER_VERSION)) {
return 100;
+ }
return 0;
}
+static int parallels_update_header(BlockDriverState *bs)
+{
+ BDRVParallelsState *s = bs->opaque;
+ unsigned size = MAX(bdrv_opt_mem_align(bs->file), sizeof(ParallelsHeader));
+
+ if (size > s->header_size) {
+ size = s->header_size;
+ }
+ return bdrv_pwrite_sync(bs->file, 0, s->header, size);
+}
+
static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
BDRVParallelsState *s = bs->opaque;
- int i;
- struct parallels_header ph;
- int ret;
-
- bs->read_only = 1; // no write support yet
+ ParallelsHeader ph;
+ int ret, size, i;
+ QemuOpts *opts = NULL;
+ Error *local_err = NULL;
+ char *buf;
ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph));
if (ret < 0) {
@@ -115,25 +595,90 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
goto fail;
}
- s->catalog_size = le32_to_cpu(ph.catalog_entries);
- if (s->catalog_size > INT_MAX / 4) {
+ s->bat_size = le32_to_cpu(ph.bat_entries);
+ if (s->bat_size > INT_MAX / sizeof(uint32_t)) {
error_setg(errp, "Catalog too large");
ret = -EFBIG;
goto fail;
}
- s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size);
- if (s->catalog_size && s->catalog_bitmap == NULL) {
+
+ size = bat_entry_off(s->bat_size);
+ s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file));
+ s->header = qemu_try_blockalign(bs->file, s->header_size);
+ if (s->header == NULL) {
ret = -ENOMEM;
goto fail;
}
+ s->data_end = le32_to_cpu(ph.data_off);
+ if (s->data_end == 0) {
+ s->data_end = ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE);
+ }
+ if (s->data_end < s->header_size) {
+ /* there is not enough unused space between the BAT and the actual data
+ to keep the header block-aligned; read-modify-write cannot be avoided */
+ s->header_size = size;
+ }
- ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4);
+ ret = bdrv_pread(bs->file, 0, s->header, s->header_size);
if (ret < 0) {
goto fail;
}
+ s->bat_bitmap = (uint32_t *)(s->header + 1);
+
+ for (i = 0; i < s->bat_size; i++) {
+ int64_t off = bat2sect(s, i);
+ if (off >= s->data_end) {
+ s->data_end = off + s->tracks;
+ }
+ }
- for (i = 0; i < s->catalog_size; i++)
- le32_to_cpus(&s->catalog_bitmap[i]);
+ if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) {
+ /* Image was not closed correctly. The check is mandatory */
+ s->header_unclean = true;
+ if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
+ error_setg(errp, "parallels: Image was not closed correctly; "
+ "cannot be opened read/write");
+ ret = -EACCES;
+ goto fail;
+ }
+ }
+
+ opts = qemu_opts_create(&parallels_runtime_opts, NULL, 0, &local_err);
+ if (local_err != NULL) {
+ goto fail_options;
+ }
+
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (local_err != NULL) {
+ goto fail_options;
+ }
+
+ s->prealloc_size =
+ qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0);
+ s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS);
+ buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE);
+ s->prealloc_mode = qapi_enum_parse(prealloc_mode_lookup, buf,
+ PRL_PREALLOC_MODE_MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err);
+ g_free(buf);
+ if (local_err != NULL) {
+ goto fail_options;
+ }
+ if (!bdrv_has_zero_init(bs->file) ||
+ bdrv_truncate(bs->file, bdrv_getlength(bs->file)) != 0) {
+ s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE;
+ }
+
+ if (flags & BDRV_O_RDWR) {
+ s->header->inuse = cpu_to_le32(HEADER_INUSE_MAGIC);
+ ret = parallels_update_header(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
+ s->bat_dirty_block = 4 * getpagesize();
+ s->bat_dirty_bmap =
+ bitmap_new(DIV_ROUND_UP(s->header_size, s->bat_dirty_block));
qemu_co_mutex_init(&s->lock);
return 0;
@@ -142,67 +687,67 @@ fail_format:
error_setg(errp, "Image not in Parallels format");
ret = -EINVAL;
fail:
- g_free(s->catalog_bitmap);
+ qemu_vfree(s->header);
return ret;
+
+fail_options:
+ error_propagate(errp, local_err);
+ ret = -EINVAL;
+ goto fail;
}
-static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+
+static void parallels_close(BlockDriverState *bs)
{
BDRVParallelsState *s = bs->opaque;
- uint32_t index, offset;
-
- index = sector_num / s->tracks;
- offset = sector_num % s->tracks;
- /* not allocated */
- if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0))
- return -1;
- return
- ((uint64_t)s->catalog_bitmap[index] * s->off_multiplier + offset) * 512;
-}
+ if (bs->open_flags & BDRV_O_RDWR) {
+ s->header->inuse = 0;
+ parallels_update_header(bs);
+ }
-static int parallels_read(BlockDriverState *bs, int64_t sector_num,
- uint8_t *buf, int nb_sectors)
-{
- while (nb_sectors > 0) {
- int64_t position = seek_to_sector(bs, sector_num);
- if (position >= 0) {
- if (bdrv_pread(bs->file, position, buf, 512) != 512)
- return -1;
- } else {
- memset(buf, 0, 512);
- }
- nb_sectors--;
- sector_num++;
- buf += 512;
+ if (bs->open_flags & BDRV_O_RDWR) {
+ bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS);
}
- return 0;
-}
-static coroutine_fn int parallels_co_read(BlockDriverState *bs, int64_t sector_num,
- uint8_t *buf, int nb_sectors)
-{
- int ret;
- BDRVParallelsState *s = bs->opaque;
- qemu_co_mutex_lock(&s->lock);
- ret = parallels_read(bs, sector_num, buf, nb_sectors);
- qemu_co_mutex_unlock(&s->lock);
- return ret;
+ g_free(s->bat_dirty_bmap);
+ qemu_vfree(s->header);
}
-static void parallels_close(BlockDriverState *bs)
-{
- BDRVParallelsState *s = bs->opaque;
- g_free(s->catalog_bitmap);
-}
+static QemuOptsList parallels_create_opts = {
+ .name = "parallels-create-opts",
+ .head = QTAILQ_HEAD_INITIALIZER(parallels_create_opts.head),
+ .desc = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = QEMU_OPT_SIZE,
+ .help = "Virtual disk size",
+ },
+ {
+ .name = BLOCK_OPT_CLUSTER_SIZE,
+ .type = QEMU_OPT_SIZE,
+ .help = "Parallels image cluster size",
+ .def_value_str = stringify(DEFAULT_CLUSTER_SIZE),
+ },
+ { /* end of list */ }
+ }
+};
static BlockDriver bdrv_parallels = {
.format_name = "parallels",
.instance_size = sizeof(BDRVParallelsState),
.bdrv_probe = parallels_probe,
.bdrv_open = parallels_open,
- .bdrv_read = parallels_co_read,
.bdrv_close = parallels_close,
+ .bdrv_co_get_block_status = parallels_co_get_block_status,
+ .bdrv_has_zero_init = bdrv_has_zero_init_1,
+ .bdrv_co_flush_to_os = parallels_co_flush_to_os,
+ .bdrv_co_readv = parallels_co_readv,
+ .bdrv_co_writev = parallels_co_writev,
+
+ .bdrv_create = parallels_create,
+ .bdrv_check = parallels_check,
+ .create_opts = &parallels_create_opts,
};
static void bdrv_parallels_init(void)
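
The parallels rewrite above revolves around the block allocation table (BAT): a guest sector number is split into a cluster index (sector_num / tracks) and an in-cluster offset (sector_num % tracks), and the BAT entry for that index, scaled by off_multiplier, gives the host sector where the cluster starts. A standalone toy version of that arithmetic follows; the BAT contents, tracks and off_multiplier values are invented for the example and do not come from a real image:

#include <stdint.h>
#include <stdio.h>

/* Toy model of the lookup done by bat2sect()/seek_to_sector() above.
 * tracks = sectors per cluster, off_multiplier scales the BAT entries. */
static int64_t toy_seek_to_sector(const uint32_t *bat, unsigned bat_size,
                                  unsigned tracks, unsigned off_multiplier,
                                  int64_t sector_num)
{
    uint32_t index  = sector_num / tracks;   /* which cluster */
    uint32_t offset = sector_num % tracks;   /* sector within the cluster */

    if (index >= bat_size || bat[index] == 0) {
        return -1;                           /* unallocated: reads as zeroes */
    }
    return (int64_t)bat[index] * off_multiplier + offset;
}

int main(void)
{
    /* 1 MiB clusters with 512-byte sectors => tracks = 2048. */
    uint32_t bat[] = { 0, 4096, 0, 6144 };   /* illustrative entries */
    int64_t host = toy_seek_to_sector(bat, 4, 2048, 1, 3 * 2048 + 5);

    printf("guest sector maps to host sector %lld\n", (long long)host);
    /* prints 6149: BAT[3] = 6144 plus the in-cluster offset of 5 */
    return 0;
}
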
diff --git a/block/qapi.c b/block/qapi.c
index 8a19aed44..2ce509711 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -24,6 +24,7 @@
#include "block/qapi.h"
#include "block/block_int.h"
+#include "block/throttle-groups.h"
#include "block/write-threshold.h"
#include "qmp-commands.h"
#include "qapi-visit.h"
@@ -31,8 +32,10 @@
#include "qapi/qmp/types.h"
#include "sysemu/block-backend.h"
-BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
+BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
{
+ ImageInfo **p_image_info;
+ BlockDriverState *bs0;
BlockDeviceInfo *info = g_malloc0(sizeof(*info));
info->file = g_strdup(bs->filename);
@@ -63,7 +66,9 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
if (bs->io_limits_enabled) {
ThrottleConfig cfg;
- throttle_get_config(&bs->throttle_state, &cfg);
+
+ throttle_group_get_config(bs, &cfg);
+
info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg;
info->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg;
@@ -88,10 +93,32 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
info->has_iops_size = cfg.op_size;
info->iops_size = cfg.op_size;
+
+ info->has_group = true;
+ info->group = g_strdup(throttle_group_get_name(bs));
}
info->write_threshold = bdrv_write_threshold_get(bs);
+ bs0 = bs;
+ p_image_info = &info->image;
+ while (1) {
+ Error *local_err = NULL;
+ bdrv_query_image_info(bs0, p_image_info, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ qapi_free_BlockDeviceInfo(info);
+ return NULL;
+ }
+ if (bs0->drv && bs0->backing_hd) {
+ bs0 = bs0->backing_hd;
+ (*p_image_info)->has_backing_image = true;
+ p_image_info = &((*p_image_info)->backing_image);
+ } else {
+ break;
+ }
+ }
+
return info;
}
@@ -264,9 +291,6 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
{
BlockInfo *info = g_malloc0(sizeof(*info));
BlockDriverState *bs = blk_bs(blk);
- BlockDriverState *bs0;
- ImageInfo **p_image_info;
- Error *local_err = NULL;
info->device = g_strdup(blk_name(blk));
info->type = g_strdup("unknown");
info->locked = blk_dev_is_medium_locked(blk);
@@ -289,23 +313,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
if (bs->drv) {
info->has_inserted = true;
- info->inserted = bdrv_block_device_info(bs);
-
- bs0 = bs;
- p_image_info = &info->inserted->image;
- while (1) {
- bdrv_query_image_info(bs0, p_image_info, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- goto err;
- }
- if (bs0->drv && bs0->backing_hd) {
- bs0 = bs0->backing_hd;
- (*p_image_info)->has_backing_image = true;
- p_image_info = &((*p_image_info)->backing_image);
- } else {
- break;
- }
+ info->inserted = bdrv_block_device_info(bs, errp);
+ if (info->inserted == NULL) {
+ goto err;
}
}
@@ -510,18 +520,9 @@ static void dump_qobject(fprintf_function func_fprintf, void *f,
}
case QTYPE_QBOOL: {
QBool *value = qobject_to_qbool(obj);
- func_fprintf(f, "%s", qbool_get_int(value) ? "true" : "false");
- break;
- }
- case QTYPE_QERROR: {
- QString *value = qerror_human((QError *)obj);
- func_fprintf(f, "%s", qstring_get_str(value));
- QDECREF(value);
+ func_fprintf(f, "%s", qbool_get_bool(value) ? "true" : "false");
break;
}
- case QTYPE_NONE:
- break;
- case QTYPE_MAX:
default:
abort();
}
diff --git a/block/qcow.c b/block/qcow.c
index 055896910..01fba54ce 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -25,7 +25,8 @@
#include "block/block_int.h"
#include "qemu/module.h"
#include <zlib.h>
-#include "qemu/aes.h"
+#include "qapi/qmp/qerror.h"
+#include "crypto/cipher.h"
#include "migration/migration.h"
/**************************************************************/
@@ -71,10 +72,8 @@ typedef struct BDRVQcowState {
uint8_t *cluster_cache;
uint8_t *cluster_data;
uint64_t cluster_cache_offset;
- uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ QCryptoCipher *cipher; /* NULL if no key yet */
uint32_t crypt_method_header;
- AES_KEY aes_encrypt_key;
- AES_KEY aes_decrypt_key;
CoMutex lock;
Error *migration_blocker;
} BDRVQcowState;
@@ -123,8 +122,8 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
char version[64];
snprintf(version, sizeof(version), "QCOW version %" PRIu32,
header.version);
- error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "qcow", version);
+ error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bdrv_get_device_or_node_name(bs), "qcow", version);
ret = -ENOTSUP;
goto fail;
}
@@ -153,6 +152,11 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
ret = -EINVAL;
goto fail;
}
+ if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) {
+ error_setg(errp, "AES cipher not available");
+ ret = -EINVAL;
+ goto fail;
+ }
s->crypt_method_header = header.crypt_method;
if (s->crypt_method_header) {
bs->encrypted = 1;
@@ -229,9 +233,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
}
/* Disable migration when qcow images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "qcow", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The qcow format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
qemu_co_mutex_init(&s->lock);
@@ -259,6 +263,7 @@ static int qcow_set_key(BlockDriverState *bs, const char *key)
BDRVQcowState *s = bs->opaque;
uint8_t keybuf[16];
int len, i;
+ Error *err;
memset(keybuf, 0, 16);
len = strlen(key);
@@ -269,38 +274,68 @@ static int qcow_set_key(BlockDriverState *bs, const char *key)
for(i = 0;i < len;i++) {
keybuf[i] = key[i];
}
- s->crypt_method = s->crypt_method_header;
-
- if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
- return -1;
- if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+ assert(bs->encrypted);
+
+ qcrypto_cipher_free(s->cipher);
+ s->cipher = qcrypto_cipher_new(
+ QCRYPTO_CIPHER_ALG_AES_128,
+ QCRYPTO_CIPHER_MODE_CBC,
+ keybuf, G_N_ELEMENTS(keybuf),
+ &err);
+
+ if (!s->cipher) {
+ /* XXX would be nice if errors in this method could
+ * be properly propagated to the caller. Would need
+ * the bdrv_set_key() API signature to be fixed. */
+ error_free(err);
return -1;
+ }
return 0;
}
/* The crypt function is compatible with the linux cryptoloop
algorithm for < 4 GB images. NOTE: out_buf == in_buf is
supported */
-static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
- uint8_t *out_buf, const uint8_t *in_buf,
- int nb_sectors, int enc,
- const AES_KEY *key)
+static int encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, bool enc, Error **errp)
{
union {
uint64_t ll[2];
uint8_t b[16];
} ivec;
int i;
+ int ret;
for(i = 0; i < nb_sectors; i++) {
ivec.ll[0] = cpu_to_le64(sector_num);
ivec.ll[1] = 0;
- AES_cbc_encrypt(in_buf, out_buf, 512, key,
- ivec.b, enc);
+ if (qcrypto_cipher_setiv(s->cipher,
+ ivec.b, G_N_ELEMENTS(ivec.b),
+ errp) < 0) {
+ return -1;
+ }
+ if (enc) {
+ ret = qcrypto_cipher_encrypt(s->cipher,
+ in_buf,
+ out_buf,
+ 512,
+ errp);
+ } else {
+ ret = qcrypto_cipher_decrypt(s->cipher,
+ in_buf,
+ out_buf,
+ 512,
+ errp);
+ }
+ if (ret < 0) {
+ return -1;
+ }
sector_num++;
in_buf += 512;
out_buf += 512;
}
+ return 0;
}
/* 'allocate' is:
@@ -411,17 +446,23 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
/* if encrypted, we must initialize the cluster
content which won't be written */
- if (s->crypt_method &&
+ if (bs->encrypted &&
(n_end - n_start) < s->cluster_sectors) {
uint64_t start_sect;
+ assert(s->cipher);
start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
memset(s->cluster_data + 512, 0x00, 512);
for(i = 0; i < s->cluster_sectors; i++) {
if (i < n_start || i >= n_end) {
- encrypt_sectors(s, start_sect + i,
- s->cluster_data,
- s->cluster_data + 512, 1, 1,
- &s->aes_encrypt_key);
+ Error *err = NULL;
+ if (encrypt_sectors(s, start_sect + i,
+ s->cluster_data,
+ s->cluster_data + 512, 1,
+ true, &err) < 0) {
+ error_free(err);
+ errno = EIO;
+ return -1;
+ }
if (bdrv_pwrite(bs->file, cluster_offset + i * 512,
s->cluster_data, 512) != 512)
return -1;
@@ -461,7 +502,7 @@ static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
if (!cluster_offset) {
return 0;
}
- if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypt_method) {
+ if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->cipher) {
return BDRV_BLOCK_DATA;
}
cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
@@ -528,6 +569,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
QEMUIOVector hd_qiov;
uint8_t *buf;
void *orig_buf;
+ Error *err = NULL;
if (qiov->niov > 1) {
buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
@@ -590,10 +632,12 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
if (ret < 0) {
break;
}
- if (s->crypt_method) {
- encrypt_sectors(s, sector_num, buf, buf,
- n, 0,
- &s->aes_decrypt_key);
+ if (bs->encrypted) {
+ assert(s->cipher);
+ if (encrypt_sectors(s, sector_num, buf, buf,
+ n, false, &err) < 0) {
+ goto fail;
+ }
}
}
ret = 0;
@@ -614,6 +658,7 @@ done:
return ret;
fail:
+ error_free(err);
ret = -EIO;
goto done;
}
@@ -661,12 +706,18 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
ret = -EIO;
break;
}
- if (s->crypt_method) {
+ if (bs->encrypted) {
+ Error *err = NULL;
+ assert(s->cipher);
if (!cluster_data) {
cluster_data = g_malloc0(s->cluster_size);
}
- encrypt_sectors(s, sector_num, cluster_data, buf,
- n, 1, &s->aes_encrypt_key);
+ if (encrypt_sectors(s, sector_num, cluster_data, buf,
+ n, true, &err) < 0) {
+ error_free(err);
+ ret = -EIO;
+ break;
+ }
src_buf = cluster_data;
} else {
src_buf = buf;
@@ -703,6 +754,8 @@ static void qcow_close(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
+ qcrypto_cipher_free(s->cipher);
+ s->cipher = NULL;
g_free(s->l1_table);
qemu_vfree(s->l2_cache);
g_free(s->cluster_cache);
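
Both qcow drivers now route sector encryption through the qcrypto API instead of the bundled AES code, but the on-disk scheme is unchanged: each 512-byte sector is encrypted with AES-128-CBC, and the IV is the 64-bit guest sector number in little-endian order padded with zero bits (the Linux cryptoloop convention the comment refers to). A minimal sketch of a single sector, using only the qcrypto calls that appear in the hunks above; error handling and the surrounding driver state are elided:

/* Illustrative only: encrypt one 512-byte sector the way
 * encrypt_sectors()/qcow2_encrypt_sectors() above do. 'cipher' is an
 * AES-128-CBC QCryptoCipher built from the user-supplied key. */
static int encrypt_one_sector(QCryptoCipher *cipher, int64_t sector_num,
                              const uint8_t *in, uint8_t *out, Error **errp)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec = { .ll = { cpu_to_le64(sector_num), 0 } };

    /* IV = little-endian sector number, upper 64 bits zero */
    if (qcrypto_cipher_setiv(cipher, ivec.b, sizeof(ivec.b), errp) < 0) {
        return -1;
    }
    return qcrypto_cipher_encrypt(cipher, in, out, 512, errp);
}
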
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index b1155492a..53b8afc3d 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -28,62 +28,68 @@
#include "trace.h"
typedef struct Qcow2CachedTable {
- void* table;
- int64_t offset;
- bool dirty;
- int cache_hits;
- int ref;
+ int64_t offset;
+ bool dirty;
+ uint64_t lru_counter;
+ int ref;
} Qcow2CachedTable;
struct Qcow2Cache {
- Qcow2CachedTable* entries;
- struct Qcow2Cache* depends;
+ Qcow2CachedTable *entries;
+ struct Qcow2Cache *depends;
int size;
bool depends_on_flush;
+ void *table_array;
+ uint64_t lru_counter;
};
+static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
+ Qcow2Cache *c, int table)
+{
+ BDRVQcowState *s = bs->opaque;
+ return (uint8_t *) c->table_array + (size_t) table * s->cluster_size;
+}
+
+static inline int qcow2_cache_get_table_idx(BlockDriverState *bs,
+ Qcow2Cache *c, void *table)
+{
+ BDRVQcowState *s = bs->opaque;
+ ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array;
+ int idx = table_offset / s->cluster_size;
+ assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0);
+ return idx;
+}
+
Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
{
BDRVQcowState *s = bs->opaque;
Qcow2Cache *c;
- int i;
c = g_new0(Qcow2Cache, 1);
c->size = num_tables;
c->entries = g_try_new0(Qcow2CachedTable, num_tables);
- if (!c->entries) {
- goto fail;
- }
-
- for (i = 0; i < c->size; i++) {
- c->entries[i].table = qemu_try_blockalign(bs->file, s->cluster_size);
- if (c->entries[i].table == NULL) {
- goto fail;
- }
+ c->table_array = qemu_try_blockalign(bs->file,
+ (size_t) num_tables * s->cluster_size);
+
+ if (!c->entries || !c->table_array) {
+ qemu_vfree(c->table_array);
+ g_free(c->entries);
+ g_free(c);
+ c = NULL;
}
return c;
-
-fail:
- if (c->entries) {
- for (i = 0; i < c->size; i++) {
- qemu_vfree(c->entries[i].table);
- }
- }
- g_free(c->entries);
- g_free(c);
- return NULL;
}
-int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c)
+int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c)
{
int i;
for (i = 0; i < c->size; i++) {
assert(c->entries[i].ref == 0);
- qemu_vfree(c->entries[i].table);
}
+ qemu_vfree(c->table_array);
g_free(c->entries);
g_free(c);
@@ -151,8 +157,8 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
}
- ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
- s->cluster_size);
+ ret = bdrv_pwrite(bs->file, c->entries[i].offset,
+ qcow2_cache_get_table_addr(bs, c, i), s->cluster_size);
if (ret < 0) {
return ret;
}
@@ -228,42 +234,12 @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c)
for (i = 0; i < c->size; i++) {
assert(c->entries[i].ref == 0);
c->entries[i].offset = 0;
- c->entries[i].cache_hits = 0;
+ c->entries[i].lru_counter = 0;
}
- return 0;
-}
-
-static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c)
-{
- int i;
- int min_count = INT_MAX;
- int min_index = -1;
-
-
- for (i = 0; i < c->size; i++) {
- if (c->entries[i].ref) {
- continue;
- }
-
- if (c->entries[i].cache_hits < min_count) {
- min_index = i;
- min_count = c->entries[i].cache_hits;
- }
-
- /* Give newer hits priority */
- /* TODO Check how to optimize the replacement strategy */
- if (c->entries[i].cache_hits > 1) {
- c->entries[i].cache_hits /= 2;
- }
- }
+ c->lru_counter = 0;
- if (min_index == -1) {
- /* This can't happen in current synchronous code, but leave the check
- * here as a reminder for whoever starts using AIO with the cache */
- abort();
- }
- return min_index;
+ return 0;
}
static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
@@ -272,24 +248,39 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
BDRVQcowState *s = bs->opaque;
int i;
int ret;
+ int lookup_index;
+ uint64_t min_lru_counter = UINT64_MAX;
+ int min_lru_index = -1;
trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache,
offset, read_from_disk);
/* Check if the table is already cached */
- for (i = 0; i < c->size; i++) {
- if (c->entries[i].offset == offset) {
+ i = lookup_index = (offset / s->cluster_size * 4) % c->size;
+ do {
+ const Qcow2CachedTable *t = &c->entries[i];
+ if (t->offset == offset) {
goto found;
}
+ if (t->ref == 0 && t->lru_counter < min_lru_counter) {
+ min_lru_counter = t->lru_counter;
+ min_lru_index = i;
+ }
+ if (++i == c->size) {
+ i = 0;
+ }
+ } while (i != lookup_index);
+
+ if (min_lru_index == -1) {
+ /* This can't happen in current synchronous code, but leave the check
+ * here as a reminder for whoever starts using AIO with the cache */
+ abort();
}
- /* If not, write a table back and replace it */
- i = qcow2_cache_find_entry_to_replace(c);
+ /* Cache miss: write a table back and replace it */
+ i = min_lru_index;
trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(),
c == s->l2_table_cache, i);
- if (i < 0) {
- return i;
- }
ret = qcow2_cache_entry_flush(bs, c, i);
if (ret < 0) {
@@ -304,22 +295,19 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
}
- ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
+ ret = bdrv_pread(bs->file, offset, qcow2_cache_get_table_addr(bs, c, i),
+ s->cluster_size);
if (ret < 0) {
return ret;
}
}
- /* Give the table some hits for the start so that it won't be replaced
- * immediately. The number 32 is completely arbitrary. */
- c->entries[i].cache_hits = 32;
c->entries[i].offset = offset;
/* And return the right table */
found:
- c->entries[i].cache_hits++;
c->entries[i].ref++;
- *table = c->entries[i].table;
+ *table = qcow2_cache_get_table_addr(bs, c, i);
trace_qcow2_cache_get_done(qemu_coroutine_self(),
c == s->l2_table_cache, i);
@@ -339,36 +327,24 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
return qcow2_cache_do_get(bs, c, offset, table, false);
}
-int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
+void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
{
- int i;
+ int i = qcow2_cache_get_table_idx(bs, c, *table);
- for (i = 0; i < c->size; i++) {
- if (c->entries[i].table == *table) {
- goto found;
- }
- }
- return -ENOENT;
-
-found:
c->entries[i].ref--;
*table = NULL;
+ if (c->entries[i].ref == 0) {
+ c->entries[i].lru_counter = ++c->lru_counter;
+ }
+
assert(c->entries[i].ref >= 0);
- return 0;
}
-void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
+void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
+ void *table)
{
- int i;
-
- for (i = 0; i < c->size; i++) {
- if (c->entries[i].table == table) {
- goto found;
- }
- }
- abort();
-
-found:
+ int i = qcow2_cache_get_table_idx(bs, c, table);
+ assert(c->entries[i].offset != 0);
c->entries[i].dirty = true;
}
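
The qcow2 cache rewrite above drops the hit-count heuristic in favour of an LRU scheme: lookups start at a slot derived from the table offset and scan the entry array circularly; on a miss, the unreferenced entry with the smallest lru_counter is evicted, and an entry's lru_counter is only refreshed when its last reference is dropped in qcow2_cache_put(). A standalone toy illustration of the victim-selection scan; the entry layout and values are invented for the example:

#include <stdint.h>
#include <stdio.h>

/* Toy version of the hit/miss scan in qcow2_cache_do_get() above. */
struct toy_entry {
    uint64_t offset;       /* 0 = slot unused */
    uint64_t lru_counter;  /* bumped when the last reference goes away */
    int ref;               /* in-use entries are never evicted */
};

static int pick_entry(const struct toy_entry *e, int size, uint64_t offset,
                      int start)
{
    uint64_t min_lru = UINT64_MAX;
    int min_idx = -1;
    int i = start;

    do {
        if (e[i].offset == offset) {
            return i;                          /* cache hit */
        }
        if (e[i].ref == 0 && e[i].lru_counter < min_lru) {
            min_lru = e[i].lru_counter;        /* candidate victim */
            min_idx = i;
        }
        if (++i == size) {
            i = 0;
        }
    } while (i != start);

    return min_idx;                            /* miss: evict the LRU slot */
}

int main(void)
{
    struct toy_entry e[4] = {
        { 0x10000, 7, 0 }, { 0x20000, 3, 0 }, { 0x30000, 9, 1 }, { 0, 0, 0 },
    };

    printf("victim index: %d\n", pick_entry(e, 4, 0x40000, 2));
    /* prints 3: the empty slot is unreferenced and has the lowest counter */
    return 0;
}
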
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index ed2b44d29..7e94fe70e 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -253,17 +253,14 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
memcpy(l2_table, old_table, s->cluster_size);
- ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
- if (ret < 0) {
- goto fail;
- }
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
}
/* write the l2 table to the file */
BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
trace_qcow2_l2_allocate_write_l2(bs, l1_index);
- qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
ret = qcow2_cache_flush(bs, s->l2_table_cache);
if (ret < 0) {
goto fail;
@@ -301,7 +298,7 @@ fail:
* as contiguous. (This allows it, for example, to stop at the first compressed
* cluster which may require a different handling)
*/
-static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
+static int count_contiguous_clusters(int nb_clusters, int cluster_size,
uint64_t *l2_table, uint64_t stop_flags)
{
int i;
@@ -324,7 +321,7 @@ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
return i;
}
-static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
+static int count_contiguous_free_clusters(int nb_clusters, uint64_t *l2_table)
{
int i;
@@ -342,26 +339,47 @@ static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_tab
/* The crypt function is compatible with the linux cryptoloop
algorithm for < 4 GB images. NOTE: out_buf == in_buf is
supported */
-void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
- uint8_t *out_buf, const uint8_t *in_buf,
- int nb_sectors, int enc,
- const AES_KEY *key)
+int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, bool enc,
+ Error **errp)
{
union {
uint64_t ll[2];
uint8_t b[16];
} ivec;
int i;
+ int ret;
for(i = 0; i < nb_sectors; i++) {
ivec.ll[0] = cpu_to_le64(sector_num);
ivec.ll[1] = 0;
- AES_cbc_encrypt(in_buf, out_buf, 512, key,
- ivec.b, enc);
+ if (qcrypto_cipher_setiv(s->cipher,
+ ivec.b, G_N_ELEMENTS(ivec.b),
+ errp) < 0) {
+ return -1;
+ }
+ if (enc) {
+ ret = qcrypto_cipher_encrypt(s->cipher,
+ in_buf,
+ out_buf,
+ 512,
+ errp);
+ } else {
+ ret = qcrypto_cipher_decrypt(s->cipher,
+ in_buf,
+ out_buf,
+ 512,
+ errp);
+ }
+ if (ret < 0) {
+ return -1;
+ }
sector_num++;
in_buf += 512;
out_buf += 512;
}
+ return 0;
}
static int coroutine_fn copy_sectors(BlockDriverState *bs,
@@ -403,10 +421,16 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
goto out;
}
- if (s->crypt_method) {
- qcow2_encrypt_sectors(s, start_sect + n_start,
- iov.iov_base, iov.iov_base, n, 1,
- &s->aes_encrypt_key);
+ if (bs->encrypted) {
+ Error *err = NULL;
+ assert(s->cipher);
+ if (qcow2_encrypt_sectors(s, start_sect + n_start,
+ iov.iov_base, iov.iov_base, n,
+ true, &err) < 0) {
+ ret = -EIO;
+ error_free(err);
+ goto out;
+ }
}
ret = qcow2_pre_write_overlap_check(bs, 0,
@@ -471,6 +495,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
if (nb_needed > nb_available) {
nb_needed = nb_available;
}
+ assert(nb_needed <= INT_MAX);
*cluster_offset = 0;
@@ -506,6 +531,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
*cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+ /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */
nb_clusters = size_to_clusters(s, nb_needed << 9);
ret = qcow2_get_cluster_type(*cluster_offset);
@@ -692,12 +719,9 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
/* compressed clusters never have the copied flag */
BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
- qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
l2_table[l2_index] = cpu_to_be64(cluster_offset);
- ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
- if (ret < 0) {
- return 0;
- }
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
return cluster_offset;
}
@@ -771,7 +795,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
if (ret < 0) {
goto err;
}
- qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
assert(l2_index + m->nb_clusters <= s->l2_size);
for (i = 0; i < m->nb_clusters; i++) {
@@ -789,10 +813,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
}
- ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
- if (ret < 0) {
- goto err;
- }
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
/*
* If this was a COW, we need to decrease the refcount of the old cluster.
@@ -942,9 +963,9 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
int l2_index;
uint64_t cluster_offset;
uint64_t *l2_table;
- unsigned int nb_clusters;
+ uint64_t nb_clusters;
unsigned int keep_clusters;
- int ret, pret;
+ int ret;
trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
*bytes);
@@ -961,6 +982,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
l2_index = offset_to_l2_index(s, guest_offset);
nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+ assert(nb_clusters <= INT_MAX);
/* Find L2 entry for the first involved cluster */
ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
@@ -1011,10 +1033,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
/* Cleanup */
out:
- pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
- if (pret < 0) {
- return pret;
- }
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
/* Only return a host offset if we actually made progress. Otherwise we
* would make requirements for handle_alloc() that it can't fulfill */
@@ -1046,7 +1065,7 @@ out:
* restarted, but the whole request should not be failed.
*/
static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
- uint64_t *host_offset, unsigned int *nb_clusters)
+ uint64_t *host_offset, uint64_t *nb_clusters)
{
BDRVQcowState *s = bs->opaque;
@@ -1064,7 +1083,7 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
*host_offset = cluster_offset;
return 0;
} else {
- int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
+ int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
if (ret < 0) {
return ret;
}
@@ -1100,7 +1119,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
int l2_index;
uint64_t *l2_table;
uint64_t entry;
- unsigned int nb_clusters;
+ uint64_t nb_clusters;
int ret;
uint64_t alloc_cluster_offset;
@@ -1118,6 +1137,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
l2_index = offset_to_l2_index(s, guest_offset);
nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+ assert(nb_clusters <= INT_MAX);
/* Find L2 entry for the first involved cluster */
ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
@@ -1139,10 +1159,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
* wrong with our code. */
assert(nb_clusters > 0);
- ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
- if (ret < 0) {
- return ret;
- }
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
/* Allocate, if necessary at a given offset in the image file */
alloc_cluster_offset = start_of_cluster(s, *host_offset);
@@ -1414,7 +1431,8 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
* clusters.
*/
static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
- unsigned int nb_clusters, enum qcow2_discard_type type, bool full_discard)
+ uint64_t nb_clusters, enum qcow2_discard_type type,
+ bool full_discard)
{
BDRVQcowState *s = bs->opaque;
uint64_t *l2_table;
@@ -1429,6 +1447,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
/* Limit nb_clusters to one L2 table */
nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+ assert(nb_clusters <= INT_MAX);
for (i = 0; i < nb_clusters; i++) {
uint64_t old_l2_entry;
@@ -1470,7 +1489,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
}
/* First remove L2 entries */
- qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
if (!full_discard && s->qcow_version >= 3) {
l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
} else {
@@ -1481,10 +1500,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
}
- ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
- if (ret < 0) {
- return ret;
- }
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
return nb_clusters;
}
@@ -1494,7 +1510,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
{
BDRVQcowState *s = bs->opaque;
uint64_t end_offset;
- unsigned int nb_clusters;
+ uint64_t nb_clusters;
int ret;
end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
@@ -1536,7 +1552,7 @@ fail:
* clusters.
*/
static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
- unsigned int nb_clusters)
+ uint64_t nb_clusters)
{
BDRVQcowState *s = bs->opaque;
uint64_t *l2_table;
@@ -1551,6 +1567,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
/* Limit nb_clusters to one L2 table */
nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+ assert(nb_clusters <= INT_MAX);
for (i = 0; i < nb_clusters; i++) {
uint64_t old_offset;
@@ -1558,7 +1575,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
old_offset = be64_to_cpu(l2_table[l2_index + i]);
/* Update L2 entries */
- qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
if (old_offset & QCOW_OFLAG_COMPRESSED) {
l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
@@ -1567,10 +1584,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
}
}
- ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
- if (ret < 0) {
- return ret;
- }
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
return nb_clusters;
}
@@ -1578,7 +1592,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
{
BDRVQcowState *s = bs->opaque;
- unsigned int nb_clusters;
+ uint64_t nb_clusters;
int ret;
/* The zero flag is only supported by version 3 and newer */
@@ -1760,14 +1774,10 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
if (is_active_l1) {
if (l2_dirty) {
- qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
qcow2_cache_depends_on_flush(s->l2_table_cache);
}
- ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
- if (ret < 0) {
- l2_table = NULL;
- goto fail;
- }
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
} else {
if (l2_dirty) {
ret = qcow2_pre_write_overlap_check(bs,
@@ -1798,12 +1808,7 @@ fail:
if (!is_active_l1) {
qemu_vfree(l2_table);
} else {
- if (ret < 0) {
- qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
- } else {
- ret = qcow2_cache_put(bs, s->l2_table_cache,
- (void **)&l2_table);
- }
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
}
}
return ret;
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 63c00858d..0b6c302ee 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -265,10 +265,7 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
block_index = cluster_index & (s->refcount_block_size - 1);
*refcount = s->get_refcount(refcount_block, block_index);
- ret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
- if (ret < 0) {
- return ret;
- }
+ qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
return 0;
}
@@ -424,7 +421,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
/* Now the new refcount block needs to be written to disk */
BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
- qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
+ qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
ret = qcow2_cache_flush(bs, s->refcount_block_cache);
if (ret < 0) {
goto fail_block;
@@ -448,10 +445,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
return -EAGAIN;
}
- ret = qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
- if (ret < 0) {
- goto fail_block;
- }
+ qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
/*
* If we come here, we need to grow the refcount table. Again, a new
@@ -723,13 +717,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
/* Load the refcount block and allocate it if needed */
if (table_index != old_table_index) {
if (refcount_block) {
- ret = qcow2_cache_put(bs, s->refcount_block_cache,
- &refcount_block);
- if (ret < 0) {
- goto fail;
- }
+ qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
}
-
ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
if (ret < 0) {
goto fail;
@@ -737,7 +726,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
}
old_table_index = table_index;
- qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
+ qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
+ refcount_block);
/* we can update the count and save it */
block_index = cluster_index & (s->refcount_block_size - 1);
@@ -773,11 +763,7 @@ fail:
/* Write last changed block to disk */
if (refcount_block) {
- int wret;
- wret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
- if (wret < 0) {
- return ret < 0 ? ret : wret;
- }
+ qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
}
/*
@@ -889,8 +875,8 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
return offset;
}
-int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
- int nb_clusters)
+int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+ int64_t nb_clusters)
{
BDRVQcowState *s = bs->opaque;
uint64_t cluster_index, refcount;
@@ -954,19 +940,21 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
}
free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
- if (!offset || free_in_cluster < size) {
- int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
- if (new_cluster < 0) {
- return new_cluster;
- }
+ do {
+ if (!offset || free_in_cluster < size) {
+ int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
+ if (new_cluster < 0) {
+ return new_cluster;
+ }
- if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
- offset = new_cluster;
+ if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
+ offset = new_cluster;
+ }
}
- }
- assert(offset);
- ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
+ assert(offset);
+ ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
+ } while (ret == -EAGAIN);
if (ret < 0) {
return ret;
}
@@ -1182,15 +1170,12 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
s->refcount_block_cache);
}
l2_table[j] = cpu_to_be64(offset);
- qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+ qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
+ l2_table);
}
}
- ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
- if (ret < 0) {
- goto fail;
- }
-
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
if (addend != 0) {
ret = qcow2_update_cluster_refcount(bs, l2_offset >>
@@ -1274,7 +1259,7 @@ static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries)
static int realloc_refcount_array(BDRVQcowState *s, void **array,
int64_t *size, int64_t new_size)
{
- size_t old_byte_size, new_byte_size;
+ int64_t old_byte_size, new_byte_size;
void *new_ptr;
/* Round to clusters so the array can be directly written to disk */
@@ -1290,13 +1275,17 @@ static int realloc_refcount_array(BDRVQcowState *s, void **array,
assert(new_byte_size > 0);
+ if (new_byte_size > SIZE_MAX) {
+ return -ENOMEM;
+ }
+
new_ptr = g_try_realloc(*array, new_byte_size);
if (!new_ptr) {
return -ENOMEM;
}
if (new_byte_size > old_byte_size) {
- memset((void *)((uintptr_t)new_ptr + old_byte_size), 0,
+ memset((char *)new_ptr + old_byte_size, 0,
new_byte_size - old_byte_size);
}
@@ -2455,7 +2444,7 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
if (ret < 0) {
return ret;
} else if (ret > 0) {
- int metadata_ol_bitnr = ffs(ret) - 1;
+ int metadata_ol_bitnr = ctz32(ret);
assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
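
One small change in the refcount hunks above replaces ffs(ret) - 1 with ctz32(ret) when turning the overlap-check bitmask into a bit number. For any non-zero 32-bit value the two expressions agree (both give the index of the least-significant set bit); ctz32 simply makes the operand width explicit. A quick standalone check of that equivalence, using __builtin_ctz as a stand-in for QEMU's ctz32:

#include <stdint.h>
#include <stdio.h>
#include <strings.h>   /* ffs() */

/* Stand-in for QEMU's ctz32(): count trailing zero bits of a 32-bit value. */
static int my_ctz32(uint32_t v)
{
    return __builtin_ctz(v);   /* caller guarantees v != 0 */
}

int main(void)
{
    uint32_t masks[] = { 0x1, 0x8, 0x40, 0x200 };  /* example overlap bits */

    for (unsigned i = 0; i < 4; i++) {
        printf("mask 0x%x: ffs()-1 = %d, ctz32 = %d\n",
               (unsigned)masks[i], ffs(masks[i]) - 1, my_ctz32(masks[i]));
    }
    return 0;
}
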
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 2aa9dcb1d..b6f58c13e 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -25,6 +25,7 @@
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
+#include "qemu/error-report.h"
void qcow2_free_snapshots(BlockDriverState *bs)
{
@@ -351,10 +352,8 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
memset(sn, 0, sizeof(*sn));
- /* Generate an ID if it wasn't passed */
- if (sn_info->id_str[0] == '\0') {
- find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
- }
+ /* Generate an ID */
+ find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
/* Check that the ID is unique */
if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) {
diff --git a/block/qcow2.c b/block/qcow2.c
index 316a8db22..76c331b38 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -25,7 +25,6 @@
#include "block/block_int.h"
#include "qemu/module.h"
#include <zlib.h>
-#include "qemu/aes.h"
#include "block/qcow2.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qerror.h"
@@ -207,8 +206,8 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
vsnprintf(msg, sizeof(msg), fmt, ap);
va_end(ap);
- error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "qcow2", msg);
+ error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bdrv_get_device_or_node_name(bs), "qcow2", msg);
}
static void report_unsupported_feature(BlockDriverState *bs,
@@ -483,9 +482,11 @@ static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
[QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2,
};
-static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size,
+static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
+ uint64_t *l2_cache_size,
uint64_t *refcount_cache_size, Error **errp)
{
+ BDRVQcowState *s = bs->opaque;
uint64_t combined_cache_size;
bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
@@ -525,7 +526,9 @@ static void read_cache_sizes(QemuOpts *opts, uint64_t *l2_cache_size,
}
} else {
if (!l2_cache_size_set && !refcount_cache_size_set) {
- *l2_cache_size = DEFAULT_L2_CACHE_BYTE_SIZE;
+ *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
+ (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
+ * s->cluster_size);
*refcount_cache_size = *l2_cache_size
/ DEFAULT_L2_REFCOUNT_SIZE_RATIO;
} else if (!l2_cache_size_set) {
@@ -695,6 +698,11 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
ret = -EINVAL;
goto fail;
}
+ if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) {
+ error_setg(errp, "AES cipher not available");
+ ret = -EINVAL;
+ goto fail;
+ }
s->crypt_method_header = header.crypt_method;
if (s->crypt_method_header) {
bs->encrypted = 1;
@@ -803,7 +811,8 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
goto fail;
}
- read_cache_sizes(opts, &l2_cache_size, &refcount_cache_size, &local_err);
+ read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size,
+ &local_err);
if (local_err) {
error_propagate(errp, local_err);
ret = -EINVAL;
@@ -1027,6 +1036,7 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
BDRVQcowState *s = bs->opaque;
uint8_t keybuf[16];
int len, i;
+ Error *err = NULL;
memset(keybuf, 0, 16);
len = strlen(key);
@@ -1037,30 +1047,22 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
for(i = 0;i < len;i++) {
keybuf[i] = key[i];
}
- s->crypt_method = s->crypt_method_header;
+ assert(bs->encrypted);
- if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
- return -1;
- if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+ qcrypto_cipher_free(s->cipher);
+ s->cipher = qcrypto_cipher_new(
+ QCRYPTO_CIPHER_ALG_AES_128,
+ QCRYPTO_CIPHER_MODE_CBC,
+ keybuf, G_N_ELEMENTS(keybuf),
+ &err);
+
+ if (!s->cipher) {
+ /* XXX would be nice if errors in this method could
+ * be properly propagated to the caller. Would need
+ * the bdrv_set_key() API signature to be fixed. */
+ error_free(err);
return -1;
-#if 0
- /* test */
- {
- uint8_t in[16];
- uint8_t out[16];
- uint8_t tmp[16];
- for(i=0;i<16;i++)
- in[i] = i;
- AES_encrypt(in, tmp, &s->aes_encrypt_key);
- AES_decrypt(tmp, out, &s->aes_decrypt_key);
- for(i = 0; i < 16; i++)
- printf(" %02x", tmp[i]);
- printf("\n");
- for(i = 0; i < 16; i++)
- printf(" %02x", out[i]);
- printf("\n");
}
-#endif
return 0;
}
@@ -1103,7 +1105,7 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
}
if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
- !s->crypt_method) {
+ !s->cipher) {
index_in_cluster = sector_num & (s->cluster_sectors - 1);
cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
@@ -1153,7 +1155,7 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
/* prepare next request */
cur_nr_sectors = remaining_sectors;
- if (s->crypt_method) {
+ if (s->cipher) {
cur_nr_sectors = MIN(cur_nr_sectors,
QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
}
@@ -1224,7 +1226,9 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
goto fail;
}
- if (s->crypt_method) {
+ if (bs->encrypted) {
+ assert(s->cipher);
+
/*
* For encrypted images, read everything into a temporary
* contiguous buffer on which the AES functions can work.
@@ -1255,9 +1259,16 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
if (ret < 0) {
goto fail;
}
- if (s->crypt_method) {
- qcow2_encrypt_sectors(s, sector_num, cluster_data,
- cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
+ if (bs->encrypted) {
+ assert(s->cipher);
+ Error *err = NULL;
+ if (qcow2_encrypt_sectors(s, sector_num, cluster_data,
+ cluster_data, cur_nr_sectors, false,
+ &err) < 0) {
+ error_free(err);
+ ret = -EIO;
+ goto fail;
+ }
qemu_iovec_from_buf(qiov, bytes_done,
cluster_data, 512 * cur_nr_sectors);
}
@@ -1315,7 +1326,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
trace_qcow2_writev_start_part(qemu_coroutine_self());
index_in_cluster = sector_num & (s->cluster_sectors - 1);
cur_nr_sectors = remaining_sectors;
- if (s->crypt_method &&
+ if (bs->encrypted &&
cur_nr_sectors >
QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) {
cur_nr_sectors =
@@ -1334,7 +1345,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
cur_nr_sectors * 512);
- if (s->crypt_method) {
+ if (bs->encrypted) {
+ Error *err = NULL;
+ assert(s->cipher);
if (!cluster_data) {
cluster_data = qemu_try_blockalign(bs->file,
QCOW_MAX_CRYPT_CLUSTERS
@@ -1349,8 +1362,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
- qcow2_encrypt_sectors(s, sector_num, cluster_data,
- cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
+ if (qcow2_encrypt_sectors(s, sector_num, cluster_data,
+ cluster_data, cur_nr_sectors,
+ true, &err) < 0) {
+ error_free(err);
+ ret = -EIO;
+ goto fail;
+ }
qemu_iovec_reset(&hd_qiov);
qemu_iovec_add(&hd_qiov, cluster_data,
@@ -1456,6 +1474,9 @@ static void qcow2_close(BlockDriverState *bs)
qcow2_cache_destroy(bs, s->l2_table_cache);
qcow2_cache_destroy(bs, s->refcount_block_cache);
+ qcrypto_cipher_free(s->cipher);
+ s->cipher = NULL;
+
g_free(s->unknown_header_fields);
cleanup_unknown_header_ext(bs);
@@ -1472,9 +1493,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)
{
BDRVQcowState *s = bs->opaque;
int flags = s->flags;
- AES_KEY aes_encrypt_key;
- AES_KEY aes_decrypt_key;
- uint32_t crypt_method = 0;
+ QCryptoCipher *cipher = NULL;
QDict *options;
Error *local_err = NULL;
int ret;
@@ -1484,11 +1503,8 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)
* that means we don't have to worry about reopening them here.
*/
- if (s->crypt_method) {
- crypt_method = s->crypt_method;
- memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
- memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
- }
+ cipher = s->cipher;
+ s->cipher = NULL;
qcow2_close(bs);
@@ -1513,11 +1529,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)
return;
}
- if (crypt_method) {
- s->crypt_method = crypt_method;
- memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
- memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
- }
+ s->cipher = cipher;
}
static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
@@ -1802,7 +1814,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
{
/* Calculate cluster_bits */
int cluster_bits;
- cluster_bits = ffs(cluster_size) - 1;
+ cluster_bits = ctz32(cluster_size);
if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
(1 << cluster_bits) != cluster_size)
{
@@ -2110,7 +2122,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
goto finish;
}
- refcount_order = ffs(refcount_bits) - 1;
+ refcount_order = ctz32(refcount_bits);
ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
cluster_size, prealloc, opts, version, refcount_order,
@@ -2718,8 +2730,9 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
} else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) {
encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT,
- s->crypt_method);
- if (encrypt != !!s->crypt_method) {
+ !!s->cipher);
+
+ if (encrypt != !!s->cipher) {
fprintf(stderr, "Changing the encryption flag is not "
"supported.\n");
return -ENOTSUP;
@@ -2824,6 +2837,7 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
int64_t size, const char *message_format, ...)
{
BDRVQcowState *s = bs->opaque;
+ const char *node_name;
char *message;
va_list ap;
@@ -2847,8 +2861,11 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
"corruption events will be suppressed\n", message);
}
- qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message,
- offset >= 0, offset, size >= 0, size,
+ node_name = bdrv_get_node_name(bs);
+ qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
+ *node_name != '\0', node_name,
+ message, offset >= 0, offset,
+ size >= 0, size,
fatal, &error_abort);
g_free(message);
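
The qcow2.c hunks above replace the pair of AES_KEY schedules with a single opaque QCryptoCipher handle: it is created in qcow2_set_key(), released in qcow2_close(), and handed across qcow2_invalidate_cache() instead of copying raw key schedules around. The standalone sketch below mirrors only that ownership pattern with a toy cipher type; the names are illustrative and none of this is the QEMU qcrypto API.

/* Illustrative analogue of the qcow2 re-key pattern: one opaque handle
 * owns the key material, and setting a new key replaces the old handle. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct ToyCipher {          /* stands in for QCryptoCipher */
    unsigned char key[16];
} ToyCipher;

static ToyCipher *toy_cipher_new(const unsigned char *key, size_t len)
{
    ToyCipher *c;
    if (len != 16) {
        return NULL;                /* report failure instead of aborting */
    }
    c = malloc(sizeof(*c));
    memcpy(c->key, key, len);
    return c;
}

static void toy_cipher_free(ToyCipher *c)
{
    free(c);
}

typedef struct ToyState {
    ToyCipher *cipher;              /* NULL until a key has been set */
} ToyState;

static int toy_set_key(ToyState *s, const char *key)
{
    unsigned char keybuf[16] = { 0 };
    size_t len = strlen(key);

    memcpy(keybuf, key, len > 16 ? 16 : len);

    toy_cipher_free(s->cipher);     /* drop any previous key material */
    s->cipher = toy_cipher_new(keybuf, sizeof(keybuf));
    return s->cipher ? 0 : -1;
}

int main(void)
{
    ToyState s = { NULL };
    printf("first key:  %d\n", toy_set_key(&s, "secret"));
    printf("second key: %d\n", toy_set_key(&s, "rotated")); /* old handle freed */
    toy_cipher_free(s.cipher);
    return 0;
}
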
diff --git a/block/qcow2.h b/block/qcow2.h
index 2f2094959..4b5a6afc8 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -25,7 +25,7 @@
#ifndef BLOCK_QCOW2_H
#define BLOCK_QCOW2_H
-#include "qemu/aes.h"
+#include "crypto/cipher.h"
#include "block/coroutine.h"
//#define DEBUG_ALLOC
@@ -68,6 +68,8 @@
/* Must be at least 4 to cover all cases of refcount table growth */
#define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
+/* Whichever is more */
+#define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */
#define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */
/* The refblock cache needs only a fourth of the L2 cache size to cover as many
@@ -251,10 +253,8 @@ typedef struct BDRVQcowState {
CoMutex lock;
- uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ QCryptoCipher *cipher; /* current cipher, NULL if no key yet */
uint32_t crypt_method_header;
- AES_KEY aes_encrypt_key;
- AES_KEY aes_decrypt_key;
uint64_t snapshots_offset;
int snapshots_size;
unsigned int nb_snapshots;
@@ -412,7 +412,7 @@ static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset)
return offset & (s->cluster_size - 1);
}
-static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
+static inline uint64_t size_to_clusters(BDRVQcowState *s, uint64_t size)
{
return (size + (s->cluster_size - 1)) >> s->cluster_bits;
}
@@ -506,8 +506,8 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
enum qcow2_discard_type type);
int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size);
-int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
- int nb_clusters);
+int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+ int64_t nb_clusters);
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
void qcow2_free_clusters(BlockDriverState *bs,
int64_t offset, int64_t size,
@@ -534,10 +534,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
void qcow2_l2_cache_reset(BlockDriverState *bs);
int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
-void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
- uint8_t *out_buf, const uint8_t *in_buf,
- int nb_sectors, int enc,
- const AES_KEY *key);
+int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+ uint8_t *out_buf, const uint8_t *in_buf,
+ int nb_sectors, bool enc, Error **errp);
int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
int *num, uint64_t *cluster_offset);
@@ -575,7 +574,8 @@ int qcow2_read_snapshots(BlockDriverState *bs);
Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
-void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
+void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
+ void *table);
int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
Qcow2Cache *dependency);
@@ -587,6 +587,6 @@ int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
void **table);
int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
void **table);
-int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
+void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
#endif
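
DEFAULT_L2_CACHE_CLUSTERS pairs with the read_cache_sizes() change earlier in the patch: the default L2 cache becomes the larger of 1 MiB and 8 clusters, so images with large clusters still get enough cache to map a useful amount of guest data. The small program below only evaluates that expression for a few cluster sizes; the two constants are copied from the hunks above, the rest is illustrative.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define DEFAULT_L2_CACHE_CLUSTERS  8          /* from qcow2.h above */
#define DEFAULT_L2_CACHE_BYTE_SIZE 1048576    /* 1 MiB */

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
    uint64_t cluster_sizes[] = { 64 * 1024, 512 * 1024, 2 * 1024 * 1024 };
    for (unsigned i = 0; i < sizeof(cluster_sizes) / sizeof(cluster_sizes[0]); i++) {
        uint64_t cs = cluster_sizes[i];
        uint64_t l2 = MAX((uint64_t)DEFAULT_L2_CACHE_BYTE_SIZE,
                          (uint64_t)DEFAULT_L2_CACHE_CLUSTERS * cs);
        printf("cluster %7" PRIu64 " B -> default L2 cache %" PRIu64 " B\n", cs, l2);
    }
    return 0;
}

With 64 KiB clusters the 1 MiB byte default still wins; with 2 MiB clusters the cluster-based minimum dominates and the default grows to 16 MiB.
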
diff --git a/block/qed.c b/block/qed.c
index 892b13c80..954ed007c 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -407,8 +407,8 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
char buf[64];
snprintf(buf, sizeof(buf), "%" PRIx64,
s->header.features & ~QED_FEATURE_MASK);
- error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "QED", buf);
+ error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bdrv_get_device_or_node_name(bs), "QED", buf);
return -ENOTSUP;
}
if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
@@ -436,9 +436,9 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
s->table_nelems = (s->header.cluster_size * s->header.table_size) /
sizeof(uint64_t);
- s->l2_shift = ffs(s->header.cluster_size) - 1;
+ s->l2_shift = ctz32(s->header.cluster_size);
s->l2_mask = s->table_nelems - 1;
- s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
+ s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
/* Header size calculation must not overflow uint32_t */
if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
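
Several hunks in this patch (qcow2.c, qed.c, rbd.c, sheepdog.c) replace ffs(x) - 1 with ctz32(x) when turning a power-of-two size into a shift. For non-zero x the two expressions agree; the difference is x == 0, where ffs() returns 0 (so ffs(x) - 1 is -1) while QEMU's ctz32() is, to the best of my reading, defined to return 32. The check below uses the GCC/Clang builtin as a stand-in for ctz32(); treat that mapping as an assumption of the sketch.

#include <stdio.h>
#include <stdint.h>
#include <strings.h>   /* ffs() */

/* Stand-in for QEMU's ctz32(); the 0 -> 32 convention is assumed here. */
static int my_ctz32(uint32_t val)
{
    return val ? __builtin_ctz(val) : 32;
}

int main(void)
{
    uint32_t sizes[] = { 512, 4096, 65536, 1u << 20 };
    for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
        uint32_t v = sizes[i];
        printf("%8u: ffs-1 = %2d  ctz32 = %2d\n", v, ffs(v) - 1, my_ctz32(v));
    }
    /* The interesting divergence is at 0, which the callers above never pass
     * because cluster/object sizes are validated as powers of two first. */
    printf("       0: ffs-1 = %2d  ctz32 = %2d\n", ffs(0) - 1, my_ctz32(0));
    return 0;
}
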
diff --git a/block/quorum.c b/block/quorum.c
index 437b12251..2f6c45f76 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -13,16 +13,16 @@
* See the COPYING file in the top-level directory.
*/
-#include <gnutls/gnutls.h>
-#include <gnutls/crypto.h>
#include "block/block_int.h"
#include "qapi/qmp/qbool.h"
#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qint.h"
#include "qapi/qmp/qjson.h"
#include "qapi/qmp/qlist.h"
#include "qapi/qmp/qstring.h"
#include "qapi-event.h"
+#include "crypto/hash.h"
#define HASH_LENGTH 32
@@ -33,7 +33,7 @@
/* This union holds a vote hash value */
typedef union QuorumVoteValue {
- char h[HASH_LENGTH]; /* SHA-256 hash */
+ uint8_t h[HASH_LENGTH]; /* SHA-256 hash */
int64_t l; /* simpler 64 bits hash */
} QuorumVoteValue;
@@ -226,10 +226,7 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret)
static void quorum_report_failure(QuorumAIOCB *acb)
{
- const char *reference = bdrv_get_device_name(acb->common.bs)[0] ?
- bdrv_get_device_name(acb->common.bs) :
- acb->common.bs->node_name;
-
+ const char *reference = bdrv_get_device_or_node_name(acb->common.bs);
qapi_event_send_quorum_failure(reference, acb->sector_num,
acb->nb_sectors, &error_abort);
}
@@ -430,25 +427,21 @@ static void quorum_free_vote_list(QuorumVotes *votes)
static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash)
{
- int j, ret;
- gnutls_hash_hd_t dig;
QEMUIOVector *qiov = &acb->qcrs[i].qiov;
-
- ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
-
- if (ret < 0) {
- return ret;
+ size_t len = sizeof(hash->h);
+ uint8_t *data = hash->h;
+
+ /* XXX - would be nice if we could pass in the Error **
+ * and propagate that back, but this quorum code is
+ * restricted to just errno values currently */
+ if (qcrypto_hash_bytesv(QCRYPTO_HASH_ALG_SHA256,
+ qiov->iov, qiov->niov,
+ &data, &len,
+ NULL) < 0) {
+ return -EINVAL;
}
- for (j = 0; j < qiov->niov; j++) {
- ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len);
- if (ret < 0) {
- break;
- }
- }
-
- gnutls_hash_deinit(dig, (void *) hash);
- return ret;
+ return 0;
}
static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
@@ -803,8 +796,8 @@ static int quorum_valid_threshold(int threshold, int num_children, Error **errp)
{
if (threshold < 1) {
- error_set(errp, QERR_INVALID_PARAMETER_VALUE,
- "vote-threshold", "value >= 1");
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+ "vote-threshold", "value >= 1");
return -ERANGE;
}
@@ -869,25 +862,18 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
Error *local_err = NULL;
QemuOpts *opts = NULL;
bool *opened;
- QDict *sub = NULL;
- QList *list = NULL;
- const QListEntry *lentry;
int i;
int ret = 0;
qdict_flatten(options);
- qdict_extract_subqdict(options, &sub, "children.");
- qdict_array_split(sub, &list);
- if (qdict_size(sub)) {
- error_setg(&local_err, "Invalid option children.%s",
- qdict_first(sub)->key);
+ /* count how many different children are present */
+ s->num_children = qdict_array_entries(options, "children.");
+ if (s->num_children < 0) {
+ error_setg(&local_err, "Option children is not a valid array");
ret = -EINVAL;
goto exit;
}
-
- /* count how many different children are present */
- s->num_children = qlist_size(list);
if (s->num_children < 2) {
error_setg(&local_err,
"Number of provided children must be greater than 1");
@@ -940,37 +926,17 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
s->bs = g_new0(BlockDriverState *, s->num_children);
opened = g_new0(bool, s->num_children);
- for (i = 0, lentry = qlist_first(list); lentry;
- lentry = qlist_next(lentry), i++) {
- QDict *d;
- QString *string;
-
- switch (qobject_type(lentry->value))
- {
- /* List of options */
- case QTYPE_QDICT:
- d = qobject_to_qdict(lentry->value);
- QINCREF(d);
- ret = bdrv_open(&s->bs[i], NULL, NULL, d, flags, NULL,
- &local_err);
- break;
-
- /* QMP reference */
- case QTYPE_QSTRING:
- string = qobject_to_qstring(lentry->value);
- ret = bdrv_open(&s->bs[i], NULL, qstring_get_str(string), NULL,
- flags, NULL, &local_err);
- break;
-
- default:
- error_setg(&local_err, "Specification of child block device %i "
- "is invalid", i);
- ret = -EINVAL;
- }
+ for (i = 0; i < s->num_children; i++) {
+ char indexstr[32];
+ ret = snprintf(indexstr, 32, "children.%d", i);
+ assert(ret < 32);
+ ret = bdrv_open_image(&s->bs[i], NULL, options, indexstr, bs,
+ &child_format, false, &local_err);
if (ret < 0) {
goto close_exit;
}
+
opened[i] = true;
}
@@ -993,8 +959,6 @@ exit:
if (local_err) {
error_propagate(errp, local_err);
}
- QDECREF(list);
- QDECREF(sub);
return ret;
}
@@ -1056,9 +1020,9 @@ static void quorum_refresh_filename(BlockDriverState *bs)
qdict_put_obj(opts, QUORUM_OPT_VOTE_THRESHOLD,
QOBJECT(qint_from_int(s->threshold)));
qdict_put_obj(opts, QUORUM_OPT_BLKVERIFY,
- QOBJECT(qbool_from_int(s->is_blkverify)));
+ QOBJECT(qbool_from_bool(s->is_blkverify)));
qdict_put_obj(opts, QUORUM_OPT_REWRITE,
- QOBJECT(qbool_from_int(s->rewrite_corrupted)));
+ QOBJECT(qbool_from_bool(s->rewrite_corrupted)));
qdict_put_obj(opts, "children", QOBJECT(children));
bs->full_open_options = opts;
@@ -1091,6 +1055,10 @@ static BlockDriver bdrv_quorum = {
static void bdrv_quorum_init(void)
{
+ if (!qcrypto_hash_supports(QCRYPTO_HASH_ALG_SHA256)) {
+ /* SHA256 hash support is required for quorum device */
+ return;
+ }
bdrv_register(&bdrv_quorum);
}
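
quorum_compute_hash() now hands the whole I/O vector to a generic hash helper and receives a fixed 32-byte SHA-256 digest, instead of driving gnutls by hand. The sketch below reproduces that shape with GLib's GChecksum, which QEMU already links against; it is only an analogue of qcrypto_hash_bytesv(), not the qcrypto API itself.

/* Hash an iovec-style scatter list into a 32-byte SHA-256 digest,
 * mirroring the shape of the new quorum_compute_hash().
 * Build with: gcc hash.c $(pkg-config --cflags --libs glib-2.0) */
#include <glib.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

#define HASH_LENGTH 32

static int hash_iov(const struct iovec *iov, int niov,
                    guint8 digest[HASH_LENGTH])
{
    GChecksum *sum = g_checksum_new(G_CHECKSUM_SHA256);
    gsize len = HASH_LENGTH;
    int i;

    if (!sum) {
        return -1;                        /* SHA-256 not supported */
    }
    for (i = 0; i < niov; i++) {
        g_checksum_update(sum, iov[i].iov_base, iov[i].iov_len);
    }
    g_checksum_get_digest(sum, digest, &len);
    g_checksum_free(sum);
    return len == HASH_LENGTH ? 0 : -1;
}

int main(void)
{
    char a[] = "quorum ", b[] = "vote";
    struct iovec iov[2] = {
        { .iov_base = a, .iov_len = strlen(a) },
        { .iov_base = b, .iov_len = strlen(b) },
    };
    guint8 digest[HASH_LENGTH];
    int i;

    if (hash_iov(iov, 2, digest) == 0) {
        for (i = 0; i < HASH_LENGTH; i++) {
            printf("%02x", digest[i]);
        }
        printf("\n");
    }
    return 0;
}
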
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 24d85826c..855febed5 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -22,6 +22,7 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
+#include "qemu/error-report.h"
#include "qemu/timer.h"
#include "qemu/log.h"
#include "block/block_int.h"
@@ -31,6 +32,7 @@
#include "qemu/iov.h"
#include "raw-aio.h"
#include "qapi/util.h"
+#include "qapi/qmp/qstring.h"
#if defined(__APPLE__) && (__MACH__)
#include <paths.h>
@@ -57,6 +59,7 @@
#include <linux/fd.h>
#include <linux/fs.h>
#include <linux/hdreg.h>
+#include <scsi/sg.h>
#ifdef __s390__
#include <asm/dasd.h>
#endif
@@ -94,15 +97,19 @@
#include <xfs/xfs.h>
#endif
-//#define DEBUG_FLOPPY
-
//#define DEBUG_BLOCK
-#if defined(DEBUG_BLOCK)
-#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \
- { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0)
+
+#ifdef DEBUG_BLOCK
+# define DEBUG_BLOCK_PRINT 1
#else
-#define DEBUG_BLOCK_PRINT(formatCstr, ...)
+# define DEBUG_BLOCK_PRINT 0
#endif
+#define DPRINTF(fmt, ...) \
+do { \
+ if (DEBUG_BLOCK_PRINT) { \
+ printf(fmt, ## __VA_ARGS__); \
+ } \
+} while (0)
/* OS X does not have O_DSYNC */
#ifndef O_DSYNC
@@ -301,10 +308,11 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
{
BDRVRawState *s = bs->opaque;
char *buf;
+ size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
- /* For /dev/sg devices the alignment is not really used.
+ /* For SCSI generic devices the alignment is not really used.
With buffered I/O, we don't have any restrictions. */
- if (bs->sg || !s->needs_alignment) {
+ if (bdrv_is_sg(bs) || !s->needs_alignment) {
bs->request_alignment = 1;
s->buf_align = 1;
return;
@@ -330,9 +338,9 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
/* If we could not get the sizes so far, we can only guess them */
if (!s->buf_align) {
size_t align;
- buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
- for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
- if (raw_is_io_aligned(fd, buf + align, MAX_BLOCKSIZE)) {
+ buf = qemu_memalign(max_align, 2 * max_align);
+ for (align = 512; align <= max_align; align <<= 1) {
+ if (raw_is_io_aligned(fd, buf + align, max_align)) {
s->buf_align = align;
break;
}
@@ -342,8 +350,8 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
if (!bs->request_alignment) {
size_t align;
- buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
- for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
+ buf = qemu_memalign(s->buf_align, max_align);
+ for (align = 512; align <= max_align; align <<= 1) {
if (raw_is_io_aligned(fd, buf, align)) {
bs->request_alignment = align;
break;
@@ -725,7 +733,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
BDRVRawState *s = bs->opaque;
raw_probe_alignment(bs, s->fd, errp);
- bs->bl.opt_mem_alignment = s->buf_align;
+ bs->bl.min_mem_alignment = s->buf_align;
+ bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
}
static int check_for_dasd(int fd)
@@ -1016,6 +1025,7 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
struct xfs_flock64 fl;
+ int err;
memset(&fl, 0, sizeof(fl));
fl.l_whence = SEEK_SET;
@@ -1023,8 +1033,9 @@ static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
fl.l_len = bytes;
if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
- DEBUG_BLOCK_PRINT("cannot write zero range (%s)\n", strerror(errno));
- return -errno;
+ err = errno;
+ DPRINTF("cannot write zero range (%s)\n", strerror(errno));
+ return -err;
}
return 0;
@@ -1033,6 +1044,7 @@ static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
struct xfs_flock64 fl;
+ int err;
memset(&fl, 0, sizeof(fl));
fl.l_whence = SEEK_SET;
@@ -1040,8 +1052,9 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
fl.l_len = bytes;
if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
- DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno));
- return -errno;
+ err = errno;
+ DPRINTF("cannot punch hole (%s)\n", strerror(errno));
+ return -err;
}
return 0;
@@ -1846,8 +1859,9 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
*pnum = nb_sectors;
ret = BDRV_BLOCK_DATA;
} else if (data == start) {
- /* On a data extent, compute sectors to the end of the extent. */
- *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
+ /* On a data extent, compute sectors to the end of the extent,
+ * possibly including a partial sector at EOF. */
+ *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
ret = BDRV_BLOCK_DATA;
} else {
/* On a hole, compute sectors to the beginning of the next extent. */
@@ -2073,15 +2087,38 @@ static void hdev_parse_filename(const char *filename, QDict *options,
qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
}
+static bool hdev_is_sg(BlockDriverState *bs)
+{
+
+#if defined(__linux__)
+
+ struct stat st;
+ struct sg_scsi_id scsiid;
+ int sg_version;
+
+ if (stat(bs->filename, &st) >= 0 && S_ISCHR(st.st_mode) &&
+ !bdrv_ioctl(bs, SG_GET_VERSION_NUM, &sg_version) &&
+ !bdrv_ioctl(bs, SG_GET_SCSI_ID, &scsiid)) {
+ DPRINTF("SG device found: type=%d, version=%d\n",
+ scsiid.scsi_type, sg_version);
+ return true;
+ }
+
+#endif
+
+ return false;
+}
+
static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
BDRVRawState *s = bs->opaque;
Error *local_err = NULL;
int ret;
- const char *filename = qdict_get_str(options, "filename");
#if defined(__APPLE__) && defined(__MACH__)
+ const char *filename = qdict_get_str(options, "filename");
+
if (strstart(filename, "/dev/cdrom", NULL)) {
kern_return_t kernResult;
io_iterator_t mediaIterator;
@@ -2110,16 +2147,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
#endif
s->type = FTYPE_FILE;
-#if defined(__linux__)
- {
- char resolved_path[ MAXPATHLEN ], *temp;
-
- temp = realpath(filename, resolved_path);
- if (temp && strstart(temp, "/dev/sg", NULL)) {
- bs->sg = 1;
- }
- }
-#endif
ret = raw_open_common(bs, options, flags, 0, &local_err);
if (ret < 0) {
@@ -2129,6 +2156,9 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
return ret;
}
+ /* Since this issues ioctls, the device must already be opened */
+ bs->sg = hdev_is_sg(bs);
+
if (flags & BDRV_O_RDWR) {
ret = check_hdev_writable(s);
if (ret < 0) {
@@ -2157,16 +2187,12 @@ static int fd_open(BlockDriverState *bs)
(qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
qemu_close(s->fd);
s->fd = -1;
-#ifdef DEBUG_FLOPPY
- printf("Floppy closed\n");
-#endif
+ DPRINTF("Floppy closed\n");
}
if (s->fd < 0) {
if (s->fd_got_error &&
(qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_error_time) < FD_OPEN_TIMEOUT) {
-#ifdef DEBUG_FLOPPY
- printf("No floppy (open delayed)\n");
-#endif
+ DPRINTF("No floppy (open delayed)\n");
return -EIO;
}
s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK);
@@ -2175,14 +2201,10 @@ static int fd_open(BlockDriverState *bs)
s->fd_got_error = 1;
if (last_media_present)
s->fd_media_changed = 1;
-#ifdef DEBUG_FLOPPY
- printf("No floppy\n");
-#endif
+ DPRINTF("No floppy\n");
return -EIO;
}
-#ifdef DEBUG_FLOPPY
- printf("Floppy opened\n");
-#endif
+ DPRINTF("Floppy opened\n");
}
if (!last_media_present)
s->fd_media_changed = 1;
@@ -2408,7 +2430,8 @@ static int floppy_probe_device(const char *filename)
struct stat st;
if (strstart(filename, "/dev/fd", NULL) &&
- !strstart(filename, "/dev/fdset/", NULL)) {
+ !strstart(filename, "/dev/fdset/", NULL) &&
+ !strstart(filename, "/dev/fd/", NULL)) {
prio = 50;
}
@@ -2450,9 +2473,7 @@ static int floppy_media_changed(BlockDriverState *bs)
fd_open(bs);
ret = s->fd_media_changed;
s->fd_media_changed = 0;
-#ifdef DEBUG_FLOPPY
- printf("Floppy changed=%d\n", ret);
-#endif
+ DPRINTF("Floppy changed=%d\n", ret);
return ret;
}
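
raw-posix.c drops the scattered #ifdef DEBUG_FLOPPY blocks in favour of one DPRINTF() macro whose body is always compiled but guarded by a compile-time constant, so the format strings keep being type-checked even in non-debug builds. A minimal standalone version of that pattern:

#include <stdio.h>

/* #define DEBUG_BLOCK 1   -- uncomment to get the debug output */

#ifdef DEBUG_BLOCK
# define DEBUG_BLOCK_PRINT 1
#else
# define DEBUG_BLOCK_PRINT 0
#endif

/* The if() is on a constant, so the compiler removes the call in
 * non-debug builds, but it still parses and type-checks the arguments. */
#define DPRINTF(fmt, ...) \
do { \
    if (DEBUG_BLOCK_PRINT) { \
        printf(fmt, ## __VA_ARGS__); \
    } \
} while (0)

int main(void)
{
    DPRINTF("opening device %s (attempt %d)\n", "/dev/null", 1);
    puts("done");
    return 0;
}
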
diff --git a/block/raw-win32.c b/block/raw-win32.c
index dae5d2fee..68f2338ac 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -29,6 +29,7 @@
#include "trace.h"
#include "block/thread-pool.h"
#include "qemu/iov.h"
+#include "qapi/qmp/qstring.h"
#include <windows.h>
#include <winioctl.h>
diff --git a/block/rbd.c b/block/rbd.c
index f3ab2ddd5..a60a19d58 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -74,25 +74,18 @@ typedef struct RBDAIOCB {
QEMUIOVector *qiov;
char *bounce;
RBDAIOCmd cmd;
- int64_t sector_num;
int error;
struct BDRVRBDState *s;
- int status;
} RBDAIOCB;
typedef struct RADOSCB {
- int rcbid;
RBDAIOCB *acb;
struct BDRVRBDState *s;
- int done;
int64_t size;
char *buf;
int64_t ret;
} RADOSCB;
-#define RBD_FD_READ 0
-#define RBD_FD_WRITE 1
-
typedef struct BDRVRBDState {
rados_t cluster;
rados_ioctx_t io_ctx;
@@ -235,7 +228,9 @@ static char *qemu_rbd_parse_clientname(const char *conf, char *clientname)
return NULL;
}
-static int qemu_rbd_set_conf(rados_t cluster, const char *conf, Error **errp)
+static int qemu_rbd_set_conf(rados_t cluster, const char *conf,
+ bool only_read_conf_file,
+ Error **errp)
{
char *p, *buf;
char name[RBD_MAX_CONF_NAME_SIZE];
@@ -267,14 +262,18 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf, Error **errp)
qemu_rbd_unescape(value);
if (strcmp(name, "conf") == 0) {
- ret = rados_conf_read_file(cluster, value);
- if (ret < 0) {
- error_setg(errp, "error reading conf file %s", value);
- break;
+ /* read the conf file alone, so it doesn't override more
+ specific settings for a particular device */
+ if (only_read_conf_file) {
+ ret = rados_conf_read_file(cluster, value);
+ if (ret < 0) {
+ error_setg(errp, "error reading conf file %s", value);
+ break;
+ }
}
} else if (strcmp(name, "id") == 0) {
/* ignore, this is parsed by qemu_rbd_parse_clientname() */
- } else {
+ } else if (!only_read_conf_file) {
ret = rados_conf_set(cluster, name, value);
if (ret < 0) {
error_setg(errp, "invalid conf option %s", name);
@@ -325,7 +324,7 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
error_setg(errp, "obj size too small");
return -EINVAL;
}
- obj_order = ffs(objsize) - 1;
+ obj_order = ctz32(objsize);
}
clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
@@ -337,10 +336,15 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
if (strstr(conf, "conf=") == NULL) {
/* try default location, but ignore failure */
rados_conf_read_file(cluster, NULL);
+ } else if (conf[0] != '\0' &&
+ qemu_rbd_set_conf(cluster, conf, true, &local_err) < 0) {
+ rados_shutdown(cluster);
+ error_propagate(errp, local_err);
+ return -EIO;
}
if (conf[0] != '\0' &&
- qemu_rbd_set_conf(cluster, conf, &local_err) < 0) {
+ qemu_rbd_set_conf(cluster, conf, false, &local_err) < 0) {
rados_shutdown(cluster);
error_propagate(errp, local_err);
return -EIO;
@@ -405,7 +409,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
}
qemu_vfree(acb->bounce);
acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
- acb->status = 0;
qemu_aio_unref(acb);
}
@@ -468,6 +471,23 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
s->snap = g_strdup(snap_buf);
}
+ if (strstr(conf, "conf=") == NULL) {
+ /* try default location, but ignore failure */
+ rados_conf_read_file(s->cluster, NULL);
+ } else if (conf[0] != '\0') {
+ r = qemu_rbd_set_conf(s->cluster, conf, true, errp);
+ if (r < 0) {
+ goto failed_shutdown;
+ }
+ }
+
+ if (conf[0] != '\0') {
+ r = qemu_rbd_set_conf(s->cluster, conf, false, errp);
+ if (r < 0) {
+ goto failed_shutdown;
+ }
+ }
+
/*
* Fallback to more conservative semantics if setting cache
* options fails. Ignore errors from setting rbd_cache because the
@@ -481,18 +501,6 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
rados_conf_set(s->cluster, "rbd_cache", "true");
}
- if (strstr(conf, "conf=") == NULL) {
- /* try default location, but ignore failure */
- rados_conf_read_file(s->cluster, NULL);
- }
-
- if (conf[0] != '\0') {
- r = qemu_rbd_set_conf(s->cluster, conf, errp);
- if (r < 0) {
- goto failed_shutdown;
- }
- }
-
r = rados_connect(s->cluster);
if (r < 0) {
error_setg(errp, "error connecting");
@@ -621,7 +629,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
acb->error = 0;
acb->s = s;
acb->bh = NULL;
- acb->status = -EINPROGRESS;
if (cmd == RBD_AIO_WRITE) {
qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
@@ -633,7 +640,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
size = nb_sectors * BDRV_SECTOR_SIZE;
rcb = g_new(RADOSCB, 1);
- rcb->done = 0;
rcb->acb = acb;
rcb->buf = buf;
rcb->s = acb->s;
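
The rbd changes split option handling into two passes of qemu_rbd_set_conf(): the first (only_read_conf_file=true) loads an external conf file, the second applies the individual key=value options, so per-device settings end up overriding whatever the file set. The toy program below shows the same "file defaults first, explicit options last" ordering with a plain key/value table; no librados calls are involved.

#include <stdio.h>
#include <string.h>

struct opt { const char *key; const char *val; };

static const char *lookup(struct opt *tbl, int n, const char *key)
{
    /* later entries win, which is why explicit options are applied last */
    const char *val = NULL;
    int i;
    for (i = 0; i < n; i++) {
        if (strcmp(tbl[i].key, key) == 0) {
            val = tbl[i].val;
        }
    }
    return val;
}

int main(void)
{
    struct opt applied[8];
    int n = 0;

    /* pass 1: defaults that came from the conf file */
    applied[n++] = (struct opt){ "rbd_cache", "false" };
    /* pass 2: per-device settings override them */
    applied[n++] = (struct opt){ "rbd_cache", "true" };

    printf("rbd_cache = %s\n", lookup(applied, n, "rbd_cache"));
    return 0;
}
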
diff --git a/block/sheepdog.c b/block/sheepdog.c
index c14172cfa..9585beb73 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -318,6 +318,10 @@ enum AIOCBState {
AIOCB_DISCARD_OBJ,
};
+#define AIOCBOverwrapping(x, y) \
+ (!(x->max_affect_data_idx < y->min_affect_data_idx \
+ || y->max_affect_data_idx < x->min_affect_data_idx))
+
struct SheepdogAIOCB {
BlockAIOCB common;
@@ -334,6 +338,11 @@ struct SheepdogAIOCB {
bool cancelable;
int nr_pending;
+
+ uint32_t min_affect_data_idx;
+ uint32_t max_affect_data_idx;
+
+ QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
};
typedef struct BDRVSheepdogState {
@@ -362,8 +371,10 @@ typedef struct BDRVSheepdogState {
/* Every aio request must be linked to either of these queues. */
QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
- QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
+
+ CoQueue overwrapping_queue;
+ QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
} BDRVSheepdogState;
static const char * sd_strerror(int err)
@@ -498,13 +509,7 @@ static void sd_aio_cancel(BlockAIOCB *blockacb)
AIOReq *aioreq, *next;
if (sd_acb_cancelable(acb)) {
- /* Remove outstanding requests from pending and failed queues. */
- QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
- next) {
- if (aioreq->aiocb == acb) {
- free_aio_req(s, aioreq);
- }
- }
+ /* Remove outstanding requests from failed queue. */
QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
next) {
if (aioreq->aiocb == acb) {
@@ -529,6 +534,10 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
int64_t sector_num, int nb_sectors)
{
SheepdogAIOCB *acb;
+ uint32_t object_size;
+ BDRVSheepdogState *s = bs->opaque;
+
+ object_size = (UINT32_C(1) << s->inode.block_size_shift);
acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
@@ -542,6 +551,11 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
acb->coroutine = qemu_coroutine_self();
acb->ret = 0;
acb->nr_pending = 0;
+
+ acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
+ acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
+ acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
+
return acb;
}
@@ -703,38 +717,6 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
static void co_write_request(void *opaque);
-static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
-{
- AIOReq *aio_req;
-
- QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
- if (aio_req->oid == oid) {
- return aio_req;
- }
- }
-
- return NULL;
-}
-
-/*
- * This function searchs pending requests to the object `oid', and
- * sends them.
- */
-static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
-{
- AIOReq *aio_req;
- SheepdogAIOCB *acb;
-
- while ((aio_req = find_pending_req(s, oid)) != NULL) {
- acb = aio_req->aiocb;
- /* move aio_req from pending list to inflight one */
- QLIST_REMOVE(aio_req, aio_siblings);
- QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
- acb->aiocb_type);
- }
-}
-
static coroutine_fn void reconnect_to_sdog(void *opaque)
{
BDRVSheepdogState *s = opaque;
@@ -840,12 +822,6 @@ static void coroutine_fn aio_read_response(void *opaque)
s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
}
- /*
- * Some requests may be blocked because simultaneous
- * create requests are not allowed, so we search the
- * pending requests here.
- */
- send_pending_req(s, aio_req->oid);
}
break;
case AIOCB_READ_UDATA:
@@ -1341,30 +1317,6 @@ out:
return ret;
}
-/* Return true if the specified request is linked to the pending list. */
-static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req)
-{
- AIOReq *areq;
- QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
- if (areq != aio_req && areq->oid == aio_req->oid) {
- /*
- * Sheepdog cannot handle simultaneous create requests to the same
- * object, so we cannot send the request until the previous request
- * finishes.
- */
- DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid);
- aio_req->flags = 0;
- aio_req->base_oid = 0;
- aio_req->create = false;
- QLIST_REMOVE(aio_req, aio_siblings);
- QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings);
- return true;
- }
- }
-
- return false;
-}
-
static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
{
SheepdogAIOCB *acb = aio_req->aiocb;
@@ -1379,10 +1331,6 @@ static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
goto out;
}
- if (check_simultaneous_create(s, aio_req)) {
- return;
- }
-
if (s->inode.data_vdi_id[idx]) {
aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
aio_req->flags |= SD_FLAG_CMD_COW;
@@ -1458,8 +1406,8 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
filename = qemu_opt_get(opts, "filename");
QLIST_INIT(&s->inflight_aio_head);
- QLIST_INIT(&s->pending_aio_head);
QLIST_INIT(&s->failed_aio_head);
+ QLIST_INIT(&s->inflight_aiocb_head);
s->fd = -1;
memset(vdi, 0, sizeof(vdi));
@@ -1524,6 +1472,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
pstrcpy(s->name, sizeof(s->name), vdi);
qemu_co_mutex_init(&s->lock);
+ qemu_co_queue_init(&s->overwrapping_queue);
qemu_opts_del(opts);
g_free(buf);
return 0;
@@ -1716,7 +1665,7 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
if ((object_size - 1) & object_size) { /* not a power of 2? */
return -EINVAL;
}
- obj_order = ffs(object_size) - 1;
+ obj_order = ctz32(object_size);
if (obj_order < 20 || obj_order > 31) {
return -EINVAL;
}
@@ -2195,12 +2144,6 @@ static int coroutine_fn sd_co_rw_vector(void *p)
old_oid, done);
QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- if (create) {
- if (check_simultaneous_create(s, aio_req)) {
- goto done;
- }
- }
-
add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
acb->aiocb_type);
done:
@@ -2215,6 +2158,20 @@ out:
return 1;
}
+static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
+{
+ SheepdogAIOCB *cb;
+
+ QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
+ if (AIOCBOverwrapping(aiocb, cb)) {
+ return true;
+ }
+ }
+
+ QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
+ return false;
+}
+
static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov)
{
@@ -2234,14 +2191,25 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
acb->aio_done_func = sd_write_done;
acb->aiocb_type = AIOCB_WRITE_UDATA;
+retry:
+ if (check_overwrapping_aiocb(s, acb)) {
+ qemu_co_queue_wait(&s->overwrapping_queue);
+ goto retry;
+ }
+
ret = sd_co_rw_vector(acb);
if (ret <= 0) {
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overwrapping_queue);
qemu_aio_unref(acb);
return ret;
}
qemu_coroutine_yield();
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overwrapping_queue);
+
return acb->ret;
}
@@ -2250,19 +2218,30 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
{
SheepdogAIOCB *acb;
int ret;
+ BDRVSheepdogState *s = bs->opaque;
acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
acb->aiocb_type = AIOCB_READ_UDATA;
acb->aio_done_func = sd_finish_aiocb;
+retry:
+ if (check_overwrapping_aiocb(s, acb)) {
+ qemu_co_queue_wait(&s->overwrapping_queue);
+ goto retry;
+ }
+
ret = sd_co_rw_vector(acb);
if (ret <= 0) {
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overwrapping_queue);
qemu_aio_unref(acb);
return ret;
}
qemu_coroutine_yield();
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overwrapping_queue);
return acb->ret;
}
@@ -2341,6 +2320,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
if (ret < 0) {
error_report("failed to create inode for snapshot: %s",
error_get_pretty(local_err));
+ error_free(local_err);
goto cleanup;
}
@@ -2609,14 +2589,25 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
acb->aiocb_type = AIOCB_DISCARD_OBJ;
acb->aio_done_func = sd_finish_aiocb;
+retry:
+ if (check_overwrapping_aiocb(s, acb)) {
+ qemu_co_queue_wait(&s->overwrapping_queue);
+ goto retry;
+ }
+
ret = sd_co_rw_vector(acb);
if (ret <= 0) {
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overwrapping_queue);
qemu_aio_unref(acb);
return ret;
}
qemu_coroutine_yield();
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overwrapping_queue);
+
return acb->ret;
}
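
Sheepdog now serializes I/O that touches the same data objects: each SheepdogAIOCB records the first and last data-object index it affects, AIOCBOverwrapping() tests the two closed index ranges for intersection, and an overlapping request waits on overwrapping_queue and retries. The standalone check below reproduces the index computation and the overlap test as written in the hunks above; the 4 MiB object size is just an example value.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define BDRV_SECTOR_SIZE 512ULL

struct aiocb_range {
    uint32_t min_idx;   /* first data object touched */
    uint32_t max_idx;   /* last data object touched  */
};

/* Same formulas as sd_aio_setup() in the hunk above. */
static struct aiocb_range affected_range(int64_t sector_num, int nb_sectors,
                                         uint32_t object_size)
{
    struct aiocb_range r;
    r.min_idx = sector_num * BDRV_SECTOR_SIZE / object_size;
    r.max_idx = (sector_num * BDRV_SECTOR_SIZE +
                 nb_sectors * BDRV_SECTOR_SIZE) / object_size;
    return r;
}

/* Same condition as the AIOCBOverwrapping() macro. */
static bool overlapping(struct aiocb_range a, struct aiocb_range b)
{
    return !(a.max_idx < b.min_idx || b.max_idx < a.min_idx);
}

int main(void)
{
    uint32_t object_size = 4u << 20;                    /* 4 MiB objects */
    struct aiocb_range w1 = affected_range(0, 16384, object_size);    /* 0-8 MiB  */
    struct aiocb_range w2 = affected_range(8192, 2048, object_size);  /* 4-5 MiB  */
    struct aiocb_range w3 = affected_range(40960, 2048, object_size); /* 20-21 MiB */

    printf("w1 objects [%u,%u], w2 objects [%u,%u], w3 objects [%u,%u]\n",
           w1.min_idx, w1.max_idx, w2.min_idx, w2.max_idx,
           w3.min_idx, w3.max_idx);
    printf("w1 vs w2 overlap: %d\n", overlapping(w1, w2));
    printf("w1 vs w3 overlap: %d\n", overlapping(w1, w3));
    return 0;
}
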
diff --git a/block/snapshot.c b/block/snapshot.c
index 698e1a1d5..49e143e99 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -24,6 +24,7 @@
#include "block/snapshot.h"
#include "block/block_int.h"
+#include "qapi/qmp/qerror.h"
QemuOptsList internal_snapshot_opts = {
.name = "snapshot",
@@ -229,7 +230,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
{
BlockDriver *drv = bs->drv;
if (!drv) {
- error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
+ error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
return -ENOMEDIUM;
}
if (!snapshot_id && !name) {
@@ -238,7 +239,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
}
/* drain all pending i/o before deleting snapshot */
- bdrv_drain_all();
+ bdrv_drain(bs);
if (drv->bdrv_snapshot_delete) {
return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp);
@@ -246,9 +247,9 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
if (bs->file) {
return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp);
}
- error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- drv->format_name, bdrv_get_device_name(bs),
- "internal snapshot deletion");
+ error_setg(errp, "Block format '%s' used by device '%s' "
+ "does not support internal snapshot deletion",
+ drv->format_name, bdrv_get_device_name(bs));
return -ENOTSUP;
}
@@ -315,7 +316,7 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs,
BlockDriver *drv = bs->drv;
if (!drv) {
- error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
+ error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
return -ENOMEDIUM;
}
if (!snapshot_id && !name) {
@@ -329,9 +330,9 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs,
if (drv->bdrv_snapshot_load_tmp) {
return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp);
}
- error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- drv->format_name, bdrv_get_device_name(bs),
- "temporarily load internal snapshot");
+ error_setg(errp, "Block format '%s' used by device '%s' "
+ "does not support temporarily loading internal snapshots",
+ drv->format_name, bdrv_get_device_name(bs));
return -ENOTSUP;
}
diff --git a/block/ssh.c b/block/ssh.c
index f466cbf39..8d0673903 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -30,9 +30,11 @@
#include <libssh2_sftp.h>
#include "block/block_int.h"
+#include "qemu/error-report.h"
#include "qemu/sockets.h"
#include "qemu/uri.h"
#include "qapi/qmp/qint.h"
+#include "qapi/qmp/qstring.h"
/* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in
* this block driver code.
@@ -561,7 +563,7 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
/* Open the socket and connect. */
s->sock = inet_connect(s->hostport, errp);
if (s->sock < 0) {
- ret = -errno;
+ ret = -EIO;
goto err;
}
diff --git a/block/stream.c b/block/stream.c
index a628901f6..ab0bd057f 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -14,6 +14,7 @@
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
+#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
enum {
@@ -227,7 +228,7 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
StreamBlockJob *s = container_of(job, StreamBlockJob, common);
if (speed < 0) {
- error_set(errp, QERR_INVALID_PARAMETER, "speed");
+ error_setg(errp, QERR_INVALID_PARAMETER, "speed");
return;
}
ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
@@ -250,7 +251,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
!bdrv_iostatus_is_enabled(bs)) {
- error_set(errp, QERR_INVALID_PARAMETER, "on-error");
+ error_setg(errp, QERR_INVALID_PARAMETER, "on-error");
return;
}
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
new file mode 100644
index 000000000..1abc6fcae
--- /dev/null
+++ b/block/throttle-groups.c
@@ -0,0 +1,501 @@
+/*
+ * QEMU block throttling group infrastructure
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ * BenoƮt Canet <benoit.canet@nodalink.com>
+ * Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "block/throttle-groups.h"
+#include "qemu/queue.h"
+#include "qemu/thread.h"
+#include "sysemu/qtest.h"
+
+/* The ThrottleGroup structure (with its ThrottleState) is shared
+ * among different BlockDriverStates and is independent of the
+ * AioContext, so in order to use it from different threads it needs
+ * its own locking.
+ *
+ * This locking is however handled internally in this file, so it's
+ * mostly transparent to outside users (but see the documentation in
+ * throttle_groups_lock()).
+ *
+ * The whole ThrottleGroup structure is private and invisible to
+ * outside users, who only use it through its ThrottleState.
+ *
+ * In addition to the ThrottleGroup structure, BlockDriverState has
+ * fields that need to be accessed by other members of the group and
+ * therefore also need to be protected by this lock. Once a BDS is
+ * registered in a group those fields can be accessed by other threads
+ * any time.
+ *
+ * Again, all this is handled internally and is mostly transparent to
+ * the outside. The 'throttle_timers' field however has an additional
+ * constraint because it may be temporarily invalid (see for example
+ * bdrv_set_aio_context()). Therefore in this file a thread will
+ * access some other BDS's timers only after verifying that that BDS
+ * has throttled requests in the queue.
+ */
+typedef struct ThrottleGroup {
+ char *name; /* This is constant during the lifetime of the group */
+
+ QemuMutex lock; /* This lock protects the following four fields */
+ ThrottleState ts;
+ QLIST_HEAD(, BlockDriverState) head;
+ BlockDriverState *tokens[2];
+ bool any_timer_armed[2];
+
+ /* These two are protected by the global throttle_groups_lock */
+ unsigned refcount;
+ QTAILQ_ENTRY(ThrottleGroup) list;
+} ThrottleGroup;
+
+static QemuMutex throttle_groups_lock;
+static QTAILQ_HEAD(, ThrottleGroup) throttle_groups =
+ QTAILQ_HEAD_INITIALIZER(throttle_groups);
+
+/* Increments the reference count of a ThrottleGroup given its name.
+ *
+ * If no ThrottleGroup is found with the given name a new one is
+ * created.
+ *
+ * @name: the name of the ThrottleGroup
+ * @ret: the ThrottleGroup
+ */
+static ThrottleGroup *throttle_group_incref(const char *name)
+{
+ ThrottleGroup *tg = NULL;
+ ThrottleGroup *iter;
+
+ qemu_mutex_lock(&throttle_groups_lock);
+
+ /* Look for an existing group with that name */
+ QTAILQ_FOREACH(iter, &throttle_groups, list) {
+ if (!strcmp(name, iter->name)) {
+ tg = iter;
+ break;
+ }
+ }
+
+ /* Create a new one if not found */
+ if (!tg) {
+ tg = g_new0(ThrottleGroup, 1);
+ tg->name = g_strdup(name);
+ qemu_mutex_init(&tg->lock);
+ throttle_init(&tg->ts);
+ QLIST_INIT(&tg->head);
+
+ QTAILQ_INSERT_TAIL(&throttle_groups, tg, list);
+ }
+
+ tg->refcount++;
+
+ qemu_mutex_unlock(&throttle_groups_lock);
+
+ return tg;
+}
+
+/* Decrease the reference count of a ThrottleGroup.
+ *
+ * When the reference count reaches zero the ThrottleGroup is
+ * destroyed.
+ *
+ * @tg: The ThrottleGroup to unref
+ */
+static void throttle_group_unref(ThrottleGroup *tg)
+{
+ qemu_mutex_lock(&throttle_groups_lock);
+ if (--tg->refcount == 0) {
+ QTAILQ_REMOVE(&throttle_groups, tg, list);
+ qemu_mutex_destroy(&tg->lock);
+ g_free(tg->name);
+ g_free(tg);
+ }
+ qemu_mutex_unlock(&throttle_groups_lock);
+}
+
+/* Get the name from a BlockDriverState's ThrottleGroup. The name (and
+ * the pointer) is guaranteed to remain constant during the lifetime
+ * of the group.
+ *
+ * @bs: a BlockDriverState that is member of a throttling group
+ * @ret: the name of the group.
+ */
+const char *throttle_group_get_name(BlockDriverState *bs)
+{
+ ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+ return tg->name;
+}
+
+/* Return the next BlockDriverState in the round-robin sequence,
+ * simulating a circular list.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs: the current BlockDriverState
+ * @ret: the next BlockDriverState in the sequence
+ */
+static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs)
+{
+ ThrottleState *ts = bs->throttle_state;
+ ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+ BlockDriverState *next = QLIST_NEXT(bs, round_robin);
+
+ if (!next) {
+ return QLIST_FIRST(&tg->head);
+ }
+
+ return next;
+}
+
+/* Return the next BlockDriverState in the round-robin sequence with
+ * pending I/O requests.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs: the current BlockDriverState
+ * @is_write: the type of operation (read/write)
+ * @ret: the next BlockDriverState with pending requests, or bs
+ * if there is none.
+ */
+static BlockDriverState *next_throttle_token(BlockDriverState *bs,
+ bool is_write)
+{
+ ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+ BlockDriverState *token, *start;
+
+ start = token = tg->tokens[is_write];
+
+ /* get next bs round in round robin style */
+ token = throttle_group_next_bs(token);
+ while (token != start && !token->pending_reqs[is_write]) {
+ token = throttle_group_next_bs(token);
+ }
+
+ /* If no I/O is queued for scheduling on the next round-robin token,
+ * then make the current bs the token, because chances are the
+ * current bs will get the current request queued.
+ */
+ if (token == start && !token->pending_reqs[is_write]) {
+ token = bs;
+ }
+
+ return token;
+}
+
+/* Check if the next I/O request for a BlockDriverState needs to be
+ * throttled or not. If there's no timer set in this group, set one
+ * and update the token accordingly.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs: the current BlockDriverState
+ * @is_write: the type of operation (read/write)
+ * @ret: whether the I/O request needs to be throttled or not
+ */
+static bool throttle_group_schedule_timer(BlockDriverState *bs,
+ bool is_write)
+{
+ ThrottleState *ts = bs->throttle_state;
+ ThrottleTimers *tt = &bs->throttle_timers;
+ ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+ bool must_wait;
+
+ /* Check if any of the timers in this group is already armed */
+ if (tg->any_timer_armed[is_write]) {
+ return true;
+ }
+
+ must_wait = throttle_schedule_timer(ts, tt, is_write);
+
+ /* If a timer just got armed, set bs as the current token */
+ if (must_wait) {
+ tg->tokens[is_write] = bs;
+ tg->any_timer_armed[is_write] = true;
+ }
+
+ return must_wait;
+}
+
+/* Look for the next pending I/O request and schedule it.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs: the current BlockDriverState
+ * @is_write: the type of operation (read/write)
+ */
+static void schedule_next_request(BlockDriverState *bs, bool is_write)
+{
+ ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+ bool must_wait;
+ BlockDriverState *token;
+
+ /* Check if there's any pending request to schedule next */
+ token = next_throttle_token(bs, is_write);
+ if (!token->pending_reqs[is_write]) {
+ return;
+ }
+
+ /* Set a timer for the request if it needs to be throttled */
+ must_wait = throttle_group_schedule_timer(token, is_write);
+
+ /* If it doesn't have to wait, queue it for immediate execution */
+ if (!must_wait) {
+ /* Give preference to requests from the current bs */
+ if (qemu_in_coroutine() &&
+ qemu_co_queue_next(&bs->throttled_reqs[is_write])) {
+ token = bs;
+ } else {
+ ThrottleTimers *tt = &token->throttle_timers;
+ int64_t now = qemu_clock_get_ns(tt->clock_type);
+ timer_mod(tt->timers[is_write], now + 1);
+ tg->any_timer_armed[is_write] = true;
+ }
+ tg->tokens[is_write] = token;
+ }
+}
+
+/* Check if an I/O request needs to be throttled, wait and set a timer
+ * if necessary, and schedule the next request using a round robin
+ * algorithm.
+ *
+ * @bs: the current BlockDriverState
+ * @bytes: the number of bytes for this I/O
+ * @is_write: the type of operation (read/write)
+ */
+void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs,
+ unsigned int bytes,
+ bool is_write)
+{
+ bool must_wait;
+ BlockDriverState *token;
+
+ ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+ qemu_mutex_lock(&tg->lock);
+
+ /* First we check if this I/O has to be throttled. */
+ token = next_throttle_token(bs, is_write);
+ must_wait = throttle_group_schedule_timer(token, is_write);
+
+ /* Wait if there's a timer set or queued requests of this type */
+ if (must_wait || bs->pending_reqs[is_write]) {
+ bs->pending_reqs[is_write]++;
+ qemu_mutex_unlock(&tg->lock);
+ qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+ qemu_mutex_lock(&tg->lock);
+ bs->pending_reqs[is_write]--;
+ }
+
+ /* The I/O will be executed, so do the accounting */
+ throttle_account(bs->throttle_state, is_write, bytes);
+
+ /* Schedule the next request */
+ schedule_next_request(bs, is_write);
+
+ qemu_mutex_unlock(&tg->lock);
+}
+
+/* Update the throttle configuration for a particular group. Similar
+ * to throttle_config(), but guarantees atomicity within the
+ * throttling group.
+ *
+ * @bs: a BlockDriverState that is member of the group
+ * @cfg: the configuration to set
+ */
+void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg)
+{
+ ThrottleTimers *tt = &bs->throttle_timers;
+ ThrottleState *ts = bs->throttle_state;
+ ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+ qemu_mutex_lock(&tg->lock);
+ /* throttle_config() cancels the timers */
+ if (timer_pending(tt->timers[0])) {
+ tg->any_timer_armed[0] = false;
+ }
+ if (timer_pending(tt->timers[1])) {
+ tg->any_timer_armed[1] = false;
+ }
+ throttle_config(ts, tt, cfg);
+ qemu_mutex_unlock(&tg->lock);
+}
+
+/* Get the throttle configuration from a particular group. Similar to
+ * throttle_get_config(), but guarantees atomicity within the
+ * throttling group.
+ *
+ * @bs: a BlockDriverState that is member of the group
+ * @cfg: the configuration will be written here
+ */
+void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
+{
+ ThrottleState *ts = bs->throttle_state;
+ ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+ qemu_mutex_lock(&tg->lock);
+ throttle_get_config(ts, cfg);
+ qemu_mutex_unlock(&tg->lock);
+}
+
+/* ThrottleTimers callback. This wakes up a request that was waiting
+ * because it had been throttled.
+ *
+ * @bs: the BlockDriverState whose request had been throttled
+ * @is_write: the type of operation (read/write)
+ */
+static void timer_cb(BlockDriverState *bs, bool is_write)
+{
+ ThrottleState *ts = bs->throttle_state;
+ ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+ bool empty_queue;
+
+ /* The timer has just been fired, so we can update the flag */
+ qemu_mutex_lock(&tg->lock);
+ tg->any_timer_armed[is_write] = false;
+ qemu_mutex_unlock(&tg->lock);
+
+ /* Run the request that was waiting for this timer */
+ empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]);
+
+ /* If the request queue was empty then we have to take care of
+ * scheduling the next one */
+ if (empty_queue) {
+ qemu_mutex_lock(&tg->lock);
+ schedule_next_request(bs, is_write);
+ qemu_mutex_unlock(&tg->lock);
+ }
+}
+
+static void read_timer_cb(void *opaque)
+{
+ timer_cb(opaque, false);
+}
+
+static void write_timer_cb(void *opaque)
+{
+ timer_cb(opaque, true);
+}
+
+/* Register a BlockDriverState in the throttling group, also
+ * initializing its timers and updating its throttle_state pointer to
+ * point to it. If a throttling group with that name does not exist
+ * yet, it will be created.
+ *
+ * @bs: the BlockDriverState to insert
+ * @groupname: the name of the group
+ */
+void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
+{
+ int i;
+ ThrottleGroup *tg = throttle_group_incref(groupname);
+ int clock_type = QEMU_CLOCK_REALTIME;
+
+ if (qtest_enabled()) {
+ /* For testing block IO throttling only */
+ clock_type = QEMU_CLOCK_VIRTUAL;
+ }
+
+ bs->throttle_state = &tg->ts;
+
+ qemu_mutex_lock(&tg->lock);
+ /* If the ThrottleGroup is new set this BlockDriverState as the token */
+ for (i = 0; i < 2; i++) {
+ if (!tg->tokens[i]) {
+ tg->tokens[i] = bs;
+ }
+ }
+
+ QLIST_INSERT_HEAD(&tg->head, bs, round_robin);
+
+ throttle_timers_init(&bs->throttle_timers,
+ bdrv_get_aio_context(bs),
+ clock_type,
+ read_timer_cb,
+ write_timer_cb,
+ bs);
+
+ qemu_mutex_unlock(&tg->lock);
+}
+
+/* Unregister a BlockDriverState from its group, removing it from the
+ * list, destroying the timers and setting the throttle_state pointer
+ * to NULL.
+ *
+ * The group will be destroyed if it's empty after this operation.
+ *
+ * @bs: the BlockDriverState to remove
+ */
+void throttle_group_unregister_bs(BlockDriverState *bs)
+{
+ ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+ int i;
+
+ qemu_mutex_lock(&tg->lock);
+ for (i = 0; i < 2; i++) {
+ if (tg->tokens[i] == bs) {
+ BlockDriverState *token = throttle_group_next_bs(bs);
+ /* Take care of the case where this is the last bs in the group */
+ if (token == bs) {
+ token = NULL;
+ }
+ tg->tokens[i] = token;
+ }
+ }
+
+ /* remove the current bs from the list */
+ QLIST_REMOVE(bs, round_robin);
+ throttle_timers_destroy(&bs->throttle_timers);
+ qemu_mutex_unlock(&tg->lock);
+
+ throttle_group_unref(tg);
+ bs->throttle_state = NULL;
+}
+
+/* Acquire the lock of this throttling group.
+ *
+ * You won't normally need to use this. None of the functions from the
+ * ThrottleGroup API require you to acquire the lock since all of them
+ * deal with it internally.
+ *
+ * This should only be used in exceptional cases when you want to
+ * access the protected fields of a BlockDriverState directly
+ * (e.g. bdrv_swap()).
+ *
+ * @bs: a BlockDriverState that is member of the group
+ */
+void throttle_group_lock(BlockDriverState *bs)
+{
+ ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+ qemu_mutex_lock(&tg->lock);
+}
+
+/* Release the lock of this throttling group.
+ *
+ * See the comments in throttle_group_lock().
+ */
+void throttle_group_unlock(BlockDriverState *bs)
+{
+ ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+ qemu_mutex_unlock(&tg->lock);
+}
+
+static void throttle_groups_init(void)
+{
+ qemu_mutex_init(&throttle_groups_lock);
+}
+
+block_init(throttle_groups_init);
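
next_throttle_token() walks the group's member list round-robin starting after the current token and picks the first member with queued requests of the given type, falling back to the requesting BDS when nobody is waiting. The self-contained sketch below models that walk with a plain array standing in for the QLIST of BlockDriverStates; coroutine queueing and timer arming are left out.

#include <stdio.h>
#include <stdbool.h>

#define MEMBERS 4

/* pending[i] stands in for bs->pending_reqs[is_write] of group member i */
static int pending[MEMBERS];

static int next_member(int i)                 /* circular list walk */
{
    return (i + 1) % MEMBERS;
}

/* Same selection rule as next_throttle_token(): start after the current
 * token, stop at the first member with pending requests, otherwise fall
 * back to the member that is issuing the request right now. */
static int next_token(int current_token, int requesting_member)
{
    int start = current_token;
    int token = next_member(start);

    while (token != start && !pending[token]) {
        token = next_member(token);
    }
    if (token == start && !pending[token]) {
        token = requesting_member;
    }
    return token;
}

int main(void)
{
    pending[2] = 1;                          /* only member 2 has queued I/O */
    printf("token after 0: %d\n", next_token(0, 0));   /* -> 2 */

    pending[2] = 0;                          /* nobody waiting */
    printf("token after 0: %d\n", next_token(0, 3));   /* falls back to 3 */
    return 0;
}
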
diff --git a/block/vdi.c b/block/vdi.c
index 53bd02fe2..7642ef359 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -502,9 +502,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
}
/* Disable migration when vdi images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vdi", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vdi format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
qemu_co_mutex_init(&s->write_lock);
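The vdi.c hunk above is the first instance of a pattern repeated below for vhdx, vmdk, vpc and vvfat: the typed QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED error class is replaced by a plain error_setg() message built from bdrv_get_device_or_node_name(). A condensed sketch of the pattern (illustrative, assuming a driver that keeps an Error *migration_blocker in its state struct, as these drivers do):

    #include "block/block.h"              /* bdrv_get_device_or_node_name() */
    #include "migration/migration.h"      /* migrate_add_blocker() */
    #include "qapi/error.h"               /* error_setg() */

    /* Hypothetical helper: block live migration while 'bs' is in use,
     * using a human-readable reason instead of a QERR_* class. */
    static void example_block_live_migration(BlockDriverState *bs,
                                             const char *format_name,
                                             Error **blocker)
    {
        error_setg(blocker, "The %s format used by node '%s' "
                   "does not support live migration",
                   format_name, bdrv_get_device_or_node_name(bs));
        migrate_add_blocker(*blocker);
    }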
diff --git a/block/vhdx-log.c b/block/vhdx-log.c
index 6547bec40..47fec63c6 100644
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -19,6 +19,7 @@
*/
#include "qemu-common.h"
#include "block/block_int.h"
+#include "qemu/error-report.h"
#include "qemu/module.h"
#include "block/vhdx.h"
diff --git a/block/vhdx.c b/block/vhdx.c
index bb3ed45d5..0776de717 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1002,9 +1002,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
/* TODO: differencing files */
/* Disable migration when VHDX images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vhdx", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vhdx format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
return 0;
@@ -1269,7 +1269,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
iov1.iov_base = qemu_blockalign(bs, iov1.iov_len);
memset(iov1.iov_base, 0, iov1.iov_len);
qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0,
- sinfo.block_offset);
+ iov1.iov_len);
sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS;
}
@@ -1285,7 +1285,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
iov2.iov_base = qemu_blockalign(bs, iov2.iov_len);
memset(iov2.iov_base, 0, iov2.iov_len);
qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0,
- sinfo.block_offset);
+ iov2.iov_len);
sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS;
}
}
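The last two vhdx.c hunks correct the byte count passed to qemu_iovec_concat_iov(): when a zero-filled iovec is prepended or appended to pad an unaligned write, the amount concatenated must be the pad buffer's own length, not sinfo.block_offset. A minimal sketch of the corrected head-padding step (illustrative; the destination QEMUIOVector is assumed to be already initialized, and freeing the pad buffer with qemu_vfree() is omitted for brevity):

    #include "block/block.h"    /* qemu_blockalign() */
    #include "qemu/iov.h"       /* QEMUIOVector, qemu_iovec_concat_iov() */
    #include <string.h>

    /* Hypothetical helper: prepend pad_len zero bytes to 'dst'. */
    static void example_prepend_zero_pad(BlockDriverState *bs,
                                         QEMUIOVector *dst, size_t pad_len)
    {
        struct iovec pad;

        pad.iov_len  = pad_len;
        pad.iov_base = qemu_blockalign(bs, pad.iov_len);
        memset(pad.iov_base, 0, pad.iov_len);

        /* The final argument is how many bytes to take from 'pad', so it
         * must be pad.iov_len -- this is exactly what the hunks above fix. */
        qemu_iovec_concat_iov(dst, &pad, 1, 0, pad.iov_len);
    }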
diff --git a/block/vmdk.c b/block/vmdk.c
index 4c71cde3b..fbaab67c8 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -25,6 +25,8 @@
#include "qemu-common.h"
#include "block/block_int.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/error-report.h"
#include "qemu/module.h"
#include "migration/migration.h"
#include <zlib.h>
@@ -321,37 +323,13 @@ static int vmdk_is_cid_valid(BlockDriverState *bs)
return 1;
}
-/* Queue extents, if any, for reopen() */
+/* We have nothing to do for VMDK reopen; the stub just returns success */
static int vmdk_reopen_prepare(BDRVReopenState *state,
BlockReopenQueue *queue, Error **errp)
{
- BDRVVmdkState *s;
- int ret = -1;
- int i;
- VmdkExtent *e;
-
assert(state != NULL);
assert(state->bs != NULL);
-
- if (queue == NULL) {
- error_setg(errp, "No reopen queue for VMDK extents");
- goto exit;
- }
-
- s = state->bs->opaque;
-
- assert(s != NULL);
-
- for (i = 0; i < s->num_extents; i++) {
- e = &s->extents[i];
- if (e->file != state->bs->file) {
- bdrv_reopen_queue(queue, e->file, state->flags);
- }
- }
- ret = 0;
-
-exit:
- return ret;
+ return 0;
}
static int vmdk_parent_open(BlockDriverState *bs)
@@ -524,7 +502,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
}
ret = vmdk_add_extent(bs, file, false,
le32_to_cpu(header.disk_sectors),
- le32_to_cpu(header.l1dir_offset) << 9,
+ (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
0,
le32_to_cpu(header.l1dir_size),
4096,
@@ -543,7 +521,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
}
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
- Error **errp);
+ QDict *options, Error **errp);
static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
Error **errp)
@@ -582,7 +560,7 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
static int vmdk_open_vmdk4(BlockDriverState *bs,
BlockDriverState *file,
- int flags, Error **errp)
+ int flags, QDict *options, Error **errp)
{
int ret;
uint32_t magic;
@@ -606,7 +584,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
if (!buf) {
return -EINVAL;
}
- ret = vmdk_open_desc_file(bs, flags, buf, errp);
+ ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
g_free(buf);
return ret;
}
@@ -669,8 +647,8 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
char buf[64];
snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
le32_to_cpu(header.version));
- error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "vmdk", buf);
+ error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+ bdrv_get_device_or_node_name(bs), "vmdk", buf);
return -ENOTSUP;
} else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
/* VMware KB 2064959 explains that version 3 added support for
@@ -763,7 +741,7 @@ static int vmdk_parse_description(const char *desc, const char *opt_name,
/* Open an extent file and append to bs array */
static int vmdk_open_sparse(BlockDriverState *bs,
BlockDriverState *file, int flags,
- char *buf, Error **errp)
+ char *buf, QDict *options, Error **errp)
{
uint32_t magic;
@@ -773,7 +751,7 @@ static int vmdk_open_sparse(BlockDriverState *bs,
return vmdk_open_vmfs_sparse(bs, file, flags, errp);
break;
case VMDK4_MAGIC:
- return vmdk_open_vmdk4(bs, file, flags, errp);
+ return vmdk_open_vmdk4(bs, file, flags, options, errp);
break;
default:
error_setg(errp, "Image not in VMDK format");
@@ -783,7 +761,8 @@ static int vmdk_open_sparse(BlockDriverState *bs,
}
static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
- const char *desc_file_path, Error **errp)
+ const char *desc_file_path, QDict *options,
+ Error **errp)
{
int ret;
int matches;
@@ -797,6 +776,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
BlockDriverState *extent_file;
BDRVVmdkState *s = bs->opaque;
VmdkExtent *extent;
+ char extent_opt_prefix[32];
while (*p) {
/* parse extent line in one of below formats:
@@ -846,8 +826,12 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
extent_path = g_malloc0(PATH_MAX);
path_combine(extent_path, PATH_MAX, desc_file_path, fname);
extent_file = NULL;
- ret = bdrv_open(&extent_file, extent_path, NULL, NULL,
- bs->open_flags | BDRV_O_PROTOCOL, NULL, errp);
+
+ ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents);
+ assert(ret < 32);
+
+ ret = bdrv_open_image(&extent_file, extent_path, options,
+ extent_opt_prefix, bs, &child_file, false, errp);
g_free(extent_path);
if (ret) {
return ret;
@@ -870,7 +854,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
if (!buf) {
ret = -EINVAL;
} else {
- ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp);
+ ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
+ options, errp);
}
g_free(buf);
if (ret) {
@@ -898,7 +883,7 @@ next_line:
}
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
- Error **errp)
+ QDict *options, Error **errp)
{
int ret;
char ct[128];
@@ -920,7 +905,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
}
s->create_type = g_strdup(ct);
s->desc_offset = 0;
- ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, errp);
+ ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, options, errp);
exit:
return ret;
}
@@ -942,11 +927,11 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
switch (magic) {
case VMDK3_MAGIC:
case VMDK4_MAGIC:
- ret = vmdk_open_sparse(bs, bs->file, flags, buf, errp);
+ ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, errp);
s->desc_offset = 0x200;
break;
default:
- ret = vmdk_open_desc_file(bs, flags, buf, errp);
+ ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
break;
}
if (ret) {
@@ -963,9 +948,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
qemu_co_mutex_init(&s->lock);
/* Disable migration when VMDK images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vmdk", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
g_free(buf);
return 0;
@@ -1705,12 +1690,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
/* write all the data */
ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic));
if (ret < 0) {
- error_set(errp, QERR_IO_ERROR);
+ error_setg(errp, QERR_IO_ERROR);
goto exit;
}
ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header));
if (ret < 0) {
- error_set(errp, QERR_IO_ERROR);
+ error_setg(errp, QERR_IO_ERROR);
goto exit;
}
@@ -1730,7 +1715,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
gd_buf, gd_buf_size);
if (ret < 0) {
- error_set(errp, QERR_IO_ERROR);
+ error_setg(errp, QERR_IO_ERROR);
goto exit;
}
@@ -1742,7 +1727,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
gd_buf, gd_buf_size);
if (ret < 0) {
- error_set(errp, QERR_IO_ERROR);
+ error_setg(errp, QERR_IO_ERROR);
goto exit;
}
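The vmdk.c changes above thread the open-time QDict down to the extent files, so each extent is now opened through bdrv_open_image() under an "extents.<n>" option prefix instead of a bare bdrv_open(). Assuming the generic dotted sub-option syntax applies to these new keys (the patch itself does not spell this out), a caller could in principle steer an individual extent like this (hypothetical key names):

    #include "qapi/qmp/qdict.h"
    #include "qapi/qmp/qstring.h"

    /* Hypothetical: request explicit options for the first VMDK extent. */
    static QDict *example_vmdk_open_options(void)
    {
        QDict *options = qdict_new();

        /* Consumed by bdrv_open_image() via the "extents.0" prefix. */
        qdict_put(options, "extents.0.driver", qstring_from_str("file"));
        return options;
    }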
diff --git a/block/vpc.c b/block/vpc.c
index 8ab30d600..3e385d9fb 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -328,9 +328,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
qemu_co_mutex_init(&s->lock);
/* Disable migration when VHD images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vpc", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
return 0;
diff --git a/block/vvfat.c b/block/vvfat.c
index 9be632f40..206869712 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -30,6 +30,7 @@
#include "migration/migration.h"
#include "qapi/qmp/qint.h"
#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qstring.h"
#ifndef S_IWGRP
#define S_IWGRP 0
@@ -322,6 +323,7 @@ typedef struct BDRVVVFATState {
int fat_type; /* 16 or 32 */
array_t fat,directory,mapping;
+ char volume_label[11];
unsigned int cluster_size;
unsigned int sectors_per_cluster;
@@ -859,7 +861,7 @@ static int init_directories(BDRVVVFATState* s,
{
direntry_t* entry=array_get_next(&(s->directory));
entry->attributes=0x28; /* archive | volume label */
- memcpy(entry->name, "QEMU VVFAT ", sizeof(entry->name));
+ memcpy(entry->name, s->volume_label, sizeof(entry->name));
}
/* Now build FAT, and write back information into directory */
@@ -968,7 +970,8 @@ static int init_directories(BDRVVVFATState* s,
bootsector->u.fat16.signature=0x29;
bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd);
- memcpy(bootsector->u.fat16.volume_label,"QEMU VVFAT ",11);
+ memcpy(bootsector->u.fat16.volume_label, s->volume_label,
+ sizeof(bootsector->u.fat16.volume_label));
memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12 ":s->fat_type==16?"FAT16 ":"FAT32 "),8);
bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa;
@@ -1008,6 +1011,11 @@ static QemuOptsList runtime_opts = {
.help = "Create a floppy rather than a hard disk image",
},
{
+ .name = "label",
+ .type = QEMU_OPT_STRING,
+ .help = "Use a volume label other than QEMU VVFAT",
+ },
+ {
.name = "rw",
.type = QEMU_OPT_BOOL,
.help = "Make the image writable",
@@ -1059,8 +1067,8 @@ static void vvfat_parse_filename(const char *filename, QDict *options,
/* Fill in the options QDict */
qdict_put(options, "dir", qstring_from_str(filename));
qdict_put(options, "fat-type", qint_from_int(fat_type));
- qdict_put(options, "floppy", qbool_from_int(floppy));
- qdict_put(options, "rw", qbool_from_int(rw));
+ qdict_put(options, "floppy", qbool_from_bool(floppy));
+ qdict_put(options, "rw", qbool_from_bool(rw));
}
static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
@@ -1069,7 +1077,7 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
BDRVVVFATState *s = bs->opaque;
int cyls, heads, secs;
bool floppy;
- const char *dirname;
+ const char *dirname, *label;
QemuOpts *opts;
Error *local_err = NULL;
int ret;
@@ -1096,6 +1104,18 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
s->fat_type = qemu_opt_get_number(opts, "fat-type", 0);
floppy = qemu_opt_get_bool(opts, "floppy", false);
+ memset(s->volume_label, ' ', sizeof(s->volume_label));
+ label = qemu_opt_get(opts, "label");
+ if (label) {
+ size_t label_length = strlen(label);
+ if (label_length > 11) {
+ error_setg(errp, "vvfat label cannot be longer than 11 bytes");
+ ret = -EINVAL;
+ goto fail;
+ }
+ memcpy(s->volume_label, label, label_length);
+ }
+
if (floppy) {
/* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */
if (!s->fat_type) {
@@ -1180,9 +1200,10 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
/* Disable migration when vvfat is used rw */
if (s->qcow) {
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vvfat (rw)", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker,
+ "The vvfat (rw) format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
}
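The new vvfat "label" option accepts at most 11 bytes (the FAT volume-label width) and is space-padded, as the vvfat_open() hunk shows. One way to pass it programmatically, mirroring how vvfat_parse_filename() already fills the options QDict (the command-line spelling is not shown in this patch, so the sketch sticks to the QDict interface; names and paths are illustrative):

    #include "qapi/qmp/qdict.h"
    #include "qapi/qmp/qstring.h"

    /* Hypothetical: build vvfat open options with a custom volume label. */
    static QDict *example_vvfat_options(void)
    {
        QDict *options = qdict_new();

        qdict_put(options, "dir", qstring_from_str("/tmp/exported-dir"));
        qdict_put(options, "label", qstring_from_str("MYDATA")); /* <= 11 bytes */
        return options;
    }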