summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorAnas Nashif <anas.nashif@intel.com>2013-01-15 13:31:42 -0800
committerAnas Nashif <anas.nashif@intel.com>2013-01-15 13:31:42 -0800
commit42bf3037d458a330856a0be584200c1e41c3f417 (patch)
tree25b9be1088727757e52271e25a446e8a852357df /block
parent060629c6ef0b7e5c267d84c91600113264d33120 (diff)
downloadqemu-42bf3037d458a330856a0be584200c1e41c3f417.tar.gz
qemu-42bf3037d458a330856a0be584200c1e41c3f417.tar.bz2
qemu-42bf3037d458a330856a0be584200c1e41c3f417.zip
Imported Upstream version 1.3.0upstream/1.3.0
Diffstat (limited to 'block')
-rw-r--r--block/Makefile.objs15
-rw-r--r--block/blkdebug.c16
-rw-r--r--block/blkverify.c4
-rw-r--r--block/commit.c259
-rw-r--r--block/curl.c7
-rw-r--r--block/gluster.c624
-rw-r--r--block/iscsi.c299
-rw-r--r--block/linux-aio.c216
-rw-r--r--block/mirror.c322
-rw-r--r--block/nbd.c115
-rw-r--r--block/qcow.c10
-rw-r--r--block/qcow2-refcount.c3
-rw-r--r--block/qcow2.c11
-rw-r--r--block/qed-table.c1
-rw-r--r--block/qed.c13
-rw-r--r--block/raw-aio.h (renamed from block/raw-posix-aio.h)29
-rw-r--r--block/raw-posix.c545
-rw-r--r--block/raw-win32.c259
-rw-r--r--block/raw.c10
-rw-r--r--block/rbd.c14
-rw-r--r--block/sheepdog.c123
-rw-r--r--block/stream.c33
-rw-r--r--block/vdi.c41
-rw-r--r--block/vmdk.c48
-rw-r--r--block/vpc.c7
-rw-r--r--block/win32-aio.c226
26 files changed, 2783 insertions, 467 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index b5754d39b..7f015105b 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -2,10 +2,19 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-obj-y += qed-check.o
-block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
-block-obj-y += stream.o
-block-obj-$(CONFIG_WIN32) += raw-win32.o
+block-obj-y += parallels.o blkdebug.o blkverify.o
+block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
block-obj-$(CONFIG_POSIX) += raw-posix.o
+block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
+
+ifeq ($(CONFIG_POSIX),y)
+block-obj-y += nbd.o sheepdog.o
block-obj-$(CONFIG_LIBISCSI) += iscsi.o
block-obj-$(CONFIG_CURL) += curl.o
block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+endif
+
+common-obj-y += stream.o
+common-obj-y += commit.o
+common-obj-y += mirror.o
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 59dcea065..d61ece86a 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -28,6 +28,7 @@
typedef struct BDRVBlkdebugState {
int state;
+ int new_state;
QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
} BDRVBlkdebugState;
@@ -40,7 +41,7 @@ typedef struct BlkdebugAIOCB {
static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb);
-static AIOPool blkdebug_aio_pool = {
+static const AIOCBInfo blkdebug_aiocb_info = {
.aiocb_size = sizeof(BlkdebugAIOCB),
.cancel = blkdebug_aio_cancel,
};
@@ -334,7 +335,7 @@ static BlockDriverAIOCB *inject_error(BlockDriverState *bs,
return NULL;
}
- acb = qemu_aio_get(&blkdebug_aio_pool, bs, cb, opaque);
+ acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque);
acb->ret = -error;
bh = qemu_bh_new(error_callback_bh, acb);
@@ -403,12 +404,12 @@ static void blkdebug_close(BlockDriverState *bs)
}
static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
- int old_state, bool injected)
+ bool injected)
{
BDRVBlkdebugState *s = bs->opaque;
/* Only process rules for the current state */
- if (rule->state && rule->state != old_state) {
+ if (rule->state && rule->state != s->state) {
return injected;
}
@@ -423,7 +424,7 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
break;
case ACTION_SET_STATE:
- s->state = rule->options.set_state.new_state;
+ s->new_state = rule->options.set_state.new_state;
break;
}
return injected;
@@ -433,15 +434,16 @@ static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
BDRVBlkdebugState *s = bs->opaque;
struct BlkdebugRule *rule;
- int old_state = s->state;
bool injected;
assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);
injected = false;
+ s->new_state = s->state;
QLIST_FOREACH(rule, &s->rules[event], next) {
- injected = process_rule(bs, rule, old_state, injected);
+ injected = process_rule(bs, rule, injected);
}
+ s->state = s->new_state;
}
static int64_t blkdebug_getlength(BlockDriverState *bs)
diff --git a/block/blkverify.c b/block/blkverify.c
index 9d5f1ec5b..4beede77a 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -48,7 +48,7 @@ static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb)
}
}
-static AIOPool blkverify_aio_pool = {
+static const AIOCBInfo blkverify_aiocb_info = {
.aiocb_size = sizeof(BlkverifyAIOCB),
.cancel = blkverify_aio_cancel,
};
@@ -233,7 +233,7 @@ static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
BlockDriverCompletionFunc *cb,
void *opaque)
{
- BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aio_pool, bs, cb, opaque);
+ BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque);
acb->bh = NULL;
acb->is_write = is_write;
diff --git a/block/commit.c b/block/commit.c
new file mode 100644
index 000000000..fae79582d
--- /dev/null
+++ b/block/commit.c
@@ -0,0 +1,259 @@
+/*
+ * Live block commit
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ * Jeff Cody <jcody@redhat.com>
+ * Based on stream.c by Stefan Hajnoczi
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block_int.h"
+#include "blockjob.h"
+#include "qemu/ratelimit.h"
+
+enum {
+ /*
+ * Size of data buffer for populating the image file. This should be large
+ * enough to process multiple clusters in a single call, so that populating
+ * contiguous regions of the image is efficient.
+ */
+ COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
+};
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct CommitBlockJob {
+ BlockJob common;
+ RateLimit limit;
+ BlockDriverState *active;
+ BlockDriverState *top;
+ BlockDriverState *base;
+ BlockdevOnError on_error;
+ int base_flags;
+ int orig_overlay_flags;
+} CommitBlockJob;
+
+static int coroutine_fn commit_populate(BlockDriverState *bs,
+ BlockDriverState *base,
+ int64_t sector_num, int nb_sectors,
+ void *buf)
+{
+ int ret = 0;
+
+ ret = bdrv_read(bs, sector_num, buf, nb_sectors);
+ if (ret) {
+ return ret;
+ }
+
+ ret = bdrv_write(base, sector_num, buf, nb_sectors);
+ if (ret) {
+ return ret;
+ }
+
+ return 0;
+}
+
+static void coroutine_fn commit_run(void *opaque)
+{
+ CommitBlockJob *s = opaque;
+ BlockDriverState *active = s->active;
+ BlockDriverState *top = s->top;
+ BlockDriverState *base = s->base;
+ BlockDriverState *overlay_bs = NULL;
+ int64_t sector_num, end;
+ int ret = 0;
+ int n = 0;
+ void *buf;
+ int bytes_written = 0;
+ int64_t base_len;
+
+ ret = s->common.len = bdrv_getlength(top);
+
+
+ if (s->common.len < 0) {
+ goto exit_restore_reopen;
+ }
+
+ ret = base_len = bdrv_getlength(base);
+ if (base_len < 0) {
+ goto exit_restore_reopen;
+ }
+
+ if (base_len < s->common.len) {
+ ret = bdrv_truncate(base, s->common.len);
+ if (ret) {
+ goto exit_restore_reopen;
+ }
+ }
+
+ overlay_bs = bdrv_find_overlay(active, top);
+
+ end = s->common.len >> BDRV_SECTOR_BITS;
+ buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
+
+ for (sector_num = 0; sector_num < end; sector_num += n) {
+ uint64_t delay_ns = 0;
+ bool copy;
+
+wait:
+ /* Note that even when no rate limit is applied we need to yield
+ * with no pending I/O here so that qemu_aio_flush() returns.
+ */
+ block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+ if (block_job_is_cancelled(&s->common)) {
+ break;
+ }
+ /* Copy if allocated above the base */
+ ret = bdrv_co_is_allocated_above(top, base, sector_num,
+ COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
+ &n);
+ copy = (ret == 1);
+ trace_commit_one_iteration(s, sector_num, n, ret);
+ if (copy) {
+ if (s->common.speed) {
+ delay_ns = ratelimit_calculate_delay(&s->limit, n);
+ if (delay_ns > 0) {
+ goto wait;
+ }
+ }
+ ret = commit_populate(top, base, sector_num, n, buf);
+ bytes_written += n * BDRV_SECTOR_SIZE;
+ }
+ if (ret < 0) {
+ if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
+ s->on_error == BLOCKDEV_ON_ERROR_REPORT||
+ (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
+ goto exit_free_buf;
+ } else {
+ n = 0;
+ continue;
+ }
+ }
+ /* Publish progress */
+ s->common.offset += n * BDRV_SECTOR_SIZE;
+ }
+
+ ret = 0;
+
+ if (!block_job_is_cancelled(&s->common) && sector_num == end) {
+ /* success */
+ ret = bdrv_drop_intermediate(active, top, base);
+ }
+
+exit_free_buf:
+ qemu_vfree(buf);
+
+exit_restore_reopen:
+ /* restore base open flags here if appropriate (e.g., change the base back
+ * to r/o). These reopens do not need to be atomic, since we won't abort
+ * even on failure here */
+ if (s->base_flags != bdrv_get_flags(base)) {
+ bdrv_reopen(base, s->base_flags, NULL);
+ }
+ if (s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+ bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
+ }
+
+ block_job_completed(&s->common, ret);
+}
+
+static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+ CommitBlockJob *s = container_of(job, CommitBlockJob, common);
+
+ if (speed < 0) {
+ error_set(errp, QERR_INVALID_PARAMETER, "speed");
+ return;
+ }
+ ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static BlockJobType commit_job_type = {
+ .instance_size = sizeof(CommitBlockJob),
+ .job_type = "commit",
+ .set_speed = commit_set_speed,
+};
+
+void commit_start(BlockDriverState *bs, BlockDriverState *base,
+ BlockDriverState *top, int64_t speed,
+ BlockdevOnError on_error, BlockDriverCompletionFunc *cb,
+ void *opaque, Error **errp)
+{
+ CommitBlockJob *s;
+ BlockReopenQueue *reopen_queue = NULL;
+ int orig_overlay_flags;
+ int orig_base_flags;
+ BlockDriverState *overlay_bs;
+ Error *local_err = NULL;
+
+ if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+ on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+ !bdrv_iostatus_is_enabled(bs)) {
+ error_set(errp, QERR_INVALID_PARAMETER_COMBINATION);
+ return;
+ }
+
+ /* Once we support top == active layer, remove this check */
+ if (top == bs) {
+ error_setg(errp,
+ "Top image as the active layer is currently unsupported");
+ return;
+ }
+
+ if (top == base) {
+ error_setg(errp, "Invalid files for merge: top and base are the same");
+ return;
+ }
+
+ overlay_bs = bdrv_find_overlay(bs, top);
+
+ if (overlay_bs == NULL) {
+ error_setg(errp, "Could not find overlay image for %s:", top->filename);
+ return;
+ }
+
+ orig_base_flags = bdrv_get_flags(base);
+ orig_overlay_flags = bdrv_get_flags(overlay_bs);
+
+ /* convert base & overlay_bs to r/w, if necessary */
+ if (!(orig_base_flags & BDRV_O_RDWR)) {
+ reopen_queue = bdrv_reopen_queue(reopen_queue, base,
+ orig_base_flags | BDRV_O_RDWR);
+ }
+ if (!(orig_overlay_flags & BDRV_O_RDWR)) {
+ reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
+ orig_overlay_flags | BDRV_O_RDWR);
+ }
+ if (reopen_queue) {
+ bdrv_reopen_multiple(reopen_queue, &local_err);
+ if (local_err != NULL) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ }
+
+
+ s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp);
+ if (!s) {
+ return;
+ }
+
+ s->base = base;
+ s->top = top;
+ s->active = bs;
+
+ s->base_flags = orig_base_flags;
+ s->orig_overlay_flags = orig_overlay_flags;
+
+ s->on_error = on_error;
+ s->common.co = qemu_coroutine_create(commit_run);
+
+ trace_commit_start(bs, base, top, s, s->common.co, opaque);
+ qemu_coroutine_enter(s->common.co, s);
+}
diff --git a/block/curl.c b/block/curl.c
index e7c3634d3..1179484de 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -438,7 +438,7 @@ static void curl_aio_cancel(BlockDriverAIOCB *blockacb)
// Do we have to implement canceling? Seems to work without...
}
-static AIOPool curl_aio_pool = {
+static const AIOCBInfo curl_aiocb_info = {
.aiocb_size = sizeof(CURLAIOCB),
.cancel = curl_aio_cancel,
};
@@ -505,7 +505,7 @@ static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs,
{
CURLAIOCB *acb;
- acb = qemu_aio_get(&curl_aio_pool, bs, cb, opaque);
+ acb = qemu_aio_get(&curl_aiocb_info, bs, cb, opaque);
acb->qiov = qiov;
acb->sector_num = sector_num;
@@ -542,8 +542,7 @@ static void curl_close(BlockDriverState *bs)
}
if (s->multi)
curl_multi_cleanup(s->multi);
- if (s->url)
- free(s->url);
+ g_free(s->url);
}
static int64_t curl_getlength(BlockDriverState *bs)
diff --git a/block/gluster.c b/block/gluster.c
new file mode 100644
index 000000000..1c90174b1
--- /dev/null
+++ b/block/gluster.c
@@ -0,0 +1,624 @@
+/*
+ * GlusterFS backend for QEMU
+ *
+ * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
+ *
+ * Pipe handling mechanism in AIO implementation is derived from
+ * block/rbd.c. Hence,
+ *
+ * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
+ * Josh Durgin <josh.durgin@dreamhost.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+#include <glusterfs/api/glfs.h>
+#include "block_int.h"
+#include "qemu_socket.h"
+#include "uri.h"
+
+typedef struct GlusterAIOCB {
+ BlockDriverAIOCB common;
+ int64_t size;
+ int ret;
+ bool *finished;
+ QEMUBH *bh;
+} GlusterAIOCB;
+
+typedef struct BDRVGlusterState {
+ struct glfs *glfs;
+ int fds[2];
+ struct glfs_fd *fd;
+ int qemu_aio_count;
+ int event_reader_pos;
+ GlusterAIOCB *event_acb;
+} BDRVGlusterState;
+
+#define GLUSTER_FD_READ 0
+#define GLUSTER_FD_WRITE 1
+
+typedef struct GlusterConf {
+ char *server;
+ int port;
+ char *volname;
+ char *image;
+ char *transport;
+} GlusterConf;
+
+static void qemu_gluster_gconf_free(GlusterConf *gconf)
+{
+ g_free(gconf->server);
+ g_free(gconf->volname);
+ g_free(gconf->image);
+ g_free(gconf->transport);
+ g_free(gconf);
+}
+
+static int parse_volume_options(GlusterConf *gconf, char *path)
+{
+ char *p, *q;
+
+ if (!path) {
+ return -EINVAL;
+ }
+
+ /* volume */
+ p = q = path + strspn(path, "/");
+ p += strcspn(p, "/");
+ if (*p == '\0') {
+ return -EINVAL;
+ }
+ gconf->volname = g_strndup(q, p - q);
+
+ /* image */
+ p += strspn(p, "/");
+ if (*p == '\0') {
+ return -EINVAL;
+ }
+ gconf->image = g_strdup(p);
+ return 0;
+}
+
+/*
+ * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
+ *
+ * 'gluster' is the protocol.
+ *
+ * 'transport' specifies the transport type used to connect to gluster
+ * management daemon (glusterd). Valid transport types are
+ * tcp, unix and rdma. If a transport type isn't specified, then tcp
+ * type is assumed.
+ *
+ * 'server' specifies the server where the volume file specification for
+ * the given volume resides. This can be either hostname, ipv4 address
+ * or ipv6 address. ipv6 address needs to be within square brackets [ ].
+ * If transport type is 'unix', then 'server' field should not be specifed.
+ * The 'socket' field needs to be populated with the path to unix domain
+ * socket.
+ *
+ * 'port' is the port number on which glusterd is listening. This is optional
+ * and if not specified, QEMU will send 0 which will make gluster to use the
+ * default port. If the transport type is unix, then 'port' should not be
+ * specified.
+ *
+ * 'volname' is the name of the gluster volume which contains the VM image.
+ *
+ * 'image' is the path to the actual VM image that resides on gluster volume.
+ *
+ * Examples:
+ *
+ * file=gluster://1.2.3.4/testvol/a.img
+ * file=gluster+tcp://1.2.3.4/testvol/a.img
+ * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
+ * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
+ * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
+ * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
+ * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
+ * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
+ */
+static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
+{
+ URI *uri;
+ QueryParams *qp = NULL;
+ bool is_unix = false;
+ int ret = 0;
+
+ uri = uri_parse(filename);
+ if (!uri) {
+ return -EINVAL;
+ }
+
+ /* transport */
+ if (!strcmp(uri->scheme, "gluster")) {
+ gconf->transport = g_strdup("tcp");
+ } else if (!strcmp(uri->scheme, "gluster+tcp")) {
+ gconf->transport = g_strdup("tcp");
+ } else if (!strcmp(uri->scheme, "gluster+unix")) {
+ gconf->transport = g_strdup("unix");
+ is_unix = true;
+ } else if (!strcmp(uri->scheme, "gluster+rdma")) {
+ gconf->transport = g_strdup("rdma");
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = parse_volume_options(gconf, uri->path);
+ if (ret < 0) {
+ goto out;
+ }
+
+ qp = query_params_parse(uri->query);
+ if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (is_unix) {
+ if (uri->server || uri->port) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (strcmp(qp->p[0].name, "socket")) {
+ ret = -EINVAL;
+ goto out;
+ }
+ gconf->server = g_strdup(qp->p[0].value);
+ } else {
+ gconf->server = g_strdup(uri->server);
+ gconf->port = uri->port;
+ }
+
+out:
+ if (qp) {
+ query_params_free(qp);
+ }
+ uri_free(uri);
+ return ret;
+}
+
+static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename)
+{
+ struct glfs *glfs = NULL;
+ int ret;
+ int old_errno;
+
+ ret = qemu_gluster_parseuri(gconf, filename);
+ if (ret < 0) {
+ error_report("Usage: file=gluster[+transport]://[server[:port]]/"
+ "volname/image[?socket=...]");
+ errno = -ret;
+ goto out;
+ }
+
+ glfs = glfs_new(gconf->volname);
+ if (!glfs) {
+ goto out;
+ }
+
+ ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
+ gconf->port);
+ if (ret < 0) {
+ goto out;
+ }
+
+ /*
+ * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
+ * GlusterFS makes GF_LOG_* macros available to libgfapi users.
+ */
+ ret = glfs_set_logging(glfs, "-", 4);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = glfs_init(glfs);
+ if (ret) {
+ error_report("Gluster connection failed for server=%s port=%d "
+ "volume=%s image=%s transport=%s\n", gconf->server, gconf->port,
+ gconf->volname, gconf->image, gconf->transport);
+ goto out;
+ }
+ return glfs;
+
+out:
+ if (glfs) {
+ old_errno = errno;
+ glfs_fini(glfs);
+ errno = old_errno;
+ }
+ return NULL;
+}
+
+static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s)
+{
+ int ret;
+ bool *finished = acb->finished;
+ BlockDriverCompletionFunc *cb = acb->common.cb;
+ void *opaque = acb->common.opaque;
+
+ if (!acb->ret || acb->ret == acb->size) {
+ ret = 0; /* Success */
+ } else if (acb->ret < 0) {
+ ret = acb->ret; /* Read/Write failed */
+ } else {
+ ret = -EIO; /* Partial read/write - fail it */
+ }
+
+ s->qemu_aio_count--;
+ qemu_aio_release(acb);
+ cb(opaque, ret);
+ if (finished) {
+ *finished = true;
+ }
+}
+
+static void qemu_gluster_aio_event_reader(void *opaque)
+{
+ BDRVGlusterState *s = opaque;
+ ssize_t ret;
+
+ do {
+ char *p = (char *)&s->event_acb;
+
+ ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
+ sizeof(s->event_acb) - s->event_reader_pos);
+ if (ret > 0) {
+ s->event_reader_pos += ret;
+ if (s->event_reader_pos == sizeof(s->event_acb)) {
+ s->event_reader_pos = 0;
+ qemu_gluster_complete_aio(s->event_acb, s);
+ }
+ }
+ } while (ret < 0 && errno == EINTR);
+}
+
+static int qemu_gluster_aio_flush_cb(void *opaque)
+{
+ BDRVGlusterState *s = opaque;
+
+ return (s->qemu_aio_count > 0);
+}
+
+static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
+ int bdrv_flags)
+{
+ BDRVGlusterState *s = bs->opaque;
+ int open_flags = O_BINARY;
+ int ret = 0;
+ GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+
+ s->glfs = qemu_gluster_init(gconf, filename);
+ if (!s->glfs) {
+ ret = -errno;
+ goto out;
+ }
+
+ if (bdrv_flags & BDRV_O_RDWR) {
+ open_flags |= O_RDWR;
+ } else {
+ open_flags |= O_RDONLY;
+ }
+
+ if ((bdrv_flags & BDRV_O_NOCACHE)) {
+ open_flags |= O_DIRECT;
+ }
+
+ s->fd = glfs_open(s->glfs, gconf->image, open_flags);
+ if (!s->fd) {
+ ret = -errno;
+ goto out;
+ }
+
+ ret = qemu_pipe(s->fds);
+ if (ret < 0) {
+ ret = -errno;
+ goto out;
+ }
+ fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK);
+ qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
+ qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
+
+out:
+ qemu_gluster_gconf_free(gconf);
+ if (!ret) {
+ return ret;
+ }
+ if (s->fd) {
+ glfs_close(s->fd);
+ }
+ if (s->glfs) {
+ glfs_fini(s->glfs);
+ }
+ return ret;
+}
+
+static int qemu_gluster_create(const char *filename,
+ QEMUOptionParameter *options)
+{
+ struct glfs *glfs;
+ struct glfs_fd *fd;
+ int ret = 0;
+ int64_t total_size = 0;
+ GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+
+ glfs = qemu_gluster_init(gconf, filename);
+ if (!glfs) {
+ ret = -errno;
+ goto out;
+ }
+
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n / BDRV_SECTOR_SIZE;
+ }
+ options++;
+ }
+
+ fd = glfs_creat(glfs, gconf->image,
+ O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
+ if (!fd) {
+ ret = -errno;
+ } else {
+ if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+ ret = -errno;
+ }
+ if (glfs_close(fd) != 0) {
+ ret = -errno;
+ }
+ }
+out:
+ qemu_gluster_gconf_free(gconf);
+ if (glfs) {
+ glfs_fini(glfs);
+ }
+ return ret;
+}
+
+static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
+ bool finished = false;
+
+ acb->finished = &finished;
+ while (!finished) {
+ qemu_aio_wait();
+ }
+}
+
+static const AIOCBInfo gluster_aiocb_info = {
+ .aiocb_size = sizeof(GlusterAIOCB),
+ .cancel = qemu_gluster_aio_cancel,
+};
+
+static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
+{
+ GlusterAIOCB *acb = (GlusterAIOCB *)arg;
+ BlockDriverState *bs = acb->common.bs;
+ BDRVGlusterState *s = bs->opaque;
+ int retval;
+
+ acb->ret = ret;
+ retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb));
+ if (retval != sizeof(acb)) {
+ /*
+ * Gluster AIO callback thread failed to notify the waiting
+ * QEMU thread about IO completion.
+ *
+ * Complete this IO request and make the disk inaccessible for
+ * subsequent reads and writes.
+ */
+ error_report("Gluster failed to notify QEMU about IO completion");
+
+ qemu_mutex_lock_iothread(); /* We are in gluster thread context */
+ acb->common.cb(acb->common.opaque, -EIO);
+ qemu_aio_release(acb);
+ s->qemu_aio_count--;
+ close(s->fds[GLUSTER_FD_READ]);
+ close(s->fds[GLUSTER_FD_WRITE]);
+ qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL,
+ NULL);
+ bs->drv = NULL; /* Make the disk inaccessible */
+ qemu_mutex_unlock_iothread();
+ }
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int write)
+{
+ int ret;
+ GlusterAIOCB *acb;
+ BDRVGlusterState *s = bs->opaque;
+ size_t size;
+ off_t offset;
+
+ offset = sector_num * BDRV_SECTOR_SIZE;
+ size = nb_sectors * BDRV_SECTOR_SIZE;
+ s->qemu_aio_count++;
+
+ acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
+ acb->size = size;
+ acb->ret = 0;
+ acb->finished = NULL;
+
+ if (write) {
+ ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+ &gluster_finish_aiocb, acb);
+ } else {
+ ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+ &gluster_finish_aiocb, acb);
+ }
+
+ if (ret < 0) {
+ goto out;
+ }
+ return &acb->common;
+
+out:
+ s->qemu_aio_count--;
+ qemu_aio_release(acb);
+ return NULL;
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ int ret;
+ GlusterAIOCB *acb;
+ BDRVGlusterState *s = bs->opaque;
+
+ acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
+ acb->size = 0;
+ acb->ret = 0;
+ acb->finished = NULL;
+ s->qemu_aio_count++;
+
+ ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
+ if (ret < 0) {
+ goto out;
+ }
+ return &acb->common;
+
+out:
+ s->qemu_aio_count--;
+ qemu_aio_release(acb);
+ return NULL;
+}
+
+static int64_t qemu_gluster_getlength(BlockDriverState *bs)
+{
+ BDRVGlusterState *s = bs->opaque;
+ int64_t ret;
+
+ ret = glfs_lseek(s->fd, 0, SEEK_END);
+ if (ret < 0) {
+ return -errno;
+ } else {
+ return ret;
+ }
+}
+
+static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
+{
+ BDRVGlusterState *s = bs->opaque;
+ struct stat st;
+ int ret;
+
+ ret = glfs_fstat(s->fd, &st);
+ if (ret < 0) {
+ return -errno;
+ } else {
+ return st.st_blocks * 512;
+ }
+}
+
+static void qemu_gluster_close(BlockDriverState *bs)
+{
+ BDRVGlusterState *s = bs->opaque;
+
+ close(s->fds[GLUSTER_FD_READ]);
+ close(s->fds[GLUSTER_FD_WRITE]);
+ qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, NULL);
+
+ if (s->fd) {
+ glfs_close(s->fd);
+ s->fd = NULL;
+ }
+ glfs_fini(s->glfs);
+}
+
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ { NULL }
+};
+
+static BlockDriver bdrv_gluster = {
+ .format_name = "gluster",
+ .protocol_name = "gluster",
+ .instance_size = sizeof(BDRVGlusterState),
+ .bdrv_file_open = qemu_gluster_open,
+ .bdrv_close = qemu_gluster_close,
+ .bdrv_create = qemu_gluster_create,
+ .bdrv_getlength = qemu_gluster_getlength,
+ .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_aio_readv = qemu_gluster_aio_readv,
+ .bdrv_aio_writev = qemu_gluster_aio_writev,
+ .bdrv_aio_flush = qemu_gluster_aio_flush,
+ .create_options = qemu_gluster_create_options,
+};
+
+static BlockDriver bdrv_gluster_tcp = {
+ .format_name = "gluster",
+ .protocol_name = "gluster+tcp",
+ .instance_size = sizeof(BDRVGlusterState),
+ .bdrv_file_open = qemu_gluster_open,
+ .bdrv_close = qemu_gluster_close,
+ .bdrv_create = qemu_gluster_create,
+ .bdrv_getlength = qemu_gluster_getlength,
+ .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_aio_readv = qemu_gluster_aio_readv,
+ .bdrv_aio_writev = qemu_gluster_aio_writev,
+ .bdrv_aio_flush = qemu_gluster_aio_flush,
+ .create_options = qemu_gluster_create_options,
+};
+
+static BlockDriver bdrv_gluster_unix = {
+ .format_name = "gluster",
+ .protocol_name = "gluster+unix",
+ .instance_size = sizeof(BDRVGlusterState),
+ .bdrv_file_open = qemu_gluster_open,
+ .bdrv_close = qemu_gluster_close,
+ .bdrv_create = qemu_gluster_create,
+ .bdrv_getlength = qemu_gluster_getlength,
+ .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_aio_readv = qemu_gluster_aio_readv,
+ .bdrv_aio_writev = qemu_gluster_aio_writev,
+ .bdrv_aio_flush = qemu_gluster_aio_flush,
+ .create_options = qemu_gluster_create_options,
+};
+
+static BlockDriver bdrv_gluster_rdma = {
+ .format_name = "gluster",
+ .protocol_name = "gluster+rdma",
+ .instance_size = sizeof(BDRVGlusterState),
+ .bdrv_file_open = qemu_gluster_open,
+ .bdrv_close = qemu_gluster_close,
+ .bdrv_create = qemu_gluster_create,
+ .bdrv_getlength = qemu_gluster_getlength,
+ .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_aio_readv = qemu_gluster_aio_readv,
+ .bdrv_aio_writev = qemu_gluster_aio_writev,
+ .bdrv_aio_flush = qemu_gluster_aio_flush,
+ .create_options = qemu_gluster_create_options,
+};
+
+static void bdrv_gluster_init(void)
+{
+ bdrv_register(&bdrv_gluster_rdma);
+ bdrv_register(&bdrv_gluster_unix);
+ bdrv_register(&bdrv_gluster_tcp);
+ bdrv_register(&bdrv_gluster);
+}
+
+block_init(bdrv_gluster_init);
diff --git a/block/iscsi.c b/block/iscsi.c
index 0b96165ec..c0b70b3d3 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -65,13 +65,6 @@ typedef struct IscsiAIOCB {
#endif
} IscsiAIOCB;
-struct IscsiTask {
- IscsiLun *iscsilun;
- BlockDriverState *bs;
- int status;
- int complete;
-};
-
static void
iscsi_bh_cb(void *p)
{
@@ -133,7 +126,7 @@ iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
}
}
-static AIOPool iscsi_aio_pool = {
+static const AIOCBInfo iscsi_aiocb_info = {
.aiocb_size = sizeof(IscsiAIOCB),
.cancel = iscsi_aio_cancel,
};
@@ -167,12 +160,6 @@ iscsi_set_events(IscsiLun *iscsilun)
}
- /* If we just added an event, the callback might be delayed
- * unless we call qemu_notify_event().
- */
- if (ev & ~iscsilun->events) {
- qemu_notify_event();
- }
iscsilun->events = ev;
}
@@ -240,7 +227,7 @@ iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
uint64_t lba;
struct iscsi_data data;
- acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+ acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
trace_iscsi_aio_writev(iscsi, sector_num, nb_sectors, opaque, acb);
acb->iscsilun = iscsilun;
@@ -268,10 +255,6 @@ iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
acb->task->xfer_dir = SCSI_XFER_WRITE;
acb->task->cdb_size = 16;
acb->task->cdb[0] = 0x8a;
- if (!(bs->open_flags & BDRV_O_CACHE_WB)) {
- /* set FUA on writes when cache mode is write through */
- acb->task->cdb[1] |= 0x04;
- }
lba = sector_qemu2lun(sector_num, iscsilun);
*(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32);
*(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff);
@@ -335,7 +318,7 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
qemu_read_size = BDRV_SECTOR_SIZE * (size_t)nb_sectors;
- acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+ acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
trace_iscsi_aio_readv(iscsi, sector_num, nb_sectors, opaque, acb);
acb->iscsilun = iscsilun;
@@ -390,7 +373,7 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
*(uint16_t *)&acb->task->cdb[7] = htons(num_sectors);
break;
}
-
+
if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
iscsi_aio_read16_cb,
NULL,
@@ -440,7 +423,7 @@ iscsi_aio_flush(BlockDriverState *bs,
struct iscsi_context *iscsi = iscsilun->iscsi;
IscsiAIOCB *acb;
- acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+ acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
acb->iscsilun = iscsilun;
acb->canceled = 0;
@@ -493,7 +476,7 @@ iscsi_aio_discard(BlockDriverState *bs,
IscsiAIOCB *acb;
struct unmap_list list[1];
- acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+ acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
acb->iscsilun = iscsilun;
acb->canceled = 0;
@@ -568,7 +551,7 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
assert(req == SG_IO);
- acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+ acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
acb->iscsilun = iscsilun;
acb->canceled = 0;
@@ -628,9 +611,17 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
return &acb->common;
}
+
+static void ioctl_cb(void *opaque, int status)
+{
+ int *p_status = opaque;
+ *p_status = status;
+}
+
static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
IscsiLun *iscsilun = bs->opaque;
+ int status;
switch (req) {
case SG_GET_VERSION_NUM:
@@ -639,6 +630,15 @@ static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
case SG_GET_SCSI_ID:
((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
break;
+ case SG_IO:
+ status = -EINPROGRESS;
+ iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status);
+
+ while (status == -EINPROGRESS) {
+ qemu_aio_wait();
+ }
+
+ return 0;
default:
return -1;
}
@@ -658,163 +658,6 @@ iscsi_getlength(BlockDriverState *bs)
return len;
}
-static void
-iscsi_readcapacity16_cb(struct iscsi_context *iscsi, int status,
- void *command_data, void *opaque)
-{
- struct IscsiTask *itask = opaque;
- struct scsi_readcapacity16 *rc16;
- struct scsi_task *task = command_data;
-
- if (status != 0) {
- error_report("iSCSI: Failed to read capacity of iSCSI lun. %s",
- iscsi_get_error(iscsi));
- itask->status = 1;
- itask->complete = 1;
- scsi_free_scsi_task(task);
- return;
- }
-
- rc16 = scsi_datain_unmarshall(task);
- if (rc16 == NULL) {
- error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
- itask->status = 1;
- itask->complete = 1;
- scsi_free_scsi_task(task);
- return;
- }
-
- itask->iscsilun->block_size = rc16->block_length;
- itask->iscsilun->num_blocks = rc16->returned_lba + 1;
- itask->bs->total_sectors = itask->iscsilun->num_blocks *
- itask->iscsilun->block_size / BDRV_SECTOR_SIZE ;
-
- itask->status = 0;
- itask->complete = 1;
- scsi_free_scsi_task(task);
-}
-
-static void
-iscsi_readcapacity10_cb(struct iscsi_context *iscsi, int status,
- void *command_data, void *opaque)
-{
- struct IscsiTask *itask = opaque;
- struct scsi_readcapacity10 *rc10;
- struct scsi_task *task = command_data;
-
- if (status != 0) {
- error_report("iSCSI: Failed to read capacity of iSCSI lun. %s",
- iscsi_get_error(iscsi));
- itask->status = 1;
- itask->complete = 1;
- scsi_free_scsi_task(task);
- return;
- }
-
- rc10 = scsi_datain_unmarshall(task);
- if (rc10 == NULL) {
- error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
- itask->status = 1;
- itask->complete = 1;
- scsi_free_scsi_task(task);
- return;
- }
-
- itask->iscsilun->block_size = rc10->block_size;
- if (rc10->lba == 0) {
- /* blank disk loaded */
- itask->iscsilun->num_blocks = 0;
- } else {
- itask->iscsilun->num_blocks = rc10->lba + 1;
- }
- itask->bs->total_sectors = itask->iscsilun->num_blocks *
- itask->iscsilun->block_size / BDRV_SECTOR_SIZE ;
-
- itask->status = 0;
- itask->complete = 1;
- scsi_free_scsi_task(task);
-}
-
-static void
-iscsi_inquiry_cb(struct iscsi_context *iscsi, int status, void *command_data,
- void *opaque)
-{
- struct IscsiTask *itask = opaque;
- struct scsi_task *task = command_data;
- struct scsi_inquiry_standard *inq;
-
- if (status != 0) {
- itask->status = 1;
- itask->complete = 1;
- scsi_free_scsi_task(task);
- return;
- }
-
- inq = scsi_datain_unmarshall(task);
- if (inq == NULL) {
- error_report("iSCSI: Failed to unmarshall inquiry data.");
- itask->status = 1;
- itask->complete = 1;
- scsi_free_scsi_task(task);
- return;
- }
-
- itask->iscsilun->type = inq->periperal_device_type;
-
- scsi_free_scsi_task(task);
-
- switch (itask->iscsilun->type) {
- case TYPE_DISK:
- task = iscsi_readcapacity16_task(iscsi, itask->iscsilun->lun,
- iscsi_readcapacity16_cb, opaque);
- if (task == NULL) {
- error_report("iSCSI: failed to send readcapacity16 command.");
- itask->status = 1;
- itask->complete = 1;
- return;
- }
- break;
- case TYPE_ROM:
- task = iscsi_readcapacity10_task(iscsi, itask->iscsilun->lun,
- 0, 0,
- iscsi_readcapacity10_cb, opaque);
- if (task == NULL) {
- error_report("iSCSI: failed to send readcapacity16 command.");
- itask->status = 1;
- itask->complete = 1;
- return;
- }
- break;
- default:
- itask->status = 0;
- itask->complete = 1;
- }
-}
-
-static void
-iscsi_connect_cb(struct iscsi_context *iscsi, int status, void *command_data,
- void *opaque)
-{
- struct IscsiTask *itask = opaque;
- struct scsi_task *task;
-
- if (status != 0) {
- itask->status = 1;
- itask->complete = 1;
- return;
- }
-
- task = iscsi_inquiry_task(iscsi, itask->iscsilun->lun,
- 0, 0, 36,
- iscsi_inquiry_cb, opaque);
- if (task == NULL) {
- error_report("iSCSI: failed to send inquiry command.");
- itask->status = 1;
- itask->complete = 1;
- return;
- }
-}
-
static int parse_chap(struct iscsi_context *iscsi, const char *target)
{
QemuOptsList *list;
@@ -927,7 +770,10 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
IscsiLun *iscsilun = bs->opaque;
struct iscsi_context *iscsi = NULL;
struct iscsi_url *iscsi_url = NULL;
- struct IscsiTask task;
+ struct scsi_task *task = NULL;
+ struct scsi_inquiry_standard *inq = NULL;
+ struct scsi_readcapacity10 *rc10 = NULL;
+ struct scsi_readcapacity16 *rc16 = NULL;
char *initiator_name = NULL;
int ret;
@@ -940,8 +786,7 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
iscsi_url = iscsi_parse_full_url(iscsi, filename);
if (iscsi_url == NULL) {
- error_report("Failed to parse URL : %s %s", filename,
- iscsi_get_error(iscsi));
+ error_report("Failed to parse URL : %s", filename);
ret = -EINVAL;
goto out;
}
@@ -991,33 +836,80 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
/* check if we got HEADER_DIGEST via the options */
parse_header_digest(iscsi, iscsi_url->target);
- task.iscsilun = iscsilun;
- task.status = 0;
- task.complete = 0;
- task.bs = bs;
+ if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) {
+ error_report("iSCSI: Failed to connect to LUN : %s",
+ iscsi_get_error(iscsi));
+ ret = -EINVAL;
+ goto out;
+ }
iscsilun->iscsi = iscsi;
iscsilun->lun = iscsi_url->lun;
- if (iscsi_full_connect_async(iscsi, iscsi_url->portal, iscsi_url->lun,
- iscsi_connect_cb, &task)
- != 0) {
- error_report("iSCSI: Failed to start async connect.");
+ task = iscsi_inquiry_sync(iscsi, iscsilun->lun, 0, 0, 36);
+
+ if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+ error_report("iSCSI: failed to send inquiry command.");
ret = -EINVAL;
goto out;
}
- while (!task.complete) {
- iscsi_set_events(iscsilun);
- qemu_aio_wait();
- }
- if (task.status != 0) {
- error_report("iSCSI: Failed to connect to LUN : %s",
- iscsi_get_error(iscsi));
+ inq = scsi_datain_unmarshall(task);
+ if (inq == NULL) {
+ error_report("iSCSI: Failed to unmarshall inquiry data.");
ret = -EINVAL;
goto out;
}
+ iscsilun->type = inq->periperal_device_type;
+
+ scsi_free_scsi_task(task);
+
+ switch (iscsilun->type) {
+ case TYPE_DISK:
+ task = iscsi_readcapacity16_sync(iscsi, iscsilun->lun);
+ if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+ error_report("iSCSI: failed to send readcapacity16 command.");
+ ret = -EINVAL;
+ goto out;
+ }
+ rc16 = scsi_datain_unmarshall(task);
+ if (rc16 == NULL) {
+ error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
+ ret = -EINVAL;
+ goto out;
+ }
+ iscsilun->block_size = rc16->block_length;
+ iscsilun->num_blocks = rc16->returned_lba + 1;
+ break;
+ case TYPE_ROM:
+ task = iscsi_readcapacity10_sync(iscsi, iscsilun->lun, 0, 0);
+ if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+ error_report("iSCSI: failed to send readcapacity10 command.");
+ ret = -EINVAL;
+ goto out;
+ }
+ rc10 = scsi_datain_unmarshall(task);
+ if (rc10 == NULL) {
+ error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
+ ret = -EINVAL;
+ goto out;
+ }
+ iscsilun->block_size = rc10->block_size;
+ if (rc10->lba == 0) {
+ /* blank disk loaded */
+ iscsilun->num_blocks = 0;
+ } else {
+ iscsilun->num_blocks = rc10->lba + 1;
+ }
+ break;
+ default:
+ break;
+ }
+
+ bs->total_sectors = iscsilun->num_blocks *
+ iscsilun->block_size / BDRV_SECTOR_SIZE ;
+
/* Medium changer or tape. We dont have any emulation for this so this must
* be sg ioctl compatible. We force it to be sg, otherwise qemu will try
* to read from the device to guess the image format.
@@ -1036,6 +928,9 @@ out:
if (iscsi_url != NULL) {
iscsi_destroy_url(iscsi_url);
}
+ if (task != NULL) {
+ scsi_free_scsi_task(task);
+ }
if (ret) {
if (iscsi != NULL) {
@@ -1056,6 +951,11 @@ static void iscsi_close(BlockDriverState *bs)
memset(iscsilun, 0, sizeof(IscsiLun));
}
+static int iscsi_has_zero_init(BlockDriverState *bs)
+{
+ return 0;
+}
+
static BlockDriver bdrv_iscsi = {
.format_name = "iscsi",
.protocol_name = "iscsi",
@@ -1071,6 +971,7 @@ static BlockDriver bdrv_iscsi = {
.bdrv_aio_flush = iscsi_aio_flush,
.bdrv_aio_discard = iscsi_aio_discard,
+ .bdrv_has_zero_init = iscsi_has_zero_init,
#ifdef __linux__
.bdrv_ioctl = iscsi_ioctl,
diff --git a/block/linux-aio.c b/block/linux-aio.c
new file mode 100644
index 000000000..91ef86324
--- /dev/null
+++ b/block/linux-aio.c
@@ -0,0 +1,216 @@
+/*
+ * Linux native AIO support.
+ *
+ * Copyright (C) 2009 IBM, Corp.
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu-common.h"
+#include "qemu-aio.h"
+#include "qemu-queue.h"
+#include "block/raw-aio.h"
+#include "event_notifier.h"
+
+#include <libaio.h>
+
+/*
+ * Queue size (per-device).
+ *
+ * XXX: eventually we need to communicate this to the guest and/or make it
+ * tunable by the guest. If we get more outstanding requests at a time
+ * than this we will get EAGAIN from io_submit which is communicated to
+ * the guest as an I/O error.
+ */
+#define MAX_EVENTS 128
+
+struct qemu_laiocb {
+ BlockDriverAIOCB common;
+ struct qemu_laio_state *ctx;
+ struct iocb iocb;
+ ssize_t ret;
+ size_t nbytes;
+ QEMUIOVector *qiov;
+ bool is_read;
+ QLIST_ENTRY(qemu_laiocb) node;
+};
+
+struct qemu_laio_state {
+ io_context_t ctx;
+ EventNotifier e;
+ int count;
+};
+
+static inline ssize_t io_event_ret(struct io_event *ev)
+{
+ return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
+}
+
+/*
+ * Completes an AIO request (calls the callback and frees the ACB).
+ */
+static void qemu_laio_process_completion(struct qemu_laio_state *s,
+ struct qemu_laiocb *laiocb)
+{
+ int ret;
+
+ s->count--;
+
+ ret = laiocb->ret;
+ if (ret != -ECANCELED) {
+ if (ret == laiocb->nbytes) {
+ ret = 0;
+ } else if (ret >= 0) {
+ /* Short reads mean EOF, pad with zeros. */
+ if (laiocb->is_read) {
+ qemu_iovec_memset(laiocb->qiov, ret, 0,
+ laiocb->qiov->size - ret);
+ } else {
+ ret = -EINVAL;
+ }
+ }
+
+ laiocb->common.cb(laiocb->common.opaque, ret);
+ }
+
+ qemu_aio_release(laiocb);
+}
+
+static void qemu_laio_completion_cb(EventNotifier *e)
+{
+ struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
+
+ while (event_notifier_test_and_clear(&s->e)) {
+ struct io_event events[MAX_EVENTS];
+ struct timespec ts = { 0 };
+ int nevents, i;
+
+ do {
+ nevents = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, events, &ts);
+ } while (nevents == -EINTR);
+
+ for (i = 0; i < nevents; i++) {
+ struct iocb *iocb = events[i].obj;
+ struct qemu_laiocb *laiocb =
+ container_of(iocb, struct qemu_laiocb, iocb);
+
+ laiocb->ret = io_event_ret(&events[i]);
+ qemu_laio_process_completion(s, laiocb);
+ }
+ }
+}
+
+static int qemu_laio_flush_cb(EventNotifier *e)
+{
+ struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
+
+ return (s->count > 0) ? 1 : 0;
+}
+
+static void laio_cancel(BlockDriverAIOCB *blockacb)
+{
+ struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
+ struct io_event event;
+ int ret;
+
+ if (laiocb->ret != -EINPROGRESS)
+ return;
+
+ /*
+ * Note that as of Linux 2.6.31 neither the block device code nor any
+ * filesystem implements cancellation of AIO request.
+ * Thus the polling loop below is the normal code path.
+ */
+ ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
+ if (ret == 0) {
+ laiocb->ret = -ECANCELED;
+ return;
+ }
+
+ /*
+ * We have to wait for the iocb to finish.
+ *
+ * The only way to get the iocb status update is by polling the io context.
+ * We might be able to do this slightly more optimal by removing the
+ * O_NONBLOCK flag.
+ */
+ while (laiocb->ret == -EINPROGRESS) {
+ qemu_laio_completion_cb(&laiocb->ctx->e);
+ }
+}
+
+static const AIOCBInfo laio_aiocb_info = {
+ .aiocb_size = sizeof(struct qemu_laiocb),
+ .cancel = laio_cancel,
+};
+
+BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+ struct qemu_laio_state *s = aio_ctx;
+ struct qemu_laiocb *laiocb;
+ struct iocb *iocbs;
+ off_t offset = sector_num * 512;
+
+ laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
+ laiocb->nbytes = nb_sectors * 512;
+ laiocb->ctx = s;
+ laiocb->ret = -EINPROGRESS;
+ laiocb->is_read = (type == QEMU_AIO_READ);
+ laiocb->qiov = qiov;
+
+ iocbs = &laiocb->iocb;
+
+ switch (type) {
+ case QEMU_AIO_WRITE:
+ io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+ break;
+ case QEMU_AIO_READ:
+ io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
+ break;
+ /* Currently Linux kernel does not support other operations */
+ default:
+ fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
+ __func__, type);
+ goto out_free_aiocb;
+ }
+ io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
+ s->count++;
+
+ if (io_submit(s->ctx, 1, &iocbs) < 0)
+ goto out_dec_count;
+ return &laiocb->common;
+
+out_dec_count:
+ s->count--;
+out_free_aiocb:
+ qemu_aio_release(laiocb);
+ return NULL;
+}
+
+void *laio_init(void)
+{
+ struct qemu_laio_state *s;
+
+ s = g_malloc0(sizeof(*s));
+ if (event_notifier_init(&s->e, false) < 0) {
+ goto out_free_state;
+ }
+
+ if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
+ goto out_close_efd;
+ }
+
+ qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb,
+ qemu_laio_flush_cb);
+
+ return s;
+
+out_close_efd:
+ event_notifier_cleanup(&s->e);
+out_free_state:
+ g_free(s);
+ return NULL;
+}
diff --git a/block/mirror.c b/block/mirror.c
new file mode 100644
index 000000000..d6618a4b3
--- /dev/null
+++ b/block/mirror.c
@@ -0,0 +1,322 @@
+/*
+ * Image mirroring
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ * Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "blockjob.h"
+#include "block_int.h"
+#include "qemu/ratelimit.h"
+
+enum {
+ /*
+ * Size of data buffer for populating the image file. This should be large
+ * enough to process multiple clusters in a single call, so that populating
+ * contiguous regions of the image is efficient.
+ */
+ BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
+};
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct MirrorBlockJob {
+ BlockJob common;
+ RateLimit limit;
+ BlockDriverState *target;
+ MirrorSyncMode mode;
+ BlockdevOnError on_source_error, on_target_error;
+ bool synced;
+ bool should_complete;
+ int64_t sector_num;
+ uint8_t *buf;
+} MirrorBlockJob;
+
+static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
+ int error)
+{
+ s->synced = false;
+ if (read) {
+ return block_job_error_action(&s->common, s->common.bs,
+ s->on_source_error, true, error);
+ } else {
+ return block_job_error_action(&s->common, s->target,
+ s->on_target_error, false, error);
+ }
+}
+
+static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
+ BlockErrorAction *p_action)
+{
+ BlockDriverState *source = s->common.bs;
+ BlockDriverState *target = s->target;
+ QEMUIOVector qiov;
+ int ret, nb_sectors;
+ int64_t end;
+ struct iovec iov;
+
+ end = s->common.len >> BDRV_SECTOR_BITS;
+ s->sector_num = bdrv_get_next_dirty(source, s->sector_num);
+ nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num);
+ bdrv_reset_dirty(source, s->sector_num, nb_sectors);
+
+ /* Copy the dirty cluster. */
+ iov.iov_base = s->buf;
+ iov.iov_len = nb_sectors * 512;
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ trace_mirror_one_iteration(s, s->sector_num, nb_sectors);
+ ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov);
+ if (ret < 0) {
+ *p_action = mirror_error_action(s, true, -ret);
+ goto fail;
+ }
+ ret = bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov);
+ if (ret < 0) {
+ *p_action = mirror_error_action(s, false, -ret);
+ s->synced = false;
+ goto fail;
+ }
+ return 0;
+
+fail:
+ /* Try again later. */
+ bdrv_set_dirty(source, s->sector_num, nb_sectors);
+ return ret;
+}
+
+static void coroutine_fn mirror_run(void *opaque)
+{
+ MirrorBlockJob *s = opaque;
+ BlockDriverState *bs = s->common.bs;
+ int64_t sector_num, end;
+ int ret = 0;
+ int n;
+
+ if (block_job_is_cancelled(&s->common)) {
+ goto immediate_exit;
+ }
+
+ s->common.len = bdrv_getlength(bs);
+ if (s->common.len < 0) {
+ block_job_completed(&s->common, s->common.len);
+ return;
+ }
+
+ end = s->common.len >> BDRV_SECTOR_BITS;
+ s->buf = qemu_blockalign(bs, BLOCK_SIZE);
+
+ if (s->mode != MIRROR_SYNC_MODE_NONE) {
+ /* First part, loop on the sectors and initialize the dirty bitmap. */
+ BlockDriverState *base;
+ base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
+ for (sector_num = 0; sector_num < end; ) {
+ int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
+ ret = bdrv_co_is_allocated_above(bs, base,
+ sector_num, next - sector_num, &n);
+
+ if (ret < 0) {
+ goto immediate_exit;
+ }
+
+ assert(n > 0);
+ if (ret == 1) {
+ bdrv_set_dirty(bs, sector_num, n);
+ sector_num = next;
+ } else {
+ sector_num += n;
+ }
+ }
+ }
+
+ s->sector_num = -1;
+ for (;;) {
+ uint64_t delay_ns;
+ int64_t cnt;
+ bool should_complete;
+
+ cnt = bdrv_get_dirty_count(bs);
+ if (cnt != 0) {
+ BlockErrorAction action = BDRV_ACTION_REPORT;
+ ret = mirror_iteration(s, &action);
+ if (ret < 0 && action == BDRV_ACTION_REPORT) {
+ goto immediate_exit;
+ }
+ cnt = bdrv_get_dirty_count(bs);
+ }
+
+ should_complete = false;
+ if (cnt == 0) {
+ trace_mirror_before_flush(s);
+ ret = bdrv_flush(s->target);
+ if (ret < 0) {
+ if (mirror_error_action(s, false, -ret) == BDRV_ACTION_REPORT) {
+ goto immediate_exit;
+ }
+ } else {
+ /* We're out of the streaming phase. From now on, if the job
+ * is cancelled we will actually complete all pending I/O and
+ * report completion. This way, block-job-cancel will leave
+ * the target in a consistent state.
+ */
+ s->common.offset = end * BDRV_SECTOR_SIZE;
+ if (!s->synced) {
+ block_job_ready(&s->common);
+ s->synced = true;
+ }
+
+ should_complete = s->should_complete ||
+ block_job_is_cancelled(&s->common);
+ cnt = bdrv_get_dirty_count(bs);
+ }
+ }
+
+ if (cnt == 0 && should_complete) {
+ /* The dirty bitmap is not updated while operations are pending.
+ * If we're about to exit, wait for pending operations before
+ * calling bdrv_get_dirty_count(bs), or we may exit while the
+ * source has dirty data to copy!
+ *
+ * Note that I/O can be submitted by the guest while
+ * mirror_populate runs.
+ */
+ trace_mirror_before_drain(s, cnt);
+ bdrv_drain_all();
+ cnt = bdrv_get_dirty_count(bs);
+ }
+
+ ret = 0;
+ trace_mirror_before_sleep(s, cnt, s->synced);
+ if (!s->synced) {
+ /* Publish progress */
+ s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
+
+ if (s->common.speed) {
+ delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK);
+ } else {
+ delay_ns = 0;
+ }
+
+ /* Note that even when no rate limit is applied we need to yield
+ * with no pending I/O here so that qemu_aio_flush() returns.
+ */
+ block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+ if (block_job_is_cancelled(&s->common)) {
+ break;
+ }
+ } else if (!should_complete) {
+ delay_ns = (cnt == 0 ? SLICE_TIME : 0);
+ block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+ } else if (cnt == 0) {
+ /* The two disks are in sync. Exit and report successful
+ * completion.
+ */
+ assert(QLIST_EMPTY(&bs->tracked_requests));
+ s->common.cancelled = false;
+ break;
+ }
+ }
+
+immediate_exit:
+ g_free(s->buf);
+ bdrv_set_dirty_tracking(bs, false);
+ bdrv_iostatus_disable(s->target);
+ if (s->should_complete && ret == 0) {
+ if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) {
+ bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL);
+ }
+ bdrv_swap(s->target, s->common.bs);
+ }
+ bdrv_close(s->target);
+ bdrv_delete(s->target);
+ block_job_completed(&s->common, ret);
+}
+
+static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+ MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+ if (speed < 0) {
+ error_set(errp, QERR_INVALID_PARAMETER, "speed");
+ return;
+ }
+ ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static void mirror_iostatus_reset(BlockJob *job)
+{
+ MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+ bdrv_iostatus_reset(s->target);
+}
+
+static void mirror_complete(BlockJob *job, Error **errp)
+{
+ MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+ int ret;
+
+ ret = bdrv_open_backing_file(s->target);
+ if (ret < 0) {
+ char backing_filename[PATH_MAX];
+ bdrv_get_full_backing_filename(s->target, backing_filename,
+ sizeof(backing_filename));
+ error_set(errp, QERR_OPEN_FILE_FAILED, backing_filename);
+ return;
+ }
+ if (!s->synced) {
+ error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
+ return;
+ }
+
+ s->should_complete = true;
+ block_job_resume(job);
+}
+
+static BlockJobType mirror_job_type = {
+ .instance_size = sizeof(MirrorBlockJob),
+ .job_type = "mirror",
+ .set_speed = mirror_set_speed,
+ .iostatus_reset= mirror_iostatus_reset,
+ .complete = mirror_complete,
+};
+
+void mirror_start(BlockDriverState *bs, BlockDriverState *target,
+ int64_t speed, MirrorSyncMode mode,
+ BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ BlockDriverCompletionFunc *cb,
+ void *opaque, Error **errp)
+{
+ MirrorBlockJob *s;
+
+ if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
+ on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+ !bdrv_iostatus_is_enabled(bs)) {
+ error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
+ return;
+ }
+
+ s = block_job_create(&mirror_job_type, bs, speed, cb, opaque, errp);
+ if (!s) {
+ return;
+ }
+
+ s->on_source_error = on_source_error;
+ s->on_target_error = on_target_error;
+ s->target = target;
+ s->mode = mode;
+ bdrv_set_dirty_tracking(bs, true);
+ bdrv_set_enable_write_cache(s->target, true);
+ bdrv_set_on_error(s->target, on_target_error, on_target_error);
+ bdrv_iostatus_enable(s->target);
+ s->common.co = qemu_coroutine_create(mirror_run);
+ trace_mirror_start(bs, s, s->common.co, opaque);
+ qemu_coroutine_enter(s->common.co, s);
+}
diff --git a/block/nbd.c b/block/nbd.c
index 2bce47bf7..e87c24817 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -28,6 +28,7 @@
#include "qemu-common.h"
#include "nbd.h"
+#include "uri.h"
#include "block_int.h"
#include "module.h"
#include "qemu_socket.h"
@@ -55,7 +56,6 @@ typedef struct BDRVNBDState {
uint32_t nbdflags;
off_t size;
size_t blocksize;
- char *export_name; /* An NBD server may export several devices */
CoMutex send_mutex;
CoMutex free_sema;
@@ -65,13 +65,75 @@ typedef struct BDRVNBDState {
Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
struct nbd_reply reply;
- /* If it begins with '/', this is a UNIX domain socket. Otherwise,
- * it's a string of the form <hostname|ip4|\[ip6\]>:port
- */
+ int is_unix;
char *host_spec;
+ char *export_name; /* An NBD server may export several devices */
} BDRVNBDState;
-static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
+static int nbd_parse_uri(BDRVNBDState *s, const char *filename)
+{
+ URI *uri;
+ const char *p;
+ QueryParams *qp = NULL;
+ int ret = 0;
+
+ uri = uri_parse(filename);
+ if (!uri) {
+ return -EINVAL;
+ }
+
+ /* transport */
+ if (!strcmp(uri->scheme, "nbd")) {
+ s->is_unix = false;
+ } else if (!strcmp(uri->scheme, "nbd+tcp")) {
+ s->is_unix = false;
+ } else if (!strcmp(uri->scheme, "nbd+unix")) {
+ s->is_unix = true;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ p = uri->path ? uri->path : "/";
+ p += strspn(p, "/");
+ if (p[0]) {
+ s->export_name = g_strdup(p);
+ }
+
+ qp = query_params_parse(uri->query);
+ if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (s->is_unix) {
+ /* nbd+unix:///export?socket=path */
+ if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
+ ret = -EINVAL;
+ goto out;
+ }
+ s->host_spec = g_strdup(qp->p[0].value);
+ } else {
+ /* nbd[+tcp]://host:port/export */
+ if (!uri->server) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!uri->port) {
+ uri->port = NBD_DEFAULT_PORT;
+ }
+ s->host_spec = g_strdup_printf("%s:%d", uri->server, uri->port);
+ }
+
+out:
+ if (qp) {
+ query_params_free(qp);
+ }
+ uri_free(uri);
+ return ret;
+}
+
+static int nbd_config(BDRVNBDState *s, const char *filename)
{
char *file;
char *export_name;
@@ -79,6 +141,10 @@ static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
const char *unixpath;
int err = -EINVAL;
+ if (strstr(filename, "://")) {
+ return nbd_parse_uri(s, filename);
+ }
+
file = g_strdup(filename);
export_name = strstr(file, EN_OPTSTR);
@@ -98,11 +164,10 @@ static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
/* are we a UNIX or TCP socket? */
if (strstart(host_spec, "unix:", &unixpath)) {
- if (unixpath[0] != '/') { /* We demand an absolute path*/
- goto out;
- }
+ s->is_unix = true;
s->host_spec = g_strdup(unixpath);
} else {
+ s->is_unix = false;
s->host_spec = g_strdup(host_spec);
}
@@ -262,7 +327,7 @@ static int nbd_establish_connection(BlockDriverState *bs)
off_t size;
size_t blocksize;
- if (s->host_spec[0] == '/') {
+ if (s->is_unix) {
sock = unix_socket_outgoing(s->host_spec);
} else {
sock = tcp_socket_outgoing_spec(s->host_spec);
@@ -320,7 +385,7 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
qemu_co_mutex_init(&s->free_sema);
/* Pop the config into our state object. Exit if invalid. */
- result = nbd_config(s, filename, flags);
+ result = nbd_config(s, filename);
if (result != 0) {
return result;
}
@@ -498,6 +563,33 @@ static int64_t nbd_getlength(BlockDriverState *bs)
static BlockDriver bdrv_nbd = {
.format_name = "nbd",
+ .protocol_name = "nbd",
+ .instance_size = sizeof(BDRVNBDState),
+ .bdrv_file_open = nbd_open,
+ .bdrv_co_readv = nbd_co_readv,
+ .bdrv_co_writev = nbd_co_writev,
+ .bdrv_close = nbd_close,
+ .bdrv_co_flush_to_os = nbd_co_flush,
+ .bdrv_co_discard = nbd_co_discard,
+ .bdrv_getlength = nbd_getlength,
+};
+
+static BlockDriver bdrv_nbd_tcp = {
+ .format_name = "nbd",
+ .protocol_name = "nbd+tcp",
+ .instance_size = sizeof(BDRVNBDState),
+ .bdrv_file_open = nbd_open,
+ .bdrv_co_readv = nbd_co_readv,
+ .bdrv_co_writev = nbd_co_writev,
+ .bdrv_close = nbd_close,
+ .bdrv_co_flush_to_os = nbd_co_flush,
+ .bdrv_co_discard = nbd_co_discard,
+ .bdrv_getlength = nbd_getlength,
+};
+
+static BlockDriver bdrv_nbd_unix = {
+ .format_name = "nbd",
+ .protocol_name = "nbd+unix",
.instance_size = sizeof(BDRVNBDState),
.bdrv_file_open = nbd_open,
.bdrv_co_readv = nbd_co_readv,
@@ -506,12 +598,13 @@ static BlockDriver bdrv_nbd = {
.bdrv_co_flush_to_os = nbd_co_flush,
.bdrv_co_discard = nbd_co_discard,
.bdrv_getlength = nbd_getlength,
- .protocol_name = "nbd",
};
static void bdrv_nbd_init(void)
{
bdrv_register(&bdrv_nbd);
+ bdrv_register(&bdrv_nbd_tcp);
+ bdrv_register(&bdrv_nbd_unix);
}
block_init(bdrv_nbd_init);
diff --git a/block/qcow.c b/block/qcow.c
index 7b5ab87d2..b239c82ae 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -197,6 +197,15 @@ static int qcow_open(BlockDriverState *bs, int flags)
return ret;
}
+
+/* We have nothing to do for QCOW reopen, stubs just return
+ * success */
+static int qcow_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
BDRVQcowState *s = bs->opaque;
@@ -868,6 +877,7 @@ static BlockDriver bdrv_qcow = {
.bdrv_probe = qcow_probe,
.bdrv_open = qcow_open,
.bdrv_close = qcow_close,
+ .bdrv_reopen_prepare = qcow_reopen_prepare,
.bdrv_create = qcow_create,
.bdrv_co_readv = qcow_co_readv,
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 5e3f9153f..96224d1af 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -301,7 +301,8 @@ static int alloc_refcount_block(BlockDriverState *bs,
uint64_t last_table_size;
uint64_t blocks_clusters;
do {
- uint64_t table_clusters = size_to_clusters(s, table_size);
+ uint64_t table_clusters =
+ size_to_clusters(s, table_size * sizeof(uint64_t));
blocks_clusters = 1 +
((table_clusters + refcount_block_clusters - 1)
/ refcount_block_clusters);
diff --git a/block/qcow2.c b/block/qcow2.c
index 8f183f146..c1ff31f48 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -52,6 +52,7 @@ typedef struct {
uint32_t magic;
uint32_t len;
} QCowExtension;
+
#define QCOW2_EXT_MAGIC_END 0
#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
@@ -558,6 +559,14 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
return 0;
}
+/* We have nothing to do for QCOW2 reopen, stubs just return
+ * success */
+static int qcow2_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
int64_t sector_num, int nb_sectors, int *pnum)
{
@@ -1087,6 +1096,7 @@ int qcow2_update_header(BlockDriverState *bs)
goto fail;
}
+ /* Using strncpy is ok here, since buf is not NUL-terminated. */
strncpy(buf, bs->backing_file, buflen);
header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
@@ -1679,6 +1689,7 @@ static BlockDriver bdrv_qcow2 = {
.bdrv_probe = qcow2_probe,
.bdrv_open = qcow2_open,
.bdrv_close = qcow2_close,
+ .bdrv_reopen_prepare = qcow2_reopen_prepare,
.bdrv_create = qcow2_create,
.bdrv_co_is_allocated = qcow2_co_is_allocated,
.bdrv_set_key = qcow2_set_key,
diff --git a/block/qed-table.c b/block/qed-table.c
index ce07b0554..de845ec3d 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -103,7 +103,6 @@ static void qed_write_table_cb(void *opaque, int ret)
out:
qemu_vfree(write_table_cb->table);
gencb_complete(&write_table_cb->gencb, ret);
- return;
}
/**
diff --git a/block/qed.c b/block/qed.c
index 21cb23987..0b5374a20 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -30,7 +30,7 @@ static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
}
}
-static AIOPool qed_aio_pool = {
+static const AIOCBInfo qed_aiocb_info = {
.aiocb_size = sizeof(QEDAIOCB),
.cancel = qed_aio_cancel,
};
@@ -505,6 +505,14 @@ out:
return ret;
}
+/* We have nothing to do for QED reopen, stubs just return
+ * success */
+static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
static void bdrv_qed_close(BlockDriverState *bs)
{
BDRVQEDState *s = bs->opaque;
@@ -1303,7 +1311,7 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
BlockDriverCompletionFunc *cb,
void *opaque, int flags)
{
- QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
+ QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
opaque, flags);
@@ -1564,6 +1572,7 @@ static BlockDriver bdrv_qed = {
.bdrv_rebind = bdrv_qed_rebind,
.bdrv_open = bdrv_qed_open,
.bdrv_close = bdrv_qed_close,
+ .bdrv_reopen_prepare = bdrv_qed_reopen_prepare,
.bdrv_create = bdrv_qed_create,
.bdrv_co_is_allocated = bdrv_qed_co_is_allocated,
.bdrv_make_empty = bdrv_qed_make_empty,
diff --git a/block/raw-posix-aio.h b/block/raw-aio.h
index ba118f616..e77f36114 100644
--- a/block/raw-posix-aio.h
+++ b/block/raw-aio.h
@@ -1,5 +1,5 @@
/*
- * QEMU Posix block I/O backend AIO support
+ * Declarations for AIO in the raw protocol
*
* Copyright IBM, Corp. 2008
*
@@ -12,8 +12,8 @@
* Contributions after 2012-01-13 are licensed under the terms of the
* GNU GPL, version 2 or (at your option) any later version.
*/
-#ifndef QEMU_RAW_POSIX_AIO_H
-#define QEMU_RAW_POSIX_AIO_H
+#ifndef QEMU_RAW_AIO_H
+#define QEMU_RAW_AIO_H
/* AIO request types */
#define QEMU_AIO_READ 0x0001
@@ -27,19 +27,22 @@
#define QEMU_AIO_MISALIGNED 0x1000
-/* posix-aio-compat.c - thread pool based implementation */
-int paio_init(void);
-BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque, int type);
-BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
- unsigned long int req, void *buf,
- BlockDriverCompletionFunc *cb, void *opaque);
-
/* linux-aio.c - Linux native implementation */
+#ifdef CONFIG_LINUX_AIO
void *laio_init(void);
BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque, int type);
+#endif
+
+#ifdef _WIN32
+typedef struct QEMUWin32AIOState QEMUWin32AIOState;
+QEMUWin32AIOState *win32_aio_init(void);
+int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
+BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+ QEMUWin32AIOState *aio, HANDLE hfile,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int type);
+#endif
-#endif /* QEMU_RAW_POSIX_AIO_H */
+#endif /* QEMU_RAW_AIO_H */
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 6be20b192..550c81f22 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -27,7 +27,10 @@
#include "qemu-log.h"
#include "block_int.h"
#include "module.h"
-#include "block/raw-posix-aio.h"
+#include "trace.h"
+#include "thread-pool.h"
+#include "iov.h"
+#include "raw-aio.h"
#if defined(__APPLE__) && (__MACH__)
#include <paths.h>
@@ -133,16 +136,36 @@ typedef struct BDRVRawState {
int use_aio;
void *aio_ctx;
#endif
- uint8_t *aligned_buf;
- unsigned aligned_buf_size;
#ifdef CONFIG_XFS
bool is_xfs : 1;
#endif
} BDRVRawState;
+typedef struct BDRVRawReopenState {
+ int fd;
+ int open_flags;
+#ifdef CONFIG_LINUX_AIO
+ int use_aio;
+#endif
+} BDRVRawReopenState;
+
static int fd_open(BlockDriverState *bs);
static int64_t raw_getlength(BlockDriverState *bs);
+typedef struct RawPosixAIOData {
+ BlockDriverState *bs;
+ int aio_fildes;
+ union {
+ struct iovec *aio_iov;
+ void *aio_ioctl_buf;
+ };
+ int aio_niov;
+ size_t aio_nbytes;
+#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */
+ off_t aio_offset;
+ int aio_type;
+} RawPosixAIOData;
+
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
static int cdrom_reopen(BlockDriverState *bs);
#endif
@@ -185,6 +208,57 @@ static int raw_normalize_devicepath(const char **filename)
}
#endif
+static void raw_parse_flags(int bdrv_flags, int *open_flags)
+{
+ assert(open_flags != NULL);
+
+ *open_flags |= O_BINARY;
+ *open_flags &= ~O_ACCMODE;
+ if (bdrv_flags & BDRV_O_RDWR) {
+ *open_flags |= O_RDWR;
+ } else {
+ *open_flags |= O_RDONLY;
+ }
+
+ /* Use O_DSYNC for write-through caching, no flags for write-back caching,
+ * and O_DIRECT for no caching. */
+ if ((bdrv_flags & BDRV_O_NOCACHE)) {
+ *open_flags |= O_DIRECT;
+ }
+}
+
+#ifdef CONFIG_LINUX_AIO
+static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
+{
+ int ret = -1;
+ assert(aio_ctx != NULL);
+ assert(use_aio != NULL);
+ /*
+ * Currently Linux do AIO only for files opened with O_DIRECT
+ * specified so check NOCACHE flag too
+ */
+ if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
+ (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
+
+ /* if non-NULL, laio_init() has already been run */
+ if (*aio_ctx == NULL) {
+ *aio_ctx = laio_init();
+ if (!*aio_ctx) {
+ goto error;
+ }
+ }
+ *use_aio = 1;
+ } else {
+ *use_aio = 0;
+ }
+
+ ret = 0;
+
+error:
+ return ret;
+}
+#endif
+
static int raw_open_common(BlockDriverState *bs, const char *filename,
int bdrv_flags, int open_flags)
{
@@ -196,20 +270,8 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
return ret;
}
- s->open_flags = open_flags | O_BINARY;
- s->open_flags &= ~O_ACCMODE;
- if (bdrv_flags & BDRV_O_RDWR) {
- s->open_flags |= O_RDWR;
- } else {
- s->open_flags |= O_RDONLY;
- }
-
- /* Use O_DSYNC for write-through caching, no flags for write-back caching,
- * and O_DIRECT for no caching. */
- if ((bdrv_flags & BDRV_O_NOCACHE))
- s->open_flags |= O_DIRECT;
- if (!(bdrv_flags & BDRV_O_CACHE_WB))
- s->open_flags |= O_DSYNC;
+ s->open_flags = open_flags;
+ raw_parse_flags(bdrv_flags, &s->open_flags);
s->fd = -1;
fd = qemu_open(filename, s->open_flags, 0644);
@@ -220,45 +282,13 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
return ret;
}
s->fd = fd;
- s->aligned_buf = NULL;
-
- if ((bdrv_flags & BDRV_O_NOCACHE)) {
- /*
- * Allocate a buffer for read/modify/write cycles. Chose the size
- * pessimistically as we don't know the block size yet.
- */
- s->aligned_buf_size = 32 * MAX_BLOCKSIZE;
- s->aligned_buf = qemu_memalign(MAX_BLOCKSIZE, s->aligned_buf_size);
- if (s->aligned_buf == NULL) {
- goto out_close;
- }
- }
-
- /* We're falling back to POSIX AIO in some cases so init always */
- if (paio_init() < 0) {
- goto out_free_buf;
- }
#ifdef CONFIG_LINUX_AIO
- /*
- * Currently Linux do AIO only for files opened with O_DIRECT
- * specified so check NOCACHE flag too
- */
- if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
- (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
-
- s->aio_ctx = laio_init();
- if (!s->aio_ctx) {
- goto out_free_buf;
- }
- s->use_aio = 1;
- } else
-#endif
- {
-#ifdef CONFIG_LINUX_AIO
- s->use_aio = 0;
-#endif
+ if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
+ qemu_close(fd);
+ return -errno;
}
+#endif
#ifdef CONFIG_XFS
if (platform_test_xfs_fd(s->fd)) {
@@ -267,12 +297,6 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
#endif
return 0;
-
-out_free_buf:
- qemu_vfree(s->aligned_buf);
-out_close:
- qemu_close(fd);
- return -errno;
}
static int raw_open(BlockDriverState *bs, const char *filename, int flags)
@@ -283,6 +307,113 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
return raw_open_common(bs, filename, flags, 0);
}
+static int raw_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ BDRVRawState *s;
+ BDRVRawReopenState *raw_s;
+ int ret = 0;
+
+ assert(state != NULL);
+ assert(state->bs != NULL);
+
+ s = state->bs->opaque;
+
+ state->opaque = g_malloc0(sizeof(BDRVRawReopenState));
+ raw_s = state->opaque;
+
+#ifdef CONFIG_LINUX_AIO
+ raw_s->use_aio = s->use_aio;
+
+ /* we can use s->aio_ctx instead of a copy, because the use_aio flag is
+ * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio()
+ * won't override aio_ctx if aio_ctx is non-NULL */
+ if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) {
+ return -1;
+ }
+#endif
+
+ if (s->type == FTYPE_FD || s->type == FTYPE_CD) {
+ raw_s->open_flags |= O_NONBLOCK;
+ }
+
+ raw_parse_flags(state->flags, &raw_s->open_flags);
+
+ raw_s->fd = -1;
+
+ int fcntl_flags = O_APPEND | O_ASYNC | O_NONBLOCK;
+#ifdef O_NOATIME
+ fcntl_flags |= O_NOATIME;
+#endif
+
+ if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
+ /* dup the original fd */
+ /* TODO: use qemu fcntl wrapper */
+#ifdef F_DUPFD_CLOEXEC
+ raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0);
+#else
+ raw_s->fd = dup(s->fd);
+ if (raw_s->fd != -1) {
+ qemu_set_cloexec(raw_s->fd);
+ }
+#endif
+ if (raw_s->fd >= 0) {
+ ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
+ if (ret) {
+ qemu_close(raw_s->fd);
+ raw_s->fd = -1;
+ }
+ }
+ }
+
+ /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
+ if (raw_s->fd == -1) {
+ assert(!(raw_s->open_flags & O_CREAT));
+ raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags);
+ if (raw_s->fd == -1) {
+ ret = -1;
+ }
+ }
+ return ret;
+}
+
+
+static void raw_reopen_commit(BDRVReopenState *state)
+{
+ BDRVRawReopenState *raw_s = state->opaque;
+ BDRVRawState *s = state->bs->opaque;
+
+ s->open_flags = raw_s->open_flags;
+
+ qemu_close(s->fd);
+ s->fd = raw_s->fd;
+#ifdef CONFIG_LINUX_AIO
+ s->use_aio = raw_s->use_aio;
+#endif
+
+ g_free(state->opaque);
+ state->opaque = NULL;
+}
+
+
+static void raw_reopen_abort(BDRVReopenState *state)
+{
+ BDRVRawReopenState *raw_s = state->opaque;
+
+ /* nothing to do if NULL, we didn't get far enough */
+ if (raw_s == NULL) {
+ return;
+ }
+
+ if (raw_s->fd >= 0) {
+ qemu_close(raw_s->fd);
+ raw_s->fd = -1;
+ }
+ g_free(state->opaque);
+ state->opaque = NULL;
+}
+
+
/* XXX: use host sector size if necessary with:
#ifdef DIOCGSECTORSIZE
{
@@ -316,6 +447,283 @@ static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
return 1;
}
+static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
+{
+ int ret;
+
+ ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
+ if (ret == -1) {
+ return -errno;
+ }
+
+ /*
+ * This looks weird, but the aio code only considers a request
+ * successful if it has written the full number of bytes.
+ *
+ * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
+ * so in fact we return the ioctl command here to make posix_aio_read()
+ * happy..
+ */
+ return aiocb->aio_nbytes;
+}
+
+static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
+{
+ int ret;
+
+ ret = qemu_fdatasync(aiocb->aio_fildes);
+ if (ret == -1) {
+ return -errno;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_PREADV
+
+static bool preadv_present = true;
+
+static ssize_t
+qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+ return preadv(fd, iov, nr_iov, offset);
+}
+
+static ssize_t
+qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+ return pwritev(fd, iov, nr_iov, offset);
+}
+
+#else
+
+static bool preadv_present = false;
+
+static ssize_t
+qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+ return -ENOSYS;
+}
+
+static ssize_t
+qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+ return -ENOSYS;
+}
+
+#endif
+
+static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
+{
+ ssize_t len;
+
+ do {
+ if (aiocb->aio_type & QEMU_AIO_WRITE)
+ len = qemu_pwritev(aiocb->aio_fildes,
+ aiocb->aio_iov,
+ aiocb->aio_niov,
+ aiocb->aio_offset);
+ else
+ len = qemu_preadv(aiocb->aio_fildes,
+ aiocb->aio_iov,
+ aiocb->aio_niov,
+ aiocb->aio_offset);
+ } while (len == -1 && errno == EINTR);
+
+ if (len == -1) {
+ return -errno;
+ }
+ return len;
+}
+
+/*
+ * Read/writes the data to/from a given linear buffer.
+ *
+ * Returns the number of bytes handles or -errno in case of an error. Short
+ * reads are only returned if the end of the file is reached.
+ */
+static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
+{
+ ssize_t offset = 0;
+ ssize_t len;
+
+ while (offset < aiocb->aio_nbytes) {
+ if (aiocb->aio_type & QEMU_AIO_WRITE) {
+ len = pwrite(aiocb->aio_fildes,
+ (const char *)buf + offset,
+ aiocb->aio_nbytes - offset,
+ aiocb->aio_offset + offset);
+ } else {
+ len = pread(aiocb->aio_fildes,
+ buf + offset,
+ aiocb->aio_nbytes - offset,
+ aiocb->aio_offset + offset);
+ }
+ if (len == -1 && errno == EINTR) {
+ continue;
+ } else if (len == -1) {
+ offset = -errno;
+ break;
+ } else if (len == 0) {
+ break;
+ }
+ offset += len;
+ }
+
+ return offset;
+}
+
+static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
+{
+ ssize_t nbytes;
+ char *buf;
+
+ if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
+ /*
+ * If there is just a single buffer, and it is properly aligned
+ * we can just use plain pread/pwrite without any problems.
+ */
+ if (aiocb->aio_niov == 1) {
+ return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
+ }
+ /*
+ * We have more than one iovec, and all are properly aligned.
+ *
+ * Try preadv/pwritev first and fall back to linearizing the
+ * buffer if it's not supported.
+ */
+ if (preadv_present) {
+ nbytes = handle_aiocb_rw_vector(aiocb);
+ if (nbytes == aiocb->aio_nbytes ||
+ (nbytes < 0 && nbytes != -ENOSYS)) {
+ return nbytes;
+ }
+ preadv_present = false;
+ }
+
+ /*
+ * XXX(hch): short read/write. no easy way to handle the reminder
+ * using these interfaces. For now retry using plain
+ * pread/pwrite?
+ */
+ }
+
+ /*
+ * Ok, we have to do it the hard way, copy all segments into
+ * a single aligned buffer.
+ */
+ buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes);
+ if (aiocb->aio_type & QEMU_AIO_WRITE) {
+ char *p = buf;
+ int i;
+
+ for (i = 0; i < aiocb->aio_niov; ++i) {
+ memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
+ p += aiocb->aio_iov[i].iov_len;
+ }
+ }
+
+ nbytes = handle_aiocb_rw_linear(aiocb, buf);
+ if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+ char *p = buf;
+ size_t count = aiocb->aio_nbytes, copy;
+ int i;
+
+ for (i = 0; i < aiocb->aio_niov && count; ++i) {
+ copy = count;
+ if (copy > aiocb->aio_iov[i].iov_len) {
+ copy = aiocb->aio_iov[i].iov_len;
+ }
+ memcpy(aiocb->aio_iov[i].iov_base, p, copy);
+ p += copy;
+ count -= copy;
+ }
+ }
+ qemu_vfree(buf);
+
+ return nbytes;
+}
+
+static int aio_worker(void *arg)
+{
+ RawPosixAIOData *aiocb = arg;
+ ssize_t ret = 0;
+
+ switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
+ case QEMU_AIO_READ:
+ ret = handle_aiocb_rw(aiocb);
+ if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->bs->growable) {
+ iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
+ 0, aiocb->aio_nbytes - ret);
+
+ ret = aiocb->aio_nbytes;
+ }
+ if (ret == aiocb->aio_nbytes) {
+ ret = 0;
+ } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
+ ret = -EINVAL;
+ }
+ break;
+ case QEMU_AIO_WRITE:
+ ret = handle_aiocb_rw(aiocb);
+ if (ret == aiocb->aio_nbytes) {
+ ret = 0;
+ } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
+ ret = -EINVAL;
+ }
+ break;
+ case QEMU_AIO_FLUSH:
+ ret = handle_aiocb_flush(aiocb);
+ break;
+ case QEMU_AIO_IOCTL:
+ ret = handle_aiocb_ioctl(aiocb);
+ break;
+ default:
+ fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
+ ret = -EINVAL;
+ break;
+ }
+
+ g_slice_free(RawPosixAIOData, aiocb);
+ return ret;
+}
+
+static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+ RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
+
+ acb->bs = bs;
+ acb->aio_type = type;
+ acb->aio_fildes = fd;
+
+ if (qiov) {
+ acb->aio_iov = qiov->iov;
+ acb->aio_niov = qiov->niov;
+ }
+ acb->aio_nbytes = nb_sectors * 512;
+ acb->aio_offset = sector_num * 512;
+
+ trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+ return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+}
+
+static BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
+
+ acb->bs = bs;
+ acb->aio_type = QEMU_AIO_IOCTL;
+ acb->aio_fildes = fd;
+ acb->aio_offset = 0;
+ acb->aio_ioctl_buf = buf;
+ acb->aio_ioctl_cmd = req;
+
+ return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+}
+
static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque, int type)
@@ -330,7 +738,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
* boundary. Check if this is the case or tell the low-level
* driver that it needs to copy the buffer.
*/
- if (s->aligned_buf) {
+ if ((bs->open_flags & BDRV_O_NOCACHE)) {
if (!qiov_is_aligned(bs, qiov)) {
type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_AIO
@@ -378,8 +786,6 @@ static void raw_close(BlockDriverState *bs)
if (s->fd >= 0) {
qemu_close(s->fd);
s->fd = -1;
- if (s->aligned_buf != NULL)
- qemu_vfree(s->aligned_buf);
}
}
@@ -735,6 +1141,9 @@ static BlockDriver bdrv_file = {
.instance_size = sizeof(BDRVRawState),
.bdrv_probe = NULL, /* no probe for protocols */
.bdrv_file_open = raw_open,
+ .bdrv_reopen_prepare = raw_reopen_prepare,
+ .bdrv_reopen_commit = raw_reopen_commit,
+ .bdrv_reopen_abort = raw_reopen_abort,
.bdrv_close = raw_close,
.bdrv_create = raw_create,
.bdrv_co_discard = raw_co_discard,
@@ -1004,6 +1413,9 @@ static BlockDriver bdrv_host_device = {
.bdrv_probe_device = hdev_probe_device,
.bdrv_file_open = hdev_open,
.bdrv_close = raw_close,
+ .bdrv_reopen_prepare = raw_reopen_prepare,
+ .bdrv_reopen_commit = raw_reopen_commit,
+ .bdrv_reopen_abort = raw_reopen_abort,
.bdrv_create = hdev_create,
.create_options = raw_create_options,
.bdrv_has_zero_init = hdev_has_zero_init,
@@ -1125,6 +1537,9 @@ static BlockDriver bdrv_host_floppy = {
.bdrv_probe_device = floppy_probe_device,
.bdrv_file_open = floppy_open,
.bdrv_close = raw_close,
+ .bdrv_reopen_prepare = raw_reopen_prepare,
+ .bdrv_reopen_commit = raw_reopen_commit,
+ .bdrv_reopen_abort = raw_reopen_abort,
.bdrv_create = hdev_create,
.create_options = raw_create_options,
.bdrv_has_zero_init = hdev_has_zero_init,
@@ -1224,6 +1639,9 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_probe_device = cdrom_probe_device,
.bdrv_file_open = cdrom_open,
.bdrv_close = raw_close,
+ .bdrv_reopen_prepare = raw_reopen_prepare,
+ .bdrv_reopen_commit = raw_reopen_commit,
+ .bdrv_reopen_abort = raw_reopen_abort,
.bdrv_create = hdev_create,
.create_options = raw_create_options,
.bdrv_has_zero_init = hdev_has_zero_init,
@@ -1343,6 +1761,9 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_probe_device = cdrom_probe_device,
.bdrv_file_open = cdrom_open,
.bdrv_close = raw_close,
+ .bdrv_reopen_prepare = raw_reopen_prepare,
+ .bdrv_reopen_commit = raw_reopen_commit,
+ .bdrv_reopen_abort = raw_reopen_abort,
.bdrv_create = hdev_create,
.create_options = raw_create_options,
.bdrv_has_zero_init = hdev_has_zero_init,
diff --git a/block/raw-win32.c b/block/raw-win32.c
index c56bf8337..0c05c58c5 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -25,6 +25,10 @@
#include "qemu-timer.h"
#include "block_int.h"
#include "module.h"
+#include "raw-aio.h"
+#include "trace.h"
+#include "thread-pool.h"
+#include "iov.h"
#include <windows.h>
#include <winioctl.h>
@@ -32,12 +36,130 @@
#define FTYPE_CD 1
#define FTYPE_HARDDISK 2
+static QEMUWin32AIOState *aio;
+
+typedef struct RawWin32AIOData {
+ BlockDriverState *bs;
+ HANDLE hfile;
+ struct iovec *aio_iov;
+ int aio_niov;
+ size_t aio_nbytes;
+ off64_t aio_offset;
+ int aio_type;
+} RawWin32AIOData;
+
typedef struct BDRVRawState {
HANDLE hfile;
int type;
char drive_path[16]; /* format: "d:\" */
+ QEMUWin32AIOState *aio;
} BDRVRawState;
+/*
+ * Read/writes the data to/from a given linear buffer.
+ *
+ * Returns the number of bytes handles or -errno in case of an error. Short
+ * reads are only returned if the end of the file is reached.
+ */
+static size_t handle_aiocb_rw(RawWin32AIOData *aiocb)
+{
+ size_t offset = 0;
+ int i;
+
+ for (i = 0; i < aiocb->aio_niov; i++) {
+ OVERLAPPED ov;
+ DWORD ret, ret_count, len;
+
+ memset(&ov, 0, sizeof(ov));
+ ov.Offset = (aiocb->aio_offset + offset);
+ ov.OffsetHigh = (aiocb->aio_offset + offset) >> 32;
+ len = aiocb->aio_iov[i].iov_len;
+ if (aiocb->aio_type & QEMU_AIO_WRITE) {
+ ret = WriteFile(aiocb->hfile, aiocb->aio_iov[i].iov_base,
+ len, &ret_count, &ov);
+ } else {
+ ret = ReadFile(aiocb->hfile, aiocb->aio_iov[i].iov_base,
+ len, &ret_count, &ov);
+ }
+ if (!ret) {
+ ret_count = 0;
+ }
+ if (ret_count != len) {
+ break;
+ }
+ offset += len;
+ }
+
+ return offset;
+}
+
+static int aio_worker(void *arg)
+{
+ RawWin32AIOData *aiocb = arg;
+ ssize_t ret = 0;
+ size_t count;
+
+ switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
+ case QEMU_AIO_READ:
+ count = handle_aiocb_rw(aiocb);
+ if (count < aiocb->aio_nbytes && aiocb->bs->growable) {
+ /* A short read means that we have reached EOF. Pad the buffer
+ * with zeros for bytes after EOF. */
+ iov_memset(aiocb->aio_iov, aiocb->aio_niov, count,
+ 0, aiocb->aio_nbytes - count);
+
+ count = aiocb->aio_nbytes;
+ }
+ if (count == aiocb->aio_nbytes) {
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ break;
+ case QEMU_AIO_WRITE:
+ count = handle_aiocb_rw(aiocb);
+ if (count == aiocb->aio_nbytes) {
+ count = 0;
+ } else {
+ count = -EINVAL;
+ }
+ break;
+ case QEMU_AIO_FLUSH:
+ if (!FlushFileBuffers(aiocb->hfile)) {
+ return -EIO;
+ }
+ break;
+ default:
+ fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
+ ret = -EINVAL;
+ break;
+ }
+
+ g_slice_free(RawWin32AIOData, aiocb);
+ return ret;
+}
+
+static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+ RawWin32AIOData *acb = g_slice_new(RawWin32AIOData);
+
+ acb->bs = bs;
+ acb->hfile = hfile;
+ acb->aio_type = type;
+
+ if (qiov) {
+ acb->aio_iov = qiov->iov;
+ acb->aio_niov = qiov->niov;
+ }
+ acb->aio_nbytes = nb_sectors * 512;
+ acb->aio_offset = sector_num * 512;
+
+ trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+ return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+}
+
int qemu_ftruncate64(int fd, int64_t length)
{
LARGE_INTEGER li;
@@ -77,6 +199,26 @@ static int set_sparse(int fd)
NULL, 0, NULL, 0, &returned, NULL);
}
+static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
+{
+ assert(access_flags != NULL);
+ assert(overlapped != NULL);
+
+ if (flags & BDRV_O_RDWR) {
+ *access_flags = GENERIC_READ | GENERIC_WRITE;
+ } else {
+ *access_flags = GENERIC_READ;
+ }
+
+ *overlapped = FILE_ATTRIBUTE_NORMAL;
+ if (flags & BDRV_O_NATIVE_AIO) {
+ *overlapped |= FILE_FLAG_OVERLAPPED;
+ }
+ if (flags & BDRV_O_NOCACHE) {
+ *overlapped |= FILE_FLAG_NO_BUFFERING;
+ }
+}
+
static int raw_open(BlockDriverState *bs, const char *filename, int flags)
{
BDRVRawState *s = bs->opaque;
@@ -85,17 +227,15 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
s->type = FTYPE_FILE;
- if (flags & BDRV_O_RDWR) {
- access_flags = GENERIC_READ | GENERIC_WRITE;
- } else {
- access_flags = GENERIC_READ;
+ raw_parse_flags(flags, &access_flags, &overlapped);
+
+ if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) {
+ aio = win32_aio_init();
+ if (aio == NULL) {
+ return -EINVAL;
+ }
}
- overlapped = FILE_ATTRIBUTE_NORMAL;
- if (flags & BDRV_O_NOCACHE)
- overlapped |= FILE_FLAG_NO_BUFFERING;
- if (!(flags & BDRV_O_CACHE_WB))
- overlapped |= FILE_FLAG_WRITE_THROUGH;
s->hfile = CreateFile(filename, access_flags,
FILE_SHARE_READ, NULL,
OPEN_EXISTING, overlapped, NULL);
@@ -104,64 +244,53 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
if (err == ERROR_ACCESS_DENIED)
return -EACCES;
- return -1;
+ return -EINVAL;
+ }
+
+ if (flags & BDRV_O_NATIVE_AIO) {
+ int ret = win32_aio_attach(aio, s->hfile);
+ if (ret < 0) {
+ CloseHandle(s->hfile);
+ return ret;
+ }
+ s->aio = aio;
}
return 0;
}
-static int raw_read(BlockDriverState *bs, int64_t sector_num,
- uint8_t *buf, int nb_sectors)
+static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
{
BDRVRawState *s = bs->opaque;
- OVERLAPPED ov;
- DWORD ret_count;
- int ret;
- int64_t offset = sector_num * 512;
- int count = nb_sectors * 512;
-
- memset(&ov, 0, sizeof(ov));
- ov.Offset = offset;
- ov.OffsetHigh = offset >> 32;
- ret = ReadFile(s->hfile, buf, count, &ret_count, &ov);
- if (!ret)
- return ret_count;
- if (ret_count == count)
- ret_count = 0;
- return ret_count;
+ if (s->aio) {
+ return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
+ nb_sectors, cb, opaque, QEMU_AIO_READ);
+ } else {
+ return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+ cb, opaque, QEMU_AIO_READ);
+ }
}
-static int raw_write(BlockDriverState *bs, int64_t sector_num,
- const uint8_t *buf, int nb_sectors)
+static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
{
BDRVRawState *s = bs->opaque;
- OVERLAPPED ov;
- DWORD ret_count;
- int ret;
- int64_t offset = sector_num * 512;
- int count = nb_sectors * 512;
-
- memset(&ov, 0, sizeof(ov));
- ov.Offset = offset;
- ov.OffsetHigh = offset >> 32;
- ret = WriteFile(s->hfile, buf, count, &ret_count, &ov);
- if (!ret)
- return ret_count;
- if (ret_count == count)
- ret_count = 0;
- return ret_count;
+ if (s->aio) {
+ return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
+ nb_sectors, cb, opaque, QEMU_AIO_WRITE);
+ } else {
+ return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+ cb, opaque, QEMU_AIO_WRITE);
+ }
}
-static int raw_flush(BlockDriverState *bs)
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
{
BDRVRawState *s = bs->opaque;
- int ret;
-
- ret = FlushFileBuffers(s->hfile);
- if (ret == 0) {
- return -EIO;
- }
-
- return 0;
+ return paio_submit(bs, s->hfile, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
}
static void raw_close(BlockDriverState *bs)
@@ -282,9 +411,9 @@ static BlockDriver bdrv_file = {
.bdrv_close = raw_close,
.bdrv_create = raw_create,
- .bdrv_read = raw_read,
- .bdrv_write = raw_write,
- .bdrv_co_flush_to_disk = raw_flush,
+ .bdrv_aio_readv = raw_aio_readv,
+ .bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
.bdrv_truncate = raw_truncate,
.bdrv_getlength = raw_getlength,
@@ -374,18 +503,10 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
}
s->type = find_device_type(bs, filename);
- if (flags & BDRV_O_RDWR) {
- access_flags = GENERIC_READ | GENERIC_WRITE;
- } else {
- access_flags = GENERIC_READ;
- }
+ raw_parse_flags(flags, &access_flags, &overlapped);
+
create_flags = OPEN_EXISTING;
- overlapped = FILE_ATTRIBUTE_NORMAL;
- if (flags & BDRV_O_NOCACHE)
- overlapped |= FILE_FLAG_NO_BUFFERING;
- if (!(flags & BDRV_O_CACHE_WB))
- overlapped |= FILE_FLAG_WRITE_THROUGH;
s->hfile = CreateFile(filename, access_flags,
FILE_SHARE_READ, NULL,
create_flags, overlapped, NULL);
@@ -413,9 +534,9 @@ static BlockDriver bdrv_host_device = {
.bdrv_close = raw_close,
.bdrv_has_zero_init = hdev_has_zero_init,
- .bdrv_read = raw_read,
- .bdrv_write = raw_write,
- .bdrv_co_flush_to_disk = raw_flush,
+ .bdrv_aio_readv = raw_aio_readv,
+ .bdrv_aio_writev = raw_aio_writev,
+ .bdrv_aio_flush = raw_aio_flush,
.bdrv_getlength = raw_getlength,
.bdrv_get_allocated_file_size
diff --git a/block/raw.c b/block/raw.c
index ff34ea41e..253e949b8 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -9,6 +9,14 @@ static int raw_open(BlockDriverState *bs, int flags)
return 0;
}
+/* We have nothing to do for raw reopen, stubs just return
+ * success */
+static int raw_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov)
{
@@ -115,6 +123,8 @@ static BlockDriver bdrv_raw = {
.bdrv_open = raw_open,
.bdrv_close = raw_close,
+ .bdrv_reopen_prepare = raw_reopen_prepare,
+
.bdrv_co_readv = raw_co_readv,
.bdrv_co_writev = raw_co_writev,
.bdrv_co_is_allocated = raw_co_is_allocated,
diff --git a/block/rbd.c b/block/rbd.c
index 5a0f79fc8..f3becc7a8 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -69,7 +69,7 @@ typedef enum {
typedef struct RBDAIOCB {
BlockDriverAIOCB common;
QEMUBH *bh;
- int ret;
+ int64_t ret;
QEMUIOVector *qiov;
char *bounce;
RBDAIOCmd cmd;
@@ -86,7 +86,7 @@ typedef struct RADOSCB {
int done;
int64_t size;
char *buf;
- int ret;
+ int64_t ret;
} RADOSCB;
#define RBD_FD_READ 0
@@ -487,12 +487,6 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
rados_conf_set(s->cluster, "rbd_cache", "false");
} else {
rados_conf_set(s->cluster, "rbd_cache", "true");
- if (!(flags & BDRV_O_CACHE_WB)) {
- r = rados_conf_set(s->cluster, "rbd_cache_max_dirty", "0");
- if (r < 0) {
- rados_conf_set(s->cluster, "rbd_cache", "false");
- }
- }
}
if (strstr(conf, "conf=") == NULL) {
@@ -576,7 +570,7 @@ static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
acb->cancelled = 1;
}
-static AIOPool rbd_aio_pool = {
+static const AIOCBInfo rbd_aiocb_info = {
.aiocb_size = sizeof(RBDAIOCB),
.cancel = qemu_rbd_aio_cancel,
};
@@ -678,7 +672,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
BDRVRBDState *s = bs->opaque;
- acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
+ acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
acb->cmd = cmd;
acb->qiov = qiov;
if (cmd == RBD_AIO_DISCARD) {
diff --git a/block/sheepdog.c b/block/sheepdog.c
index df4f44107..a48f58cfe 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -201,12 +201,12 @@ static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
return hval;
}
-static inline int is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
+static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
{
return inode->vdi_id == inode->data_vdi_id[idx];
}
-static inline int is_data_obj(uint64_t oid)
+static inline bool is_data_obj(uint64_t oid)
{
return !(VDI_BIT & oid);
}
@@ -231,7 +231,7 @@ static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
}
-static inline int is_snapshot(struct SheepdogInode *inode)
+static inline bool is_snapshot(struct SheepdogInode *inode)
{
return !!inode->snap_ctime;
}
@@ -281,7 +281,7 @@ struct SheepdogAIOCB {
Coroutine *coroutine;
void (*aio_done_func)(SheepdogAIOCB *);
- int canceled;
+ bool canceled;
int nr_pending;
};
@@ -292,8 +292,8 @@ typedef struct BDRVSheepdogState {
uint32_t max_dirty_data_idx;
char name[SD_MAX_VDI_LEN];
- int is_snapshot;
- uint8_t cache_enabled;
+ bool is_snapshot;
+ bool cache_enabled;
char *addr;
char *port;
@@ -417,10 +417,10 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
*/
acb->ret = -EIO;
qemu_coroutine_enter(acb->coroutine, NULL);
- acb->canceled = 1;
+ acb->canceled = true;
}
-static AIOPool sd_aio_pool = {
+static const AIOCBInfo sd_aiocb_info = {
.aiocb_size = sizeof(SheepdogAIOCB),
.cancel = sd_aio_cancel,
};
@@ -431,7 +431,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
{
SheepdogAIOCB *acb;
- acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
+ acb = qemu_aio_get(&sd_aiocb_info, bs, cb, opaque);
acb->qiov = qiov;
@@ -439,7 +439,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
acb->nb_sectors = nb_sectors;
acb->aio_done_func = NULL;
- acb->canceled = 0;
+ acb->canceled = false;
acb->coroutine = qemu_coroutine_self();
acb->ret = 0;
acb->nr_pending = 0;
@@ -613,7 +613,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
}
static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
- struct iovec *iov, int niov, int create,
+ struct iovec *iov, int niov, bool create,
enum AIOCBState aiocb_type);
@@ -646,7 +646,7 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
QLIST_REMOVE(aio_req, aio_siblings);
QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
ret = add_aio_request(s, aio_req, acb->qiov->iov,
- acb->qiov->niov, 0, acb->aiocb_type);
+ acb->qiov->niov, false, acb->aiocb_type);
if (ret < 0) {
error_report("add_aio_request is failed");
free_aio_req(s, aio_req);
@@ -866,14 +866,14 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
s->port = 0;
}
- strncpy(vdi, p, SD_MAX_VDI_LEN);
+ pstrcpy(vdi, SD_MAX_VDI_LEN, p);
p = strchr(vdi, ':');
if (p) {
*p++ = '\0';
*snapid = strtoul(p, NULL, 10);
if (*snapid == 0) {
- strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
+ pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p);
}
} else {
*snapid = CURRENT_VDI_ID; /* search current vdi */
@@ -900,7 +900,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
return fd;
}
- memset(buf, 0, sizeof(buf));
+ /* This pair of strncpy calls ensures that the buffer is zero-filled,
+ * which is desirable since we'll soon be sending those bytes, and
+ * don't want the send_req to read uninitialized data.
+ */
strncpy(buf, filename, SD_MAX_VDI_LEN);
strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
@@ -940,7 +943,7 @@ out:
}
static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
- struct iovec *iov, int niov, int create,
+ struct iovec *iov, int niov, bool create,
enum AIOCBState aiocb_type)
{
int nr_copies = s->inode.nr_copies;
@@ -1019,7 +1022,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
unsigned int datalen, uint64_t offset,
- int write, int create, uint8_t cache)
+ bool write, bool create, bool cache)
{
SheepdogObjReq hdr;
SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
@@ -1068,18 +1071,18 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
}
static int read_object(int fd, char *buf, uint64_t oid, int copies,
- unsigned int datalen, uint64_t offset, uint8_t cache)
+ unsigned int datalen, uint64_t offset, bool cache)
{
- return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0,
- cache);
+ return read_write_object(fd, buf, oid, copies, datalen, offset, false,
+ false, cache);
}
static int write_object(int fd, char *buf, uint64_t oid, int copies,
- unsigned int datalen, uint64_t offset, int create,
- uint8_t cache)
+ unsigned int datalen, uint64_t offset, bool create,
+ bool cache)
{
- return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create,
- cache);
+ return read_write_object(fd, buf, oid, copies, datalen, offset, true,
+ create, cache);
}
static int sd_open(BlockDriverState *bs, const char *filename, int flags)
@@ -1114,19 +1117,17 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
goto out;
}
- if (flags & BDRV_O_CACHE_WB) {
- s->cache_enabled = 1;
- s->flush_fd = connect_to_sdog(s->addr, s->port);
- if (s->flush_fd < 0) {
- error_report("failed to connect");
- ret = s->flush_fd;
- goto out;
- }
+ s->cache_enabled = true;
+ s->flush_fd = connect_to_sdog(s->addr, s->port);
+ if (s->flush_fd < 0) {
+ error_report("failed to connect");
+ ret = s->flush_fd;
+ goto out;
}
if (snapid || tag[0] != '\0') {
dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
- s->is_snapshot = 1;
+ s->is_snapshot = true;
}
fd = connect_to_sdog(s->addr, s->port);
@@ -1151,7 +1152,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
s->max_dirty_data_idx = 0;
bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
- strncpy(s->name, vdi, sizeof(s->name));
+ pstrcpy(s->name, sizeof(s->name), vdi);
qemu_co_mutex_init(&s->lock);
g_free(buf);
return 0;
@@ -1179,8 +1180,11 @@ static int do_sd_create(char *filename, int64_t vdi_size,
return fd;
}
+ /* FIXME: would it be better to fail (e.g., return -EIO) when filename
+ * does not fit in buf? For now, just truncate and avoid buffer overrun.
+ */
memset(buf, 0, sizeof(buf));
- strncpy(buf, filename, SD_MAX_VDI_LEN);
+ pstrcpy(buf, sizeof(buf), filename);
memset(&hdr, 0, sizeof(hdr));
hdr.opcode = SD_OP_NEW_VDI;
@@ -1266,7 +1270,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
BDRVSheepdogState *s;
char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
uint32_t snapid;
- int prealloc = 0;
+ bool prealloc = false;
const char *vdiname;
s = g_malloc0(sizeof(BDRVSheepdogState));
@@ -1288,9 +1292,9 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
backing_file = options->value.s;
} else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
if (!options->value.s || !strcmp(options->value.s, "off")) {
- prealloc = 0;
+ prealloc = false;
} else if (!strcmp(options->value.s, "full")) {
- prealloc = 1;
+ prealloc = true;
} else {
error_report("Invalid preallocation mode: '%s'",
options->value.s);
@@ -1418,7 +1422,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
s->inode.vdi_size = offset;
ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
- s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
+ s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
close(fd);
if (ret < 0) {
@@ -1457,7 +1461,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
data_len, offset, 0, 0, offset);
QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
+ ret = add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA);
if (ret) {
free_aio_req(s, aio_req);
acb->ret = -EIO;
@@ -1511,7 +1515,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
memcpy(&s->inode, buf, sizeof(s->inode));
- s->is_snapshot = 0;
+ s->is_snapshot = false;
ret = 0;
dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
@@ -1566,7 +1570,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
while (done != total) {
uint8_t flags = 0;
uint64_t old_oid = 0;
- int create = 0;
+ bool create = false;
oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
@@ -1581,10 +1585,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
break;
case AIOCB_WRITE_UDATA:
if (!inode->data_vdi_id[idx]) {
- create = 1;
+ create = true;
} else if (!is_data_obj_writable(inode, idx)) {
/* Copy-On-Write */
- create = 1;
+ create = true;
old_oid = oid;
flags = SD_FLAG_CMD_COW;
}
@@ -1718,7 +1722,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
if (rsp->result == SD_RES_INVALID_PARMS) {
dprintf("disable write cache since the server doesn't support it\n");
- s->cache_enabled = 0;
+ s->cache_enabled = false;
closesocket(s->flush_fd);
return 0;
}
@@ -1754,6 +1758,9 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
s->inode.vm_state_size = sn_info->vm_state_size;
s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
+ /* It appears that inode.tag does not require a NUL terminator,
+ * which means this use of strncpy is ok.
+ */
strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
/* we don't need to update entire object */
datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
@@ -1766,7 +1773,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
}
ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
- s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
+ s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
if (ret < 0) {
error_report("failed to write snapshot's inode.");
goto cleanup;
@@ -1813,13 +1820,13 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
memcpy(old_s, s, sizeof(BDRVSheepdogState));
- memset(vdi, 0, sizeof(vdi));
- strncpy(vdi, s->name, sizeof(vdi));
+ pstrcpy(vdi, sizeof(vdi), s->name);
- memset(tag, 0, sizeof(tag));
snapid = strtoul(snapshot_id, NULL, 10);
- if (!snapid) {
- strncpy(tag, s->name, sizeof(tag));
+ if (snapid) {
+ tag[0] = 0;
+ } else {
+ pstrcpy(tag, sizeof(tag), s->name);
}
ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
@@ -1853,7 +1860,7 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
goto out;
}
- s->is_snapshot = 1;
+ s->is_snapshot = true;
g_free(buf);
g_free(old_s);
@@ -1948,8 +1955,9 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
inode.snap_id);
- strncpy(sn_tab[found].name, inode.tag,
- MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
+ pstrcpy(sn_tab[found].name,
+ MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
+ inode.tag);
found++;
}
}
@@ -1970,8 +1978,8 @@ out:
static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
int64_t pos, int size, int load)
{
- int fd, create;
- int ret = 0, remaining = size;
+ bool create;
+ int fd, ret = 0, remaining = size;
unsigned int data_len;
uint64_t vmstate_oid;
uint32_t vdi_index;
@@ -1986,7 +1994,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
vdi_index = pos / SD_DATA_OBJ_SIZE;
offset = pos % SD_DATA_OBJ_SIZE;
- data_len = MIN(remaining, SD_DATA_OBJ_SIZE);
+ data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
@@ -2007,6 +2015,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
}
pos += data_len;
+ data += data_len;
remaining -= data_len;
}
ret = size;
diff --git a/block/stream.c b/block/stream.c
index c4f87dd5b..0c0fc7a13 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -13,6 +13,7 @@
#include "trace.h"
#include "block_int.h"
+#include "blockjob.h"
#include "qemu/ratelimit.h"
enum {
@@ -30,6 +31,7 @@ typedef struct StreamBlockJob {
BlockJob common;
RateLimit limit;
BlockDriverState *base;
+ BlockdevOnError on_error;
char backing_file_id[1024];
} StreamBlockJob;
@@ -77,13 +79,14 @@ static void coroutine_fn stream_run(void *opaque)
BlockDriverState *bs = s->common.bs;
BlockDriverState *base = s->base;
int64_t sector_num, end;
+ int error = 0;
int ret = 0;
int n = 0;
void *buf;
s->common.len = bdrv_getlength(bs);
if (s->common.len < 0) {
- block_job_complete(&s->common, s->common.len);
+ block_job_completed(&s->common, s->common.len);
return;
}
@@ -141,7 +144,19 @@ wait:
ret = stream_populate(bs, sector_num, n, buf);
}
if (ret < 0) {
- break;
+ BlockErrorAction action =
+ block_job_error_action(&s->common, s->common.bs, s->on_error,
+ true, -ret);
+ if (action == BDRV_ACTION_STOP) {
+ n = 0;
+ continue;
+ }
+ if (error == 0) {
+ error = ret;
+ }
+ if (action == BDRV_ACTION_REPORT) {
+ break;
+ }
}
ret = 0;
@@ -153,6 +168,9 @@ wait:
bdrv_disable_copy_on_read(bs);
}
+ /* Do not remove the backing file if an error was there but ignored. */
+ ret = error;
+
if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
const char *base_id = NULL, *base_fmt = NULL;
if (base) {
@@ -166,7 +184,7 @@ wait:
}
qemu_vfree(buf);
- block_job_complete(&s->common, ret);
+ block_job_completed(&s->common, ret);
}
static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -188,11 +206,19 @@ static BlockJobType stream_job_type = {
void stream_start(BlockDriverState *bs, BlockDriverState *base,
const char *base_id, int64_t speed,
+ BlockdevOnError on_error,
BlockDriverCompletionFunc *cb,
void *opaque, Error **errp)
{
StreamBlockJob *s;
+ if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+ on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+ !bdrv_iostatus_is_enabled(bs)) {
+ error_set(errp, QERR_INVALID_PARAMETER, "on-error");
+ return;
+ }
+
s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp);
if (!s) {
return;
@@ -203,6 +229,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id);
}
+ s->on_error = on_error;
s->common.co = qemu_coroutine_create(stream_run);
trace_stream_start(bs, base, s, s->common.co, opaque);
qemu_coroutine_enter(s->common.co, s);
diff --git a/block/vdi.c b/block/vdi.c
index c4f1529db..c8330b7ea 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -60,9 +60,6 @@
/* TODO: move uuid emulation to some central place in QEMU. */
#include "sysemu.h" /* UUID_FMT */
typedef unsigned char uuid_t[16];
-void uuid_generate(uuid_t out);
-int uuid_is_null(const uuid_t uu);
-void uuid_unparse(const uuid_t uu, char *out);
#endif
/* Code configuration options. */
@@ -124,18 +121,18 @@ void uuid_unparse(const uuid_t uu, char *out);
#define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)
#if !defined(CONFIG_UUID)
-void uuid_generate(uuid_t out)
+static inline void uuid_generate(uuid_t out)
{
memset(out, 0, sizeof(uuid_t));
}
-int uuid_is_null(const uuid_t uu)
+static inline int uuid_is_null(const uuid_t uu)
{
uuid_t null_uuid = { 0 };
return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0;
}
-void uuid_unparse(const uuid_t uu, char *out)
+static inline void uuid_unparse(const uuid_t uu, char *out)
{
snprintf(out, 37, UUID_FMT,
uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
@@ -454,6 +451,12 @@ static int vdi_open(BlockDriverState *bs, int flags)
return -1;
}
+static int vdi_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
int64_t sector_num, int nb_sectors, int *pnum)
{
@@ -628,7 +631,6 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
VdiHeader header;
size_t i;
size_t bmap_size;
- uint32_t *bmap;
logout("\n");
@@ -693,21 +695,21 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
result = -errno;
}
- bmap = NULL;
if (bmap_size > 0) {
- bmap = (uint32_t *)g_malloc0(bmap_size);
- }
- for (i = 0; i < blocks; i++) {
- if (image_type == VDI_TYPE_STATIC) {
- bmap[i] = i;
- } else {
- bmap[i] = VDI_UNALLOCATED;
+ uint32_t *bmap = g_malloc0(bmap_size);
+ for (i = 0; i < blocks; i++) {
+ if (image_type == VDI_TYPE_STATIC) {
+ bmap[i] = i;
+ } else {
+ bmap[i] = VDI_UNALLOCATED;
+ }
}
+ if (write(fd, bmap, bmap_size) < 0) {
+ result = -errno;
+ }
+ g_free(bmap);
}
- if (write(fd, bmap, bmap_size) < 0) {
- result = -errno;
- }
- g_free(bmap);
+
if (image_type == VDI_TYPE_STATIC) {
if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) {
result = -errno;
@@ -762,6 +764,7 @@ static BlockDriver bdrv_vdi = {
.bdrv_probe = vdi_probe,
.bdrv_open = vdi_open,
.bdrv_close = vdi_close,
+ .bdrv_reopen_prepare = vdi_reopen_prepare,
.bdrv_create = vdi_create,
.bdrv_co_is_allocated = vdi_co_is_allocated,
.bdrv_make_empty = vdi_make_empty,
diff --git a/block/vmdk.c b/block/vmdk.c
index bba4c61a7..51398c0c0 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -300,6 +300,40 @@ static int vmdk_is_cid_valid(BlockDriverState *bs)
return 1;
}
+/* Queue extents, if any, for reopen() */
+static int vmdk_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ BDRVVmdkState *s;
+ int ret = -1;
+ int i;
+ VmdkExtent *e;
+
+ assert(state != NULL);
+ assert(state->bs != NULL);
+
+ if (queue == NULL) {
+ error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+ "No reopen queue for VMDK extents");
+ goto exit;
+ }
+
+ s = state->bs->opaque;
+
+ assert(s != NULL);
+
+ for (i = 0; i < s->num_extents; i++) {
+ e = &s->extents[i];
+ if (e->file != state->bs->file) {
+ bdrv_reopen_queue(queue, e->file, state->flags);
+ }
+ }
+ ret = 0;
+
+exit:
+ return ret;
+}
+
static int vmdk_parent_open(BlockDriverState *bs)
{
char *p_name;
@@ -1058,6 +1092,7 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
BDRVVmdkState *s = bs->opaque;
int ret;
uint64_t n, index_in_cluster;
+ uint64_t extent_begin_sector, extent_relative_sector_num;
VmdkExtent *extent = NULL;
uint64_t cluster_offset;
@@ -1069,7 +1104,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
ret = get_cluster_offset(
bs, extent, NULL,
sector_num << 9, 0, &cluster_offset);
- index_in_cluster = sector_num % extent->cluster_sectors;
+ extent_begin_sector = extent->end_sector - extent->sectors;
+ extent_relative_sector_num = sector_num - extent_begin_sector;
+ index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
n = extent->cluster_sectors - index_in_cluster;
if (n > nb_sectors) {
n = nb_sectors;
@@ -1120,6 +1157,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
VmdkExtent *extent = NULL;
int n, ret;
int64_t index_in_cluster;
+ uint64_t extent_begin_sector, extent_relative_sector_num;
uint64_t cluster_offset;
VmdkMetaData m_data;
@@ -1162,7 +1200,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
if (ret) {
return -EINVAL;
}
- index_in_cluster = sector_num % extent->cluster_sectors;
+ extent_begin_sector = extent->end_sector - extent->sectors;
+ extent_relative_sector_num = sector_num - extent_begin_sector;
+ index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
n = extent->cluster_sectors - index_in_cluster;
if (n > nb_sectors) {
n = nb_sectors;
@@ -1374,8 +1414,7 @@ static int relative_path(char *dest, int dest_size,
return -1;
}
if (path_is_absolute(target)) {
- dest[dest_size - 1] = '\0';
- strncpy(dest, target, dest_size - 1);
+ pstrcpy(dest, dest_size, target);
return 0;
}
while (base[i] == target[i]) {
@@ -1646,6 +1685,7 @@ static BlockDriver bdrv_vmdk = {
.instance_size = sizeof(BDRVVmdkState),
.bdrv_probe = vmdk_probe,
.bdrv_open = vmdk_open,
+ .bdrv_reopen_prepare = vmdk_reopen_prepare,
.bdrv_read = vmdk_co_read,
.bdrv_write = vmdk_co_write,
.bdrv_close = vmdk_close,
diff --git a/block/vpc.c b/block/vpc.c
index c0b82c4f5..b6bf52f14 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -265,6 +265,12 @@ static int vpc_open(BlockDriverState *bs, int flags)
return err;
}
+static int vpc_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
/*
* Returns the absolute byte offset of the given sector in the image file.
* If the sector is not allocated, -1 is returned instead.
@@ -783,6 +789,7 @@ static BlockDriver bdrv_vpc = {
.bdrv_probe = vpc_probe,
.bdrv_open = vpc_open,
.bdrv_close = vpc_close,
+ .bdrv_reopen_prepare = vpc_reopen_prepare,
.bdrv_create = vpc_create,
.bdrv_read = vpc_co_read,
diff --git a/block/win32-aio.c b/block/win32-aio.c
new file mode 100644
index 000000000..4704ee06c
--- /dev/null
+++ b/block/win32-aio.c
@@ -0,0 +1,226 @@
+/*
+ * Block driver for RAW files (win32)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "block_int.h"
+#include "module.h"
+#include "qemu-common.h"
+#include "qemu-aio.h"
+#include "raw-aio.h"
+#include "event_notifier.h"
+#include <windows.h>
+#include <winioctl.h>
+
+#define FTYPE_FILE 0
+#define FTYPE_CD 1
+#define FTYPE_HARDDISK 2
+
+struct QEMUWin32AIOState {
+ HANDLE hIOCP;
+ EventNotifier e;
+ int count;
+};
+
+typedef struct QEMUWin32AIOCB {
+ BlockDriverAIOCB common;
+ struct QEMUWin32AIOState *ctx;
+ int nbytes;
+ OVERLAPPED ov;
+ QEMUIOVector *qiov;
+ void *buf;
+ bool is_read;
+ bool is_linear;
+} QEMUWin32AIOCB;
+
+/*
+ * Completes an AIO request (calls the callback and frees the ACB).
+ */
+static void win32_aio_process_completion(QEMUWin32AIOState *s,
+ QEMUWin32AIOCB *waiocb, DWORD count)
+{
+ int ret;
+ s->count--;
+
+ if (waiocb->ov.Internal != 0) {
+ ret = -EIO;
+ } else {
+ ret = 0;
+ if (count < waiocb->nbytes) {
+ /* Short reads mean EOF, pad with zeros. */
+ if (waiocb->is_read) {
+ qemu_iovec_memset(waiocb->qiov, count, 0,
+ waiocb->qiov->size - count);
+ } else {
+ ret = -EINVAL;
+ }
+ }
+ }
+
+ if (!waiocb->is_linear) {
+ if (ret == 0 && waiocb->is_read) {
+ QEMUIOVector *qiov = waiocb->qiov;
+ char *p = waiocb->buf;
+ int i;
+
+ for (i = 0; i < qiov->niov; ++i) {
+ memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
+ p += qiov->iov[i].iov_len;
+ }
+ g_free(waiocb->buf);
+ }
+ }
+
+
+ waiocb->common.cb(waiocb->common.opaque, ret);
+ qemu_aio_release(waiocb);
+}
+
+static void win32_aio_completion_cb(EventNotifier *e)
+{
+ QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e);
+ DWORD count;
+ ULONG_PTR key;
+ OVERLAPPED *ov;
+
+ event_notifier_test_and_clear(&s->e);
+ while (GetQueuedCompletionStatus(s->hIOCP, &count, &key, &ov, 0)) {
+ QEMUWin32AIOCB *waiocb = container_of(ov, QEMUWin32AIOCB, ov);
+
+ win32_aio_process_completion(s, waiocb, count);
+ }
+}
+
+static int win32_aio_flush_cb(EventNotifier *e)
+{
+ QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e);
+
+ return (s->count > 0) ? 1 : 0;
+}
+
+static void win32_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ QEMUWin32AIOCB *waiocb = (QEMUWin32AIOCB *)blockacb;
+
+ /*
+ * CancelIoEx is only supported in Vista and newer. For now, just
+ * wait for completion.
+ */
+ while (!HasOverlappedIoCompleted(&waiocb->ov)) {
+ qemu_aio_wait();
+ }
+}
+
+static const AIOCBInfo win32_aiocb_info = {
+ .aiocb_size = sizeof(QEMUWin32AIOCB),
+ .cancel = win32_aio_cancel,
+};
+
+BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+ QEMUWin32AIOState *aio, HANDLE hfile,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+ struct QEMUWin32AIOCB *waiocb;
+ uint64_t offset = sector_num * 512;
+ DWORD rc;
+
+ waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque);
+ waiocb->nbytes = nb_sectors * 512;
+ waiocb->qiov = qiov;
+ waiocb->is_read = (type == QEMU_AIO_READ);
+
+ if (qiov->niov > 1) {
+ waiocb->buf = qemu_blockalign(bs, qiov->size);
+ if (type & QEMU_AIO_WRITE) {
+ char *p = waiocb->buf;
+ int i;
+
+ for (i = 0; i < qiov->niov; ++i) {
+ memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
+ p += qiov->iov[i].iov_len;
+ }
+ }
+ waiocb->is_linear = false;
+ } else {
+ waiocb->buf = qiov->iov[0].iov_base;
+ waiocb->is_linear = true;
+ }
+
+ memset(&waiocb->ov, 0, sizeof(waiocb->ov));
+ waiocb->ov.Offset = (DWORD)offset;
+ waiocb->ov.OffsetHigh = (DWORD)(offset >> 32);
+ waiocb->ov.hEvent = event_notifier_get_handle(&aio->e);
+
+ aio->count++;
+
+ if (type & QEMU_AIO_READ) {
+ rc = ReadFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov);
+ } else {
+ rc = WriteFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov);
+ }
+ if(rc == 0 && GetLastError() != ERROR_IO_PENDING) {
+ goto out_dec_count;
+ }
+ return &waiocb->common;
+
+out_dec_count:
+ aio->count--;
+ qemu_aio_release(waiocb);
+ return NULL;
+}
+
+int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile)
+{
+ if (CreateIoCompletionPort(hfile, aio->hIOCP, (ULONG_PTR) 0, 0) == NULL) {
+ return -EINVAL;
+ } else {
+ return 0;
+ }
+}
+
+QEMUWin32AIOState *win32_aio_init(void)
+{
+ QEMUWin32AIOState *s;
+
+ s = g_malloc0(sizeof(*s));
+ if (event_notifier_init(&s->e, false) < 0) {
+ goto out_free_state;
+ }
+
+ s->hIOCP = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
+ if (s->hIOCP == NULL) {
+ goto out_close_efd;
+ }
+
+ qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb,
+ win32_aio_flush_cb);
+
+ return s;
+
+out_close_efd:
+ event_notifier_cleanup(&s->e);
+out_free_state:
+ g_free(s);
+ return NULL;
+}