diff options
author | Chanho Park <chanho61.park@samsung.com> | 2014-09-05 20:35:53 +0900 |
---|---|---|
committer | Chanho Park <chanho61.park@samsung.com> | 2014-09-05 20:35:53 +0900 |
commit | 16b1353a36171ae06d63fd309f4772dbfb1da113 (patch) | |
tree | cf6c297ee81aba0d9b47f23d78a889667e7bce48 /block | |
parent | a15119db2ff5c2fdfdeb913b297bf8aa3399132e (diff) | |
download | qemu-16b1353a36171ae06d63fd309f4772dbfb1da113.tar.gz qemu-16b1353a36171ae06d63fd309f4772dbfb1da113.tar.bz2 qemu-16b1353a36171ae06d63fd309f4772dbfb1da113.zip |
Imported Upstream version 2.1.0 (ref: upstream/2.1.0)
Diffstat (limited to 'block')
41 files changed, 2895 insertions, 1719 deletions
diff --git a/block/backup.c b/block/backup.c index 15a2e55e8..d0b02255c 100644 --- a/block/backup.c +++ b/block/backup.c @@ -307,7 +307,7 @@ static void coroutine_fn backup_run(void *opaque) BACKUP_SECTORS_PER_CLUSTER - i, &n); i += n; - if (alloced == 1) { + if (alloced == 1 || n == 0) { break; } } @@ -325,7 +325,7 @@ static void coroutine_fn backup_run(void *opaque) /* Depending on error action, fail now or retry cluster */ BlockErrorAction action = backup_error_action(job, error_is_read, -ret); - if (action == BDRV_ACTION_REPORT) { + if (action == BLOCK_ERROR_ACTION_REPORT) { break; } else { start--; diff --git a/block/blkdebug.c b/block/blkdebug.c index 380c73610..f51407de3 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -471,7 +471,7 @@ static BlockDriverAIOCB *inject_error(BlockDriverState *bs, acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque); acb->ret = -error; - bh = qemu_bh_new(error_callback_bh, acb); + bh = aio_bh_new(bdrv_get_aio_context(bs), error_callback_bh, acb); acb->bh = bh; qemu_bh_schedule(bh); diff --git a/block/blkverify.c b/block/blkverify.c index e1c31171c..621b78593 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -39,12 +39,13 @@ struct BlkverifyAIOCB { static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb) { BlkverifyAIOCB *acb = (BlkverifyAIOCB *)blockacb; + AioContext *aio_context = bdrv_get_aio_context(blockacb->bs); bool finished = false; /* Wait until request completes, invokes its callback, and frees itself */ acb->finished = &finished; while (!finished) { - qemu_aio_wait(); + aio_poll(aio_context, true); } } @@ -228,7 +229,8 @@ static void blkverify_aio_cb(void *opaque, int ret) acb->verify(acb); } - acb->bh = qemu_bh_new(blkverify_aio_bh, acb); + acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), + blkverify_aio_bh, acb); qemu_bh_schedule(acb->bh); break; } @@ -302,21 +304,40 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, return 
bdrv_recurse_is_first_non_filter(s->test_file, candidate); } +/* Propagate AioContext changes to ->test_file */ +static void blkverify_detach_aio_context(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + bdrv_detach_aio_context(s->test_file); +} + +static void blkverify_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVBlkverifyState *s = bs->opaque; + + bdrv_attach_aio_context(s->test_file, new_context); +} + static BlockDriver bdrv_blkverify = { - .format_name = "blkverify", - .protocol_name = "blkverify", - .instance_size = sizeof(BDRVBlkverifyState), + .format_name = "blkverify", + .protocol_name = "blkverify", + .instance_size = sizeof(BDRVBlkverifyState), + + .bdrv_parse_filename = blkverify_parse_filename, + .bdrv_file_open = blkverify_open, + .bdrv_close = blkverify_close, + .bdrv_getlength = blkverify_getlength, - .bdrv_parse_filename = blkverify_parse_filename, - .bdrv_file_open = blkverify_open, - .bdrv_close = blkverify_close, - .bdrv_getlength = blkverify_getlength, + .bdrv_aio_readv = blkverify_aio_readv, + .bdrv_aio_writev = blkverify_aio_writev, + .bdrv_aio_flush = blkverify_aio_flush, - .bdrv_aio_readv = blkverify_aio_readv, - .bdrv_aio_writev = blkverify_aio_writev, - .bdrv_aio_flush = blkverify_aio_flush, + .bdrv_attach_aio_context = blkverify_attach_aio_context, + .bdrv_detach_aio_context = blkverify_detach_aio_context, - .is_filter = true, + .is_filter = true, .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, }; diff --git a/block/bochs.c b/block/bochs.c index eacf956e7..eba23df33 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -187,13 +187,14 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) uint64_t offset = sector_num * 512; uint64_t extent_index, extent_offset, bitmap_offset; char bitmap_entry; + int ret; // seek to sector extent_index = offset / s->extent_size; extent_offset = (offset % s->extent_size) / 512; if (s->catalog_bitmap[extent_index] 
== 0xffffffff) { - return -1; /* not allocated */ + return 0; /* not allocated */ } bitmap_offset = s->data_offset + @@ -201,13 +202,14 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) (s->extent_blocks + s->bitmap_blocks)); /* read in bitmap for current extent */ - if (bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), - &bitmap_entry, 1) != 1) { - return -1; + ret = bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), + &bitmap_entry, 1); + if (ret < 0) { + return ret; } if (!((bitmap_entry >> (extent_offset % 8)) & 1)) { - return -1; /* not allocated */ + return 0; /* not allocated */ } return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); @@ -220,13 +222,16 @@ static int bochs_read(BlockDriverState *bs, int64_t sector_num, while (nb_sectors > 0) { int64_t block_offset = seek_to_sector(bs, sector_num); - if (block_offset >= 0) { + if (block_offset < 0) { + return block_offset; + } else if (block_offset > 0) { ret = bdrv_pread(bs->file, block_offset, buf, 512); - if (ret != 512) { - return -1; + if (ret < 0) { + return ret; } - } else + } else { memset(buf, 0, 512); + } nb_sectors--; sector_num++; buf += 512; diff --git a/block/cloop.c b/block/cloop.c index b6ad50fbb..845773792 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -72,7 +72,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, } s->block_size = be32_to_cpu(s->block_size); if (s->block_size % 512) { - error_setg(errp, "block_size %u must be a multiple of 512", + error_setg(errp, "block_size %" PRIu32 " must be a multiple of 512", s->block_size); return -EINVAL; } @@ -86,7 +86,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, * need a buffer this big. 
*/ if (s->block_size > MAX_BLOCK_SIZE) { - error_setg(errp, "block_size %u must be %u MB or less", + error_setg(errp, "block_size %" PRIu32 " must be %u MB or less", s->block_size, MAX_BLOCK_SIZE / (1024 * 1024)); return -EINVAL; @@ -101,7 +101,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, /* read offsets */ if (s->n_blocks > (UINT32_MAX - 1) / sizeof(uint64_t)) { /* Prevent integer overflow */ - error_setg(errp, "n_blocks %u must be %zu or less", + error_setg(errp, "n_blocks %" PRIu32 " must be %zu or less", s->n_blocks, (UINT32_MAX - 1) / sizeof(uint64_t)); return -EINVAL; @@ -133,7 +133,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, if (s->offsets[i] < s->offsets[i - 1]) { error_setg(errp, "offsets not monotonically increasing at " - "index %u, image file is corrupt", i); + "index %" PRIu32 ", image file is corrupt", i); ret = -EINVAL; goto fail; } @@ -146,8 +146,8 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, * ridiculous s->compressed_block allocation. 
*/ if (size > 2 * MAX_BLOCK_SIZE) { - error_setg(errp, "invalid compressed block size at index %u, " - "image file is corrupt", i); + error_setg(errp, "invalid compressed block size at index %" PRIu32 + ", image file is corrupt", i); ret = -EINVAL; goto fail; } diff --git a/block/commit.c b/block/commit.c index acec4ac5a..91517d351 100644 --- a/block/commit.c +++ b/block/commit.c @@ -37,6 +37,7 @@ typedef struct CommitBlockJob { BlockdevOnError on_error; int base_flags; int orig_overlay_flags; + char *backing_file_str; } CommitBlockJob; static int coroutine_fn commit_populate(BlockDriverState *bs, @@ -141,7 +142,7 @@ wait: if (!block_job_is_cancelled(&s->common) && sector_num == end) { /* success */ - ret = bdrv_drop_intermediate(active, top, base); + ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str); } exit_free_buf: @@ -158,7 +159,7 @@ exit_restore_reopen: if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); } - + g_free(s->backing_file_str); block_job_completed(&s->common, ret); } @@ -182,7 +183,7 @@ static const BlockJobDriver commit_job_driver = { void commit_start(BlockDriverState *bs, BlockDriverState *base, BlockDriverState *top, int64_t speed, BlockdevOnError on_error, BlockDriverCompletionFunc *cb, - void *opaque, Error **errp) + void *opaque, const char *backing_file_str, Error **errp) { CommitBlockJob *s; BlockReopenQueue *reopen_queue = NULL; @@ -194,7 +195,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, if ((on_error == BLOCKDEV_ON_ERROR_STOP || on_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { - error_set(errp, QERR_INVALID_PARAMETER_COMBINATION); + error_setg(errp, "Invalid parameter combination"); return; } @@ -244,6 +245,8 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, s->base_flags = orig_base_flags; s->orig_overlay_flags = orig_overlay_flags; + s->backing_file_str = 
g_strdup(backing_file_str); + s->on_error = on_error; s->common.co = qemu_coroutine_create(commit_run); diff --git a/block/cow.c b/block/cow.c index 30deb88de..6ee483327 100644 --- a/block/cow.c +++ b/block/cow.c @@ -82,7 +82,7 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags, if (be32_to_cpu(cow_header.version) != COW_VERSION) { char version[64]; snprintf(version, sizeof(version), - "COW version %d", cow_header.version); + "COW version %" PRIu32, cow_header.version); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "cow", version); ret = -ENOTSUP; @@ -324,39 +324,31 @@ static void cow_close(BlockDriverState *bs) { } -static int cow_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int cow_create(const char *filename, QemuOpts *opts, Error **errp) { struct cow_header_v2 cow_header; struct stat st; int64_t image_sectors = 0; - const char *image_filename = NULL; + char *image_filename = NULL; Error *local_err = NULL; int ret; - BlockDriverState *cow_bs; + BlockDriverState *cow_bs = NULL; /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - image_sectors = options->value.n / 512; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { - image_filename = options->value.s; - } - options++; - } + image_sectors = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512; + image_filename = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); - ret = bdrv_create_file(filename, options, &local_err); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { error_propagate(errp, local_err); - return ret; + goto exit; } - cow_bs = NULL; ret = bdrv_open(&cow_bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); if (ret < 0) { error_propagate(errp, local_err); - return ret; + goto exit; } memset(&cow_header, 0, sizeof(cow_header)); @@ -389,22 +381,29 @@ static int cow_create(const char *filename, QEMUOptionParameter 
*options, } exit: - bdrv_unref(cow_bs); + g_free(image_filename); + if (cow_bs) { + bdrv_unref(cow_bs); + } return ret; } -static QEMUOptionParameter cow_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = OPT_STRING, - .help = "File name of a base image" - }, - { NULL } +static QemuOptsList cow_create_opts = { + .name = "cow-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(cow_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_cow = { @@ -416,12 +415,13 @@ static BlockDriver bdrv_cow = { .bdrv_close = cow_close, .bdrv_create = cow_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, + .supports_backing = true, .bdrv_read = cow_co_read, .bdrv_write = cow_co_write, .bdrv_co_get_block_status = cow_co_get_block_status, - .create_options = cow_create_options, + .create_opts = &cow_create_opts, }; static void bdrv_cow_init(void) diff --git a/block/curl.c b/block/curl.c index 1b9b1f634..79ff2f1e4 100644 --- a/block/curl.c +++ b/block/curl.c @@ -23,6 +23,7 @@ */ #include "qemu-common.h" #include "block/block_int.h" +#include "qapi/qmp/qbool.h" #include <curl/curl.h> // #define DEBUG @@ -37,6 +38,21 @@ #if LIBCURL_VERSION_NUM >= 0x071000 /* The multi interface timer callback was introduced in 7.16.0 */ #define NEED_CURL_TIMER_CALLBACK +#define HAVE_SOCKET_ACTION +#endif + +#ifndef HAVE_SOCKET_ACTION +/* If curl_multi_socket_action isn't available, define it statically here in + * terms of curl_multi_socket. Note that ev_bitmask will be ignored, which is + * less efficient but still safe. 
*/ +static CURLMcode __curl_multi_socket_action(CURLM *multi_handle, + curl_socket_t sockfd, + int ev_bitmask, + int *running_handles) +{ + return curl_multi_socket(multi_handle, sockfd, running_handles); +} +#define curl_multi_socket_action __curl_multi_socket_action #endif #define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \ @@ -46,12 +62,16 @@ #define CURL_NUM_STATES 8 #define CURL_NUM_ACB 8 #define SECTOR_SIZE 512 -#define READ_AHEAD_SIZE (256 * 1024) +#define READ_AHEAD_DEFAULT (256 * 1024) #define FIND_RET_NONE 0 #define FIND_RET_OK 1 #define FIND_RET_WAIT 2 +#define CURL_BLOCK_OPT_URL "url" +#define CURL_BLOCK_OPT_READAHEAD "readahead" +#define CURL_BLOCK_OPT_SSLVERIFY "sslverify" + struct BDRVCURLState; typedef struct CURLAIOCB { @@ -71,6 +91,7 @@ typedef struct CURLState struct BDRVCURLState *s; CURLAIOCB *acb[CURL_NUM_ACB]; CURL *curl; + curl_socket_t sock_fd; char *orig_buf; size_t buf_start; size_t buf_off; @@ -87,11 +108,14 @@ typedef struct BDRVCURLState { CURLState states[CURL_NUM_STATES]; char *url; size_t readahead_size; + bool sslverify; bool accept_range; + AioContext *aio_context; } BDRVCURLState; static void curl_clean_state(CURLState *s); static void curl_multi_do(void *arg); +static void curl_multi_read(void *arg); #ifdef NEED_CURL_TIMER_CALLBACK static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque) @@ -111,21 +135,29 @@ static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque) #endif static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, - void *s, void *sp) + void *userp, void *sp) { + BDRVCURLState *s; + CURLState *state = NULL; + curl_easy_getinfo(curl, CURLINFO_PRIVATE, (char **)&state); + state->sock_fd = fd; + s = state->s; + DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); switch (action) { case CURL_POLL_IN: - qemu_aio_set_fd_handler(fd, curl_multi_do, NULL, s); + aio_set_fd_handler(s->aio_context, fd, curl_multi_read, + NULL, state); break; case CURL_POLL_OUT: - 
qemu_aio_set_fd_handler(fd, NULL, curl_multi_do, s); + aio_set_fd_handler(s->aio_context, fd, NULL, curl_multi_do, state); break; case CURL_POLL_INOUT: - qemu_aio_set_fd_handler(fd, curl_multi_do, curl_multi_do, s); + aio_set_fd_handler(s->aio_context, fd, curl_multi_read, + curl_multi_do, state); break; case CURL_POLL_REMOVE: - qemu_aio_set_fd_handler(fd, NULL, NULL, NULL); + aio_set_fd_handler(s->aio_context, fd, NULL, NULL, NULL); break; } @@ -155,7 +187,7 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) DPRINTF("CURL: Just reading %zd bytes\n", realsize); if (!s || !s->orig_buf) - goto read_end; + return 0; if (s->buf_off >= s->buf_len) { /* buffer full, read nothing */ @@ -180,7 +212,6 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) } } -read_end: return realsize; } @@ -215,7 +246,8 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, } // Wait for unfinished chunks - if ((start >= state->buf_start) && + if (state->in_use && + (start >= state->buf_start) && (start <= buf_fend) && (end >= state->buf_start) && (end <= buf_fend)) @@ -237,68 +269,69 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, return FIND_RET_NONE; } -static void curl_multi_read(BDRVCURLState *s) +static void curl_multi_check_completion(BDRVCURLState *s) { int msgs_in_queue; /* Try to find done transfers, so we can free the easy * handle again. 
*/ - do { + for (;;) { CURLMsg *msg; msg = curl_multi_info_read(s->multi, &msgs_in_queue); + /* Quit when there are no more completions */ if (!msg) break; - if (msg->msg == CURLMSG_NONE) - break; - switch (msg->msg) { - case CURLMSG_DONE: - { - CURLState *state = NULL; - curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, (char**)&state); - - /* ACBs for successful messages get completed in curl_read_cb */ - if (msg->data.result != CURLE_OK) { - int i; - for (i = 0; i < CURL_NUM_ACB; i++) { - CURLAIOCB *acb = state->acb[i]; - - if (acb == NULL) { - continue; - } - - acb->common.cb(acb->common.opaque, -EIO); - qemu_aio_release(acb); - state->acb[i] = NULL; + if (msg->msg == CURLMSG_DONE) { + CURLState *state = NULL; + curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, + (char **)&state); + + /* ACBs for successful messages get completed in curl_read_cb */ + if (msg->data.result != CURLE_OK) { + int i; + for (i = 0; i < CURL_NUM_ACB; i++) { + CURLAIOCB *acb = state->acb[i]; + + if (acb == NULL) { + continue; } - } - curl_clean_state(state); - break; + acb->common.cb(acb->common.opaque, -EIO); + qemu_aio_release(acb); + state->acb[i] = NULL; + } } - default: - msgs_in_queue = 0; - break; + + curl_clean_state(state); + break; } - } while(msgs_in_queue); + } } static void curl_multi_do(void *arg) { - BDRVCURLState *s = (BDRVCURLState *)arg; + CURLState *s = (CURLState *)arg; int running; int r; - if (!s->multi) { + if (!s->s->multi) { return; } do { - r = curl_multi_socket_all(s->multi, &running); + r = curl_multi_socket_action(s->s->multi, s->sock_fd, 0, &running); } while(r == CURLM_CALL_MULTI_PERFORM); - curl_multi_read(s); +} + +static void curl_multi_read(void *arg) +{ + CURLState *s = (CURLState *)arg; + + curl_multi_do(arg); + curl_multi_check_completion(s->s); } static void curl_multi_timeout_do(void *arg) @@ -313,7 +346,7 @@ static void curl_multi_timeout_do(void *arg) curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); - 
curl_multi_read(s); + curl_multi_check_completion(s); #else abort(); #endif @@ -337,44 +370,44 @@ static CURLState *curl_init_state(BDRVCURLState *s) break; } if (!state) { - g_usleep(100); - curl_multi_do(s); + aio_poll(state->s->aio_context, true); } } while(!state); - if (state->curl) - goto has_curl; - - state->curl = curl_easy_init(); - if (!state->curl) - return NULL; - curl_easy_setopt(state->curl, CURLOPT_URL, s->url); - curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, 5); - curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb); - curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state); - curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state); - curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1); - curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1); - curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); - curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); - - /* Restrict supported protocols to avoid security issues in the more - * obscure protocols. For example, do not allow POP3/SMTP/IMAP see - * CVE-2013-0249. - * - * Restricting protocols is only supported from 7.19.4 upwards. 
- */ + if (!state->curl) { + state->curl = curl_easy_init(); + if (!state->curl) { + return NULL; + } + curl_easy_setopt(state->curl, CURLOPT_URL, s->url); + curl_easy_setopt(state->curl, CURLOPT_SSL_VERIFYPEER, + (long) s->sslverify); + curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, 5); + curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, + (void *)curl_read_cb); + curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state); + curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state); + curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1); + curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); + curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); + + /* Restrict supported protocols to avoid security issues in the more + * obscure protocols. For example, do not allow POP3/SMTP/IMAP see + * CVE-2013-0249. + * + * Restricting protocols is only supported from 7.19.4 upwards. + */ #if LIBCURL_VERSION_NUM >= 0x071304 - curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS); - curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS); + curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS); + curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS); #endif #ifdef DEBUG_VERBOSE - curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); #endif - -has_curl: + } state->s = s; @@ -391,43 +424,50 @@ static void curl_clean_state(CURLState *s) static void curl_parse_filename(const char *filename, QDict *options, Error **errp) { + qdict_put(options, CURL_BLOCK_OPT_URL, qstring_from_str(filename)); +} - #define RA_OPTSTR ":readahead=" - char *file; - char *ra; - const char *ra_val; - int parse_state = 0; - - file = g_strdup(filename); - - /* Parse a trailing ":readahead=#:" param, if present. 
*/ - ra = file + strlen(file) - 1; - while (ra >= file) { - if (parse_state == 0) { - if (*ra == ':') { - parse_state++; - } else { - break; - } - } else if (parse_state == 1) { - if (*ra > '9' || *ra < '0') { - char *opt_start = ra - strlen(RA_OPTSTR) + 1; - if (opt_start > file && - strncmp(opt_start, RA_OPTSTR, strlen(RA_OPTSTR)) == 0) { - ra_val = ra + 1; - ra -= strlen(RA_OPTSTR) - 1; - *ra = '\0'; - qdict_put(options, "readahead", qstring_from_str(ra_val)); - } - break; - } +static void curl_detach_aio_context(BlockDriverState *bs) +{ + BDRVCURLState *s = bs->opaque; + int i; + + for (i = 0; i < CURL_NUM_STATES; i++) { + if (s->states[i].in_use) { + curl_clean_state(&s->states[i]); } - ra--; + if (s->states[i].curl) { + curl_easy_cleanup(s->states[i].curl); + s->states[i].curl = NULL; + } + g_free(s->states[i].orig_buf); + s->states[i].orig_buf = NULL; + } + if (s->multi) { + curl_multi_cleanup(s->multi); + s->multi = NULL; } - qdict_put(options, "url", qstring_from_str(file)); + timer_del(&s->timer); +} + +static void curl_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVCURLState *s = bs->opaque; + + aio_timer_init(new_context, &s->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + curl_multi_timeout_do, s); - g_free(file); + assert(!s->multi); + s->multi = curl_multi_init(); + s->aio_context = new_context; + curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); +#ifdef NEED_CURL_TIMER_CALLBACK + curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s); + curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb); +#endif } static QemuOptsList runtime_opts = { @@ -435,15 +475,20 @@ static QemuOptsList runtime_opts = { .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), .desc = { { - .name = "url", + .name = CURL_BLOCK_OPT_URL, .type = QEMU_OPT_STRING, .help = "URL to open", }, { - .name = "readahead", + .name = CURL_BLOCK_OPT_READAHEAD, .type = QEMU_OPT_SIZE, .help = "Readahead size", }, + { + .name = 
CURL_BLOCK_OPT_SSLVERIFY, + .type = QEMU_OPT_BOOL, + .help = "Verify SSL certificate" + }, { /* end of list */ } }, }; @@ -472,14 +517,17 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, goto out_noclean; } - s->readahead_size = qemu_opt_get_size(opts, "readahead", READ_AHEAD_SIZE); + s->readahead_size = qemu_opt_get_size(opts, CURL_BLOCK_OPT_READAHEAD, + READ_AHEAD_DEFAULT); if ((s->readahead_size & 0x1ff) != 0) { error_setg(errp, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512", s->readahead_size); goto out_noclean; } - file = qemu_opt_get(opts, "url"); + s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true); + + file = qemu_opt_get(opts, CURL_BLOCK_OPT_URL); if (file == NULL) { error_setg(errp, "curl block driver requires an 'url' option"); goto out_noclean; @@ -491,6 +539,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, } DPRINTF("CURL: Opening %s\n", file); + s->aio_context = bdrv_get_aio_context(bs); s->url = g_strdup(file); state = curl_init_state(s); if (!state) @@ -523,27 +572,13 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, curl_easy_cleanup(state->curl); state->curl = NULL; - aio_timer_init(bdrv_get_aio_context(bs), &s->timer, - QEMU_CLOCK_REALTIME, SCALE_NS, - curl_multi_timeout_do, s); - - // Now we know the file exists and its size, so let's - // initialize the multi interface! 
- - s->multi = curl_multi_init(); - curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s); - curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); -#ifdef NEED_CURL_TIMER_CALLBACK - curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s); - curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb); -#endif - curl_multi_do(s); + curl_attach_aio_context(bs, bdrv_get_aio_context(bs)); qemu_opts_del(opts); return 0; out: - fprintf(stderr, "CURL: Error opening file: %s\n", state->errmsg); + error_setg(errp, "CURL: Error opening file: %s", state->errmsg); curl_easy_cleanup(state->curl); state->curl = NULL; out_noclean: @@ -566,6 +601,7 @@ static const AIOCBInfo curl_aiocb_info = { static void curl_readv_bh_cb(void *p) { CURLState *state; + int running; CURLAIOCB *acb = p; BDRVCURLState *s = acb->common.bs->opaque; @@ -600,8 +636,7 @@ static void curl_readv_bh_cb(void *p) acb->end = (acb->nb_sectors * SECTOR_SIZE); state->buf_off = 0; - if (state->orig_buf) - g_free(state->orig_buf); + g_free(state->orig_buf); state->buf_start = start; state->buf_len = acb->end + s->readahead_size; end = MIN(start + state->buf_len, s->len) - 1; @@ -614,8 +649,9 @@ static void curl_readv_bh_cb(void *p) curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range); curl_multi_add_handle(s->multi, state->curl); - curl_multi_do(s); + /* Tell curl it needs to kick things off */ + curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); } static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs, @@ -630,7 +666,7 @@ static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs, acb->sector_num = sector_num; acb->nb_sectors = nb_sectors; - acb->bh = qemu_bh_new(curl_readv_bh_cb, acb); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), curl_readv_bh_cb, acb); qemu_bh_schedule(acb->bh); return &acb->common; } @@ -638,25 +674,9 @@ static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs, static void curl_close(BlockDriverState *bs) { BDRVCURLState *s = bs->opaque; - 
int i; DPRINTF("CURL: Close\n"); - for (i=0; i<CURL_NUM_STATES; i++) { - if (s->states[i].in_use) - curl_clean_state(&s->states[i]); - if (s->states[i].curl) { - curl_easy_cleanup(s->states[i].curl); - s->states[i].curl = NULL; - } - if (s->states[i].orig_buf) { - g_free(s->states[i].orig_buf); - s->states[i].orig_buf = NULL; - } - } - if (s->multi) - curl_multi_cleanup(s->multi); - - timer_del(&s->timer); + curl_detach_aio_context(bs); g_free(s->url); } @@ -668,68 +688,83 @@ static int64_t curl_getlength(BlockDriverState *bs) } static BlockDriver bdrv_http = { - .format_name = "http", - .protocol_name = "http", + .format_name = "http", + .protocol_name = "http", - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_aio_readv = curl_aio_readv, + + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, }; static BlockDriver bdrv_https = { - .format_name = "https", - .protocol_name = "https", + .format_name = "https", + .protocol_name = "https", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .bdrv_aio_readv = curl_aio_readv, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, }; static BlockDriver bdrv_ftp = { - .format_name = "ftp", - .protocol_name = "ftp", + 
.format_name = "ftp", + .protocol_name = "ftp", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .bdrv_aio_readv = curl_aio_readv, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, }; static BlockDriver bdrv_ftps = { - .format_name = "ftps", - .protocol_name = "ftps", + .format_name = "ftps", + .protocol_name = "ftps", - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_aio_readv = curl_aio_readv, + + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, }; static BlockDriver bdrv_tftp = { - .format_name = "tftp", - .protocol_name = "tftp", + .format_name = "tftp", + .protocol_name = "tftp", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, + .bdrv_aio_readv = curl_aio_readv, - .bdrv_aio_readv = curl_aio_readv, + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, }; static void 
curl_block_init(void) diff --git a/block/dmg.c b/block/dmg.c index 856402e1f..1e153cd76 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -248,8 +248,8 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, offset += 8; if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { - error_report("sector count %" PRIu64 " for chunk %u is " - "larger than max (%u)", + error_report("sector count %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); ret = -EINVAL; goto fail; @@ -269,8 +269,8 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, offset += 8; if (s->lengths[i] > DMG_LENGTHS_MAX) { - error_report("length %" PRIu64 " for chunk %u is larger " - "than max (%u)", + error_report("length %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", s->lengths[i], i, DMG_LENGTHS_MAX); ret = -EINVAL; goto fail; diff --git a/block/gluster.c b/block/gluster.c index 883608564..9274dead7 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -16,6 +16,7 @@ typedef struct GlusterAIOCB { int ret; QEMUBH *bh; Coroutine *coroutine; + AioContext *aio_context; } GlusterAIOCB; typedef struct BDRVGlusterState { @@ -207,6 +208,11 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename, "volume=%s image=%s transport=%s", gconf->server, gconf->port, gconf->volname, gconf->image, gconf->transport); + + /* glfs_init sometimes doesn't set errno although docs suggest that */ + if (errno == 0) + errno = EINVAL; + goto out; } return glfs; @@ -244,7 +250,7 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) acb->ret = -EIO; /* Partial read/write - fail it */ } - acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb); + acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb); qemu_bh_schedule(acb->bh); } @@ -431,6 +437,7 @@ static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, acb->size = size; acb->ret = 0; acb->coroutine = 
qemu_coroutine_self(); + acb->aio_context = bdrv_get_aio_context(bs); ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb); if (ret < 0) { @@ -471,39 +478,37 @@ static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, #endif static int qemu_gluster_create(const char *filename, - QEMUOptionParameter *options, Error **errp) + QemuOpts *opts, Error **errp) { struct glfs *glfs; struct glfs_fd *fd; int ret = 0; int prealloc = 0; int64_t total_size = 0; + char *tmp = NULL; GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); glfs = qemu_gluster_init(gconf, filename, errp); if (!glfs) { - ret = -EINVAL; + ret = -errno; goto out; } - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - total_size = options->value.n / BDRV_SECTOR_SIZE; - } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { - if (!options->value.s || !strcmp(options->value.s, "off")) { - prealloc = 0; - } else if (!strcmp(options->value.s, "full") && - gluster_supports_zerofill()) { - prealloc = 1; - } else { - error_setg(errp, "Invalid preallocation mode: '%s'" - " or GlusterFS doesn't support zerofill API", - options->value.s); - ret = -EINVAL; - goto out; - } - } - options++; + total_size = + qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE; + + tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + if (!tmp || !strcmp(tmp, "off")) { + prealloc = 0; + } else if (!strcmp(tmp, "full") && + gluster_supports_zerofill()) { + prealloc = 1; + } else { + error_setg(errp, "Invalid preallocation mode: '%s'" + " or GlusterFS doesn't support zerofill API", + tmp); + ret = -EINVAL; + goto out; } fd = glfs_creat(glfs, gconf->image, @@ -525,6 +530,7 @@ static int qemu_gluster_create(const char *filename, } } out: + g_free(tmp); qemu_gluster_gconf_free(gconf); if (glfs) { glfs_fini(glfs); @@ -544,6 +550,7 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, acb->size = size; acb->ret = 0; acb->coroutine = 
qemu_coroutine_self(); + acb->aio_context = bdrv_get_aio_context(bs); if (write) { ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, @@ -600,6 +607,7 @@ static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs) acb->size = 0; acb->ret = 0; acb->coroutine = qemu_coroutine_self(); + acb->aio_context = bdrv_get_aio_context(bs); ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb); if (ret < 0) { @@ -628,6 +636,7 @@ static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, acb->size = 0; acb->ret = 0; acb->coroutine = qemu_coroutine_self(); + acb->aio_context = bdrv_get_aio_context(bs); ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb); if (ret < 0) { @@ -688,18 +697,22 @@ static int qemu_gluster_has_zero_init(BlockDriverState *bs) return 0; } -static QEMUOptionParameter qemu_gluster_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_PREALLOC, - .type = OPT_STRING, - .help = "Preallocation mode (allowed values: off, full)" - }, - { NULL } +static QemuOptsList qemu_gluster_create_opts = { + .name = "qemu-gluster-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_gluster = { @@ -726,7 +739,7 @@ static BlockDriver bdrv_gluster = { #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif - .create_options = qemu_gluster_create_options, + .create_opts = &qemu_gluster_create_opts, }; static BlockDriver bdrv_gluster_tcp = { @@ -753,7 +766,7 @@ static BlockDriver bdrv_gluster_tcp = { #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif - 
.create_options = qemu_gluster_create_options, + .create_opts = &qemu_gluster_create_opts, }; static BlockDriver bdrv_gluster_unix = { @@ -780,7 +793,7 @@ static BlockDriver bdrv_gluster_unix = { #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif - .create_options = qemu_gluster_create_options, + .create_opts = &qemu_gluster_create_opts, }; static BlockDriver bdrv_gluster_rdma = { @@ -807,7 +820,7 @@ static BlockDriver bdrv_gluster_rdma = { #ifdef CONFIG_GLUSTERFS_ZEROFILL .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif - .create_options = qemu_gluster_create_options, + .create_opts = &qemu_gluster_create_opts, }; static void bdrv_gluster_init(void) diff --git a/block/iscsi.c b/block/iscsi.c index f425573df..a7bb6970a 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,7 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> - * Copyright (c) 2012-2013 Peter Lieven <pl@kamp.de> + * Copyright (c) 2012-2014 Peter Lieven <pl@kamp.de> * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,10 +26,13 @@ #include "config-host.h" #include <poll.h> +#include <math.h> #include <arpa/inet.h> #include "qemu-common.h" #include "qemu/config-file.h" #include "qemu/error-report.h" +#include "qemu/bitops.h" +#include "qemu/bitmap.h" #include "block/block_int.h" #include "trace.h" #include "block/scsi.h" @@ -47,6 +50,7 @@ typedef struct IscsiLun { struct iscsi_context *iscsi; + AioContext *aio_context; int lun; enum scsi_inquiry_peripheral_device_type type; int block_size; @@ -59,6 +63,9 @@ typedef struct IscsiLun { struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; unsigned char *zeroblock; + unsigned long *allocationmap; + int cluster_sectors; + bool use_16_for_rw; } IscsiLun; typedef struct IscsiTask { @@ -69,6 
+76,8 @@ typedef struct IscsiTask { struct scsi_task *task; Coroutine *co; QEMUBH *bh; + IscsiLun *iscsilun; + QEMUTimer retry_timer; } IscsiTask; typedef struct IscsiAIOCB { @@ -80,7 +89,6 @@ typedef struct IscsiAIOCB { uint8_t *buf; int status; int canceled; - int retries; int64_t sector_num; int nb_sectors; #ifdef __linux__ @@ -90,7 +98,17 @@ typedef struct IscsiAIOCB { #define NOP_INTERVAL 5000 #define MAX_NOP_FAILURES 3 -#define ISCSI_CMD_RETRIES 5 +#define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times) +static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048}; + +/* this threshold is a trade-off knob to choose between + * the potential additional overhead of an extra GET_LBA_STATUS request + * vs. unnecessarily reading a lot of zero sectors over the wire. + * If a read request is greater or equal than ISCSI_CHECKALLOC_THRES + * sectors we check the allocation status of the area covered by the + * request first if the allocationmap indicates that the area might be + * unallocated. 
*/ +#define ISCSI_CHECKALLOC_THRES 64 static void iscsi_bh_cb(void *p) @@ -120,17 +138,32 @@ iscsi_schedule_bh(IscsiAIOCB *acb) if (acb->bh) { return; } - acb->bh = qemu_bh_new(iscsi_bh_cb, acb); + acb->bh = aio_bh_new(acb->iscsilun->aio_context, iscsi_bh_cb, acb); qemu_bh_schedule(acb->bh); } static void iscsi_co_generic_bh_cb(void *opaque) { struct IscsiTask *iTask = opaque; + iTask->complete = 1; qemu_bh_delete(iTask->bh); qemu_coroutine_enter(iTask->co, NULL); } +static void iscsi_retry_timer_expired(void *opaque) +{ + struct IscsiTask *iTask = opaque; + iTask->complete = 1; + if (iTask->co) { + qemu_coroutine_enter(iTask->co, NULL); + } +} + +static inline unsigned exp_random(double mean) +{ + return -mean * log((double)rand() / RAND_MAX); +} + static void iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, void *command_data, void *opaque) @@ -138,26 +171,44 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, struct IscsiTask *iTask = opaque; struct scsi_task *task = command_data; - iTask->complete = 1; iTask->status = status; iTask->do_retry = 0; iTask->task = task; - if (iTask->retries-- > 0 && status == SCSI_STATUS_CHECK_CONDITION - && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { - error_report("iSCSI CheckCondition: %s", iscsi_get_error(iscsi)); - iTask->do_retry = 1; - goto out; - } - if (status != SCSI_STATUS_GOOD) { + if (iTask->retries++ < ISCSI_CMD_RETRIES) { + if (status == SCSI_STATUS_CHECK_CONDITION + && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { + error_report("iSCSI CheckCondition: %s", + iscsi_get_error(iscsi)); + iTask->do_retry = 1; + goto out; + } + if (status == SCSI_STATUS_BUSY) { + unsigned retry_time = + exp_random(iscsi_retry_times[iTask->retries - 1]); + error_report("iSCSI Busy (retry #%u in %u ms): %s", + iTask->retries, retry_time, + iscsi_get_error(iscsi)); + aio_timer_init(iTask->iscsilun->aio_context, + &iTask->retry_timer, QEMU_CLOCK_REALTIME, + SCALE_MS, iscsi_retry_timer_expired, iTask); + 
timer_mod(&iTask->retry_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + retry_time); + iTask->do_retry = 1; + return; + } + } error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); } out: if (iTask->co) { - iTask->bh = qemu_bh_new(iscsi_co_generic_bh_cb, iTask); + iTask->bh = aio_bh_new(iTask->iscsilun->aio_context, + iscsi_co_generic_bh_cb, iTask); qemu_bh_schedule(iTask->bh); + } else { + iTask->complete = 1; } } @@ -165,7 +216,7 @@ static void iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask) { *iTask = (struct IscsiTask) { .co = qemu_coroutine_self(), - .retries = ISCSI_CMD_RETRIES, + .iscsilun = iscsilun, }; } @@ -196,7 +247,7 @@ iscsi_aio_cancel(BlockDriverAIOCB *blockacb) iscsi_abort_task_cb, acb); while (acb->status == -EINPROGRESS) { - qemu_aio_wait(); + aio_poll(iscsilun->aio_context, true); } } @@ -219,10 +270,11 @@ iscsi_set_events(IscsiLun *iscsilun) ev = POLLIN; ev |= iscsi_which_events(iscsi); if (ev != iscsilun->events) { - qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), - iscsi_process_read, - (ev & POLLOUT) ? iscsi_process_write : NULL, - iscsilun); + aio_set_fd_handler(iscsilun->aio_context, + iscsi_get_fd(iscsi), + iscsi_process_read, + (ev & POLLOUT) ? 
iscsi_process_write : NULL, + iscsilun); } @@ -273,6 +325,32 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, return 1; } +static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + if (iscsilun->allocationmap == NULL) { + return; + } + bitmap_set(iscsilun->allocationmap, + sector_num / iscsilun->cluster_sectors, + DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); +} + +static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + int64_t cluster_num, nb_clusters; + if (iscsilun->allocationmap == NULL) { + return; + } + cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); + nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors + - cluster_num; + if (nb_clusters > 0) { + bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters); + } +} + static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov) @@ -281,8 +359,6 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; - uint8_t *data = NULL; - uint8_t *buf = NULL; if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; @@ -290,31 +366,24 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, lba = sector_qemu2lun(sector_num, iscsilun); num_sectors = sector_qemu2lun(nb_sectors, iscsilun); -#if !defined(LIBISCSI_FEATURE_IOVECTOR) - /* if the iovec only contains one buffer we can pass it directly */ - if (iov->niov == 1) { - data = iov->iov[0].iov_base; - } else { - size_t size = MIN(nb_sectors * BDRV_SECTOR_SIZE, iov->size); - buf = g_malloc(size); - qemu_iovec_to_buf(iov, 0, buf, size); - data = buf; - } -#endif iscsi_co_init_iscsitask(iscsilun, &iTask); retry: - iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, - data, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, - 
iscsi_co_generic_cb, &iTask); + if (iscsilun->use_16_for_rw) { + iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, + NULL, num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); + } else { + iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba, + NULL, num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); + } if (iTask.task == NULL) { - g_free(buf); return -ENOMEM; } -#if defined(LIBISCSI_FEATURE_IOVECTOR) scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); -#endif while (!iTask.complete) { iscsi_set_events(iscsilun); qemu_coroutine_yield(); @@ -330,15 +399,125 @@ retry: goto retry; } - g_free(buf); - if (iTask.status != SCSI_STATUS_GOOD) { return -EIO; } + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + return 0; } + +static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, + int64_t sector_num, int nb_sectors) +{ + unsigned long size; + if (iscsilun->allocationmap == NULL) { + return true; + } + size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); + return !(find_next_bit(iscsilun->allocationmap, size, + sector_num / iscsilun->cluster_sectors) == size); +} + +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + IscsiLun *iscsilun = bs->opaque; + struct scsi_get_lba_status *lbas = NULL; + struct scsi_lba_status_descriptor *lbasd = NULL; + struct IscsiTask iTask; + int64_t ret; + + iscsi_co_init_iscsitask(iscsilun, &iTask); + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + ret = -EINVAL; + goto out; + } + + /* default to all sectors allocated */ + ret = BDRV_BLOCK_DATA; + ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; + *pnum = nb_sectors; + + /* LUN does not support logical block provisioning */ + if (iscsilun->lbpme == 0) { + goto out; + } + 
+retry: + if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, + sector_qemu2lun(sector_num, iscsilun), + 8 + 16, iscsi_co_generic_cb, + &iTask) == NULL) { + ret = -ENOMEM; + goto out; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.do_retry) { + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + /* in case the get_lba_status_callout fails (i.e. + * because the device is busy or the cmd is not + * supported) we pretend all blocks are allocated + * for backwards compatibility */ + goto out; + } + + lbas = scsi_datain_unmarshall(iTask.task); + if (lbas == NULL) { + ret = -EIO; + goto out; + } + + lbasd = &lbas->descriptors[0]; + + if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { + ret = -EIO; + goto out; + } + + *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); + + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { + ret &= ~BDRV_BLOCK_DATA; + if (iscsilun->lbprz) { + ret |= BDRV_BLOCK_ZERO; + } + } + + if (ret & BDRV_BLOCK_ZERO) { + iscsi_allocationmap_clear(iscsilun, sector_num, *pnum); + } else { + iscsi_allocationmap_set(iscsilun, sector_num, *pnum); + } + + if (*pnum > nb_sectors) { + *pnum = nb_sectors; + } +out: + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + } + return ret; +} + static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov) @@ -347,48 +526,46 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; -#if !defined(LIBISCSI_FEATURE_IOVECTOR) - int i; -#endif if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } + if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES && + 
!iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { + int64_t ret; + int pnum; + ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum); + if (ret < 0) { + return ret; + } + if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) { + qemu_iovec_memset(iov, 0, 0x00, iov->size); + return 0; + } + } + lba = sector_qemu2lun(sector_num, iscsilun); num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: - switch (iscsilun->type) { - case TYPE_DISK: + if (iscsilun->use_16_for_rw) { iTask.task = iscsi_read16_task(iscsilun->iscsi, iscsilun->lun, lba, num_sectors * iscsilun->block_size, iscsilun->block_size, 0, 0, 0, 0, 0, iscsi_co_generic_cb, &iTask); - break; - default: + } else { iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba, num_sectors * iscsilun->block_size, iscsilun->block_size, -#if !defined(CONFIG_LIBISCSI_1_4) /* API change from 1.4.0 to 1.5.0 */ 0, 0, 0, 0, 0, -#endif iscsi_co_generic_cb, &iTask); - break; } if (iTask.task == NULL) { return -ENOMEM; } -#if defined(LIBISCSI_FEATURE_IOVECTOR) scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); -#else - for (i = 0; i < iov->niov; i++) { - scsi_task_add_data_in_buffer(iTask.task, - iov->iov[i].iov_len, - iov->iov[i].iov_base); - } -#endif while (!iTask.complete) { iscsi_set_events(iscsilun); @@ -543,18 +720,9 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, data.data = acb->ioh->dxferp; data.size = acb->ioh->dxfer_len; } else { -#if defined(LIBISCSI_FEATURE_IOVECTOR) scsi_task_set_iov_out(acb->task, (struct scsi_iovec *) acb->ioh->dxferp, acb->ioh->iovec_count); -#else - struct iovec *iov = (struct iovec *)acb->ioh->dxferp; - - acb->buf = g_malloc(acb->ioh->dxfer_len); - data.data = acb->buf; - data.size = iov_to_buf(iov, acb->ioh->iovec_count, 0, - acb->buf, acb->ioh->dxfer_len); -#endif } } @@ -574,20 +742,9 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, 
acb->ioh->dxfer_len, acb->ioh->dxferp); } else { -#if defined(LIBISCSI_FEATURE_IOVECTOR) scsi_task_set_iov_in(acb->task, (struct scsi_iovec *) acb->ioh->dxferp, acb->ioh->iovec_count); -#else - int i; - for (i = 0; i < acb->ioh->iovec_count; i++) { - struct iovec *iov = (struct iovec *)acb->ioh->dxferp; - - scsi_task_add_data_in_buffer(acb->task, - iov[i].iov_len, - iov[i].iov_base); - } -#endif } } @@ -596,7 +753,6 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, return &acb->common; } - static void ioctl_cb(void *opaque, int status) { int *p_status = opaque; @@ -620,7 +776,7 @@ static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status); while (status == -EINPROGRESS) { - qemu_aio_wait(); + aio_poll(iscsilun->aio_context, true); } return 0; @@ -643,101 +799,6 @@ iscsi_getlength(BlockDriverState *bs) return len; } -#if defined(LIBISCSI_FEATURE_IOVECTOR) - -static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - IscsiLun *iscsilun = bs->opaque; - struct scsi_get_lba_status *lbas = NULL; - struct scsi_lba_status_descriptor *lbasd = NULL; - struct IscsiTask iTask; - int64_t ret; - - iscsi_co_init_iscsitask(iscsilun, &iTask); - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - ret = -EINVAL; - goto out; - } - - /* default to all sectors allocated */ - ret = BDRV_BLOCK_DATA; - ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; - *pnum = nb_sectors; - - /* LUN does not support logical block provisioning */ - if (iscsilun->lbpme == 0) { - goto out; - } - -retry: - if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, - sector_qemu2lun(sector_num, iscsilun), - 8 + 16, iscsi_co_generic_cb, - &iTask) == NULL) { - ret = -ENOMEM; - goto out; - } - - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_coroutine_yield(); - } - - if (iTask.do_retry) { - if (iTask.task 
!= NULL) { - scsi_free_scsi_task(iTask.task); - iTask.task = NULL; - } - iTask.complete = 0; - goto retry; - } - - if (iTask.status != SCSI_STATUS_GOOD) { - /* in case the get_lba_status_callout fails (i.e. - * because the device is busy or the cmd is not - * supported) we pretend all blocks are allocated - * for backwards compatibility */ - goto out; - } - - lbas = scsi_datain_unmarshall(iTask.task); - if (lbas == NULL) { - ret = -EIO; - goto out; - } - - lbasd = &lbas->descriptors[0]; - - if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { - ret = -EIO; - goto out; - } - - *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); - if (*pnum > nb_sectors) { - *pnum = nb_sectors; - } - - if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || - lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { - ret &= ~BDRV_BLOCK_DATA; - if (iscsilun->lbprz) { - ret |= BDRV_BLOCK_ZERO; - } - } - -out: - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - } - return ret; -} - -#endif /* LIBISCSI_FEATURE_IOVECTOR */ - static int coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) @@ -791,11 +852,11 @@ retry: return -EIO; } + iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + return 0; } -#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED) - static int coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) @@ -804,18 +865,27 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, struct IscsiTask iTask; uint64_t lba; uint32_t nb_blocks; + bool use_16_for_ws = iscsilun->use_16_for_rw; if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } - if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { - /* WRITE SAME without UNMAP is not supported by the target */ - return -ENOTSUP; + if (flags & BDRV_REQ_MAY_UNMAP) { + if (!use_16_for_ws && !iscsilun->lbp.lbpws10) { + /* WRITESAME10 
with UNMAP is unsupported try WRITESAME16 */ + use_16_for_ws = true; + } + if (use_16_for_ws && !iscsilun->lbp.lbpws) { + /* WRITESAME16 with UNMAP is not supported by the target, + * fall back and try WRITESAME10/16 without UNMAP */ + flags &= ~BDRV_REQ_MAY_UNMAP; + use_16_for_ws = iscsilun->use_16_for_rw; + } } - if ((flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->lbp.lbpws) { - /* WRITE SAME with UNMAP is not supported by the target */ + if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { + /* WRITESAME without UNMAP is not supported by the target */ return -ENOTSUP; } @@ -828,10 +898,18 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, iscsi_co_init_iscsitask(iscsilun, &iTask); retry: - if (iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, - iscsilun->zeroblock, iscsilun->block_size, - nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), - 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { + if (use_16_for_ws) { + iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, + iscsilun->zeroblock, iscsilun->block_size, + nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), + 0, 0, iscsi_co_generic_cb, &iTask); + } else { + iTask.task = iscsi_writesame10_task(iscsilun->iscsi, iscsilun->lun, lba, + iscsilun->zeroblock, iscsilun->block_size, + nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), + 0, 0, iscsi_co_generic_cb, &iTask); + } + if (iTask.task == NULL) { return -ENOMEM; } @@ -864,11 +942,15 @@ retry: return -EIO; } + if (flags & BDRV_REQ_MAY_UNMAP) { + iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + } else { + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + } + return 0; } -#endif /* SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED */ - static void parse_chap(struct iscsi_context *iscsi, const char *target, Error **errp) { @@ -978,7 +1060,6 @@ static char *parse_initiator_name(const char *target) return iscsi_name; } -#if defined(LIBISCSI_FEATURE_NOP_COUNTER) static void iscsi_nop_timed_event(void 
*opaque) { IscsiLun *iscsilun = opaque; @@ -996,7 +1077,6 @@ static void iscsi_nop_timed_event(void *opaque) timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); iscsi_set_events(iscsilun); } -#endif static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) { @@ -1023,6 +1103,7 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) iscsilun->num_blocks = rc16->returned_lba + 1; iscsilun->lbpme = rc16->lbpme; iscsilun->lbprz = rc16->lbprz; + iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); } } break; @@ -1095,22 +1176,53 @@ static struct scsi_task *iscsi_do_inquiry(struct iscsi_context *iscsi, int lun, *inq = scsi_datain_unmarshall(task); if (*inq == NULL) { error_setg(errp, "iSCSI: failed to unmarshall inquiry datain blob"); - goto fail; + goto fail_with_err; } return task; fail: - if (!error_is_set(errp)) { - error_setg(errp, "iSCSI: Inquiry command failed : %s", - iscsi_get_error(iscsi)); - } + error_setg(errp, "iSCSI: Inquiry command failed : %s", + iscsi_get_error(iscsi)); +fail_with_err: if (task != NULL) { scsi_free_scsi_task(task); } return NULL; } +static void iscsi_detach_aio_context(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + + aio_set_fd_handler(iscsilun->aio_context, + iscsi_get_fd(iscsilun->iscsi), + NULL, NULL, NULL); + iscsilun->events = 0; + + if (iscsilun->nop_timer) { + timer_del(iscsilun->nop_timer); + timer_free(iscsilun->nop_timer); + iscsilun->nop_timer = NULL; + } +} + +static void iscsi_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + IscsiLun *iscsilun = bs->opaque; + + iscsilun->aio_context = new_context; + iscsi_set_events(iscsilun); + + /* Set up a timer for sending out iSCSI NOPs */ + iscsilun->nop_timer = aio_timer_new(iscsilun->aio_context, + QEMU_CLOCK_REALTIME, SCALE_MS, + iscsi_nop_timed_event, iscsilun); + timer_mod(iscsilun->nop_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); +} + /* * We support 
iscsi url's on the form * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun> @@ -1217,6 +1329,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, } iscsilun->iscsi = iscsi; + iscsilun->aio_context = bdrv_get_aio_context(bs); iscsilun->lun = iscsi_url->lun; iscsilun->has_write_same = true; @@ -1290,17 +1403,25 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, scsi_free_scsi_task(task); task = NULL; -#if defined(LIBISCSI_FEATURE_NOP_COUNTER) - /* Set up a timer for sending out iSCSI NOPs */ - iscsilun->nop_timer = timer_new_ms(QEMU_CLOCK_REALTIME, iscsi_nop_timed_event, iscsilun); - timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); -#endif + iscsi_attach_aio_context(bs, iscsilun->aio_context); + + /* Guess the internal cluster (page) size of the iscsi target by the means + * of opt_unmap_gran. Transfer the unmap granularity only if it has a + * reasonable size */ + if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 4 * 1024 && + iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { + iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * + iscsilun->block_size) >> BDRV_SECTOR_BITS; + if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) { + iscsilun->allocationmap = + bitmap_new(DIV_ROUND_UP(bs->total_sectors, + iscsilun->cluster_sectors)); + } + } out: qemu_opts_del(opts); - if (initiator_name != NULL) { - g_free(initiator_name); - } + g_free(initiator_name); if (iscsi_url != NULL) { iscsi_destroy_url(iscsi_url); } @@ -1322,17 +1443,14 @@ static void iscsi_close(BlockDriverState *bs) IscsiLun *iscsilun = bs->opaque; struct iscsi_context *iscsi = iscsilun->iscsi; - if (iscsilun->nop_timer) { - timer_del(iscsilun->nop_timer); - timer_free(iscsilun->nop_timer); - } - qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL); + iscsi_detach_aio_context(bs); iscsi_destroy_context(iscsi); g_free(iscsilun->zeroblock); + 
g_free(iscsilun->allocationmap); memset(iscsilun, 0, sizeof(IscsiLun)); } -static int iscsi_refresh_limits(BlockDriverState *bs) +static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) { IscsiLun *iscsilun = bs->opaque; @@ -1357,7 +1475,6 @@ static int iscsi_refresh_limits(BlockDriverState *bs) } bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); - return 0; } /* Since iscsi_open() ignores bdrv_flags, there is nothing to do here in @@ -1389,11 +1506,17 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset) return -EINVAL; } + if (iscsilun->allocationmap != NULL) { + g_free(iscsilun->allocationmap); + iscsilun->allocationmap = + bitmap_new(DIV_ROUND_UP(bs->total_sectors, + iscsilun->cluster_sectors)); + } + return 0; } -static int iscsi_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp) { int ret = 0; int64_t total_size = 0; @@ -1401,16 +1524,11 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options, IscsiLun *iscsilun = NULL; QDict *bs_options; - bs = bdrv_new(""); + bs = bdrv_new("", &error_abort); /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, "size")) { - total_size = options->value.n / BDRV_SECTOR_SIZE; - } - options++; - } - + total_size = + qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE; bs->opaque = g_malloc0(sizeof(struct IscsiLun)); iscsilun = bs->opaque; @@ -1422,10 +1540,7 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options, if (ret != 0) { goto out; } - if (iscsilun->nop_timer) { - timer_del(iscsilun->nop_timer); - timer_free(iscsilun->nop_timer); - } + iscsi_detach_aio_context(bs); if (iscsilun->type != TYPE_DISK) { ret = -ENODEV; goto out; @@ -1451,23 +1566,21 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) IscsiLun *iscsilun = bs->opaque; 
bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; - /* Guess the internal cluster (page) size of the iscsi target by the means - * of opt_unmap_gran. Transfer the unmap granularity only if it has a - * reasonable size for bdi->cluster_size */ - if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && - iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { - bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size; - } + bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; return 0; } -static QEMUOptionParameter iscsi_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { NULL } +static QemuOptsList iscsi_create_opts = { + .name = "iscsi-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_iscsi = { @@ -1479,7 +1592,7 @@ static BlockDriver bdrv_iscsi = { .bdrv_file_open = iscsi_open, .bdrv_close = iscsi_close, .bdrv_create = iscsi_create, - .create_options = iscsi_create_options, + .create_opts = &iscsi_create_opts, .bdrv_reopen_prepare = iscsi_reopen_prepare, .bdrv_getlength = iscsi_getlength, @@ -1487,13 +1600,9 @@ static BlockDriver bdrv_iscsi = { .bdrv_truncate = iscsi_truncate, .bdrv_refresh_limits = iscsi_refresh_limits, -#if defined(LIBISCSI_FEATURE_IOVECTOR) .bdrv_co_get_block_status = iscsi_co_get_block_status, -#endif .bdrv_co_discard = iscsi_co_discard, -#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED) .bdrv_co_write_zeroes = iscsi_co_write_zeroes, -#endif .bdrv_co_readv = iscsi_co_readv, .bdrv_co_writev = iscsi_co_writev, .bdrv_co_flush_to_disk = iscsi_co_flush, @@ -1502,6 +1611,9 @@ static BlockDriver bdrv_iscsi = { .bdrv_ioctl = iscsi_ioctl, .bdrv_aio_ioctl = iscsi_aio_ioctl, 
#endif + + .bdrv_detach_aio_context = iscsi_detach_aio_context, + .bdrv_attach_aio_context = iscsi_attach_aio_context, }; static QemuOptsList qemu_iscsi_opts = { diff --git a/block/linux-aio.c b/block/linux-aio.c index 53434e2df..7ac7e8c99 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -25,6 +25,8 @@ */ #define MAX_EVENTS 128 +#define MAX_QUEUED_IO 128 + struct qemu_laiocb { BlockDriverAIOCB common; struct qemu_laio_state *ctx; @@ -36,9 +38,19 @@ struct qemu_laiocb { QLIST_ENTRY(qemu_laiocb) node; }; +typedef struct { + struct iocb *iocbs[MAX_QUEUED_IO]; + int plugged; + unsigned int size; + unsigned int idx; +} LaioQueue; + struct qemu_laio_state { io_context_t ctx; EventNotifier e; + + /* io queue for submit at batch */ + LaioQueue io_q; }; static inline ssize_t io_event_ret(struct io_event *ev) @@ -135,6 +147,79 @@ static const AIOCBInfo laio_aiocb_info = { .cancel = laio_cancel, }; +static void ioq_init(LaioQueue *io_q) +{ + io_q->size = MAX_QUEUED_IO; + io_q->idx = 0; + io_q->plugged = 0; +} + +static int ioq_submit(struct qemu_laio_state *s) +{ + int ret, i = 0; + int len = s->io_q.idx; + + do { + ret = io_submit(s->ctx, len, s->io_q.iocbs); + } while (i++ < 3 && ret == -EAGAIN); + + /* empty io queue */ + s->io_q.idx = 0; + + if (ret < 0) { + i = 0; + } else { + i = ret; + } + + for (; i < len; i++) { + struct qemu_laiocb *laiocb = + container_of(s->io_q.iocbs[i], struct qemu_laiocb, iocb); + + laiocb->ret = (ret < 0) ? 
ret : -EIO; + qemu_laio_process_completion(s, laiocb); + } + return ret; +} + +static void ioq_enqueue(struct qemu_laio_state *s, struct iocb *iocb) +{ + unsigned int idx = s->io_q.idx; + + s->io_q.iocbs[idx++] = iocb; + s->io_q.idx = idx; + + /* submit immediately if queue is full */ + if (idx == s->io_q.size) { + ioq_submit(s); + } +} + +void laio_io_plug(BlockDriverState *bs, void *aio_ctx) +{ + struct qemu_laio_state *s = aio_ctx; + + s->io_q.plugged++; +} + +int laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) +{ + struct qemu_laio_state *s = aio_ctx; + int ret = 0; + + assert(s->io_q.plugged > 0 || !unplug); + + if (unplug && --s->io_q.plugged > 0) { + return 0; + } + + if (s->io_q.idx > 0) { + ret = ioq_submit(s); + } + + return ret; +} + BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque, int type) @@ -168,8 +253,13 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, } io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); - if (io_submit(s->ctx, 1, &iocbs) < 0) - goto out_free_aiocb; + if (!s->io_q.plugged) { + if (io_submit(s->ctx, 1, &iocbs) < 0) { + goto out_free_aiocb; + } + } else { + ioq_enqueue(s, iocbs); + } return &laiocb->common; out_free_aiocb: @@ -177,6 +267,20 @@ out_free_aiocb: return NULL; } +void laio_detach_aio_context(void *s_, AioContext *old_context) +{ + struct qemu_laio_state *s = s_; + + aio_set_event_notifier(old_context, &s->e, NULL); +} + +void laio_attach_aio_context(void *s_, AioContext *new_context) +{ + struct qemu_laio_state *s = s_; + + aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb); +} + void *laio_init(void) { struct qemu_laio_state *s; @@ -190,7 +294,7 @@ void *laio_init(void) goto out_close_efd; } - qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb); + ioq_init(&s->io_q); return s; @@ -200,3 +304,16 @@ out_free_state: 
g_free(s); return NULL; } + +void laio_cleanup(void *s_) +{ + struct qemu_laio_state *s = s_; + + event_notifier_cleanup(&s->e); + + if (io_destroy(s->ctx) != 0) { + fprintf(stderr, "%s: destroy AIO context %p failed\n", + __func__, &s->ctx); + } + g_free(s); +} diff --git a/block/mirror.c b/block/mirror.c index 0ef41f999..c7a655fc5 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -32,6 +32,12 @@ typedef struct MirrorBlockJob { RateLimit limit; BlockDriverState *target; BlockDriverState *base; + /* The name of the graph node to replace */ + char *replaces; + /* The BDS to replace */ + BlockDriverState *to_replace; + /* Used to block operations on the drive-mirror-replace target */ + Error *replace_blocker; bool is_none_mode; BlockdevOnError on_source_error, on_target_error; bool synced; @@ -118,7 +124,7 @@ static void mirror_write_complete(void *opaque, int ret) bdrv_set_dirty(source, op->sector_num, op->nb_sectors); action = mirror_error_action(s, false, -ret); - if (action == BDRV_ACTION_REPORT && s->ret >= 0) { + if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; } } @@ -135,7 +141,7 @@ static void mirror_read_complete(void *opaque, int ret) bdrv_set_dirty(source, op->sector_num, op->nb_sectors); action = mirror_error_action(s, true, -ret); - if (action == BDRV_ACTION_REPORT && s->ret >= 0) { + if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; } @@ -259,9 +265,11 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) next_sector = sector_num; while (nb_chunks-- > 0) { MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); + size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size; + QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); s->buf_free_count--; - qemu_iovec_add(&op->qiov, buf, s->granularity); + qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining)); /* Advance the HBitmapIter in parallel, so that we do not examine * the same sector twice. 
@@ -324,12 +332,21 @@ static void coroutine_fn mirror_run(void *opaque) } s->common.len = bdrv_getlength(bs); - if (s->common.len <= 0) { - block_job_completed(&s->common, s->common.len); - return; + if (s->common.len < 0) { + ret = s->common.len; + goto immediate_exit; + } else if (s->common.len == 0) { + /* Report BLOCK_JOB_READY and wait for complete. */ + block_job_event_ready(&s->common); + s->synced = true; + while (!block_job_is_cancelled(&s->common) && !s->should_complete) { + block_job_yield(&s->common); + } + s->common.cancelled = false; + goto immediate_exit; } - length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity; + length = DIV_ROUND_UP(s->common.len, s->granularity); s->in_flight_bitmap = bitmap_new(length); /* If we have no backing file yet in the destination, we cannot let @@ -339,7 +356,10 @@ static void coroutine_fn mirror_run(void *opaque) bdrv_get_backing_filename(s->target, backing_filename, sizeof(backing_filename)); if (backing_filename[0] && !s->target->backing_hd) { - bdrv_get_info(s->target, &bdi); + ret = bdrv_get_info(s->target, &bdi); + if (ret < 0) { + goto immediate_exit; + } if (s->granularity < bdi.cluster_size) { s->buf_size = MAX(s->buf_size, bdi.cluster_size); s->cow_bitmap = bitmap_new(length); @@ -412,7 +432,8 @@ static void coroutine_fn mirror_run(void *opaque) trace_mirror_before_flush(s); ret = bdrv_flush(s->target); if (ret < 0) { - if (mirror_error_action(s, false, -ret) == BDRV_ACTION_REPORT) { + if (mirror_error_action(s, false, -ret) == + BLOCK_ERROR_ACTION_REPORT) { goto immediate_exit; } } else { @@ -423,7 +444,7 @@ static void coroutine_fn mirror_run(void *opaque) */ s->common.offset = end * BDRV_SECTOR_SIZE; if (!s->synced) { - block_job_ready(&s->common); + block_job_event_ready(&s->common); s->synced = true; } @@ -487,18 +508,28 @@ immediate_exit: bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); bdrv_iostatus_disable(s->target); if (s->should_complete && ret == 0) { - if (bdrv_get_flags(s->target) 
!= bdrv_get_flags(s->common.bs)) { - bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL); + BlockDriverState *to_replace = s->common.bs; + if (s->to_replace) { + to_replace = s->to_replace; } - bdrv_swap(s->target, s->common.bs); + if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { + bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); + } + bdrv_swap(s->target, to_replace); if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { /* drop the bs loop chain formed by the swap: break the loop then * trigger the unref from the top one */ BlockDriverState *p = s->base->backing_hd; - s->base->backing_hd = NULL; + bdrv_set_backing_hd(s->base, NULL); bdrv_unref(p); } } + if (s->to_replace) { + bdrv_op_unblock_all(s->to_replace, s->replace_blocker); + error_free(s->replace_blocker); + bdrv_unref(s->to_replace); + } + g_free(s->replaces); bdrv_unref(s->target); block_job_completed(&s->common, ret); } @@ -537,6 +568,20 @@ static void mirror_complete(BlockJob *job, Error **errp) return; } + /* check the target bs is not blocked and block all operations on it */ + if (s->replaces) { + s->to_replace = check_to_replace_node(s->replaces, &local_err); + if (!s->to_replace) { + error_propagate(errp, local_err); + return; + } + + error_setg(&s->replace_blocker, + "block device is in use by block-job-complete"); + bdrv_op_block_all(s->to_replace, s->replace_blocker); + bdrv_ref(s->to_replace); + } + s->should_complete = true; block_job_resume(job); } @@ -559,14 +604,15 @@ static const BlockJobDriver commit_active_job_driver = { }; static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, int64_t granularity, - int64_t buf_size, - BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - BlockDriverCompletionFunc *cb, - void *opaque, Error **errp, - const BlockJobDriver *driver, - bool is_none_mode, BlockDriverState *base) + const char *replaces, + int64_t speed, int64_t granularity, + int64_t buf_size, + 
BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp, + const BlockJobDriver *driver, + bool is_none_mode, BlockDriverState *base) { MirrorBlockJob *s; @@ -597,6 +643,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, return; } + s->replaces = g_strdup(replaces); s->on_source_error = on_source_error; s->on_target_error = on_target_error; s->target = target; @@ -605,7 +652,10 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->granularity = granularity; s->buf_size = MAX(buf_size, granularity); - s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity); + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp); + if (!s->dirty_bitmap) { + return; + } bdrv_set_enable_write_cache(s->target, true); bdrv_set_on_error(s->target, on_target_error, on_target_error); bdrv_iostatus_enable(s->target); @@ -615,6 +665,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, } void mirror_start(BlockDriverState *bs, BlockDriverState *target, + const char *replaces, int64_t speed, int64_t granularity, int64_t buf_size, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, @@ -626,7 +677,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, is_none_mode = mode == MIRROR_SYNC_MODE_NONE; base = mode == MIRROR_SYNC_MODE_TOP ? 
bs->backing_hd : NULL; - mirror_start_job(bs, target, speed, granularity, buf_size, + mirror_start_job(bs, target, replaces, + speed, granularity, buf_size, on_source_error, on_target_error, cb, opaque, errp, &mirror_job_driver, is_none_mode, base); } @@ -674,10 +726,10 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base, } bdrv_ref(base); - mirror_start_job(bs, base, speed, 0, 0, + mirror_start_job(bs, base, NULL, speed, 0, 0, on_error, on_error, cb, opaque, &local_err, &commit_active_job_driver, false, base); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); goto error_restore_flags; } diff --git a/block/nbd-client.c b/block/nbd-client.c index 7d698cb61..6e1c97cad 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -49,7 +49,7 @@ static void nbd_teardown_connection(NbdClientSession *client) shutdown(client->sock, 2); nbd_recv_coroutines_enter_all(client); - qemu_aio_set_fd_handler(client->sock, NULL, NULL, NULL); + nbd_client_session_detach_aio_context(client); closesocket(client->sock); client->sock = -1; } @@ -103,11 +103,14 @@ static int nbd_co_send_request(NbdClientSession *s, struct nbd_request *request, QEMUIOVector *qiov, int offset) { + AioContext *aio_context; int rc, ret; qemu_co_mutex_lock(&s->send_mutex); s->send_coroutine = qemu_coroutine_self(); - qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, s); + aio_context = bdrv_get_aio_context(s->bs); + aio_set_fd_handler(aio_context, s->sock, + nbd_reply_ready, nbd_restart_write, s); if (qiov) { if (!s->is_unix) { socket_set_cork(s->sock, 1); @@ -126,7 +129,7 @@ static int nbd_co_send_request(NbdClientSession *s, } else { rc = nbd_send_request(s->sock, request); } - qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, s); + aio_set_fd_handler(aio_context, s->sock, nbd_reply_ready, NULL, s); s->send_coroutine = NULL; qemu_co_mutex_unlock(&s->send_mutex); return rc; @@ -335,6 +338,19 @@ int 
nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num, } +void nbd_client_session_detach_aio_context(NbdClientSession *client) +{ + aio_set_fd_handler(bdrv_get_aio_context(client->bs), client->sock, + NULL, NULL, NULL); +} + +void nbd_client_session_attach_aio_context(NbdClientSession *client, + AioContext *new_context) +{ + aio_set_fd_handler(new_context, client->sock, + nbd_reply_ready, NULL, client); +} + void nbd_client_session_close(NbdClientSession *client) { struct nbd_request request = { @@ -381,7 +397,7 @@ int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, /* Now that we're connected, set the socket to be non-blocking and * kick the reply mechanism. */ qemu_set_nonblock(sock); - qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, client); + nbd_client_session_attach_aio_context(client, bdrv_get_aio_context(bs)); logout("Established connection with NBD server\n"); return 0; diff --git a/block/nbd-client.h b/block/nbd-client.h index f2a63378b..cd478f3a9 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -47,4 +47,8 @@ int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num, int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); +void nbd_client_session_detach_aio_context(NbdClientSession *client); +void nbd_client_session_attach_aio_context(NbdClientSession *client, + AioContext *new_context); + #endif /* NBD_CLIENT_H */ diff --git a/block/nbd.c b/block/nbd.c index 55124239d..4eda0958d 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -175,7 +175,7 @@ static void nbd_parse_filename(const char *filename, QDict *options, InetSocketAddress *addr = NULL; addr = inet_parse(host_spec, errp); - if (error_is_set(errp)) { + if (!addr) { goto out; } @@ -323,46 +323,67 @@ static int64_t nbd_getlength(BlockDriverState *bs) return s->client.size; } +static void nbd_detach_aio_context(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; 
+ + nbd_client_session_detach_aio_context(&s->client); +} + +static void nbd_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVNBDState *s = bs->opaque; + + nbd_client_session_attach_aio_context(&s->client, new_context); +} + static BlockDriver bdrv_nbd = { - .format_name = "nbd", - .protocol_name = "nbd", - .instance_size = sizeof(BDRVNBDState), - .bdrv_parse_filename = nbd_parse_filename, - .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_close = nbd_close, - .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, - .bdrv_getlength = nbd_getlength, + .format_name = "nbd", + .protocol_name = "nbd", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, + .bdrv_file_open = nbd_open, + .bdrv_co_readv = nbd_co_readv, + .bdrv_co_writev = nbd_co_writev, + .bdrv_close = nbd_close, + .bdrv_co_flush_to_os = nbd_co_flush, + .bdrv_co_discard = nbd_co_discard, + .bdrv_getlength = nbd_getlength, + .bdrv_detach_aio_context = nbd_detach_aio_context, + .bdrv_attach_aio_context = nbd_attach_aio_context, }; static BlockDriver bdrv_nbd_tcp = { - .format_name = "nbd", - .protocol_name = "nbd+tcp", - .instance_size = sizeof(BDRVNBDState), - .bdrv_parse_filename = nbd_parse_filename, - .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_close = nbd_close, - .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, - .bdrv_getlength = nbd_getlength, + .format_name = "nbd", + .protocol_name = "nbd+tcp", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, + .bdrv_file_open = nbd_open, + .bdrv_co_readv = nbd_co_readv, + .bdrv_co_writev = nbd_co_writev, + .bdrv_close = nbd_close, + .bdrv_co_flush_to_os = nbd_co_flush, + .bdrv_co_discard = nbd_co_discard, + .bdrv_getlength = nbd_getlength, + .bdrv_detach_aio_context = nbd_detach_aio_context, + 
.bdrv_attach_aio_context = nbd_attach_aio_context, }; static BlockDriver bdrv_nbd_unix = { - .format_name = "nbd", - .protocol_name = "nbd+unix", - .instance_size = sizeof(BDRVNBDState), - .bdrv_parse_filename = nbd_parse_filename, - .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_close = nbd_close, - .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, - .bdrv_getlength = nbd_getlength, + .format_name = "nbd", + .protocol_name = "nbd+unix", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, + .bdrv_file_open = nbd_open, + .bdrv_co_readv = nbd_co_readv, + .bdrv_co_writev = nbd_co_writev, + .bdrv_close = nbd_close, + .bdrv_co_flush_to_os = nbd_co_flush, + .bdrv_co_discard = nbd_co_discard, + .bdrv_getlength = nbd_getlength, + .bdrv_detach_aio_context = nbd_detach_aio_context, + .bdrv_attach_aio_context = nbd_attach_aio_context, }; static void bdrv_nbd_init(void) diff --git a/block/nfs.c b/block/nfs.c index 98aa363e4..8439e0d38 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -40,6 +40,7 @@ typedef struct NFSClient { struct nfsfh *fh; int events; bool has_zero_init; + AioContext *aio_context; } NFSClient; typedef struct NFSRPC { @@ -49,6 +50,7 @@ typedef struct NFSRPC { struct stat *st; Coroutine *co; QEMUBH *bh; + NFSClient *client; } NFSRPC; static void nfs_process_read(void *arg); @@ -58,10 +60,11 @@ static void nfs_set_events(NFSClient *client) { int ev = nfs_which_events(client->context); if (ev != client->events) { - qemu_aio_set_fd_handler(nfs_get_fd(client->context), - (ev & POLLIN) ? nfs_process_read : NULL, - (ev & POLLOUT) ? nfs_process_write : NULL, - client); + aio_set_fd_handler(client->aio_context, + nfs_get_fd(client->context), + (ev & POLLIN) ? nfs_process_read : NULL, + (ev & POLLOUT) ? 
nfs_process_write : NULL, + client); } client->events = ev; @@ -84,13 +87,15 @@ static void nfs_process_write(void *arg) static void nfs_co_init_task(NFSClient *client, NFSRPC *task) { *task = (NFSRPC) { - .co = qemu_coroutine_self(), + .co = qemu_coroutine_self(), + .client = client, }; } static void nfs_co_generic_bh_cb(void *opaque) { NFSRPC *task = opaque; + task->complete = 1; qemu_bh_delete(task->bh); qemu_coroutine_enter(task->co, NULL); } @@ -100,7 +105,6 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, void *private_data) { NFSRPC *task = private_data; - task->complete = 1; task->ret = ret; if (task->ret > 0 && task->iov) { if (task->ret <= task->iov->size) { @@ -116,8 +120,11 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, error_report("NFS Error: %s", nfs_get_error(nfs)); } if (task->co) { - task->bh = qemu_bh_new(nfs_co_generic_bh_cb, task); + task->bh = aio_bh_new(task->client->aio_context, + nfs_co_generic_bh_cb, task); qemu_bh_schedule(task->bh); + } else { + task->complete = 1; } } @@ -224,13 +231,34 @@ static QemuOptsList runtime_opts = { }, }; +static void nfs_detach_aio_context(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + + aio_set_fd_handler(client->aio_context, + nfs_get_fd(client->context), + NULL, NULL, NULL); + client->events = 0; +} + +static void nfs_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + NFSClient *client = bs->opaque; + + client->aio_context = new_context; + nfs_set_events(client); +} + static void nfs_client_close(NFSClient *client) { if (client->context) { if (client->fh) { nfs_close(client->context, client->fh); } - qemu_aio_set_fd_handler(nfs_get_fd(client->context), NULL, NULL, NULL); + aio_set_fd_handler(client->aio_context, + nfs_get_fd(client->context), + NULL, NULL, NULL); nfs_destroy_context(client->context); } memset(client, 0, sizeof(NFSClient)); @@ -256,6 +284,10 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename, 
error_setg(errp, "Invalid URL specified"); goto fail; } + if (!uri->server) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } strp = strrchr(uri->path, '/'); if (strp == NULL) { error_setg(errp, "Invalid URL specified"); @@ -272,17 +304,27 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename, qp = query_params_parse(uri->query); for (i = 0; i < qp->n; i++) { + unsigned long long val; if (!qp->p[i].value) { error_setg(errp, "Value for NFS parameter expected: %s", qp->p[i].name); goto fail; } - if (!strncmp(qp->p[i].name, "uid", 3)) { - nfs_set_uid(client->context, atoi(qp->p[i].value)); - } else if (!strncmp(qp->p[i].name, "gid", 3)) { - nfs_set_gid(client->context, atoi(qp->p[i].value)); - } else if (!strncmp(qp->p[i].name, "tcp-syncnt", 10)) { - nfs_set_tcp_syncnt(client->context, atoi(qp->p[i].value)); + if (parse_uint_full(qp->p[i].value, &val, 0)) { + error_setg(errp, "Illegal value for NFS parameter: %s", + qp->p[i].name); + goto fail; + } + if (!strcmp(qp->p[i].name, "uid")) { + nfs_set_uid(client->context, val); + } else if (!strcmp(qp->p[i].name, "gid")) { + nfs_set_gid(client->context, val); + } else if (!strcmp(qp->p[i].name, "tcp-syncnt")) { + nfs_set_tcp_syncnt(client->context, val); +#ifdef LIBNFS_FEATURE_READAHEAD + } else if (!strcmp(qp->p[i].name, "readahead")) { + nfs_set_readahead(client->context, val); +#endif } else { error_setg(errp, "Unknown NFS parameter name: %s", qp->p[i].name); @@ -341,9 +383,11 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, QemuOpts *opts; Error *local_err = NULL; + client->aio_context = bdrv_get_aio_context(bs); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); return -EINVAL; } @@ -357,20 +401,16 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, return 0; } -static int 
nfs_file_create(const char *url, QEMUOptionParameter *options, - Error **errp) +static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp) { int ret = 0; int64_t total_size = 0; NFSClient *client = g_malloc0(sizeof(NFSClient)); + client->aio_context = qemu_get_aio_context(); + /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, "size")) { - total_size = options->value.n; - } - options++; - } + total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); ret = nfs_client_open(client, url, O_CREAT, errp); if (ret < 0) { @@ -403,7 +443,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) while (!task.complete) { nfs_set_events(client); - qemu_aio_wait(); + aio_poll(client->aio_context, true); } return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize); @@ -416,22 +456,25 @@ static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) } static BlockDriver bdrv_nfs = { - .format_name = "nfs", - .protocol_name = "nfs", - - .instance_size = sizeof(NFSClient), - .bdrv_needs_filename = true, - .bdrv_has_zero_init = nfs_has_zero_init, - .bdrv_get_allocated_file_size = nfs_get_allocated_file_size, - .bdrv_truncate = nfs_file_truncate, - - .bdrv_file_open = nfs_file_open, - .bdrv_close = nfs_file_close, - .bdrv_create = nfs_file_create, - - .bdrv_co_readv = nfs_co_readv, - .bdrv_co_writev = nfs_co_writev, - .bdrv_co_flush_to_disk = nfs_co_flush, + .format_name = "nfs", + .protocol_name = "nfs", + + .instance_size = sizeof(NFSClient), + .bdrv_needs_filename = true, + .bdrv_has_zero_init = nfs_has_zero_init, + .bdrv_get_allocated_file_size = nfs_get_allocated_file_size, + .bdrv_truncate = nfs_file_truncate, + + .bdrv_file_open = nfs_file_open, + .bdrv_close = nfs_file_close, + .bdrv_create = nfs_file_create, + + .bdrv_co_readv = nfs_co_readv, + .bdrv_co_writev = nfs_co_writev, + .bdrv_co_flush_to_disk = nfs_co_flush, + + .bdrv_detach_aio_context = nfs_detach_aio_context, + .bdrv_attach_aio_context = 
nfs_attach_aio_context, }; static void nfs_block_init(void) diff --git a/block/qapi.c b/block/qapi.c index 8f2b4dbe7..f44f6b401 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -50,6 +50,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) } info->backing_file_depth = bdrv_get_backing_file_depth(bs); + info->detect_zeroes = bs->detect_zeroes; if (bs->io_limits_enabled) { ThrottleConfig cfg; @@ -292,7 +293,7 @@ void bdrv_query_info(BlockDriverState *bs, qapi_free_BlockInfo(info); } -BlockStats *bdrv_query_stats(const BlockDriverState *bs) +static BlockStats *bdrv_query_stats(const BlockDriverState *bs) { BlockStats *s; @@ -359,7 +360,11 @@ BlockStatsList *qmp_query_blockstats(Error **errp) while ((bs = bdrv_next(bs))) { BlockStatsList *info = g_malloc0(sizeof(*info)); + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); info->value = bdrv_query_stats(bs); + aio_context_release(ctx); *p_next = info; p_next = &info->next; @@ -474,6 +479,7 @@ static void dump_qobject(fprintf_function func_fprintf, void *f, case QTYPE_QERROR: { QString *value = qerror_human((QError *)obj); func_fprintf(f, "%s", qstring_get_str(value)); + QDECREF(value); break; } case QTYPE_NONE: @@ -532,12 +538,11 @@ static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, ImageInfoSpecific *info_spec) { - Error *local_err = NULL; QmpOutputVisitor *ov = qmp_output_visitor_new(); QObject *obj, *data; visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), &info_spec, NULL, - &local_err); + &error_abort); obj = qmp_output_get_qobject(ov); assert(qobject_type(obj) == QTYPE_QDICT); data = qdict_get(qobject_to_qdict(obj), "data"); diff --git a/block/qcow.c b/block/qcow.c index d5a7d5fd1..a874056cf 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -48,9 +48,10 @@ typedef struct QCowHeader { uint64_t size; /* in bytes */ uint8_t cluster_bits; uint8_t l2_bits; + uint16_t padding; 
uint32_t crypt_method; uint64_t l1_table_offset; -} QCowHeader; +} QEMU_PACKED QCowHeader; #define L2_CACHE_SIZE 16 @@ -60,7 +61,7 @@ typedef struct BDRVQcowState { int cluster_sectors; int l2_bits; int l2_size; - int l1_size; + unsigned int l1_size; uint64_t cluster_offset_mask; uint64_t l1_table_offset; uint64_t *l1_table; @@ -96,7 +97,8 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVQcowState *s = bs->opaque; - int len, i, shift, ret; + unsigned int len, i, shift; + int ret; QCowHeader header; ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); @@ -119,18 +121,33 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, } if (header.version != QCOW_VERSION) { char version[64]; - snprintf(version, sizeof(version), "QCOW version %d", header.version); + snprintf(version, sizeof(version), "QCOW version %" PRIu32, + header.version); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "qcow", version); ret = -ENOTSUP; goto fail; } - if (header.size <= 1 || header.cluster_bits < 9) { - error_setg(errp, "invalid value in qcow header"); + if (header.size <= 1) { + error_setg(errp, "Image size is too small (must be at least 2 bytes)"); + ret = -EINVAL; + goto fail; + } + if (header.cluster_bits < 9 || header.cluster_bits > 16) { + error_setg(errp, "Cluster size must be between 512 and 64k"); ret = -EINVAL; goto fail; } + + /* l2_bits specifies number of entries; storing a uint64_t in each entry, + * so bytes = num_entries << 3. 
*/ + if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) { + error_setg(errp, "L2 table size must be between 512 and 64k"); + ret = -EINVAL; + goto fail; + } + if (header.crypt_method > QCOW_CRYPT_AES) { error_setg(errp, "invalid encryption method in qcow header"); ret = -EINVAL; @@ -150,7 +167,19 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, /* read the level 1 table */ shift = s->cluster_bits + s->l2_bits; - s->l1_size = (header.size + (1LL << shift) - 1) >> shift; + if (header.size > UINT64_MAX - (1LL << shift)) { + error_setg(errp, "Image too large"); + ret = -EINVAL; + goto fail; + } else { + uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift; + if (l1_size > INT_MAX / sizeof(uint64_t)) { + error_setg(errp, "Image too large"); + ret = -EINVAL; + goto fail; + } + s->l1_size = l1_size; + } s->l1_table_offset = header.l1_table_offset; s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); @@ -174,7 +203,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, if (header.backing_file_offset != 0) { len = header.backing_file_size; if (len > 1023) { - len = 1023; + error_setg(errp, "Backing file name too long"); + ret = -EINVAL; + goto fail; } ret = bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len); @@ -662,35 +693,29 @@ static void qcow_close(BlockDriverState *bs) error_free(s->migration_blocker); } -static int qcow_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) { int header_size, backing_filename_len, l1_size, shift, i; QCowHeader header; uint8_t *tmp; int64_t total_size = 0; - const char *backing_file = NULL; + char *backing_file = NULL; int flags = 0; Error *local_err = NULL; int ret; BlockDriverState *qcow_bs; /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - total_size = options->value.n / 512; - } else if 
(!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { - backing_file = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { - flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; - } - options++; + total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512; + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { + flags |= BLOCK_FLAG_ENCRYPT; } - ret = bdrv_create_file(filename, options, &local_err); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { error_propagate(errp, local_err); - return ret; + goto cleanup; } qcow_bs = NULL; @@ -698,7 +723,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options, BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); if (ret < 0) { error_propagate(errp, local_err); - return ret; + goto cleanup; } ret = bdrv_truncate(qcow_bs, 0); @@ -769,6 +794,8 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options, ret = 0; exit: bdrv_unref(qcow_bs); +cleanup: + g_free(backing_file); return ret; } @@ -881,24 +908,28 @@ static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } - -static QEMUOptionParameter qcow_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_ENCRYPT, - .type = OPT_FLAG, - .help = "Encrypt the image" - }, - { NULL } +static QemuOptsList qcow_create_opts = { + .name = "qcow-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qcow_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = QEMU_OPT_BOOL, + .help = "Encrypt the image", + .def_value_str = "off" + }, + { 
/* end of list */ } + } }; static BlockDriver bdrv_qcow = { @@ -907,9 +938,10 @@ static BlockDriver bdrv_qcow = { .bdrv_probe = qcow_probe, .bdrv_open = qcow_open, .bdrv_close = qcow_close, - .bdrv_reopen_prepare = qcow_reopen_prepare, - .bdrv_create = qcow_create, + .bdrv_reopen_prepare = qcow_reopen_prepare, + .bdrv_create = qcow_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, + .supports_backing = true, .bdrv_co_readv = qcow_co_readv, .bdrv_co_writev = qcow_co_writev, @@ -920,7 +952,7 @@ static BlockDriver bdrv_qcow = { .bdrv_write_compressed = qcow_write_compressed, .bdrv_get_info = qcow_get_info, - .create_options = qcow_create_options, + .create_opts = &qcow_create_opts, }; static void bdrv_qcow_init(void) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 331ab0802..4208dc08b 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -42,6 +42,13 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, if (min_size <= s->l1_size) return 0; + /* Do a sanity check on min_size before trying to calculate new_l1_size + * (this prevents overflows during the while loop for the calculation of + * new_l1_size) */ + if (min_size > INT_MAX / sizeof(uint64_t)) { + return -EFBIG; + } + if (exact_size) { new_l1_size = min_size; } else { @@ -372,7 +379,8 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); if (!bs->drv) { - return -ENOMEDIUM; + ret = -ENOMEDIUM; + goto out; } /* Call .bdrv_co_readv() directly instead of using the public block-layer @@ -1360,9 +1368,9 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); for (i = 0; i < nb_clusters; i++) { - uint64_t old_offset; + uint64_t old_l2_entry; - old_offset = be64_to_cpu(l2_table[l2_index + i]); + old_l2_entry = be64_to_cpu(l2_table[l2_index + i]); /* * Make sure that a discarded area reads back as zeroes for v3 images @@ -1373,12 +1381,22 @@ static int 
discard_single_l2(BlockDriverState *bs, uint64_t offset, * TODO We might want to use bdrv_get_block_status(bs) here, but we're * holding s->lock, so that doesn't work today. */ - if (old_offset & QCOW_OFLAG_ZERO) { - continue; - } + switch (qcow2_get_cluster_type(old_l2_entry)) { + case QCOW2_CLUSTER_UNALLOCATED: + if (!bs->backing_hd) { + continue; + } + break; - if ((old_offset & L2E_OFFSET_MASK) == 0 && !bs->backing_hd) { - continue; + case QCOW2_CLUSTER_ZERO: + continue; + + case QCOW2_CLUSTER_NORMAL: + case QCOW2_CLUSTER_COMPRESSED: + break; + + default: + abort(); } /* First remove L2 entries */ @@ -1390,7 +1408,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, } /* Then decrease the refcount */ - qcow2_free_any_clusters(bs, old_offset, 1, type); + qcow2_free_any_clusters(bs, old_l2_entry, 1, type); } ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index a37ee4501..cc6cf743d 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -27,6 +27,7 @@ #include "block/qcow2.h" #include "qemu/range.h" #include "qapi/qmp/types.h" +#include "qapi-event.h" static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, @@ -653,6 +654,15 @@ retry: goto retry; } } + + /* Make sure that all offsets in the "allocated" range are representable + * in an int64_t */ + if (s->free_cluster_index > 0 && + s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) + { + return -EFBIG; + } + #ifdef DEBUG_ALLOC2 fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", size, @@ -1480,6 +1490,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, int ret; size = bdrv_getlength(bs->file); + if (size < 0) { + res->check_errors++; + return size; + } + nb_clusters = size_to_clusters(s, size); if (nb_clusters > INT_MAX) { res->check_errors++; @@ -1793,7 +1808,6 @@ 
int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, } else if (ret > 0) { int metadata_ol_bitnr = ffs(ret) - 1; char *message; - QObject *data; assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); @@ -1802,12 +1816,14 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, metadata_ol_names[metadata_ol_bitnr]); message = g_strdup_printf("Prevented %s overwrite", metadata_ol_names[metadata_ol_bitnr]); - data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %" - PRId64 ", 'size': %" PRId64 " }", bs->device_name, message, - offset, size); - monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data); + qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), + message, + true, + offset, + true, + size, + &error_abort); g_free(message); - qobject_decref(data); qcow2_mark_corrupt(bs); bs->drv = NULL; /* make BDS unusable */ diff --git a/block/qcow2.c b/block/qcow2.c index e903d971c..1e3ab6bd0 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -31,6 +31,7 @@ #include "qapi/qmp/qerror.h" #include "qapi/qmp/qbool.h" #include "trace.h" +#include "qemu/option_int.h" /* Differences with QCOW: @@ -124,8 +125,9 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, case QCOW2_EXT_MAGIC_BACKING_FORMAT: if (ext.len >= sizeof(bs->backing_format)) { - error_setg(errp, "ERROR: ext_backing_format: len=%u too large" - " (>=%zu)", ext.len, sizeof(bs->backing_format)); + error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32 + " too large (>=%zu)", ext.len, + sizeof(bs->backing_format)); return 2; } ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len); @@ -208,20 +210,31 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, static void report_unsupported_feature(BlockDriverState *bs, Error **errp, Qcow2Feature *table, uint64_t mask) { + char *features = g_strdup(""); + char *old; + while (table && table->name[0] != '\0') { if (table->type == 
QCOW2_FEAT_TYPE_INCOMPATIBLE) { - if (mask & (1 << table->bit)) { - report_unsupported(bs, errp, "%.46s", table->name); - mask &= ~(1 << table->bit); + if (mask & (1ULL << table->bit)) { + old = features; + features = g_strdup_printf("%s%s%.46s", old, *old ? ", " : "", + table->name); + g_free(old); + mask &= ~(1ULL << table->bit); } } table++; } if (mask) { - report_unsupported(bs, errp, "Unknown incompatible feature: %" PRIx64, - mask); + old = features; + features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64, + old, *old ? ", " : "", mask); + g_free(old); } + + report_unsupported(bs, errp, "%s", features); + g_free(features); } /* @@ -483,7 +496,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } if (header.version < 2 || header.version > 3) { - report_unsupported(bs, errp, "QCOW version %d", header.version); + report_unsupported(bs, errp, "QCOW version %" PRIu32, header.version); ret = -ENOTSUP; goto fail; } @@ -493,7 +506,8 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, /* Initialise cluster size */ if (header.cluster_bits < MIN_CLUSTER_BITS || header.cluster_bits > MAX_CLUSTER_BITS) { - error_setg(errp, "Unsupported cluster size: 2^%i", header.cluster_bits); + error_setg(errp, "Unsupported cluster size: 2^%" PRIu32, + header.cluster_bits); ret = -EINVAL; goto fail; } @@ -591,7 +605,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, s->refcount_order = header.refcount_order; if (header.crypt_method > QCOW_CRYPT_AES) { - error_setg(errp, "Unsupported encryption method: %i", + error_setg(errp, "Unsupported encryption method: %" PRIu32, header.crypt_method); ret = -EINVAL; goto fail; @@ -852,13 +866,11 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, return ret; } -static int qcow2_refresh_limits(BlockDriverState *bs) +static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVQcowState *s = bs->opaque; 
bs->bl.write_zeroes_alignment = s->cluster_sectors; - - return 0; } static int qcow2_set_key(BlockDriverState *bs, const char *key) @@ -1017,11 +1029,20 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, sector_num, cur_nr_sectors); if (n1 > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, hd_qiov.niov); + qemu_iovec_concat(&local_qiov, &hd_qiov, 0, + n1 * BDRV_SECTOR_SIZE); + BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(&s->lock); ret = bdrv_co_readv(bs->backing_hd, sector_num, - n1, &hd_qiov); + n1, &local_qiov); qemu_co_mutex_lock(&s->lock); + + qemu_iovec_destroy(&local_qiov); + if (ret < 0) { goto fail; } @@ -1306,6 +1327,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) options = qdict_clone_shallow(bs->options); ret = qcow2_open(bs, options, flags, &local_err); + QDECREF(options); if (local_err) { error_setg(errp, "Could not reopen qcow2 layer: %s", error_get_pretty(local_err)); @@ -1316,8 +1338,6 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) return; } - QDECREF(options); - if (crypt_method) { s->crypt_method = crypt_method; memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key)); @@ -1593,7 +1613,7 @@ static int preallocate(BlockDriverState *bs) static int qcow2_create2(const char *filename, int64_t total_size, const char *backing_file, const char *backing_format, int flags, size_t cluster_size, int prealloc, - QEMUOptionParameter *options, int version, + QemuOpts *opts, int version, Error **errp) { /* Calculate cluster_bits */ @@ -1625,7 +1645,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, Error *local_err = NULL; int ret; - ret = bdrv_create_file(filename, options, &local_err); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { error_propagate(errp, local_err); return ret; @@ -1761,11 +1781,11 @@ out: return ret; } -static 
int qcow2_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) { - const char *backing_file = NULL; - const char *backing_fmt = NULL; + char *backing_file = NULL; + char *backing_fmt = NULL; + char *buf = NULL; uint64_t sectors = 0; int flags = 0; size_t cluster_size = DEFAULT_CLUSTER_SIZE; @@ -1775,64 +1795,66 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options, int ret; /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - sectors = options->value.n / 512; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { - backing_file = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { - backing_fmt = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { - flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; - } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { - if (options->value.n) { - cluster_size = options->value.n; - } - } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { - if (!options->value.s || !strcmp(options->value.s, "off")) { - prealloc = 0; - } else if (!strcmp(options->value.s, "metadata")) { - prealloc = 1; - } else { - error_setg(errp, "Invalid preallocation mode: '%s'", - options->value.s); - return -EINVAL; - } - } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { - if (!options->value.s) { - /* keep the default */ - } else if (!strcmp(options->value.s, "0.10")) { - version = 2; - } else if (!strcmp(options->value.s, "1.1")) { - version = 3; - } else { - error_setg(errp, "Invalid compatibility level: '%s'", - options->value.s); - return -EINVAL; - } - } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { - flags |= options->value.n ? 
BLOCK_FLAG_LAZY_REFCOUNTS : 0; - } - options++; + sectors = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 512; + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { + flags |= BLOCK_FLAG_ENCRYPT; + } + cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, + DEFAULT_CLUSTER_SIZE); + buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + if (!buf || !strcmp(buf, "off")) { + prealloc = 0; + } else if (!strcmp(buf, "metadata")) { + prealloc = 1; + } else { + error_setg(errp, "Invalid preallocation mode: '%s'", buf); + ret = -EINVAL; + goto finish; + } + g_free(buf); + buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL); + if (!buf) { + /* keep the default */ + } else if (!strcmp(buf, "0.10")) { + version = 2; + } else if (!strcmp(buf, "1.1")) { + version = 3; + } else { + error_setg(errp, "Invalid compatibility level: '%s'", buf); + ret = -EINVAL; + goto finish; + } + + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_LAZY_REFCOUNTS, false)) { + flags |= BLOCK_FLAG_LAZY_REFCOUNTS; } if (backing_file && prealloc) { error_setg(errp, "Backing file and preallocation cannot be used at " "the same time"); - return -EINVAL; + ret = -EINVAL; + goto finish; } if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { error_setg(errp, "Lazy refcounts only supported with compatibility " "level 1.1 and above (use compat=1.1 or greater)"); - return -EINVAL; + ret = -EINVAL; + goto finish; } ret = qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, - cluster_size, prealloc, options, version, &local_err); + cluster_size, prealloc, opts, version, &local_err); if (local_err) { error_propagate(errp, local_err); } + +finish: + g_free(backing_file); + g_free(backing_fmt); + g_free(buf); return ret; } @@ -2197,64 +2219,72 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version) return 0; } -static int 
qcow2_amend_options(BlockDriverState *bs, - QEMUOptionParameter *options) +static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts) { BDRVQcowState *s = bs->opaque; int old_version = s->qcow_version, new_version = old_version; uint64_t new_size = 0; const char *backing_file = NULL, *backing_format = NULL; bool lazy_refcounts = s->use_lazy_refcounts; + const char *compat = NULL; + uint64_t cluster_size = s->cluster_size; + bool encrypt; int ret; - int i; + QemuOptDesc *desc = opts->list->desc; - for (i = 0; options[i].name; i++) - { - if (!options[i].assigned) { + while (desc && desc->name) { + if (!qemu_opt_find(opts, desc->name)) { /* only change explicitly defined options */ + desc++; continue; } - if (!strcmp(options[i].name, "compat")) { - if (!options[i].value.s) { + if (!strcmp(desc->name, "compat")) { + compat = qemu_opt_get(opts, "compat"); + if (!compat) { /* preserve default */ - } else if (!strcmp(options[i].value.s, "0.10")) { + } else if (!strcmp(compat, "0.10")) { new_version = 2; - } else if (!strcmp(options[i].value.s, "1.1")) { + } else if (!strcmp(compat, "1.1")) { new_version = 3; } else { - fprintf(stderr, "Unknown compatibility level %s.\n", - options[i].value.s); + fprintf(stderr, "Unknown compatibility level %s.\n", compat); return -EINVAL; } - } else if (!strcmp(options[i].name, "preallocation")) { + } else if (!strcmp(desc->name, "preallocation")) { fprintf(stderr, "Cannot change preallocation mode.\n"); return -ENOTSUP; - } else if (!strcmp(options[i].name, "size")) { - new_size = options[i].value.n; - } else if (!strcmp(options[i].name, "backing_file")) { - backing_file = options[i].value.s; - } else if (!strcmp(options[i].name, "backing_fmt")) { - backing_format = options[i].value.s; - } else if (!strcmp(options[i].name, "encryption")) { - if ((options[i].value.n != !!s->crypt_method)) { + } else if (!strcmp(desc->name, "size")) { + new_size = qemu_opt_get_size(opts, "size", 0); + } else if (!strcmp(desc->name, 
"backing_file")) { + backing_file = qemu_opt_get(opts, "backing_file"); + } else if (!strcmp(desc->name, "backing_fmt")) { + backing_format = qemu_opt_get(opts, "backing_fmt"); + } else if (!strcmp(desc->name, "encryption")) { + encrypt = qemu_opt_get_bool(opts, "encryption", s->crypt_method); + if (encrypt != !!s->crypt_method) { fprintf(stderr, "Changing the encryption flag is not " "supported.\n"); return -ENOTSUP; } - } else if (!strcmp(options[i].name, "cluster_size")) { - if (options[i].value.n != s->cluster_size) { + } else if (!strcmp(desc->name, "cluster_size")) { + cluster_size = qemu_opt_get_size(opts, "cluster_size", + cluster_size); + if (cluster_size != s->cluster_size) { fprintf(stderr, "Changing the cluster size is not " "supported.\n"); return -ENOTSUP; } - } else if (!strcmp(options[i].name, "lazy_refcounts")) { - lazy_refcounts = options[i].value.n; + } else if (!strcmp(desc->name, "lazy_refcounts")) { + lazy_refcounts = qemu_opt_get_bool(opts, "lazy_refcounts", + lazy_refcounts); } else { /* if this assertion fails, this probably means a new option was * added without having it covered here */ assert(false); } + + desc++; } if (new_version != old_version) { @@ -2323,49 +2353,55 @@ static int qcow2_amend_options(BlockDriverState *bs, return 0; } -static QEMUOptionParameter qcow2_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_COMPAT_LEVEL, - .type = OPT_STRING, - .help = "Compatibility level (0.10 or 1.1)" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_BACKING_FMT, - .type = OPT_STRING, - .help = "Image format of the base image" - }, - { - .name = BLOCK_OPT_ENCRYPT, - .type = OPT_FLAG, - .help = "Encrypt the image" - }, - { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = OPT_SIZE, - .help = "qcow2 cluster size", - .value = { .n = DEFAULT_CLUSTER_SIZE }, - }, - { - .name = 
BLOCK_OPT_PREALLOC, - .type = OPT_STRING, - .help = "Preallocation mode (allowed values: off, metadata)" - }, - { - .name = BLOCK_OPT_LAZY_REFCOUNTS, - .type = OPT_FLAG, - .help = "Postpone refcount updates", - }, - { NULL } +static QemuOptsList qcow2_create_opts = { + .name = "qcow2-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_COMPAT_LEVEL, + .type = QEMU_OPT_STRING, + .help = "Compatibility level (0.10 or 1.1)" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_BACKING_FMT, + .type = QEMU_OPT_STRING, + .help = "Image format of the base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = QEMU_OPT_BOOL, + .help = "Encrypt the image", + .def_value_str = "off" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "qcow2 cluster size", + .def_value_str = stringify(DEFAULT_CLUSTER_SIZE) + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode (allowed values: off, metadata)" + }, + { + .name = BLOCK_OPT_LAZY_REFCOUNTS, + .type = QEMU_OPT_BOOL, + .help = "Postpone refcount updates", + .def_value_str = "off" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_qcow2 = { @@ -2393,21 +2429,22 @@ static BlockDriver bdrv_qcow2 = { .bdrv_snapshot_goto = qcow2_snapshot_goto, .bdrv_snapshot_delete = qcow2_snapshot_delete, .bdrv_snapshot_list = qcow2_snapshot_list, - .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, - .bdrv_get_info = qcow2_get_info, + .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, + .bdrv_get_info = qcow2_get_info, .bdrv_get_specific_info = qcow2_get_specific_info, .bdrv_save_vmstate = qcow2_save_vmstate, .bdrv_load_vmstate = qcow2_load_vmstate, + .supports_backing = true, .bdrv_change_backing_file = qcow2_change_backing_file, .bdrv_refresh_limits = 
qcow2_refresh_limits, .bdrv_invalidate_cache = qcow2_invalidate_cache, - .create_options = qcow2_create_options, - .bdrv_check = qcow2_check, - .bdrv_amend_options = qcow2_amend_options, + .create_opts = &qcow2_create_opts, + .bdrv_check = qcow2_check, + .bdrv_amend_options = qcow2_amend_options, }; static void bdrv_qcow2_init(void) diff --git a/block/qed-table.c b/block/qed-table.c index 76d2dcccf..f61107a1c 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -173,7 +173,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s) qed_read_table(s, s->header.l1_table_offset, s->l1_table, qed_sync_cb, &ret); while (ret == -EINPROGRESS) { - qemu_aio_wait(); + aio_poll(bdrv_get_aio_context(s->bs), true); } return ret; @@ -194,7 +194,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, qed_write_l1_table(s, index, n, qed_sync_cb, &ret); while (ret == -EINPROGRESS) { - qemu_aio_wait(); + aio_poll(bdrv_get_aio_context(s->bs), true); } return ret; @@ -267,7 +267,7 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); while (ret == -EINPROGRESS) { - qemu_aio_wait(); + aio_poll(bdrv_get_aio_context(s->bs), true); } return ret; @@ -289,7 +289,7 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); while (ret == -EINPROGRESS) { - qemu_aio_wait(); + aio_poll(bdrv_get_aio_context(s->bs), true); } return ret; diff --git a/block/qed.c b/block/qed.c index 3bd9db9c8..794483218 100644 --- a/block/qed.c +++ b/block/qed.c @@ -21,12 +21,13 @@ static void qed_aio_cancel(BlockDriverAIOCB *blockacb) { QEDAIOCB *acb = (QEDAIOCB *)blockacb; + AioContext *aio_context = bdrv_get_aio_context(blockacb->bs); bool finished = false; /* Wait for the request to finish */ acb->finished = &finished; while (!finished) { - qemu_aio_wait(); + aio_poll(aio_context, true); } } @@ -373,6 +374,27 @@ static void 
bdrv_qed_rebind(BlockDriverState *bs) s->bs = bs; } +static void bdrv_qed_detach_aio_context(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + qed_cancel_need_check_timer(s); + timer_free(s->need_check_timer); +} + +static void bdrv_qed_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVQEDState *s = bs->opaque; + + s->need_check_timer = aio_timer_new(new_context, + QEMU_CLOCK_VIRTUAL, SCALE_NS, + qed_need_check_timer_cb, s); + if (s->header.features & QED_F_NEED_CHECK) { + qed_start_need_check_timer(s); + } +} + static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -496,8 +518,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, } } - s->need_check_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, - qed_need_check_timer_cb, s); + bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs)); out: if (ret) { @@ -507,13 +528,11 @@ out: return ret; } -static int bdrv_qed_refresh_limits(BlockDriverState *bs) +static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVQEDState *s = bs->opaque; bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; - - return 0; } /* We have nothing to do for QED reopen, stubs just return @@ -528,8 +547,7 @@ static void bdrv_qed_close(BlockDriverState *bs) { BDRVQEDState *s = bs->opaque; - qed_cancel_need_check_timer(s); - timer_free(s->need_check_timer); + bdrv_qed_detach_aio_context(bs); /* Ensure writes reach stable storage */ bdrv_flush(bs->file); @@ -547,7 +565,7 @@ static void bdrv_qed_close(BlockDriverState *bs) static int qed_create(const char *filename, uint32_t cluster_size, uint64_t image_size, uint32_t table_size, const char *backing_file, const char *backing_fmt, - Error **errp) + QemuOpts *opts, Error **errp) { QEDHeader header = { .magic = QED_MAGIC, @@ -566,7 +584,7 @@ static int qed_create(const char *filename, uint32_t cluster_size, int ret = 0; BlockDriverState *bs; - ret = 
bdrv_create_file(filename, NULL, &local_err); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { error_propagate(errp, local_err); return ret; @@ -621,53 +639,53 @@ out: return ret; } -static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp) { uint64_t image_size = 0; uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; uint32_t table_size = QED_DEFAULT_TABLE_SIZE; - const char *backing_file = NULL; - const char *backing_fmt = NULL; - - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - image_size = options->value.n; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { - backing_file = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { - backing_fmt = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { - if (options->value.n) { - cluster_size = options->value.n; - } - } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) { - if (options->value.n) { - table_size = options->value.n; - } - } - options++; - } + char *backing_file = NULL; + char *backing_fmt = NULL; + int ret; + + image_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); + cluster_size = qemu_opt_get_size_del(opts, + BLOCK_OPT_CLUSTER_SIZE, + QED_DEFAULT_CLUSTER_SIZE); + table_size = qemu_opt_get_size_del(opts, BLOCK_OPT_TABLE_SIZE, + QED_DEFAULT_TABLE_SIZE); if (!qed_is_cluster_size_valid(cluster_size)) { - fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n", - QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); - return -EINVAL; + error_setg(errp, "QED cluster size must be within range [%u, %u] " + "and power of 2", + QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); + ret = -EINVAL; + goto finish; } if 
(!qed_is_table_size_valid(table_size)) { - fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n", - QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); - return -EINVAL; + error_setg(errp, "QED table size must be within range [%u, %u] " + "and power of 2", + QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); + ret = -EINVAL; + goto finish; } if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { - fprintf(stderr, "QED image size must be a non-zero multiple of " - "cluster size and less than %" PRIu64 " bytes\n", - qed_max_image_size(cluster_size, table_size)); - return -EINVAL; + error_setg(errp, "QED image size must be a non-zero multiple of " + "cluster size and less than %" PRIu64 " bytes", + qed_max_image_size(cluster_size, table_size)); + ret = -EINVAL; + goto finish; } - return qed_create(filename, cluster_size, image_size, table_size, - backing_file, backing_fmt, errp); + ret = qed_create(filename, cluster_size, image_size, table_size, + backing_file, backing_fmt, opts, errp); + +finish: + g_free(backing_file); + g_free(backing_fmt); + return ret; } typedef struct { @@ -741,17 +759,19 @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb) /** * Read from the backing file or zero-fill if no backing file * - * @s: QED state - * @pos: Byte position in device - * @qiov: Destination I/O vector - * @cb: Completion function - * @opaque: User data for completion function + * @s: QED state + * @pos: Byte position in device + * @qiov: Destination I/O vector + * @backing_qiov: Possibly shortened copy of qiov, to be allocated here + * @cb: Completion function + * @opaque: User data for completion function * * This function reads qiov->size bytes starting at pos from the backing file. * If there is no backing file then zeroes are read. 
*/ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, QEMUIOVector *qiov, + QEMUIOVector **backing_qiov, BlockDriverCompletionFunc *cb, void *opaque) { uint64_t backing_length = 0; @@ -784,15 +804,21 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, /* If the read straddles the end of the backing file, shorten it */ size = MIN((uint64_t)backing_length - pos, qiov->size); + assert(*backing_qiov == NULL); + *backing_qiov = g_new(QEMUIOVector, 1); + qemu_iovec_init(*backing_qiov, qiov->niov); + qemu_iovec_concat(*backing_qiov, qiov, 0, size); + BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, - qiov, size / BDRV_SECTOR_SIZE, cb, opaque); + *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque); } typedef struct { GenericCB gencb; BDRVQEDState *s; QEMUIOVector qiov; + QEMUIOVector *backing_qiov; struct iovec iov; uint64_t offset; } CopyFromBackingFileCB; @@ -809,6 +835,12 @@ static void qed_copy_from_backing_file_write(void *opaque, int ret) CopyFromBackingFileCB *copy_cb = opaque; BDRVQEDState *s = copy_cb->s; + if (copy_cb->backing_qiov) { + qemu_iovec_destroy(copy_cb->backing_qiov); + g_free(copy_cb->backing_qiov); + copy_cb->backing_qiov = NULL; + } + if (ret) { qed_copy_from_backing_file_cb(copy_cb, ret); return; @@ -846,11 +878,12 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); copy_cb->s = s; copy_cb->offset = offset; + copy_cb->backing_qiov = NULL; copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); copy_cb->iov.iov_len = len; qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); - qed_read_backing_file(s, pos, ©_cb->qiov, + qed_read_backing_file(s, pos, ©_cb->qiov, ©_cb->backing_qiov, qed_copy_from_backing_file_write, copy_cb); } @@ -917,7 +950,8 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret) /* Arrange for a bh to invoke the completion function */ acb->bh_ret = ret; - acb->bh = 
qemu_bh_new(qed_aio_complete_bh, acb); + acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), + qed_aio_complete_bh, acb); qemu_bh_schedule(acb->bh); /* Start next allocating write request waiting behind this one. Note that @@ -1292,7 +1326,7 @@ static void qed_aio_read_data(void *opaque, int ret, return; } else if (ret != QED_CLUSTER_FOUND) { qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, - qed_aio_next_io, acb); + &acb->backing_qiov, qed_aio_next_io, acb); return; } @@ -1318,6 +1352,12 @@ static void qed_aio_next_io(void *opaque, int ret) trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); + if (acb->backing_qiov) { + qemu_iovec_destroy(acb->backing_qiov); + g_free(acb->backing_qiov); + acb->backing_qiov = NULL; + } + /* Handle I/O error */ if (ret) { qed_aio_complete(acb, ret); @@ -1357,6 +1397,7 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, acb->qiov_offset = 0; acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; + acb->backing_qiov = NULL; acb->request.l2_table = NULL; qemu_iovec_init(&acb->cur_qiov, qiov->niov); @@ -1593,36 +1634,45 @@ static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, return qed_check(s, result, !!fix); } -static QEMUOptionParameter qed_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size (in bytes)" - }, { - .name = BLOCK_OPT_BACKING_FILE, - .type = OPT_STRING, - .help = "File name of a base image" - }, { - .name = BLOCK_OPT_BACKING_FMT, - .type = OPT_STRING, - .help = "Image format of the base image" - }, { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = OPT_SIZE, - .help = "Cluster size (in bytes)", - .value = { .n = QED_DEFAULT_CLUSTER_SIZE }, - }, { - .name = BLOCK_OPT_TABLE_SIZE, - .type = OPT_SIZE, - .help = "L1/L2 table size (in clusters)" - }, - { /* end of list */ } +static QemuOptsList qed_create_opts = { + .name = "qed-create-opts", + .head = 
QTAILQ_HEAD_INITIALIZER(qed_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_BACKING_FMT, + .type = QEMU_OPT_STRING, + .help = "Image format of the base image" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Cluster size (in bytes)", + .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE) + }, + { + .name = BLOCK_OPT_TABLE_SIZE, + .type = QEMU_OPT_SIZE, + .help = "L1/L2 table size (in clusters)" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_qed = { .format_name = "qed", .instance_size = sizeof(BDRVQEDState), - .create_options = qed_create_options, + .create_opts = &qed_create_opts, + .supports_backing = true, .bdrv_probe = bdrv_qed_probe, .bdrv_rebind = bdrv_qed_rebind, @@ -1642,6 +1692,8 @@ static BlockDriver bdrv_qed = { .bdrv_change_backing_file = bdrv_qed_change_backing_file, .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, .bdrv_check = bdrv_qed_check, + .bdrv_detach_aio_context = bdrv_qed_detach_aio_context, + .bdrv_attach_aio_context = bdrv_qed_attach_aio_context, }; static void bdrv_qed_init(void) diff --git a/block/qed.h b/block/qed.h index 5d65bea07..2b0e724e0 100644 --- a/block/qed.h +++ b/block/qed.h @@ -43,7 +43,7 @@ * * All fields are little-endian on disk. */ - +#define QED_DEFAULT_CLUSTER_SIZE 65536 enum { QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, @@ -69,7 +69,6 @@ enum { */ QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, - QED_DEFAULT_CLUSTER_SIZE = 64 * 1024, /* Allocated clusters are tracked using a 2-level pagetable. 
Table size is * a multiple of clusters so large maximum image sizes can be supported @@ -143,6 +142,7 @@ typedef struct QEDAIOCB { /* Current cluster scatter-gather list */ QEMUIOVector cur_qiov; + QEMUIOVector *backing_qiov; uint64_t cur_pos; /* position on block device, in bytes */ uint64_t cur_cluster; /* cluster offset in image file */ unsigned int cur_nclusters; /* number of clusters being accessed */ diff --git a/block/quorum.c b/block/quorum.c index 7f580a83b..d5ee9c005 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -17,11 +17,13 @@ #include <gnutls/crypto.h> #include "block/block_int.h" #include "qapi/qmp/qjson.h" +#include "qapi-event.h" #define HASH_LENGTH 32 #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" #define QUORUM_OPT_BLKVERIFY "blkverify" +#define QUORUM_OPT_REWRITE "rewrite-corrupted" /* This union holds a vote hash value */ typedef union QuorumVoteValue { @@ -69,6 +71,9 @@ typedef struct BDRVQuorumState { * It is useful to debug other block drivers by * comparing them with a reference one. */ + bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted + * block if Quorum is reached. 
+ */ } BDRVQuorumState; typedef struct QuorumAIOCB QuorumAIOCB; @@ -104,13 +109,17 @@ struct QuorumAIOCB { int count; /* number of completed AIOCB */ int success_count; /* number of successfully completed AIOCB */ + int rewrite_count; /* number of replica to rewrite: count down to + * zero once writes are fired + */ + QuorumVotes votes; bool is_read; int vote_ret; }; -static void quorum_vote(QuorumAIOCB *acb); +static bool quorum_vote(QuorumAIOCB *acb); static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) { @@ -182,6 +191,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, acb->qcrs = g_new0(QuorumChildRequest, s->num_children); acb->count = 0; acb->success_count = 0; + acb->rewrite_count = 0; acb->votes.compare = quorum_sha256_compare; QLIST_INIT(&acb->votes.vote_list); acb->is_read = false; @@ -198,32 +208,22 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) { - QObject *data; - assert(node_name); - data = qobject_from_jsonf("{ 'node-name': %s" - ", 'sector-num': %" PRId64 - ", 'sectors-count': %d }", - node_name, acb->sector_num, acb->nb_sectors); + const char *msg = NULL; if (ret < 0) { - QDict *dict = qobject_to_qdict(data); - qdict_put(dict, "error", qstring_from_str(strerror(-ret))); + msg = strerror(-ret); } - monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); - qobject_decref(data); + qapi_event_send_quorum_report_bad(!!msg, msg, node_name, + acb->sector_num, acb->nb_sectors, &error_abort); } static void quorum_report_failure(QuorumAIOCB *acb) { - QObject *data; const char *reference = acb->common.bs->device_name[0] ? 
acb->common.bs->device_name : acb->common.bs->node_name; - data = qobject_from_jsonf("{ 'reference': %s" - ", 'sector-num': %" PRId64 - ", 'sectors-count': %d }", - reference, acb->sector_num, acb->nb_sectors); - monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); - qobject_decref(data); + + qapi_event_send_quorum_failure(reference, acb->sector_num, + acb->nb_sectors, &error_abort); } static int quorum_vote_error(QuorumAIOCB *acb); @@ -241,11 +241,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) return false; } +static void quorum_rewrite_aio_cb(void *opaque, int ret) +{ + QuorumAIOCB *acb = opaque; + + /* one less rewrite to do */ + acb->rewrite_count--; + + /* wait until all rewrite callbacks have completed */ + if (acb->rewrite_count) { + return; + } + + quorum_aio_finalize(acb); +} + static void quorum_aio_cb(void *opaque, int ret) { QuorumChildRequest *sacb = opaque; QuorumAIOCB *acb = sacb->parent; BDRVQuorumState *s = acb->common.bs->opaque; + bool rewrite = false; sacb->ret = ret; acb->count++; @@ -262,12 +278,15 @@ static void quorum_aio_cb(void *opaque, int ret) /* Do the vote on read */ if (acb->is_read) { - quorum_vote(acb); + rewrite = quorum_vote(acb); } else { quorum_has_too_much_io_failed(acb); } - quorum_aio_finalize(acb); + /* if no rewrite is done the code will finish right away */ + if (!rewrite) { + quorum_aio_finalize(acb); + } } static void quorum_report_bad_versions(BDRVQuorumState *s, @@ -287,6 +306,43 @@ static void quorum_report_bad_versions(BDRVQuorumState *s, } } +static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + int count = 0; + + /* first count the number of bad versions: done first to avoid concurrency + * issues. 
+ */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + count++; + } + } + + /* quorum_rewrite_aio_cb will count down this to zero */ + acb->rewrite_count = count; + + /* now fire the correcting rewrites */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov, + acb->nb_sectors, quorum_rewrite_aio_cb, acb); + } + } + + /* return true if any rewrite is done else false */ + return count; +} + static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) { int i; @@ -477,16 +533,17 @@ static int quorum_vote_error(QuorumAIOCB *acb) return ret; } -static void quorum_vote(QuorumAIOCB *acb) +static bool quorum_vote(QuorumAIOCB *acb) { bool quorum = true; + bool rewrite = false; int i, j, ret; QuorumVoteValue hash; BDRVQuorumState *s = acb->common.bs->opaque; QuorumVoteVersion *winner; if (quorum_has_too_much_io_failed(acb)) { - return; + return false; } /* get the index of the first successful read */ @@ -514,7 +571,7 @@ static void quorum_vote(QuorumAIOCB *acb) /* Every successful read agrees */ if (quorum) { quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); - return; + return false; } /* compute hashes for each successful read, also store indexes */ @@ -547,9 +604,15 @@ static void quorum_vote(QuorumAIOCB *acb) /* some versions are bad print them */ quorum_report_bad_versions(s, acb, &winner->value); + /* corruption correction is enabled */ + if (s->rewrite_corrupted) { + rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); + } + free_exit: /* free lists */ quorum_free_vote_list(&acb->votes); + return rewrite; } static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, @@ -714,6 +777,11 @@ static QemuOptsList quorum_runtime_opts = { .type = 
QEMU_OPT_BOOL, .help = "Trigger block verify mode if set", }, + { + .name = QUORUM_OPT_REWRITE, + .type = QEMU_OPT_BOOL, + .help = "Rewrite corrupted block on read quorum", + }, { /* end of list */ } }, }; @@ -753,7 +821,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, opts = qemu_opts_create(&quorum_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { ret = -EINVAL; goto exit; } @@ -775,6 +843,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, "and using two files with vote_threshold=2\n"); } + s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false); + if (s->rewrite_corrupted && s->is_blkverify) { + error_setg(&local_err, + "rewrite-corrupted=on cannot be used with blkverify=on"); + ret = -EINVAL; + goto exit; + } + /* allocate the children BlockDriverState array */ s->bs = g_new0(BlockDriverState *, s->num_children); opened = g_new0(bool, s->num_children); @@ -828,7 +904,7 @@ close_exit: g_free(opened); exit: /* propagate error */ - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); } QDECREF(list); @@ -848,25 +924,49 @@ static void quorum_close(BlockDriverState *bs) g_free(s->bs); } +static void quorum_detach_aio_context(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_detach_aio_context(s->bs[i]); + } +} + +static void quorum_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_attach_aio_context(s->bs[i], new_context); + } +} + static BlockDriver bdrv_quorum = { - .format_name = "quorum", - .protocol_name = "quorum", + .format_name = "quorum", + .protocol_name = "quorum", + + .instance_size = sizeof(BDRVQuorumState), - .instance_size = sizeof(BDRVQuorumState), + .bdrv_file_open = 
quorum_open, + .bdrv_close = quorum_close, - .bdrv_file_open = quorum_open, - .bdrv_close = quorum_close, + .bdrv_co_flush_to_disk = quorum_co_flush, - .bdrv_co_flush_to_disk = quorum_co_flush, + .bdrv_getlength = quorum_getlength, - .bdrv_getlength = quorum_getlength, + .bdrv_aio_readv = quorum_aio_readv, + .bdrv_aio_writev = quorum_aio_writev, + .bdrv_invalidate_cache = quorum_invalidate_cache, - .bdrv_aio_readv = quorum_aio_readv, - .bdrv_aio_writev = quorum_aio_writev, - .bdrv_invalidate_cache = quorum_invalidate_cache, + .bdrv_detach_aio_context = quorum_detach_aio_context, + .bdrv_attach_aio_context = quorum_attach_aio_context, - .is_filter = true, - .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, + .is_filter = true, + .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, }; static void bdrv_quorum_init(void) diff --git a/block/raw-aio.h b/block/raw-aio.h index 7ad0a8a0a..e18c97509 100644 --- a/block/raw-aio.h +++ b/block/raw-aio.h @@ -34,19 +34,29 @@ /* linux-aio.c - Linux native implementation */ #ifdef CONFIG_LINUX_AIO void *laio_init(void); +void laio_cleanup(void *s); BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque, int type); +void laio_detach_aio_context(void *s, AioContext *old_context); +void laio_attach_aio_context(void *s, AioContext *new_context); +void laio_io_plug(BlockDriverState *bs, void *aio_ctx); +int laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug); #endif #ifdef _WIN32 typedef struct QEMUWin32AIOState QEMUWin32AIOState; QEMUWin32AIOState *win32_aio_init(void); +void win32_aio_cleanup(QEMUWin32AIOState *aio); int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile); BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs, QEMUWin32AIOState *aio, HANDLE hfile, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void 
*opaque, int type); +void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, + AioContext *old_context); +void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, + AioContext *new_context); #endif #endif /* QEMU_RAW_AIO_H */ diff --git a/block/raw-posix.c b/block/raw-posix.c index 1688e16c6..8e9758e92 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -55,6 +55,9 @@ #include <linux/cdrom.h> #include <linux/fd.h> #include <linux/fs.h> +#ifndef FS_NOCOW_FL +#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#endif #endif #ifdef CONFIG_FIEMAP #include <linux/fiemap.h> @@ -146,6 +149,9 @@ typedef struct BDRVRawState { bool has_discard:1; bool has_write_zeroes:1; bool discard_zeroes:1; +#ifdef CONFIG_FIEMAP + bool skip_fiemap; +#endif } BDRVRawState; typedef struct BDRVRawReopenState { @@ -215,7 +221,7 @@ static int raw_normalize_devicepath(const char **filename) } #endif -static void raw_probe_alignment(BlockDriverState *bs) +static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) { BDRVRawState *s = bs->opaque; char *buf; @@ -234,24 +240,24 @@ static void raw_probe_alignment(BlockDriverState *bs) s->buf_align = 0; #ifdef BLKSSZGET - if (ioctl(s->fd, BLKSSZGET, §or_size) >= 0) { + if (ioctl(fd, BLKSSZGET, §or_size) >= 0) { bs->request_alignment = sector_size; } #endif #ifdef DKIOCGETBLOCKSIZE - if (ioctl(s->fd, DKIOCGETBLOCKSIZE, §or_size) >= 0) { + if (ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) >= 0) { bs->request_alignment = sector_size; } #endif #ifdef DIOCGSECTORSIZE - if (ioctl(s->fd, DIOCGSECTORSIZE, §or_size) >= 0) { + if (ioctl(fd, DIOCGSECTORSIZE, §or_size) >= 0) { bs->request_alignment = sector_size; } #endif #ifdef CONFIG_XFS if (s->is_xfs) { struct dioattr da; - if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) { + if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { bs->request_alignment = da.d_miniosz; /* The kernel returns wrong information for d_mem */ /* s->buf_align = da.d_mem; */ @@ -264,7 +270,7 @@ static void 
raw_probe_alignment(BlockDriverState *bs) size_t align; buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { - if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { + if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { s->buf_align = align; break; } @@ -276,13 +282,18 @@ static void raw_probe_alignment(BlockDriverState *bs) size_t align; buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { - if (pread(s->fd, buf, align, 0) >= 0) { + if (pread(fd, buf, align, 0) >= 0) { bs->request_alignment = align; break; } } qemu_vfree(buf); } + + if (!s->buf_align || !bs->request_alignment) { + error_setg(errp, "Could not find working O_DIRECT alignment. " + "Try cache.direct=off."); + } } static void raw_parse_flags(int bdrv_flags, int *open_flags) @@ -304,6 +315,29 @@ static void raw_parse_flags(int bdrv_flags, int *open_flags) } } +static void raw_detach_aio_context(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + + if (s->use_aio) { + laio_detach_aio_context(s->aio_ctx, bdrv_get_aio_context(bs)); + } +#endif +} + +static void raw_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + + if (s->use_aio) { + laio_attach_aio_context(s->aio_ctx, new_context); + } +#endif +} + #ifdef CONFIG_LINUX_AIO static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags) { @@ -366,7 +400,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, BDRVRawState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename; + const char *filename = NULL; int fd, ret; struct stat st; @@ -444,8 +478,13 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif + raw_attach_aio_context(bs, bdrv_get_aio_context(bs)); + ret = 0; fail: + if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { + unlink(filename); + } 
qemu_opts_del(opts); return ret; } @@ -471,6 +510,7 @@ static int raw_reopen_prepare(BDRVReopenState *state, BDRVRawState *s; BDRVRawReopenState *raw_s; int ret = 0; + Error *local_err = NULL; assert(state != NULL); assert(state->bs != NULL); @@ -543,6 +583,19 @@ static int raw_reopen_prepare(BDRVReopenState *state, ret = -1; } } + + /* Fail already reopen_prepare() if we can't get a working O_DIRECT + * alignment with the new fd. */ + if (raw_s->fd != -1) { + raw_probe_alignment(state->bs, raw_s->fd, &local_err); + if (local_err) { + qemu_close(raw_s->fd); + raw_s->fd = -1; + error_propagate(errp, local_err); + ret = -EINVAL; + } + } + return ret; } @@ -581,14 +634,12 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } -static int raw_refresh_limits(BlockDriverState *bs) +static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVRawState *s = bs->opaque; - raw_probe_alignment(bs); + raw_probe_alignment(bs, s->fd, errp); bs->bl.opt_mem_alignment = s->buf_align; - - return 0; } static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) @@ -756,6 +807,7 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); p += aiocb->aio_iov[i].iov_len; } + assert(p - buf == aiocb->aio_nbytes); } nbytes = handle_aiocb_rw_linear(aiocb, buf); @@ -770,9 +822,11 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) copy = aiocb->aio_iov[i].iov_len; } memcpy(aiocb->aio_iov[i].iov_base, p, copy); + assert(count >= copy); p += copy; count -= copy; } + assert(count == 0); } qemu_vfree(buf); @@ -959,12 +1013,14 @@ static int paio_submit_co(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; + acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; + acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; + assert(qiov->size == acb->aio_nbytes); } - acb->aio_nbytes = nb_sectors * 512; - acb->aio_offset = 
sector_num * 512; trace_paio_submit_co(sector_num, nb_sectors, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); @@ -982,12 +1038,14 @@ static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, acb->aio_type = type; acb->aio_fildes = fd; + acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; + acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + if (qiov) { acb->aio_iov = qiov->iov; acb->aio_niov = qiov->niov; + assert(qiov->size == acb->aio_nbytes); } - acb->aio_nbytes = nb_sectors * 512; - acb->aio_offset = sector_num * 512; trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); @@ -1023,6 +1081,36 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, cb, opaque, type); } +static void raw_aio_plug(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + if (s->use_aio) { + laio_io_plug(bs, s->aio_ctx); + } +#endif +} + +static void raw_aio_unplug(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + if (s->use_aio) { + laio_io_unplug(bs, s->aio_ctx, true); + } +#endif +} + +static void raw_aio_flush_io_queue(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + if (s->use_aio) { + laio_io_unplug(bs, s->aio_ctx, false); + } +#endif +} + static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) @@ -1053,6 +1141,14 @@ static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs, static void raw_close(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; + + raw_detach_aio_context(bs); + +#ifdef CONFIG_LINUX_AIO + if (s->use_aio) { + laio_cleanup(s->aio_ctx); + } +#endif if (s->fd >= 0) { qemu_close(s->fd); s->fd = -1; @@ -1091,12 +1187,12 @@ static int64_t raw_getlength(BlockDriverState *bs) struct stat st; if (fstat(fd, &st)) - return -1; + return -errno; if (S_ISCHR(st.st_mode) || 
S_ISBLK(st.st_mode)) { struct disklabel dl; if (ioctl(fd, DIOCGDINFO, &dl)) - return -1; + return -errno; return (uint64_t)dl.d_secsize * dl.d_partitions[DISKPART(st.st_rdev)].p_size; } else @@ -1110,7 +1206,7 @@ static int64_t raw_getlength(BlockDriverState *bs) struct stat st; if (fstat(fd, &st)) - return -1; + return -errno; if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { struct dkwedge_info dkw; @@ -1120,7 +1216,7 @@ static int64_t raw_getlength(BlockDriverState *bs) struct disklabel dl; if (ioctl(fd, DIOCGDINFO, &dl)) - return -1; + return -errno; return (uint64_t)dl.d_secsize * dl.d_partitions[DISKPART(st.st_rdev)].p_size; } @@ -1133,6 +1229,7 @@ static int64_t raw_getlength(BlockDriverState *bs) BDRVRawState *s = bs->opaque; struct dk_minfo minfo; int ret; + int64_t size; ret = fd_open(bs); if (ret < 0) { @@ -1151,7 +1248,11 @@ static int64_t raw_getlength(BlockDriverState *bs) * There are reports that lseek on some devices fails, but * irc discussion said that contingency on contingency was overkill. 
*/ - return lseek(s->fd, 0, SEEK_END); + size = lseek(s->fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + return size; } #elif defined(CONFIG_BSD) static int64_t raw_getlength(BlockDriverState *bs) @@ -1186,9 +1287,12 @@ again: if (size == 0) #endif #if defined(__APPLE__) && defined(__MACH__) - size = LONG_LONG_MAX; + size = LLONG_MAX; #else size = lseek(fd, 0LL, SEEK_END); + if (size < 0) { + return -errno; + } #endif #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) switch(s->type) { @@ -1205,6 +1309,9 @@ again: #endif } else { size = lseek(fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } } return size; } @@ -1213,13 +1320,18 @@ static int64_t raw_getlength(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; int ret; + int64_t size; ret = fd_open(bs); if (ret < 0) { return ret; } - return lseek(s->fd, 0, SEEK_END); + size = lseek(s->fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + return size; } #endif @@ -1234,22 +1346,19 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return (int64_t)st.st_blocks * 512; } -static int raw_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int raw_create(const char *filename, QemuOpts *opts, Error **errp) { int fd; int result = 0; int64_t total_size = 0; + bool nocow = false; strstart(filename, "file:", &filename); /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - total_size = options->value.n / BDRV_SECTOR_SIZE; - } - options++; - } + total_size = + qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / BDRV_SECTOR_SIZE; + nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); @@ -1257,6 +1366,21 @@ static int raw_create(const char *filename, QEMUOptionParameter *options, result = -errno; error_setg_errno(errp, -result, "Could not create file"); } else { + if (nocow) { +#ifdef __linux__ + /* Set NOCOW flag to solve 
performance issue on fs like btrfs. + * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value + * will be ignored since any failure of this operation should not + * block the left work. + */ + int attr; + if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { + attr |= FS_NOCOW_FL; + ioctl(fd, FS_IOC_SETFLAGS, &attr); + } +#endif + } + if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { result = -errno; error_setg_errno(errp, -result, "Could not resize file"); @@ -1269,53 +1393,29 @@ static int raw_create(const char *filename, QEMUOptionParameter *options, return result; } -/* - * Returns true iff the specified sector is present in the disk image. Drivers - * not implementing the functionality are assumed to not support backing files, - * hence all their sectors are reported as allocated. - * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. 
- */ -static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) +static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data, + off_t *hole, int nb_sectors, int *pnum) { - off_t start, data, hole; - int64_t ret; - - ret = fd_open(bs); - if (ret < 0) { - return ret; - } - - start = sector_num * BDRV_SECTOR_SIZE; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; - #ifdef CONFIG_FIEMAP - BDRVRawState *s = bs->opaque; + int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; struct { struct fiemap fm; struct fiemap_extent fe; } f; + if (s->skip_fiemap) { + return -ENOTSUP; + } + f.fm.fm_start = start; f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE; f.fm.fm_flags = 0; f.fm.fm_extent_count = 1; f.fm.fm_reserved = 0; if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) { - /* Assume everything is allocated. */ - *pnum = nb_sectors; - return ret; + s->skip_fiemap = true; + return -errno; } if (f.fm.fm_mapped_extents == 0) { @@ -1323,44 +1423,92 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, * f.fm.fm_start + f.fm.fm_length must be clamped to the file size! 
*/ off_t length = lseek(s->fd, 0, SEEK_END); - hole = f.fm.fm_start; - data = MIN(f.fm.fm_start + f.fm.fm_length, length); + *hole = f.fm.fm_start; + *data = MIN(f.fm.fm_start + f.fm.fm_length, length); } else { - data = f.fe.fe_logical; - hole = f.fe.fe_logical + f.fe.fe_length; + *data = f.fe.fe_logical; + *hole = f.fe.fe_logical + f.fe.fe_length; if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) { ret |= BDRV_BLOCK_ZERO; } } -#elif defined SEEK_HOLE && defined SEEK_DATA + return ret; +#else + return -ENOTSUP; +#endif +} +static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, + off_t *hole, int *pnum) +{ +#if defined SEEK_HOLE && defined SEEK_DATA BDRVRawState *s = bs->opaque; - hole = lseek(s->fd, start, SEEK_HOLE); - if (hole == -1) { + *hole = lseek(s->fd, start, SEEK_HOLE); + if (*hole == -1) { /* -ENXIO indicates that sector_num was past the end of the file. * There is a virtual hole there. */ assert(errno != -ENXIO); - /* Most likely EINVAL. Assume everything is allocated. */ - *pnum = nb_sectors; - return ret; + return -errno; } - if (hole > start) { - data = start; + if (*hole > start) { + *data = start; } else { /* On a hole. We need another syscall to find its end. */ - data = lseek(s->fd, start, SEEK_DATA); - if (data == -1) { - data = lseek(s->fd, 0, SEEK_END); + *data = lseek(s->fd, start, SEEK_DATA); + if (*data == -1) { + *data = lseek(s->fd, 0, SEEK_END); } } + + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; #else - data = 0; - hole = start + nb_sectors * BDRV_SECTOR_SIZE; + return -ENOTSUP; #endif +} + +/* + * Returns true iff the specified sector is present in the disk image. Drivers + * not implementing the functionality are assumed to not support backing files, + * hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. 
+ * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + */ +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + off_t start, data = 0, hole = 0; + int64_t ret; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + start = sector_num * BDRV_SECTOR_SIZE; + + ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum); + if (ret < 0) { + ret = try_seek_hole(bs, start, &data, &hole, pnum); + if (ret < 0) { + /* Assume everything is allocated. */ + data = 0; + hole = start + nb_sectors * BDRV_SECTOR_SIZE; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + } + } if (data <= start) { /* On a data extent, compute sectors to the end of the extent. 
*/ @@ -1410,13 +1558,22 @@ static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } -static QEMUOptionParameter raw_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { NULL } +static QemuOptsList raw_create_opts = { + .name = "raw-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_NOCOW, + .type = QEMU_OPT_BOOL, + .help = "Turn off copy-on-write (valid only on btrfs)" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_file = { @@ -1441,6 +1598,9 @@ static BlockDriver bdrv_file = { .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = raw_aio_discard, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1448,7 +1608,10 @@ static BlockDriver bdrv_file = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, - .create_options = raw_create_options, + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + + .create_opts = &raw_create_opts, }; /***********************************************/ @@ -1769,7 +1932,7 @@ static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, return -ENOTSUP; } -static int hdev_create(const char *filename, QEMUOptionParameter *options, +static int hdev_create(const char *filename, QemuOpts *opts, Error **errp) { int fd; @@ -1790,12 +1953,8 @@ static int hdev_create(const char *filename, QEMUOptionParameter *options, (void)has_prefix; /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, "size")) { - total_size = options->value.n / BDRV_SECTOR_SIZE; - } - options++; - } + total_size = + qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0) / 
BDRV_SECTOR_SIZE; fd = qemu_open(filename, O_WRONLY | O_BINARY); if (fd < 0) { @@ -1832,8 +1991,8 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_create = hdev_create, - .create_options = raw_create_options, + .bdrv_create = hdev_create, + .create_opts = &raw_create_opts, .bdrv_co_write_zeroes = hdev_co_write_zeroes, .bdrv_aio_readv = raw_aio_readv, @@ -1841,6 +2000,9 @@ static BlockDriver bdrv_host_device = { .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = hdev_aio_discard, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1848,6 +2010,9 @@ static BlockDriver bdrv_host_device = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + /* generic scsi device */ #ifdef __linux__ .bdrv_ioctl = hdev_ioctl, @@ -1976,13 +2141,16 @@ static BlockDriver bdrv_host_floppy = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_create = hdev_create, - .create_options = raw_create_options, + .bdrv_create = hdev_create, + .create_opts = &raw_create_opts, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1990,6 +2158,9 @@ static BlockDriver bdrv_host_floppy = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = 
raw_attach_aio_context, + /* removable device support */ .bdrv_is_inserted = floppy_is_inserted, .bdrv_media_changed = floppy_media_changed, @@ -2101,13 +2272,16 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_create = hdev_create, - .create_options = raw_create_options, + .bdrv_create = hdev_create, + .create_opts = &raw_create_opts, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2115,6 +2289,9 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + /* removable device support */ .bdrv_is_inserted = cdrom_is_inserted, .bdrv_eject = cdrom_eject, @@ -2233,12 +2410,15 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, - .create_options = raw_create_options, + .create_opts = &raw_create_opts, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2246,6 +2426,9 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + /* removable device support */ .bdrv_is_inserted = cdrom_is_inserted, .bdrv_eject = 
cdrom_eject, @@ -2253,40 +2436,6 @@ static BlockDriver bdrv_host_cdrom = { }; #endif /* __FreeBSD__ */ -#ifdef CONFIG_LINUX_AIO -/** - * Return the file descriptor for Linux AIO - * - * This function is a layering violation and should be removed when it becomes - * possible to call the block layer outside the global mutex. It allows the - * caller to hijack the file descriptor so I/O can be performed outside the - * block layer. - */ -int raw_get_aio_fd(BlockDriverState *bs) -{ - BDRVRawState *s; - - if (!bs->drv) { - return -ENOMEDIUM; - } - - if (bs->drv == bdrv_find_format("raw")) { - bs = bs->file; - } - - /* raw-posix has several protocols so just check for raw_aio_readv */ - if (bs->drv->bdrv_aio_readv != raw_aio_readv) { - return -ENOTSUP; - } - - s = bs->opaque; - if (!s->use_aio) { - return -ENOTSUP; - } - return s->fd; -} -#endif /* CONFIG_LINUX_AIO */ - static void bdrv_file_init(void) { /* diff --git a/block/raw-win32.c b/block/raw-win32.c index 48cb2c225..902eab610 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -36,8 +36,6 @@ #define FTYPE_CD 1 #define FTYPE_HARDDISK 2 -static QEMUWin32AIOState *aio; - typedef struct RawWin32AIOData { BlockDriverState *bs; HANDLE hfile; @@ -202,6 +200,25 @@ static int set_sparse(int fd) NULL, 0, NULL, 0, &returned, NULL); } +static void raw_detach_aio_context(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + if (s->aio) { + win32_aio_detach_aio_context(s->aio, bdrv_get_aio_context(bs)); + } +} + +static void raw_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVRawState *s = bs->opaque; + + if (s->aio) { + win32_aio_attach_aio_context(s->aio, new_context); + } +} + static void raw_probe_alignment(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; @@ -300,15 +317,6 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, raw_parse_flags(flags, &access_flags, &overlapped); - if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) { - aio = win32_aio_init(); - if 
(aio == NULL) { - error_setg(errp, "Could not initialize AIO"); - ret = -EINVAL; - goto fail; - } - } - if (filename[0] && filename[1] == ':') { snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]); } else if (filename[0] == '\\' && filename[1] == '\\') { @@ -335,13 +343,23 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, } if (flags & BDRV_O_NATIVE_AIO) { - ret = win32_aio_attach(aio, s->hfile); + s->aio = win32_aio_init(); + if (s->aio == NULL) { + CloseHandle(s->hfile); + error_setg(errp, "Could not initialize AIO"); + ret = -EINVAL; + goto fail; + } + + ret = win32_aio_attach(s->aio, s->hfile); if (ret < 0) { + win32_aio_cleanup(s->aio); CloseHandle(s->hfile); error_setg_errno(errp, -ret, "Could not enable AIO"); goto fail; } - s->aio = aio; + + win32_aio_attach_aio_context(s->aio, bdrv_get_aio_context(bs)); } raw_probe_alignment(bs); @@ -389,7 +407,17 @@ static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs, static void raw_close(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; + + if (s->aio) { + win32_aio_detach_aio_context(s->aio, bdrv_get_aio_context(bs)); + win32_aio_cleanup(s->aio); + s->aio = NULL; + } + CloseHandle(s->hfile); + if (bs->open_flags & BDRV_O_TEMPORARY) { + unlink(bs->filename); + } } static int raw_truncate(BlockDriverState *bs, int64_t offset) @@ -475,8 +503,7 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return st.st_size; } -static int raw_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int raw_create(const char *filename, QemuOpts *opts, Error **errp) { int fd; int64_t total_size = 0; @@ -484,12 +511,8 @@ static int raw_create(const char *filename, QEMUOptionParameter *options, strstart(filename, "file:", &filename); /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - total_size = options->value.n / 512; - } - options++; - } + total_size = + qemu_opt_get_size_del(opts, 
BLOCK_OPT_SIZE, 0) / 512; fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); @@ -503,13 +526,18 @@ static int raw_create(const char *filename, QEMUOptionParameter *options, return 0; } -static QEMUOptionParameter raw_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { NULL } + +static QemuOptsList raw_create_opts = { + .name = "raw-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_file = { @@ -518,9 +546,9 @@ static BlockDriver bdrv_file = { .instance_size = sizeof(BDRVRawState), .bdrv_needs_filename = true, .bdrv_parse_filename = raw_parse_filename, - .bdrv_file_open = raw_open, - .bdrv_close = raw_close, - .bdrv_create = raw_create, + .bdrv_file_open = raw_open, + .bdrv_close = raw_close, + .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_aio_readv = raw_aio_readv, @@ -532,7 +560,7 @@ static BlockDriver bdrv_file = { .bdrv_get_allocated_file_size = raw_get_allocated_file_size, - .create_options = raw_create_options, + .create_opts = &raw_create_opts, }; /***********************************************/ @@ -681,6 +709,9 @@ static BlockDriver bdrv_host_device = { .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + .bdrv_getlength = raw_getlength, .has_variable_length = true, diff --git a/block/raw_bsd.c b/block/raw_bsd.c index 01ea692a4..f82f4c25d 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -29,13 +29,17 @@ #include "block/block_int.h" #include "qemu/option.h" -static QEMUOptionParameter raw_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { 0 } +static QemuOptsList raw_create_opts = { + .name = 
"raw-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } }; static int raw_reopen_prepare(BDRVReopenState *reopen_state, @@ -90,10 +94,9 @@ static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return bdrv_get_info(bs->file, bdi); } -static int raw_refresh_limits(BlockDriverState *bs) +static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { bs->bl = bs->file->bl; - return 0; } static int raw_truncate(BlockDriverState *bs, int64_t offset) @@ -139,13 +142,12 @@ static int raw_has_zero_init(BlockDriverState *bs) return bdrv_has_zero_init(bs->file); } -static int raw_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int raw_create(const char *filename, QemuOpts *opts, Error **errp) { Error *local_err = NULL; int ret; - ret = bdrv_create_file(filename, options, &local_err); + ret = bdrv_create_file(filename, opts, &local_err); if (local_err) { error_propagate(errp, local_err); } @@ -194,7 +196,7 @@ static BlockDriver bdrv_raw = { .bdrv_lock_medium = &raw_lock_medium, .bdrv_ioctl = &raw_ioctl, .bdrv_aio_ioctl = &raw_aio_ioctl, - .create_options = &raw_create_options[0], + .create_opts = &raw_create_opts, .bdrv_has_zero_init = &raw_has_zero_init }; diff --git a/block/rbd.c b/block/rbd.c index dbc79f452..2b797d3e8 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -105,7 +105,7 @@ typedef struct BDRVRBDState { static int qemu_rbd_next_tok(char *dst, int dst_len, char *src, char delim, const char *name, - char **p) + char **p, Error **errp) { int l; char *end; @@ -128,10 +128,10 @@ static int qemu_rbd_next_tok(char *dst, int dst_len, } l = strlen(src); if (l >= dst_len) { - error_report("%s too long", name); + error_setg(errp, "%s too long", name); return -EINVAL; } else if (l == 0) { - error_report("%s too short", name); + error_setg(errp, "%s too short", name); return 
-EINVAL; } @@ -157,13 +157,15 @@ static int qemu_rbd_parsename(const char *filename, char *pool, int pool_len, char *snap, int snap_len, char *name, int name_len, - char *conf, int conf_len) + char *conf, int conf_len, + Error **errp) { const char *start; char *p, *buf; int ret; if (!strstart(filename, "rbd:", &start)) { + error_setg(errp, "File name must start with 'rbd:'"); return -EINVAL; } @@ -172,7 +174,8 @@ static int qemu_rbd_parsename(const char *filename, *snap = '\0'; *conf = '\0'; - ret = qemu_rbd_next_tok(pool, pool_len, p, '/', "pool name", &p); + ret = qemu_rbd_next_tok(pool, pool_len, p, + '/', "pool name", &p, errp); if (ret < 0 || !p) { ret = -EINVAL; goto done; @@ -180,21 +183,25 @@ static int qemu_rbd_parsename(const char *filename, qemu_rbd_unescape(pool); if (strchr(p, '@')) { - ret = qemu_rbd_next_tok(name, name_len, p, '@', "object name", &p); + ret = qemu_rbd_next_tok(name, name_len, p, + '@', "object name", &p, errp); if (ret < 0) { goto done; } - ret = qemu_rbd_next_tok(snap, snap_len, p, ':', "snap name", &p); + ret = qemu_rbd_next_tok(snap, snap_len, p, + ':', "snap name", &p, errp); qemu_rbd_unescape(snap); } else { - ret = qemu_rbd_next_tok(name, name_len, p, ':', "object name", &p); + ret = qemu_rbd_next_tok(name, name_len, p, + ':', "object name", &p, errp); } qemu_rbd_unescape(name); if (ret < 0 || !p) { goto done; } - ret = qemu_rbd_next_tok(conf, conf_len, p, '\0', "configuration", &p); + ret = qemu_rbd_next_tok(conf, conf_len, p, + '\0', "configuration", &p, errp); done: g_free(buf); @@ -229,7 +236,7 @@ static char *qemu_rbd_parse_clientname(const char *conf, char *clientname) return NULL; } -static int qemu_rbd_set_conf(rados_t cluster, const char *conf) +static int qemu_rbd_set_conf(rados_t cluster, const char *conf, Error **errp) { char *p, *buf; char name[RBD_MAX_CONF_NAME_SIZE]; @@ -241,20 +248,20 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf) while (p) { ret = qemu_rbd_next_tok(name, sizeof(name), p, - 
'=', "conf option name", &p); + '=', "conf option name", &p, errp); if (ret < 0) { break; } qemu_rbd_unescape(name); if (!p) { - error_report("conf option %s has no value", name); + error_setg(errp, "conf option %s has no value", name); ret = -EINVAL; break; } ret = qemu_rbd_next_tok(value, sizeof(value), p, - ':', "conf option value", &p); + ':', "conf option value", &p, errp); if (ret < 0) { break; } @@ -263,7 +270,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf) if (strcmp(name, "conf") == 0) { ret = rados_conf_read_file(cluster, value); if (ret < 0) { - error_report("error reading conf file %s", value); + error_setg(errp, "error reading conf file %s", value); break; } } else if (strcmp(name, "id") == 0) { @@ -271,7 +278,7 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf) } else { ret = rados_conf_set(cluster, name, value); if (ret < 0) { - error_report("invalid conf option %s", name); + error_setg(errp, "invalid conf option %s", name); ret = -EINVAL; break; } @@ -282,9 +289,9 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf) return ret; } -static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) { + Error *local_err = NULL; int64_t bytes = 0; int64_t objsize; int obj_order = 0; @@ -301,34 +308,29 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options, if (qemu_rbd_parsename(filename, pool, sizeof(pool), snap_buf, sizeof(snap_buf), name, sizeof(name), - conf, sizeof(conf)) < 0) { + conf, sizeof(conf), &local_err) < 0) { + error_propagate(errp, local_err); return -EINVAL; } /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - bytes = options->value.n; - } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { - if (options->value.n) { - objsize = options->value.n; - if ((objsize - 1) & objsize) { /* not a 
power of 2? */ - error_report("obj size needs to be power of 2"); - return -EINVAL; - } - if (objsize < 4096) { - error_report("obj size too small"); - return -EINVAL; - } - obj_order = ffs(objsize) - 1; - } + bytes = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); + objsize = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, 0); + if (objsize) { + if ((objsize - 1) & objsize) { /* not a power of 2? */ + error_setg(errp, "obj size needs to be power of 2"); + return -EINVAL; } - options++; + if (objsize < 4096) { + error_setg(errp, "obj size too small"); + return -EINVAL; + } + obj_order = ffs(objsize) - 1; } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); if (rados_create(&cluster, clientname) < 0) { - error_report("error initializing"); + error_setg(errp, "error initializing"); return -EIO; } @@ -338,20 +340,20 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options, } if (conf[0] != '\0' && - qemu_rbd_set_conf(cluster, conf) < 0) { - error_report("error setting config options"); + qemu_rbd_set_conf(cluster, conf, &local_err) < 0) { rados_shutdown(cluster); + error_propagate(errp, local_err); return -EIO; } if (rados_connect(cluster) < 0) { - error_report("error connecting"); + error_setg(errp, "error connecting"); rados_shutdown(cluster); return -EIO; } if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) { - error_report("error opening pool %s", pool); + error_setg(errp, "error opening pool %s", pool); rados_shutdown(cluster); return -EIO; } @@ -441,8 +443,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); if (local_err) { - qerror_report_err(local_err); - error_free(local_err); + error_propagate(errp, local_err); qemu_opts_del(opts); return -EINVAL; } @@ -452,7 +453,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, if (qemu_rbd_parsename(filename, pool, 
sizeof(pool), snap_buf, sizeof(snap_buf), s->name, sizeof(s->name), - conf, sizeof(conf)) < 0) { + conf, sizeof(conf), errp) < 0) { r = -EINVAL; goto failed_opts; } @@ -460,7 +461,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, clientname = qemu_rbd_parse_clientname(conf, clientname_buf); r = rados_create(&s->cluster, clientname); if (r < 0) { - error_report("error initializing"); + error_setg(&local_err, "error initializing"); goto failed_opts; } @@ -488,28 +489,27 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, } if (conf[0] != '\0') { - r = qemu_rbd_set_conf(s->cluster, conf); + r = qemu_rbd_set_conf(s->cluster, conf, errp); if (r < 0) { - error_report("error setting config options"); goto failed_shutdown; } } r = rados_connect(s->cluster); if (r < 0) { - error_report("error connecting"); + error_setg(&local_err, "error connecting"); goto failed_shutdown; } r = rados_ioctx_create(s->cluster, pool, &s->io_ctx); if (r < 0) { - error_report("error opening pool %s", pool); + error_setg(&local_err, "error opening pool %s", pool); goto failed_shutdown; } r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); if (r < 0) { - error_report("error reading header from %s", s->name); + error_setg(&local_err, "error reading header from %s", s->name); goto failed_open; } @@ -548,7 +548,7 @@ static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb) acb->cancelled = 1; while (acb->status == -EINPROGRESS) { - qemu_aio_wait(); + aio_poll(bdrv_get_aio_context(acb->common.bs), true); } qemu_aio_release(acb); @@ -581,7 +581,8 @@ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) rcb->ret = rbd_aio_get_return_value(c); rbd_aio_release(c); - acb->bh = qemu_bh_new(rbd_finish_bh, rcb); + acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), + rbd_finish_bh, rcb); qemu_bh_schedule(acb->bh); } @@ -677,13 +678,16 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, } if (r < 0) { - goto failed; + goto 
failed_completion; } return &acb->common; +failed_completion: + rbd_aio_release(c); failed: g_free(rcb); + qemu_vfree(acb->bounce); qemu_aio_release(acb); return NULL; } @@ -900,18 +904,22 @@ static BlockDriverAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, } #endif -static QEMUOptionParameter qemu_rbd_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = OPT_SIZE, - .help = "RBD object size" - }, - {NULL} +static QemuOptsList qemu_rbd_create_opts = { + .name = "rbd-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "RBD object size" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_rbd = { @@ -923,7 +931,7 @@ static BlockDriver bdrv_rbd = { .bdrv_create = qemu_rbd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_get_info = qemu_rbd_getinfo, - .create_options = qemu_rbd_create_options, + .create_opts = &qemu_rbd_create_opts, .bdrv_getlength = qemu_rbd_getlength, .bdrv_truncate = qemu_rbd_truncate, .protocol_name = "rbd", diff --git a/block/sheepdog.c b/block/sheepdog.c index 0eb33ee80..8d9350c26 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -200,6 +200,8 @@ typedef struct SheepdogInode { uint32_t data_vdi_id[MAX_DATA_OBJS]; } SheepdogInode; +#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id) + /* * 64 bit FNV-1a non-zero initial basis */ @@ -282,6 +284,7 @@ typedef struct AIOReq { unsigned int data_len; uint8_t flags; uint32_t id; + bool create; QLIST_ENTRY(AIOReq) aio_siblings; } AIOReq; @@ -314,6 +317,7 @@ struct SheepdogAIOCB { typedef struct BDRVSheepdogState { BlockDriverState *bs; + AioContext *aio_context; SheepdogInode inode; @@ -404,7 +408,7 @@ static const char * sd_strerror(int err) static inline AIOReq 
*alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb, uint64_t oid, unsigned int data_len, - uint64_t offset, uint8_t flags, + uint64_t offset, uint8_t flags, bool create, uint64_t base_oid, unsigned int iov_offset) { AIOReq *aio_req; @@ -418,6 +422,7 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb, aio_req->data_len = data_len; aio_req->flags = flags; aio_req->id = s->aioreq_seq_num++; + aio_req->create = create; acb->nr_pending++; return aio_req; @@ -496,7 +501,7 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb) sd_finish_aiocb(acb); return; } - qemu_aio_wait(); + aio_poll(s->aio_context, true); } } @@ -526,17 +531,16 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, return acb; } -static int connect_to_sdog(BDRVSheepdogState *s) +static int connect_to_sdog(BDRVSheepdogState *s, Error **errp) { int fd; - Error *err = NULL; if (s->is_unix) { - fd = unix_connect(s->host_spec, &err); + fd = unix_connect(s->host_spec, errp); } else { - fd = inet_connect(s->host_spec, &err); + fd = inet_connect(s->host_spec, errp); - if (err == NULL) { + if (fd >= 0) { int ret = socket_set_nodelay(fd); if (ret < 0) { error_report("%s", strerror(errno)); @@ -544,10 +548,7 @@ static int connect_to_sdog(BDRVSheepdogState *s) } } - if (err != NULL) { - qerror_report_err(err); - error_free(err); - } else { + if (fd >= 0) { qemu_set_nonblock(fd); } @@ -582,6 +583,7 @@ static void restart_co_req(void *opaque) typedef struct SheepdogReqCo { int sockfd; + AioContext *aio_context; SheepdogReq *hdr; void *data; unsigned int *wlen; @@ -602,14 +604,14 @@ static coroutine_fn void do_co_req(void *opaque) unsigned int *rlen = srco->rlen; co = qemu_coroutine_self(); - qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, co); + aio_set_fd_handler(srco->aio_context, sockfd, NULL, restart_co_req, co); ret = send_co_req(sockfd, hdr, data, wlen); if (ret < 0) { goto out; } - qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, co); + 
aio_set_fd_handler(srco->aio_context, sockfd, restart_co_req, NULL, co); ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); if (ret != sizeof(*hdr)) { @@ -634,18 +636,19 @@ static coroutine_fn void do_co_req(void *opaque) out: /* there is at most one request for this sockfd, so it is safe to * set each handler to NULL. */ - qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL); + aio_set_fd_handler(srco->aio_context, sockfd, NULL, NULL, NULL); srco->ret = ret; srco->finished = true; } -static int do_req(int sockfd, SheepdogReq *hdr, void *data, - unsigned int *wlen, unsigned int *rlen) +static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, + void *data, unsigned int *wlen, unsigned int *rlen) { Coroutine *co; SheepdogReqCo srco = { .sockfd = sockfd, + .aio_context = aio_context, .hdr = hdr, .data = data, .wlen = wlen, @@ -660,7 +663,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data, co = qemu_coroutine_create(do_co_req); qemu_coroutine_enter(co, &srco); while (!srco.finished) { - qemu_aio_wait(); + aio_poll(aio_context, true); } } @@ -668,11 +671,11 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data, } static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, - struct iovec *iov, int niov, bool create, - enum AIOCBState aiocb_type); + struct iovec *iov, int niov, + enum AIOCBState aiocb_type); static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); -static int get_sheep_fd(BDRVSheepdogState *s); +static int get_sheep_fd(BDRVSheepdogState *s, Error **errp); static void co_write_request(void *opaque); static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) @@ -702,17 +705,18 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid) /* move aio_req from pending list to inflight one */ QLIST_REMOVE(aio_req, aio_siblings); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, 
aio_siblings); - add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, false, + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, acb->aiocb_type); } } static coroutine_fn void reconnect_to_sdog(void *opaque) { + Error *local_err = NULL; BDRVSheepdogState *s = opaque; AIOReq *aio_req, *next; - qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); + aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL); close(s->fd); s->fd = -1; @@ -723,9 +727,11 @@ static coroutine_fn void reconnect_to_sdog(void *opaque) /* Try to reconnect the sheepdog server every one second. */ while (s->fd < 0) { - s->fd = get_sheep_fd(s); + s->fd = get_sheep_fd(s, &local_err); if (s->fd < 0) { DPRINTF("Wait for connection to be established\n"); + error_report("%s", error_get_pretty(local_err)); + error_free(local_err); co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME, 1000000000ULL); } @@ -798,7 +804,7 @@ static void coroutine_fn aio_read_response(void *opaque) } idx = data_oid_to_idx(aio_req->oid); - if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) { + if (aio_req->create) { /* * If the object is newly created one, we need to update * the vdi object (metadata object). min_dirty_data_idx @@ -914,16 +920,16 @@ static void co_write_request(void *opaque) * We cannot use this descriptor for other operations because * the block driver may be on waiting response from the server. 
*/ -static int get_sheep_fd(BDRVSheepdogState *s) +static int get_sheep_fd(BDRVSheepdogState *s, Error **errp) { int fd; - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, errp); if (fd < 0) { return fd; } - qemu_aio_set_fd_handler(fd, co_read_response, NULL, s); + aio_set_fd_handler(s->aio_context, fd, co_read_response, NULL, s); return fd; } @@ -1061,7 +1067,7 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename, static int find_vdi_name(BDRVSheepdogState *s, const char *filename, uint32_t snapid, const char *tag, uint32_t *vid, - bool lock) + bool lock, Error **errp) { int ret, fd; SheepdogVdiReq hdr; @@ -1069,7 +1075,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename, unsigned int wlen, rlen = 0; char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, errp); if (fd < 0) { return fd; } @@ -1093,14 +1099,15 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename, hdr.snapid = snapid; hdr.flags = SD_FLAG_CMD_WRITE; - ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); if (ret) { + error_setg_errno(errp, -ret, "cannot get vdi info"); goto out; } if (rsp->result != SD_RES_SUCCESS) { - error_report("cannot get vdi info, %s, %s %d %s", - sd_strerror(rsp->result), filename, snapid, tag); + error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s", + sd_strerror(rsp->result), filename, snapid, tag); if (rsp->result == SD_RES_NO_VDI) { ret = -ENOENT; } else { @@ -1117,8 +1124,8 @@ out: } static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, - struct iovec *iov, int niov, bool create, - enum AIOCBState aiocb_type) + struct iovec *iov, int niov, + enum AIOCBState aiocb_type) { int nr_copies = s->inode.nr_copies; SheepdogObjReq hdr; @@ -1129,6 +1136,7 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, uint64_t offset = 
aio_req->offset; uint8_t flags = aio_req->flags; uint64_t old_oid = aio_req->base_oid; + bool create = aio_req->create; if (!nr_copies) { error_report("bug"); @@ -1173,7 +1181,8 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, qemu_co_mutex_lock(&s->lock); s->co_send = qemu_coroutine_self(); - qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request, s); + aio_set_fd_handler(s->aio_context, s->fd, + co_read_response, co_write_request, s); socket_set_cork(s->fd, 1); /* send a header */ @@ -1191,12 +1200,13 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, } out: socket_set_cork(s->fd, 0); - qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, s); + aio_set_fd_handler(s->aio_context, s->fd, co_read_response, NULL, s); s->co_send = NULL; qemu_co_mutex_unlock(&s->lock); } -static int read_write_object(int fd, char *buf, uint64_t oid, uint8_t copies, +static int read_write_object(int fd, AioContext *aio_context, char *buf, + uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool write, bool create, uint32_t cache_flags) { @@ -1229,7 +1239,7 @@ static int read_write_object(int fd, char *buf, uint64_t oid, uint8_t copies, hdr.offset = offset; hdr.copies = copies; - ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + ret = do_req(fd, aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); if (ret) { error_report("failed to send a request to the sheep"); return ret; @@ -1244,49 +1254,59 @@ static int read_write_object(int fd, char *buf, uint64_t oid, uint8_t copies, } } -static int read_object(int fd, char *buf, uint64_t oid, uint8_t copies, +static int read_object(int fd, AioContext *aio_context, char *buf, + uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, uint32_t cache_flags) { - return read_write_object(fd, buf, oid, copies, datalen, offset, false, + return read_write_object(fd, aio_context, buf, oid, copies, + datalen, offset, false, false, 
cache_flags); } -static int write_object(int fd, char *buf, uint64_t oid, uint8_t copies, +static int write_object(int fd, AioContext *aio_context, char *buf, + uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool create, uint32_t cache_flags) { - return read_write_object(fd, buf, oid, copies, datalen, offset, true, + return read_write_object(fd, aio_context, buf, oid, copies, + datalen, offset, true, create, cache_flags); } /* update inode with the latest state */ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag) { + Error *local_err = NULL; SheepdogInode *inode; int ret = 0, fd; uint32_t vid = 0; - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, &local_err); if (fd < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); return -EIO; } - inode = g_malloc(sizeof(s->inode)); + inode = g_malloc(SD_INODE_HEADER_SIZE); - ret = find_vdi_name(s, s->name, snapid, tag, &vid, false); + ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err); if (ret) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); goto out; } - ret = read_object(fd, (char *)inode, vid_to_vdi_oid(vid), - s->inode.nr_copies, sizeof(*inode), 0, s->cache_flags); + ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid), + s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0, + s->cache_flags); if (ret < 0) { goto out; } if (inode->vdi_id != s->inode.vdi_id) { - memcpy(&s->inode, inode, sizeof(s->inode)); + memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE); } out: @@ -1310,6 +1330,7 @@ static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req) DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid); aio_req->flags = 0; aio_req->base_oid = 0; + aio_req->create = false; QLIST_REMOVE(aio_req, aio_siblings); QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); return true; @@ -1322,7 +1343,8 @@ static bool check_simultaneous_create(BDRVSheepdogState 
*s, AIOReq *aio_req) static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) { SheepdogAIOCB *acb = aio_req->aiocb; - bool create = false; + + aio_req->create = false; /* check whether this request becomes a CoW one */ if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) { @@ -1340,20 +1362,36 @@ static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); aio_req->flags |= SD_FLAG_CMD_COW; } - create = true; + aio_req->create = true; } out: if (is_data_obj(aio_req->oid)) { - add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create, + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, acb->aiocb_type); } else { struct iovec iov; iov.iov_base = &s->inode; iov.iov_len = sizeof(s->inode); - add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); + add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); } } +static void sd_detach_aio_context(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + + aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL); +} + +static void sd_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVSheepdogState *s = bs->opaque; + + s->aio_context = new_context; + aio_set_fd_handler(new_context, s->fd, co_read_response, NULL, s); +} + /* TODO Convert to fine grained options */ static QemuOptsList runtime_opts = { .name = "sheepdog", @@ -1382,12 +1420,12 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, const char *filename; s->bs = bs; + s->aio_context = bdrv_get_aio_context(bs); opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); if (local_err) { - qerror_report_err(local_err); - error_free(local_err); + error_propagate(errp, local_err); ret = -EINVAL; goto out; } @@ -1408,15 +1446,16 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, ret = 
parse_vdiname(s, filename, vdi, &snapid, tag); } if (ret < 0) { + error_setg(errp, "Can't parse filename"); goto out; } - s->fd = get_sheep_fd(s); + s->fd = get_sheep_fd(s, errp); if (s->fd < 0) { ret = s->fd; goto out; } - ret = find_vdi_name(s, vdi, snapid, tag, &vid, true); + ret = find_vdi_name(s, vdi, snapid, tag, &vid, true, errp); if (ret) { goto out; } @@ -1436,19 +1475,20 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, s->is_snapshot = true; } - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, errp); if (fd < 0) { ret = fd; goto out; } buf = g_malloc(SD_INODE_SIZE); - ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0, - s->cache_flags); + ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), + 0, SD_INODE_SIZE, 0, s->cache_flags); closesocket(fd); if (ret) { + error_setg(errp, "Can't read snapshot inode"); goto out; } @@ -1463,7 +1503,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, g_free(buf); return 0; out: - qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL); if (s->fd >= 0) { closesocket(s->fd); } @@ -1472,7 +1512,8 @@ out: return ret; } -static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot) +static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, + Error **errp) { SheepdogVdiReq hdr; SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; @@ -1480,7 +1521,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot) unsigned int wlen, rlen = 0; char buf[SD_MAX_VDI_LEN]; - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, errp); if (fd < 0) { return fd; } @@ -1505,16 +1546,17 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot) hdr.copy_policy = s->inode.copy_policy; hdr.copies = s->inode.nr_copies; - ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, 
&wlen, &rlen); closesocket(fd); if (ret) { + error_setg_errno(errp, -ret, "create failed"); return ret; } if (rsp->result != SD_RES_SUCCESS) { - error_report("%s, %s", sd_strerror(rsp->result), s->inode.name); + error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name); return -EIO; } @@ -1525,21 +1567,18 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot) return 0; } -static int sd_prealloc(const char *filename) +static int sd_prealloc(const char *filename, Error **errp) { BlockDriverState *bs = NULL; uint32_t idx, max_idx; int64_t vdi_size; void *buf = g_malloc0(SD_DATA_OBJ_SIZE); - Error *local_err = NULL; int ret; ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - NULL, &local_err); + NULL, errp); if (ret < 0) { - qerror_report_err(local_err); - error_free(local_err); - goto out; + goto out_with_err_set; } vdi_size = bdrv_getlength(bs); @@ -1563,7 +1602,12 @@ static int sd_prealloc(const char *filename) goto out; } } + out: + if (ret < 0) { + error_setg_errno(errp, -ret, "Can't pre-allocate"); + } +out_with_err_set: if (bs) { bdrv_unref(bs); } @@ -1626,17 +1670,17 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt) return 0; } -static int sd_create(const char *filename, QEMUOptionParameter *options, +static int sd_create(const char *filename, QemuOpts *opts, Error **errp) { int ret = 0; uint32_t vid = 0; char *backing_file = NULL; + char *buf = NULL; BDRVSheepdogState *s; char tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid; bool prealloc = false; - Error *local_err = NULL; s = g_malloc0(sizeof(BDRVSheepdogState)); @@ -1647,38 +1691,35 @@ static int sd_create(const char *filename, QEMUOptionParameter *options, ret = parse_vdiname(s, filename, s->name, &snapid, tag); } if (ret < 0) { + error_setg(errp, "Can't parse filename"); goto out; } - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - s->inode.vdi_size = options->value.n; - } else if 
(!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { - backing_file = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { - if (!options->value.s || !strcmp(options->value.s, "off")) { - prealloc = false; - } else if (!strcmp(options->value.s, "full")) { - prealloc = true; - } else { - error_report("Invalid preallocation mode: '%s'", - options->value.s); - ret = -EINVAL; - goto out; - } - } else if (!strcmp(options->name, BLOCK_OPT_REDUNDANCY)) { - if (options->value.s) { - ret = parse_redundancy(s, options->value.s); - if (ret < 0) { - goto out; - } - } + s->inode.vdi_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + if (!buf || !strcmp(buf, "off")) { + prealloc = false; + } else if (!strcmp(buf, "full")) { + prealloc = true; + } else { + error_setg(errp, "Invalid preallocation mode: '%s'", buf); + ret = -EINVAL; + goto out; + } + + g_free(buf); + buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY); + if (buf) { + ret = parse_redundancy(s, buf); + if (ret < 0) { + error_setg(errp, "Invalid redundancy mode: '%s'", buf); + goto out; } - options++; } if (s->inode.vdi_size > SD_MAX_VDI_SIZE) { - error_report("too big image size"); + error_setg(errp, "too big image size"); ret = -EINVAL; goto out; } @@ -1691,24 +1732,22 @@ static int sd_create(const char *filename, QEMUOptionParameter *options, /* Currently, only Sheepdog backing image is supported. 
*/ drv = bdrv_find_protocol(backing_file, true); if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) { - error_report("backing_file must be a sheepdog image"); + error_setg(errp, "backing_file must be a sheepdog image"); ret = -EINVAL; goto out; } bs = NULL; ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, NULL, - &local_err); + errp); if (ret < 0) { - qerror_report_err(local_err); - error_free(local_err); goto out; } base = bs->opaque; if (!is_snapshot(&base->inode)) { - error_report("cannot clone from a non snapshot vdi"); + error_setg(errp, "cannot clone from a non snapshot vdi"); bdrv_unref(bs); ret = -EINVAL; goto out; @@ -1717,19 +1756,25 @@ static int sd_create(const char *filename, QEMUOptionParameter *options, bdrv_unref(bs); } - ret = do_sd_create(s, &vid, 0); - if (!prealloc || ret) { + s->aio_context = qemu_get_aio_context(); + ret = do_sd_create(s, &vid, 0, errp); + if (ret) { goto out; } - ret = sd_prealloc(filename); + if (prealloc) { + ret = sd_prealloc(filename, errp); + } out: + g_free(backing_file); + g_free(buf); g_free(s); return ret; } static void sd_close(BlockDriverState *bs) { + Error *local_err = NULL; BDRVSheepdogState *s = bs->opaque; SheepdogVdiReq hdr; SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; @@ -1738,8 +1783,10 @@ static void sd_close(BlockDriverState *bs) DPRINTF("%s\n", s->name); - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, &local_err); if (fd < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); return; } @@ -1751,7 +1798,8 @@ static void sd_close(BlockDriverState *bs) hdr.data_length = wlen; hdr.flags = SD_FLAG_CMD_WRITE; - ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen); + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + s->name, &wlen, &rlen); closesocket(fd); @@ -1760,7 +1808,7 @@ static void sd_close(BlockDriverState *bs) error_report("%s, %s", sd_strerror(rsp->result), s->name); } - qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); + 
aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL); closesocket(s->fd); g_free(s->host_spec); } @@ -1774,6 +1822,7 @@ static int64_t sd_getlength(BlockDriverState *bs) static int sd_truncate(BlockDriverState *bs, int64_t offset) { + Error *local_err = NULL; BDRVSheepdogState *s = bs->opaque; int ret, fd; unsigned int datalen; @@ -1786,16 +1835,19 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) return -EINVAL; } - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, &local_err); if (fd < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); return fd; } /* we don't need to update entire object */ datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); s->inode.vdi_size = offset; - ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id), - s->inode.nr_copies, datalen, 0, false, s->cache_flags); + ret = write_object(fd, s->aio_context, (char *)&s->inode, + vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, + datalen, 0, false, s->cache_flags); close(fd); if (ret < 0) { @@ -1831,9 +1883,9 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) iov.iov_base = &s->inode; iov.iov_len = sizeof(s->inode); aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), - data_len, offset, 0, 0, offset); + data_len, offset, 0, false, 0, offset); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); + add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); acb->aio_done_func = sd_finish_aiocb; acb->aiocb_type = AIOCB_WRITE_UDATA; @@ -1846,6 +1898,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) /* Delete current working VDI on the snapshot chain */ static bool sd_delete(BDRVSheepdogState *s) { + Error *local_err = NULL; unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; SheepdogVdiReq hdr = { .opcode = SD_OP_DEL_VDI, @@ -1856,12 +1909,15 @@ static bool sd_delete(BDRVSheepdogState *s) SheepdogVdiRsp *rsp 
= (SheepdogVdiRsp *)&hdr; int fd, ret; - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, &local_err); if (fd < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); return false; } - ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen); + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + s->name, &wlen, &rlen); closesocket(fd); if (ret) { return false; @@ -1885,6 +1941,7 @@ static bool sd_delete(BDRVSheepdogState *s) */ static int sd_create_branch(BDRVSheepdogState *s) { + Error *local_err = NULL; int ret, fd; uint32_t vid; char *buf; @@ -1900,21 +1957,25 @@ static int sd_create_branch(BDRVSheepdogState *s) * false bail out. */ deleted = sd_delete(s); - ret = do_sd_create(s, &vid, !deleted); + ret = do_sd_create(s, &vid, !deleted, &local_err); if (ret) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); goto out; } DPRINTF("%" PRIx32 " is created.\n", vid); - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, &local_err); if (fd < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); ret = fd; goto out; } - ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies, - SD_INODE_SIZE, 0, s->cache_flags); + ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), + s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags); closesocket(fd); @@ -2023,7 +2084,8 @@ static int coroutine_fn sd_co_rw_vector(void *p) DPRINTF("new oid %" PRIx64 "\n", oid); } - aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done); + aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create, + old_oid, done); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); if (create) { @@ -2032,7 +2094,7 @@ static int coroutine_fn sd_co_rw_vector(void *p) } } - add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create, + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, acb->aiocb_type); done: offset = 0; @@ -2112,9 +2174,9 @@ static int 
coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) acb->aio_done_func = sd_finish_aiocb; aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), - 0, 0, 0, 0, 0); + 0, 0, 0, false, 0, 0); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type); + add_aio_request(s, aio_req, NULL, 0, acb->aiocb_type); qemu_coroutine_yield(); return acb->ret; @@ -2122,6 +2184,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) { + Error *local_err = NULL; BDRVSheepdogState *s = bs->opaque; int ret, fd; uint32_t new_vid; @@ -2149,32 +2212,37 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag)); /* we don't need to update entire object */ datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); + inode = g_malloc(datalen); /* refresh inode. */ - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, &local_err); if (fd < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); ret = fd; goto cleanup; } - ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id), - s->inode.nr_copies, datalen, 0, false, s->cache_flags); + ret = write_object(fd, s->aio_context, (char *)&s->inode, + vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, + datalen, 0, false, s->cache_flags); if (ret < 0) { error_report("failed to write snapshot's inode."); goto cleanup; } - ret = do_sd_create(s, &new_vid, 1); + ret = do_sd_create(s, &new_vid, 1, &local_err); if (ret < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); error_report("failed to create inode for snapshot. 
%s", strerror(errno)); goto cleanup; } - inode = (SheepdogInode *)g_malloc(datalen); - - ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid), - s->inode.nr_copies, datalen, 0, s->cache_flags); + ret = read_object(fd, s->aio_context, (char *)inode, + vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0, + s->cache_flags); if (ret < 0) { error_report("failed to read new inode info. %s", strerror(errno)); @@ -2186,6 +2254,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) s->inode.name, s->inode.snap_id, s->inode.vdi_id); cleanup: + g_free(inode); closesocket(fd); return ret; } @@ -2249,6 +2318,7 @@ static int sd_snapshot_delete(BlockDriverState *bs, static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) { + Error *local_err = NULL; BDRVSheepdogState *s = bs->opaque; SheepdogReq req; int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long); @@ -2263,8 +2333,10 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) vdi_inuse = g_malloc(max); - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, &local_err); if (fd < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); ret = fd; goto out; } @@ -2277,7 +2349,8 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) req.opcode = SD_OP_READ_VDIS; req.data_length = max; - ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen); + ret = do_req(fd, s->aio_context, (SheepdogReq *)&req, + vdi_inuse, &wlen, &rlen); closesocket(fd); if (ret) { @@ -2290,8 +2363,10 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT); start_nr = hval & (SD_NR_VDIS - 1); - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, &local_err); if (fd < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); ret = fd; goto out; } @@ -2302,7 +2377,8 @@ static int 
sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) } /* we don't need to read entire object */ - ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid), + ret = read_object(fd, s->aio_context, (char *)&inode, + vid_to_vdi_oid(vid), 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0, s->cache_flags); @@ -2316,8 +2392,8 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) sn_tab[found].vm_state_size = inode.vm_state_size; sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec; - snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u", - inode.snap_id); + snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), + "%" PRIu32, inode.snap_id); pstrcpy(sn_tab[found].name, MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)), inode.tag); @@ -2341,6 +2417,7 @@ out: static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, int64_t pos, int size, int load) { + Error *local_err = NULL; bool create; int fd, ret = 0, remaining = size; unsigned int data_len; @@ -2349,8 +2426,10 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, uint32_t vdi_index; uint32_t vdi_id = load ? 
s->inode.parent_vdi_id : s->inode.vdi_id; - fd = connect_to_sdog(s); + fd = connect_to_sdog(s, &local_err); if (fd < 0) { + error_report("%s", error_get_pretty(local_err));; + error_free(local_err); return fd; } @@ -2364,11 +2443,11 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, create = (offset == 0); if (load) { - ret = read_object(fd, (char *)data, vmstate_oid, + ret = read_object(fd, s->aio_context, (char *)data, vmstate_oid, s->inode.nr_copies, data_len, offset, s->cache_flags); } else { - ret = write_object(fd, (char *)data, vmstate_oid, + ret = write_object(fd, s->aio_context, (char *)data, vmstate_oid, s->inode.nr_copies, data_len, offset, create, s->cache_flags); } @@ -2490,28 +2569,32 @@ static int64_t sd_get_allocated_file_size(BlockDriverState *bs) return size; } -static QEMUOptionParameter sd_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_PREALLOC, - .type = OPT_STRING, - .help = "Preallocation mode (allowed values: off, full)" - }, - { - .name = BLOCK_OPT_REDUNDANCY, - .type = OPT_STRING, - .help = "Redundancy of the image" - }, - { NULL } +static QemuOptsList sd_create_opts = { + .name = "sheepdog-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, + { + .name = BLOCK_OPT_REDUNDANCY, + .type = QEMU_OPT_STRING, + .help = "Redundancy of the image" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_sheepdog = { @@ -2541,7 +2624,10 @@ static BlockDriver bdrv_sheepdog = { .bdrv_save_vmstate = 
sd_save_vmstate, .bdrv_load_vmstate = sd_load_vmstate, - .create_options = sd_create_options, + .bdrv_detach_aio_context = sd_detach_aio_context, + .bdrv_attach_aio_context = sd_attach_aio_context, + + .create_opts = &sd_create_opts, }; static BlockDriver bdrv_sheepdog_tcp = { @@ -2571,7 +2657,10 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_save_vmstate = sd_save_vmstate, .bdrv_load_vmstate = sd_load_vmstate, - .create_options = sd_create_options, + .bdrv_detach_aio_context = sd_detach_aio_context, + .bdrv_attach_aio_context = sd_attach_aio_context, + + .create_opts = &sd_create_opts, }; static BlockDriver bdrv_sheepdog_unix = { @@ -2601,7 +2690,10 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_save_vmstate = sd_save_vmstate, .bdrv_load_vmstate = sd_load_vmstate, - .create_options = sd_create_options, + .bdrv_detach_aio_context = sd_detach_aio_context, + .bdrv_attach_aio_context = sd_attach_aio_context, + + .create_opts = &sd_create_opts, }; static void bdrv_sheepdog_init(void) diff --git a/block/ssh.c b/block/ssh.c index aa63c9d20..cd2fd751f 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -106,30 +106,59 @@ static void ssh_state_free(BDRVSSHState *s) } } -/* Wrappers around error_report which make sure to dump as much - * information from libssh2 as possible. - */ -static void GCC_FMT_ATTR(2, 3) -session_error_report(BDRVSSHState *s, const char *fs, ...) +static void GCC_FMT_ATTR(3, 4) +session_error_setg(Error **errp, BDRVSSHState *s, const char *fs, ...) { va_list args; + char *msg; va_start(args, fs); - error_vprintf(fs, args); + msg = g_strdup_vprintf(fs, args); + va_end(args); - if ((s)->session) { + if (s->session) { char *ssh_err; int ssh_err_code; - libssh2_session_last_error((s)->session, &ssh_err, NULL, 0); /* This is not an errno. See <libssh2.h>. 
*/ - ssh_err_code = libssh2_session_last_errno((s)->session); - - error_printf(": %s (libssh2 error code: %d)", ssh_err, ssh_err_code); + ssh_err_code = libssh2_session_last_error(s->session, + &ssh_err, NULL, 0); + error_setg(errp, "%s: %s (libssh2 error code: %d)", + msg, ssh_err, ssh_err_code); + } else { + error_setg(errp, "%s", msg); } + g_free(msg); +} + +static void GCC_FMT_ATTR(3, 4) +sftp_error_setg(Error **errp, BDRVSSHState *s, const char *fs, ...) +{ + va_list args; + char *msg; + va_start(args, fs); + msg = g_strdup_vprintf(fs, args); va_end(args); - error_printf("\n"); + + if (s->sftp) { + char *ssh_err; + int ssh_err_code; + unsigned long sftp_err_code; + + /* This is not an errno. See <libssh2.h>. */ + ssh_err_code = libssh2_session_last_error(s->session, + &ssh_err, NULL, 0); + /* See <libssh2_sftp.h>. */ + sftp_err_code = libssh2_sftp_last_error((s)->sftp); + + error_setg(errp, + "%s: %s (libssh2 error code: %d, sftp error code: %lu)", + msg, ssh_err, ssh_err_code, sftp_err_code); + } else { + error_setg(errp, "%s", msg); + } + g_free(msg); } static void GCC_FMT_ATTR(2, 3) @@ -145,9 +174,9 @@ sftp_error_report(BDRVSSHState *s, const char *fs, ...) int ssh_err_code; unsigned long sftp_err_code; - libssh2_session_last_error((s)->session, &ssh_err, NULL, 0); /* This is not an errno. See <libssh2.h>. */ - ssh_err_code = libssh2_session_last_errno((s)->session); + ssh_err_code = libssh2_session_last_error(s->session, + &ssh_err, NULL, 0); /* See <libssh2_sftp.h>. 
*/ sftp_err_code = libssh2_sftp_last_error((s)->sftp); @@ -243,7 +272,7 @@ static void ssh_parse_filename(const char *filename, QDict *options, } static int check_host_key_knownhosts(BDRVSSHState *s, - const char *host, int port) + const char *host, int port, Error **errp) { const char *home; char *knh_file = NULL; @@ -257,14 +286,15 @@ static int check_host_key_knownhosts(BDRVSSHState *s, hostkey = libssh2_session_hostkey(s->session, &len, &type); if (!hostkey) { ret = -EINVAL; - session_error_report(s, "failed to read remote host key"); + session_error_setg(errp, s, "failed to read remote host key"); goto out; } knh = libssh2_knownhost_init(s->session); if (!knh) { ret = -EINVAL; - session_error_report(s, "failed to initialize known hosts support"); + session_error_setg(errp, s, + "failed to initialize known hosts support"); goto out; } @@ -289,21 +319,23 @@ static int check_host_key_knownhosts(BDRVSSHState *s, break; case LIBSSH2_KNOWNHOST_CHECK_MISMATCH: ret = -EINVAL; - session_error_report(s, "host key does not match the one in known_hosts (found key %s)", - found->key); + session_error_setg(errp, s, + "host key does not match the one in known_hosts" + " (found key %s)", found->key); goto out; case LIBSSH2_KNOWNHOST_CHECK_NOTFOUND: ret = -EINVAL; - session_error_report(s, "no host key was found in known_hosts"); + session_error_setg(errp, s, "no host key was found in known_hosts"); goto out; case LIBSSH2_KNOWNHOST_CHECK_FAILURE: ret = -EINVAL; - session_error_report(s, "failure matching the host key with known_hosts"); + session_error_setg(errp, s, + "failure matching the host key with known_hosts"); goto out; default: ret = -EINVAL; - session_error_report(s, "unknown error matching the host key with known_hosts (%d)", - r); + session_error_setg(errp, s, "unknown error matching the host key" + " with known_hosts (%d)", r); goto out; } @@ -358,20 +390,20 @@ static int compare_fingerprint(const unsigned char *fingerprint, size_t len, static int 
check_host_key_hash(BDRVSSHState *s, const char *hash, - int hash_type, size_t fingerprint_len) + int hash_type, size_t fingerprint_len, Error **errp) { const char *fingerprint; fingerprint = libssh2_hostkey_hash(s->session, hash_type); if (!fingerprint) { - session_error_report(s, "failed to read remote host key"); + session_error_setg(errp, s, "failed to read remote host key"); return -EINVAL; } if(compare_fingerprint((unsigned char *) fingerprint, fingerprint_len, hash) != 0) { - error_report("remote host key does not match host_key_check '%s'", - hash); + error_setg(errp, "remote host key does not match host_key_check '%s'", + hash); return -EPERM; } @@ -379,7 +411,7 @@ check_host_key_hash(BDRVSSHState *s, const char *hash, } static int check_host_key(BDRVSSHState *s, const char *host, int port, - const char *host_key_check) + const char *host_key_check, Error **errp) { /* host_key_check=no */ if (strcmp(host_key_check, "no") == 0) { @@ -389,25 +421,25 @@ static int check_host_key(BDRVSSHState *s, const char *host, int port, /* host_key_check=md5:xx:yy:zz:... */ if (strncmp(host_key_check, "md5:", 4) == 0) { return check_host_key_hash(s, &host_key_check[4], - LIBSSH2_HOSTKEY_HASH_MD5, 16); + LIBSSH2_HOSTKEY_HASH_MD5, 16, errp); } /* host_key_check=sha1:xx:yy:zz:... 
*/ if (strncmp(host_key_check, "sha1:", 5) == 0) { return check_host_key_hash(s, &host_key_check[5], - LIBSSH2_HOSTKEY_HASH_SHA1, 20); + LIBSSH2_HOSTKEY_HASH_SHA1, 20, errp); } /* host_key_check=yes */ if (strcmp(host_key_check, "yes") == 0) { - return check_host_key_knownhosts(s, host, port); + return check_host_key_knownhosts(s, host, port, errp); } - error_report("unknown host_key_check setting (%s)", host_key_check); + error_setg(errp, "unknown host_key_check setting (%s)", host_key_check); return -EINVAL; } -static int authenticate(BDRVSSHState *s, const char *user) +static int authenticate(BDRVSSHState *s, const char *user, Error **errp) { int r, ret; const char *userauthlist; @@ -418,7 +450,8 @@ static int authenticate(BDRVSSHState *s, const char *user) userauthlist = libssh2_userauth_list(s->session, user, strlen(user)); if (strstr(userauthlist, "publickey") == NULL) { ret = -EPERM; - error_report("remote server does not support \"publickey\" authentication"); + error_setg(errp, + "remote server does not support \"publickey\" authentication"); goto out; } @@ -426,17 +459,18 @@ static int authenticate(BDRVSSHState *s, const char *user) agent = libssh2_agent_init(s->session); if (!agent) { ret = -EINVAL; - session_error_report(s, "failed to initialize ssh-agent support"); + session_error_setg(errp, s, "failed to initialize ssh-agent support"); goto out; } if (libssh2_agent_connect(agent)) { ret = -ECONNREFUSED; - session_error_report(s, "failed to connect to ssh-agent"); + session_error_setg(errp, s, "failed to connect to ssh-agent"); goto out; } if (libssh2_agent_list_identities(agent)) { ret = -EINVAL; - session_error_report(s, "failed requesting identities from ssh-agent"); + session_error_setg(errp, s, + "failed requesting identities from ssh-agent"); goto out; } @@ -447,7 +481,8 @@ static int authenticate(BDRVSSHState *s, const char *user) } if (r < 0) { ret = -EINVAL; - session_error_report(s, "failed to obtain identity from ssh-agent"); + 
session_error_setg(errp, s, + "failed to obtain identity from ssh-agent"); goto out; } r = libssh2_agent_userauth(agent, user, identity); @@ -461,8 +496,8 @@ static int authenticate(BDRVSSHState *s, const char *user) } ret = -EPERM; - error_report("failed to authenticate using publickey authentication " - "and the identities held by your ssh-agent"); + error_setg(errp, "failed to authenticate using publickey authentication " + "and the identities held by your ssh-agent"); out: if (agent != NULL) { @@ -476,10 +511,9 @@ static int authenticate(BDRVSSHState *s, const char *user) } static int connect_to_ssh(BDRVSSHState *s, QDict *options, - int ssh_flags, int creat_mode) + int ssh_flags, int creat_mode, Error **errp) { int r, ret; - Error *err = NULL; const char *host, *user, *path, *host_key_check; int port; @@ -498,6 +532,7 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, } else { user = g_get_user_name(); if (!user) { + error_setg_errno(errp, errno, "Can't get user name"); ret = -errno; goto err; } @@ -514,11 +549,9 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, s->hostport = g_strdup_printf("%s:%d", host, port); /* Open the socket and connect. 
*/ - s->sock = inet_connect(s->hostport, &err); - if (err != NULL) { + s->sock = inet_connect(s->hostport, errp); + if (s->sock < 0) { ret = -errno; - qerror_report_err(err); - error_free(err); goto err; } @@ -526,7 +559,7 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, s->session = libssh2_session_init(); if (!s->session) { ret = -EINVAL; - session_error_report(s, "failed to initialize libssh2 session"); + session_error_setg(errp, s, "failed to initialize libssh2 session"); goto err; } @@ -537,18 +570,18 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, r = libssh2_session_handshake(s->session, s->sock); if (r != 0) { ret = -EINVAL; - session_error_report(s, "failed to establish SSH session"); + session_error_setg(errp, s, "failed to establish SSH session"); goto err; } /* Check the remote host's key against known_hosts. */ - ret = check_host_key(s, host, port, host_key_check); + ret = check_host_key(s, host, port, host_key_check, errp); if (ret < 0) { goto err; } /* Authenticate. */ - ret = authenticate(s, user); + ret = authenticate(s, user, errp); if (ret < 0) { goto err; } @@ -556,7 +589,7 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, /* Start SFTP. 
*/ s->sftp = libssh2_sftp_init(s->session); if (!s->sftp) { - session_error_report(s, "failed to initialize sftp handle"); + session_error_setg(errp, s, "failed to initialize sftp handle"); ret = -EINVAL; goto err; } @@ -566,14 +599,14 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, path, ssh_flags, creat_mode); s->sftp_handle = libssh2_sftp_open(s->sftp, path, ssh_flags, creat_mode); if (!s->sftp_handle) { - session_error_report(s, "failed to open remote file '%s'", path); + session_error_setg(errp, s, "failed to open remote file '%s'", path); ret = -EINVAL; goto err; } r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs); if (r < 0) { - sftp_error_report(s, "failed to read file attributes"); + sftp_error_setg(errp, s, "failed to read file attributes"); return -EINVAL; } @@ -623,7 +656,7 @@ static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, } /* Start up SSH. */ - ret = connect_to_ssh(s, options, ssh_flags, 0); + ret = connect_to_ssh(s, options, ssh_flags, 0, errp); if (ret < 0) { goto err; } @@ -642,20 +675,22 @@ static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, return ret; } -static QEMUOptionParameter ssh_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { NULL } +static QemuOptsList ssh_create_opts = { + .name = "ssh-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(ssh_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } }; -static int ssh_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int ssh_create(const char *filename, QemuOpts *opts, Error **errp) { int r, ret; - Error *local_err = NULL; int64_t total_size = 0; QDict *uri_options = NULL; BDRVSSHState s; @@ -665,26 +700,20 @@ static int ssh_create(const char *filename, QEMUOptionParameter *options, ssh_state_init(&s); /* Get desired file size. 
*/ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - total_size = options->value.n; - } - options++; - } + total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); DPRINTF("total_size=%" PRIi64, total_size); uri_options = qdict_new(); - r = parse_uri(filename, uri_options, &local_err); + r = parse_uri(filename, uri_options, errp); if (r < 0) { - qerror_report_err(local_err); - error_free(local_err); ret = r; goto out; } r = connect_to_ssh(&s, uri_options, LIBSSH2_FXF_READ|LIBSSH2_FXF_WRITE| - LIBSSH2_FXF_CREAT|LIBSSH2_FXF_TRUNC, 0644); + LIBSSH2_FXF_CREAT|LIBSSH2_FXF_TRUNC, + 0644, errp); if (r < 0) { ret = r; goto out; @@ -694,7 +723,7 @@ static int ssh_create(const char *filename, QEMUOptionParameter *options, libssh2_sftp_seek64(s.sftp_handle, total_size-1); r2 = libssh2_sftp_write(s.sftp_handle, c, 1); if (r2 < 0) { - sftp_error_report(&s, "truncate failed"); + sftp_error_setg(errp, &s, "truncate failed"); ret = -EINVAL; goto out; } @@ -742,7 +771,7 @@ static void restart_coroutine(void *opaque) qemu_coroutine_enter(co, NULL); } -static coroutine_fn void set_fd_handler(BDRVSSHState *s) +static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs) { int r; IOHandler *rd_handler = NULL, *wr_handler = NULL; @@ -760,24 +789,26 @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s) DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock, rd_handler, wr_handler); - qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, co); + aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, + rd_handler, wr_handler, co); } -static coroutine_fn void clear_fd_handler(BDRVSSHState *s) +static coroutine_fn void clear_fd_handler(BDRVSSHState *s, + BlockDriverState *bs) { DPRINTF("s->sock=%d", s->sock); - qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL); + aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, NULL, NULL, NULL); } /* A non-blocking call returned EAGAIN, so yield, ensuring the * handlers are set up so 
that we'll be rescheduled when there is an * interesting event on the socket. */ -static coroutine_fn void co_yield(BDRVSSHState *s) +static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs) { - set_fd_handler(s); + set_fd_handler(s, bs); qemu_coroutine_yield(); - clear_fd_handler(s); + clear_fd_handler(s, bs); } /* SFTP has a function `libssh2_sftp_seek64' which seeks to a position @@ -807,7 +838,7 @@ static void ssh_seek(BDRVSSHState *s, int64_t offset, int flags) } } -static coroutine_fn int ssh_read(BDRVSSHState *s, +static coroutine_fn int ssh_read(BDRVSSHState *s, BlockDriverState *bs, int64_t offset, size_t size, QEMUIOVector *qiov) { @@ -840,7 +871,7 @@ static coroutine_fn int ssh_read(BDRVSSHState *s, DPRINTF("sftp_read returned %zd", r); if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { - co_yield(s); + co_yield(s, bs); goto again; } if (r < 0) { @@ -875,14 +906,14 @@ static coroutine_fn int ssh_co_readv(BlockDriverState *bs, int ret; qemu_co_mutex_lock(&s->lock); - ret = ssh_read(s, sector_num * BDRV_SECTOR_SIZE, + ret = ssh_read(s, bs, sector_num * BDRV_SECTOR_SIZE, nb_sectors * BDRV_SECTOR_SIZE, qiov); qemu_co_mutex_unlock(&s->lock); return ret; } -static int ssh_write(BDRVSSHState *s, +static int ssh_write(BDRVSSHState *s, BlockDriverState *bs, int64_t offset, size_t size, QEMUIOVector *qiov) { @@ -910,7 +941,7 @@ static int ssh_write(BDRVSSHState *s, DPRINTF("sftp_write returned %zd", r); if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { - co_yield(s); + co_yield(s, bs); goto again; } if (r < 0) { @@ -929,7 +960,7 @@ static int ssh_write(BDRVSSHState *s, */ if (r == 0) { ssh_seek(s, offset + written, SSH_SEEK_WRITE|SSH_SEEK_FORCE); - co_yield(s); + co_yield(s, bs); goto again; } @@ -957,7 +988,7 @@ static coroutine_fn int ssh_co_writev(BlockDriverState *bs, int ret; qemu_co_mutex_lock(&s->lock); - ret = ssh_write(s, sector_num * BDRV_SECTOR_SIZE, + ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE, 
nb_sectors * BDRV_SECTOR_SIZE, qiov); qemu_co_mutex_unlock(&s->lock); @@ -978,7 +1009,7 @@ static void unsafe_flush_warning(BDRVSSHState *s, const char *what) #ifdef HAS_LIBSSH2_SFTP_FSYNC -static coroutine_fn int ssh_flush(BDRVSSHState *s) +static coroutine_fn int ssh_flush(BDRVSSHState *s, BlockDriverState *bs) { int r; @@ -986,7 +1017,7 @@ static coroutine_fn int ssh_flush(BDRVSSHState *s) again: r = libssh2_sftp_fsync(s->sftp_handle); if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { - co_yield(s); + co_yield(s, bs); goto again; } if (r == LIBSSH2_ERROR_SFTP_PROTOCOL && @@ -1008,7 +1039,7 @@ static coroutine_fn int ssh_co_flush(BlockDriverState *bs) int ret; qemu_co_mutex_lock(&s->lock); - ret = ssh_flush(s); + ret = ssh_flush(s, bs); qemu_co_mutex_unlock(&s->lock); return ret; @@ -1051,7 +1082,7 @@ static BlockDriver bdrv_ssh = { .bdrv_co_writev = ssh_co_writev, .bdrv_getlength = ssh_getlength, .bdrv_co_flush_to_disk = ssh_co_flush, - .create_options = ssh_create_options, + .create_opts = &ssh_create_opts, }; static void bdrv_ssh_init(void) diff --git a/block/stream.c b/block/stream.c index dd0b4ac3d..cdea3e8d0 100644 --- a/block/stream.c +++ b/block/stream.c @@ -32,7 +32,7 @@ typedef struct StreamBlockJob { RateLimit limit; BlockDriverState *base; BlockdevOnError on_error; - char backing_file_id[1024]; + char *backing_file_str; } StreamBlockJob; static int coroutine_fn stream_populate(BlockDriverState *bs, @@ -60,7 +60,7 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, /* Must assign before bdrv_delete() to prevent traversing dangling pointer * while we delete backing image instances. 
*/ - top->backing_hd = base; + bdrv_set_backing_hd(top, base); while (intermediate) { BlockDriverState *unused; @@ -72,11 +72,11 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, unused = intermediate; intermediate = intermediate->backing_hd; - unused->backing_hd = NULL; + bdrv_set_backing_hd(unused, NULL); bdrv_unref(unused); } - bdrv_refresh_limits(top); + bdrv_refresh_limits(top, NULL); } static void coroutine_fn stream_run(void *opaque) @@ -159,14 +159,14 @@ wait: BlockErrorAction action = block_job_error_action(&s->common, s->common.bs, s->on_error, true, -ret); - if (action == BDRV_ACTION_STOP) { + if (action == BLOCK_ERROR_ACTION_STOP) { n = 0; continue; } if (error == 0) { error = ret; } - if (action == BDRV_ACTION_REPORT) { + if (action == BLOCK_ERROR_ACTION_REPORT) { break; } } @@ -186,7 +186,7 @@ wait: if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) { const char *base_id = NULL, *base_fmt = NULL; if (base) { - base_id = s->backing_file_id; + base_id = s->backing_file_str; if (base->drv) { base_fmt = base->drv->format_name; } @@ -196,6 +196,7 @@ wait: } qemu_vfree(buf); + g_free(s->backing_file_str); block_job_completed(&s->common, ret); } @@ -217,7 +218,7 @@ static const BlockJobDriver stream_job_driver = { }; void stream_start(BlockDriverState *bs, BlockDriverState *base, - const char *base_id, int64_t speed, + const char *backing_file_str, int64_t speed, BlockdevOnError on_error, BlockDriverCompletionFunc *cb, void *opaque, Error **errp) @@ -237,9 +238,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, } s->base = base; - if (base_id) { - pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id); - } + s->backing_file_str = g_strdup(backing_file_str); s->on_error = on_error; s->common.co = qemu_coroutine_create(stream_run); diff --git a/block/vdi.c b/block/vdi.c index 820cd376b..197bd77c9 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -53,6 +53,13 @@ #include 
"block/block_int.h" #include "qemu/module.h" #include "migration/migration.h" +#ifdef __linux__ +#include <linux/fs.h> +#include <sys/ioctl.h> +#ifndef FS_NOCOW_FL +#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#endif +#endif #if defined(CONFIG_UUID) #include <uuid/uuid.h> @@ -239,7 +246,6 @@ static void vdi_header_to_le(VdiHeader *header) cpu_to_le32s(&header->block_extra); cpu_to_le32s(&header->blocks_in_image); cpu_to_le32s(&header->blocks_allocated); - cpu_to_le32s(&header->blocks_allocated); uuid_convert(header->uuid_image); uuid_convert(header->uuid_last_snap); uuid_convert(header->uuid_link); @@ -408,34 +414,35 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, } if (header.signature != VDI_SIGNATURE) { - error_setg(errp, "Image not in VDI format (bad signature %08x)", header.signature); + error_setg(errp, "Image not in VDI format (bad signature %08" PRIx32 + ")", header.signature); ret = -EINVAL; goto fail; } else if (header.version != VDI_VERSION_1_1) { - error_setg(errp, "unsupported VDI image (version %u.%u)", - header.version >> 16, header.version & 0xffff); + error_setg(errp, "unsupported VDI image (version %" PRIu32 ".%" PRIu32 + ")", header.version >> 16, header.version & 0xffff); ret = -ENOTSUP; goto fail; } else if (header.offset_bmap % SECTOR_SIZE != 0) { /* We only support block maps which start on a sector boundary. */ error_setg(errp, "unsupported VDI image (unaligned block map offset " - "0x%x)", header.offset_bmap); + "0x%" PRIx32 ")", header.offset_bmap); ret = -ENOTSUP; goto fail; } else if (header.offset_data % SECTOR_SIZE != 0) { /* We only support data blocks which start on a sector boundary. 
*/ - error_setg(errp, "unsupported VDI image (unaligned data offset 0x%x)", - header.offset_data); + error_setg(errp, "unsupported VDI image (unaligned data offset 0x%" + PRIx32 ")", header.offset_data); ret = -ENOTSUP; goto fail; } else if (header.sector_size != SECTOR_SIZE) { - error_setg(errp, "unsupported VDI image (sector size %u is not %u)", - header.sector_size, SECTOR_SIZE); + error_setg(errp, "unsupported VDI image (sector size %" PRIu32 + " is not %u)", header.sector_size, SECTOR_SIZE); ret = -ENOTSUP; goto fail; } else if (header.block_size != DEFAULT_CLUSTER_SIZE) { - error_setg(errp, "unsupported VDI image (block size %u is not %u)", - header.block_size, DEFAULT_CLUSTER_SIZE); + error_setg(errp, "unsupported VDI image (block size %" PRIu32 + " is not %u)", header.block_size, DEFAULT_CLUSTER_SIZE); ret = -ENOTSUP; goto fail; } else if (header.disk_size > @@ -672,8 +679,7 @@ static int vdi_co_write(BlockDriverState *bs, return ret; } -static int vdi_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) { int fd; int result = 0; @@ -684,29 +690,24 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options, VdiHeader header; size_t i; size_t bmap_size; + bool nocow = false; logout("\n"); /* Read out options. */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - bytes = options->value.n; + bytes = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); #if defined(CONFIG_VDI_BLOCK_SIZE) - } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { - if (options->value.n) { - /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). */ - block_size = options->value.n; - } + /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). 
*/ + block_size = qemu_opt_get_size_del(opts, + BLOCK_OPT_CLUSTER_SIZE, + DEFAULT_CLUSTER_SIZE); #endif #if defined(CONFIG_VDI_STATIC_IMAGE) - } else if (!strcmp(options->name, BLOCK_OPT_STATIC)) { - if (options->value.n) { - image_type = VDI_TYPE_STATIC; - } -#endif - } - options++; + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_STATIC, false)) { + image_type = VDI_TYPE_STATIC; } +#endif + nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false); if (bytes > VDI_DISK_SIZE_MAX) { result = -ENOTSUP; @@ -724,6 +725,21 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options, goto exit; } + if (nocow) { +#ifdef __linux__ + /* Set NOCOW flag to solve performance issue on fs like btrfs. + * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will + * be ignored since any failure of this operation should not block the + * left work. + */ + int attr; + if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { + attr |= FS_NOCOW_FL; + ioctl(fd, FS_IOC_SETFLAGS, &attr); + } +#endif + } + /* We need enough blocks to store the given disk size, so always round up. 
*/ blocks = (bytes + block_size - 1) / block_size; @@ -755,6 +771,7 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options, vdi_header_to_le(&header); if (write(fd, &header, sizeof(header)) < 0) { result = -errno; + goto close_and_exit; } if (bmap_size > 0) { @@ -768,6 +785,8 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options, } if (write(fd, bmap, bmap_size) < 0) { result = -errno; + g_free(bmap); + goto close_and_exit; } g_free(bmap); } @@ -775,10 +794,12 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options, if (image_type == VDI_TYPE_STATIC) { if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) { result = -errno; + goto close_and_exit; } } - if (close(fd) < 0) { +close_and_exit: + if ((close(fd) < 0) && !result) { result = -errno; } @@ -796,29 +817,39 @@ static void vdi_close(BlockDriverState *bs) error_free(s->migration_blocker); } -static QEMUOptionParameter vdi_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, +static QemuOptsList vdi_create_opts = { + .name = "vdi-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(vdi_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, #if defined(CONFIG_VDI_BLOCK_SIZE) - { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = OPT_SIZE, - .help = "VDI cluster (block) size", - .value = { .n = DEFAULT_CLUSTER_SIZE }, - }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "VDI cluster (block) size", + .def_value_str = stringify(DEFAULT_CLUSTER_SIZE) + }, #endif #if defined(CONFIG_VDI_STATIC_IMAGE) - { - .name = BLOCK_OPT_STATIC, - .type = OPT_FLAG, - .help = "VDI static (pre-allocated) image" - }, + { + .name = BLOCK_OPT_STATIC, + .type = QEMU_OPT_BOOL, + .help = "VDI static (pre-allocated) image", + .def_value_str = "off" + }, #endif - /* TODO: An additional option to set UUID values might be useful. 
*/ - { NULL } + { + .name = BLOCK_OPT_NOCOW, + .type = QEMU_OPT_BOOL, + .help = "Turn off copy-on-write (valid only on btrfs)" + }, + /* TODO: An additional option to set UUID values might be useful. */ + { /* end of list */ } + } }; static BlockDriver bdrv_vdi = { @@ -840,7 +871,7 @@ static BlockDriver bdrv_vdi = { .bdrv_get_info = vdi_get_info, - .create_options = vdi_create_options, + .create_opts = &vdi_create_opts, .bdrv_check = vdi_check, }; diff --git a/block/vhdx.c b/block/vhdx.c index 509baaf48..fedcf9f9c 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -473,7 +473,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, } else if (h2_seq > h1_seq) { s->curr_header = 1; } else { - goto fail; + /* The Microsoft Disk2VHD tool will create 2 identical + * headers, with identical sequence numbers. If the headers are + * identical, don't consider the file corrupt */ + if (!memcmp(header1, header2, sizeof(VHDXHeader))) { + s->curr_header = 0; + } else { + goto fail; + } } } @@ -1716,8 +1723,7 @@ exit: * .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. 
* 1MB */ -static int vhdx_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) { int ret = 0; uint64_t image_size = (uint64_t) 2 * GiB; @@ -1730,24 +1736,15 @@ static int vhdx_create(const char *filename, QEMUOptionParameter *options, gunichar2 *creator = NULL; glong creator_items; BlockDriverState *bs; - const char *type = NULL; + char *type = NULL; VHDXImageType image_type; Error *local_err = NULL; - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - image_size = options->value.n; - } else if (!strcmp(options->name, VHDX_BLOCK_OPT_LOG_SIZE)) { - log_size = options->value.n; - } else if (!strcmp(options->name, VHDX_BLOCK_OPT_BLOCK_SIZE)) { - block_size = options->value.n; - } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) { - type = options->value.s; - } else if (!strcmp(options->name, VHDX_BLOCK_OPT_ZERO)) { - use_zero_blocks = options->value.n != 0; - } - options++; - } + image_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); + log_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_LOG_SIZE, 0); + block_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_BLOCK_SIZE, 0); + type = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); + use_zero_blocks = qemu_opt_get_bool_del(opts, VHDX_BLOCK_OPT_ZERO, false); if (image_size > VHDX_MAX_IMAGE_SIZE) { error_setg_errno(errp, EINVAL, "Image size too large; max of 64TB"); @@ -1756,7 +1753,7 @@ static int vhdx_create(const char *filename, QEMUOptionParameter *options, } if (type == NULL) { - type = "dynamic"; + type = g_strdup("dynamic"); } if (!strcmp(type, "dynamic")) { @@ -1796,7 +1793,7 @@ static int vhdx_create(const char *filename, QEMUOptionParameter *options, block_size = block_size > VHDX_BLOCK_SIZE_MAX ? 
VHDX_BLOCK_SIZE_MAX : block_size; - ret = bdrv_create_file(filename, options, &local_err); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto exit; @@ -1856,6 +1853,7 @@ static int vhdx_create(const char *filename, QEMUOptionParameter *options, delete_and_exit: bdrv_unref(bs); exit: + g_free(type); g_free(creator); return ret; } @@ -1878,37 +1876,41 @@ static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result, return 0; } -static QEMUOptionParameter vhdx_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size; max of 64TB." - }, - { - .name = VHDX_BLOCK_OPT_LOG_SIZE, - .type = OPT_SIZE, - .value.n = 1 * MiB, - .help = "Log size; min 1MB." - }, - { - .name = VHDX_BLOCK_OPT_BLOCK_SIZE, - .type = OPT_SIZE, - .value.n = 0, - .help = "Block Size; min 1MB, max 256MB. " \ - "0 means auto-calculate based on image size." - }, - { - .name = BLOCK_OPT_SUBFMT, - .type = OPT_STRING, - .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "\ - "Default is 'dynamic'." - }, - { - .name = VHDX_BLOCK_OPT_ZERO, - .type = OPT_FLAG, - .help = "Force use of payload blocks of type 'ZERO'. Non-standard." - }, - { NULL } +static QemuOptsList vhdx_create_opts = { + .name = "vhdx-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(vhdx_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size; max of 64TB." + }, + { + .name = VHDX_BLOCK_OPT_LOG_SIZE, + .type = QEMU_OPT_SIZE, + .def_value_str = stringify(DEFAULT_LOG_SIZE), + .help = "Log size; min 1MB." + }, + { + .name = VHDX_BLOCK_OPT_BLOCK_SIZE, + .type = QEMU_OPT_SIZE, + .def_value_str = stringify(0), + .help = "Block Size; min 1MB, max 256MB. " \ + "0 means auto-calculate based on image size." + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = QEMU_OPT_STRING, + .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "\ + "Default is 'dynamic'." 
+ }, + { + .name = VHDX_BLOCK_OPT_ZERO, + .type = QEMU_OPT_BOOL, + .help = "Force use of payload blocks of type 'ZERO'. Non-standard." + }, + { NULL } + } }; static BlockDriver bdrv_vhdx = { @@ -1924,7 +1926,7 @@ static BlockDriver bdrv_vhdx = { .bdrv_get_info = vhdx_get_info, .bdrv_check = vhdx_check, - .create_options = vhdx_create_options, + .create_opts = &vhdx_create_opts, }; static void bdrv_vhdx_init(void) diff --git a/block/vhdx.h b/block/vhdx.h index 8103d4c44..5370010c5 100644 --- a/block/vhdx.h +++ b/block/vhdx.h @@ -23,6 +23,7 @@ #define GiB (MiB * 1024) #define TiB ((uint64_t) GiB * 1024) +#define DEFAULT_LOG_SIZE 1048576 /* 1MiB */ /* Structures and fields present in the VHDX file */ /* The header section has the following blocks, diff --git a/block/vmdk.c b/block/vmdk.c index b69988d16..0517bbaf9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -262,7 +262,7 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) p_name = strstr(desc, cid_str); if (p_name != NULL) { p_name += cid_str_size; - sscanf(p_name, "%x", &cid); + sscanf(p_name, "%" SCNx32, &cid); } return cid; @@ -290,7 +290,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) p_name = strstr(desc, "CID"); if (p_name != NULL) { p_name += sizeof("CID"); - snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid); + snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid); pstrcat(desc, sizeof(desc), tmp_desc); } @@ -640,7 +640,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, if (le32_to_cpu(header.version) > 3) { char buf[64]; - snprintf(buf, sizeof(buf), "VMDK version %d", + snprintf(buf, sizeof(buf), "VMDK version %" PRId32, le32_to_cpu(header.version)); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "vmdk", buf); @@ -671,8 +671,9 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, } if (bdrv_getlength(file) < le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) { - error_setg(errp, "File truncated, expecting at least %lld 
bytes", - le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE); + error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", + (int64_t)(le64_to_cpu(header.grain_offset) + * BDRV_SECTOR_SIZE)); return -EINVAL; } @@ -937,7 +938,7 @@ fail: } -static int vmdk_refresh_limits(BlockDriverState *bs) +static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVVmdkState *s = bs->opaque; int i; @@ -949,8 +950,6 @@ static int vmdk_refresh_limits(BlockDriverState *bs) s->extents[i].cluster_sectors); } } - - return 0; } static int get_whole_cluster(BlockDriverState *bs, @@ -1495,6 +1494,19 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, return ret; } +static int vmdk_write_compressed(BlockDriverState *bs, + int64_t sector_num, + const uint8_t *buf, + int nb_sectors) +{ + BDRVVmdkState *s = bs->opaque; + if (s->num_extents == 1 && s->extents[0].compressed) { + return vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + } else { + return -ENOTSUP; + } +} + static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, @@ -1515,17 +1527,17 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, static int vmdk_create_extent(const char *filename, int64_t filesize, bool flat, bool compress, bool zeroed_grain, - Error **errp) + QemuOpts *opts, Error **errp) { int ret, i; BlockDriverState *bs = NULL; VMDK4Header header; - Error *local_err; + Error *local_err = NULL; uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; uint32_t *gd_buf = NULL; int gd_buf_size; - ret = bdrv_create_file(filename, NULL, &local_err); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto exit; @@ -1681,17 +1693,16 @@ static int filename_decompose(const char *filename, char *path, char *prefix, return VMDK_OK; } -static int vmdk_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int 
vmdk_create(const char *filename, QemuOpts *opts, Error **errp) { int idx = 0; BlockDriverState *new_bs = NULL; - Error *local_err; + Error *local_err = NULL; char *desc = NULL; int64_t total_size = 0, filesize; - const char *adapter_type = NULL; - const char *backing_file = NULL; - const char *fmt = NULL; + char *adapter_type = NULL; + char *backing_file = NULL; + char *fmt = NULL; int flags = 0; int ret = 0; bool flat, split, compress; @@ -1707,8 +1718,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, const char desc_template[] = "# Disk DescriptorFile\n" "version=1\n" - "CID=%x\n" - "parentCID=%x\n" + "CID=%" PRIx32 "\n" + "parentCID=%" PRIx32 "\n" "createType=\"%s\"\n" "%s" "\n" @@ -1720,7 +1731,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, "\n" "ddb.virtualHWVersion = \"%d\"\n" "ddb.geometry.cylinders = \"%" PRId64 "\"\n" - "ddb.geometry.heads = \"%d\"\n" + "ddb.geometry.heads = \"%" PRIu32 "\"\n" "ddb.geometry.sectors = \"63\"\n" "ddb.adapterType = \"%s\"\n"; @@ -1731,24 +1742,19 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, goto exit; } /* Read out options */ - while (options && options->name) { - if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - total_size = options->value.n; - } else if (!strcmp(options->name, BLOCK_OPT_ADAPTER_TYPE)) { - adapter_type = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { - backing_file = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) { - flags |= options->value.n ? 
BLOCK_FLAG_COMPAT6 : 0; - } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) { - fmt = options->value.s; - } else if (!strcmp(options->name, BLOCK_OPT_ZEROED_GRAIN)) { - zeroed_grain |= options->value.n; - } - options++; + total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); + adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE); + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) { + flags |= BLOCK_FLAG_COMPAT6; } + fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) { + zeroed_grain = true; + } + if (!adapter_type) { - adapter_type = "ide"; + adapter_type = g_strdup("ide"); } else if (strcmp(adapter_type, "ide") && strcmp(adapter_type, "buslogic") && strcmp(adapter_type, "lsilogic") && @@ -1764,7 +1770,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, } if (!fmt) { /* Default format to monolithicSparse */ - fmt = "monolithicSparse"; + fmt = g_strdup("monolithicSparse"); } else if (strcmp(fmt, "monolithicFlat") && strcmp(fmt, "monolithicSparse") && strcmp(fmt, "twoGbMaxExtentSparse") && @@ -1780,9 +1786,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, strcmp(fmt, "twoGbMaxExtentFlat")); compress = !strcmp(fmt, "streamOptimized"); if (flat) { - desc_extent_line = "RW %lld FLAT \"%s\" 0\n"; + desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n"; } else { - desc_extent_line = "RW %lld SPARSE \"%s\"\n"; + desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n"; } if (flat && backing_file) { error_setg(errp, "Flat image can't have backing file"); @@ -1837,7 +1843,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, path, desc_filename); if (vmdk_create_extent(ext_filename, size, - flat, compress, zeroed_grain, errp)) { + flat, compress, zeroed_grain, opts, errp)) { ret = -EINVAL; goto exit; } @@ -1850,7 +1856,7 @@ static int 
vmdk_create(const char *filename, QEMUOptionParameter *options, } /* generate descriptor file */ desc = g_strdup_printf(desc_template, - (unsigned int)time(NULL), + (uint32_t)time(NULL), parent_cid, fmt, parent_desc_line, @@ -1865,9 +1871,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, if (!split && !flat) { desc_offset = 0x200; } else { - ret = bdrv_create_file(filename, options, &local_err); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { - error_setg_errno(errp, -ret, "Could not create image file"); + error_propagate(errp, local_err); goto exit; } } @@ -1875,7 +1881,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, ret = bdrv_open(&new_bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); if (ret < 0) { - error_setg_errno(errp, -ret, "Could not write description"); + error_propagate(errp, local_err); goto exit; } ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); @@ -1895,6 +1901,9 @@ exit: if (new_bs) { bdrv_unref(new_bs); } + g_free(adapter_type); + g_free(backing_file); + g_free(fmt); g_free(desc); g_string_free(ext_desc_lines, true); return ret; @@ -2062,41 +2071,88 @@ static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs) return spec_info; } -static QEMUOptionParameter vmdk_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_ADAPTER_TYPE, - .type = OPT_STRING, - .help = "Virtual adapter type, can be one of " - "ide (default), lsilogic, buslogic or legacyESX" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_COMPAT6, - .type = OPT_FLAG, - .help = "VMDK version 6 image" - }, - { - .name = BLOCK_OPT_SUBFMT, - .type = OPT_STRING, - .help = - "VMDK flat extent format, can be one of " - "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | 
streamOptimized} " - }, - { - .name = BLOCK_OPT_ZEROED_GRAIN, - .type = OPT_FLAG, - .help = "Enable efficient zero writes using the zeroed-grain GTE feature" - }, - { NULL } +static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + int i; + BDRVVmdkState *s = bs->opaque; + assert(s->num_extents); + bdi->needs_compressed_writes = s->extents[0].compressed; + if (!s->extents[0].flat) { + bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS; + } + /* See if we have multiple extents but they have different cases */ + for (i = 1; i < s->num_extents; i++) { + if (bdi->needs_compressed_writes != s->extents[i].compressed || + (bdi->cluster_size && bdi->cluster_size != + s->extents[i].cluster_sectors << BDRV_SECTOR_BITS)) { + return -ENOTSUP; + } + } + return 0; +} + +static void vmdk_detach_aio_context(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_extents; i++) { + bdrv_detach_aio_context(s->extents[i].file); + } +} + +static void vmdk_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVVmdkState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_extents; i++) { + bdrv_attach_aio_context(s->extents[i].file, new_context); + } +} + +static QemuOptsList vmdk_create_opts = { + .name = "vmdk-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_ADAPTER_TYPE, + .type = QEMU_OPT_STRING, + .help = "Virtual adapter type, can be one of " + "ide (default), lsilogic, buslogic or legacyESX" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_COMPAT6, + .type = QEMU_OPT_BOOL, + .help = "VMDK version 6 image", + .def_value_str = "off" + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = QEMU_OPT_STRING, + .help = + "VMDK flat extent format, can be one of " + 
"{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} " + }, + { + .name = BLOCK_OPT_ZEROED_GRAIN, + .type = QEMU_OPT_BOOL, + .help = "Enable efficient zero writes " + "using the zeroed-grain GTE feature" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_vmdk = { @@ -2108,6 +2164,7 @@ static BlockDriver bdrv_vmdk = { .bdrv_reopen_prepare = vmdk_reopen_prepare, .bdrv_read = vmdk_co_read, .bdrv_write = vmdk_co_write, + .bdrv_write_compressed = vmdk_write_compressed, .bdrv_co_write_zeroes = vmdk_co_write_zeroes, .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, @@ -2117,8 +2174,12 @@ static BlockDriver bdrv_vmdk = { .bdrv_has_zero_init = vmdk_has_zero_init, .bdrv_get_specific_info = vmdk_get_specific_info, .bdrv_refresh_limits = vmdk_refresh_limits, + .bdrv_get_info = vmdk_get_info, + .bdrv_detach_aio_context = vmdk_detach_aio_context, + .bdrv_attach_aio_context = vmdk_attach_aio_context, - .create_options = vmdk_create_options, + .supports_backing = true, + .create_opts = &vmdk_create_opts, }; static void bdrv_vmdk_init(void) diff --git a/block/vpc.c b/block/vpc.c index 2e25f5723..8b376a40b 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -29,6 +29,13 @@ #if defined(CONFIG_UUID) #include <uuid/uuid.h> #endif +#ifdef __linux__ +#include <linux/fs.h> +#include <sys/ioctl.h> +#ifndef FS_NOCOW_FL +#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#endif +#endif /**************************************************************/ @@ -738,12 +745,11 @@ static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size) return ret; } -static int vpc_create(const char *filename, QEMUOptionParameter *options, - Error **errp) +static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) { uint8_t buf[1024]; VHDFooter *footer = (VHDFooter *) buf; - QEMUOptionParameter *disk_type_param; + char *disk_type_param; int fd, i; uint16_t cyls = 0; uint8_t heads = 0; @@ -752,27 +758,45 @@ static int 
vpc_create(const char *filename, QEMUOptionParameter *options, int64_t total_size; int disk_type; int ret = -EIO; + bool nocow = false; /* Read out options */ - total_size = get_option_parameter(options, BLOCK_OPT_SIZE)->value.n; - - disk_type_param = get_option_parameter(options, BLOCK_OPT_SUBFMT); - if (disk_type_param && disk_type_param->value.s) { - if (!strcmp(disk_type_param->value.s, "dynamic")) { + total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); + disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); + if (disk_type_param) { + if (!strcmp(disk_type_param, "dynamic")) { disk_type = VHD_DYNAMIC; - } else if (!strcmp(disk_type_param->value.s, "fixed")) { + } else if (!strcmp(disk_type_param, "fixed")) { disk_type = VHD_FIXED; } else { - return -EINVAL; + ret = -EINVAL; + goto out; } } else { disk_type = VHD_DYNAMIC; } + nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false); /* Create the file */ fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); if (fd < 0) { - return -EIO; + ret = -EIO; + goto out; + } + + if (nocow) { +#ifdef __linux__ + /* Set NOCOW flag to solve performance issue on fs like btrfs. + * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will + * be ignored since any failure of this operation should not block the + * left work. 
+ */ + int attr; + if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { + attr |= FS_NOCOW_FL; + ioctl(fd, FS_IOC_SETFLAGS, &attr); + } +#endif } /* @@ -837,8 +861,10 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options, ret = create_fixed_disk(fd, buf, total_size); } - fail: +fail: qemu_close(fd); +out: + g_free(disk_type_param); return ret; } @@ -866,20 +892,29 @@ static void vpc_close(BlockDriverState *bs) error_free(s->migration_blocker); } -static QEMUOptionParameter vpc_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_SUBFMT, - .type = OPT_STRING, - .help = - "Type of virtual hard disk format. Supported formats are " - "{dynamic (default) | fixed} " - }, - { NULL } +static QemuOptsList vpc_create_opts = { + .name = "vpc-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = QEMU_OPT_STRING, + .help = + "Type of virtual hard disk format. 
Supported formats are " + "{dynamic (default) | fixed} " + }, + { + .name = BLOCK_OPT_NOCOW, + .type = QEMU_OPT_BOOL, + .help = "Turn off copy-on-write (valid only on btrfs)" + }, + { /* end of list */ } + } }; static BlockDriver bdrv_vpc = { @@ -897,7 +932,7 @@ static BlockDriver bdrv_vpc = { .bdrv_get_info = vpc_get_info, - .create_options = vpc_create_options, + .create_opts = &vpc_create_opts, .bdrv_has_zero_init = vpc_has_zero_init, }; diff --git a/block/vvfat.c b/block/vvfat.c index 1978c9ed6..70176b161 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -787,7 +787,9 @@ static int read_directory(BDRVVVFATState* s, int mapping_index) s->current_mapping->path=buffer; s->current_mapping->read_only = (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0; - } + } else { + g_free(buffer); + } } closedir(dir); @@ -831,7 +833,8 @@ static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num) } static int init_directories(BDRVVVFATState* s, - const char *dirname, int heads, int secs) + const char *dirname, int heads, int secs, + Error **errp) { bootsector_t* bootsector; mapping_t* mapping; @@ -892,8 +895,8 @@ static int init_directories(BDRVVVFATState* s, if (mapping->mode & MODE_DIRECTORY) { mapping->begin = cluster; if(read_directory(s, i)) { - fprintf(stderr, "Could not read directory %s\n", - mapping->path); + error_setg(errp, "Could not read directory %s", + mapping->path); return -1; } mapping = array_get(&(s->mapping), i); @@ -919,9 +922,10 @@ static int init_directories(BDRVVVFATState* s, cluster = mapping->end; if(cluster > s->cluster_count) { - fprintf(stderr,"Directory does not fit in FAT%d (capacity %.2f MB)\n", - s->fat_type, s->sector_count / 2000.0); - return -EINVAL; + error_setg(errp, + "Directory does not fit in FAT%d (capacity %.2f MB)", + s->fat_type, s->sector_count / 2000.0); + return -1; } /* fix fat for entry */ @@ -979,7 +983,7 @@ static int init_directories(BDRVVVFATState* s, static BDRVVVFATState *vvv = NULL; #endif -static int 
enable_write_target(BDRVVVFATState *s); +static int enable_write_target(BDRVVVFATState *s, Error **errp); static int is_consistent(BDRVVVFATState *s); static void vvfat_rebind(BlockDriverState *bs) @@ -1160,7 +1164,7 @@ DLOG(if (stderr == NULL) { s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); if (qemu_opt_get_bool(opts, "rw", false)) { - ret = enable_write_target(s); + ret = enable_write_target(s, errp); if (ret < 0) { goto fail; } @@ -1169,7 +1173,7 @@ DLOG(if (stderr == NULL) { bs->total_sectors = cyls * heads * secs; - if (init_directories(s, dirname, heads, secs)) { + if (init_directories(s, dirname, heads, secs, errp)) { ret = -EIO; goto fail; } @@ -1864,7 +1868,7 @@ static int check_directory_consistency(BDRVVVFATState *s, if (s->used_clusters[cluster_num] & USED_ANY) { fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num); - return 0; + goto fail; } s->used_clusters[cluster_num] = USED_DIRECTORY; @@ -2904,11 +2908,10 @@ static BlockDriver vvfat_write_target = { .bdrv_close = write_target_close, }; -static int enable_write_target(BDRVVVFATState *s) +static int enable_write_target(BDRVVVFATState *s, Error **errp) { - BlockDriver *bdrv_qcow; - QEMUOptionParameter *options; - Error *local_err = NULL; + BlockDriver *bdrv_qcow = NULL; + QemuOpts *opts = NULL; int ret; int size = sector2cluster(s, s->sector_count); s->used_clusters = calloc(size, 1); @@ -2918,28 +2921,26 @@ static int enable_write_target(BDRVVVFATState *s) s->qcow_filename = g_malloc(1024); ret = get_tmp_filename(s->qcow_filename, 1024); if (ret < 0) { + error_setg_errno(errp, -ret, "can't create temporary file"); goto err; } bdrv_qcow = bdrv_find_format("qcow"); - options = parse_option_parameters("", bdrv_qcow->create_options, NULL); - set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512); - set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:"); + opts = qemu_opts_create(bdrv_qcow->create_opts, NULL, 0, &error_abort); + 
qemu_opt_set_number(opts, BLOCK_OPT_SIZE, s->sector_count * 512); + qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, "fat:"); - ret = bdrv_create(bdrv_qcow, s->qcow_filename, options, &local_err); + ret = bdrv_create(bdrv_qcow, s->qcow_filename, opts, errp); + qemu_opts_del(opts); if (ret < 0) { - qerror_report_err(local_err); - error_free(local_err); goto err; } s->qcow = NULL; ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow, - &local_err); + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, + bdrv_qcow, errp); if (ret < 0) { - qerror_report_err(local_err); - error_free(local_err); goto err; } @@ -2947,7 +2948,7 @@ static int enable_write_target(BDRVVVFATState *s) unlink(s->qcow_filename); #endif - s->bs->backing_hd = bdrv_new(""); + bdrv_set_backing_hd(s->bs, bdrv_new("", &error_abort)); s->bs->backing_hd->drv = &vvfat_write_target; s->bs->backing_hd->opaque = g_malloc(sizeof(void*)); *(void**)s->bs->backing_hd->opaque = s; diff --git a/block/win32-aio.c b/block/win32-aio.c index 5d1d199b6..8e417f70a 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -40,6 +40,7 @@ struct QEMUWin32AIOState { HANDLE hIOCP; EventNotifier e; int count; + bool is_aio_context_attached; }; typedef struct QEMUWin32AIOCB { @@ -114,7 +115,7 @@ static void win32_aio_cancel(BlockDriverAIOCB *blockacb) * wait for completion. 
*/ while (!HasOverlappedIoCompleted(&waiocb->ov)) { - qemu_aio_wait(); + aio_poll(bdrv_get_aio_context(blockacb->bs), true); } } @@ -180,6 +181,20 @@ int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile) } } +void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, + AioContext *old_context) +{ + aio_set_event_notifier(old_context, &aio->e, NULL); + aio->is_aio_context_attached = false; +} + +void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, + AioContext *new_context) +{ + aio->is_aio_context_attached = true; + aio_set_event_notifier(new_context, &aio->e, win32_aio_completion_cb); +} + QEMUWin32AIOState *win32_aio_init(void) { QEMUWin32AIOState *s; @@ -194,8 +209,6 @@ QEMUWin32AIOState *win32_aio_init(void) goto out_close_efd; } - qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb); - return s; out_close_efd: @@ -204,3 +217,11 @@ out_free_state: g_free(s); return NULL; } + +void win32_aio_cleanup(QEMUWin32AIOState *aio) +{ + assert(!aio->is_aio_context_attached); + CloseHandle(aio->hIOCP); + event_notifier_cleanup(&aio->e); + g_free(aio); +} |