40 files changed, 1329 insertions, 343 deletions
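Before the per-file hunks, a minimal usage sketch (not part of the patch) of the disk I/O throttling API this series introduces. BlockIOLimit, BLOCK_IO_LIMIT_TOTAL and bdrv_set_io_limits() all appear in the block_int.h and block.c hunks below; the limit values here are invented for illustration, and note that drive_init() rejects setting a total limit together with a rd/wr limit of the same kind.

/* Hedged sketch, assuming a valid BlockDriverState *bs: cap one drive at
 * 10 MB/s total bandwidth and 200 total IOPS. bdrv_set_io_limits() copies
 * the limits and sets bs->io_limits_enabled iff any of the six values is
 * non-zero; bdrv_open() later calls bdrv_io_limits_enable() to create the
 * throttle timer and request queue.
 */
BlockIOLimit limits = {
    .bps  = { [BLOCK_IO_LIMIT_TOTAL] = 10 * 1024 * 1024 }, /* bytes/s */
    .iops = { [BLOCK_IO_LIMIT_TOTAL] = 200 },              /* ops/s   */
};
bdrv_set_io_limits(bs, &limits);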
diff --git a/block-migration.c b/block-migration.c index 5f10486416..423c5a07b6 100644 --- a/block-migration.c +++ b/block-migration.c @@ -387,7 +387,7 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f, for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) { if (bmds_aio_inflight(bmds, sector)) { - qemu_aio_flush(); + bdrv_drain_all(); } if (bdrv_get_dirty(bmds->bs, sector)) { @@ -30,6 +30,7 @@ #include "qjson.h" #include "qemu-coroutine.h" #include "qmp-commands.h" +#include "qemu-timer.h" #ifdef CONFIG_BSD #include <sys/types.h> @@ -73,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, int64_t *wait); + static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -105,6 +113,79 @@ int is_windows_drive(const char *filename) } #endif +/* throttling disk I/O limits */ +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + + while (qemu_co_queue_next(&bs->throttled_reqs)); + + if (bs->block_timer) { + qemu_del_timer(bs->block_timer); + qemu_free_timer(bs->block_timer); + bs->block_timer = NULL; + } + + bs->slice_start = 0; + bs->slice_end = 0; + bs->slice_time = 0; + memset(&bs->io_base, 0, sizeof(bs->io_base)); +} + +static void bdrv_block_timer(void *opaque) +{ + BlockDriverState *bs = opaque; + + qemu_co_queue_next(&bs->throttled_reqs); +} + +void bdrv_io_limits_enable(BlockDriverState *bs) +{ + qemu_co_queue_init(&bs->throttled_reqs); + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); + bs->slice_time = 5 * BLOCK_IO_SLICE_TIME; + bs->slice_start = qemu_get_clock_ns(vm_clock); + bs->slice_end = bs->slice_start + bs->slice_time; + memset(&bs->io_base, 0, sizeof(bs->io_base)); + bs->io_limits_enabled = true; +} + +bool bdrv_io_limits_enabled(BlockDriverState *bs) +{ + BlockIOLimit *io_limits = &bs->io_limits; + return io_limits->bps[BLOCK_IO_LIMIT_READ] + || io_limits->bps[BLOCK_IO_LIMIT_WRITE] + || io_limits->bps[BLOCK_IO_LIMIT_TOTAL] + || io_limits->iops[BLOCK_IO_LIMIT_READ] + || io_limits->iops[BLOCK_IO_LIMIT_WRITE] + || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]; +} + +static void bdrv_io_limits_intercept(BlockDriverState *bs, + bool is_write, int nb_sectors) +{ + int64_t wait_time = -1; + + if (!qemu_co_queue_empty(&bs->throttled_reqs)) { + qemu_co_queue_wait(&bs->throttled_reqs); + } + + /* In fact, we hope to keep each request's timing, in FIFO mode. The next + * throttled requests will not be dequeued until the current request is + * allowed to be serviced. So if the current request still exceeds the + * limits, it will be inserted to the head. All requests followed it will + * be still in throttled_reqs queue. 
+ */ + + while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) { + qemu_mod_timer(bs->block_timer, + wait_time + qemu_get_clock_ns(vm_clock)); + qemu_co_queue_wait_insert_head(&bs->throttled_reqs); + } + + qemu_co_queue_next(&bs->throttled_reqs); +} + /* check if the path starts with "<protocol>:" */ static int path_has_protocol(const char *path) { @@ -457,6 +538,22 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) return 0; } +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. + */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + /* * Common part for opening disk images and files */ @@ -478,6 +575,11 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename, bs->growable = 0; bs->buffer_alignment = 512; + assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */ + if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) { + bdrv_enable_copy_on_read(bs); + } + pstrcpy(bs->filename, sizeof(bs->filename), filename); bs->backing_file[0] = '\0'; @@ -687,6 +789,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, bdrv_dev_change_media_cb(bs, true); } + /* throttling disk I/O limits */ + if (bs->io_limits_enabled) { + bdrv_io_limits_enable(bs); + } + return 0; unlink_and_fail: @@ -715,6 +822,7 @@ void bdrv_close(BlockDriverState *bs) #endif bs->opaque = NULL; bs->drv = NULL; + bs->copy_on_read = 0; if (bs->file != NULL) { bdrv_close(bs->file); @@ -722,6 +830,11 @@ void bdrv_close(BlockDriverState *bs) bdrv_dev_change_media_cb(bs, false); } + + /*throttling disk I/O limits*/ + if (bs->io_limits_enabled) { + bdrv_io_limits_disable(bs); + } } void bdrv_close_all(void) @@ -733,6 +846,25 @@ void bdrv_close_all(void) } } +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. + */ +void bdrv_drain_all(void) +{ + BlockDriverState *bs; + + qemu_aio_flush(); + + /* If requests are still pending there is a bug somewhere */ + QTAILQ_FOREACH(bs, &bdrv_states, list) { + assert(QLIST_EMPTY(&bs->tracked_requests)); + assert(qemu_co_queue_empty(&bs->throttled_reqs)); + } +} + /* make a BlockDriverState anonymous by removing from bdrv_state list. Also, NULL terminate the device_name to prevent double remove */ void bdrv_make_anon(BlockDriverState *bs) @@ -922,7 +1054,7 @@ int bdrv_commit(BlockDriverState *bs) buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); for (sector = 0; sector < total_sectors; sector += n) { - if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) { + if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) { if (bdrv_read(bs, sector, buf, n) != 0) { ret = -EIO; @@ -980,6 +1112,118 @@ void bdrv_commit_all(void) } } +struct BdrvTrackedRequest { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + bool is_write; + QLIST_ENTRY(BdrvTrackedRequest) list; + Coroutine *co; /* owner, used for deadlock detection */ + CoQueue wait_queue; /* coroutines blocked on this request */ +}; + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. 
+ */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .is_write = is_write, + .co = qemu_coroutine_self(), + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +/** + * Round a region to cluster boundaries + */ +static void round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t sector_num, int nb_sectors) { + /* aaaa bbbb */ + if (sector_num >= req->sector_num + req->nb_sectors) { + return false; + } + /* bbbb aaaa */ + if (req->sector_num >= sector_num + nb_sectors) { + return false; + } + return true; +} + +static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + BdrvTrackedRequest *req; + int64_t cluster_sector_num; + int cluster_nb_sectors; + bool retry; + + /* If we touch the same cluster it counts as an overlap. This guarantees + * that allocating writes will be serialized and not race with each other + * for the same cluster. For example, in copy-on-read it ensures that the + * CoR read and write operations are atomic and guest writes cannot + * interleave between them. + */ + round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (tracked_request_overlaps(req, cluster_sector_num, + cluster_nb_sectors)) { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + qemu_co_queue_wait(&req->wait_queue); + retry = true; + break; + } + } + } while (retry); +} + /* * Return values: * 0 - success @@ -1252,6 +1496,61 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } +static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. 
+ */ + round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + /* * Handle a read request in coroutine context */ @@ -1259,6 +1558,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + int ret; if (!drv) { return -ENOMEDIUM; @@ -1267,7 +1568,36 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, return -EIO; } - return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + /* throttling disk read I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, false, nb_sectors); + } + + if (bs->copy_on_read) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } + + tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + + if (bs->copy_on_read) { + int pnum; + + ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + +out: + tracked_request_end(&req); + return ret; } int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, @@ -1285,6 +1615,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; int ret; if (!bs->drv) { @@ -1297,6 +1628,17 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, return -EIO; } + /* throttling disk write I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, true, nb_sectors); + } + + if (bs->copy_on_read) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } + + tracked_request_begin(&req, bs, sector_num, nb_sectors, true); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); if (bs->dirty_bitmap) { @@ -1307,6 +1649,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, bs->wr_highest_sector = sector_num + nb_sectors - 1; } + tracked_request_end(&req); + return ret; } @@ -1526,6 +1870,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs, *psecs = bs->secs; } +/* throttling disk io limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits) +{ + bs->io_limits = *io_limits; + bs->io_limits_enabled = bdrv_io_limits_enabled(bs); +} + /* Recognize floppy formats */ typedef struct FDFormat { FDriveType drive; @@ -1778,31 +2130,87 @@ int bdrv_has_zero_init(BlockDriverState *bs) return 1; } 
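A worked example (assumed numbers, not from the patch) of the round_to_clusters() arithmetic above, which both the copy-on-read bounce buffer and overlapping-request serialization rely on:

/* Assume bdi.cluster_size = 64 KiB, so c = 65536 / BDRV_SECTOR_SIZE = 128
 * sectors per cluster. For a guest read of sectors [130, 135)
 * (sector_num = 130, nb_sectors = 5):
 *
 *   cluster_sector_num = QEMU_ALIGN_DOWN(130, 128)         = 128
 *   cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 5, 128) = 128
 *
 * The CoR bounce buffer therefore covers the whole cluster [128, 256),
 * and any request touching that cluster blocks in
 * wait_for_overlapping_requests() until the copy-on-read read+write
 * pair has completed, keeping the pair atomic with respect to guest I/O.
 */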
+typedef struct BdrvCoIsAllocatedData { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int *pnum; + int ret; + bool done; +} BdrvCoIsAllocatedData; + /* * Returns true iff the specified sector is present in the disk image. Drivers * not implementing the functionality are assumed to not support backing files, * hence all their sectors are reported as allocated. * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * * 'pnum' is set to the number of sectors (including and immediately following * the specified sector) that are known to be in the same * allocated/unallocated state. * - * 'nb_sectors' is the max value 'pnum' should be set to. + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. */ -int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum) +int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) { int64_t n; - if (!bs->drv->bdrv_is_allocated) { - if (sector_num >= bs->total_sectors) { - *pnum = 0; - return 0; - } - n = bs->total_sectors - sector_num; - *pnum = (n < nb_sectors) ? (n) : (nb_sectors); + + if (sector_num >= bs->total_sectors) { + *pnum = 0; + return 0; + } + + n = bs->total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_is_allocated) { + *pnum = nb_sectors; return 1; } - return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum); + + return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum); +} + +/* Coroutine wrapper for bdrv_is_allocated() */ +static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) +{ + BdrvCoIsAllocatedData *data = opaque; + BlockDriverState *bs = data->bs; + + data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_is_allocated(). + * + * See bdrv_co_is_allocated() for details. 
+ */ +int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) +{ + Coroutine *co; + BdrvCoIsAllocatedData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + co = qemu_coroutine_create(bdrv_is_allocated_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + qemu_aio_wait(); + } + return data.ret; } void bdrv_mon_event(const BlockDriverState *bdrv, @@ -1869,6 +2277,21 @@ BlockInfoList *qmp_query_block(Error **errp) info->value->inserted->has_backing_file = true; info->value->inserted->backing_file = g_strdup(bs->backing_file); } + + if (bs->io_limits_enabled) { + info->value->inserted->bps = + bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + info->value->inserted->bps_rd = + bs->io_limits.bps[BLOCK_IO_LIMIT_READ]; + info->value->inserted->bps_wr = + bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]; + info->value->inserted->iops = + bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + info->value->inserted->iops_rd = + bs->io_limits.iops[BLOCK_IO_LIMIT_READ]; + info->value->inserted->iops_wr = + bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]; + } } /* XXX: waiting for the qapi to support GSList */ @@ -2480,6 +2903,170 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb) acb->pool->cancel(acb); } +/* block I/O throttling */ +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait) +{ + uint64_t bps_limit = 0; + double bytes_limit, bytes_base, bytes_res; + double slice_time, wait_time; + + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.bps[is_write]) { + bps_limit = bs->io_limits.bps[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end - bs->slice_start; + slice_time /= (NANOSECONDS_PER_SECOND); + bytes_limit = bps_limit * slice_time; + bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write]; + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write]; + } + + /* bytes_base: the bytes of data which have been read/written; and + * it is obtained from the history statistic info. + * bytes_res: the remaining bytes of data which need to be read/written. + * (bytes_base + bytes_res) / bps_limit: used to calcuate + * the total time for completing reading/writting all data. + */ + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; + + if (bytes_base + bytes_res <= bytes_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time; + + /* When the I/O rate at runtime exceeds the limits, + * bs->slice_end need to be extended in order that the current statistic + * info can be kept until the timer fire, so it is increased and tuned + * based on the result of experiment. 
+ */ + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10; + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME; + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; +} + +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait) +{ + uint64_t iops_limit = 0; + double ios_limit, ios_base; + double slice_time, wait_time; + + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.iops[is_write]) { + iops_limit = bs->io_limits.iops[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end - bs->slice_start; + slice_time /= (NANOSECONDS_PER_SECOND); + ios_limit = iops_limit * slice_time; + ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write]; + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write]; + } + + if (ios_base + 1 <= ios_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (ios_base + 1) / iops_limit; + if (wait_time > elapsed_time) { + wait_time = wait_time - elapsed_time; + } else { + wait_time = 0; + } + + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10; + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME; + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; +} + +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, int64_t *wait) +{ + int64_t now, max_wait; + uint64_t bps_wait = 0, iops_wait = 0; + double elapsed_time; + int bps_ret, iops_ret; + + now = qemu_get_clock_ns(vm_clock); + if ((bs->slice_start < now) + && (bs->slice_end > now)) { + bs->slice_end = now + bs->slice_time; + } else { + bs->slice_time = 5 * BLOCK_IO_SLICE_TIME; + bs->slice_start = now; + bs->slice_end = now + bs->slice_time; + + bs->io_base.bytes[is_write] = bs->nr_bytes[is_write]; + bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write]; + + bs->io_base.ios[is_write] = bs->nr_ops[is_write]; + bs->io_base.ios[!is_write] = bs->nr_ops[!is_write]; + } + + elapsed_time = now - bs->slice_start; + elapsed_time /= (NANOSECONDS_PER_SECOND); + + bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, + is_write, elapsed_time, &bps_wait); + iops_ret = bdrv_exceed_iops_limits(bs, is_write, + elapsed_time, &iops_wait); + if (bps_ret || iops_ret) { + max_wait = bps_wait > iops_wait ? 
bps_wait : iops_wait; + if (wait) { + *wait = max_wait; + } + + now = qemu_get_clock_ns(vm_clock); + if (bs->slice_end < now + max_wait) { + bs->slice_end = now + max_wait; + } + + return true; + } + + if (wait) { + *wait = 0; + } + + return false; +} /**************************************************************/ /* async block device emulation */ @@ -70,6 +70,7 @@ typedef struct BlockDevOps { #define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the thread pool */ #define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */ #define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */ +#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH) @@ -98,6 +99,11 @@ void bdrv_info(Monitor *mon, QObject **ret_data); void bdrv_stats_print(Monitor *mon, const QObject *data); void bdrv_info_stats(Monitor *mon, QObject **ret_data); +/* disk I/O throttling */ +void bdrv_io_limits_enable(BlockDriverState *bs); +void bdrv_io_limits_disable(BlockDriverState *bs); +bool bdrv_io_limits_enabled(BlockDriverState *bs); + void bdrv_init(void); void bdrv_init_with_whitelist(void); BlockDriver *bdrv_find_protocol(const char *filename); @@ -138,6 +144,8 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); +int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum); int bdrv_truncate(BlockDriverState *bs, int64_t offset); int64_t bdrv_getlength(BlockDriverState *bs); int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); @@ -206,6 +214,7 @@ int bdrv_flush(BlockDriverState *bs); int coroutine_fn bdrv_co_flush(BlockDriverState *bs); void bdrv_flush_all(void); void bdrv_close_all(void); +void bdrv_drain_all(void); int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); @@ -305,6 +314,9 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); int64_t bdrv_get_dirty_count(BlockDriverState *bs); +void bdrv_enable_copy_on_read(BlockDriverState *bs); +void bdrv_disable_copy_on_read(BlockDriverState *bs); + void bdrv_set_in_use(BlockDriverState *bs, int in_use); int bdrv_in_use(BlockDriverState *bs); diff --git a/block/cow.c b/block/cow.c index 3448296190..3c527358c6 100644 --- a/block/cow.c +++ b/block/cow.c @@ -132,8 +132,8 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) /* Return true if first block has been changed (ie. current version is * in COW file). Set the number of continuous blocks for which that * is true. 
*/ -static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *num_same) +static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *num_same) { int changed; @@ -171,14 +171,14 @@ static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num, return error; } -static int cow_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) { BDRVCowState *s = bs->opaque; int ret, n; while (nb_sectors > 0) { - if (cow_is_allocated(bs, sector_num, nb_sectors, &n)) { + if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) { ret = bdrv_pread(bs->file, s->cow_sectors_offset + sector_num * 512, buf, n * 512); @@ -243,12 +243,12 @@ static void cow_close(BlockDriverState *bs) static int cow_create(const char *filename, QEMUOptionParameter *options) { - int fd, cow_fd; struct cow_header_v2 cow_header; struct stat st; int64_t image_sectors = 0; const char *image_filename = NULL; int ret; + BlockDriverState *cow_bs; /* Read out options */ while (options && options->name) { @@ -260,10 +260,16 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) options++; } - cow_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, - 0644); - if (cow_fd < 0) - return -errno; + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&cow_bs, filename, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + memset(&cow_header, 0, sizeof(cow_header)); cow_header.magic = cpu_to_be32(COW_MAGIC); cow_header.version = cpu_to_be32(COW_VERSION); @@ -271,16 +277,9 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) /* Note: if no file, we put a dummy mtime */ cow_header.mtime = cpu_to_be32(0); - fd = open(image_filename, O_RDONLY | O_BINARY); - if (fd < 0) { - close(cow_fd); - goto mtime_fail; - } - if (fstat(fd, &st) != 0) { - close(fd); + if (stat(image_filename, &st) != 0) { goto mtime_fail; } - close(fd); cow_header.mtime = cpu_to_be32(st.st_mtime); mtime_fail: pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file), @@ -288,21 +287,20 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) } cow_header.sectorsize = cpu_to_be32(512); cow_header.size = cpu_to_be64(image_sectors * 512); - ret = qemu_write_full(cow_fd, &cow_header, sizeof(cow_header)); + ret = bdrv_pwrite(cow_bs, 0, &cow_header, sizeof(cow_header)); if (ret != sizeof(cow_header)) { - ret = -errno; goto exit; } /* resize to include at least all the bitmap */ - ret = ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3)); + ret = bdrv_truncate(cow_bs, + sizeof(cow_header) + ((image_sectors + 7) >> 3)); if (ret) { - ret = -errno; goto exit; } exit: - close(cow_fd); + bdrv_delete(cow_bs); return ret; } @@ -337,7 +335,7 @@ static BlockDriver bdrv_cow = { .bdrv_read = cow_co_read, .bdrv_write = cow_co_write, .bdrv_co_flush_to_disk = cow_co_flush, - .bdrv_is_allocated = cow_is_allocated, + .bdrv_co_is_allocated = cow_co_is_allocated, .create_options = cow_create_options, }; diff --git a/block/qcow.c b/block/qcow.c index 4814ed0ced..b16955d764 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -368,14 +368,16 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, return cluster_offset; } -static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn 
qcow_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) { BDRVQcowState *s = bs->opaque; int index_in_cluster, n; uint64_t cluster_offset; + qemu_co_mutex_lock(&s->lock); cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + qemu_co_mutex_unlock(&s->lock); index_in_cluster = sector_num & (s->cluster_sectors - 1); n = s->cluster_sectors - index_in_cluster; if (n > nb_sectors) @@ -433,7 +435,7 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) return 0; } -static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; @@ -531,7 +533,7 @@ fail: goto done; } -static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; @@ -844,7 +846,7 @@ static BlockDriver bdrv_qcow = { .bdrv_co_readv = qcow_co_readv, .bdrv_co_writev = qcow_co_writev, .bdrv_co_flush_to_disk = qcow_co_flush, - .bdrv_is_allocated = qcow_is_allocated, + .bdrv_co_is_allocated = qcow_co_is_allocated, .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index f4e049fa90..07a2e936fd 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -289,89 +289,62 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, } } - -static int qcow2_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn copy_sectors(BlockDriverState *bs, + uint64_t start_sect, + uint64_t cluster_offset, + int n_start, int n_end) { BDRVQcowState *s = bs->opaque; - int ret, index_in_cluster, n, n1; - uint64_t cluster_offset; - struct iovec iov; QEMUIOVector qiov; + struct iovec iov; + int n, ret; - while (nb_sectors > 0) { - n = nb_sectors; - - ret = qcow2_get_cluster_offset(bs, sector_num << 9, &n, - &cluster_offset); - if (ret < 0) { - return ret; - } - - index_in_cluster = sector_num & (s->cluster_sectors - 1); - if (!cluster_offset) { - if (bs->backing_hd) { - /* read from the base image */ - iov.iov_base = buf; - iov.iov_len = n * 512; - qemu_iovec_init_external(&qiov, &iov, 1); - - n1 = qcow2_backing_read1(bs->backing_hd, &qiov, sector_num, n); - if (n1 > 0) { - BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING); - ret = bdrv_read(bs->backing_hd, sector_num, buf, n1); - if (ret < 0) - return -1; - } - } else { - memset(buf, 0, 512 * n); - } - } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { - if (qcow2_decompress_cluster(bs, cluster_offset) < 0) - return -1; - memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n); - } else { - BLKDBG_EVENT(bs->file, BLKDBG_READ); - ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512); - if (ret != n * 512) - return -1; - if (s->crypt_method) { - qcow2_encrypt_sectors(s, sector_num, buf, buf, n, 0, - &s->aes_decrypt_key); - } - } - nb_sectors -= n; - sector_num += n; - buf += n * 512; + /* + * If this is the last cluster and it is only partially used, we must only + * copy until the end of the image, or bdrv_check_request will fail for the + * bdrv_read/write calls below. 
+ */ + if (start_sect + n_end > bs->total_sectors) { + n_end = bs->total_sectors - start_sect; } - return 0; -} - -static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, - uint64_t cluster_offset, int n_start, int n_end) -{ - BDRVQcowState *s = bs->opaque; - int n, ret; n = n_end - n_start; - if (n <= 0) + if (n <= 0) { return 0; + } + + iov.iov_len = n * BDRV_SECTOR_SIZE; + iov.iov_base = qemu_blockalign(bs, iov.iov_len); + + qemu_iovec_init_external(&qiov, &iov, 1); + BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); - ret = qcow2_read(bs, start_sect + n_start, s->cluster_data, n); - if (ret < 0) - return ret; + + /* Call .bdrv_co_readv() directly instead of using the public block-layer + * interface. This avoids double I/O throttling and request tracking, + * which can lead to deadlock when block layer copy-on-read is enabled. + */ + ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); + if (ret < 0) { + goto out; + } + if (s->crypt_method) { qcow2_encrypt_sectors(s, start_sect + n_start, - s->cluster_data, - s->cluster_data, n, 1, + iov.iov_base, iov.iov_base, n, 1, &s->aes_encrypt_key); } + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); - ret = bdrv_write(bs->file, (cluster_offset >> 9) + n_start, - s->cluster_data, n); - if (ret < 0) - return ret; - return 0; + ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); + if (ret < 0) { + goto out; + } + + ret = 0; +out: + qemu_vfree(iov.iov_base); + return ret; } @@ -620,7 +593,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9; if (m->n_start) { cow = true; + qemu_co_mutex_unlock(&s->lock); ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start); + qemu_co_mutex_lock(&s->lock); if (ret < 0) goto err; } @@ -628,8 +603,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) if (m->nb_available & (s->cluster_sectors - 1)) { uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1); cow = true; + qemu_co_mutex_unlock(&s->lock); ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9), m->nb_available - end, s->cluster_sectors); + qemu_co_mutex_lock(&s->lock); if (ret < 0) goto err; } diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 9605367777..2db2ede3d1 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -700,6 +700,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, l2_table = NULL; l1_table = NULL; l1_size2 = l1_size * sizeof(uint64_t); + + /* WARNING: qcow2_snapshot_goto relies on this function not using the + * l1_table_offset when it is the current s->l1_table_offset! Be careful + * when changing this! 
*/ if (l1_table_offset != s->l1_table_offset) { if (l1_size2 != 0) { l1_table = g_malloc0(align_offset(l1_size2, 512)); @@ -819,7 +823,8 @@ fail: qcow2_cache_set_writethrough(bs, s->refcount_block_cache, old_refcount_writethrough); - if (l1_modified) { + /* Update L1 only if it isn't deleted anyway (addend = -1) */ + if (addend >= 0 && l1_modified) { for(i = 0; i < l1_size; i++) cpu_to_be64s(&l1_table[i]); if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index bdc33ba94c..c3112bf71a 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -68,6 +68,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) int i, id_str_size, name_size; int64_t offset; uint32_t extra_data_size; + int ret; if (!s->nb_snapshots) { s->snapshots = NULL; @@ -77,10 +78,15 @@ int qcow2_read_snapshots(BlockDriverState *bs) offset = s->snapshots_offset; s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot)); + for(i = 0; i < s->nb_snapshots; i++) { + /* Read statically sized part of the snapshot header */ offset = align_offset(offset, 8); - if (bdrv_pread(bs->file, offset, &h, sizeof(h)) != sizeof(h)) + ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { goto fail; + } + offset += sizeof(h); sn = s->snapshots + i; sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); @@ -94,25 +100,34 @@ int qcow2_read_snapshots(BlockDriverState *bs) id_str_size = be16_to_cpu(h.id_str_size); name_size = be16_to_cpu(h.name_size); + /* Skip extra data */ offset += extra_data_size; + /* Read snapshot ID */ sn->id_str = g_malloc(id_str_size + 1); - if (bdrv_pread(bs->file, offset, sn->id_str, id_str_size) != id_str_size) + ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { goto fail; + } offset += id_str_size; sn->id_str[id_str_size] = '\0'; + /* Read snapshot name */ sn->name = g_malloc(name_size + 1); - if (bdrv_pread(bs->file, offset, sn->name, name_size) != name_size) + ret = bdrv_pread(bs->file, offset, sn->name, name_size); + if (ret < 0) { goto fail; + } offset += name_size; sn->name[name_size] = '\0'; } + s->snapshots_size = offset - s->snapshots_offset; return 0; - fail: + +fail: qcow2_free_snapshots(bs); - return -1; + return ret; } /* add at the end of the file a new list of snapshots */ @@ -122,9 +137,12 @@ static int qcow2_write_snapshots(BlockDriverState *bs) QCowSnapshot *sn; QCowSnapshotHeader h; int i, name_size, id_str_size, snapshots_size; - uint64_t data64; - uint32_t data32; + struct { + uint32_t nb_snapshots; + uint64_t snapshots_offset; + } QEMU_PACKED header_data; int64_t offset, snapshots_offset; + int ret; /* compute the size of the snapshots */ offset = 0; @@ -137,6 +155,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) } snapshots_size = offset; + /* Allocate space for the new snapshot list */ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); bdrv_flush(bs->file); offset = snapshots_offset; @@ -144,6 +163,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) return offset; } + /* Write all snapshots to the new list */ for(i = 0; i < s->nb_snapshots; i++) { sn = s->snapshots + i; memset(&h, 0, sizeof(h)); @@ -159,34 +179,55 @@ static int qcow2_write_snapshots(BlockDriverState *bs) h.id_str_size = cpu_to_be16(id_str_size); h.name_size = cpu_to_be16(name_size); offset = align_offset(offset, 8); - if (bdrv_pwrite_sync(bs->file, offset, &h, sizeof(h)) < 0) + + ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { goto fail; + } offset += 
sizeof(h); - if (bdrv_pwrite_sync(bs->file, offset, sn->id_str, id_str_size) < 0) + + ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { goto fail; + } offset += id_str_size; - if (bdrv_pwrite_sync(bs->file, offset, sn->name, name_size) < 0) + + ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); + if (ret < 0) { goto fail; + } offset += name_size; } - /* update the various header fields */ - data64 = cpu_to_be64(snapshots_offset); - if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, snapshots_offset), - &data64, sizeof(data64)) < 0) + /* + * Update the header to point to the new snapshot table. This requires the + * new table and its refcounts to be stable on disk. + */ + ret = bdrv_flush(bs); + if (ret < 0) { goto fail; - data32 = cpu_to_be32(s->nb_snapshots); - if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), - &data32, sizeof(data32)) < 0) + } + + QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != + offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); + + header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); + header_data.snapshots_offset = cpu_to_be64(snapshots_offset); + + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), + &header_data, sizeof(header_data)); + if (ret < 0) { goto fail; + } /* free the old snapshot table */ qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size); s->snapshots_offset = snapshots_offset; s->snapshots_size = snapshots_size; return 0; - fail: - return -1; + +fail: + return ret; } static void find_new_snapshot_id(BlockDriverState *bs, @@ -236,72 +277,92 @@ static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) { BDRVQcowState *s = bs->opaque; - QCowSnapshot *snapshots1, sn1, *sn = &sn1; + QCowSnapshot *new_snapshot_list = NULL; + QCowSnapshot *old_snapshot_list = NULL; + QCowSnapshot sn1, *sn = &sn1; int i, ret; uint64_t *l1_table = NULL; int64_t l1_table_offset; memset(sn, 0, sizeof(*sn)); + /* Generate an ID if it wasn't passed */ if (sn_info->id_str[0] == '\0') { - /* compute a new id */ find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); } - /* check that the ID is unique */ - if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) + /* Check that the ID is unique */ + if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { return -ENOENT; + } + /* Populate sn with passed data */ sn->id_str = g_strdup(sn_info->id_str); - if (!sn->id_str) - goto fail; sn->name = g_strdup(sn_info->name); - if (!sn->name) - goto fail; + sn->vm_state_size = sn_info->vm_state_size; sn->date_sec = sn_info->date_sec; sn->date_nsec = sn_info->date_nsec; sn->vm_clock_nsec = sn_info->vm_clock_nsec; - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); - if (ret < 0) - goto fail; - - /* create the L1 table of the snapshot */ + /* Allocate the L1 table of the snapshot and copy the current one there. 
*/ l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); if (l1_table_offset < 0) { + ret = l1_table_offset; goto fail; } - bdrv_flush(bs->file); sn->l1_table_offset = l1_table_offset; sn->l1_size = s->l1_size; - if (s->l1_size != 0) { - l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); - } else { - l1_table = NULL; - } - + l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); for(i = 0; i < s->l1_size; i++) { l1_table[i] = cpu_to_be64(s->l1_table[i]); } - if (bdrv_pwrite_sync(bs->file, sn->l1_table_offset, - l1_table, s->l1_size * sizeof(uint64_t)) < 0) + + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { goto fail; + } + g_free(l1_table); l1_table = NULL; - snapshots1 = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); + /* + * Increase the refcounts of all clusters and make sure everything is + * stable on disk before updating the snapshot table to contain a pointer + * to the new L1 table. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { + goto fail; + } + + ret = bdrv_flush(bs); + if (ret < 0) { + goto fail; + } + + /* Append the new snapshot to the snapshot list */ + new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); if (s->snapshots) { - memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot)); - g_free(s->snapshots); + memcpy(new_snapshot_list, s->snapshots, + s->nb_snapshots * sizeof(QCowSnapshot)); + old_snapshot_list = s->snapshots; } - s->snapshots = snapshots1; + s->snapshots = new_snapshot_list; s->snapshots[s->nb_snapshots++] = *sn; - if (qcow2_write_snapshots(bs) < 0) + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + g_free(s->snapshots); + s->snapshots = old_snapshot_list; goto fail; + } + + g_free(old_snapshot_list); + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -309,10 +370,13 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) } #endif return 0; - fail: + +fail: + g_free(sn->id_str); g_free(sn->name); g_free(l1_table); - return -1; + + return ret; } /* copy the snapshot 'snapshot_name' into the current disk image */ @@ -322,38 +386,92 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) QCowSnapshot *sn; int i, snapshot_index; int cur_l1_bytes, sn_l1_bytes; + int ret; + uint64_t *sn_l1_table = NULL; + /* Search the snapshot */ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); - if (snapshot_index < 0) + if (snapshot_index < 0) { return -ENOENT; + } sn = &s->snapshots[snapshot_index]; - if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0) - goto fail; - - if (qcow2_grow_l1_table(bs, sn->l1_size, true) < 0) + /* + * Make sure that the current L1 table is big enough to contain the whole + * L1 table of the snapshot. If the snapshot L1 table is smaller, the + * current one must be padded with zeros. + */ + ret = qcow2_grow_l1_table(bs, sn->l1_size, true); + if (ret < 0) { goto fail; + } cur_l1_bytes = s->l1_size * sizeof(uint64_t); sn_l1_bytes = sn->l1_size * sizeof(uint64_t); - if (cur_l1_bytes > sn_l1_bytes) { - memset(s->l1_table + sn->l1_size, 0, cur_l1_bytes - sn_l1_bytes); + /* + * Copy the snapshot L1 table to the current L1 table. + * + * Before overwriting the old current L1 table on disk, make sure to + * increase all refcounts for the clusters referenced by the new one. + * Decrease the refcount referenced by the old one only when the L1 + * table is overwritten. 
+ */ + sn_l1_table = g_malloc0(cur_l1_bytes); + + ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); + if (ret < 0) { + goto fail; } - /* copy the snapshot l1 table to the current l1 table */ - if (bdrv_pread(bs->file, sn->l1_table_offset, - s->l1_table, sn_l1_bytes) < 0) + ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, + sn->l1_size, 1); + if (ret < 0) { goto fail; - if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, - s->l1_table, cur_l1_bytes) < 0) + } + + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, + cur_l1_bytes); + if (ret < 0) { goto fail; + } + + /* + * Decrease refcount of clusters of current L1 table. + * + * At this point, the in-memory s->l1_table points to the old L1 table, + * whereas on disk we already have the new one. + * + * qcow2_update_snapshot_refcount special cases the current L1 table to use + * the in-memory data instead of really using the offset to load a new one, + * which is why this works. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, + s->l1_size, -1); + + /* + * Now update the in-memory L1 table to be in sync with the on-disk one. We + * need to do this even if updating refcounts failed. + */ for(i = 0;i < s->l1_size; i++) { - be64_to_cpus(&s->l1_table[i]); + s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); } - if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0) + if (ret < 0) { goto fail; + } + + g_free(sn_l1_table); + sn_l1_table = NULL; + + /* + * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed + * when we decreased the refcount of the old snapshot. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) { + goto fail; + } #ifdef DEBUG_ALLOC { @@ -362,39 +480,59 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) } #endif return 0; - fail: - return -EIO; + +fail: + g_free(sn_l1_table); + return ret; } int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) { BDRVQcowState *s = bs->opaque; - QCowSnapshot *sn; + QCowSnapshot sn; int snapshot_index, ret; + /* Search the snapshot */ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); - if (snapshot_index < 0) + if (snapshot_index < 0) { return -ENOENT; - sn = &s->snapshots[snapshot_index]; + } + sn = s->snapshots[snapshot_index]; + + /* Remove it from the snapshot list */ + memmove(s->snapshots + snapshot_index, + s->snapshots + snapshot_index + 1, + (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); + s->nb_snapshots--; + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + return ret; + } - ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1); - if (ret < 0) + /* + * The snapshot is now unused, clean up. If we fail after this point, we + * won't recover but just leak clusters. + */ + g_free(sn.id_str); + g_free(sn.name); + + /* + * Now decrease the refcounts of clusters referenced by the snapshot and + * free the L1 table. 
+ */ + ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, + sn.l1_size, -1); + if (ret < 0) { return ret; + } + qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t)); + /* must update the copied flag on the current cluster offsets */ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); - if (ret < 0) - return ret; - qcow2_free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t)); - - g_free(sn->id_str); - g_free(sn->name); - memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn)); - s->nb_snapshots--; - ret = qcow2_write_snapshots(bs); if (ret < 0) { - /* XXX: restore snapshot if error ? */ return ret; } + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -435,32 +573,42 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) { - int i, snapshot_index, l1_size2; + int i, snapshot_index; BDRVQcowState *s = bs->opaque; QCowSnapshot *sn; + uint64_t *new_l1_table; + int new_l1_bytes; + int ret; + + assert(bs->read_only); + /* Search the snapshot */ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); if (snapshot_index < 0) { return -ENOENT; } - sn = &s->snapshots[snapshot_index]; - s->l1_size = sn->l1_size; - l1_size2 = s->l1_size * sizeof(uint64_t); - if (s->l1_table != NULL) { - g_free(s->l1_table); - } - s->l1_table_offset = sn->l1_table_offset; - s->l1_table = g_malloc0(align_offset(l1_size2, 512)); + /* Allocate and read in the snapshot's L1 table */ + new_l1_bytes = s->l1_size * sizeof(uint64_t); + new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); - if (bdrv_pread(bs->file, sn->l1_table_offset, - s->l1_table, l1_size2) != l1_size2) { - return -1; + ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); + if (ret < 0) { + g_free(new_l1_table); + return ret; } + /* Switch the L1 table */ + g_free(s->l1_table); + + s->l1_size = sn->l1_size; + s->l1_table_offset = sn->l1_table_offset; + s->l1_table = new_l1_table; + for(i = 0;i < s->l1_size; i++) { be64_to_cpus(&s->l1_table[i]); } + return 0; } diff --git a/block/qcow2.c b/block/qcow2.c index 9e1b1eb2ed..37cd4424d4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -273,8 +273,9 @@ static int qcow2_open(BlockDriverState *bs, int flags) } bs->backing_file[len] = '\0'; } - if (qcow2_read_snapshots(bs) < 0) { - ret = -EINVAL; + + ret = qcow2_read_snapshots(bs); + if (ret < 0) { goto fail; } @@ -343,16 +344,19 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) return 0; } -static int qcow2_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) { + BDRVQcowState *s = bs->opaque; uint64_t cluster_offset; int ret; *pnum = nb_sectors; - /* FIXME We can get errors here, but the bdrv_is_allocated interface can't - * pass them on today */ + /* FIXME We can get errors here, but the bdrv_co_is_allocated interface + * can't pass them on today */ + qemu_co_mutex_lock(&s->lock); ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); + qemu_co_mutex_unlock(&s->lock); if (ret < 0) { *pnum = 0; } @@ -377,7 +381,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, return n1; } -static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, int 
remaining_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; @@ -512,12 +516,12 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) /* Restart all dependent requests */ if (!qemu_co_queue_empty(&m->dependent_requests)) { qemu_co_mutex_unlock(&s->lock); - while(qemu_co_queue_next(&m->dependent_requests)); + qemu_co_queue_restart_all(&m->dependent_requests); qemu_co_mutex_lock(&s->lock); } } -static int qcow2_co_writev(BlockDriverState *bs, +static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov) @@ -1137,7 +1141,7 @@ fail: return ret; } -static int qcow2_co_flush_to_os(BlockDriverState *bs) +static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; int ret; @@ -1159,7 +1163,7 @@ static int qcow2_co_flush_to_os(BlockDriverState *bs) return 0; } -static int qcow2_co_flush_to_disk(BlockDriverState *bs) +static coroutine_fn int qcow2_co_flush_to_disk(BlockDriverState *bs) { return bdrv_co_flush(bs->file); } @@ -1276,7 +1280,7 @@ static BlockDriver bdrv_qcow2 = { .bdrv_open = qcow2_open, .bdrv_close = qcow2_close, .bdrv_create = qcow2_create, - .bdrv_is_allocated = qcow2_is_allocated, + .bdrv_co_is_allocated = qcow2_co_is_allocated, .bdrv_set_key = qcow2_set_key, .bdrv_make_empty = qcow2_make_empty, diff --git a/block/qed-table.c b/block/qed-table.c index f31f9ff069..8ee844346c 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -29,7 +29,7 @@ static void qed_read_table_cb(void *opaque, int ret) { QEDReadTableCB *read_table_cb = opaque; QEDTable *table = read_table_cb->table; - int noffsets = read_table_cb->iov.iov_len / sizeof(uint64_t); + int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); int i; /* Handle I/O error */ @@ -65,7 +65,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); aiocb = bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, - read_table_cb->iov.iov_len / BDRV_SECTOR_SIZE, + qiov->size / BDRV_SECTOR_SIZE, qed_read_table_cb, read_table_cb); if (!aiocb) { qed_read_table_cb(read_table_cb, -EIO); @@ -160,7 +160,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, aiocb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, &write_table_cb->qiov, - write_table_cb->iov.iov_len / BDRV_SECTOR_SIZE, + write_table_cb->qiov.size / BDRV_SECTOR_SIZE, qed_write_table_cb, write_table_cb); if (!aiocb) { qed_write_table_cb(write_table_cb, -EIO); diff --git a/block/qed.c b/block/qed.c index 7e22e77b9d..22e467202f 100644 --- a/block/qed.c +++ b/block/qed.c @@ -661,6 +661,7 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) } typedef struct { + Coroutine *co; int is_allocated; int *pnum; } QEDIsAllocatedCB; @@ -670,10 +671,14 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l QEDIsAllocatedCB *cb = opaque; *cb->pnum = len / BDRV_SECTOR_SIZE; cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } } -static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) { BDRVQEDState *s = bs->opaque; uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; @@ -686,8 +691,10 @@ static int bdrv_qed_is_allocated(BlockDriverState 
*bs, int64_t sector_num, qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + /* Now sleep if the callback wasn't invoked immediately */ while (cb.is_allocated == -1) { - qemu_aio_wait(); + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); } qed_unref_l2_cache_entry(request.l2_table); @@ -1485,7 +1492,7 @@ static BlockDriver bdrv_qed = { .bdrv_open = bdrv_qed_open, .bdrv_close = bdrv_qed_close, .bdrv_create = bdrv_qed_create, - .bdrv_is_allocated = bdrv_qed_is_allocated, + .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, .bdrv_make_empty = bdrv_qed_make_empty, .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, diff --git a/block/sheepdog.c b/block/sheepdog.c index 62f1f3a0cf..aa9707f2ae 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1715,7 +1715,7 @@ out: return 1; } -static int sd_co_writev(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { SheepdogAIOCB *acb; @@ -1744,7 +1744,7 @@ static int sd_co_writev(BlockDriverState *bs, int64_t sector_num, return acb->ret; } -static int sd_co_readv(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { SheepdogAIOCB *acb; diff --git a/block/vdi.c b/block/vdi.c index 02da6b44d0..e1d8cffc2d 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -472,8 +472,8 @@ static int vdi_open(BlockDriverState *bs, int flags) return -1; } -static int vdi_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) { /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). 
*/ BDRVVdiState *s = (BDRVVdiState *)bs->opaque; @@ -996,7 +996,7 @@ static BlockDriver bdrv_vdi = { .bdrv_close = vdi_close, .bdrv_create = vdi_create, .bdrv_co_flush_to_disk = vdi_co_flush, - .bdrv_is_allocated = vdi_is_allocated, + .bdrv_co_is_allocated = vdi_co_is_allocated, .bdrv_make_empty = vdi_make_empty, .bdrv_aio_readv = vdi_aio_readv, diff --git a/block/vmdk.c b/block/vmdk.c index f5441591d7..5623ac10cd 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -861,8 +861,8 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) { BDRVVmdkState *s = bs->opaque; int64_t index_in_cluster, n, ret; @@ -873,8 +873,10 @@ static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num, if (!extent) { return 0; } + qemu_co_mutex_lock(&s->lock); ret = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0, &offset); + qemu_co_mutex_unlock(&s->lock); /* get_cluster_offset returning 0 means success */ ret = !ret; @@ -1596,7 +1598,7 @@ static BlockDriver bdrv_vmdk = { .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, - .bdrv_is_allocated = vmdk_is_allocated, + .bdrv_co_is_allocated = vmdk_co_is_allocated, .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, .create_options = vmdk_create_options, diff --git a/block/vvfat.c b/block/vvfat.c index a310ce8c3e..eeffc4a4a8 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -2758,7 +2758,7 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, return ret; } -static int vvfat_is_allocated(BlockDriverState *bs, +static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int* n) { BDRVVVFATState* s = bs->opaque; @@ -2855,7 +2855,7 @@ static BlockDriver bdrv_vvfat = { .bdrv_read = vvfat_co_read, .bdrv_write = vvfat_co_write, .bdrv_close = vvfat_close, - .bdrv_is_allocated = vvfat_is_allocated, + .bdrv_co_is_allocated = vvfat_co_is_allocated, .protocol_name = "fat", }; diff --git a/block_int.h b/block_int.h index 77c0187c3d..311bd2a6fa 100644 --- a/block_int.h +++ b/block_int.h @@ -34,6 +34,13 @@ #define BLOCK_FLAG_ENCRYPT 1 #define BLOCK_FLAG_COMPAT6 4 +#define BLOCK_IO_LIMIT_READ 0 +#define BLOCK_IO_LIMIT_WRITE 1 +#define BLOCK_IO_LIMIT_TOTAL 2 + +#define BLOCK_IO_SLICE_TIME 100000000 +#define NANOSECONDS_PER_SECOND 1000000000.0 + #define BLOCK_OPT_SIZE "size" #define BLOCK_OPT_ENCRYPT "encryption" #define BLOCK_OPT_COMPAT6 "compat6" @@ -44,12 +51,24 @@ #define BLOCK_OPT_PREALLOC "preallocation" #define BLOCK_OPT_SUBFMT "subformat" +typedef struct BdrvTrackedRequest BdrvTrackedRequest; + typedef struct AIOPool { void (*cancel)(BlockDriverAIOCB *acb); int aiocb_size; BlockDriverAIOCB *free_aiocb; } AIOPool; +typedef struct BlockIOLimit { + int64_t bps[3]; + int64_t iops[3]; +} BlockIOLimit; + +typedef struct BlockIOBaseValue { + uint64_t bytes[2]; + uint64_t ios[2]; +} BlockIOBaseValue; + struct BlockDriver { const char *format_name; int instance_size; @@ -63,8 +82,6 @@ struct BlockDriver { const uint8_t *buf, int nb_sectors); void (*bdrv_close)(BlockDriverState *bs); int (*bdrv_create)(const char *filename, QEMUOptionParameter *options); - int (*bdrv_is_allocated)(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum); int (*bdrv_set_key)(BlockDriverState *bs, const char *key); int 
(*bdrv_make_empty)(BlockDriverState *bs); /* aio */ @@ -86,6 +103,8 @@ struct BlockDriver { int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs, int64_t sector_num, int nb_sectors); + int coroutine_fn (*bdrv_co_is_allocated)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum); /* * Invalidate any cached meta-data. @@ -179,6 +198,8 @@ struct BlockDriverState { int encrypted; /* if true, the media is encrypted */ int valid_key; /* if true, a valid encryption key has been set */ int sg; /* if true, the device is a /dev/sg* */ + int copy_on_read; /* if true, copy read backing sectors into image + note this is a reference count */ BlockDriver *drv; /* NULL means no media */ void *opaque; @@ -201,6 +222,16 @@ struct BlockDriverState { void *sync_aiocb; + /* the time for latest disk I/O */ + int64_t slice_time; + int64_t slice_start; + int64_t slice_end; + BlockIOLimit io_limits; + BlockIOBaseValue io_base; + CoQueue throttled_reqs; + QEMUTimer *block_timer; + bool io_limits_enabled; + /* I/O stats (display with "info blockstats"). */ uint64_t nr_bytes[BDRV_MAX_IOTYPE]; uint64_t nr_ops[BDRV_MAX_IOTYPE]; @@ -228,6 +259,8 @@ struct BlockDriverState { int in_use; /* users other than guest access, eg. block migration */ QTAILQ_ENTRY(BlockDriverState) list; void *private; + + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; }; struct BlockDriverAIOCB { @@ -244,6 +277,9 @@ void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque); void qemu_aio_release(void *p); +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits); + #ifdef _WIN32 int is_windows_drive(const char *filename); #endif diff --git a/blockdev.c b/blockdev.c index 222818690d..dbf0251a77 100644 --- a/blockdev.c +++ b/blockdev.c @@ -216,6 +216,26 @@ static int parse_block_error_action(const char *buf, int is_read) } } +static bool do_check_io_limits(BlockIOLimit *io_limits) +{ + bool bps_flag; + bool iops_flag; + + assert(io_limits); + + bps_flag = (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] != 0) + && ((io_limits->bps[BLOCK_IO_LIMIT_READ] != 0) + || (io_limits->bps[BLOCK_IO_LIMIT_WRITE] != 0)); + iops_flag = (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] != 0) + && ((io_limits->iops[BLOCK_IO_LIMIT_READ] != 0) + || (io_limits->iops[BLOCK_IO_LIMIT_WRITE] != 0)); + if (bps_flag || iops_flag) { + return false; + } + + return true; +} + DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) { const char *buf; @@ -235,7 +255,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) int on_read_error, on_write_error; const char *devaddr; DriveInfo *dinfo; + BlockIOLimit io_limits; int snapshot = 0; + bool copy_on_read; int ret; translation = BIOS_ATA_TRANSLATION_AUTO; @@ -252,6 +274,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) snapshot = qemu_opt_get_bool(opts, "snapshot", 0); ro = qemu_opt_get_bool(opts, "readonly", 0); + copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false); file = qemu_opt_get(opts, "file"); serial = qemu_opt_get(opts, "serial"); @@ -353,6 +376,26 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) } } + /* disk I/O throttling */ + io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = + qemu_opt_get_number(opts, "bps", 0); + io_limits.bps[BLOCK_IO_LIMIT_READ] = + qemu_opt_get_number(opts, "bps_rd", 0); + io_limits.bps[BLOCK_IO_LIMIT_WRITE] = + qemu_opt_get_number(opts, "bps_wr", 0); + io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = + qemu_opt_get_number(opts, "iops", 0); + 
io_limits.iops[BLOCK_IO_LIMIT_READ] = + qemu_opt_get_number(opts, "iops_rd", 0); + io_limits.iops[BLOCK_IO_LIMIT_WRITE] = + qemu_opt_get_number(opts, "iops_wr", 0); + + if (!do_check_io_limits(&io_limits)) { + error_report("bps(iops) and bps_rd/bps_wr(iops_rd/iops_wr) " + "cannot be used at the same time"); + return NULL; + } + on_write_error = BLOCK_ERR_STOP_ENOSPC; if ((buf = qemu_opt_get(opts, "werror")) != NULL) { if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) { @@ -460,6 +503,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error); + /* disk I/O throttling */ + bdrv_set_io_limits(dinfo->bdrv, &io_limits); + switch(type) { case IF_IDE: case IF_SCSI: @@ -502,6 +548,10 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) bdrv_flags |= (BDRV_O_SNAPSHOT|BDRV_O_CACHE_WB|BDRV_O_NO_FLUSH); } + if (copy_on_read) { + bdrv_flags |= BDRV_O_COPY_ON_READ; + } + if (media == MEDIA_CDROM) { /* CDROM is fine for any interface, don't check. */ ro = 1; @@ -603,7 +653,7 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) goto out; } - qemu_aio_flush(); + bdrv_drain_all(); bdrv_flush(bs); bdrv_close(bs); @@ -715,6 +765,65 @@ int do_change_block(Monitor *mon, const char *device, return monitor_read_bdrv_key_start(mon, bs, NULL, NULL); } +/* throttling disk I/O limits */ +int do_block_set_io_throttle(Monitor *mon, + const QDict *qdict, QObject **ret_data) +{ + BlockIOLimit io_limits; + const char *devname = qdict_get_str(qdict, "device"); + BlockDriverState *bs; + + io_limits.bps[BLOCK_IO_LIMIT_TOTAL] + = qdict_get_try_int(qdict, "bps", -1); + io_limits.bps[BLOCK_IO_LIMIT_READ] + = qdict_get_try_int(qdict, "bps_rd", -1); + io_limits.bps[BLOCK_IO_LIMIT_WRITE] + = qdict_get_try_int(qdict, "bps_wr", -1); + io_limits.iops[BLOCK_IO_LIMIT_TOTAL] + = qdict_get_try_int(qdict, "iops", -1); + io_limits.iops[BLOCK_IO_LIMIT_READ] + = qdict_get_try_int(qdict, "iops_rd", -1); + io_limits.iops[BLOCK_IO_LIMIT_WRITE] + = qdict_get_try_int(qdict, "iops_wr", -1); + + bs = bdrv_find(devname); + if (!bs) { + qerror_report(QERR_DEVICE_NOT_FOUND, devname); + return -1; + } + + if ((io_limits.bps[BLOCK_IO_LIMIT_TOTAL] == -1) + || (io_limits.bps[BLOCK_IO_LIMIT_READ] == -1) + || (io_limits.bps[BLOCK_IO_LIMIT_WRITE] == -1) + || (io_limits.iops[BLOCK_IO_LIMIT_TOTAL] == -1) + || (io_limits.iops[BLOCK_IO_LIMIT_READ] == -1) + || (io_limits.iops[BLOCK_IO_LIMIT_WRITE] == -1)) { + qerror_report(QERR_MISSING_PARAMETER, + "bps/bps_rd/bps_wr/iops/iops_rd/iops_wr"); + return -1; + } + + if (!do_check_io_limits(&io_limits)) { + qerror_report(QERR_INVALID_PARAMETER_COMBINATION); + return -1; + } + + bs->io_limits = io_limits; + bs->slice_time = BLOCK_IO_SLICE_TIME; + + if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) { + bdrv_io_limits_enable(bs); + } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) { + bdrv_io_limits_disable(bs); + } else { + if (bs->block_timer) { + qemu_mod_timer(bs->block_timer, qemu_get_clock_ns(vm_clock)); + } + } + + return 0; +} + int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) { const char *id = qdict_get_str(qdict, "id"); @@ -731,7 +840,7 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) } /* quiesce block driver; prevent further io */ - qemu_aio_flush(); + bdrv_drain_all(); bdrv_flush(bs); bdrv_close(bs); diff --git a/blockdev.h b/blockdev.h index 3587786a64..1b48a75499 100644 --- a/blockdev.h +++ b/blockdev.h @@ 
-63,6 +63,8 @@ int do_block_set_passwd(Monitor *mon, const QDict *qdict, QObject **ret_data); int do_change_block(Monitor *mon, const char *device, const char *filename, const char *fmt); int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data); +int do_block_set_io_throttle(Monitor *mon, + const QDict *qdict, QObject **ret_data); int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data); int do_block_resize(Monitor *mon, const QDict *qdict, QObject **ret_data); @@ -396,7 +396,7 @@ static void do_vm_stop(RunState state) pause_all_vcpus(); runstate_set(state); vm_state_notify(0, state); - qemu_aio_flush(); + bdrv_drain_all(); bdrv_flush_all(); monitor_protocol_event(QEVENT_STOP, NULL); } diff --git a/dma-helpers.c b/dma-helpers.c index bdcd38cd27..9d6b6fa1fd 100644 --- a/dma-helpers.c +++ b/dma-helpers.c @@ -9,6 +9,7 @@ #include "dma.h" #include "block_int.h" +#include "trace.h" void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint) { @@ -83,6 +84,8 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs) static void dma_complete(DMAAIOCB *dbs, int ret) { + trace_dma_complete(dbs, ret, dbs->common.cb); + dma_bdrv_unmap(dbs); if (dbs->common.cb) { dbs->common.cb(dbs->common.opaque, ret); @@ -106,6 +109,8 @@ static void dma_bdrv_cb(void *opaque, int ret) target_phys_addr_t cur_addr, cur_len; void *mem; + trace_dma_bdrv_cb(dbs, ret); + dbs->acb = NULL; dbs->sector_num += dbs->iov.size / 512; dma_bdrv_unmap(dbs); @@ -130,6 +135,7 @@ static void dma_bdrv_cb(void *opaque, int ret) } if (dbs->iov.size == 0) { + trace_dma_map_wait(dbs); cpu_register_map_client(dbs, continue_after_map_failure); return; } @@ -145,6 +151,8 @@ static void dma_aio_cancel(BlockDriverAIOCB *acb) { DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common); + trace_dma_aio_cancel(dbs); + if (dbs->acb) { BlockDriverAIOCB *acb = dbs->acb; dbs->acb = NULL; @@ -168,6 +176,8 @@ BlockDriverAIOCB *dma_bdrv_io( { DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque); + trace_dma_bdrv_io(dbs, bs, sector_num, to_dev); + dbs->acb = NULL; dbs->bs = bs; dbs->sg = sg; diff --git a/hmp-commands.hx b/hmp-commands.hx index 089c1ac23d..79a919526d 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -860,9 +860,10 @@ ETEXI .args_type = "pci_addr:s,opts:s", .params = "[[<domain>:]<bus>:]<slot>\n" "[file=file][,if=type][,bus=n]\n" - "[,unit=m][,media=d][index=i]\n" + "[,unit=m][,media=d][,index=i]\n" "[,cyls=c,heads=h,secs=s[,trans=t]]\n" - "[snapshot=on|off][,cache=on|off]", + "[,snapshot=on|off][,cache=on|off]\n" + "[,readonly=on|off][,copy-on-read=on|off]", .help = "add drive to PCI storage controller", .mhandler.cmd = drive_hot_add, }, @@ -1207,6 +1208,21 @@ ETEXI }, STEXI +@item block_set_io_throttle @var{device} @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr} +@findex block_set_io_throttle +Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr} +ETEXI + + { + .name = "block_set_io_throttle", + .args_type = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l", + .params = "device bps bps_rd bps_wr iops iops_rd iops_wr", + .help = "change I/O throttle limits for a block drive", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_block_set_io_throttle, + }, + +STEXI @item block_passwd @var{device} @var{password} @findex block_passwd Set the encrypted device @var{device} password to @var{password} @@ -216,6 +216,16 @@ void hmp_info_block(Monitor *mon) info->value->inserted->ro, info->value->inserted->drv, 
info->value->inserted->encrypted); + + monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64 + " bps_wr=%" PRId64 " iops=%" PRId64 + " iops_rd=%" PRId64 " iops_wr=%" PRId64, + info->value->inserted->bps, + info->value->inserted->bps_rd, + info->value->inserted->bps_wr, + info->value->inserted->iops, + info->value->inserted->iops_rd, + info->value->inserted->iops_wr); } else { monitor_printf(mon, " [not inserted]"); } diff --git a/hw/ide/macio.c b/hw/ide/macio.c index 70b33422d2..c09d2e0a35 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -200,8 +200,9 @@ static void pmac_ide_flush(DBDMA_io *io) { MACIOIDEState *m = io->opaque; - if (m->aiocb) - qemu_aio_flush(); + if (m->aiocb) { + bdrv_drain_all(); + } } /* PowerMac IDE memory IO */ diff --git a/hw/ide/pci.c b/hw/ide/pci.c index 49b823df79..5078c0b565 100644 --- a/hw/ide/pci.c +++ b/hw/ide/pci.c @@ -309,7 +309,7 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val) * aio operation with preadv/pwritev. */ if (bm->bus->dma->aiocb) { - qemu_aio_flush(); + bdrv_drain_all(); assert(bm->bus->dma->aiocb == NULL); assert((bm->status & BM_STATUS_DMAING) == 0); } diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c index d6d1f87cda..4b0d113ba8 100644 --- a/hw/virtio-blk.c +++ b/hw/virtio-blk.c @@ -474,7 +474,7 @@ static void virtio_blk_reset(VirtIODevice *vdev) * This should cancel pending requests, but can't do nicely until there * are per-device request lists. */ - qemu_aio_flush(); + bdrv_drain_all(); } /* coalesce internal state, copy to pci i/o region 0 diff --git a/hw/xen_disk.c b/hw/xen_disk.c index 286bbac54a..192e81746f 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -49,7 +49,6 @@ static int syncwrite = 0; static int batch_maps = 0; static int max_requests = 32; -static int use_aio = 1; /* ------------------------------------------------------------- */ @@ -314,76 +313,6 @@ static int ioreq_map(struct ioreq *ioreq) return 0; } -static int ioreq_runio_qemu_sync(struct ioreq *ioreq) -{ - struct XenBlkDev *blkdev = ioreq->blkdev; - int i, rc; - off_t pos; - - if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) { - goto err_no_map; - } - if (ioreq->presync) { - bdrv_flush(blkdev->bs); - } - - switch (ioreq->req.operation) { - case BLKIF_OP_READ: - pos = ioreq->start; - for (i = 0; i < ioreq->v.niov; i++) { - rc = bdrv_read(blkdev->bs, pos / BLOCK_SIZE, - ioreq->v.iov[i].iov_base, - ioreq->v.iov[i].iov_len / BLOCK_SIZE); - if (rc != 0) { - xen_be_printf(&blkdev->xendev, 0, "rd I/O error (%p, len %zd)\n", - ioreq->v.iov[i].iov_base, - ioreq->v.iov[i].iov_len); - goto err; - } - pos += ioreq->v.iov[i].iov_len; - } - break; - case BLKIF_OP_WRITE: - case BLKIF_OP_WRITE_BARRIER: - if (!ioreq->req.nr_segments) { - break; - } - pos = ioreq->start; - for (i = 0; i < ioreq->v.niov; i++) { - rc = bdrv_write(blkdev->bs, pos / BLOCK_SIZE, - ioreq->v.iov[i].iov_base, - ioreq->v.iov[i].iov_len / BLOCK_SIZE); - if (rc != 0) { - xen_be_printf(&blkdev->xendev, 0, "wr I/O error (%p, len %zd)\n", - ioreq->v.iov[i].iov_base, - ioreq->v.iov[i].iov_len); - goto err; - } - pos += ioreq->v.iov[i].iov_len; - } - break; - default: - /* unknown operation (shouldn't happen -- parse catches this) */ - goto err; - } - - if (ioreq->postsync) { - bdrv_flush(blkdev->bs); - } - ioreq->status = BLKIF_RSP_OKAY; - - ioreq_unmap(ioreq); - ioreq_finish(ioreq); - return 0; - -err: - ioreq_unmap(ioreq); -err_no_map: - ioreq_finish(ioreq); - ioreq->status = BLKIF_RSP_ERROR; - return -1; -} - static void qemu_aio_complete(void *opaque, int ret) { struct ioreq *ioreq = opaque; @@ 
-554,9 +483,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev) rp = blkdev->rings.common.sring->req_prod; xen_rmb(); /* Ensure we see queued requests up to 'rp'. */ - if (use_aio) { - blk_send_response_all(blkdev); - } + blk_send_response_all(blkdev); while (rc != rp) { /* pull request from ring */ if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) { @@ -579,16 +506,7 @@ continue; } - if (use_aio) { - /* run i/o in aio mode */ - ioreq_runio_qemu_aio(ioreq); - } else { - /* run i/o in sync mode */ - ioreq_runio_qemu_sync(ioreq); - } - } - if (!use_aio) { - blk_send_response_all(blkdev); + ioreq_runio_qemu_aio(ioreq); } if (blkdev->more_work && blkdev->requests_inflight < max_requests) { diff --git a/hw/xen_platform.c b/hw/xen_platform.c index 5e792f56f6..e62eaef7d1 100644 --- a/hw/xen_platform.c +++ b/hw/xen_platform.c @@ -120,7 +120,7 @@ static void platform_fixed_ioport_writew(void *opaque, uint32_t addr, uint32_t v devices, and bit 2 the non-primary-master IDE devices. */ if (val & UNPLUG_ALL_IDE_DISKS) { DPRINTF("unplug disks\n"); - qemu_aio_flush(); + bdrv_drain_all(); bdrv_flush_all(); pci_unplug_disks(s->pci_dev.bus); } diff --git a/qapi-schema.json b/qapi-schema.json index cb1ba776df..fbbdbe0914 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -370,13 +370,27 @@ # # @encrypted: true if the backing device is encrypted # +# @bps: total throughput limit in bytes per second +# +# @bps_rd: read throughput limit in bytes per second +# +# @bps_wr: write throughput limit in bytes per second +# +# @iops: total I/O operations per second +# +# @iops_rd: read I/O operations per second +# +# @iops_wr: write I/O operations per second +# # Since: 0.14.0 # # Notes: This interface is only found in @BlockInfo. 
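# # A value of 0 in any of these fields means the corresponding limit is not enforced (the drive options default unset limits to 0).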
## { 'type': 'BlockDeviceInfo', 'data': { 'file': 'str', 'ro': 'bool', 'drv': 'str', - '*backing_file': 'str', 'encrypted': 'bool' } } + '*backing_file': 'str', 'encrypted': 'bool', + 'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int', + 'iops': 'int', 'iops_rd': 'int', 'iops_wr': 'int'} } ## # @BlockDeviceIoStatus: diff --git a/qemu-common.h b/qemu-common.h index 2ce47aa12d..44870fe523 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -341,6 +341,12 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) return res.ll; } +/* Round number down to multiple */ +#define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m)) + +/* Round number up to multiple */ +#define QEMU_ALIGN_UP(n, m) QEMU_ALIGN_DOWN((n) + (m) - 1, (m)) + #include "module.h" #endif diff --git a/qemu-config.c b/qemu-config.c index 597d7e10b1..18f30204a1 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -85,6 +85,34 @@ static QemuOptsList qemu_drive_opts = { .name = "readonly", .type = QEMU_OPT_BOOL, .help = "open drive file as read-only", + },{ + .name = "iops", + .type = QEMU_OPT_NUMBER, + .help = "limit total I/O operations per second", + },{ + .name = "iops_rd", + .type = QEMU_OPT_NUMBER, + .help = "limit read operations per second", + },{ + .name = "iops_wr", + .type = QEMU_OPT_NUMBER, + .help = "limit write operations per second", + },{ + .name = "bps", + .type = QEMU_OPT_NUMBER, + .help = "limit total bytes per second", + },{ + .name = "bps_rd", + .type = QEMU_OPT_NUMBER, + .help = "limit read bytes per second", + },{ + .name = "bps_wr", + .type = QEMU_OPT_NUMBER, + .help = "limit write bytes per second", + },{ + .name = "copy-on-read", + .type = QEMU_OPT_BOOL, + .help = "copy read data from backing file into image file", }, { /* end of list */ } }, diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c index 6b58160058..26ad76bf50 100644 --- a/qemu-coroutine-lock.c +++ b/qemu-coroutine-lock.c @@ -61,6 +61,14 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue) assert(qemu_in_coroutine()); } +void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue) +{ + Coroutine *self = qemu_coroutine_self(); + QTAILQ_INSERT_HEAD(&queue->entries, self, co_queue_next); + qemu_coroutine_yield(); + assert(qemu_in_coroutine()); +} + bool qemu_co_queue_next(CoQueue *queue) { Coroutine *next; @@ -76,6 +84,13 @@ bool qemu_co_queue_next(CoQueue *queue) return (next != NULL); } +void qemu_co_queue_restart_all(CoQueue *queue) +{ + while (qemu_co_queue_next(queue)) { + /* Do nothing */ + } +} + bool qemu_co_queue_empty(CoQueue *queue) { return (QTAILQ_FIRST(&queue->entries) == NULL); @@ -136,13 +151,7 @@ void qemu_co_rwlock_unlock(CoRwlock *lock) assert(qemu_in_coroutine()); if (lock->writer) { lock->writer = false; - while (!qemu_co_queue_empty(&lock->queue)) { - /* - * Wakeup every body. This will include some - * writers too. - */ - qemu_co_queue_next(&lock->queue); - } + qemu_co_queue_restart_all(&lock->queue); } else { lock->reader--; assert(lock->reader >= 0); diff --git a/qemu-coroutine.h b/qemu-coroutine.h index b8fc4f4332..8a55fe125e 100644 --- a/qemu-coroutine.h +++ b/qemu-coroutine.h @@ -118,6 +118,12 @@ void qemu_co_queue_init(CoQueue *queue); void coroutine_fn qemu_co_queue_wait(CoQueue *queue); /** + * Adds the current coroutine to the head of the CoQueue and transfers control to the + * caller of the coroutine. + */ +void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue); + +/** * Restarts the next coroutine in the CoQueue and removes it from the queue. 
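 * The disk I/O throttling code depends on this FIFO wakeup order: each block timer expiry restarts exactly one throttled request, the head of throttled_reqs.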
* * Returns true if a coroutine was restarted, false if the queue is empty. @@ -125,6 +131,11 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue); bool qemu_co_queue_next(CoQueue *queue); /** + * Restarts all coroutines in the CoQueue and leaves the queue empty. + */ +void qemu_co_queue_restart_all(CoQueue *queue); + +/** * Checks if the CoQueue is empty. */ bool qemu_co_queue_empty(CoQueue *queue); @@ -1853,9 +1853,9 @@ int main(int argc, char **argv) command_loop(); /* - * Make sure all outstanding requests get flushed the program exits. + * Make sure all outstanding requests complete before the program exits. */ - qemu_aio_flush(); + bdrv_drain_all(); if (bs) { bdrv_delete(bs); diff --git a/qemu-options.hx b/qemu-options.hx index 681eaf198e..b3db10c0ad 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -135,7 +135,8 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, " [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n" " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" - " [,readonly=on|off]\n" + " [,readonly=on|off][,copy-on-read=on|off]\n" + " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]]\n" " use 'file' as a drive image\n", QEMU_ARCH_ALL) STEXI @item -drive @var{option}[,@var{option}[,@var{option}[,...]]] @@ -186,6 +187,9 @@ host disk is full; report the error to the guest otherwise). The default setting is @option{werror=enospc} and @option{rerror=report}. @item readonly Open drive @option{file} as read-only. Guest write attempts will fail. +@item copy-on-read=@var{copy-on-read} +@var{copy-on-read} is "on" or "off" and controls whether to copy read backing +file sectors into the image file. @end table By default, writethrough caching is used for all block devices. This means that @@ -217,6 +221,10 @@ like your host losing power, the disk storage getting disconnected accidentally, etc. your image will most probably be rendered unusable. When using the @option{-snapshot} option, unsafe caching is always used. +Copy-on-read avoids accessing the same backing file sectors repeatedly and is +useful when the backing file is over a slow network. By default copy-on-read +is off. + Instead of @option{-cdrom} you can use: @example qemu -drive file=file,index=2,media=cdrom @@ -251,6 +251,10 @@ static const QErrorStringTable qerror_table[] = { .error_fmt = QERR_QGA_COMMAND_FAILED, .desc = "Guest agent command failed, error was '%(message)'", }, + { + .error_fmt = QERR_INVALID_PARAMETER_COMBINATION, + .desc = "Invalid parameter combination", + }, {} }; @@ -207,4 +207,7 @@ QError *qobject_to_qerror(const QObject *obj); #define QERR_QGA_COMMAND_FAILED \ "{ 'class': 'QgaCommandFailed', 'data': { 'message': %s } }" +#define QERR_INVALID_PARAMETER_COMBINATION \ + "{ 'class': 'InvalidParameterCombination', 'data': {} }" + #endif /* QERROR_H */ diff --git a/qmp-commands.hx b/qmp-commands.hx index 97975a5207..94da2a8ef2 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -851,6 +851,44 @@ Example: EQMP { + .name = "block_set_io_throttle", + .args_type = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l", + .params = "device bps bps_rd bps_wr iops iops_rd iops_wr", + .help = "change I/O throttle limits for a block drive", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_block_set_io_throttle, + }, + +SQMP +block_set_io_throttle +--------------------- + +Change I/O throttle limits for a block drive. 
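Enforcement of these six limits happens over a short accounting slice: bytes (or request counts) completed in the current slice are weighed against the limit multiplied by the elapsed time, and a request that would overshoot is parked on throttled_reqs until the budget catches up. The real checks are the bdrv_exceed_bps_limits()/bdrv_exceed_iops_limits() helpers in block.c; the sketch below only illustrates the arithmetic, and every name in it is invented for the example:

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch: 'done' bytes finished in the current slice, 'elapsed' seconds
     * into the slice, under a 'bps' bytes-per-second limit.  Decide whether
     * 'pending' more bytes fit and, if not, how long to wait (nanoseconds). */
    static bool sketch_exceed_bps(int64_t bps, uint64_t done, uint64_t pending,
                                  double elapsed, uint64_t *wait_ns)
    {
        double budget;

        if (bps <= 0) {
            return false;               /* 0 means "no limit configured" */
        }
        budget = (double)bps * elapsed; /* bytes allowed so far in this slice */
        if ((double)(done + pending) <= budget) {
            *wait_ns = 0;
            return false;
        }
        /* Sleep until enough budget has accrued for the pending bytes. */
        *wait_ns = (uint64_t)(((double)(done + pending) / (double)bps
                               - elapsed) * 1e9);
        return true;
    }

The iops check works the same way with request counts in place of byte counts.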
+ +Arguments: + +- "device": device name (json-string) +- "bps": total throughput limit in bytes per second (json-int) +- "bps_rd": read throughput limit in bytes per second (json-int) +- "bps_wr": write throughput limit in bytes per second (json-int) +- "iops": total I/O operations per second (json-int) +- "iops_rd": read I/O operations per second (json-int) +- "iops_wr": write I/O operations per second (json-int) + +Example: + +-> { "execute": "block_set_io_throttle", "arguments": { "device": "virtio0", + "bps": 1000000, + "bps_rd": 0, + "bps_wr": 0, + "iops": 0, + "iops_rd": 0, + "iops_wr": 0 } } <- { "return": {} } + +EQMP + + { .name = "set_password", .args_type = "protocol:s,password:s,connected:s?", .params = "protocol password action-if-connected", @@ -1152,6 +1190,13 @@ Each json-object contains the following: "tftp", "vdi", "vmdk", "vpc", "vvfat" - "backing_file": backing file name (json-string, optional) - "encrypted": true if encrypted, false otherwise (json-bool) + - "bps": limit total bytes per second (json-int) + - "bps_rd": limit read bytes per second (json-int) + - "bps_wr": limit write bytes per second (json-int) + - "iops": limit total I/O operations per second (json-int) + - "iops_rd": limit read operations per second (json-int) + - "iops_wr": limit write operations per second (json-int) + - "io-status": I/O operation status, only present if the device supports it and the VM is configured to stop on errors. It's always reset to "ok" when the "cont" command is issued (json-string, optional) @@ -1171,7 +1216,13 @@ Example: "ro":false, "drv":"qcow2", "encrypted":false, - "file":"disks/test.img" + "file":"disks/test.img", + "bps":1000000, + "bps_rd":0, + "bps_wr":0, + "iops":1000000, + "iops_rd":0, + "iops_wr":0 }, "type":"unknown" }, @@ -2104,7 +2104,7 @@ int load_vmstate(const char *name) } /* Flush all IO requests so they don't interfere with the new state. 
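 * Note that bdrv_drain_all() waits for in-flight requests to complete but does not itself flush data to disk.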
*/ - qemu_aio_flush(); + bdrv_drain_all(); bs = NULL; while ((bs = bdrv_next(bs))) { diff --git a/trace-events b/trace-events index 962caca598..bf1cf57032 100644 --- a/trace-events +++ b/trace-events @@ -69,6 +69,7 @@ bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d" bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p" +bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d" # hw/virtio-blk.c virtio_blk_req_complete(void *req, int status) "req %p status %d" @@ -631,3 +632,10 @@ win_helper_no_switch_pstate(uint32_t new_pstate_regs) "change_pstate: regs new=% win_helper_wrpil(uint32_t psrpil, uint32_t new_pil) "old=%x new=%x" win_helper_done(uint32_t tl) "tl=%d" win_helper_retry(uint32_t tl) "tl=%d" + +# dma-helpers.c +dma_bdrv_io(void *dbs, void *bs, int64_t sector_num, bool to_dev) "dbs=%p bs=%p sector_num=%" PRId64 " to_dev=%d" +dma_aio_cancel(void *dbs) "dbs=%p" +dma_complete(void *dbs, int ret, void *cb) "dbs=%p ret=%d cb=%p" +dma_bdrv_cb(void *dbs, int ret) "dbs=%p ret=%d" +dma_map_wait(void *dbs) "dbs=%p" diff --git a/xen-mapcache.c b/xen-mapcache.c index 7bcb86e4f8..9fecc64e97 100644 --- a/xen-mapcache.c +++ b/xen-mapcache.c @@ -351,7 +351,7 @@ void xen_invalidate_map_cache(void) MapCacheRev *reventry; /* Flush pending AIO before destroying the mapcache */ - qemu_aio_flush(); + bdrv_drain_all(); QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { DPRINTF("There should be no locked mappings at this time, " |
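The QEMU_ALIGN_DOWN()/QEMU_ALIGN_UP() macros introduced in qemu-common.h round with truncating integer division, which is the rounding a cluster-granular path such as copy-on-read needs (compare the cluster_sector_num/cluster_nb_sectors fields of the new bdrv_co_copy_on_readv trace event). A tiny standalone check of their behaviour, with the 8-sector granularity chosen purely for illustration:

    #include <assert.h>

    #define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m))
    #define QEMU_ALIGN_UP(n, m) QEMU_ALIGN_DOWN((n) + (m) - 1, (m))

    int main(void)
    {
        /* Widen the sector range [13, 14) to its enclosing 8-sector cluster. */
        assert(QEMU_ALIGN_DOWN(13, 8) == 8);    /* cluster start */
        assert(QEMU_ALIGN_UP(14, 8) == 16);     /* cluster end */
        assert(QEMU_ALIGN_UP(16, 8) == 16);     /* aligned values are unchanged */
        return 0;
    }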