Diffstat (limited to 'migration')
-rw-r--r--   migration/Makefile.objs      |    2
-rw-r--r--   migration/block.c            |   31
-rw-r--r--   migration/migration.c        |  879
-rw-r--r--   migration/postcopy-ram.c     |  761
-rw-r--r--   migration/qemu-file-buf.c    |   11
-rw-r--r--   migration/qemu-file-stdio.c  |   17
-rw-r--r--   migration/qemu-file-unix.c   |  124
-rw-r--r--   migration/qemu-file.c        |   90
-rw-r--r--   migration/ram.c              | 1268
-rw-r--r--   migration/rdma.c             |   36
-rw-r--r--   migration/savevm.c           | 1074
11 files changed, 3695 insertions, 598 deletions
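The migration.c hunks below add a return path from the destination back to the source; each return-path message is written as a big-endian 16-bit message type, a 16-bit payload length, and then the payload (see migrate_send_rp_message() in the patch). The standalone C sketch below only illustrates that framing; the helper name and the flat output buffer are hypothetical, since the patch itself writes the fields through QEMUFile's qemu_put_be16()/qemu_put_buffer().

    #include <stdint.h>
    #include <string.h>

    /*
     * Illustrative sketch of the return-path message framing used by
     * migrate_send_rp_message(): 16-bit type, 16-bit length, then the
     * payload, header fields big-endian. The function name and the raw
     * byte buffer are hypothetical; QEMU itself writes via a QEMUFile.
     */
    static size_t rp_frame_message(uint8_t *out, uint16_t type,
                                   const void *data, uint16_t len)
    {
        out[0] = (uint8_t)(type >> 8);   /* message type, big-endian */
        out[1] = (uint8_t)(type & 0xff);
        out[2] = (uint8_t)(len >> 8);    /* payload length, big-endian */
        out[3] = (uint8_t)(len & 0xff);
        memcpy(out + 4, data, len);      /* payload follows the header */
        return 4 + (size_t)len;
    }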
diff --git a/migration/Makefile.objs b/migration/Makefile.objs index d929e969ae..0cac6d707a 100644 --- a/migration/Makefile.objs +++ b/migration/Makefile.objs @@ -1,7 +1,7 @@ common-obj-y += migration.o tcp.o common-obj-y += vmstate.o common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o -common-obj-y += xbzrle.o +common-obj-y += xbzrle.o postcopy-ram.o common-obj-$(CONFIG_RDMA) += rdma.o common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o diff --git a/migration/block.c b/migration/block.c index ed865ed23b..656f38f341 100644 --- a/migration/block.c +++ b/migration/block.c @@ -36,6 +36,8 @@ #define MAX_IS_ALLOCATED_SEARCH 65536 +#define MAX_INFLIGHT_IO 512 + //#define DEBUG_BLK_MIGRATION #ifdef DEBUG_BLK_MIGRATION @@ -591,7 +593,7 @@ static int64_t get_remaining_dirty(void) /* Called with iothread lock taken. */ -static void blk_mig_cleanup(void) +static void block_migration_cleanup(void *opaque) { BlkMigDevState *bmds; BlkMigBlock *blk; @@ -618,11 +620,6 @@ static void blk_mig_cleanup(void) blk_mig_unlock(); } -static void block_migration_cancel(void *opaque) -{ - blk_mig_cleanup(); -} - static int block_save_setup(QEMUFile *f, void *opaque) { int ret; @@ -670,7 +667,10 @@ static int block_save_iterate(QEMUFile *f, void *opaque) blk_mig_lock(); while ((block_mig_state.submitted + block_mig_state.read_done) * BLOCK_SIZE < - qemu_file_get_rate_limit(f)) { + qemu_file_get_rate_limit(f) && + (block_mig_state.submitted + + block_mig_state.read_done) < + MAX_INFLIGHT_IO) { blk_mig_unlock(); if (block_mig_state.bulk_completed == 0) { /* first finish the bulk phase */ @@ -750,11 +750,12 @@ static int block_save_complete(QEMUFile *f, void *opaque) qemu_put_be64(f, BLK_MIG_FLAG_EOS); - blk_mig_cleanup(); return 0; } -static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) +static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, + uint64_t *non_postcopiable_pending, + uint64_t *postcopiable_pending) { /* Estimate pending number of bytes to send */ uint64_t pending; @@ -773,7 +774,8 @@ static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) qemu_mutex_unlock_iothread(); DPRINTF("Enter save live pending %" PRIu64 "\n", pending); - return pending; + /* We don't do postcopy */ + *non_postcopiable_pending += pending; } static int block_load(QEMUFile *f, void *opaque, int version_id) @@ -808,6 +810,11 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) return -EINVAL; } bs = blk_bs(blk); + if (!bs) { + fprintf(stderr, "Block device %s has no medium\n", + device_name); + return -EINVAL; + } if (bs != bs_prev) { bs_prev = bs; @@ -877,10 +884,10 @@ static SaveVMHandlers savevm_block_handlers = { .set_params = block_set_params, .save_live_setup = block_save_setup, .save_live_iterate = block_save_iterate, - .save_live_complete = block_save_complete, + .save_live_complete_precopy = block_save_complete, .save_live_pending = block_save_pending, .load_state = block_load, - .cancel = block_migration_cancel, + .cleanup = block_migration_cleanup, .is_active = block_is_active, }; diff --git a/migration/migration.c b/migration/migration.c index c4a7d0b705..adc6b6f1c9 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -21,16 +21,20 @@ #include "sysemu/sysemu.h" #include "block/block.h" #include "qapi/qmp/qerror.h" +#include "qapi/util.h" #include "qemu/sockets.h" #include "qemu/rcu.h" #include "migration/block.h" +#include "migration/postcopy-ram.h" #include "qemu/thread.h" #include "qmp-commands.h" 
#include "trace.h" -#include "qapi/util.h" #include "qapi-event.h" +#include "qom/cpu.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" -#define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ +#define MAX_THROTTLE (32 << 20) /* Migration transfer speed throttling */ /* Amount of time to allocate to each "chunk" of bandwidth-throttled * data. */ @@ -44,6 +48,9 @@ #define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2 /*0: means nocompress, 1: best speed, ... 9: best compress ratio */ #define DEFAULT_MIGRATE_COMPRESS_LEVEL 1 +/* Define default autoconverge cpu throttle migration parameters */ +#define DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL 20 +#define DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT 10 /* Migration XBZRLE default cache size */ #define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024) @@ -53,6 +60,13 @@ static NotifierList migration_state_notifiers = static bool deferred_incoming; +/* + * Current state of incoming postcopy; note this is not part of + * MigrationIncomingState since it's state is used during cleanup + * at the end as MIS is being freed. + */ +static PostcopyState incoming_postcopy_state; + /* When we add fault tolerance, we could have several migrations at once. For now we don't need to add dynamic creation of migration */ @@ -60,6 +74,7 @@ static bool deferred_incoming; /* For outgoing */ MigrationState *migrate_get_current(void) { + static bool once; static MigrationState current_migration = { .state = MIGRATION_STATUS_NONE, .bandwidth_limit = MAX_THROTTLE, @@ -71,8 +86,16 @@ MigrationState *migrate_get_current(void) DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT, .parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT, + .parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL] = + DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL, + .parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT] = + DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT, }; + if (!once) { + qemu_mutex_init(¤t_migration.src_page_req_mutex); + once = true; + } return ¤t_migration; } @@ -86,15 +109,18 @@ MigrationIncomingState *migration_incoming_get_current(void) MigrationIncomingState *migration_incoming_state_new(QEMUFile* f) { - mis_current = g_malloc0(sizeof(MigrationIncomingState)); - mis_current->file = f; + mis_current = g_new0(MigrationIncomingState, 1); + mis_current->from_src_file = f; QLIST_INIT(&mis_current->loadvm_handlers); + qemu_mutex_init(&mis_current->rp_mutex); + qemu_event_init(&mis_current->main_thread_load_event, false); return mis_current; } void migration_incoming_state_destroy(void) { + qemu_event_destroy(&mis_current->main_thread_load_event); loadvm_free_handlers(mis_current); g_free(mis_current); mis_current = NULL; @@ -240,6 +266,35 @@ static void deferred_incoming_migration(Error **errp) deferred_incoming = true; } +/* Request a range of pages from the source VM at the given + * start address. 
+ * rbname: Name of the RAMBlock to request the page in, if NULL it's the same + * as the last request (a name must have been given previously) + * Start: Address offset within the RB + * Len: Length in bytes required - must be a multiple of pagesize + */ +void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname, + ram_addr_t start, size_t len) +{ + uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname upto 256 */ + size_t msglen = 12; /* start + len */ + + *(uint64_t *)bufc = cpu_to_be64((uint64_t)start); + *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len); + + if (rbname) { + int rbname_len = strlen(rbname); + assert(rbname_len < 256); + + bufc[msglen++] = rbname_len; + memcpy(bufc + msglen, rbname, rbname_len); + msglen += rbname_len; + migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES_ID, msglen, bufc); + } else { + migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES, msglen, bufc); + } +} + void qemu_start_incoming_migration(const char *uri, Error **errp) { const char *p; @@ -270,12 +325,37 @@ static void process_incoming_migration_co(void *opaque) { QEMUFile *f = opaque; Error *local_err = NULL; + MigrationIncomingState *mis; + PostcopyState ps; int ret; - migration_incoming_state_new(f); + mis = migration_incoming_state_new(f); + postcopy_state_set(POSTCOPY_INCOMING_NONE); migrate_generate_event(MIGRATION_STATUS_ACTIVE); + ret = qemu_loadvm_state(f); + ps = postcopy_state_get(); + trace_process_incoming_migration_co_end(ret, ps); + if (ps != POSTCOPY_INCOMING_NONE) { + if (ps == POSTCOPY_INCOMING_ADVISE) { + /* + * Where a migration had postcopy enabled (and thus went to advise) + * but managed to complete within the precopy period, we can use + * the normal exit. + */ + postcopy_ram_incoming_cleanup(mis); + } else if (ret >= 0) { + /* + * Postcopy was started, cleanup should happen at the end of the + * postcopy thread. + */ + trace_process_incoming_migration_co_postcopy_end_main(); + return; + } + /* Else if something went wrong then just fall out of the normal exit */ + } + qemu_fclose(f); free_xbzrle_decoded_buf(); migration_incoming_state_destroy(); @@ -286,7 +366,6 @@ static void process_incoming_migration_co(void *opaque) migrate_decompress_threads_join(); exit(EXIT_FAILURE); } - qemu_announce_self(); /* Make sure all file formats flush their mutable metadata */ bdrv_invalidate_cache_all(&local_err); @@ -297,6 +376,12 @@ static void process_incoming_migration_co(void *opaque) exit(EXIT_FAILURE); } + /* + * This must happen after all error conditions are dealt with and + * we're sure the VM is going to be running on this host. + */ + qemu_announce_self(); + /* If global state section was not received or we are in running state, we need to obey autostart. Any other state is set with runstate_set. */ @@ -331,6 +416,50 @@ void process_incoming_migration(QEMUFile *f) qemu_coroutine_enter(co, f); } +/* + * Send a message on the return channel back to the source + * of the migration. + */ +void migrate_send_rp_message(MigrationIncomingState *mis, + enum mig_rp_message_type message_type, + uint16_t len, void *data) +{ + trace_migrate_send_rp_message((int)message_type, len); + qemu_mutex_lock(&mis->rp_mutex); + qemu_put_be16(mis->to_src_file, (unsigned int)message_type); + qemu_put_be16(mis->to_src_file, len); + qemu_put_buffer(mis->to_src_file, data, len); + qemu_fflush(mis->to_src_file); + qemu_mutex_unlock(&mis->rp_mutex); +} + +/* + * Send a 'SHUT' message on the return channel with the given value + * to indicate that we've finished with the RP. 
Non-0 value indicates + * error. + */ +void migrate_send_rp_shut(MigrationIncomingState *mis, + uint32_t value) +{ + uint32_t buf; + + buf = cpu_to_be32(value); + migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf); +} + +/* + * Send a 'PONG' message on the return channel with the given value + * (normally in response to a 'PING') + */ +void migrate_send_rp_pong(MigrationIncomingState *mis, + uint32_t value) +{ + uint32_t buf; + + buf = cpu_to_be32(value); + migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf); +} + /* amount of nanoseconds we are willing to wait for migration to be down. * the choice of nanoseconds is because it is the maximum resolution that * get_clock() can achieve. It is an internal measure. All user-visible @@ -378,10 +507,32 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS]; params->decompress_threads = s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; + params->x_cpu_throttle_initial = + s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL]; + params->x_cpu_throttle_increment = + s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT]; return params; } +/* + * Return true if we're already in the middle of a migration + * (i.e. any of the active or setup states) + */ +static bool migration_is_setup_or_active(int state) +{ + switch (state) { + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_SETUP: + return true; + + default: + return false; + + } +} + static void get_xbzrle_cache_stats(MigrationInfo *info) { if (migrate_use_xbzrle()) { @@ -441,6 +592,44 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->disk->total = blk_mig_bytes_total(); } + if (cpu_throttle_active()) { + info->has_x_cpu_throttle_percentage = true; + info->x_cpu_throttle_percentage = cpu_throttle_get_percentage(); + } + + get_xbzrle_cache_stats(info); + break; + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + /* Mostly the same as active; TODO add some postcopy stats */ + info->has_status = true; + info->has_total_time = true; + info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + - s->total_time; + info->has_expected_downtime = true; + info->expected_downtime = s->expected_downtime; + info->has_setup_time = true; + info->setup_time = s->setup_time; + + info->has_ram = true; + info->ram = g_malloc0(sizeof(*info->ram)); + info->ram->transferred = ram_bytes_transferred(); + info->ram->remaining = ram_bytes_remaining(); + info->ram->total = ram_bytes_total(); + info->ram->duplicate = dup_mig_pages_transferred(); + info->ram->skipped = skipped_mig_pages_transferred(); + info->ram->normal = norm_mig_pages_transferred(); + info->ram->normal_bytes = norm_mig_bytes_transferred(); + info->ram->dirty_pages_rate = s->dirty_pages_rate; + info->ram->mbps = s->mbps; + + if (blk_mig_active()) { + info->has_disk = true; + info->disk = g_malloc0(sizeof(*info->disk)); + info->disk->transferred = blk_mig_bytes_transferred(); + info->disk->remaining = blk_mig_bytes_remaining(); + info->disk->total = blk_mig_bytes_total(); + } + get_xbzrle_cache_stats(info); break; case MIGRATION_STATUS_COMPLETED: @@ -484,8 +673,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, MigrationState *s = migrate_get_current(); MigrationCapabilityStatusList *cap; - if (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_SETUP) { + if (migration_is_setup_or_active(s->state)) { error_setg(errp, QERR_MIGRATION_ACTIVE); return; } @@ -493,6 
+681,20 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, for (cap = params; cap; cap = cap->next) { s->enabled_capabilities[cap->value->capability] = cap->value->state; } + + if (migrate_postcopy_ram()) { + if (migrate_use_compression()) { + /* The decompression threads asynchronously write into RAM + * rather than use the atomic copies needed to avoid + * userfaulting. It should be possible to fix the decompression + * threads for compatibility in future. + */ + error_report("Postcopy is not currently compatible with " + "compression"); + s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM] = + false; + } + } } void qmp_migrate_set_parameters(bool has_compress_level, @@ -500,7 +702,11 @@ void qmp_migrate_set_parameters(bool has_compress_level, bool has_compress_threads, int64_t compress_threads, bool has_decompress_threads, - int64_t decompress_threads, Error **errp) + int64_t decompress_threads, + bool has_x_cpu_throttle_initial, + int64_t x_cpu_throttle_initial, + bool has_x_cpu_throttle_increment, + int64_t x_cpu_throttle_increment, Error **errp) { MigrationState *s = migrate_get_current(); @@ -523,6 +729,18 @@ void qmp_migrate_set_parameters(bool has_compress_level, "is invalid, it should be in the range of 1 to 255"); return; } + if (has_x_cpu_throttle_initial && + (x_cpu_throttle_initial < 1 || x_cpu_throttle_initial > 99)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "x_cpu_throttle_initial", + "an integer in the range of 1 to 99"); + } + if (has_x_cpu_throttle_increment && + (x_cpu_throttle_increment < 1 || x_cpu_throttle_increment > 99)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "x_cpu_throttle_increment", + "an integer in the range of 1 to 99"); + } if (has_compress_level) { s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level; @@ -534,6 +752,37 @@ void qmp_migrate_set_parameters(bool has_compress_level, s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = decompress_threads; } + if (has_x_cpu_throttle_initial) { + s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL] = + x_cpu_throttle_initial; + } + + if (has_x_cpu_throttle_increment) { + s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT] = + x_cpu_throttle_increment; + } +} + +void qmp_migrate_start_postcopy(Error **errp) +{ + MigrationState *s = migrate_get_current(); + + if (!migrate_postcopy_ram()) { + error_setg(errp, "Enable postcopy with migrate_set_capability before" + " the start of migration"); + return; + } + + if (s->state == MIGRATION_STATUS_NONE) { + error_setg(errp, "Postcopy must be started after migration has been" + " started"); + return; + } + /* + * we don't error if migration has finished since that would be racy + * with issuing this command. 
+ */ + atomic_set(&s->start_postcopy, true); } /* shared migration helpers */ @@ -553,10 +802,15 @@ static void migrate_fd_cleanup(void *opaque) qemu_bh_delete(s->cleanup_bh); s->cleanup_bh = NULL; + flush_page_queue(s); + if (s->file) { trace_migrate_fd_cleanup(); qemu_mutex_unlock_iothread(); - qemu_thread_join(&s->thread); + if (s->migration_thread_running) { + qemu_thread_join(&s->thread); + s->migration_thread_running = false; + } qemu_mutex_lock_iothread(); migrate_compress_threads_join(); @@ -564,14 +818,12 @@ static void migrate_fd_cleanup(void *opaque) s->file = NULL; } - assert(s->state != MIGRATION_STATUS_ACTIVE); + assert((s->state != MIGRATION_STATUS_ACTIVE) && + (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE)); - if (s->state != MIGRATION_STATUS_COMPLETED) { - qemu_savevm_state_cancel(); - if (s->state == MIGRATION_STATUS_CANCELLING) { - migrate_set_state(s, MIGRATION_STATUS_CANCELLING, - MIGRATION_STATUS_CANCELLED); - } + if (s->state == MIGRATION_STATUS_CANCELLING) { + migrate_set_state(s, MIGRATION_STATUS_CANCELLING, + MIGRATION_STATUS_CANCELLED); } notifier_list_notify(&migration_state_notifiers, s); @@ -591,10 +843,14 @@ static void migrate_fd_cancel(MigrationState *s) QEMUFile *f = migrate_get_current()->file; trace_migrate_fd_cancel(); + if (s->rp_state.from_dst_file) { + /* shutdown the rp socket, so causing the rp thread to shutdown */ + qemu_file_shutdown(s->rp_state.from_dst_file); + } + do { old_state = s->state; - if (old_state != MIGRATION_STATUS_SETUP && - old_state != MIGRATION_STATUS_ACTIVE) { + if (!migration_is_setup_or_active(old_state)) { break; } migrate_set_state(s, old_state, MIGRATION_STATUS_CANCELLING); @@ -638,35 +894,43 @@ bool migration_has_failed(MigrationState *s) s->state == MIGRATION_STATUS_FAILED); } -static MigrationState *migrate_init(const MigrationParams *params) +bool migration_in_postcopy(MigrationState *s) { - MigrationState *s = migrate_get_current(); - int64_t bandwidth_limit = s->bandwidth_limit; - bool enabled_capabilities[MIGRATION_CAPABILITY_MAX]; - int64_t xbzrle_cache_size = s->xbzrle_cache_size; - int compress_level = s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL]; - int compress_thread_count = - s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS]; - int decompress_thread_count = - s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; + return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); +} - memcpy(enabled_capabilities, s->enabled_capabilities, - sizeof(enabled_capabilities)); +MigrationState *migrate_init(const MigrationParams *params) +{ + MigrationState *s = migrate_get_current(); - memset(s, 0, sizeof(*s)); + /* + * Reinitialise all migration state, except + * parameters/capabilities that the user set, and + * locks. 
+ */ + s->bytes_xfer = 0; + s->xfer_limit = 0; + s->cleanup_bh = 0; + s->file = NULL; + s->state = MIGRATION_STATUS_NONE; s->params = *params; - memcpy(s->enabled_capabilities, enabled_capabilities, - sizeof(enabled_capabilities)); - s->xbzrle_cache_size = xbzrle_cache_size; - - s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level; - s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] = - compress_thread_count; - s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = - decompress_thread_count; - s->bandwidth_limit = bandwidth_limit; + s->rp_state.from_dst_file = NULL; + s->rp_state.error = false; + s->mbps = 0.0; + s->downtime = 0; + s->expected_downtime = 0; + s->dirty_pages_rate = 0; + s->dirty_bytes_rate = 0; + s->setup_time = 0; + s->dirty_sync_count = 0; + s->start_postcopy = false; + s->migration_thread_running = false; + s->last_req_rb = NULL; + migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); + QSIMPLEQ_INIT(&s->src_page_requests); + s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); return s; } @@ -718,8 +982,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, params.blk = has_blk && blk; params.shared = has_inc && inc; - if (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_SETUP || + if (migration_is_setup_or_active(s->state) || s->state == MIGRATION_STATUS_CANCELLING) { error_setg(errp, QERR_MIGRATION_ACTIVE); return; @@ -838,6 +1101,15 @@ void qmp_migrate_set_downtime(double value, Error **errp) max_downtime = (uint64_t)value; } +bool migrate_postcopy_ram(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM]; +} + bool migrate_auto_converge(void) { MigrationState *s; @@ -920,76 +1192,489 @@ int64_t migrate_xbzrle_cache_size(void) } /* migration thread support */ +/* + * Something bad happened to the RP stream, mark an error + * The caller shall print or trace something to indicate why + */ +static void mark_source_rp_bad(MigrationState *s) +{ + s->rp_state.error = true; +} + +static struct rp_cmd_args { + ssize_t len; /* -1 = variable */ + const char *name; +} rp_cmd_args[] = { + [MIG_RP_MSG_INVALID] = { .len = -1, .name = "INVALID" }, + [MIG_RP_MSG_SHUT] = { .len = 4, .name = "SHUT" }, + [MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" }, + [MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" }, + [MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" }, + [MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" }, +}; + +/* + * Process a request for pages received on the return path, + * We're allowed to send more than requested (e.g. to round to our page size) + * and we don't need to send pages that have already been sent. + */ +static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, + ram_addr_t start, size_t len) +{ + long our_host_ps = getpagesize(); + + trace_migrate_handle_rp_req_pages(rbname, start, len); + + /* + * Since we currently insist on matching page sizes, just sanity check + * we're being asked for whole host pages. 
+ */ + if (start & (our_host_ps-1) || + (len & (our_host_ps-1))) { + error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT + " len: %zd", __func__, start, len); + mark_source_rp_bad(ms); + return; + } + if (ram_save_queue_pages(ms, rbname, start, len)) { + mark_source_rp_bad(ms); + } +} + +/* + * Handles messages sent on the return path towards the source VM + * + */ +static void *source_return_path_thread(void *opaque) +{ + MigrationState *ms = opaque; + QEMUFile *rp = ms->rp_state.from_dst_file; + uint16_t header_len, header_type; + const int max_len = 512; + uint8_t buf[max_len]; + uint32_t tmp32, sibling_error; + ram_addr_t start = 0; /* =0 to silence warning */ + size_t len = 0, expected_len; + int res; + + trace_source_return_path_thread_entry(); + while (!ms->rp_state.error && !qemu_file_get_error(rp) && + migration_is_setup_or_active(ms->state)) { + trace_source_return_path_thread_loop_top(); + header_type = qemu_get_be16(rp); + header_len = qemu_get_be16(rp); + + if (header_type >= MIG_RP_MSG_MAX || + header_type == MIG_RP_MSG_INVALID) { + error_report("RP: Received invalid message 0x%04x length 0x%04x", + header_type, header_len); + mark_source_rp_bad(ms); + goto out; + } + + if ((rp_cmd_args[header_type].len != -1 && + header_len != rp_cmd_args[header_type].len) || + header_len > max_len) { + error_report("RP: Received '%s' message (0x%04x) with" + "incorrect length %d expecting %zu", + rp_cmd_args[header_type].name, header_type, header_len, + (size_t)rp_cmd_args[header_type].len); + mark_source_rp_bad(ms); + goto out; + } + + /* We know we've got a valid header by this point */ + res = qemu_get_buffer(rp, buf, header_len); + if (res != header_len) { + error_report("RP: Failed reading data for message 0x%04x" + " read %d expected %d", + header_type, res, header_len); + mark_source_rp_bad(ms); + goto out; + } + + /* OK, we have the message and the data */ + switch (header_type) { + case MIG_RP_MSG_SHUT: + sibling_error = be32_to_cpup((uint32_t *)buf); + trace_source_return_path_thread_shut(sibling_error); + if (sibling_error) { + error_report("RP: Sibling indicated error %d", sibling_error); + mark_source_rp_bad(ms); + } + /* + * We'll let the main thread deal with closing the RP + * we could do a shutdown(2) on it, but we're the only user + * anyway, so there's nothing gained. 
+ */ + goto out; + + case MIG_RP_MSG_PONG: + tmp32 = be32_to_cpup((uint32_t *)buf); + trace_source_return_path_thread_pong(tmp32); + break; + + case MIG_RP_MSG_REQ_PAGES: + start = be64_to_cpup((uint64_t *)buf); + len = be32_to_cpup((uint32_t *)(buf + 8)); + migrate_handle_rp_req_pages(ms, NULL, start, len); + break; + + case MIG_RP_MSG_REQ_PAGES_ID: + expected_len = 12 + 1; /* header + termination */ + + if (header_len >= expected_len) { + start = be64_to_cpup((uint64_t *)buf); + len = be32_to_cpup((uint32_t *)(buf + 8)); + /* Now we expect an idstr */ + tmp32 = buf[12]; /* Length of the following idstr */ + buf[13 + tmp32] = '\0'; + expected_len += tmp32; + } + if (header_len != expected_len) { + error_report("RP: Req_Page_id with length %d expecting %zd", + header_len, expected_len); + mark_source_rp_bad(ms); + goto out; + } + migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len); + break; + + default: + break; + } + } + if (qemu_file_get_error(rp)) { + trace_source_return_path_thread_bad_end(); + mark_source_rp_bad(ms); + } + + trace_source_return_path_thread_end(); +out: + ms->rp_state.from_dst_file = NULL; + qemu_fclose(rp); + return NULL; +} + +static int open_return_path_on_source(MigrationState *ms) +{ + + ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->file); + if (!ms->rp_state.from_dst_file) { + return -1; + } + + trace_open_return_path_on_source(); + qemu_thread_create(&ms->rp_state.rp_thread, "return path", + source_return_path_thread, ms, QEMU_THREAD_JOINABLE); + + trace_open_return_path_on_source_continue(); + + return 0; +} + +/* Returns 0 if the RP was ok, otherwise there was an error on the RP */ +static int await_return_path_close_on_source(MigrationState *ms) +{ + /* + * If this is a normal exit then the destination will send a SHUT and the + * rp_thread will exit, however if there's an error we need to cause + * it to exit. + */ + if (qemu_file_get_error(ms->file) && ms->rp_state.from_dst_file) { + /* + * shutdown(2), if we have it, will cause it to unblock if it's stuck + * waiting for the destination. + */ + qemu_file_shutdown(ms->rp_state.from_dst_file); + mark_source_rp_bad(ms); + } + trace_await_return_path_close_on_source_joining(); + qemu_thread_join(&ms->rp_state.rp_thread); + trace_await_return_path_close_on_source_close(); + return ms->rp_state.error; +} + +/* + * Switch from normal iteration to postcopy + * Returns non-0 on error + */ +static int postcopy_start(MigrationState *ms, bool *old_vm_running) +{ + int ret; + const QEMUSizedBuffer *qsb; + int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + migrate_set_state(ms, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + + trace_postcopy_start(); + qemu_mutex_lock_iothread(); + trace_postcopy_start_set_run(); + + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); + *old_vm_running = runstate_is_running(); + global_state_store(); + ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + + if (ret < 0) { + goto fail; + } + + /* + * Cause any non-postcopiable, but iterative devices to + * send out their final data. 
+ */ + qemu_savevm_state_complete_precopy(ms->file, true); + + /* + * in Finish migrate and with the io-lock held everything should + * be quiet, but we've potentially still got dirty pages and we + * need to tell the destination to throw any pages it's already received + * that are dirty + */ + if (ram_postcopy_send_discard_bitmap(ms)) { + error_report("postcopy send discard bitmap failed"); + goto fail; + } + + /* + * send rest of state - note things that are doing postcopy + * will notice we're in POSTCOPY_ACTIVE and not actually + * wrap their state up here + */ + qemu_file_set_rate_limit(ms->file, INT64_MAX); + /* Ping just for debugging, helps line traces up */ + qemu_savevm_send_ping(ms->file, 2); + + /* + * While loading the device state we may trigger page transfer + * requests and the fd must be free to process those, and thus + * the destination must read the whole device state off the fd before + * it starts processing it. Unfortunately the ad-hoc migration format + * doesn't allow the destination to know the size to read without fully + * parsing it through each devices load-state code (especially the open + * coded devices that use get/put). + * So we wrap the device state up in a package with a length at the start; + * to do this we use a qemu_buf to hold the whole of the device state. + */ + QEMUFile *fb = qemu_bufopen("w", NULL); + if (!fb) { + error_report("Failed to create buffered file"); + goto fail; + } + + /* + * Make sure the receiver can get incoming pages before we send the rest + * of the state + */ + qemu_savevm_send_postcopy_listen(fb); + + qemu_savevm_state_complete_precopy(fb, false); + qemu_savevm_send_ping(fb, 3); + + qemu_savevm_send_postcopy_run(fb); + + /* <><> end of stuff going into the package */ + qsb = qemu_buf_get(fb); + + /* Now send that blob */ + if (qemu_savevm_send_packaged(ms->file, qsb)) { + goto fail_closefb; + } + qemu_fclose(fb); + ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop; + + qemu_mutex_unlock_iothread(); + + /* + * Although this ping is just for debug, it could potentially be + * used for getting a better measurement of downtime at the source. + */ + qemu_savevm_send_ping(ms->file, 4); + + ret = qemu_file_get_error(ms->file); + if (ret) { + error_report("postcopy_start: Migration stream errored"); + migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE, + MIGRATION_STATUS_FAILED); + } + + return ret; + +fail_closefb: + qemu_fclose(fb); +fail: + migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE, + MIGRATION_STATUS_FAILED); + qemu_mutex_unlock_iothread(); + return -1; +} + +/** + * migration_completion: Used by migration_thread when there's not much left. + * The caller 'breaks' the loop when this returns. 
+ * + * @s: Current migration state + * @current_active_state: The migration state we expect to be in + * @*old_vm_running: Pointer to old_vm_running flag + * @*start_time: Pointer to time to update + */ +static void migration_completion(MigrationState *s, int current_active_state, + bool *old_vm_running, + int64_t *start_time) +{ + int ret; + + if (s->state == MIGRATION_STATUS_ACTIVE) { + qemu_mutex_lock_iothread(); + *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); + *old_vm_running = runstate_is_running(); + ret = global_state_store(); + + if (!ret) { + ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + if (ret >= 0) { + qemu_file_set_rate_limit(s->file, INT64_MAX); + qemu_savevm_state_complete_precopy(s->file, false); + } + } + qemu_mutex_unlock_iothread(); + + if (ret < 0) { + goto fail; + } + } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { + trace_migration_completion_postcopy_end(); + + qemu_savevm_state_complete_postcopy(s->file); + trace_migration_completion_postcopy_end_after_complete(); + } + + /* + * If rp was opened we must clean up the thread before + * cleaning everything else up (since if there are no failures + * it will wait for the destination to send it's status in + * a SHUT command). + * Postcopy opens rp if enabled (even if it's not avtivated) + */ + if (migrate_postcopy_ram()) { + int rp_error; + trace_migration_completion_postcopy_end_before_rp(); + rp_error = await_return_path_close_on_source(s); + trace_migration_completion_postcopy_end_after_rp(rp_error); + if (rp_error) { + goto fail; + } + } + + if (qemu_file_get_error(s->file)) { + trace_migration_completion_file_err(); + goto fail; + } + + migrate_set_state(s, current_active_state, MIGRATION_STATUS_COMPLETED); + return; + +fail: + migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED); +} + +/* + * Master migration thread on the source VM. + * It drives the migration and pumps the data down the outgoing channel. + */ static void *migration_thread(void *opaque) { MigrationState *s = opaque; + /* Used by the bandwidth calcs, updated later */ int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); int64_t initial_bytes = 0; int64_t max_size = 0; int64_t start_time = initial_time; + int64_t end_time; bool old_vm_running = false; + bool entered_postcopy = false; + /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ + enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; rcu_register_thread(); qemu_savevm_state_header(s->file); + + if (migrate_postcopy_ram()) { + /* Now tell the dest that it should open its end so it can reply */ + qemu_savevm_send_open_return_path(s->file); + + /* And do a ping that will make stuff easier to debug */ + qemu_savevm_send_ping(s->file, 1); + + /* + * Tell the destination that we *might* want to do postcopy later; + * if the other end can't do postcopy it should fail now, nice and + * early. 
+ */ + qemu_savevm_send_postcopy_advise(s->file); + } + qemu_savevm_state_begin(s->file, &s->params); s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; + current_active_state = MIGRATION_STATUS_ACTIVE; migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE); - while (s->state == MIGRATION_STATUS_ACTIVE) { + trace_migration_thread_setup_complete(); + + while (s->state == MIGRATION_STATUS_ACTIVE || + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { int64_t current_time; uint64_t pending_size; if (!qemu_file_rate_limit(s->file)) { - pending_size = qemu_savevm_state_pending(s->file, max_size); - trace_migrate_pending(pending_size, max_size); + uint64_t pend_post, pend_nonpost; + + qemu_savevm_state_pending(s->file, max_size, &pend_nonpost, + &pend_post); + pending_size = pend_nonpost + pend_post; + trace_migrate_pending(pending_size, max_size, + pend_post, pend_nonpost); if (pending_size && pending_size >= max_size) { - qemu_savevm_state_iterate(s->file); - } else { - int ret; - - qemu_mutex_lock_iothread(); - start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); - old_vm_running = runstate_is_running(); - - ret = global_state_store(); - if (!ret) { - ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); - if (ret >= 0) { - qemu_file_set_rate_limit(s->file, INT64_MAX); - qemu_savevm_state_complete(s->file); - } - } - qemu_mutex_unlock_iothread(); + /* Still a significant amount to transfer */ - if (ret < 0) { - migrate_set_state(s, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_FAILED); - break; - } + if (migrate_postcopy_ram() && + s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE && + pend_nonpost <= max_size && + atomic_read(&s->start_postcopy)) { + + if (!postcopy_start(s, &old_vm_running)) { + current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE; + entered_postcopy = true; + } - if (!qemu_file_get_error(s->file)) { - migrate_set_state(s, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_COMPLETED); - break; + continue; } + /* Just another iteration step */ + qemu_savevm_state_iterate(s->file, entered_postcopy); + } else { + trace_migration_thread_low_pending(pending_size); + migration_completion(s, current_active_state, + &old_vm_running, &start_time); + break; } } if (qemu_file_get_error(s->file)) { - migrate_set_state(s, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_FAILED); + migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED); + trace_migration_thread_file_err(); break; } current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); if (current_time >= initial_time + BUFFER_DELAY) { uint64_t transferred_bytes = qemu_ftell(s->file) - initial_bytes; uint64_t time_spent = current_time - initial_time; - double bandwidth = transferred_bytes / time_spent; + double bandwidth = (double)transferred_bytes / time_spent; max_size = bandwidth * migrate_max_downtime() / 1000000; s->mbps = time_spent ? (((double) transferred_bytes * 8.0) / @@ -1013,19 +1698,26 @@ static void *migration_thread(void *opaque) } } + trace_migration_thread_after_loop(); + /* If we enabled cpu throttling for auto-converge, turn it off. 
*/ + cpu_throttle_stop(); + end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + qemu_mutex_lock_iothread(); + qemu_savevm_state_cleanup(); if (s->state == MIGRATION_STATUS_COMPLETED) { - int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); uint64_t transferred_bytes = qemu_ftell(s->file); s->total_time = end_time - s->total_time; - s->downtime = end_time - start_time; + if (!entered_postcopy) { + s->downtime = end_time - start_time; + } if (s->total_time) { s->mbps = (((double) transferred_bytes * 8.0) / ((double) s->total_time)) / 1000; } runstate_set(RUN_STATE_POSTMIGRATE); } else { - if (old_vm_running) { + if (old_vm_running && !entered_postcopy) { vm_start(); } } @@ -1048,7 +1740,34 @@ void migrate_fd_connect(MigrationState *s) /* Notify before starting migration thread */ notifier_list_notify(&migration_state_notifiers, s); + /* + * Open the return path; currently for postcopy but other things might + * also want it. + */ + if (migrate_postcopy_ram()) { + if (open_return_path_on_source(s)) { + error_report("Unable to open return-path for postcopy"); + migrate_set_state(s, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + migrate_fd_cleanup(s); + return; + } + } + migrate_compress_threads_create(); qemu_thread_create(&s->thread, "migration", migration_thread, s, QEMU_THREAD_JOINABLE); + s->migration_thread_running = true; +} + +PostcopyState postcopy_state_get(void) +{ + return atomic_mb_read(&incoming_postcopy_state); } + +/* Set the state and return the old state */ +PostcopyState postcopy_state_set(PostcopyState new_state) +{ + return atomic_xchg(&incoming_postcopy_state, new_state); +} + diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c new file mode 100644 index 0000000000..3946aa98aa --- /dev/null +++ b/migration/postcopy-ram.c @@ -0,0 +1,761 @@ +/* + * Postcopy migration for RAM + * + * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Dave Gilbert <dgilbert@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +/* + * Postcopy is a migration technique where the execution flips from the + * source to the destination before all the data has been copied. + */ + +#include <glib.h> +#include <stdio.h> +#include <unistd.h> + +#include "qemu-common.h" +#include "migration/migration.h" +#include "migration/postcopy-ram.h" +#include "sysemu/sysemu.h" +#include "sysemu/balloon.h" +#include "qemu/error-report.h" +#include "trace.h" + +/* Arbitrary limit on size of each discard command, + * keeps them around ~200 bytes + */ +#define MAX_DISCARDS_PER_COMMAND 12 + +struct PostcopyDiscardState { + const char *ramblock_name; + uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */ + uint16_t cur_entry; + /* + * Start and length of a discard range (bytes) + */ + uint64_t start_list[MAX_DISCARDS_PER_COMMAND]; + uint64_t length_list[MAX_DISCARDS_PER_COMMAND]; + unsigned int nsentwords; + unsigned int nsentcmds; +}; + +/* Postcopy needs to detect accesses to pages that haven't yet been copied + * across, and efficiently map new pages in, the techniques for doing this + * are target OS specific. 
+ */ +#if defined(__linux__) + +#include <poll.h> +#include <sys/eventfd.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <asm/types.h> /* for __u64 */ +#endif + +#if defined(__linux__) && defined(__NR_userfaultfd) +#include <linux/userfaultfd.h> + +static bool ufd_version_check(int ufd) +{ + struct uffdio_api api_struct; + uint64_t ioctl_mask; + + api_struct.api = UFFD_API; + api_struct.features = 0; + if (ioctl(ufd, UFFDIO_API, &api_struct)) { + error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s", + strerror(errno)); + return false; + } + + ioctl_mask = (__u64)1 << _UFFDIO_REGISTER | + (__u64)1 << _UFFDIO_UNREGISTER; + if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { + error_report("Missing userfault features: %" PRIx64, + (uint64_t)(~api_struct.ioctls & ioctl_mask)); + return false; + } + + return true; +} + +/* + * Note: This has the side effect of munlock'ing all of RAM, that's + * normally fine since if the postcopy succeeds it gets turned back on at the + * end. + */ +bool postcopy_ram_supported_by_host(void) +{ + long pagesize = getpagesize(); + int ufd = -1; + bool ret = false; /* Error unless we change it */ + void *testarea = NULL; + struct uffdio_register reg_struct; + struct uffdio_range range_struct; + uint64_t feature_mask; + + if ((1ul << qemu_target_page_bits()) > pagesize) { + error_report("Target page size bigger than host page size"); + goto out; + } + + ufd = syscall(__NR_userfaultfd, O_CLOEXEC); + if (ufd == -1) { + error_report("%s: userfaultfd not available: %s", __func__, + strerror(errno)); + goto out; + } + + /* Version and features check */ + if (!ufd_version_check(ufd)) { + goto out; + } + + /* + * userfault and mlock don't go together; we'll put it back later if + * it was enabled. + */ + if (munlockall()) { + error_report("%s: munlockall: %s", __func__, strerror(errno)); + return -1; + } + + /* + * We need to check that the ops we need are supported on anon memory + * To do that we need to register a chunk and see the flags that + * are returned. + */ + testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (testarea == MAP_FAILED) { + error_report("%s: Failed to map test area: %s", __func__, + strerror(errno)); + goto out; + } + g_assert(((size_t)testarea & (pagesize-1)) == 0); + + reg_struct.range.start = (uintptr_t)testarea; + reg_struct.range.len = pagesize; + reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; + + if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) { + error_report("%s userfault register: %s", __func__, strerror(errno)); + goto out; + } + + range_struct.start = (uintptr_t)testarea; + range_struct.len = pagesize; + if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) { + error_report("%s userfault unregister: %s", __func__, strerror(errno)); + goto out; + } + + feature_mask = (__u64)1 << _UFFDIO_WAKE | + (__u64)1 << _UFFDIO_COPY | + (__u64)1 << _UFFDIO_ZEROPAGE; + if ((reg_struct.ioctls & feature_mask) != feature_mask) { + error_report("Missing userfault map features: %" PRIx64, + (uint64_t)(~reg_struct.ioctls & feature_mask)); + goto out; + } + + /* Success! */ + ret = true; +out: + if (testarea) { + munmap(testarea, pagesize); + } + if (ufd != -1) { + close(ufd); + } + return ret; +} + +/** + * postcopy_ram_discard_range: Discard a range of memory. + * We can assume that if we've been called postcopy_ram_hosttest returned true. + * + * @mis: Current incoming migration state. + * @start, @length: range of memory to discard. 
+ * + * returns: 0 on success. + */ +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, + size_t length) +{ + trace_postcopy_ram_discard_range(start, length); + if (madvise(start, length, MADV_DONTNEED)) { + error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno)); + return -1; + } + + return 0; +} + +/* + * Setup an area of RAM so that it *can* be used for postcopy later; this + * must be done right at the start prior to pre-copy. + * opaque should be the MIS. + */ +static int init_range(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, void *opaque) +{ + MigrationIncomingState *mis = opaque; + + trace_postcopy_init_range(block_name, host_addr, offset, length); + + /* + * We need the whole of RAM to be truly empty for postcopy, so things + * like ROMs and any data tables built during init must be zero'd + * - we're going to get the copy from the source anyway. + * (Precopy will just overwrite this data, so doesn't need the discard) + */ + if (postcopy_ram_discard_range(mis, host_addr, length)) { + return -1; + } + + return 0; +} + +/* + * At the end of migration, undo the effects of init_range + * opaque should be the MIS. + */ +static int cleanup_range(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, void *opaque) +{ + MigrationIncomingState *mis = opaque; + struct uffdio_range range_struct; + trace_postcopy_cleanup_range(block_name, host_addr, offset, length); + + /* + * We turned off hugepage for the precopy stage with postcopy enabled + * we can turn it back on now. + */ + qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE); + + /* + * We can also turn off userfault now since we should have all the + * pages. It can be useful to leave it on to debug postcopy + * if you're not sure it's always getting every page. + */ + range_struct.start = (uintptr_t)host_addr; + range_struct.len = length; + + if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) { + error_report("%s: userfault unregister %s", __func__, strerror(errno)); + + return -1; + } + + return 0; +} + +/* + * Initialise postcopy-ram, setting the RAM to a state where we can go into + * postcopy later; must be called prior to any precopy. + * called from arch_init's similarly named ram_postcopy_incoming_init + */ +int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages) +{ + if (qemu_ram_foreach_block(init_range, mis)) { + return -1; + } + + return 0; +} + +/* + * At the end of a migration where postcopy_ram_incoming_init was called. 
+ */ +int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) +{ + trace_postcopy_ram_incoming_cleanup_entry(); + + if (mis->have_fault_thread) { + uint64_t tmp64; + + if (qemu_ram_foreach_block(cleanup_range, mis)) { + return -1; + } + /* + * Tell the fault_thread to exit, it's an eventfd that should + * currently be at 0, we're going to increment it to 1 + */ + tmp64 = 1; + if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) { + trace_postcopy_ram_incoming_cleanup_join(); + qemu_thread_join(&mis->fault_thread); + } else { + /* Not much we can do here, but may as well report it */ + error_report("%s: incrementing userfault_quit_fd: %s", __func__, + strerror(errno)); + } + trace_postcopy_ram_incoming_cleanup_closeuf(); + close(mis->userfault_fd); + close(mis->userfault_quit_fd); + mis->have_fault_thread = false; + } + + qemu_balloon_inhibit(false); + + if (enable_mlock) { + if (os_mlock() < 0) { + error_report("mlock: %s", strerror(errno)); + /* + * It doesn't feel right to fail at this point, we have a valid + * VM state. + */ + } + } + + postcopy_state_set(POSTCOPY_INCOMING_END); + migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0); + + if (mis->postcopy_tmp_page) { + munmap(mis->postcopy_tmp_page, getpagesize()); + mis->postcopy_tmp_page = NULL; + } + trace_postcopy_ram_incoming_cleanup_exit(); + return 0; +} + +/* + * Disable huge pages on an area + */ +static int nhp_range(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, void *opaque) +{ + trace_postcopy_nhp_range(block_name, host_addr, offset, length); + + /* + * Before we do discards we need to ensure those discards really + * do delete areas of the page, even if THP thinks a hugepage would + * be a good idea, so force hugepages off. + */ + qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE); + + return 0; +} + +/* + * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard + * however leaving it until after precopy means that most of the precopy + * data is still THPd + */ +int postcopy_ram_prepare_discard(MigrationIncomingState *mis) +{ + if (qemu_ram_foreach_block(nhp_range, mis)) { + return -1; + } + + postcopy_state_set(POSTCOPY_INCOMING_DISCARD); + + return 0; +} + +/* + * Mark the given area of RAM as requiring notification to unwritten areas + * Used as a callback on qemu_ram_foreach_block. 
+ * host_addr: Base of area to mark + * offset: Offset in the whole ram arena + * length: Length of the section + * opaque: MigrationIncomingState pointer + * Returns 0 on success + */ +static int ram_block_enable_notify(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, + void *opaque) +{ + MigrationIncomingState *mis = opaque; + struct uffdio_register reg_struct; + + reg_struct.range.start = (uintptr_t)host_addr; + reg_struct.range.len = length; + reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; + + /* Now tell our userfault_fd that it's responsible for this area */ + if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) { + error_report("%s userfault register: %s", __func__, strerror(errno)); + return -1; + } + + return 0; +} + +/* + * Handle faults detected by the USERFAULT markings + */ +static void *postcopy_ram_fault_thread(void *opaque) +{ + MigrationIncomingState *mis = opaque; + struct uffd_msg msg; + int ret; + size_t hostpagesize = getpagesize(); + RAMBlock *rb = NULL; + RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */ + + trace_postcopy_ram_fault_thread_entry(); + qemu_sem_post(&mis->fault_thread_sem); + + while (true) { + ram_addr_t rb_offset; + ram_addr_t in_raspace; + struct pollfd pfd[2]; + + /* + * We're mainly waiting for the kernel to give us a faulting HVA, + * however we can be told to quit via userfault_quit_fd which is + * an eventfd + */ + pfd[0].fd = mis->userfault_fd; + pfd[0].events = POLLIN; + pfd[0].revents = 0; + pfd[1].fd = mis->userfault_quit_fd; + pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ + pfd[1].revents = 0; + + if (poll(pfd, 2, -1 /* Wait forever */) == -1) { + error_report("%s: userfault poll: %s", __func__, strerror(errno)); + break; + } + + if (pfd[1].revents) { + trace_postcopy_ram_fault_thread_quit(); + break; + } + + ret = read(mis->userfault_fd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + if (errno == EAGAIN) { + /* + * if a wake up happens on the other thread just after + * the poll, there is nothing to read. 
+ */ + continue; + } + if (ret < 0) { + error_report("%s: Failed to read full userfault message: %s", + __func__, strerror(errno)); + break; + } else { + error_report("%s: Read %d bytes from userfaultfd expected %zd", + __func__, ret, sizeof(msg)); + break; /* Lost alignment, don't know what we'd read next */ + } + } + if (msg.event != UFFD_EVENT_PAGEFAULT) { + error_report("%s: Read unexpected event %ud from userfaultfd", + __func__, msg.event); + continue; /* It's not a page fault, shouldn't happen */ + } + + rb = qemu_ram_block_from_host( + (void *)(uintptr_t)msg.arg.pagefault.address, + true, &in_raspace, &rb_offset); + if (!rb) { + error_report("postcopy_ram_fault_thread: Fault outside guest: %" + PRIx64, (uint64_t)msg.arg.pagefault.address); + break; + } + + rb_offset &= ~(hostpagesize - 1); + trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, + qemu_ram_get_idstr(rb), + rb_offset); + + /* + * Send the request to the source - we want to request one + * of our host page sizes (which is >= TPS) + */ + if (rb != last_rb) { + last_rb = rb; + migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), + rb_offset, hostpagesize); + } else { + /* Save some space */ + migrate_send_rp_req_pages(mis, NULL, + rb_offset, hostpagesize); + } + } + trace_postcopy_ram_fault_thread_exit(); + return NULL; +} + +int postcopy_ram_enable_notify(MigrationIncomingState *mis) +{ + /* Open the fd for the kernel to give us userfaults */ + mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (mis->userfault_fd == -1) { + error_report("%s: Failed to open userfault fd: %s", __func__, + strerror(errno)); + return -1; + } + + /* + * Although the host check already tested the API, we need to + * do the check again as an ABI handshake on the new fd. + */ + if (!ufd_version_check(mis->userfault_fd)) { + return -1; + } + + /* Now an eventfd we use to tell the fault-thread to quit */ + mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC); + if (mis->userfault_quit_fd == -1) { + error_report("%s: Opening userfault_quit_fd: %s", __func__, + strerror(errno)); + close(mis->userfault_fd); + return -1; + } + + qemu_sem_init(&mis->fault_thread_sem, 0); + qemu_thread_create(&mis->fault_thread, "postcopy/fault", + postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE); + qemu_sem_wait(&mis->fault_thread_sem); + qemu_sem_destroy(&mis->fault_thread_sem); + mis->have_fault_thread = true; + + /* Mark so that we get notified of accesses to unwritten areas */ + if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) { + return -1; + } + + /* + * Ballooning can mark pages as absent while we're postcopying + * that would cause false userfaults. + */ + qemu_balloon_inhibit(true); + + trace_postcopy_ram_enable_notify(); + + return 0; +} + +/* + * Place a host page (from) at (host) atomically + * returns 0 on success + */ +int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from) +{ + struct uffdio_copy copy_struct; + + copy_struct.dst = (uint64_t)(uintptr_t)host; + copy_struct.src = (uint64_t)(uintptr_t)from; + copy_struct.len = getpagesize(); + copy_struct.mode = 0; + + /* copy also acks to the kernel waking the stalled thread up + * TODO: We can inhibit that ack and only do it if it was requested + * which would be slightly cheaper, but we'd have to be careful + * of the order of updating our page state. 
+ */ + if (ioctl(mis->userfault_fd, UFFDIO_COPY, ©_struct)) { + int e = errno; + error_report("%s: %s copy host: %p from: %p", + __func__, strerror(e), host, from); + + return -e; + } + + trace_postcopy_place_page(host); + return 0; +} + +/* + * Place a zero page at (host) atomically + * returns 0 on success + */ +int postcopy_place_page_zero(MigrationIncomingState *mis, void *host) +{ + struct uffdio_zeropage zero_struct; + + zero_struct.range.start = (uint64_t)(uintptr_t)host; + zero_struct.range.len = getpagesize(); + zero_struct.mode = 0; + + if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) { + int e = errno; + error_report("%s: %s zero host: %p", + __func__, strerror(e), host); + + return -e; + } + + trace_postcopy_place_page_zero(host); + return 0; +} + +/* + * Returns a target page of memory that can be mapped at a later point in time + * using postcopy_place_page + * The same address is used repeatedly, postcopy_place_page just takes the + * backing page away. + * Returns: Pointer to allocated page + * + */ +void *postcopy_get_tmp_page(MigrationIncomingState *mis) +{ + if (!mis->postcopy_tmp_page) { + mis->postcopy_tmp_page = mmap(NULL, getpagesize(), + PROT_READ | PROT_WRITE, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (!mis->postcopy_tmp_page) { + error_report("%s: %s", __func__, strerror(errno)); + return NULL; + } + } + + return mis->postcopy_tmp_page; +} + +#else +/* No target OS support, stubs just fail */ +bool postcopy_ram_supported_by_host(void) +{ + error_report("%s: No OS support", __func__); + return false; +} + +int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages) +{ + error_report("postcopy_ram_incoming_init: No OS support"); + return -1; +} + +int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) +{ + assert(0); + return -1; +} + +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, + size_t length) +{ + assert(0); + return -1; +} + +int postcopy_ram_prepare_discard(MigrationIncomingState *mis) +{ + assert(0); + return -1; +} + +int postcopy_ram_enable_notify(MigrationIncomingState *mis) +{ + assert(0); + return -1; +} + +int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from) +{ + assert(0); + return -1; +} + +int postcopy_place_page_zero(MigrationIncomingState *mis, void *host) +{ + assert(0); + return -1; +} + +void *postcopy_get_tmp_page(MigrationIncomingState *mis) +{ + assert(0); + return NULL; +} + +#endif + +/* ------------------------------------------------------------------------- */ + +/** + * postcopy_discard_send_init: Called at the start of each RAMBlock before + * asking to discard individual ranges. + * + * @ms: The current migration state. + * @offset: the bitmap offset of the named RAMBlock in the migration + * bitmap. + * @name: RAMBlock that discards will operate on. + * + * returns: a new PDS. + */ +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, + unsigned long offset, + const char *name) +{ + PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState)); + + if (res) { + res->ramblock_name = name; + res->offset = offset; + } + + return res; +} + +/** + * postcopy_discard_send_range: Called by the bitmap code for each chunk to + * discard. May send a discard message, may just leave it queued to + * be sent later. + * + * @ms: Current migration state. + * @pds: Structure initialised by postcopy_discard_send_init(). 
+ * @start,@length: a range of pages in the migration bitmap in the + * RAM block passed to postcopy_discard_send_init() (length=1 is one page) + */ +void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds, + unsigned long start, unsigned long length) +{ + size_t tp_bits = qemu_target_page_bits(); + /* Convert to byte offsets within the RAM block */ + pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits; + pds->length_list[pds->cur_entry] = length << tp_bits; + trace_postcopy_discard_send_range(pds->ramblock_name, start, length); + pds->cur_entry++; + pds->nsentwords++; + + if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) { + /* Full set, ship it! */ + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name, + pds->cur_entry, + pds->start_list, + pds->length_list); + pds->nsentcmds++; + pds->cur_entry = 0; + } +} + +/** + * postcopy_discard_send_finish: Called at the end of each RAMBlock by the + * bitmap code. Sends any outstanding discard messages, frees the PDS + * + * @ms: Current migration state. + * @pds: Structure initialised by postcopy_discard_send_init(). + */ +void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds) +{ + /* Anything unsent? */ + if (pds->cur_entry) { + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name, + pds->cur_entry, + pds->start_list, + pds->length_list); + pds->nsentcmds++; + } + + trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords, + pds->nsentcmds); + + g_free(pds); +} diff --git a/migration/qemu-file-buf.c b/migration/qemu-file-buf.c index 2de9330ca5..49516b8643 100644 --- a/migration/qemu-file-buf.c +++ b/migration/qemu-file-buf.c @@ -29,7 +29,7 @@ #include "qemu/error-report.h" #include "qemu/iov.h" #include "qemu/sockets.h" -#include "block/coroutine.h" +#include "qemu/coroutine.h" #include "migration/migration.h" #include "migration/qemu-file.h" #include "migration/qemu-file-internal.h" @@ -372,7 +372,8 @@ typedef struct QEMUBuffer { bool qsb_allocated; } QEMUBuffer; -static int buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +static ssize_t buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos, + size_t size) { QEMUBuffer *s = opaque; ssize_t len = qsb_get_length(s->qsb) - pos; @@ -387,8 +388,8 @@ static int buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) return qsb_get_buffer(s->qsb, pos, len, buf); } -static int buf_put_buffer(void *opaque, const uint8_t *buf, - int64_t pos, int size) +static ssize_t buf_put_buffer(void *opaque, const uint8_t *buf, + int64_t pos, size_t size) { QEMUBuffer *s = opaque; @@ -439,7 +440,7 @@ QEMUFile *qemu_bufopen(const char *mode, QEMUSizedBuffer *input) return NULL; } - s = g_malloc0(sizeof(QEMUBuffer)); + s = g_new0(QEMUBuffer, 1); s->qsb = input; if (s->qsb == NULL) { diff --git a/migration/qemu-file-stdio.c b/migration/qemu-file-stdio.c index 285068b303..9bde9db566 100644 --- a/migration/qemu-file-stdio.c +++ b/migration/qemu-file-stdio.c @@ -22,7 +22,7 @@ * THE SOFTWARE. 
*/ #include "qemu-common.h" -#include "block/coroutine.h" +#include "qemu/coroutine.h" #include "migration/qemu-file.h" typedef struct QEMUFileStdio { @@ -37,11 +37,11 @@ static int stdio_get_fd(void *opaque) return fileno(s->stdio_file); } -static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, - int size) +static ssize_t stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, + size_t size) { QEMUFileStdio *s = opaque; - int res; + size_t res; res = fwrite(buf, 1, size, s->stdio_file); @@ -51,11 +51,12 @@ static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, return res; } -static int stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +static ssize_t stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, + size_t size) { QEMUFileStdio *s = opaque; FILE *fp = s->stdio_file; - int bytes; + ssize_t bytes; for (;;) { clearerr(fp); @@ -143,7 +144,7 @@ QEMUFile *qemu_popen_cmd(const char *command, const char *mode) return NULL; } - s = g_malloc0(sizeof(QEMUFileStdio)); + s = g_new0(QEMUFileStdio, 1); s->stdio_file = stdio_file; @@ -175,7 +176,7 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode) return NULL; } - s = g_malloc0(sizeof(QEMUFileStdio)); + s = g_new0(QEMUFileStdio, 1); s->stdio_file = fopen(filename, mode); if (!s->stdio_file) { diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c index bfbc0861ab..6ca53e7d67 100644 --- a/migration/qemu-file-unix.c +++ b/migration/qemu-file-unix.c @@ -22,9 +22,10 @@ * THE SOFTWARE. */ #include "qemu-common.h" +#include "qemu/error-report.h" #include "qemu/iov.h" #include "qemu/sockets.h" -#include "block/coroutine.h" +#include "qemu/coroutine.h" #include "migration/qemu-file.h" #include "migration/qemu-file-internal.h" @@ -39,12 +40,44 @@ static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, QEMUFileSocket *s = opaque; ssize_t len; ssize_t size = iov_size(iov, iovcnt); + ssize_t offset = 0; + int err; - len = iov_send(s->fd, iov, iovcnt, 0, size); - if (len < size) { - len = -socket_error(); - } - return len; + while (size > 0) { + len = iov_send(s->fd, iov, iovcnt, offset, size); + + if (len > 0) { + size -= len; + offset += len; + } + + if (size > 0) { + err = socket_error(); + + if (err != EAGAIN && err != EWOULDBLOCK) { + error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)", + err, (size_t)size, (size_t)len); + /* + * If I've already sent some but only just got the error, I + * could return the amount validly sent so far and wait for the + * next call to report the error, but I'd rather flag the error + * immediately. + */ + return -err; + } + + /* Emulate blocking */ + GPollFD pfd; + + pfd.fd = s->fd; + pfd.events = G_IO_OUT | G_IO_ERR; + pfd.revents = 0; + TFR(err = g_poll(&pfd, 1, -1 /* no timeout */)); + /* Errors other than EINTR intentionally ignored */ + } + } + + return offset; } static int socket_get_fd(void *opaque) @@ -54,7 +87,8 @@ static int socket_get_fd(void *opaque) return s->fd; } -static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +static ssize_t socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, + size_t size) { QEMUFileSocket *s = opaque; ssize_t len; @@ -96,6 +130,56 @@ static int socket_shutdown(void *opaque, bool rd, bool wr) } } +static int socket_return_close(void *opaque) +{ + QEMUFileSocket *s = opaque; + /* + * Note: We don't close the socket, that should be done by the forward + * path. 
+ */ + g_free(s); + return 0; +} + +static const QEMUFileOps socket_return_read_ops = { + .get_fd = socket_get_fd, + .get_buffer = socket_get_buffer, + .close = socket_return_close, + .shut_down = socket_shutdown, +}; + +static const QEMUFileOps socket_return_write_ops = { + .get_fd = socket_get_fd, + .writev_buffer = socket_writev_buffer, + .close = socket_return_close, + .shut_down = socket_shutdown, +}; + +/* + * Give a QEMUFile* off the same socket but data in the opposite + * direction. + */ +static QEMUFile *socket_get_return_path(void *opaque) +{ + QEMUFileSocket *forward = opaque; + QEMUFileSocket *reverse; + + if (qemu_file_get_error(forward->file)) { + /* If the forward file is in error, don't try and open a return */ + return NULL; + } + + reverse = g_malloc0(sizeof(QEMUFileSocket)); + reverse->fd = forward->fd; + /* I don't think there's a better way to tell which direction 'this' is */ + if (forward->file->ops->get_buffer != NULL) { + /* being called from the read side, so we need to be able to write */ + return qemu_fopen_ops(reverse, &socket_return_write_ops); + } else { + return qemu_fopen_ops(reverse, &socket_return_read_ops); + } +} + static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, int64_t pos) { @@ -138,7 +222,8 @@ static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, return total; } -static int unix_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +static ssize_t unix_get_buffer(void *opaque, uint8_t *buf, int64_t pos, + size_t size) { QEMUFileSocket *s = opaque; ssize_t len; @@ -192,7 +277,7 @@ QEMUFile *qemu_fdopen(int fd, const char *mode) return NULL; } - s = g_malloc0(sizeof(QEMUFileSocket)); + s = g_new0(QEMUFileSocket, 1); s->fd = fd; if (mode[0] == 'r') { @@ -204,18 +289,19 @@ QEMUFile *qemu_fdopen(int fd, const char *mode) } static const QEMUFileOps socket_read_ops = { - .get_fd = socket_get_fd, - .get_buffer = socket_get_buffer, - .close = socket_close, - .shut_down = socket_shutdown - + .get_fd = socket_get_fd, + .get_buffer = socket_get_buffer, + .close = socket_close, + .shut_down = socket_shutdown, + .get_return_path = socket_get_return_path }; static const QEMUFileOps socket_write_ops = { - .get_fd = socket_get_fd, - .writev_buffer = socket_writev_buffer, - .close = socket_close, - .shut_down = socket_shutdown + .get_fd = socket_get_fd, + .writev_buffer = socket_writev_buffer, + .close = socket_close, + .shut_down = socket_shutdown, + .get_return_path = socket_get_return_path }; QEMUFile *qemu_fopen_socket(int fd, const char *mode) @@ -226,7 +312,7 @@ QEMUFile *qemu_fopen_socket(int fd, const char *mode) return NULL; } - s = g_malloc0(sizeof(QEMUFileSocket)); + s = g_new0(QEMUFileSocket, 1); s->fd = fd; if (mode[0] == 'w') { qemu_set_block(s->fd); diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 6bb3dc15cd..0bbd2574a8 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -26,7 +26,7 @@ #include "qemu/error-report.h" #include "qemu/iov.h" #include "qemu/sockets.h" -#include "block/coroutine.h" +#include "qemu/coroutine.h" #include "migration/migration.h" #include "migration/qemu-file.h" #include "migration/qemu-file-internal.h" @@ -44,6 +44,18 @@ int qemu_file_shutdown(QEMUFile *f) return f->ops->shut_down(f->opaque, true, true); } +/* + * Result: QEMUFile* for a 'return path' for comms in the opposite direction + * NULL if not available + */ +QEMUFile *qemu_file_get_return_path(QEMUFile *f) +{ + if (!f->ops->get_return_path) { + return NULL; + } + return 
f->ops->get_return_path(f->opaque); +} + bool qemu_file_mode_is_not_valid(const char *mode) { if (mode == NULL || @@ -60,7 +72,7 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops) { QEMUFile *f; - f = g_malloc0(sizeof(QEMUFile)); + f = g_new0(QEMUFile, 1); f->opaque = opaque; f->ops = ops; @@ -270,7 +282,7 @@ int qemu_fclose(QEMUFile *f) return ret; } -static void add_to_iovec(QEMUFile *f, const uint8_t *buf, int size) +static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size) { /* check for adjacent buffer and coalesce them */ if (f->iovcnt > 0 && buf == f->iov[f->iovcnt - 1].iov_base + @@ -286,7 +298,7 @@ static void add_to_iovec(QEMUFile *f, const uint8_t *buf, int size) } } -void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size) +void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size) { if (!f->ops->writev_buffer) { qemu_put_buffer(f, buf, size); @@ -301,9 +313,9 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size) add_to_iovec(f, buf, size); } -void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size) +void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) { - int l; + size_t l; if (f->last_error) { return; @@ -363,10 +375,10 @@ void qemu_file_skip(QEMUFile *f, int size) * return as many as it managed to read (assuming blocking fd's which * all current QEMUFile are) */ -int qemu_peek_buffer(QEMUFile *f, uint8_t **buf, int size, size_t offset) +size_t qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset) { - int pending; - int index; + ssize_t pending; + size_t index; assert(!qemu_file_is_writable(f)); assert(offset < IO_BUF_SIZE); @@ -411,13 +423,13 @@ int qemu_peek_buffer(QEMUFile *f, uint8_t **buf, int size, size_t offset) * return as many as it managed to read (assuming blocking fd's which * all current QEMUFile are) */ -int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size) +size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size) { - int pending = size; - int done = 0; + size_t pending = size; + size_t done = 0; while (pending > 0) { - int res; + size_t res; uint8_t *src; res = qemu_peek_buffer(f, &src, MIN(pending, IO_BUF_SIZE), 0); @@ -434,6 +446,43 @@ int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size) } /* + * Read 'size' bytes of data from the file. + * 'size' can be larger than the internal buffer. + * + * The data: + * may be held on an internal buffer (in which case *buf is updated + * to point to it) that is valid until the next qemu_file operation. + * OR + * will be copied to the *buf that was passed in. + * + * The code tries to avoid the copy if possible. + * + * It will return size bytes unless there was an error, in which case it will + * return as many as it managed to read (assuming blocking fd's which + * all current QEMUFile are) + * + * Note: Since **buf may get changed, the caller should take care to + * keep a pointer to the original buffer if it needs to deallocate it. + */ +size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size) +{ + if (size < IO_BUF_SIZE) { + size_t res; + uint8_t *src; + + res = qemu_peek_buffer(f, &src, size, 0); + + if (res == size) { + qemu_file_skip(f, res); + *buf = src; + return res; + } + } + + return qemu_get_buffer(f, *buf, size); +} + +/* * Peeks a single byte from the buffer; this isn't guaranteed to work if * offset leaves a gap after the previous read/peeked data. */ @@ -611,3 +660,18 @@ size_t qemu_get_counted_string(QEMUFile *f, char buf[256]) return res == len ? 
res : 0; } + +/* + * Set the blocking state of the QEMUFile. + * Note: On some transports the OS only keeps a single blocking state for + * both directions, and thus changing the blocking on the main + * QEMUFile can also affect the return path. + */ +void qemu_file_set_blocking(QEMUFile *f, bool block) +{ + if (block) { + qemu_set_block(qemu_get_fd(f)); + } else { + qemu_set_nonblock(qemu_get_fd(f)); + } +} diff --git a/migration/ram.c b/migration/ram.c index 7f007e6432..0490f005dd 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -32,6 +32,7 @@ #include "qemu/timer.h" #include "qemu/main-loop.h" #include "migration/migration.h" +#include "migration/postcopy-ram.h" #include "exec/address-spaces.h" #include "migration/page_cache.h" #include "qemu/error-report.h" @@ -47,9 +48,7 @@ do { } while (0) #endif -static bool mig_throttle_on; static int dirty_rate_high_cnt; -static void check_guest_throttling(void); static uint64_t bitmap_sync_count; @@ -221,12 +220,34 @@ static RAMBlock *last_seen_block; /* This is the last block from where we have sent data */ static RAMBlock *last_sent_block; static ram_addr_t last_offset; -static unsigned long *migration_bitmap; static QemuMutex migration_bitmap_mutex; static uint64_t migration_dirty_pages; static uint32_t last_version; static bool ram_bulk_stage; +/* used by the search for pages to send */ +struct PageSearchStatus { + /* Current block being searched */ + RAMBlock *block; + /* Current offset to search from */ + ram_addr_t offset; + /* Set once we wrap around */ + bool complete_round; +}; +typedef struct PageSearchStatus PageSearchStatus; + +static struct BitmapRcu { + struct rcu_head rcu; + /* Main migration bitmap */ + unsigned long *bmap; + /* bitmap of pages that haven't been sent even once + * only maintained and used in postcopy at the moment + * where it's used to send the dirtymap at the start + * of the postcopy phase + */ + unsigned long *unsentmap; +} *migration_bitmap_rcu; + struct CompressParam { bool start; bool done; @@ -396,6 +417,29 @@ static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset) return size; } +/* Reduce amount of guest cpu execution to hopefully slow down memory writes. + * If guest dirty memory rate is reduced below the rate at which we can + * transfer pages to the destination then we should be able to complete + * migration. Some workloads dirty memory way too fast and will not effectively + * converge, even with auto-converge. + */ +static void mig_throttle_guest_down(void) +{ + MigrationState *s = migrate_get_current(); + uint64_t pct_initial = + s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL]; + uint64_t pct_icrement = + s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT]; + + /* We have not started throttling yet. Let's start it. */ + if (!cpu_throttle_active()) { + cpu_throttle_set(pct_initial); + } else { + /* Throttling already on, just increase the rate */ + cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement); + } +} + /* Update the xbzrle cache to reflect a page that's been sent as all 0. * The important thing is that a stale (not-yet-0'd) page be replaced * by the new data. 
@@ -495,43 +539,60 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, return 1; } -/* Called with rcu_read_lock() to protect migration_bitmap */ +/* Called with rcu_read_lock() to protect migration_bitmap + * rb: The RAMBlock to search for dirty pages in + * start: Start address (typically so we can continue from previous page) + * ram_addr_abs: Pointer into which to store the address of the dirty page + * within the global ram_addr space + * + * Returns: byte offset within memory region of the start of a dirty page + */ static inline -ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, - ram_addr_t start) +ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb, + ram_addr_t start, + ram_addr_t *ram_addr_abs) { - unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS; + unsigned long base = rb->offset >> TARGET_PAGE_BITS; unsigned long nr = base + (start >> TARGET_PAGE_BITS); - uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr)); - unsigned long size = base + (mr_size >> TARGET_PAGE_BITS); + uint64_t rb_size = rb->used_length; + unsigned long size = base + (rb_size >> TARGET_PAGE_BITS); unsigned long *bitmap; unsigned long next; - bitmap = atomic_rcu_read(&migration_bitmap); + bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; if (ram_bulk_stage && nr > base) { next = nr + 1; } else { next = find_next_bit(bitmap, size, nr); } - if (next < size) { - clear_bit(next, bitmap); + *ram_addr_abs = next << TARGET_PAGE_BITS; + return (next - base) << TARGET_PAGE_BITS; +} + +static inline bool migration_bitmap_clear_dirty(ram_addr_t addr) +{ + bool ret; + int nr = addr >> TARGET_PAGE_BITS; + unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + + ret = test_and_clear_bit(nr, bitmap); + + if (ret) { migration_dirty_pages--; } - return (next - base) << TARGET_PAGE_BITS; + return ret; } -/* Called with rcu_read_lock() to protect migration_bitmap */ static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length) { unsigned long *bitmap; - bitmap = atomic_rcu_read(&migration_bitmap); + bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length); } - /* Fix me: there are too many global variables used in migration process. */ static int64_t start_time; static int64_t bytes_xfer_prev; @@ -573,7 +634,7 @@ static void migration_bitmap_sync(void) qemu_mutex_lock(&migration_bitmap_mutex); rcu_read_lock(); QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { - migration_bitmap_sync_range(block->mr->ram_addr, block->used_length); + migration_bitmap_sync_range(block->offset, block->used_length); } rcu_read_unlock(); qemu_mutex_unlock(&migration_bitmap_mutex); @@ -589,21 +650,21 @@ static void migration_bitmap_sync(void) /* The following detection logic can be refined later. For now: Check to see if the dirtied bytes is 50% more than the approx. amount of bytes that just got transferred since the last time we - were in this routine. If that happens >N times (for now N==4) - we turn on the throttle down logic */ + were in this routine. 
If that happens twice, start or increase + throttling */ bytes_xfer_now = ram_bytes_transferred(); + if (s->dirty_pages_rate && (num_dirty_pages_period * TARGET_PAGE_SIZE > (bytes_xfer_now - bytes_xfer_prev)/2) && - (dirty_rate_high_cnt++ > 4)) { + (dirty_rate_high_cnt++ >= 2)) { trace_migration_throttle(); - mig_throttle_on = true; dirty_rate_high_cnt = 0; + mig_throttle_guest_down(); } bytes_xfer_prev = bytes_xfer_now; - } else { - mig_throttle_on = false; } + if (migrate_use_xbzrle()) { if (iterations_prev != acct_info.iterations) { acct_info.xbzrle_cache_miss_rate = @@ -655,6 +716,9 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, * ram_save_page: Send the given page to the stream * * Returns: Number of pages written. + * < 0 - error + * >=0 - Number of pages written - this might legally be 0 + * if xbzrle noticed the page was the same. * * @f: QEMUFile where to send the data * @block: block that contains the page we want to send @@ -668,12 +732,11 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset, int pages = -1; uint64_t bytes_xmit; ram_addr_t current_addr; - MemoryRegion *mr = block->mr; uint8_t *p; int ret; bool send_async = true; - p = memory_region_get_ram_ptr(mr) + offset; + p = block->host + offset; /* In doubt sent page as normal */ bytes_xmit = 0; @@ -744,7 +807,7 @@ static int do_compress_ram_page(CompressParam *param) RAMBlock *block = param->block; ram_addr_t offset = param->offset; - p = memory_region_get_ram_ptr(block->mr) + (offset & TARGET_PAGE_MASK); + p = block->host + (offset & TARGET_PAGE_MASK); bytes_sent = save_page_header(param->file, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); @@ -852,11 +915,10 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, { int pages = -1; uint64_t bytes_xmit; - MemoryRegion *mr = block->mr; uint8_t *p; int ret; - p = memory_region_get_ram_ptr(mr) + offset; + p = block->host + offset; bytes_xmit = 0; ret = ram_control_save_page(f, block->offset, @@ -909,6 +971,339 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, return pages; } +/* + * Find the next dirty page and update any state associated with + * the search process. + * + * Returns: True if a page is found + * + * @f: Current migration stream. + * @pss: Data about the state of the current dirty page scan. + * @*again: Set to false if the search has scanned the whole of RAM + * *ram_addr_abs: Pointer into which to store the address of the dirty page + * within the global ram_addr space + */ +static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, + bool *again, ram_addr_t *ram_addr_abs) +{ + pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset, + ram_addr_abs); + if (pss->complete_round && pss->block == last_seen_block && + pss->offset >= last_offset) { + /* + * We've been once around the RAM and haven't found anything. + * Give up. + */ + *again = false; + return false; + } + if (pss->offset >= pss->block->used_length) { + /* Didn't find anything in this RAM Block */ + pss->offset = 0; + pss->block = QLIST_NEXT_RCU(pss->block, next); + if (!pss->block) { + /* Hit the end of the list */ + pss->block = QLIST_FIRST_RCU(&ram_list.blocks); + /* Flag that we've looped */ + pss->complete_round = true; + ram_bulk_stage = false; + if (migrate_use_xbzrle()) { + /* If xbzrle is on, stop using the data compression at this + * point. In theory, xbzrle can do better than compression. 
+ */ + flush_compressed_data(f); + compression_switch = false; + } + } + /* Didn't find anything this time, but try again on the new block */ + *again = true; + return false; + } else { + /* Can go around again, but... */ + *again = true; + /* We've found something so probably don't need to */ + return true; + } +} + +/* + * Helper for 'get_queued_page' - gets a page off the queue + * ms: MigrationState in + * *offset: Used to return the offset within the RAMBlock + * ram_addr_abs: global offset in the dirty/sent bitmaps + * + * Returns: block (or NULL if none available) + */ +static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, + ram_addr_t *ram_addr_abs) +{ + RAMBlock *block = NULL; + + qemu_mutex_lock(&ms->src_page_req_mutex); + if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) { + struct MigrationSrcPageRequest *entry = + QSIMPLEQ_FIRST(&ms->src_page_requests); + block = entry->rb; + *offset = entry->offset; + *ram_addr_abs = (entry->offset + entry->rb->offset) & + TARGET_PAGE_MASK; + + if (entry->len > TARGET_PAGE_SIZE) { + entry->len -= TARGET_PAGE_SIZE; + entry->offset += TARGET_PAGE_SIZE; + } else { + memory_region_unref(block->mr); + QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); + g_free(entry); + } + } + qemu_mutex_unlock(&ms->src_page_req_mutex); + + return block; +} + +/* + * Unqueue a page from the queue fed by postcopy page requests; skips pages + * that are already sent (!dirty) + * + * ms: MigrationState in + * pss: PageSearchStatus structure updated with found block/offset + * ram_addr_abs: global offset in the dirty/sent bitmaps + * + * Returns: true if a queued page is found + */ +static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, + ram_addr_t *ram_addr_abs) +{ + RAMBlock *block; + ram_addr_t offset; + bool dirty; + + do { + block = unqueue_page(ms, &offset, ram_addr_abs); + /* + * We're sending this page, and since it's postcopy nothing else + * will dirty it, and we must make sure it doesn't get sent again + * even if this queue request was received after the background + * search already sent it. + */ + if (block) { + unsigned long *bitmap; + bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap); + if (!dirty) { + trace_get_queued_page_not_dirty( + block->idstr, (uint64_t)offset, + (uint64_t)*ram_addr_abs, + test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, + atomic_rcu_read(&migration_bitmap_rcu)->unsentmap)); + } else { + trace_get_queued_page(block->idstr, + (uint64_t)offset, + (uint64_t)*ram_addr_abs); + } + } + + } while (block && !dirty); + + if (block) { + /* + * As soon as we start servicing pages out of order, then we have + * to kill the bulk stage, since the bulk stage assumes + * in (migration_bitmap_find_and_reset_dirty) that every page is + * dirty, that's no longer true. + */ + ram_bulk_stage = false; + + /* + * We want the background search to continue from the queued page + * since the guest is likely to want other pages near to the page + * it just requested. + */ + pss->block = block; + pss->offset = offset; + } + + return !!block; +} + +/** + * flush_page_queue: Flush any remaining pages in the ram request queue + * it should be empty at the end anyway, but in error cases there may be + * some left. + * + * ms: MigrationState + */ +void flush_page_queue(MigrationState *ms) +{ + struct MigrationSrcPageRequest *mspr, *next_mspr; + /* This queue generally should be empty - but in the case of a failed + * migration might have some droppings in. 
+ */ + rcu_read_lock(); + QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) { + memory_region_unref(mspr->rb->mr); + QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); + g_free(mspr); + } + rcu_read_unlock(); +} + +/** + * Queue the pages for transmission, e.g. a request from postcopy destination + * ms: MigrationStatus in which the queue is held + * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last) + * start: Offset from the start of the RAMBlock + * len: Length (in bytes) to send + * Return: 0 on success + */ +int ram_save_queue_pages(MigrationState *ms, const char *rbname, + ram_addr_t start, ram_addr_t len) +{ + RAMBlock *ramblock; + + rcu_read_lock(); + if (!rbname) { + /* Reuse last RAMBlock */ + ramblock = ms->last_req_rb; + + if (!ramblock) { + /* + * Shouldn't happen, we can't reuse the last RAMBlock if + * it's the 1st request. + */ + error_report("ram_save_queue_pages no previous block"); + goto err; + } + } else { + ramblock = qemu_ram_block_by_name(rbname); + + if (!ramblock) { + /* We shouldn't be asked for a non-existent RAMBlock */ + error_report("ram_save_queue_pages no block '%s'", rbname); + goto err; + } + ms->last_req_rb = ramblock; + } + trace_ram_save_queue_pages(ramblock->idstr, start, len); + if (start+len > ramblock->used_length) { + error_report("%s request overrun start=" RAM_ADDR_FMT " len=" + RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, + __func__, start, len, ramblock->used_length); + goto err; + } + + struct MigrationSrcPageRequest *new_entry = + g_malloc0(sizeof(struct MigrationSrcPageRequest)); + new_entry->rb = ramblock; + new_entry->offset = start; + new_entry->len = len; + + memory_region_ref(ramblock->mr); + qemu_mutex_lock(&ms->src_page_req_mutex); + QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req); + qemu_mutex_unlock(&ms->src_page_req_mutex); + rcu_read_unlock(); + + return 0; + +err: + rcu_read_unlock(); + return -1; +} + +/** + * ram_save_target_page: Save one target page + * + * + * @f: QEMUFile where to send the data + * @block: pointer to block that contains the page we want to send + * @offset: offset inside the block for the page; + * @last_stage: if we are at the completion stage + * @bytes_transferred: increase it with the number of transferred bytes + * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space + * + * Returns: Number of pages written. + */ +static int ram_save_target_page(MigrationState *ms, QEMUFile *f, + RAMBlock *block, ram_addr_t offset, + bool last_stage, + uint64_t *bytes_transferred, + ram_addr_t dirty_ram_abs) +{ + int res = 0; + + /* Check the pages is dirty and if it is send it */ + if (migration_bitmap_clear_dirty(dirty_ram_abs)) { + unsigned long *unsentmap; + if (compression_switch && migrate_use_compression()) { + res = ram_save_compressed_page(f, block, offset, + last_stage, + bytes_transferred); + } else { + res = ram_save_page(f, block, offset, last_stage, + bytes_transferred); + } + + if (res < 0) { + return res; + } + unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; + if (unsentmap) { + clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap); + } + /* Only update last_sent_block if a block was actually sent; xbzrle + * might have decided the page was identical so didn't bother writing + * to the stream. + */ + if (res > 0) { + last_sent_block = block; + } + } + + return res; +} + +/** + * ram_save_host_page: Starting at *offset send pages upto the end + * of the current host page. 
It's valid for the initial + * offset to point into the middle of a host page + * in which case the remainder of the hostpage is sent. + * Only dirty target pages are sent. + * + * Returns: Number of pages written. + * + * @f: QEMUFile where to send the data + * @block: pointer to block that contains the page we want to send + * @offset: offset inside the block for the page; updated to last target page + * sent + * @last_stage: if we are at the completion stage + * @bytes_transferred: increase it with the number of transferred bytes + * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space + */ +static int ram_save_host_page(MigrationState *ms, QEMUFile *f, RAMBlock *block, + ram_addr_t *offset, bool last_stage, + uint64_t *bytes_transferred, + ram_addr_t dirty_ram_abs) +{ + int tmppages, pages = 0; + do { + tmppages = ram_save_target_page(ms, f, block, *offset, last_stage, + bytes_transferred, dirty_ram_abs); + if (tmppages < 0) { + return tmppages; + } + + pages += tmppages; + *offset += TARGET_PAGE_SIZE; + dirty_ram_abs += TARGET_PAGE_SIZE; + } while (*offset & (qemu_host_page_size - 1)); + + /* The offset we leave with is the last one we looked at */ + *offset -= TARGET_PAGE_SIZE; + return pages; +} + /** * ram_find_and_save_block: Finds a dirty page and sends it to f * @@ -920,61 +1315,47 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, * @f: QEMUFile where to send the data * @last_stage: if we are at the completion stage * @bytes_transferred: increase it with the number of transferred bytes + * + * On systems where host-page-size > target-page-size it will send all the + * pages in a host page that are dirty. */ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, uint64_t *bytes_transferred) { - RAMBlock *block = last_seen_block; - ram_addr_t offset = last_offset; - bool complete_round = false; + PageSearchStatus pss; + MigrationState *ms = migrate_get_current(); int pages = 0; - MemoryRegion *mr; + bool again, found; + ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in + ram_addr_t space */ - if (!block) - block = QLIST_FIRST_RCU(&ram_list.blocks); + pss.block = last_seen_block; + pss.offset = last_offset; + pss.complete_round = false; - while (true) { - mr = block->mr; - offset = migration_bitmap_find_and_reset_dirty(mr, offset); - if (complete_round && block == last_seen_block && - offset >= last_offset) { - break; + if (!pss.block) { + pss.block = QLIST_FIRST_RCU(&ram_list.blocks); + } + + do { + again = true; + found = get_queued_page(ms, &pss, &dirty_ram_abs); + + if (!found) { + /* priority queue empty, so just search for something dirty */ + found = find_dirty_block(f, &pss, &again, &dirty_ram_abs); } - if (offset >= block->used_length) { - offset = 0; - block = QLIST_NEXT_RCU(block, next); - if (!block) { - block = QLIST_FIRST_RCU(&ram_list.blocks); - complete_round = true; - ram_bulk_stage = false; - if (migrate_use_xbzrle()) { - /* If xbzrle is on, stop using the data compression at this - * point. In theory, xbzrle can do better than compression. 
- */ - flush_compressed_data(f); - compression_switch = false; - } - } - } else { - if (compression_switch && migrate_use_compression()) { - pages = ram_save_compressed_page(f, block, offset, last_stage, - bytes_transferred); - } else { - pages = ram_save_page(f, block, offset, last_stage, - bytes_transferred); - } - /* if page is unmodified, continue to the next */ - if (pages > 0) { - last_sent_block = block; - break; - } + if (found) { + pages = ram_save_host_page(ms, f, pss.block, &pss.offset, + last_stage, bytes_transferred, + dirty_ram_abs); } - } + } while (!pages && again); - last_seen_block = block; - last_offset = offset; + last_seen_block = pss.block; + last_offset = pss.offset; return pages; } @@ -1024,17 +1405,23 @@ void free_xbzrle_decoded_buf(void) xbzrle_decoded_buf = NULL; } -static void migration_end(void) +static void migration_bitmap_free(struct BitmapRcu *bmap) +{ + g_free(bmap->bmap); + g_free(bmap->unsentmap); + g_free(bmap); +} + +static void ram_migration_cleanup(void *opaque) { /* caller have hold iothread lock or is in a bh, so there is * no writing race against this migration_bitmap */ - unsigned long *bitmap = migration_bitmap; - atomic_rcu_set(&migration_bitmap, NULL); + struct BitmapRcu *bitmap = migration_bitmap_rcu; + atomic_rcu_set(&migration_bitmap_rcu, NULL); if (bitmap) { memory_global_dirty_log_stop(); - synchronize_rcu(); - g_free(bitmap); + call_rcu(bitmap, migration_bitmap_free, rcu); } XBZRLE_cache_lock(); @@ -1049,11 +1436,6 @@ static void migration_end(void) XBZRLE_cache_unlock(); } -static void ram_migration_cancel(void *opaque) -{ - migration_end(); -} - static void reset_ram_globals(void) { last_seen_block = NULL; @@ -1070,9 +1452,10 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) /* called in qemu main thread, so there is * no writing race against this migration_bitmap */ - if (migration_bitmap) { - unsigned long *old_bitmap = migration_bitmap, *bitmap; - bitmap = bitmap_new(new); + if (migration_bitmap_rcu) { + struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap; + bitmap = g_new(struct BitmapRcu, 1); + bitmap->bmap = bitmap_new(new); /* prevent migration_bitmap content from being set bit * by migration_bitmap_sync_range() at the same time. @@ -1080,16 +1463,410 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) * at the same time. */ qemu_mutex_lock(&migration_bitmap_mutex); - bitmap_copy(bitmap, old_bitmap, old); - bitmap_set(bitmap, old, new - old); - atomic_rcu_set(&migration_bitmap, bitmap); + bitmap_copy(bitmap->bmap, old_bitmap->bmap, old); + bitmap_set(bitmap->bmap, old, new - old); + + /* We don't have a way to safely extend the sentmap + * with RCU; so mark it as missing, entry to postcopy + * will fail. + */ + bitmap->unsentmap = NULL; + + atomic_rcu_set(&migration_bitmap_rcu, bitmap); qemu_mutex_unlock(&migration_bitmap_mutex); migration_dirty_pages += new - old; - synchronize_rcu(); - g_free(old_bitmap); + call_rcu(old_bitmap, migration_bitmap_free, rcu); } } +/* + * 'expected' is the value you expect the bitmap mostly to be full + * of; it won't bother printing lines that are all this value. + * If 'todump' is null the migration bitmap is dumped. 
+ */ +void ram_debug_dump_bitmap(unsigned long *todump, bool expected) +{ + int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS; + + int64_t cur; + int64_t linelen = 128; + char linebuf[129]; + + if (!todump) { + todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + } + + for (cur = 0; cur < ram_pages; cur += linelen) { + int64_t curb; + bool found = false; + /* + * Last line; catch the case where the line length + * is longer than remaining ram + */ + if (cur + linelen > ram_pages) { + linelen = ram_pages - cur; + } + for (curb = 0; curb < linelen; curb++) { + bool thisbit = test_bit(cur + curb, todump); + linebuf[curb] = thisbit ? '1' : '.'; + found = found || (thisbit != expected); + } + if (found) { + linebuf[curb] = '\0'; + fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); + } + } +} + +/* **** functions for postcopy ***** */ + +/* + * Callback from postcopy_each_ram_send_discard for each RAMBlock + * Note: At this point the 'unsentmap' is the processed bitmap combined + * with the dirtymap; so a '1' means it's either dirty or unsent. + * start,length: Indexes into the bitmap for the first bit + * representing the named block and length in target-pages + */ +static int postcopy_send_discard_bm_ram(MigrationState *ms, + PostcopyDiscardState *pds, + unsigned long start, + unsigned long length) +{ + unsigned long end = start + length; /* one after the end */ + unsigned long current; + unsigned long *unsentmap; + + unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; + for (current = start; current < end; ) { + unsigned long one = find_next_bit(unsentmap, end, current); + + if (one <= end) { + unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1); + unsigned long discard_length; + + if (zero >= end) { + discard_length = end - one; + } else { + discard_length = zero - one; + } + postcopy_discard_send_range(ms, pds, one, discard_length); + current = one + discard_length; + } else { + current = one; + } + } + + return 0; +} + +/* + * Utility for the outgoing postcopy code. + * Calls postcopy_send_discard_bm_ram for each RAMBlock + * passing it bitmap indexes and name. + * Returns: 0 on success + * (qemu_ram_foreach_block ends up passing unscaled lengths + * which would mean postcopy code would have to deal with target page) + */ +static int postcopy_each_ram_send_discard(MigrationState *ms) +{ + struct RAMBlock *block; + int ret; + + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + unsigned long first = block->offset >> TARGET_PAGE_BITS; + PostcopyDiscardState *pds = postcopy_discard_send_init(ms, + first, + block->idstr); + + /* + * Postcopy sends chunks of bitmap over the wire, but it + * just needs indexes at this point, avoids it having + * target page specific code. + */ + ret = postcopy_send_discard_bm_ram(ms, pds, first, + block->used_length >> TARGET_PAGE_BITS); + postcopy_discard_send_finish(ms, pds); + if (ret) { + return ret; + } + } + + return 0; +} + +/* + * Helper for postcopy_chunk_hostpages; it's called twice to cleanup + * the two bitmaps, that are similar, but one is inverted. 
+ * + * We search for runs of target-pages that don't start or end on a + * host page boundary; + * unsent_pass=true: Cleans up partially unsent host pages by searching + * the unsentmap + * unsent_pass=false: Cleans up partially dirty host pages by searching + * the main migration bitmap + * + */ +static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass, + RAMBlock *block, + PostcopyDiscardState *pds) +{ + unsigned long *bitmap; + unsigned long *unsentmap; + unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE; + unsigned long first = block->offset >> TARGET_PAGE_BITS; + unsigned long len = block->used_length >> TARGET_PAGE_BITS; + unsigned long last = first + (len - 1); + unsigned long run_start; + + bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; + + if (unsent_pass) { + /* Find a sent page */ + run_start = find_next_zero_bit(unsentmap, last + 1, first); + } else { + /* Find a dirty page */ + run_start = find_next_bit(bitmap, last + 1, first); + } + + while (run_start <= last) { + bool do_fixup = false; + unsigned long fixup_start_addr; + unsigned long host_offset; + + /* + * If the start of this run of pages is in the middle of a host + * page, then we need to fixup this host page. + */ + host_offset = run_start % host_ratio; + if (host_offset) { + do_fixup = true; + run_start -= host_offset; + fixup_start_addr = run_start; + /* For the next pass */ + run_start = run_start + host_ratio; + } else { + /* Find the end of this run */ + unsigned long run_end; + if (unsent_pass) { + run_end = find_next_bit(unsentmap, last + 1, run_start + 1); + } else { + run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1); + } + /* + * If the end isn't at the start of a host page, then the + * run doesn't finish at the end of a host page + * and we need to discard. + */ + host_offset = run_end % host_ratio; + if (host_offset) { + do_fixup = true; + fixup_start_addr = run_end - host_offset; + /* + * This host page has gone, the next loop iteration starts + * from after the fixup + */ + run_start = fixup_start_addr + host_ratio; + } else { + /* + * No discards on this iteration, next loop starts from + * next sent/dirty page + */ + run_start = run_end + 1; + } + } + + if (do_fixup) { + unsigned long page; + + /* Tell the destination to discard this page */ + if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) { + /* For the unsent_pass we: + * discard partially sent pages + * For the !unsent_pass (dirty) we: + * discard partially dirty pages that were sent + * (any partially sent pages were already discarded + * by the previous unsent_pass) + */ + postcopy_discard_send_range(ms, pds, fixup_start_addr, + host_ratio); + } + + /* Clean up the bitmap */ + for (page = fixup_start_addr; + page < fixup_start_addr + host_ratio; page++) { + /* All pages in this host page are now not sent */ + set_bit(page, unsentmap); + + /* + * Remark them as dirty, updating the count for any pages + * that weren't previously dirty. + */ + migration_dirty_pages += !test_and_set_bit(page, bitmap); + } + } + + if (unsent_pass) { + /* Find the next sent page for the next iteration */ + run_start = find_next_zero_bit(unsentmap, last + 1, + run_start); + } else { + /* Find the next dirty page for the next iteration */ + run_start = find_next_bit(bitmap, last + 1, run_start); + } + } +} + +/* + * Utility for the outgoing postcopy code. 
+ * + * Discard any partially sent host-page size chunks, mark any partially + * dirty host-page size chunks as all dirty. + * + * Returns: 0 on success + */ +static int postcopy_chunk_hostpages(MigrationState *ms) +{ + struct RAMBlock *block; + + if (qemu_host_page_size == TARGET_PAGE_SIZE) { + /* Easy case - TPS==HPS - nothing to be done */ + return 0; + } + + /* Easiest way to make sure we don't resume in the middle of a host-page */ + last_seen_block = NULL; + last_sent_block = NULL; + last_offset = 0; + + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + unsigned long first = block->offset >> TARGET_PAGE_BITS; + + PostcopyDiscardState *pds = + postcopy_discard_send_init(ms, first, block->idstr); + + /* First pass: Discard all partially sent host pages */ + postcopy_chunk_hostpages_pass(ms, true, block, pds); + /* + * Second pass: Ensure that all partially dirty host pages are made + * fully dirty. + */ + postcopy_chunk_hostpages_pass(ms, false, block, pds); + + postcopy_discard_send_finish(ms, pds); + } /* ram_list loop */ + + return 0; +} + +/* + * Transmit the set of pages to be discarded after precopy to the target + * these are pages that: + * a) Have been previously transmitted but are now dirty again + * b) Pages that have never been transmitted, this ensures that + * any pages on the destination that have been mapped by background + * tasks get discarded (transparent huge pages is the specific concern) + * Hopefully this is pretty sparse + */ +int ram_postcopy_send_discard_bitmap(MigrationState *ms) +{ + int ret; + unsigned long *bitmap, *unsentmap; + + rcu_read_lock(); + + /* This should be our last sync, the src is now paused */ + migration_bitmap_sync(); + + unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; + if (!unsentmap) { + /* We don't have a safe way to resize the sentmap, so + * if the bitmap was resized it will be NULL at this + * point. + */ + error_report("migration ram resized during precopy phase"); + rcu_read_unlock(); + return -EINVAL; + } + + /* Deal with TPS != HPS */ + ret = postcopy_chunk_hostpages(ms); + if (ret) { + rcu_read_unlock(); + return ret; + } + + /* + * Update the unsentmap to be unsentmap = unsentmap | dirty + */ + bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + bitmap_or(unsentmap, unsentmap, bitmap, + last_ram_offset() >> TARGET_PAGE_BITS); + + + trace_ram_postcopy_send_discard_bitmap(); +#ifdef DEBUG_POSTCOPY + ram_debug_dump_bitmap(unsentmap, true); +#endif + + ret = postcopy_each_ram_send_discard(ms); + rcu_read_unlock(); + + return ret; +} + +/* + * At the start of the postcopy phase of migration, any now-dirty + * precopied pages are discarded. + * + * start, length describe a byte address range within the RAMBlock + * + * Returns 0 on success. 
+ */ +int ram_discard_range(MigrationIncomingState *mis, + const char *block_name, + uint64_t start, size_t length) +{ + int ret = -1; + + rcu_read_lock(); + RAMBlock *rb = qemu_ram_block_by_name(block_name); + + if (!rb) { + error_report("ram_discard_range: Failed to find block '%s'", + block_name); + goto err; + } + + uint8_t *host_startaddr = rb->host + start; + + if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) { + error_report("ram_discard_range: Unaligned start address: %p", + host_startaddr); + goto err; + } + + if ((start + length) <= rb->used_length) { + uint8_t *host_endaddr = host_startaddr + length; + if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) { + error_report("ram_discard_range: Unaligned end address: %p", + host_endaddr); + goto err; + } + ret = postcopy_ram_discard_range(mis, host_startaddr, length); + } else { + error_report("ram_discard_range: Overrun block '%s' (%" PRIu64 + "/%zx/" RAM_ADDR_FMT")", + block_name, start, length, rb->used_length); + } + +err: + rcu_read_unlock(); + + return ret; +} + + /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code * start to become numerous it will be necessary to reduce the @@ -1101,7 +1878,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque) RAMBlock *block; int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */ - mig_throttle_on = false; dirty_rate_high_cnt = 0; bitmap_sync_count = 0; migration_bitmap_sync_init(); @@ -1145,8 +1921,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque) reset_ram_globals(); ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; - migration_bitmap = bitmap_new(ram_bitmap_pages); - bitmap_set(migration_bitmap, 0, ram_bitmap_pages); + migration_bitmap_rcu = g_new0(struct BitmapRcu, 1); + migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); + bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); + + if (migrate_postcopy_ram()) { + migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); + bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); + } /* * Count the total number of pages used by ram blocks not including any @@ -1206,7 +1988,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) } pages_sent += pages; acct_info.iterations++; - check_guest_throttling(); + /* we want to check in the 1st loop, just in case it was the 1st time and we had to sync the dirty bitmap. 
qemu_get_clock_ns() is a bit expensive, so we only check each some @@ -1247,7 +2029,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque) { rcu_read_lock(); - migration_bitmap_sync(); + if (!migration_in_postcopy(migrate_get_current())) { + migration_bitmap_sync(); + } ram_control_before_iterate(f, RAM_CONTROL_FINISH); @@ -1269,19 +2053,21 @@ static int ram_save_complete(QEMUFile *f, void *opaque) rcu_read_unlock(); - migration_end(); qemu_put_be64(f, RAM_SAVE_FLAG_EOS); return 0; } -static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) +static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, + uint64_t *non_postcopiable_pending, + uint64_t *postcopiable_pending) { uint64_t remaining_size; remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE; - if (remaining_size < max_size) { + if (!migration_in_postcopy(migrate_get_current()) && + remaining_size < max_size) { qemu_mutex_lock_iothread(); rcu_read_lock(); migration_bitmap_sync(); @@ -1289,7 +2075,9 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) qemu_mutex_unlock_iothread(); remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE; } - return remaining_size; + + /* We can do postcopy, and all the data is postcopiable */ + *postcopiable_pending += remaining_size; } static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) @@ -1330,6 +2118,14 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) /* Must be called from within a rcu critical section. * Returns a pointer from within the RCU-protected ram_list. */ +/* + * Read a RAMBlock ID from the stream f, find the host address of the + * start of that block and add on 'offset' + * + * f: Stream to read from + * offset: Offset within the block + * flags: Page flags (mostly to see if it's a continuation of previous block) + */ static inline void *host_from_stream_offset(QEMUFile *f, ram_addr_t offset, int flags) @@ -1344,21 +2140,19 @@ static inline void *host_from_stream_offset(QEMUFile *f, return NULL; } - return memory_region_get_ram_ptr(block->mr) + offset; + return block->host + offset; } len = qemu_get_byte(f); qemu_get_buffer(f, (uint8_t *)id, len); id[len] = 0; - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { - if (!strncmp(id, block->idstr, sizeof(id)) && - block->max_length > offset) { - return memory_region_get_ram_ptr(block->mr) + offset; - } + block = qemu_ram_block_by_name(id); + if (block && block->max_length > offset) { + return block->host + offset; } - error_report("Can't find block %s!", id); + error_report("Can't find block %s", id); return NULL; } @@ -1466,11 +2260,148 @@ static void decompress_data_with_multi_threads(uint8_t *compbuf, } } +/* + * Allocate data structures etc needed by incoming migration with postcopy-ram + * postcopy-ram's similarly names postcopy_ram_incoming_init does the work + */ +int ram_postcopy_incoming_init(MigrationIncomingState *mis) +{ + size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS; + + return postcopy_ram_incoming_init(mis, ram_pages); +} + +/* + * Called in postcopy mode by ram_load(). + * rcu_read_lock is taken prior to this being called. 
+ */ +static int ram_load_postcopy(QEMUFile *f) +{ + int flags = 0, ret = 0; + bool place_needed = false; + bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE; + MigrationIncomingState *mis = migration_incoming_get_current(); + /* Temporary page that is later 'placed' */ + void *postcopy_host_page = postcopy_get_tmp_page(mis); + void *last_host = NULL; + bool all_zero = false; + + while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { + ram_addr_t addr; + void *host = NULL; + void *page_buffer = NULL; + void *place_source = NULL; + uint8_t ch; + + addr = qemu_get_be64(f); + flags = addr & ~TARGET_PAGE_MASK; + addr &= TARGET_PAGE_MASK; + + trace_ram_load_postcopy_loop((uint64_t)addr, flags); + place_needed = false; + if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) { + host = host_from_stream_offset(f, addr, flags); + if (!host) { + error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); + ret = -EINVAL; + break; + } + page_buffer = host; + /* + * Postcopy requires that we place whole host pages atomically. + * To make it atomic, the data is read into a temporary page + * that's moved into place later. + * The migration protocol uses, possibly smaller, target-pages + * however the source ensures it always sends all the components + * of a host page in order. + */ + page_buffer = postcopy_host_page + + ((uintptr_t)host & ~qemu_host_page_mask); + /* If all TP are zero then we can optimise the place */ + if (!((uintptr_t)host & ~qemu_host_page_mask)) { + all_zero = true; + } else { + /* not the 1st TP within the HP */ + if (host != (last_host + TARGET_PAGE_SIZE)) { + error_report("Non-sequential target page %p/%p\n", + host, last_host); + ret = -EINVAL; + break; + } + } + + + /* + * If it's the last part of a host page then we place the host + * page + */ + place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) & + ~qemu_host_page_mask) == 0; + place_source = postcopy_host_page; + } + last_host = host; + + switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { + case RAM_SAVE_FLAG_COMPRESS: + ch = qemu_get_byte(f); + memset(page_buffer, ch, TARGET_PAGE_SIZE); + if (ch) { + all_zero = false; + } + break; + + case RAM_SAVE_FLAG_PAGE: + all_zero = false; + if (!place_needed || !matching_page_sizes) { + qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); + } else { + /* Avoids the qemu_file copy during postcopy, which is + * going to do a copy later; can only do it when we + * do this read in one go (matching page sizes) + */ + qemu_get_buffer_in_place(f, (uint8_t **)&place_source, + TARGET_PAGE_SIZE); + } + break; + case RAM_SAVE_FLAG_EOS: + /* normal exit */ + break; + default: + error_report("Unknown combination of migration flags: %#x" + " (postcopy mode)", flags); + ret = -EINVAL; + } + + if (place_needed) { + /* This gets called at the last target page in the host page */ + if (all_zero) { + ret = postcopy_place_page_zero(mis, + host + TARGET_PAGE_SIZE - + qemu_host_page_size); + } else { + ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE - + qemu_host_page_size, + place_source); + } + } + if (!ret) { + ret = qemu_file_get_error(f); + } + } + + return ret; +} + static int ram_load(QEMUFile *f, void *opaque, int version_id) { int flags = 0, ret = 0; static uint64_t seq_iter; int len = 0; + /* + * If system is running in postcopy mode, page inserts to host memory must + * be atomic + */ + bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING; seq_iter++; @@ -1484,15 +2415,30 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) * critical section. 
*/ rcu_read_lock(); - while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { + + if (postcopy_running) { + ret = ram_load_postcopy(f); + } + + while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) { ram_addr_t addr, total_ram_bytes; - void *host; + void *host = NULL; uint8_t ch; addr = qemu_get_be64(f); flags = addr & ~TARGET_PAGE_MASK; addr &= TARGET_PAGE_MASK; + if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE | + RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { + host = host_from_stream_offset(f, addr, flags); + if (!host) { + error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); + ret = -EINVAL; + break; + } + } + switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { case RAM_SAVE_FLAG_MEM_SIZE: /* Synchronize RAM block list */ @@ -1507,23 +2453,20 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) id[len] = 0; length = qemu_get_be64(f); - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { - if (!strncmp(id, block->idstr, sizeof(id))) { - if (length != block->used_length) { - Error *local_err = NULL; + block = qemu_ram_block_by_name(id); + if (block) { + if (length != block->used_length) { + Error *local_err = NULL; - ret = qemu_ram_resize(block->offset, length, &local_err); - if (local_err) { - error_report_err(local_err); - } + ret = qemu_ram_resize(block->offset, length, + &local_err); + if (local_err) { + error_report_err(local_err); } - ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, - block->idstr); - break; } - } - - if (!block) { + ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, + block->idstr); + } else { error_report("Unknown ramblock \"%s\", cannot " "accept migration", id); ret = -EINVAL; @@ -1532,33 +2475,17 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) total_ram_bytes -= length; } break; + case RAM_SAVE_FLAG_COMPRESS: - host = host_from_stream_offset(f, addr, flags); - if (!host) { - error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } ch = qemu_get_byte(f); ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); break; + case RAM_SAVE_FLAG_PAGE: - host = host_from_stream_offset(f, addr, flags); - if (!host) { - error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } qemu_get_buffer(f, host, TARGET_PAGE_SIZE); break; - case RAM_SAVE_FLAG_COMPRESS_PAGE: - host = host_from_stream_offset(f, addr, flags); - if (!host) { - error_report("Invalid RAM offset " RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } + case RAM_SAVE_FLAG_COMPRESS_PAGE: len = qemu_get_be32(f); if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { error_report("Invalid compressed data length: %d", len); @@ -1568,13 +2495,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) qemu_get_buffer(f, compressed_data_buf, len); decompress_data_with_multi_threads(compressed_data_buf, host, len); break; + case RAM_SAVE_FLAG_XBZRLE: - host = host_from_stream_offset(f, addr, flags); - if (!host) { - error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } if (load_xbzrle(f, addr, host) < 0) { error_report("Failed to decompress XBZRLE page at " RAM_ADDR_FMT, addr); @@ -1608,10 +2530,11 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) static SaveVMHandlers savevm_ram_handlers = { .save_live_setup = ram_save_setup, .save_live_iterate = ram_save_iterate, - .save_live_complete = ram_save_complete, + .save_live_complete_postcopy = ram_save_complete, + .save_live_complete_precopy = ram_save_complete, .save_live_pending = ram_save_pending, .load_state = ram_load, - 
.cancel = ram_migration_cancel, + .cleanup = ram_migration_cleanup, }; void ram_mig_init(void) @@ -1619,52 +2542,3 @@ void ram_mig_init(void) qemu_mutex_init(&XBZRLE.lock); register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL); } -/* Stub function that's gets run on the vcpu when its brought out of the - VM to run inside qemu via async_run_on_cpu()*/ - -static void mig_sleep_cpu(void *opq) -{ - qemu_mutex_unlock_iothread(); - g_usleep(30*1000); - qemu_mutex_lock_iothread(); -} - -/* To reduce the dirty rate explicitly disallow the VCPUs from spending - much time in the VM. The migration thread will try to catchup. - Workload will experience a performance drop. -*/ -static void mig_throttle_guest_down(void) -{ - CPUState *cpu; - - qemu_mutex_lock_iothread(); - CPU_FOREACH(cpu) { - async_run_on_cpu(cpu, mig_sleep_cpu, NULL); - } - qemu_mutex_unlock_iothread(); -} - -static void check_guest_throttling(void) -{ - static int64_t t0; - int64_t t1; - - if (!mig_throttle_on) { - return; - } - - if (!t0) { - t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - return; - } - - t1 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - - /* If it has been more than 40 ms since the last time the guest - * was throttled then do it again. - */ - if (40 < (t1-t0)/1000000) { - mig_throttle_guest_down(); - t0 = t1; - } -} diff --git a/migration/rdma.c b/migration/rdma.c index 74876fd7ab..dcabb91005 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -19,7 +19,7 @@ #include "qemu/main-loop.h" #include "qemu/sockets.h" #include "qemu/bitmap.h" -#include "block/coroutine.h" +#include "qemu/coroutine.h" #include <stdio.h> #include <sys/types.h> #include <sys/socket.h> @@ -541,7 +541,7 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name, RDMALocalBlock *block; RDMALocalBlock *old = local->block; - local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1)); + local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1); if (local->nb_blocks) { int x; @@ -572,12 +572,12 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name, bitmap_clear(block->transit_bitmap, 0, block->nb_chunks); block->unregister_bitmap = bitmap_new(block->nb_chunks); bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks); - block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t)); + block->remote_keys = g_new0(uint32_t, block->nb_chunks); block->is_ram_block = local->init ? 
false : true; if (rdma->blockmap) { - g_hash_table_insert(rdma->blockmap, (void *) block_offset, block); + g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block); } trace_rdma_add_block(block_name, local->nb_blocks, @@ -617,8 +617,8 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma) memset(local, 0, sizeof *local); qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma); trace_qemu_rdma_init_ram_blocks(local->nb_blocks); - rdma->dest_blocks = (RDMADestBlock *) g_malloc0(sizeof(RDMADestBlock) * - rdma->local_ram_blocks.nb_blocks); + rdma->dest_blocks = g_new0(RDMADestBlock, + rdma->local_ram_blocks.nb_blocks); local->init = true; return 0; } @@ -677,8 +677,7 @@ static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) if (local->nb_blocks > 1) { - local->block = g_malloc0(sizeof(RDMALocalBlock) * - (local->nb_blocks - 1)); + local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1); if (block->index) { memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index); @@ -778,7 +777,7 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id) * * If the source VM connects with an IPv4 address without knowing that the * destination has bound to '[::]' the migration will unconditionally fail - * unless the management software is explicitly listening on the the IPv4 + * unless the management software is explicitly listening on the IPv4 * address while using a RoCE-based device. * * If the source VM connects with an IPv6 address, then we're OK because we can @@ -1164,7 +1163,7 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma, /* allocate memory to store chunk MRs */ if (!block->pmr) { - block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *)); + block->pmr = g_new0(struct ibv_mr *, block->nb_chunks); } /* @@ -2494,7 +2493,7 @@ static void *qemu_rdma_data_init(const char *host_port, Error **errp) InetSocketAddress *addr; if (host_port) { - rdma = g_malloc0(sizeof(RDMAContext)); + rdma = g_new0(RDMAContext, 1); rdma->current_index = -1; rdma->current_chunk = -1; @@ -2519,8 +2518,8 @@ static void *qemu_rdma_data_init(const char *host_port, Error **errp) * SEND messages for control only. * VM's ram is handled with regular RDMA messages. */ -static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, - int64_t pos, int size) +static ssize_t qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, + int64_t pos, size_t size) { QEMUFileRDMA *r = opaque; QEMUFile *f = r->file; @@ -2547,7 +2546,8 @@ static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, r->len = MIN(remaining, RDMA_SEND_INCREMENT); remaining -= r->len; - head.len = r->len; + /* Guaranteed to fit due to RDMA_SEND_INCREMENT MIN above */ + head.len = (uint32_t)r->len; head.type = RDMA_CONTROL_QEMU_FILE; ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL); @@ -2564,7 +2564,7 @@ static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, } static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf, - int size, int idx) + size_t size, int idx) { size_t len = 0; @@ -2585,8 +2585,8 @@ static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf, * RDMA links don't use bytestreams, so we have to * return bytes to QEMUFile opportunistically. 
*/ -static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf, - int64_t pos, int size) +static ssize_t qemu_rdma_get_buffer(void *opaque, uint8_t *buf, + int64_t pos, size_t size) { QEMUFileRDMA *r = opaque; RDMAContext *rdma = r->rdma; @@ -3399,7 +3399,7 @@ static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode) return NULL; } - r = g_malloc0(sizeof(QEMUFileRDMA)); + r = g_new0(QEMUFileRDMA, 1); r->rdma = rdma; if (mode[0] == 'w') { diff --git a/migration/savevm.c b/migration/savevm.c index 60712153fa..0ad1b93a8b 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -37,6 +37,7 @@ #include "qemu/timer.h" #include "audio/audio.h" #include "migration/migration.h" +#include "migration/postcopy-ram.h" #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "qemu/sockets.h" @@ -45,6 +46,7 @@ #include "exec/memory.h" #include "qmp-commands.h" #include "trace.h" +#include "qemu/bitops.h" #include "qemu/iov.h" #include "block/snapshot.h" #include "block/qapi.h" @@ -57,8 +59,26 @@ #define ARP_PTYPE_IP 0x0800 #define ARP_OP_REQUEST_REV 0x3 +const unsigned int postcopy_ram_discard_version = 0; + static bool skip_section_footers; +static struct mig_cmd_args { + ssize_t len; /* -1 = variable */ + const char *name; +} mig_cmd_args[] = { + [MIG_CMD_INVALID] = { .len = -1, .name = "INVALID" }, + [MIG_CMD_OPEN_RETURN_PATH] = { .len = 0, .name = "OPEN_RETURN_PATH" }, + [MIG_CMD_PING] = { .len = sizeof(uint32_t), .name = "PING" }, + [MIG_CMD_POSTCOPY_ADVISE] = { .len = 16, .name = "POSTCOPY_ADVISE" }, + [MIG_CMD_POSTCOPY_LISTEN] = { .len = 0, .name = "POSTCOPY_LISTEN" }, + [MIG_CMD_POSTCOPY_RUN] = { .len = 0, .name = "POSTCOPY_RUN" }, + [MIG_CMD_POSTCOPY_RAM_DISCARD] = { + .len = -1, .name = "POSTCOPY_RAM_DISCARD" }, + [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, + [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, +}; + static int announce_self_create(uint8_t *buf, uint8_t *mac_addr) { @@ -138,14 +158,15 @@ static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, return qiov.size; } -static int block_put_buffer(void *opaque, const uint8_t *buf, - int64_t pos, int size) +static ssize_t block_put_buffer(void *opaque, const uint8_t *buf, + int64_t pos, size_t size) { bdrv_save_vmstate(opaque, buf, pos, size); return size; } -static int block_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos, + size_t size) { return bdrv_load_vmstate(opaque, buf, pos, size); } @@ -480,7 +501,7 @@ int register_savevm_live(DeviceState *dev, { SaveStateEntry *se; - se = g_malloc0(sizeof(SaveStateEntry)); + se = g_new0(SaveStateEntry, 1); se->version_id = version_id; se->section_id = savevm_state.global_section_id++; se->ops = ops; @@ -498,7 +519,7 @@ int register_savevm_live(DeviceState *dev, pstrcat(se->idstr, sizeof(se->idstr), "/"); g_free(id); - se->compat = g_malloc0(sizeof(CompatEntry)); + se->compat = g_new0(CompatEntry, 1); pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr); se->compat->instance_id = instance_id == -1 ? 
calculate_compat_instance_id(idstr) : instance_id; @@ -526,7 +547,7 @@ int register_savevm(DeviceState *dev, LoadStateHandler *load_state, void *opaque) { - SaveVMHandlers *ops = g_malloc0(sizeof(SaveVMHandlers)); + SaveVMHandlers *ops = g_new0(SaveVMHandlers, 1); ops->save_state = save_state; ops->load_state = load_state; return register_savevm_live(dev, idstr, instance_id, version_id, @@ -551,9 +572,7 @@ void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque) QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) { if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) { QTAILQ_REMOVE(&savevm_state.handlers, se, entry); - if (se->compat) { - g_free(se->compat); - } + g_free(se->compat); g_free(se->ops); g_free(se); } @@ -570,7 +589,7 @@ int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, /* If this triggers, alias support can be dropped for the vmsd. */ assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id); - se = g_malloc0(sizeof(SaveStateEntry)); + se = g_new0(SaveStateEntry, 1); se->version_id = vmsd->version_id; se->section_id = savevm_state.global_section_id++; se->opaque = opaque; @@ -584,7 +603,7 @@ int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, pstrcat(se->idstr, sizeof(se->idstr), "/"); g_free(id); - se->compat = g_malloc0(sizeof(CompatEntry)); + se->compat = g_new0(CompatEntry, 1); pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name); se->compat->instance_id = instance_id == -1 ? calculate_compat_instance_id(vmsd->name) : instance_id; @@ -612,9 +631,7 @@ void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd, QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) { if (se->vmsd == vmsd && se->opaque == opaque) { QTAILQ_REMOVE(&savevm_state.handlers, se, entry); - if (se->compat) { - g_free(se->compat); - } + g_free(se->compat); g_free(se); } } @@ -697,6 +714,156 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se) } } +/** + * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the + * command and associated data. + * + * @f: File to send command on + * @command: Command type to send + * @len: Length of associated data + * @data: Data associated with command. + */ +void qemu_savevm_command_send(QEMUFile *f, + enum qemu_vm_cmd command, + uint16_t len, + uint8_t *data) +{ + trace_savevm_command_send(command, len); + qemu_put_byte(f, QEMU_VM_COMMAND); + qemu_put_be16(f, (uint16_t)command); + qemu_put_be16(f, len); + qemu_put_buffer(f, data, len); + qemu_fflush(f); +} + +void qemu_savevm_send_ping(QEMUFile *f, uint32_t value) +{ + uint32_t buf; + + trace_savevm_send_ping(value); + buf = cpu_to_be32(value); + qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf); +} + +void qemu_savevm_send_open_return_path(QEMUFile *f) +{ + trace_savevm_send_open_return_path(); + qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL); +} + +/* We have a buffer of data to send; we don't want that all to be loaded + * by the command itself, so the command contains just the length of the + * extra buffer that we then send straight after it. 
+ * TODO: Must be a better way to organise that
+ *
+ * Returns:
+ *    0 on success
+ *    -ve on error
+ */
+int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb)
+{
+    size_t cur_iov;
+    size_t len = qsb_get_length(qsb);
+    uint32_t tmp;
+
+    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
+        error_report("%s: Unreasonably large packaged state: %zu",
+                     __func__, len);
+        return -1;
+    }
+
+    tmp = cpu_to_be32(len);
+
+    trace_qemu_savevm_send_packaged();
+    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
+
+    /* all the data follows (concatenating the iovs) */
+    for (cur_iov = 0; cur_iov < qsb->n_iov; cur_iov++) {
+        /* The iov entries are partially filled */
+        size_t towrite = MIN(qsb->iov[cur_iov].iov_len, len);
+        len -= towrite;
+
+        if (!towrite) {
+            break;
+        }
+
+        qemu_put_buffer(f, qsb->iov[cur_iov].iov_base, towrite);
+    }
+
+    return 0;
+}
+
+/* Send prior to any postcopy transfer */
+void qemu_savevm_send_postcopy_advise(QEMUFile *f)
+{
+    uint64_t tmp[2];
+    tmp[0] = cpu_to_be64(getpagesize());
+    tmp[1] = cpu_to_be64(1ul << qemu_target_page_bits());
+
+    trace_qemu_savevm_send_postcopy_advise();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp);
+}
+
+/* Sent prior to starting the destination running in postcopy, discard pages
+ * that have already been sent but redirtied on the source.
+ * CMD_POSTCOPY_RAM_DISCARD consists of:
+ *      byte   version (0)
+ *      byte   Length of name field (not including 0)
+ *  n x byte   RAM block name
+ *      byte   0 terminator (just for safety)
+ *  n x        Byte ranges within the named RAMBlock
+ *      be64   Start of the range
+ *      be64   Length
+ *
+ *  name:  RAMBlock name that these entries are part of
+ *  len: Number of page entries
+ *  start_list: 'len' addresses
+ *  length_list: 'len' addresses
+ *
+ */
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+                                           uint16_t len,
+                                           uint64_t *start_list,
+                                           uint64_t *length_list)
+{
+    uint8_t *buf;
+    uint16_t tmplen;
+    uint16_t t;
+    size_t name_len = strlen(name);
+
+    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
+    assert(name_len < 256);
+    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
+    buf[0] = postcopy_ram_discard_version;
+    buf[1] = name_len;
+    memcpy(buf + 2, name, name_len);
+    tmplen = 2 + name_len;
+    buf[tmplen++] = '\0';
+
+    for (t = 0; t < len; t++) {
+        cpu_to_be64w((uint64_t *)(buf + tmplen), start_list[t]);
+        tmplen += 8;
+        cpu_to_be64w((uint64_t *)(buf + tmplen), length_list[t]);
+        tmplen += 8;
+    }
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
+    g_free(buf);
+}
+
+/* Get the destination into a state where it can receive postcopy data. */
+void qemu_savevm_send_postcopy_listen(QEMUFile *f)
+{
+    trace_savevm_send_postcopy_listen();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
+}
+
+/* Kick the destination into running */
+void qemu_savevm_send_postcopy_run(QEMUFile *f)
+{
+    trace_savevm_send_postcopy_run();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
+}
+
 bool qemu_savevm_state_blocked(Error **errp)
 {
     SaveStateEntry *se;
@@ -716,6 +883,12 @@ void qemu_savevm_state_header(QEMUFile *f)
     trace_savevm_state_header();
     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
+
+    if (!savevm_state.skip_configuration) {
+        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
+        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
+    }
+
 }
 
 void qemu_savevm_state_begin(QEMUFile *f,
@@ -732,11 +905,6 @@ void qemu_savevm_state_begin(QEMUFile *f,
         se->ops->set_params(params, se->opaque);
     }
 
-    if (!savevm_state.skip_configuration) {
-        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
-        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
-    }
-
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
         if (!se->ops || !se->ops->save_live_setup) {
             continue;
@@ -763,7 +931,7 @@ void qemu_savevm_state_begin(QEMUFile *f,
  * 0 : We haven't finished, caller have to go again
  * 1 : We have finished, we can go to complete phase
  */
-int qemu_savevm_state_iterate(QEMUFile *f)
+int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
 {
     SaveStateEntry *se;
     int ret = 1;
@@ -778,6 +946,15 @@ int qemu_savevm_state_iterate(QEMUFile *f)
                 continue;
             }
         }
+        /*
+         * In the postcopy phase, any device that doesn't know how to
+         * do postcopy should have saved its state in the _complete
+         * call that's already run, it might get confused if we call
+         * iterate afterwards.
+         */
+        if (postcopy && !se->ops->save_live_complete_postcopy) {
+            continue;
+        }
         if (qemu_file_rate_limit(f)) {
             return 0;
         }
@@ -806,24 +983,69 @@ int qemu_savevm_state_iterate(QEMUFile *f)
 static bool should_send_vmdesc(void)
 {
     MachineState *machine = MACHINE(qdev_get_machine());
-    return !machine->suppress_vmdesc;
+    bool in_postcopy = migration_in_postcopy(migrate_get_current());
+    return !machine->suppress_vmdesc && !in_postcopy;
 }
 
-void qemu_savevm_state_complete(QEMUFile *f)
+/*
+ * Calls the save_live_complete_postcopy methods
+ * causing the last few pages to be sent immediately and doing any associated
+ * cleanup.
+ * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
+ * all the other devices, but that happens at the point we switch to postcopy.
+ */ +void qemu_savevm_state_complete_postcopy(QEMUFile *f) +{ + SaveStateEntry *se; + int ret; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (!se->ops || !se->ops->save_live_complete_postcopy) { + continue; + } + if (se->ops && se->ops->is_active) { + if (!se->ops->is_active(se->opaque)) { + continue; + } + } + trace_savevm_section_start(se->idstr, se->section_id); + /* Section type */ + qemu_put_byte(f, QEMU_VM_SECTION_END); + qemu_put_be32(f, se->section_id); + + ret = se->ops->save_live_complete_postcopy(f, se->opaque); + trace_savevm_section_end(se->idstr, se->section_id, ret); + save_section_footer(f, se); + if (ret < 0) { + qemu_file_set_error(f, ret); + return; + } + } + + qemu_put_byte(f, QEMU_VM_EOF); + qemu_fflush(f); +} + +void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only) { QJSON *vmdesc; int vmdesc_len; SaveStateEntry *se; int ret; + bool in_postcopy = migration_in_postcopy(migrate_get_current()); - trace_savevm_state_complete(); + trace_savevm_state_complete_precopy(); cpu_synchronize_all_states(); QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || !se->ops->save_live_complete) { + if (!se->ops || + (in_postcopy && se->ops->save_live_complete_postcopy) || + (in_postcopy && !iterable_only) || + !se->ops->save_live_complete_precopy) { continue; } + if (se->ops && se->ops->is_active) { if (!se->ops->is_active(se->opaque)) { continue; @@ -833,7 +1055,7 @@ void qemu_savevm_state_complete(QEMUFile *f) save_section_header(f, se, QEMU_VM_SECTION_END); - ret = se->ops->save_live_complete(f, se->opaque); + ret = se->ops->save_live_complete_precopy(f, se->opaque); trace_savevm_section_end(se->idstr, se->section_id, ret); save_section_footer(f, se); if (ret < 0) { @@ -842,6 +1064,10 @@ void qemu_savevm_state_complete(QEMUFile *f) } } + if (iterable_only) { + return; + } + vmdesc = qjson_new(); json_prop_int(vmdesc, "page_size", TARGET_PAGE_SIZE); json_start_array(vmdesc, "devices"); @@ -870,7 +1096,10 @@ void qemu_savevm_state_complete(QEMUFile *f) save_section_footer(f, se); } - qemu_put_byte(f, QEMU_VM_EOF); + if (!in_postcopy) { + /* Postcopy stream will still be going */ + qemu_put_byte(f, QEMU_VM_EOF); + } json_end_array(vmdesc); qjson_finish(vmdesc); @@ -886,10 +1115,19 @@ void qemu_savevm_state_complete(QEMUFile *f) qemu_fflush(f); } -uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size) +/* Give an estimate of the amount left to be transferred, + * the result is split into the amount for units that can and + * for units that can't do postcopy. 
+ */
+void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
+                               uint64_t *res_non_postcopiable,
+                               uint64_t *res_postcopiable)
 {
     SaveStateEntry *se;
-    uint64_t ret = 0;
+
+    *res_non_postcopiable = 0;
+    *res_postcopiable = 0;
+
 
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
         if (!se->ops || !se->ops->save_live_pending) {
@@ -900,19 +1138,19 @@ uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
                 continue;
             }
         }
-        ret += se->ops->save_live_pending(f, se->opaque, max_size);
+        se->ops->save_live_pending(f, se->opaque, max_size,
+                                   res_non_postcopiable, res_postcopiable);
     }
-    return ret;
 }
 
-void qemu_savevm_state_cancel(void)
+void qemu_savevm_state_cleanup(void)
 {
     SaveStateEntry *se;
 
-    trace_savevm_state_cancel();
+    trace_savevm_state_cleanup();
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
-        if (se->ops && se->ops->cancel) {
-            se->ops->cancel(se->opaque);
+        if (se->ops && se->ops->cleanup) {
+            se->ops->cleanup(se->opaque);
         }
     }
 }
@@ -924,6 +1162,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
         .blk = 0,
         .shared = 0
     };
+    MigrationState *ms = migrate_init(&params);
+    ms->file = f;
 
     if (qemu_savevm_state_blocked(errp)) {
         return -EINVAL;
@@ -935,18 +1175,18 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
     qemu_mutex_lock_iothread();
 
     while (qemu_file_get_error(f) == 0) {
-        if (qemu_savevm_state_iterate(f) > 0) {
+        if (qemu_savevm_state_iterate(f, false) > 0) {
             break;
         }
     }
 
     ret = qemu_file_get_error(f);
     if (ret == 0) {
-        qemu_savevm_state_complete(f);
+        qemu_savevm_state_complete_precopy(f, false);
         ret = qemu_file_get_error(f);
     }
+    qemu_savevm_state_cleanup();
     if (ret != 0) {
-        qemu_savevm_state_cancel();
         error_setg_errno(errp, -ret, "Error while writing VM state");
     }
     return ret;
@@ -1004,6 +1244,420 @@ static SaveStateEntry *find_se(const char *idstr, int instance_id)
     return NULL;
 }
 
+enum LoadVMExitCodes {
+    /* Allow a command to quit all layers of nested loadvm loops */
+    LOADVM_QUIT = 1,
+};
+
+static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
+
+/* ------ incoming postcopy messages ------ */
+/* 'advise' arrives before any transfers just to tell us that a postcopy
+ * *might* happen - it might be skipped if precopy transferred everything
+ * quickly.
+ */
+static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+    uint64_t remote_hps, remote_tps;
+
+    trace_loadvm_postcopy_handle_advise();
+    if (ps != POSTCOPY_INCOMING_NONE) {
+        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+
+    if (!postcopy_ram_supported_by_host()) {
+        return -1;
+    }
+
+    remote_hps = qemu_get_be64(mis->from_src_file);
+    if (remote_hps != getpagesize()) {
+        /*
+         * Some combinations of mismatch are probably possible but it gets
+         * a bit more complicated. In particular we need to place whole
+         * host pages on the dest at once, and we need to ensure that we
+         * handle dirtying to make sure we never end up sending part of
+         * a hostpage on its own.
+         */
+        error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
+                     (int)remote_hps, getpagesize());
+        return -1;
+    }
+
+    remote_tps = qemu_get_be64(mis->from_src_file);
+    if (remote_tps != (1ul << qemu_target_page_bits())) {
+        /*
+         * Again, some differences could be dealt with, but for now keep it
+         * simple.
+ */ + error_report("Postcopy needs matching target page sizes (s=%d d=%d)", + (int)remote_tps, 1 << qemu_target_page_bits()); + return -1; + } + + if (ram_postcopy_incoming_init(mis)) { + return -1; + } + + postcopy_state_set(POSTCOPY_INCOMING_ADVISE); + + return 0; +} + +/* After postcopy we will be told to throw some pages away since they're + * dirty and will have to be demand fetched. Must happen before CPU is + * started. + * There can be 0..many of these messages, each encoding multiple pages. + */ +static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, + uint16_t len) +{ + int tmp; + char ramid[256]; + PostcopyState ps = postcopy_state_get(); + + trace_loadvm_postcopy_ram_handle_discard(); + + switch (ps) { + case POSTCOPY_INCOMING_ADVISE: + /* 1st discard */ + tmp = postcopy_ram_prepare_discard(mis); + if (tmp) { + return tmp; + } + break; + + case POSTCOPY_INCOMING_DISCARD: + /* Expected state */ + break; + + default: + error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)", + ps); + return -1; + } + /* We're expecting a + * Version (0) + * a RAM ID string (length byte, name, 0 term) + * then at least 1 16 byte chunk + */ + if (len < (1 + 1 + 1 + 1 + 2 * 8)) { + error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); + return -1; + } + + tmp = qemu_get_byte(mis->from_src_file); + if (tmp != postcopy_ram_discard_version) { + error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp); + return -1; + } + + if (!qemu_get_counted_string(mis->from_src_file, ramid)) { + error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID"); + return -1; + } + tmp = qemu_get_byte(mis->from_src_file); + if (tmp != 0) { + error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp); + return -1; + } + + len -= 3 + strlen(ramid); + if (len % 16) { + error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); + return -1; + } + trace_loadvm_postcopy_ram_handle_discard_header(ramid, len); + while (len) { + uint64_t start_addr, block_length; + start_addr = qemu_get_be64(mis->from_src_file); + block_length = qemu_get_be64(mis->from_src_file); + + len -= 16; + int ret = ram_discard_range(mis, ramid, start_addr, + block_length); + if (ret) { + return ret; + } + } + trace_loadvm_postcopy_ram_handle_discard_end(); + + return 0; +} + +/* + * Triggered by a postcopy_listen command; this thread takes over reading + * the input stream, leaving the main thread free to carry on loading the rest + * of the device state (from RAM). + * (TODO:This could do with being in a postcopy file - but there again it's + * just another input loop, not that postcopy specific) + */ +static void *postcopy_ram_listen_thread(void *opaque) +{ + QEMUFile *f = opaque; + MigrationIncomingState *mis = migration_incoming_get_current(); + int load_res; + + qemu_sem_post(&mis->listen_thread_sem); + trace_postcopy_ram_listen_thread_start(); + + /* + * Because we're a thread and not a coroutine we can't yield + * in qemu_file, and thus we must be blocking now. 
+ */ + qemu_file_set_blocking(f, true); + load_res = qemu_loadvm_state_main(f, mis); + /* And non-blocking again so we don't block in any cleanup */ + qemu_file_set_blocking(f, false); + + trace_postcopy_ram_listen_thread_exit(); + if (load_res < 0) { + error_report("%s: loadvm failed: %d", __func__, load_res); + qemu_file_set_error(f, load_res); + } else { + /* + * This looks good, but it's possible that the device loading in the + * main thread hasn't finished yet, and so we might not be in 'RUN' + * state yet; wait for the end of the main thread. + */ + qemu_event_wait(&mis->main_thread_load_event); + } + postcopy_ram_incoming_cleanup(mis); + /* + * If everything has worked fine, then the main thread has waited + * for us to start, and we're the last use of the mis. + * (If something broke then qemu will have to exit anyway since it's + * got a bad migration state). + */ + migration_incoming_state_destroy(); + + if (load_res < 0) { + /* + * If something went wrong then we have a bad state so exit; + * depending how far we got it might be possible at this point + * to leave the guest running and fire MCEs for pages that never + * arrived as a desperate recovery step. + */ + exit(EXIT_FAILURE); + } + + return NULL; +} + +/* After this message we must be able to immediately receive postcopy data */ +static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) +{ + PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING); + trace_loadvm_postcopy_handle_listen(); + if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) { + error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps); + return -1; + } + if (ps == POSTCOPY_INCOMING_ADVISE) { + /* + * A rare case, we entered listen without having to do any discards, + * so do the setup that's normally done at the time of the 1st discard. 
+ */ + postcopy_ram_prepare_discard(mis); + } + + /* + * Sensitise RAM - can now generate requests for blocks that don't exist + * However, at this point the CPU shouldn't be running, and the IO + * shouldn't be doing anything yet so don't actually expect requests + */ + if (postcopy_ram_enable_notify(mis)) { + return -1; + } + + if (mis->have_listen_thread) { + error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread"); + return -1; + } + + mis->have_listen_thread = true; + /* Start up the listening thread and wait for it to signal ready */ + qemu_sem_init(&mis->listen_thread_sem, 0); + qemu_thread_create(&mis->listen_thread, "postcopy/listen", + postcopy_ram_listen_thread, mis->from_src_file, + QEMU_THREAD_JOINABLE); + qemu_sem_wait(&mis->listen_thread_sem); + qemu_sem_destroy(&mis->listen_thread_sem); + + return 0; +} + +/* After all discards we can start running and asking for pages */ +static int loadvm_postcopy_handle_run(MigrationIncomingState *mis) +{ + PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING); + Error *local_err = NULL; + + trace_loadvm_postcopy_handle_run(); + if (ps != POSTCOPY_INCOMING_LISTENING) { + error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps); + return -1; + } + + /* TODO we should move all of this lot into postcopy_ram.c or a shared code + * in migration.c + */ + cpu_synchronize_all_post_init(); + + qemu_announce_self(); + + /* Make sure all file formats flush their mutable metadata */ + bdrv_invalidate_cache_all(&local_err); + if (local_err) { + error_report_err(local_err); + return -1; + } + + trace_loadvm_postcopy_handle_run_cpu_sync(); + cpu_synchronize_all_post_init(); + + trace_loadvm_postcopy_handle_run_vmstart(); + + if (autostart) { + /* Hold onto your hats, starting the CPU */ + vm_start(); + } else { + /* leave it paused and let management decide when to start the CPU */ + runstate_set(RUN_STATE_PAUSED); + } + + /* We need to finish reading the stream from the package + * and also stop reading anything more from the stream that loaded the + * package (since it's now being read by the listener thread). + * LOADVM_QUIT will quit all the layers of nested loadvm loops. + */ + return LOADVM_QUIT; +} + +/** + * Immediately following this command is a blob of data containing an embedded + * chunk of migration stream; read it and load it. + * + * @mis: Incoming state + * @length: Length of packaged data to read + * + * Returns: Negative values on error + * + */ +static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) +{ + int ret; + uint8_t *buffer; + uint32_t length; + QEMUSizedBuffer *qsb; + + length = qemu_get_be32(mis->from_src_file); + trace_loadvm_handle_cmd_packaged(length); + + if (length > MAX_VM_CMD_PACKAGED_SIZE) { + error_report("Unreasonably large packaged state: %u", length); + return -1; + } + buffer = g_malloc0(length); + ret = qemu_get_buffer(mis->from_src_file, buffer, (int)length); + if (ret != length) { + g_free(buffer); + error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%d\n", + ret, length); + return (ret < 0) ? 
ret : -EAGAIN; + } + trace_loadvm_handle_cmd_packaged_received(ret); + + /* Setup a dummy QEMUFile that actually reads from the buffer */ + qsb = qsb_create(buffer, length); + g_free(buffer); /* Because qsb_create copies */ + if (!qsb) { + error_report("Unable to create qsb"); + } + QEMUFile *packf = qemu_bufopen("r", qsb); + + ret = qemu_loadvm_state_main(packf, mis); + trace_loadvm_handle_cmd_packaged_main(ret); + qemu_fclose(packf); + qsb_free(qsb); + + return ret; +} + +/* + * Process an incoming 'QEMU_VM_COMMAND' + * 0 just a normal return + * LOADVM_QUIT All good, but exit the loop + * <0 Error + */ +static int loadvm_process_command(QEMUFile *f) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + uint16_t cmd; + uint16_t len; + uint32_t tmp32; + + cmd = qemu_get_be16(f); + len = qemu_get_be16(f); + + trace_loadvm_process_command(cmd, len); + if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) { + error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len); + return -EINVAL; + } + + if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) { + error_report("%s received with bad length - expecting %zu, got %d", + mig_cmd_args[cmd].name, + (size_t)mig_cmd_args[cmd].len, len); + return -ERANGE; + } + + switch (cmd) { + case MIG_CMD_OPEN_RETURN_PATH: + if (mis->to_src_file) { + error_report("CMD_OPEN_RETURN_PATH called when RP already open"); + /* Not really a problem, so don't give up */ + return 0; + } + mis->to_src_file = qemu_file_get_return_path(f); + if (!mis->to_src_file) { + error_report("CMD_OPEN_RETURN_PATH failed"); + return -1; + } + break; + + case MIG_CMD_PING: + tmp32 = qemu_get_be32(f); + trace_loadvm_process_command_ping(tmp32); + if (!mis->to_src_file) { + error_report("CMD_PING (0x%x) received with no return path", + tmp32); + return -1; + } + migrate_send_rp_pong(mis, tmp32); + break; + + case MIG_CMD_PACKAGED: + return loadvm_handle_cmd_packaged(mis); + + case MIG_CMD_POSTCOPY_ADVISE: + return loadvm_postcopy_handle_advise(mis); + + case MIG_CMD_POSTCOPY_LISTEN: + return loadvm_postcopy_handle_listen(mis); + + case MIG_CMD_POSTCOPY_RUN: + return loadvm_postcopy_handle_run(mis); + + case MIG_CMD_POSTCOPY_RAM_DISCARD: + return loadvm_postcopy_ram_handle_discard(mis, len); + } + + return 0; +} + struct LoadStateEntry { QLIST_ENTRY(LoadStateEntry) entry; SaveStateEntry *se; @@ -1056,47 +1710,10 @@ void loadvm_free_handlers(MigrationIncomingState *mis) } } -int qemu_loadvm_state(QEMUFile *f) +static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) { - MigrationIncomingState *mis = migration_incoming_get_current(); - Error *local_err = NULL; uint8_t section_type; - unsigned int v; int ret; - int file_error_after_eof = -1; - - if (qemu_savevm_state_blocked(&local_err)) { - error_report_err(local_err); - return -EINVAL; - } - - v = qemu_get_be32(f); - if (v != QEMU_VM_FILE_MAGIC) { - error_report("Not a migration stream"); - return -EINVAL; - } - - v = qemu_get_be32(f); - if (v == QEMU_VM_FILE_VERSION_COMPAT) { - error_report("SaveVM v2 format is obsolete and don't work anymore"); - return -ENOTSUP; - } - if (v != QEMU_VM_FILE_VERSION) { - error_report("Unsupported migration stream version"); - return -ENOTSUP; - } - - if (!savevm_state.skip_configuration) { - if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) { - error_report("Configuration section missing"); - return -EINVAL; - } - ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0); - - if (ret) { - return ret; - } - } while ((section_type = qemu_get_byte(f)) != 
QEMU_VM_EOF) { uint32_t instance_id, version_id, section_id; @@ -1125,16 +1742,14 @@ int qemu_loadvm_state(QEMUFile *f) if (se == NULL) { error_report("Unknown savevm section or instance '%s' %d", idstr, instance_id); - ret = -EINVAL; - goto out; + return -EINVAL; } /* Validate version */ if (version_id > se->version_id) { error_report("savevm: unsupported version %d for '%s' v%d", version_id, idstr, se->version_id); - ret = -EINVAL; - goto out; + return -EINVAL; } /* Add entry */ @@ -1149,11 +1764,10 @@ int qemu_loadvm_state(QEMUFile *f) if (ret < 0) { error_report("error while loading state for instance 0x%x of" " device '%s'", instance_id, idstr); - goto out; + return ret; } if (!check_section_footer(f, le)) { - ret = -EINVAL; - goto out; + return -EINVAL; } break; case QEMU_VM_SECTION_PART: @@ -1168,29 +1782,88 @@ int qemu_loadvm_state(QEMUFile *f) } if (le == NULL) { error_report("Unknown savevm section %d", section_id); - ret = -EINVAL; - goto out; + return -EINVAL; } ret = vmstate_load(f, le->se, le->version_id); if (ret < 0) { error_report("error while loading state section id %d(%s)", section_id, le->se->idstr); - goto out; + return ret; } if (!check_section_footer(f, le)) { - ret = -EINVAL; - goto out; + return -EINVAL; + } + break; + case QEMU_VM_COMMAND: + ret = loadvm_process_command(f); + trace_qemu_loadvm_state_section_command(ret); + if ((ret < 0) || (ret & LOADVM_QUIT)) { + return ret; } break; default: error_report("Unknown savevm section type %d", section_type); - ret = -EINVAL; - goto out; + return -EINVAL; } } - file_error_after_eof = qemu_file_get_error(f); + return 0; +} + +int qemu_loadvm_state(QEMUFile *f) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + Error *local_err = NULL; + unsigned int v; + int ret; + + if (qemu_savevm_state_blocked(&local_err)) { + error_report_err(local_err); + return -EINVAL; + } + + v = qemu_get_be32(f); + if (v != QEMU_VM_FILE_MAGIC) { + error_report("Not a migration stream"); + return -EINVAL; + } + + v = qemu_get_be32(f); + if (v == QEMU_VM_FILE_VERSION_COMPAT) { + error_report("SaveVM v2 format is obsolete and don't work anymore"); + return -ENOTSUP; + } + if (v != QEMU_VM_FILE_VERSION) { + error_report("Unsupported migration stream version"); + return -ENOTSUP; + } + + if (!savevm_state.skip_configuration) { + if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) { + error_report("Configuration section missing"); + return -EINVAL; + } + ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0); + + if (ret) { + return ret; + } + } + + ret = qemu_loadvm_state_main(f, mis); + qemu_event_set(&mis->main_thread_load_event); + + trace_qemu_loadvm_state_post_main(ret); + + if (mis->have_listen_thread) { + /* Listen thread still going, can't clean up yet */ + return ret; + } + + if (ret == 0) { + ret = qemu_file_get_error(f); + } /* * Try to read in the VMDESC section as well, so that dumping tools that @@ -1202,10 +1875,10 @@ int qemu_loadvm_state(QEMUFile *f) * We also mustn't read data that isn't there; some transports (RDMA) * will stall waiting for that data when the source has already closed. 
*/ - if (should_send_vmdesc()) { + if (ret == 0 && should_send_vmdesc()) { uint8_t *buf; uint32_t size; - section_type = qemu_get_byte(f); + uint8_t section_type = qemu_get_byte(f); if (section_type != QEMU_VM_VMDESCRIPTION) { error_report("Expected vmdescription section, but got %d", @@ -1229,57 +1902,9 @@ int qemu_loadvm_state(QEMUFile *f) cpu_synchronize_all_post_init(); - ret = 0; - -out: - if (ret == 0) { - /* We may not have a VMDESC section, so ignore relative errors */ - ret = file_error_after_eof; - } - return ret; } -static BlockDriverState *find_vmstate_bs(void) -{ - BlockDriverState *bs = NULL; - while ((bs = bdrv_next(bs))) { - if (bdrv_can_snapshot(bs)) { - return bs; - } - } - return NULL; -} - -/* - * Deletes snapshots of a given name in all opened images. - */ -static int del_existing_snapshots(Monitor *mon, const char *name) -{ - BlockDriverState *bs; - QEMUSnapshotInfo sn1, *snapshot = &sn1; - Error *err = NULL; - - bs = NULL; - while ((bs = bdrv_next(bs))) { - if (bdrv_can_snapshot(bs) && - bdrv_snapshot_find(bs, snapshot, name) >= 0) { - bdrv_snapshot_delete_by_id_or_name(bs, name, &err); - if (err) { - monitor_printf(mon, - "Error while deleting snapshot on device '%s':" - " %s\n", - bdrv_get_device_name(bs), - error_get_pretty(err)); - error_free(err); - return -1; - } - } - } - - return 0; -} - void hmp_savevm(Monitor *mon, const QDict *qdict) { BlockDriverState *bs, *bs1; @@ -1292,27 +1917,29 @@ void hmp_savevm(Monitor *mon, const QDict *qdict) struct tm tm; const char *name = qdict_get_try_str(qdict, "name"); Error *local_err = NULL; + AioContext *aio_context; - /* Verify if there is a device that doesn't support snapshots and is writable */ - bs = NULL; - while ((bs = bdrv_next(bs))) { - - if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { - continue; - } + if (!bdrv_all_can_snapshot(&bs)) { + monitor_printf(mon, "Device '%s' is writable but does not " + "support snapshots.\n", bdrv_get_device_name(bs)); + return; + } - if (!bdrv_can_snapshot(bs)) { - monitor_printf(mon, "Device '%s' is writable but does not support snapshots.\n", - bdrv_get_device_name(bs)); - return; - } + /* Delete old snapshots of the same name */ + if (name && bdrv_all_delete_snapshot(name, &bs1, &local_err) < 0) { + monitor_printf(mon, + "Error while deleting snapshot on device '%s': %s\n", + bdrv_get_device_name(bs1), error_get_pretty(local_err)); + error_free(local_err); + return; } - bs = find_vmstate_bs(); - if (!bs) { + bs = bdrv_all_find_vmstate_bs(); + if (bs == NULL) { monitor_printf(mon, "No block device can accept snapshots\n"); return; } + aio_context = bdrv_get_aio_context(bs); saved_vm_running = runstate_is_running(); @@ -1323,6 +1950,8 @@ void hmp_savevm(Monitor *mon, const QDict *qdict) } vm_stop(RUN_STATE_SAVE_VM); + aio_context_acquire(aio_context); + memset(sn, 0, sizeof(*sn)); /* fill auxiliary fields */ @@ -1345,11 +1974,6 @@ void hmp_savevm(Monitor *mon, const QDict *qdict) strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm); } - /* Delete old snapshots of the same name */ - if (name && del_existing_snapshots(mon, name) < 0) { - goto the_end; - } - /* save the VM state */ f = qemu_fopen_bdrv(bs, 1); if (!f) { @@ -1365,22 +1989,14 @@ void hmp_savevm(Monitor *mon, const QDict *qdict) goto the_end; } - /* create the snapshots */ - - bs1 = NULL; - while ((bs1 = bdrv_next(bs1))) { - if (bdrv_can_snapshot(bs1)) { - /* Write VM state size only to the image that contains the state */ - sn->vm_state_size = (bs == bs1 ? 
vm_state_size : 0); - ret = bdrv_snapshot_create(bs1, sn); - if (ret < 0) { - monitor_printf(mon, "Error while creating snapshot on '%s'\n", - bdrv_get_device_name(bs1)); - } - } + ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs); + if (ret < 0) { + monitor_printf(mon, "Error while creating snapshot on '%s'\n", + bdrv_get_device_name(bs)); } the_end: + aio_context_release(aio_context); if (saved_vm_running) { vm_start(); } @@ -1419,15 +2035,31 @@ int load_vmstate(const char *name) QEMUSnapshotInfo sn; QEMUFile *f; int ret; + AioContext *aio_context; - bs_vm_state = find_vmstate_bs(); + if (!bdrv_all_can_snapshot(&bs)) { + error_report("Device '%s' is writable but does not support snapshots.", + bdrv_get_device_name(bs)); + return -ENOTSUP; + } + ret = bdrv_all_find_snapshot(name, &bs); + if (ret < 0) { + error_report("Device '%s' does not have the requested snapshot '%s'", + bdrv_get_device_name(bs), name); + return ret; + } + + bs_vm_state = bdrv_all_find_vmstate_bs(); if (!bs_vm_state) { error_report("No block device supports snapshots"); return -ENOTSUP; } + aio_context = bdrv_get_aio_context(bs_vm_state); /* Don't even try to load empty VM states */ + aio_context_acquire(aio_context); ret = bdrv_snapshot_find(bs_vm_state, &sn, name); + aio_context_release(aio_context); if (ret < 0) { return ret; } else if (sn.vm_state_size == 0) { @@ -1436,42 +2068,14 @@ int load_vmstate(const char *name) return -EINVAL; } - /* Verify if there is any device that doesn't support snapshots and is - writable and check if the requested snapshot is available too. */ - bs = NULL; - while ((bs = bdrv_next(bs))) { - - if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { - continue; - } - - if (!bdrv_can_snapshot(bs)) { - error_report("Device '%s' is writable but does not support snapshots.", - bdrv_get_device_name(bs)); - return -ENOTSUP; - } - - ret = bdrv_snapshot_find(bs, &sn, name); - if (ret < 0) { - error_report("Device '%s' does not have the requested snapshot '%s'", - bdrv_get_device_name(bs), name); - return ret; - } - } - /* Flush all IO requests so they don't interfere with the new state. 
*/ bdrv_drain_all(); - bs = NULL; - while ((bs = bdrv_next(bs))) { - if (bdrv_can_snapshot(bs)) { - ret = bdrv_snapshot_goto(bs, name); - if (ret < 0) { - error_report("Error %d while activating snapshot '%s' on '%s'", - ret, name, bdrv_get_device_name(bs)); - return ret; - } - } + ret = bdrv_all_goto_snapshot(name, &bs); + if (ret < 0) { + error_report("Error %d while activating snapshot '%s' on '%s'", + ret, name, bdrv_get_device_name(bs)); + return ret; } /* restore the VM state */ @@ -1483,9 +2087,12 @@ int load_vmstate(const char *name) qemu_system_reset(VMRESET_SILENT); migration_incoming_state_new(f); - ret = qemu_loadvm_state(f); + aio_context_acquire(aio_context); + ret = qemu_loadvm_state(f); qemu_fclose(f); + aio_context_release(aio_context); + migration_incoming_state_destroy(); if (ret < 0) { error_report("Error %d while loading VM state", ret); @@ -1501,43 +2108,34 @@ void hmp_delvm(Monitor *mon, const QDict *qdict) Error *err; const char *name = qdict_get_str(qdict, "name"); - if (!find_vmstate_bs()) { - monitor_printf(mon, "No block device supports snapshots\n"); - return; - } - - bs = NULL; - while ((bs = bdrv_next(bs))) { - if (bdrv_can_snapshot(bs)) { - err = NULL; - bdrv_snapshot_delete_by_id_or_name(bs, name, &err); - if (err) { - monitor_printf(mon, - "Error while deleting snapshot on device '%s':" - " %s\n", - bdrv_get_device_name(bs), - error_get_pretty(err)); - error_free(err); - } - } + if (bdrv_all_delete_snapshot(name, &bs, &err) < 0) { + monitor_printf(mon, + "Error while deleting snapshot on device '%s': %s\n", + bdrv_get_device_name(bs), error_get_pretty(err)); + error_free(err); } } void hmp_info_snapshots(Monitor *mon, const QDict *qdict) { BlockDriverState *bs, *bs1; - QEMUSnapshotInfo *sn_tab, *sn, s, *sn_info = &s; - int nb_sns, i, ret, available; + QEMUSnapshotInfo *sn_tab, *sn; + int nb_sns, i; int total; int *available_snapshots; + AioContext *aio_context; - bs = find_vmstate_bs(); + bs = bdrv_all_find_vmstate_bs(); if (!bs) { monitor_printf(mon, "No available block device supports snapshots\n"); return; } + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); nb_sns = bdrv_snapshot_list(bs, &sn_tab); + aio_context_release(aio_context); + if (nb_sns < 0) { monitor_printf(mon, "bdrv_snapshot_list: error %d\n", nb_sns); return; @@ -1548,24 +2146,10 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict) return; } - available_snapshots = g_malloc0(sizeof(int) * nb_sns); + available_snapshots = g_new0(int, nb_sns); total = 0; for (i = 0; i < nb_sns; i++) { - sn = &sn_tab[i]; - available = 1; - bs1 = NULL; - - while ((bs1 = bdrv_next(bs1))) { - if (bdrv_can_snapshot(bs1) && bs1 != bs) { - ret = bdrv_snapshot_find(bs1, sn_info, sn->id_str); - if (ret < 0) { - available = 0; - break; - } - } - } - - if (available) { + if (bdrv_all_find_snapshot(sn_tab[i].id_str, &bs1) == 0) { available_snapshots[total] = i; total++; } |