Diffstat (limited to 'migration')
-rw-r--r--  migration/Makefile.objs      |    2
-rw-r--r--  migration/block.c            |   31
-rw-r--r--  migration/migration.c        |  879
-rw-r--r--  migration/postcopy-ram.c     |  761
-rw-r--r--  migration/qemu-file-buf.c    |   11
-rw-r--r--  migration/qemu-file-stdio.c  |   17
-rw-r--r--  migration/qemu-file-unix.c   |  124
-rw-r--r--  migration/qemu-file.c        |   90
-rw-r--r--  migration/ram.c              | 1268
-rw-r--r--  migration/rdma.c             |   36
-rw-r--r--  migration/savevm.c           | 1074
11 files changed, 3695 insertions, 598 deletions
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
index d929e969ae..0cac6d707a 100644
--- a/migration/Makefile.objs
+++ b/migration/Makefile.objs
@@ -1,7 +1,7 @@
common-obj-y += migration.o tcp.o
common-obj-y += vmstate.o
common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o
-common-obj-y += xbzrle.o
+common-obj-y += xbzrle.o postcopy-ram.o
common-obj-$(CONFIG_RDMA) += rdma.o
common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
diff --git a/migration/block.c b/migration/block.c
index ed865ed23b..656f38f341 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -36,6 +36,8 @@
#define MAX_IS_ALLOCATED_SEARCH 65536
+#define MAX_INFLIGHT_IO 512
+
//#define DEBUG_BLK_MIGRATION
#ifdef DEBUG_BLK_MIGRATION
@@ -591,7 +593,7 @@ static int64_t get_remaining_dirty(void)
/* Called with iothread lock taken. */
-static void blk_mig_cleanup(void)
+static void block_migration_cleanup(void *opaque)
{
BlkMigDevState *bmds;
BlkMigBlock *blk;
@@ -618,11 +620,6 @@ static void blk_mig_cleanup(void)
blk_mig_unlock();
}
-static void block_migration_cancel(void *opaque)
-{
- blk_mig_cleanup();
-}
-
static int block_save_setup(QEMUFile *f, void *opaque)
{
int ret;
@@ -670,7 +667,10 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
blk_mig_lock();
while ((block_mig_state.submitted +
block_mig_state.read_done) * BLOCK_SIZE <
- qemu_file_get_rate_limit(f)) {
+ qemu_file_get_rate_limit(f) &&
+ (block_mig_state.submitted +
+ block_mig_state.read_done) <
+ MAX_INFLIGHT_IO) {
blk_mig_unlock();
if (block_mig_state.bulk_completed == 0) {
/* first finish the bulk phase */
@@ -750,11 +750,12 @@ static int block_save_complete(QEMUFile *f, void *opaque)
qemu_put_be64(f, BLK_MIG_FLAG_EOS);
- blk_mig_cleanup();
return 0;
}
-static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+ uint64_t *non_postcopiable_pending,
+ uint64_t *postcopiable_pending)
{
/* Estimate pending number of bytes to send */
uint64_t pending;
@@ -773,7 +774,8 @@ static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
qemu_mutex_unlock_iothread();
DPRINTF("Enter save live pending %" PRIu64 "\n", pending);
- return pending;
+ /* We don't do postcopy */
+ *non_postcopiable_pending += pending;
}
static int block_load(QEMUFile *f, void *opaque, int version_id)
@@ -808,6 +810,11 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
return -EINVAL;
}
bs = blk_bs(blk);
+ if (!bs) {
+ fprintf(stderr, "Block device %s has no medium\n",
+ device_name);
+ return -EINVAL;
+ }
if (bs != bs_prev) {
bs_prev = bs;
@@ -877,10 +884,10 @@ static SaveVMHandlers savevm_block_handlers = {
.set_params = block_set_params,
.save_live_setup = block_save_setup,
.save_live_iterate = block_save_iterate,
- .save_live_complete = block_save_complete,
+ .save_live_complete_precopy = block_save_complete,
.save_live_pending = block_save_pending,
.load_state = block_load,
- .cancel = block_migration_cancel,
+ .cleanup = block_migration_cleanup,
.is_active = block_is_active,
};
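
Note on the interface change above: save_live_pending handlers now fill two out-parameters, splitting the estimate into data that must be sent before switching to postcopy and data that can follow afterwards, instead of returning a single value. Below is a minimal sketch (not QEMU code) of a handler for a device that cannot be postcopied, mirroring block_save_pending; the QEMUFile/opaque arguments are dropped and my_device_remaining_bytes() is a hypothetical helper.

#include <stdint.h>

/* Hypothetical estimate of how many bytes this device still has to send */
static uint64_t my_device_remaining_bytes(void)
{
    return 4096;
}

/*
 * A device that cannot be postcopied reports everything through
 * *non_postcopiable_pending, exactly as block_save_pending does above.
 */
static void my_save_pending(uint64_t max_size,
                            uint64_t *non_postcopiable_pending,
                            uint64_t *postcopiable_pending)
{
    uint64_t pending = my_device_remaining_bytes();

    (void)max_size;             /* the estimate ignores the threshold */
    (void)postcopiable_pending; /* we don't do postcopy for this device */

    *non_postcopiable_pending += pending;
}

int main(void)
{
    uint64_t nonpost = 0, post = 0;

    my_save_pending(1024 * 1024, &nonpost, &post);
    return nonpost == 4096 ? 0 : 1;
}
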
diff --git a/migration/migration.c b/migration/migration.c
index c4a7d0b705..adc6b6f1c9 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -21,16 +21,20 @@
#include "sysemu/sysemu.h"
#include "block/block.h"
#include "qapi/qmp/qerror.h"
+#include "qapi/util.h"
#include "qemu/sockets.h"
#include "qemu/rcu.h"
#include "migration/block.h"
+#include "migration/postcopy-ram.h"
#include "qemu/thread.h"
#include "qmp-commands.h"
#include "trace.h"
-#include "qapi/util.h"
#include "qapi-event.h"
+#include "qom/cpu.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
-#define MAX_THROTTLE (32 << 20) /* Migration speed throttling */
+#define MAX_THROTTLE (32 << 20) /* Migration transfer speed throttling */
/* Amount of time to allocate to each "chunk" of bandwidth-throttled
* data. */
@@ -44,6 +48,9 @@
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/*0: means nocompress, 1: best speed, ... 9: best compress ratio */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
+/* Define default autoconverge cpu throttle migration parameters */
+#define DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL 20
+#define DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT 10
/* Migration XBZRLE default cache size */
#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)
@@ -53,6 +60,13 @@ static NotifierList migration_state_notifiers =
static bool deferred_incoming;
+/*
+ * Current state of incoming postcopy; note this is not part of
+ * MigrationIncomingState since its state is used during cleanup
+ * at the end as MIS is being freed.
+ */
+static PostcopyState incoming_postcopy_state;
+
/* When we add fault tolerance, we could have several
migrations at once. For now we don't need to add
dynamic creation of migration */
@@ -60,6 +74,7 @@ static bool deferred_incoming;
/* For outgoing */
MigrationState *migrate_get_current(void)
{
+ static bool once;
static MigrationState current_migration = {
.state = MIGRATION_STATUS_NONE,
.bandwidth_limit = MAX_THROTTLE,
@@ -71,8 +86,16 @@ MigrationState *migrate_get_current(void)
DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT,
.parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] =
DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT,
+ .parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL] =
+ DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL,
+ .parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT] =
+ DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT,
};
+ if (!once) {
+ qemu_mutex_init(&current_migration.src_page_req_mutex);
+ once = true;
+ }
return &current_migration;
}
@@ -86,15 +109,18 @@ MigrationIncomingState *migration_incoming_get_current(void)
MigrationIncomingState *migration_incoming_state_new(QEMUFile* f)
{
- mis_current = g_malloc0(sizeof(MigrationIncomingState));
- mis_current->file = f;
+ mis_current = g_new0(MigrationIncomingState, 1);
+ mis_current->from_src_file = f;
QLIST_INIT(&mis_current->loadvm_handlers);
+ qemu_mutex_init(&mis_current->rp_mutex);
+ qemu_event_init(&mis_current->main_thread_load_event, false);
return mis_current;
}
void migration_incoming_state_destroy(void)
{
+ qemu_event_destroy(&mis_current->main_thread_load_event);
loadvm_free_handlers(mis_current);
g_free(mis_current);
mis_current = NULL;
@@ -240,6 +266,35 @@ static void deferred_incoming_migration(Error **errp)
deferred_incoming = true;
}
+/* Request a range of pages from the source VM at the given
+ * start address.
+ * rbname: Name of the RAMBlock to request the page in, if NULL it's the same
+ * as the last request (a name must have been given previously)
+ * Start: Address offset within the RB
+ * Len: Length in bytes required - must be a multiple of pagesize
+ */
+void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname,
+ ram_addr_t start, size_t len)
+{
+ uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */
+ size_t msglen = 12; /* start + len */
+
+ *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
+ *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
+
+ if (rbname) {
+ int rbname_len = strlen(rbname);
+ assert(rbname_len < 256);
+
+ bufc[msglen++] = rbname_len;
+ memcpy(bufc + msglen, rbname, rbname_len);
+ msglen += rbname_len;
+ migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES_ID, msglen, bufc);
+ } else {
+ migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES, msglen, bufc);
+ }
+}
+
void qemu_start_incoming_migration(const char *uri, Error **errp)
{
const char *p;
@@ -270,12 +325,37 @@ static void process_incoming_migration_co(void *opaque)
{
QEMUFile *f = opaque;
Error *local_err = NULL;
+ MigrationIncomingState *mis;
+ PostcopyState ps;
int ret;
- migration_incoming_state_new(f);
+ mis = migration_incoming_state_new(f);
+ postcopy_state_set(POSTCOPY_INCOMING_NONE);
migrate_generate_event(MIGRATION_STATUS_ACTIVE);
+
ret = qemu_loadvm_state(f);
+ ps = postcopy_state_get();
+ trace_process_incoming_migration_co_end(ret, ps);
+ if (ps != POSTCOPY_INCOMING_NONE) {
+ if (ps == POSTCOPY_INCOMING_ADVISE) {
+ /*
+ * Where a migration had postcopy enabled (and thus went to advise)
+ * but managed to complete within the precopy period, we can use
+ * the normal exit.
+ */
+ postcopy_ram_incoming_cleanup(mis);
+ } else if (ret >= 0) {
+ /*
+ * Postcopy was started, cleanup should happen at the end of the
+ * postcopy thread.
+ */
+ trace_process_incoming_migration_co_postcopy_end_main();
+ return;
+ }
+ /* Else if something went wrong then just fall out of the normal exit */
+ }
+
qemu_fclose(f);
free_xbzrle_decoded_buf();
migration_incoming_state_destroy();
@@ -286,7 +366,6 @@ static void process_incoming_migration_co(void *opaque)
migrate_decompress_threads_join();
exit(EXIT_FAILURE);
}
- qemu_announce_self();
/* Make sure all file formats flush their mutable metadata */
bdrv_invalidate_cache_all(&local_err);
@@ -297,6 +376,12 @@ static void process_incoming_migration_co(void *opaque)
exit(EXIT_FAILURE);
}
+ /*
+ * This must happen after all error conditions are dealt with and
+ * we're sure the VM is going to be running on this host.
+ */
+ qemu_announce_self();
+
/* If global state section was not received or we are in running
state, we need to obey autostart. Any other state is set with
runstate_set. */
@@ -331,6 +416,50 @@ void process_incoming_migration(QEMUFile *f)
qemu_coroutine_enter(co, f);
}
+/*
+ * Send a message on the return channel back to the source
+ * of the migration.
+ */
+void migrate_send_rp_message(MigrationIncomingState *mis,
+ enum mig_rp_message_type message_type,
+ uint16_t len, void *data)
+{
+ trace_migrate_send_rp_message((int)message_type, len);
+ qemu_mutex_lock(&mis->rp_mutex);
+ qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
+ qemu_put_be16(mis->to_src_file, len);
+ qemu_put_buffer(mis->to_src_file, data, len);
+ qemu_fflush(mis->to_src_file);
+ qemu_mutex_unlock(&mis->rp_mutex);
+}
+
+/*
+ * Send a 'SHUT' message on the return channel with the given value
+ * to indicate that we've finished with the RP. Non-0 value indicates
+ * error.
+ */
+void migrate_send_rp_shut(MigrationIncomingState *mis,
+ uint32_t value)
+{
+ uint32_t buf;
+
+ buf = cpu_to_be32(value);
+ migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
+}
+
+/*
+ * Send a 'PONG' message on the return channel with the given value
+ * (normally in response to a 'PING')
+ */
+void migrate_send_rp_pong(MigrationIncomingState *mis,
+ uint32_t value)
+{
+ uint32_t buf;
+
+ buf = cpu_to_be32(value);
+ migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
+}
+
/* amount of nanoseconds we are willing to wait for migration to be down.
* the choice of nanoseconds is because it is the maximum resolution that
* get_clock() can achieve. It is an internal measure. All user-visible
@@ -378,10 +507,32 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS];
params->decompress_threads =
s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS];
+ params->x_cpu_throttle_initial =
+ s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
+ params->x_cpu_throttle_increment =
+ s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
return params;
}
+/*
+ * Return true if we're already in the middle of a migration
+ * (i.e. any of the active or setup states)
+ */
+static bool migration_is_setup_or_active(int state)
+{
+ switch (state) {
+ case MIGRATION_STATUS_ACTIVE:
+ case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+ case MIGRATION_STATUS_SETUP:
+ return true;
+
+ default:
+ return false;
+
+ }
+}
+
static void get_xbzrle_cache_stats(MigrationInfo *info)
{
if (migrate_use_xbzrle()) {
@@ -441,6 +592,44 @@ MigrationInfo *qmp_query_migrate(Error **errp)
info->disk->total = blk_mig_bytes_total();
}
+ if (cpu_throttle_active()) {
+ info->has_x_cpu_throttle_percentage = true;
+ info->x_cpu_throttle_percentage = cpu_throttle_get_percentage();
+ }
+
+ get_xbzrle_cache_stats(info);
+ break;
+ case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+ /* Mostly the same as active; TODO add some postcopy stats */
+ info->has_status = true;
+ info->has_total_time = true;
+ info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
+ - s->total_time;
+ info->has_expected_downtime = true;
+ info->expected_downtime = s->expected_downtime;
+ info->has_setup_time = true;
+ info->setup_time = s->setup_time;
+
+ info->has_ram = true;
+ info->ram = g_malloc0(sizeof(*info->ram));
+ info->ram->transferred = ram_bytes_transferred();
+ info->ram->remaining = ram_bytes_remaining();
+ info->ram->total = ram_bytes_total();
+ info->ram->duplicate = dup_mig_pages_transferred();
+ info->ram->skipped = skipped_mig_pages_transferred();
+ info->ram->normal = norm_mig_pages_transferred();
+ info->ram->normal_bytes = norm_mig_bytes_transferred();
+ info->ram->dirty_pages_rate = s->dirty_pages_rate;
+ info->ram->mbps = s->mbps;
+
+ if (blk_mig_active()) {
+ info->has_disk = true;
+ info->disk = g_malloc0(sizeof(*info->disk));
+ info->disk->transferred = blk_mig_bytes_transferred();
+ info->disk->remaining = blk_mig_bytes_remaining();
+ info->disk->total = blk_mig_bytes_total();
+ }
+
get_xbzrle_cache_stats(info);
break;
case MIGRATION_STATUS_COMPLETED:
@@ -484,8 +673,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
MigrationState *s = migrate_get_current();
MigrationCapabilityStatusList *cap;
- if (s->state == MIGRATION_STATUS_ACTIVE ||
- s->state == MIGRATION_STATUS_SETUP) {
+ if (migration_is_setup_or_active(s->state)) {
error_setg(errp, QERR_MIGRATION_ACTIVE);
return;
}
@@ -493,6 +681,20 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
for (cap = params; cap; cap = cap->next) {
s->enabled_capabilities[cap->value->capability] = cap->value->state;
}
+
+ if (migrate_postcopy_ram()) {
+ if (migrate_use_compression()) {
+ /* The decompression threads asynchronously write into RAM
+ * rather than use the atomic copies needed to avoid
+ * userfaulting. It should be possible to fix the decompression
+ * threads for compatibility in future.
+ */
+ error_report("Postcopy is not currently compatible with "
+ "compression");
+ s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM] =
+ false;
+ }
+ }
}
void qmp_migrate_set_parameters(bool has_compress_level,
@@ -500,7 +702,11 @@ void qmp_migrate_set_parameters(bool has_compress_level,
bool has_compress_threads,
int64_t compress_threads,
bool has_decompress_threads,
- int64_t decompress_threads, Error **errp)
+ int64_t decompress_threads,
+ bool has_x_cpu_throttle_initial,
+ int64_t x_cpu_throttle_initial,
+ bool has_x_cpu_throttle_increment,
+ int64_t x_cpu_throttle_increment, Error **errp)
{
MigrationState *s = migrate_get_current();
@@ -523,6 +729,18 @@ void qmp_migrate_set_parameters(bool has_compress_level,
"is invalid, it should be in the range of 1 to 255");
return;
}
+ if (has_x_cpu_throttle_initial &&
+ (x_cpu_throttle_initial < 1 || x_cpu_throttle_initial > 99)) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+ "x_cpu_throttle_initial",
+ "an integer in the range of 1 to 99");
+ }
+ if (has_x_cpu_throttle_increment &&
+ (x_cpu_throttle_increment < 1 || x_cpu_throttle_increment > 99)) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+ "x_cpu_throttle_increment",
+ "an integer in the range of 1 to 99");
+ }
if (has_compress_level) {
s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level;
@@ -534,6 +752,37 @@ void qmp_migrate_set_parameters(bool has_compress_level,
s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] =
decompress_threads;
}
+ if (has_x_cpu_throttle_initial) {
+ s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL] =
+ x_cpu_throttle_initial;
+ }
+
+ if (has_x_cpu_throttle_increment) {
+ s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT] =
+ x_cpu_throttle_increment;
+ }
+}
+
+void qmp_migrate_start_postcopy(Error **errp)
+{
+ MigrationState *s = migrate_get_current();
+
+ if (!migrate_postcopy_ram()) {
+ error_setg(errp, "Enable postcopy with migrate_set_capability before"
+ " the start of migration");
+ return;
+ }
+
+ if (s->state == MIGRATION_STATUS_NONE) {
+ error_setg(errp, "Postcopy must be started after migration has been"
+ " started");
+ return;
+ }
+ /*
+ * we don't error if migration has finished since that would be racy
+ * with issuing this command.
+ */
+ atomic_set(&s->start_postcopy, true);
}
/* shared migration helpers */
@@ -553,10 +802,15 @@ static void migrate_fd_cleanup(void *opaque)
qemu_bh_delete(s->cleanup_bh);
s->cleanup_bh = NULL;
+ flush_page_queue(s);
+
if (s->file) {
trace_migrate_fd_cleanup();
qemu_mutex_unlock_iothread();
- qemu_thread_join(&s->thread);
+ if (s->migration_thread_running) {
+ qemu_thread_join(&s->thread);
+ s->migration_thread_running = false;
+ }
qemu_mutex_lock_iothread();
migrate_compress_threads_join();
@@ -564,14 +818,12 @@ static void migrate_fd_cleanup(void *opaque)
s->file = NULL;
}
- assert(s->state != MIGRATION_STATUS_ACTIVE);
+ assert((s->state != MIGRATION_STATUS_ACTIVE) &&
+ (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE));
- if (s->state != MIGRATION_STATUS_COMPLETED) {
- qemu_savevm_state_cancel();
- if (s->state == MIGRATION_STATUS_CANCELLING) {
- migrate_set_state(s, MIGRATION_STATUS_CANCELLING,
- MIGRATION_STATUS_CANCELLED);
- }
+ if (s->state == MIGRATION_STATUS_CANCELLING) {
+ migrate_set_state(s, MIGRATION_STATUS_CANCELLING,
+ MIGRATION_STATUS_CANCELLED);
}
notifier_list_notify(&migration_state_notifiers, s);
@@ -591,10 +843,14 @@ static void migrate_fd_cancel(MigrationState *s)
QEMUFile *f = migrate_get_current()->file;
trace_migrate_fd_cancel();
+ if (s->rp_state.from_dst_file) {
+ /* shutdown the rp socket, causing the rp thread to shut down */
+ qemu_file_shutdown(s->rp_state.from_dst_file);
+ }
+
do {
old_state = s->state;
- if (old_state != MIGRATION_STATUS_SETUP &&
- old_state != MIGRATION_STATUS_ACTIVE) {
+ if (!migration_is_setup_or_active(old_state)) {
break;
}
migrate_set_state(s, old_state, MIGRATION_STATUS_CANCELLING);
@@ -638,35 +894,43 @@ bool migration_has_failed(MigrationState *s)
s->state == MIGRATION_STATUS_FAILED);
}
-static MigrationState *migrate_init(const MigrationParams *params)
+bool migration_in_postcopy(MigrationState *s)
{
- MigrationState *s = migrate_get_current();
- int64_t bandwidth_limit = s->bandwidth_limit;
- bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
- int64_t xbzrle_cache_size = s->xbzrle_cache_size;
- int compress_level = s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL];
- int compress_thread_count =
- s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS];
- int decompress_thread_count =
- s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS];
+ return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
+}
- memcpy(enabled_capabilities, s->enabled_capabilities,
- sizeof(enabled_capabilities));
+MigrationState *migrate_init(const MigrationParams *params)
+{
+ MigrationState *s = migrate_get_current();
- memset(s, 0, sizeof(*s));
+ /*
+ * Reinitialise all migration state, except
+ * parameters/capabilities that the user set, and
+ * locks.
+ */
+ s->bytes_xfer = 0;
+ s->xfer_limit = 0;
+ s->cleanup_bh = 0;
+ s->file = NULL;
+ s->state = MIGRATION_STATUS_NONE;
s->params = *params;
- memcpy(s->enabled_capabilities, enabled_capabilities,
- sizeof(enabled_capabilities));
- s->xbzrle_cache_size = xbzrle_cache_size;
-
- s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level;
- s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] =
- compress_thread_count;
- s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] =
- decompress_thread_count;
- s->bandwidth_limit = bandwidth_limit;
+ s->rp_state.from_dst_file = NULL;
+ s->rp_state.error = false;
+ s->mbps = 0.0;
+ s->downtime = 0;
+ s->expected_downtime = 0;
+ s->dirty_pages_rate = 0;
+ s->dirty_bytes_rate = 0;
+ s->setup_time = 0;
+ s->dirty_sync_count = 0;
+ s->start_postcopy = false;
+ s->migration_thread_running = false;
+ s->last_req_rb = NULL;
+
migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
+ QSIMPLEQ_INIT(&s->src_page_requests);
+
s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
return s;
}
@@ -718,8 +982,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
params.blk = has_blk && blk;
params.shared = has_inc && inc;
- if (s->state == MIGRATION_STATUS_ACTIVE ||
- s->state == MIGRATION_STATUS_SETUP ||
+ if (migration_is_setup_or_active(s->state) ||
s->state == MIGRATION_STATUS_CANCELLING) {
error_setg(errp, QERR_MIGRATION_ACTIVE);
return;
@@ -838,6 +1101,15 @@ void qmp_migrate_set_downtime(double value, Error **errp)
max_downtime = (uint64_t)value;
}
+bool migrate_postcopy_ram(void)
+{
+ MigrationState *s;
+
+ s = migrate_get_current();
+
+ return s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM];
+}
+
bool migrate_auto_converge(void)
{
MigrationState *s;
@@ -920,76 +1192,489 @@ int64_t migrate_xbzrle_cache_size(void)
}
/* migration thread support */
+/*
+ * Something bad happened to the RP stream, mark an error
+ * The caller shall print or trace something to indicate why
+ */
+static void mark_source_rp_bad(MigrationState *s)
+{
+ s->rp_state.error = true;
+}
+
+static struct rp_cmd_args {
+ ssize_t len; /* -1 = variable */
+ const char *name;
+} rp_cmd_args[] = {
+ [MIG_RP_MSG_INVALID] = { .len = -1, .name = "INVALID" },
+ [MIG_RP_MSG_SHUT] = { .len = 4, .name = "SHUT" },
+ [MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" },
+ [MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" },
+ [MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" },
+ [MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" },
+};
+
+/*
+ * Process a request for pages received on the return path,
+ * We're allowed to send more than requested (e.g. to round to our page size)
+ * and we don't need to send pages that have already been sent.
+ */
+static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
+ ram_addr_t start, size_t len)
+{
+ long our_host_ps = getpagesize();
+
+ trace_migrate_handle_rp_req_pages(rbname, start, len);
+
+ /*
+ * Since we currently insist on matching page sizes, just sanity check
+ * we're being asked for whole host pages.
+ */
+ if (start & (our_host_ps-1) ||
+ (len & (our_host_ps-1))) {
+ error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
+ " len: %zd", __func__, start, len);
+ mark_source_rp_bad(ms);
+ return;
+ }
+ if (ram_save_queue_pages(ms, rbname, start, len)) {
+ mark_source_rp_bad(ms);
+ }
+}
+
+/*
+ * Handles messages sent on the return path towards the source VM
+ *
+ */
+static void *source_return_path_thread(void *opaque)
+{
+ MigrationState *ms = opaque;
+ QEMUFile *rp = ms->rp_state.from_dst_file;
+ uint16_t header_len, header_type;
+ const int max_len = 512;
+ uint8_t buf[max_len];
+ uint32_t tmp32, sibling_error;
+ ram_addr_t start = 0; /* =0 to silence warning */
+ size_t len = 0, expected_len;
+ int res;
+
+ trace_source_return_path_thread_entry();
+ while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
+ migration_is_setup_or_active(ms->state)) {
+ trace_source_return_path_thread_loop_top();
+ header_type = qemu_get_be16(rp);
+ header_len = qemu_get_be16(rp);
+
+ if (header_type >= MIG_RP_MSG_MAX ||
+ header_type == MIG_RP_MSG_INVALID) {
+ error_report("RP: Received invalid message 0x%04x length 0x%04x",
+ header_type, header_len);
+ mark_source_rp_bad(ms);
+ goto out;
+ }
+
+ if ((rp_cmd_args[header_type].len != -1 &&
+ header_len != rp_cmd_args[header_type].len) ||
+ header_len > max_len) {
+ error_report("RP: Received '%s' message (0x%04x) with"
+ "incorrect length %d expecting %zu",
+ rp_cmd_args[header_type].name, header_type, header_len,
+ (size_t)rp_cmd_args[header_type].len);
+ mark_source_rp_bad(ms);
+ goto out;
+ }
+
+ /* We know we've got a valid header by this point */
+ res = qemu_get_buffer(rp, buf, header_len);
+ if (res != header_len) {
+ error_report("RP: Failed reading data for message 0x%04x"
+ " read %d expected %d",
+ header_type, res, header_len);
+ mark_source_rp_bad(ms);
+ goto out;
+ }
+
+ /* OK, we have the message and the data */
+ switch (header_type) {
+ case MIG_RP_MSG_SHUT:
+ sibling_error = be32_to_cpup((uint32_t *)buf);
+ trace_source_return_path_thread_shut(sibling_error);
+ if (sibling_error) {
+ error_report("RP: Sibling indicated error %d", sibling_error);
+ mark_source_rp_bad(ms);
+ }
+ /*
+ * We'll let the main thread deal with closing the RP
+ * we could do a shutdown(2) on it, but we're the only user
+ * anyway, so there's nothing gained.
+ */
+ goto out;
+
+ case MIG_RP_MSG_PONG:
+ tmp32 = be32_to_cpup((uint32_t *)buf);
+ trace_source_return_path_thread_pong(tmp32);
+ break;
+
+ case MIG_RP_MSG_REQ_PAGES:
+ start = be64_to_cpup((uint64_t *)buf);
+ len = be32_to_cpup((uint32_t *)(buf + 8));
+ migrate_handle_rp_req_pages(ms, NULL, start, len);
+ break;
+
+ case MIG_RP_MSG_REQ_PAGES_ID:
+ expected_len = 12 + 1; /* header + termination */
+
+ if (header_len >= expected_len) {
+ start = be64_to_cpup((uint64_t *)buf);
+ len = be32_to_cpup((uint32_t *)(buf + 8));
+ /* Now we expect an idstr */
+ tmp32 = buf[12]; /* Length of the following idstr */
+ buf[13 + tmp32] = '\0';
+ expected_len += tmp32;
+ }
+ if (header_len != expected_len) {
+ error_report("RP: Req_Page_id with length %d expecting %zd",
+ header_len, expected_len);
+ mark_source_rp_bad(ms);
+ goto out;
+ }
+ migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
+ break;
+
+ default:
+ break;
+ }
+ }
+ if (qemu_file_get_error(rp)) {
+ trace_source_return_path_thread_bad_end();
+ mark_source_rp_bad(ms);
+ }
+
+ trace_source_return_path_thread_end();
+out:
+ ms->rp_state.from_dst_file = NULL;
+ qemu_fclose(rp);
+ return NULL;
+}
+
+static int open_return_path_on_source(MigrationState *ms)
+{
+
+ ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->file);
+ if (!ms->rp_state.from_dst_file) {
+ return -1;
+ }
+
+ trace_open_return_path_on_source();
+ qemu_thread_create(&ms->rp_state.rp_thread, "return path",
+ source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
+
+ trace_open_return_path_on_source_continue();
+
+ return 0;
+}
+
+/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
+static int await_return_path_close_on_source(MigrationState *ms)
+{
+ /*
+ * If this is a normal exit then the destination will send a SHUT and the
+ * rp_thread will exit, however if there's an error we need to cause
+ * it to exit.
+ */
+ if (qemu_file_get_error(ms->file) && ms->rp_state.from_dst_file) {
+ /*
+ * shutdown(2), if we have it, will cause it to unblock if it's stuck
+ * waiting for the destination.
+ */
+ qemu_file_shutdown(ms->rp_state.from_dst_file);
+ mark_source_rp_bad(ms);
+ }
+ trace_await_return_path_close_on_source_joining();
+ qemu_thread_join(&ms->rp_state.rp_thread);
+ trace_await_return_path_close_on_source_close();
+ return ms->rp_state.error;
+}
+
+/*
+ * Switch from normal iteration to postcopy
+ * Returns non-0 on error
+ */
+static int postcopy_start(MigrationState *ms, bool *old_vm_running)
+{
+ int ret;
+ const QEMUSizedBuffer *qsb;
+ int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ migrate_set_state(ms, MIGRATION_STATUS_ACTIVE,
+ MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+ trace_postcopy_start();
+ qemu_mutex_lock_iothread();
+ trace_postcopy_start_set_run();
+
+ qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+ *old_vm_running = runstate_is_running();
+ global_state_store();
+ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /*
+ * Cause any non-postcopiable, but iterative devices to
+ * send out their final data.
+ */
+ qemu_savevm_state_complete_precopy(ms->file, true);
+
+ /*
+ * in Finish migrate and with the io-lock held everything should
+ * be quiet, but we've potentially still got dirty pages and we
+ * need to tell the destination to throw any pages it's already received
+ * that are dirty
+ */
+ if (ram_postcopy_send_discard_bitmap(ms)) {
+ error_report("postcopy send discard bitmap failed");
+ goto fail;
+ }
+
+ /*
+ * send rest of state - note things that are doing postcopy
+ * will notice we're in POSTCOPY_ACTIVE and not actually
+ * wrap their state up here
+ */
+ qemu_file_set_rate_limit(ms->file, INT64_MAX);
+ /* Ping just for debugging, helps line traces up */
+ qemu_savevm_send_ping(ms->file, 2);
+
+ /*
+ * While loading the device state we may trigger page transfer
+ * requests and the fd must be free to process those, and thus
+ * the destination must read the whole device state off the fd before
+ * it starts processing it. Unfortunately the ad-hoc migration format
+ * doesn't allow the destination to know the size to read without fully
+ * parsing it through each devices load-state code (especially the open
+ * coded devices that use get/put).
+ * So we wrap the device state up in a package with a length at the start;
+ * to do this we use a qemu_buf to hold the whole of the device state.
+ */
+ QEMUFile *fb = qemu_bufopen("w", NULL);
+ if (!fb) {
+ error_report("Failed to create buffered file");
+ goto fail;
+ }
+
+ /*
+ * Make sure the receiver can get incoming pages before we send the rest
+ * of the state
+ */
+ qemu_savevm_send_postcopy_listen(fb);
+
+ qemu_savevm_state_complete_precopy(fb, false);
+ qemu_savevm_send_ping(fb, 3);
+
+ qemu_savevm_send_postcopy_run(fb);
+
+ /* <><> end of stuff going into the package */
+ qsb = qemu_buf_get(fb);
+
+ /* Now send that blob */
+ if (qemu_savevm_send_packaged(ms->file, qsb)) {
+ goto fail_closefb;
+ }
+ qemu_fclose(fb);
+ ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
+
+ qemu_mutex_unlock_iothread();
+
+ /*
+ * Although this ping is just for debug, it could potentially be
+ * used for getting a better measurement of downtime at the source.
+ */
+ qemu_savevm_send_ping(ms->file, 4);
+
+ ret = qemu_file_get_error(ms->file);
+ if (ret) {
+ error_report("postcopy_start: Migration stream errored");
+ migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+ MIGRATION_STATUS_FAILED);
+ }
+
+ return ret;
+
+fail_closefb:
+ qemu_fclose(fb);
+fail:
+ migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+ MIGRATION_STATUS_FAILED);
+ qemu_mutex_unlock_iothread();
+ return -1;
+}
+
+/**
+ * migration_completion: Used by migration_thread when there's not much left.
+ * The caller 'breaks' the loop when this returns.
+ *
+ * @s: Current migration state
+ * @current_active_state: The migration state we expect to be in
+ * @*old_vm_running: Pointer to old_vm_running flag
+ * @*start_time: Pointer to time to update
+ */
+static void migration_completion(MigrationState *s, int current_active_state,
+ bool *old_vm_running,
+ int64_t *start_time)
+{
+ int ret;
+
+ if (s->state == MIGRATION_STATUS_ACTIVE) {
+ qemu_mutex_lock_iothread();
+ *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+ *old_vm_running = runstate_is_running();
+ ret = global_state_store();
+
+ if (!ret) {
+ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+ if (ret >= 0) {
+ qemu_file_set_rate_limit(s->file, INT64_MAX);
+ qemu_savevm_state_complete_precopy(s->file, false);
+ }
+ }
+ qemu_mutex_unlock_iothread();
+
+ if (ret < 0) {
+ goto fail;
+ }
+ } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+ trace_migration_completion_postcopy_end();
+
+ qemu_savevm_state_complete_postcopy(s->file);
+ trace_migration_completion_postcopy_end_after_complete();
+ }
+
+ /*
+ * If rp was opened we must clean up the thread before
+ * cleaning everything else up (since if there are no failures
+ * it will wait for the destination to send its status in
+ * a SHUT command).
+ * Postcopy opens rp if enabled (even if it's not activated)
+ */
+ if (migrate_postcopy_ram()) {
+ int rp_error;
+ trace_migration_completion_postcopy_end_before_rp();
+ rp_error = await_return_path_close_on_source(s);
+ trace_migration_completion_postcopy_end_after_rp(rp_error);
+ if (rp_error) {
+ goto fail;
+ }
+ }
+
+ if (qemu_file_get_error(s->file)) {
+ trace_migration_completion_file_err();
+ goto fail;
+ }
+
+ migrate_set_state(s, current_active_state, MIGRATION_STATUS_COMPLETED);
+ return;
+
+fail:
+ migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED);
+}
+
+/*
+ * Master migration thread on the source VM.
+ * It drives the migration and pumps the data down the outgoing channel.
+ */
static void *migration_thread(void *opaque)
{
MigrationState *s = opaque;
+ /* Used by the bandwidth calcs, updated later */
int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
int64_t initial_bytes = 0;
int64_t max_size = 0;
int64_t start_time = initial_time;
+ int64_t end_time;
bool old_vm_running = false;
+ bool entered_postcopy = false;
+ /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
+ enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
rcu_register_thread();
qemu_savevm_state_header(s->file);
+
+ if (migrate_postcopy_ram()) {
+ /* Now tell the dest that it should open its end so it can reply */
+ qemu_savevm_send_open_return_path(s->file);
+
+ /* And do a ping that will make stuff easier to debug */
+ qemu_savevm_send_ping(s->file, 1);
+
+ /*
+ * Tell the destination that we *might* want to do postcopy later;
+ * if the other end can't do postcopy it should fail now, nice and
+ * early.
+ */
+ qemu_savevm_send_postcopy_advise(s->file);
+ }
+
qemu_savevm_state_begin(s->file, &s->params);
s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
+ current_active_state = MIGRATION_STATUS_ACTIVE;
migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE);
- while (s->state == MIGRATION_STATUS_ACTIVE) {
+ trace_migration_thread_setup_complete();
+
+ while (s->state == MIGRATION_STATUS_ACTIVE ||
+ s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
int64_t current_time;
uint64_t pending_size;
if (!qemu_file_rate_limit(s->file)) {
- pending_size = qemu_savevm_state_pending(s->file, max_size);
- trace_migrate_pending(pending_size, max_size);
+ uint64_t pend_post, pend_nonpost;
+
+ qemu_savevm_state_pending(s->file, max_size, &pend_nonpost,
+ &pend_post);
+ pending_size = pend_nonpost + pend_post;
+ trace_migrate_pending(pending_size, max_size,
+ pend_post, pend_nonpost);
if (pending_size && pending_size >= max_size) {
- qemu_savevm_state_iterate(s->file);
- } else {
- int ret;
-
- qemu_mutex_lock_iothread();
- start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
- qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
- old_vm_running = runstate_is_running();
-
- ret = global_state_store();
- if (!ret) {
- ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
- if (ret >= 0) {
- qemu_file_set_rate_limit(s->file, INT64_MAX);
- qemu_savevm_state_complete(s->file);
- }
- }
- qemu_mutex_unlock_iothread();
+ /* Still a significant amount to transfer */
- if (ret < 0) {
- migrate_set_state(s, MIGRATION_STATUS_ACTIVE,
- MIGRATION_STATUS_FAILED);
- break;
- }
+ if (migrate_postcopy_ram() &&
+ s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE &&
+ pend_nonpost <= max_size &&
+ atomic_read(&s->start_postcopy)) {
+
+ if (!postcopy_start(s, &old_vm_running)) {
+ current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
+ entered_postcopy = true;
+ }
- if (!qemu_file_get_error(s->file)) {
- migrate_set_state(s, MIGRATION_STATUS_ACTIVE,
- MIGRATION_STATUS_COMPLETED);
- break;
+ continue;
}
+ /* Just another iteration step */
+ qemu_savevm_state_iterate(s->file, entered_postcopy);
+ } else {
+ trace_migration_thread_low_pending(pending_size);
+ migration_completion(s, current_active_state,
+ &old_vm_running, &start_time);
+ break;
}
}
if (qemu_file_get_error(s->file)) {
- migrate_set_state(s, MIGRATION_STATUS_ACTIVE,
- MIGRATION_STATUS_FAILED);
+ migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED);
+ trace_migration_thread_file_err();
break;
}
current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
if (current_time >= initial_time + BUFFER_DELAY) {
uint64_t transferred_bytes = qemu_ftell(s->file) - initial_bytes;
uint64_t time_spent = current_time - initial_time;
- double bandwidth = transferred_bytes / time_spent;
+ double bandwidth = (double)transferred_bytes / time_spent;
max_size = bandwidth * migrate_max_downtime() / 1000000;
s->mbps = time_spent ? (((double) transferred_bytes * 8.0) /
@@ -1013,19 +1698,26 @@ static void *migration_thread(void *opaque)
}
}
+ trace_migration_thread_after_loop();
+ /* If we enabled cpu throttling for auto-converge, turn it off. */
+ cpu_throttle_stop();
+ end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
qemu_mutex_lock_iothread();
+ qemu_savevm_state_cleanup();
if (s->state == MIGRATION_STATUS_COMPLETED) {
- int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
uint64_t transferred_bytes = qemu_ftell(s->file);
s->total_time = end_time - s->total_time;
- s->downtime = end_time - start_time;
+ if (!entered_postcopy) {
+ s->downtime = end_time - start_time;
+ }
if (s->total_time) {
s->mbps = (((double) transferred_bytes * 8.0) /
((double) s->total_time)) / 1000;
}
runstate_set(RUN_STATE_POSTMIGRATE);
} else {
- if (old_vm_running) {
+ if (old_vm_running && !entered_postcopy) {
vm_start();
}
}
@@ -1048,7 +1740,34 @@ void migrate_fd_connect(MigrationState *s)
/* Notify before starting migration thread */
notifier_list_notify(&migration_state_notifiers, s);
+ /*
+ * Open the return path; currently for postcopy but other things might
+ * also want it.
+ */
+ if (migrate_postcopy_ram()) {
+ if (open_return_path_on_source(s)) {
+ error_report("Unable to open return-path for postcopy");
+ migrate_set_state(s, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_FAILED);
+ migrate_fd_cleanup(s);
+ return;
+ }
+ }
+
migrate_compress_threads_create();
qemu_thread_create(&s->thread, "migration", migration_thread, s,
QEMU_THREAD_JOINABLE);
+ s->migration_thread_running = true;
+}
+
+PostcopyState postcopy_state_get(void)
+{
+ return atomic_mb_read(&incoming_postcopy_state);
}
+
+/* Set the state and return the old state */
+PostcopyState postcopy_state_set(PostcopyState new_state)
+{
+ return atomic_xchg(&incoming_postcopy_state, new_state);
+}
+
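
For reference, the return-path messages parsed by source_return_path_thread() above are framed as a big-endian 16-bit message type, a 16-bit payload length, and then the payload; a REQ_PAGES payload is a 64-bit start offset followed by a 32-bit length, as built by migrate_send_rp_req_pages(). A standalone sketch of that framing follows; the numeric type value 3 for MIG_RP_MSG_REQ_PAGES is assumed for illustration, not taken from the real headers.

#include <stddef.h>
#include <stdint.h>

/* Big-endian store helpers */
static void put_be16(uint8_t *p, uint16_t v)
{
    p[0] = v >> 8;
    p[1] = v & 0xff;
}

static void put_be32(uint8_t *p, uint32_t v)
{
    put_be16(p, v >> 16);
    put_be16(p + 2, v & 0xffff);
}

static void put_be64(uint8_t *p, uint64_t v)
{
    put_be32(p, v >> 32);
    put_be32(p + 4, v & 0xffffffff);
}

/*
 * Frame a page request the way the source-side thread expects to read it:
 * be16 type, be16 payload length, be64 start, be32 len.
 */
static size_t frame_req_pages(uint8_t *out, uint64_t start, uint32_t len)
{
    const uint16_t msg_type = 3;     /* assumed MIG_RP_MSG_REQ_PAGES value */
    const uint16_t payload_len = 12; /* start (8) + len (4) */

    put_be16(out, msg_type);
    put_be16(out + 2, payload_len);
    put_be64(out + 4, start);
    put_be32(out + 12, len);
    return 4 + payload_len;
}

int main(void)
{
    uint8_t buf[16];

    return frame_req_pages(buf, 0x200000, 4096) == 16 ? 0 : 1;
}
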
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
new file mode 100644
index 0000000000..3946aa98aa
--- /dev/null
+++ b/migration/postcopy-ram.c
@@ -0,0 +1,761 @@
+/*
+ * Postcopy migration for RAM
+ *
+ * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Dave Gilbert <dgilbert@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * Postcopy is a migration technique where the execution flips from the
+ * source to the destination before all the data has been copied.
+ */
+
+#include <glib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "migration/postcopy-ram.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/balloon.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+/* Arbitrary limit on size of each discard command,
+ * keeps them around ~200 bytes
+ */
+#define MAX_DISCARDS_PER_COMMAND 12
+
+struct PostcopyDiscardState {
+ const char *ramblock_name;
+ uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */
+ uint16_t cur_entry;
+ /*
+ * Start and length of a discard range (bytes)
+ */
+ uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
+ uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
+ unsigned int nsentwords;
+ unsigned int nsentcmds;
+};
+
+/* Postcopy needs to detect accesses to pages that haven't yet been copied
+ * across, and efficiently map new pages in, the techniques for doing this
+ * are target OS specific.
+ */
+#if defined(__linux__)
+
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <asm/types.h> /* for __u64 */
+#endif
+
+#if defined(__linux__) && defined(__NR_userfaultfd)
+#include <linux/userfaultfd.h>
+
+static bool ufd_version_check(int ufd)
+{
+ struct uffdio_api api_struct;
+ uint64_t ioctl_mask;
+
+ api_struct.api = UFFD_API;
+ api_struct.features = 0;
+ if (ioctl(ufd, UFFDIO_API, &api_struct)) {
+ error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
+ strerror(errno));
+ return false;
+ }
+
+ ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
+ (__u64)1 << _UFFDIO_UNREGISTER;
+ if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
+ error_report("Missing userfault features: %" PRIx64,
+ (uint64_t)(~api_struct.ioctls & ioctl_mask));
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Note: This has the side effect of munlock'ing all of RAM, that's
+ * normally fine since if the postcopy succeeds it gets turned back on at the
+ * end.
+ */
+bool postcopy_ram_supported_by_host(void)
+{
+ long pagesize = getpagesize();
+ int ufd = -1;
+ bool ret = false; /* Error unless we change it */
+ void *testarea = NULL;
+ struct uffdio_register reg_struct;
+ struct uffdio_range range_struct;
+ uint64_t feature_mask;
+
+ if ((1ul << qemu_target_page_bits()) > pagesize) {
+ error_report("Target page size bigger than host page size");
+ goto out;
+ }
+
+ ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ if (ufd == -1) {
+ error_report("%s: userfaultfd not available: %s", __func__,
+ strerror(errno));
+ goto out;
+ }
+
+ /* Version and features check */
+ if (!ufd_version_check(ufd)) {
+ goto out;
+ }
+
+ /*
+ * userfault and mlock don't go together; we'll put it back later if
+ * it was enabled.
+ */
+ if (munlockall()) {
+ error_report("%s: munlockall: %s", __func__, strerror(errno));
+ goto out;
+ }
+
+ /*
+ * We need to check that the ops we need are supported on anon memory
+ * To do that we need to register a chunk and see the flags that
+ * are returned.
+ */
+ testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
+ MAP_ANONYMOUS, -1, 0);
+ if (testarea == MAP_FAILED) {
+ error_report("%s: Failed to map test area: %s", __func__,
+ strerror(errno));
+ goto out;
+ }
+ g_assert(((size_t)testarea & (pagesize-1)) == 0);
+
+ reg_struct.range.start = (uintptr_t)testarea;
+ reg_struct.range.len = pagesize;
+ reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+ if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
+ error_report("%s userfault register: %s", __func__, strerror(errno));
+ goto out;
+ }
+
+ range_struct.start = (uintptr_t)testarea;
+ range_struct.len = pagesize;
+ if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
+ error_report("%s userfault unregister: %s", __func__, strerror(errno));
+ goto out;
+ }
+
+ feature_mask = (__u64)1 << _UFFDIO_WAKE |
+ (__u64)1 << _UFFDIO_COPY |
+ (__u64)1 << _UFFDIO_ZEROPAGE;
+ if ((reg_struct.ioctls & feature_mask) != feature_mask) {
+ error_report("Missing userfault map features: %" PRIx64,
+ (uint64_t)(~reg_struct.ioctls & feature_mask));
+ goto out;
+ }
+
+ /* Success! */
+ ret = true;
+out:
+ if (testarea) {
+ munmap(testarea, pagesize);
+ }
+ if (ufd != -1) {
+ close(ufd);
+ }
+ return ret;
+}
+
+/**
+ * postcopy_ram_discard_range: Discard a range of memory.
+ * We can assume that if we've been called postcopy_ram_hosttest returned true.
+ *
+ * @mis: Current incoming migration state.
+ * @start, @length: range of memory to discard.
+ *
+ * returns: 0 on success.
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+ size_t length)
+{
+ trace_postcopy_ram_discard_range(start, length);
+ if (madvise(start, length, MADV_DONTNEED)) {
+ error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Setup an area of RAM so that it *can* be used for postcopy later; this
+ * must be done right at the start prior to pre-copy.
+ * opaque should be the MIS.
+ */
+static int init_range(const char *block_name, void *host_addr,
+ ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+ MigrationIncomingState *mis = opaque;
+
+ trace_postcopy_init_range(block_name, host_addr, offset, length);
+
+ /*
+ * We need the whole of RAM to be truly empty for postcopy, so things
+ * like ROMs and any data tables built during init must be zero'd
+ * - we're going to get the copy from the source anyway.
+ * (Precopy will just overwrite this data, so doesn't need the discard)
+ */
+ if (postcopy_ram_discard_range(mis, host_addr, length)) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * At the end of migration, undo the effects of init_range
+ * opaque should be the MIS.
+ */
+static int cleanup_range(const char *block_name, void *host_addr,
+ ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+ MigrationIncomingState *mis = opaque;
+ struct uffdio_range range_struct;
+ trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
+
+ /*
+ * We turned off hugepage for the precopy stage with postcopy enabled
+ * we can turn it back on now.
+ */
+ qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
+
+ /*
+ * We can also turn off userfault now since we should have all the
+ * pages. It can be useful to leave it on to debug postcopy
+ * if you're not sure it's always getting every page.
+ */
+ range_struct.start = (uintptr_t)host_addr;
+ range_struct.len = length;
+
+ if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
+ error_report("%s: userfault unregister %s", __func__, strerror(errno));
+
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * called from arch_init's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+ if (qemu_ram_foreach_block(init_range, mis)) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+ trace_postcopy_ram_incoming_cleanup_entry();
+
+ if (mis->have_fault_thread) {
+ uint64_t tmp64;
+
+ if (qemu_ram_foreach_block(cleanup_range, mis)) {
+ return -1;
+ }
+ /*
+ * Tell the fault_thread to exit, it's an eventfd that should
+ * currently be at 0, we're going to increment it to 1
+ */
+ tmp64 = 1;
+ if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
+ trace_postcopy_ram_incoming_cleanup_join();
+ qemu_thread_join(&mis->fault_thread);
+ } else {
+ /* Not much we can do here, but may as well report it */
+ error_report("%s: incrementing userfault_quit_fd: %s", __func__,
+ strerror(errno));
+ }
+ trace_postcopy_ram_incoming_cleanup_closeuf();
+ close(mis->userfault_fd);
+ close(mis->userfault_quit_fd);
+ mis->have_fault_thread = false;
+ }
+
+ qemu_balloon_inhibit(false);
+
+ if (enable_mlock) {
+ if (os_mlock() < 0) {
+ error_report("mlock: %s", strerror(errno));
+ /*
+ * It doesn't feel right to fail at this point, we have a valid
+ * VM state.
+ */
+ }
+ }
+
+ postcopy_state_set(POSTCOPY_INCOMING_END);
+ migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
+
+ if (mis->postcopy_tmp_page) {
+ munmap(mis->postcopy_tmp_page, getpagesize());
+ mis->postcopy_tmp_page = NULL;
+ }
+ trace_postcopy_ram_incoming_cleanup_exit();
+ return 0;
+}
+
+/*
+ * Disable huge pages on an area
+ */
+static int nhp_range(const char *block_name, void *host_addr,
+ ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+ trace_postcopy_nhp_range(block_name, host_addr, offset, length);
+
+ /*
+ * Before we do discards we need to ensure those discards really
+ * do delete areas of the page, even if THP thinks a hugepage would
+ * be a good idea, so force hugepages off.
+ */
+ qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
+
+ return 0;
+}
+
+/*
+ * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
+ * however leaving it until after precopy means that most of the precopy
+ * data is still THPd
+ */
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
+{
+ if (qemu_ram_foreach_block(nhp_range, mis)) {
+ return -1;
+ }
+
+ postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
+
+ return 0;
+}
+
+/*
+ * Mark the given area of RAM as requiring notification of accesses to unwritten areas
+ * Used as a callback on qemu_ram_foreach_block.
+ * host_addr: Base of area to mark
+ * offset: Offset in the whole ram arena
+ * length: Length of the section
+ * opaque: MigrationIncomingState pointer
+ * Returns 0 on success
+ */
+static int ram_block_enable_notify(const char *block_name, void *host_addr,
+ ram_addr_t offset, ram_addr_t length,
+ void *opaque)
+{
+ MigrationIncomingState *mis = opaque;
+ struct uffdio_register reg_struct;
+
+ reg_struct.range.start = (uintptr_t)host_addr;
+ reg_struct.range.len = length;
+ reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+ /* Now tell our userfault_fd that it's responsible for this area */
+ if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
+ error_report("%s userfault register: %s", __func__, strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Handle faults detected by the USERFAULT markings
+ */
+static void *postcopy_ram_fault_thread(void *opaque)
+{
+ MigrationIncomingState *mis = opaque;
+ struct uffd_msg msg;
+ int ret;
+ size_t hostpagesize = getpagesize();
+ RAMBlock *rb = NULL;
+ RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
+
+ trace_postcopy_ram_fault_thread_entry();
+ qemu_sem_post(&mis->fault_thread_sem);
+
+ while (true) {
+ ram_addr_t rb_offset;
+ ram_addr_t in_raspace;
+ struct pollfd pfd[2];
+
+ /*
+ * We're mainly waiting for the kernel to give us a faulting HVA,
+ * however we can be told to quit via userfault_quit_fd which is
+ * an eventfd
+ */
+ pfd[0].fd = mis->userfault_fd;
+ pfd[0].events = POLLIN;
+ pfd[0].revents = 0;
+ pfd[1].fd = mis->userfault_quit_fd;
+ pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
+ pfd[1].revents = 0;
+
+ if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
+ error_report("%s: userfault poll: %s", __func__, strerror(errno));
+ break;
+ }
+
+ if (pfd[1].revents) {
+ trace_postcopy_ram_fault_thread_quit();
+ break;
+ }
+
+ ret = read(mis->userfault_fd, &msg, sizeof(msg));
+ if (ret != sizeof(msg)) {
+ if (errno == EAGAIN) {
+ /*
+ * if a wake up happens on the other thread just after
+ * the poll, there is nothing to read.
+ */
+ continue;
+ }
+ if (ret < 0) {
+ error_report("%s: Failed to read full userfault message: %s",
+ __func__, strerror(errno));
+ break;
+ } else {
+ error_report("%s: Read %d bytes from userfaultfd expected %zd",
+ __func__, ret, sizeof(msg));
+ break; /* Lost alignment, don't know what we'd read next */
+ }
+ }
+ if (msg.event != UFFD_EVENT_PAGEFAULT) {
+ error_report("%s: Read unexpected event %ud from userfaultfd",
+ __func__, msg.event);
+ continue; /* It's not a page fault, shouldn't happen */
+ }
+
+ rb = qemu_ram_block_from_host(
+ (void *)(uintptr_t)msg.arg.pagefault.address,
+ true, &in_raspace, &rb_offset);
+ if (!rb) {
+ error_report("postcopy_ram_fault_thread: Fault outside guest: %"
+ PRIx64, (uint64_t)msg.arg.pagefault.address);
+ break;
+ }
+
+ rb_offset &= ~(hostpagesize - 1);
+ trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
+ qemu_ram_get_idstr(rb),
+ rb_offset);
+
+ /*
+ * Send the request to the source - we want to request one
+ * of our host page sizes (which is >= TPS)
+ */
+ if (rb != last_rb) {
+ last_rb = rb;
+ migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
+ rb_offset, hostpagesize);
+ } else {
+ /* Save some space */
+ migrate_send_rp_req_pages(mis, NULL,
+ rb_offset, hostpagesize);
+ }
+ }
+ trace_postcopy_ram_fault_thread_exit();
+ return NULL;
+}
+
+int postcopy_ram_enable_notify(MigrationIncomingState *mis)
+{
+ /* Open the fd for the kernel to give us userfaults */
+ mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (mis->userfault_fd == -1) {
+ error_report("%s: Failed to open userfault fd: %s", __func__,
+ strerror(errno));
+ return -1;
+ }
+
+ /*
+ * Although the host check already tested the API, we need to
+ * do the check again as an ABI handshake on the new fd.
+ */
+ if (!ufd_version_check(mis->userfault_fd)) {
+ return -1;
+ }
+
+ /* Now an eventfd we use to tell the fault-thread to quit */
+ mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
+ if (mis->userfault_quit_fd == -1) {
+ error_report("%s: Opening userfault_quit_fd: %s", __func__,
+ strerror(errno));
+ close(mis->userfault_fd);
+ return -1;
+ }
+
+ qemu_sem_init(&mis->fault_thread_sem, 0);
+ qemu_thread_create(&mis->fault_thread, "postcopy/fault",
+ postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
+ qemu_sem_wait(&mis->fault_thread_sem);
+ qemu_sem_destroy(&mis->fault_thread_sem);
+ mis->have_fault_thread = true;
+
+ /* Mark so that we get notified of accesses to unwritten areas */
+ if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
+ return -1;
+ }
+
+ /*
+ * Ballooning can mark pages as absent while we're postcopying
+ * that would cause false userfaults.
+ */
+ qemu_balloon_inhibit(true);
+
+ trace_postcopy_ram_enable_notify();
+
+ return 0;
+}
+
+/*
+ * Place a host page (from) at (host) atomically
+ * returns 0 on success
+ */
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
+{
+ struct uffdio_copy copy_struct;
+
+ copy_struct.dst = (uint64_t)(uintptr_t)host;
+ copy_struct.src = (uint64_t)(uintptr_t)from;
+ copy_struct.len = getpagesize();
+ copy_struct.mode = 0;
+
+ /* copy also acks to the kernel waking the stalled thread up
+ * TODO: We can inhibit that ack and only do it if it was requested
+ * which would be slightly cheaper, but we'd have to be careful
+ * of the order of updating our page state.
+ */
+ if (ioctl(mis->userfault_fd, UFFDIO_COPY, &copy_struct)) {
+ int e = errno;
+ error_report("%s: %s copy host: %p from: %p",
+ __func__, strerror(e), host, from);
+
+ return -e;
+ }
+
+ trace_postcopy_place_page(host);
+ return 0;
+}
+
+/*
+ * Place a zero page at (host) atomically
+ * returns 0 on success
+ */
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
+{
+ struct uffdio_zeropage zero_struct;
+
+ zero_struct.range.start = (uint64_t)(uintptr_t)host;
+ zero_struct.range.len = getpagesize();
+ zero_struct.mode = 0;
+
+ if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) {
+ int e = errno;
+ error_report("%s: %s zero host: %p",
+ __func__, strerror(e), host);
+
+ return -e;
+ }
+
+ trace_postcopy_place_page_zero(host);
+ return 0;
+}
+
+/*
+ * Returns a target page of memory that can be mapped at a later point in time
+ * using postcopy_place_page
+ * The same address is used repeatedly, postcopy_place_page just takes the
+ * backing page away.
+ * Returns: Pointer to allocated page
+ *
+ */
+void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+{
+ if (!mis->postcopy_tmp_page) {
+ mis->postcopy_tmp_page = mmap(NULL, getpagesize(),
+ PROT_READ | PROT_WRITE, MAP_PRIVATE |
+ MAP_ANONYMOUS, -1, 0);
+ if (!mis->postcopy_tmp_page) {
+ error_report("%s: %s", __func__, strerror(errno));
+ return NULL;
+ }
+ }
+
+ return mis->postcopy_tmp_page;
+}
+
+#else
+/* No target OS support, stubs just fail */
+bool postcopy_ram_supported_by_host(void)
+{
+ error_report("%s: No OS support", __func__);
+ return false;
+}
+
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+ error_report("postcopy_ram_incoming_init: No OS support");
+ return -1;
+}
+
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+ size_t length)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_ram_enable_notify(MigrationIncomingState *mis)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
+{
+ assert(0);
+ return -1;
+}
+
+void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+{
+ assert(0);
+ return NULL;
+}
+
+#endif
+
+/* ------------------------------------------------------------------------- */
+
+/**
+ * postcopy_discard_send_init: Called at the start of each RAMBlock before
+ * asking to discard individual ranges.
+ *
+ * @ms: The current migration state.
+ * @offset: the bitmap offset of the named RAMBlock in the migration
+ * bitmap.
+ * @name: RAMBlock that discards will operate on.
+ *
+ * returns: a new PDS.
+ */
+PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
+ unsigned long offset,
+ const char *name)
+{
+ PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
+
+ if (res) {
+ res->ramblock_name = name;
+ res->offset = offset;
+ }
+
+ return res;
+}
+
+/**
+ * postcopy_discard_send_range: Called by the bitmap code for each chunk to
+ * discard. May send a discard message, may just leave it queued to
+ * be sent later.
+ *
+ * @ms: Current migration state.
+ * @pds: Structure initialised by postcopy_discard_send_init().
+ * @start,@length: a range of pages in the migration bitmap in the
+ * RAM block passed to postcopy_discard_send_init() (length=1 is one page)
+ */
+void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
+ unsigned long start, unsigned long length)
+{
+ size_t tp_bits = qemu_target_page_bits();
+ /* Convert to byte offsets within the RAM block */
+ pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits;
+ pds->length_list[pds->cur_entry] = length << tp_bits;
+ trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
+ pds->cur_entry++;
+ pds->nsentwords++;
+
+ if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
+ /* Full set, ship it! */
+ qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
+ pds->cur_entry,
+ pds->start_list,
+ pds->length_list);
+ pds->nsentcmds++;
+ pds->cur_entry = 0;
+ }
+}
+
+/**
+ * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
+ * bitmap code. Sends any outstanding discard messages, frees the PDS
+ *
+ * @ms: Current migration state.
+ * @pds: Structure initialised by postcopy_discard_send_init().
+ */
+void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
+{
+ /* Anything unsent? */
+ if (pds->cur_entry) {
+ qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
+ pds->cur_entry,
+ pds->start_list,
+ pds->length_list);
+ pds->nsentcmds++;
+ }
+
+ trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
+ pds->nsentcmds);
+
+ g_free(pds);
+}
diff --git a/migration/qemu-file-buf.c b/migration/qemu-file-buf.c
index 2de9330ca5..49516b8643 100644
--- a/migration/qemu-file-buf.c
+++ b/migration/qemu-file-buf.c
@@ -29,7 +29,7 @@
#include "qemu/error-report.h"
#include "qemu/iov.h"
#include "qemu/sockets.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
#include "migration/migration.h"
#include "migration/qemu-file.h"
#include "migration/qemu-file-internal.h"
@@ -372,7 +372,8 @@ typedef struct QEMUBuffer {
bool qsb_allocated;
} QEMUBuffer;
-static int buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+static ssize_t buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
+ size_t size)
{
QEMUBuffer *s = opaque;
ssize_t len = qsb_get_length(s->qsb) - pos;
@@ -387,8 +388,8 @@ static int buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
return qsb_get_buffer(s->qsb, pos, len, buf);
}
-static int buf_put_buffer(void *opaque, const uint8_t *buf,
- int64_t pos, int size)
+static ssize_t buf_put_buffer(void *opaque, const uint8_t *buf,
+ int64_t pos, size_t size)
{
QEMUBuffer *s = opaque;
@@ -439,7 +440,7 @@ QEMUFile *qemu_bufopen(const char *mode, QEMUSizedBuffer *input)
return NULL;
}
- s = g_malloc0(sizeof(QEMUBuffer));
+ s = g_new0(QEMUBuffer, 1);
s->qsb = input;
if (s->qsb == NULL) {
diff --git a/migration/qemu-file-stdio.c b/migration/qemu-file-stdio.c
index 285068b303..9bde9db566 100644
--- a/migration/qemu-file-stdio.c
+++ b/migration/qemu-file-stdio.c
@@ -22,7 +22,7 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
#include "migration/qemu-file.h"
typedef struct QEMUFileStdio {
@@ -37,11 +37,11 @@ static int stdio_get_fd(void *opaque)
return fileno(s->stdio_file);
}
-static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos,
- int size)
+static ssize_t stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos,
+ size_t size)
{
QEMUFileStdio *s = opaque;
- int res;
+ size_t res;
res = fwrite(buf, 1, size, s->stdio_file);
@@ -51,11 +51,12 @@ static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos,
return res;
}
-static int stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+static ssize_t stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
+ size_t size)
{
QEMUFileStdio *s = opaque;
FILE *fp = s->stdio_file;
- int bytes;
+ ssize_t bytes;
for (;;) {
clearerr(fp);
@@ -143,7 +144,7 @@ QEMUFile *qemu_popen_cmd(const char *command, const char *mode)
return NULL;
}
- s = g_malloc0(sizeof(QEMUFileStdio));
+ s = g_new0(QEMUFileStdio, 1);
s->stdio_file = stdio_file;
@@ -175,7 +176,7 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode)
return NULL;
}
- s = g_malloc0(sizeof(QEMUFileStdio));
+ s = g_new0(QEMUFileStdio, 1);
s->stdio_file = fopen(filename, mode);
if (!s->stdio_file) {
diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c
index bfbc0861ab..6ca53e7d67 100644
--- a/migration/qemu-file-unix.c
+++ b/migration/qemu-file-unix.c
@@ -22,9 +22,10 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
+#include "qemu/error-report.h"
#include "qemu/iov.h"
#include "qemu/sockets.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
#include "migration/qemu-file.h"
#include "migration/qemu-file-internal.h"
@@ -39,12 +40,44 @@ static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
QEMUFileSocket *s = opaque;
ssize_t len;
ssize_t size = iov_size(iov, iovcnt);
+ ssize_t offset = 0;
+ int err;
- len = iov_send(s->fd, iov, iovcnt, 0, size);
- if (len < size) {
- len = -socket_error();
- }
- return len;
+ while (size > 0) {
+ len = iov_send(s->fd, iov, iovcnt, offset, size);
+
+ if (len > 0) {
+ size -= len;
+ offset += len;
+ }
+
+ if (size > 0) {
+ err = socket_error();
+
+ if (err != EAGAIN && err != EWOULDBLOCK) {
+ error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)",
+ err, (size_t)size, (size_t)len);
+ /*
+ * If I've already sent some but only just got the error, I
+ * could return the amount validly sent so far and wait for the
+ * next call to report the error, but I'd rather flag the error
+ * immediately.
+ */
+ return -err;
+ }
+
+ /* Emulate blocking */
+ GPollFD pfd;
+
+ pfd.fd = s->fd;
+ pfd.events = G_IO_OUT | G_IO_ERR;
+ pfd.revents = 0;
+ TFR(err = g_poll(&pfd, 1, -1 /* no timeout */));
+ /* Errors other than EINTR intentionally ignored */
+ }
+ }
+
+ return offset;
}
static int socket_get_fd(void *opaque)
@@ -54,7 +87,8 @@ static int socket_get_fd(void *opaque)
return s->fd;
}
-static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+static ssize_t socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
+ size_t size)
{
QEMUFileSocket *s = opaque;
ssize_t len;
@@ -96,6 +130,56 @@ static int socket_shutdown(void *opaque, bool rd, bool wr)
}
}
+static int socket_return_close(void *opaque)
+{
+ QEMUFileSocket *s = opaque;
+ /*
+     * Note: We don't close the socket; that should be done by the forward
+ * path.
+ */
+ g_free(s);
+ return 0;
+}
+
+static const QEMUFileOps socket_return_read_ops = {
+ .get_fd = socket_get_fd,
+ .get_buffer = socket_get_buffer,
+ .close = socket_return_close,
+ .shut_down = socket_shutdown,
+};
+
+static const QEMUFileOps socket_return_write_ops = {
+ .get_fd = socket_get_fd,
+ .writev_buffer = socket_writev_buffer,
+ .close = socket_return_close,
+ .shut_down = socket_shutdown,
+};
+
+/*
+ * Give a QEMUFile* off the same socket but data in the opposite
+ * direction.
+ */
+static QEMUFile *socket_get_return_path(void *opaque)
+{
+ QEMUFileSocket *forward = opaque;
+ QEMUFileSocket *reverse;
+
+ if (qemu_file_get_error(forward->file)) {
+ /* If the forward file is in error, don't try and open a return */
+ return NULL;
+ }
+
+ reverse = g_malloc0(sizeof(QEMUFileSocket));
+ reverse->fd = forward->fd;
+ /* I don't think there's a better way to tell which direction 'this' is */
+ if (forward->file->ops->get_buffer != NULL) {
+ /* being called from the read side, so we need to be able to write */
+ return qemu_fopen_ops(reverse, &socket_return_write_ops);
+ } else {
+ return qemu_fopen_ops(reverse, &socket_return_read_ops);
+ }
+}
+
static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
int64_t pos)
{
@@ -138,7 +222,8 @@ static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
return total;
}
-static int unix_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+static ssize_t unix_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
+ size_t size)
{
QEMUFileSocket *s = opaque;
ssize_t len;
@@ -192,7 +277,7 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
return NULL;
}
- s = g_malloc0(sizeof(QEMUFileSocket));
+ s = g_new0(QEMUFileSocket, 1);
s->fd = fd;
if (mode[0] == 'r') {
@@ -204,18 +289,19 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
}
static const QEMUFileOps socket_read_ops = {
- .get_fd = socket_get_fd,
- .get_buffer = socket_get_buffer,
- .close = socket_close,
- .shut_down = socket_shutdown
-
+ .get_fd = socket_get_fd,
+ .get_buffer = socket_get_buffer,
+ .close = socket_close,
+ .shut_down = socket_shutdown,
+ .get_return_path = socket_get_return_path
};
static const QEMUFileOps socket_write_ops = {
- .get_fd = socket_get_fd,
- .writev_buffer = socket_writev_buffer,
- .close = socket_close,
- .shut_down = socket_shutdown
+ .get_fd = socket_get_fd,
+ .writev_buffer = socket_writev_buffer,
+ .close = socket_close,
+ .shut_down = socket_shutdown,
+ .get_return_path = socket_get_return_path
};
QEMUFile *qemu_fopen_socket(int fd, const char *mode)
@@ -226,7 +312,7 @@ QEMUFile *qemu_fopen_socket(int fd, const char *mode)
return NULL;
}
- s = g_malloc0(sizeof(QEMUFileSocket));
+ s = g_new0(QEMUFileSocket, 1);
s->fd = fd;
if (mode[0] == 'w') {
qemu_set_block(s->fd);
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 6bb3dc15cd..0bbd2574a8 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -26,7 +26,7 @@
#include "qemu/error-report.h"
#include "qemu/iov.h"
#include "qemu/sockets.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
#include "migration/migration.h"
#include "migration/qemu-file.h"
#include "migration/qemu-file-internal.h"
@@ -44,6 +44,18 @@ int qemu_file_shutdown(QEMUFile *f)
return f->ops->shut_down(f->opaque, true, true);
}
+/*
+ * Result: QEMUFile* for a 'return path' for comms in the opposite direction
+ * NULL if not available
+ */
+QEMUFile *qemu_file_get_return_path(QEMUFile *f)
+{
+ if (!f->ops->get_return_path) {
+ return NULL;
+ }
+ return f->ops->get_return_path(f->opaque);
+}
+
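A usage sketch only (the 'incoming' variable and the surrounding error handling are illustrative, not from this patch): a side that reads the migration stream asks for a writable channel back over the same transport, and must handle NULL since not every QEMUFileOps implements get_return_path.

    /* Illustration: obtain a reply channel from the file we read from */
    QEMUFile *rp = qemu_file_get_return_path(incoming);
    if (!rp) {
        error_report("transport does not support a return path");
        return -1;
    }
    /* ... send acknowledgements / page requests on rp ... */
    qemu_fclose(rp);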
bool qemu_file_mode_is_not_valid(const char *mode)
{
if (mode == NULL ||
@@ -60,7 +72,7 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
{
QEMUFile *f;
- f = g_malloc0(sizeof(QEMUFile));
+ f = g_new0(QEMUFile, 1);
f->opaque = opaque;
f->ops = ops;
@@ -270,7 +282,7 @@ int qemu_fclose(QEMUFile *f)
return ret;
}
-static void add_to_iovec(QEMUFile *f, const uint8_t *buf, int size)
+static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size)
{
/* check for adjacent buffer and coalesce them */
if (f->iovcnt > 0 && buf == f->iov[f->iovcnt - 1].iov_base +
@@ -286,7 +298,7 @@ static void add_to_iovec(QEMUFile *f, const uint8_t *buf, int size)
}
}
-void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size)
+void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size)
{
if (!f->ops->writev_buffer) {
qemu_put_buffer(f, buf, size);
@@ -301,9 +313,9 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size)
add_to_iovec(f, buf, size);
}
-void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size)
+void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
{
- int l;
+ size_t l;
if (f->last_error) {
return;
@@ -363,10 +375,10 @@ void qemu_file_skip(QEMUFile *f, int size)
* return as many as it managed to read (assuming blocking fd's which
* all current QEMUFile are)
*/
-int qemu_peek_buffer(QEMUFile *f, uint8_t **buf, int size, size_t offset)
+size_t qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset)
{
- int pending;
- int index;
+ ssize_t pending;
+ size_t index;
assert(!qemu_file_is_writable(f));
assert(offset < IO_BUF_SIZE);
@@ -411,13 +423,13 @@ int qemu_peek_buffer(QEMUFile *f, uint8_t **buf, int size, size_t offset)
* return as many as it managed to read (assuming blocking fd's which
* all current QEMUFile are)
*/
-int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size)
+size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size)
{
- int pending = size;
- int done = 0;
+ size_t pending = size;
+ size_t done = 0;
while (pending > 0) {
- int res;
+ size_t res;
uint8_t *src;
res = qemu_peek_buffer(f, &src, MIN(pending, IO_BUF_SIZE), 0);
@@ -434,6 +446,43 @@ int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size)
}
/*
+ * Read 'size' bytes of data from the file.
+ * 'size' can be larger than the internal buffer.
+ *
+ * The data:
+ * may be held on an internal buffer (in which case *buf is updated
+ * to point to it) that is valid until the next qemu_file operation.
+ * OR
+ * will be copied to the *buf that was passed in.
+ *
+ * The code tries to avoid the copy if possible.
+ *
+ * It will return size bytes unless there was an error, in which case it will
+ * return as many as it managed to read (assuming blocking fd's which
+ * all current QEMUFile are)
+ *
+ * Note: Since **buf may get changed, the caller should take care to
+ * keep a pointer to the original buffer if it needs to deallocate it.
+ */
+size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
+{
+ if (size < IO_BUF_SIZE) {
+ size_t res;
+ uint8_t *src;
+
+ res = qemu_peek_buffer(f, &src, size, 0);
+
+ if (res == size) {
+ qemu_file_skip(f, res);
+ *buf = src;
+ return res;
+ }
+ }
+
+ return qemu_get_buffer(f, *buf, size);
+}
+
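A hedged usage sketch (the buffer size is arbitrary): because *buf may be redirected at the QEMUFile's internal buffer, the caller keeps its own pointer to anything it allocated and never frees whatever *buf ends up pointing at.

    uint8_t *mine = g_malloc(4096);   /* what we own and must free */
    uint8_t *data = mine;             /* may be redirected by the call */
    size_t got = qemu_get_buffer_in_place(f, &data, 4096);
    /* data[0..got) is valid until the next operation on 'f' */
    g_free(mine);                     /* free our allocation, never 'data' */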
+/*
* Peeks a single byte from the buffer; this isn't guaranteed to work if
* offset leaves a gap after the previous read/peeked data.
*/
@@ -611,3 +660,18 @@ size_t qemu_get_counted_string(QEMUFile *f, char buf[256])
return res == len ? res : 0;
}
+
+/*
+ * Set the blocking state of the QEMUFile.
+ * Note: On some transports the OS only keeps a single blocking state for
+ * both directions, and thus changing the blocking on the main
+ * QEMUFile can also affect the return path.
+ */
+void qemu_file_set_blocking(QEMUFile *f, bool block)
+{
+ if (block) {
+ qemu_set_block(qemu_get_fd(f));
+ } else {
+ qemu_set_nonblock(qemu_get_fd(f));
+ }
+}
diff --git a/migration/ram.c b/migration/ram.c
index 7f007e6432..0490f005dd 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -32,6 +32,7 @@
#include "qemu/timer.h"
#include "qemu/main-loop.h"
#include "migration/migration.h"
+#include "migration/postcopy-ram.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
@@ -47,9 +48,7 @@
do { } while (0)
#endif
-static bool mig_throttle_on;
static int dirty_rate_high_cnt;
-static void check_guest_throttling(void);
static uint64_t bitmap_sync_count;
@@ -221,12 +220,34 @@ static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
-static unsigned long *migration_bitmap;
static QemuMutex migration_bitmap_mutex;
static uint64_t migration_dirty_pages;
static uint32_t last_version;
static bool ram_bulk_stage;
+/* used by the search for pages to send */
+struct PageSearchStatus {
+ /* Current block being searched */
+ RAMBlock *block;
+ /* Current offset to search from */
+ ram_addr_t offset;
+ /* Set once we wrap around */
+ bool complete_round;
+};
+typedef struct PageSearchStatus PageSearchStatus;
+
+static struct BitmapRcu {
+ struct rcu_head rcu;
+ /* Main migration bitmap */
+ unsigned long *bmap;
+ /* bitmap of pages that haven't been sent even once
+ * only maintained and used in postcopy at the moment
+ * where it's used to send the dirtymap at the start
+ * of the postcopy phase
+ */
+ unsigned long *unsentmap;
+} *migration_bitmap_rcu;
+
struct CompressParam {
bool start;
bool done;
@@ -396,6 +417,29 @@ static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
return size;
}
+/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
+ * If guest dirty memory rate is reduced below the rate at which we can
+ * transfer pages to the destination then we should be able to complete
+ * migration. Some workloads dirty memory way too fast and will not effectively
+ * converge, even with auto-converge.
+ */
+static void mig_throttle_guest_down(void)
+{
+ MigrationState *s = migrate_get_current();
+ uint64_t pct_initial =
+ s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
+    uint64_t pct_increment =
+ s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
+
+ /* We have not started throttling yet. Let's start it. */
+ if (!cpu_throttle_active()) {
+ cpu_throttle_set(pct_initial);
+ } else {
+ /* Throttling already on, just increase the rate */
+        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
+ }
+}
+
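A worked example of how the throttle ramps up; the percentages are hypothetical, not defaults set by this patch.

    /* Hypothetical parameter values, for illustration only:
     *   x-cpu-throttle-initial   = 20
     *   x-cpu-throttle-increment = 10
     * 1st call: cpu_throttle_set(20);       guest runs at ~80%
     * 2nd call: cpu_throttle_set(20 + 10);  guest runs at ~70%
     * 3rd call: cpu_throttle_set(30 + 10);  and so on until migration converges
     */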
/* Update the xbzrle cache to reflect a page that's been sent as all 0.
* The important thing is that a stale (not-yet-0'd) page be replaced
* by the new data.
@@ -495,43 +539,60 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
return 1;
}
-/* Called with rcu_read_lock() to protect migration_bitmap */
+/* Called with rcu_read_lock() to protect migration_bitmap
+ * rb: The RAMBlock to search for dirty pages in
+ * start: Start address (typically so we can continue from previous page)
+ * ram_addr_abs: Pointer into which to store the address of the dirty page
+ * within the global ram_addr space
+ *
+ * Returns: byte offset within memory region of the start of a dirty page
+ */
static inline
-ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
- ram_addr_t start)
+ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
+ ram_addr_t start,
+ ram_addr_t *ram_addr_abs)
{
- unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
+ unsigned long base = rb->offset >> TARGET_PAGE_BITS;
unsigned long nr = base + (start >> TARGET_PAGE_BITS);
- uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr));
- unsigned long size = base + (mr_size >> TARGET_PAGE_BITS);
+ uint64_t rb_size = rb->used_length;
+ unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
unsigned long *bitmap;
unsigned long next;
- bitmap = atomic_rcu_read(&migration_bitmap);
+ bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
if (ram_bulk_stage && nr > base) {
next = nr + 1;
} else {
next = find_next_bit(bitmap, size, nr);
}
- if (next < size) {
- clear_bit(next, bitmap);
+ *ram_addr_abs = next << TARGET_PAGE_BITS;
+ return (next - base) << TARGET_PAGE_BITS;
+}
+
+static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
+{
+ bool ret;
+ int nr = addr >> TARGET_PAGE_BITS;
+ unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+
+ ret = test_and_clear_bit(nr, bitmap);
+
+ if (ret) {
migration_dirty_pages--;
}
- return (next - base) << TARGET_PAGE_BITS;
+ return ret;
}
-/* Called with rcu_read_lock() to protect migration_bitmap */
static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
{
unsigned long *bitmap;
- bitmap = atomic_rcu_read(&migration_bitmap);
+ bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
migration_dirty_pages +=
cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
}
-
/* Fix me: there are too many global variables used in migration process. */
static int64_t start_time;
static int64_t bytes_xfer_prev;
@@ -573,7 +634,7 @@ static void migration_bitmap_sync(void)
qemu_mutex_lock(&migration_bitmap_mutex);
rcu_read_lock();
QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
- migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
+ migration_bitmap_sync_range(block->offset, block->used_length);
}
rcu_read_unlock();
qemu_mutex_unlock(&migration_bitmap_mutex);
@@ -589,21 +650,21 @@ static void migration_bitmap_sync(void)
/* The following detection logic can be refined later. For now:
Check to see if the dirtied bytes is 50% more than the approx.
amount of bytes that just got transferred since the last time we
- were in this routine. If that happens >N times (for now N==4)
- we turn on the throttle down logic */
+ were in this routine. If that happens twice, start or increase
+ throttling */
bytes_xfer_now = ram_bytes_transferred();
+
if (s->dirty_pages_rate &&
(num_dirty_pages_period * TARGET_PAGE_SIZE >
(bytes_xfer_now - bytes_xfer_prev)/2) &&
- (dirty_rate_high_cnt++ > 4)) {
+ (dirty_rate_high_cnt++ >= 2)) {
trace_migration_throttle();
- mig_throttle_on = true;
dirty_rate_high_cnt = 0;
+ mig_throttle_guest_down();
}
bytes_xfer_prev = bytes_xfer_now;
- } else {
- mig_throttle_on = false;
}
+
if (migrate_use_xbzrle()) {
if (iterations_prev != acct_info.iterations) {
acct_info.xbzrle_cache_miss_rate =
@@ -655,6 +716,9 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
* ram_save_page: Send the given page to the stream
*
* Returns: Number of pages written.
+ * < 0 - error
+ * >=0 - Number of pages written - this might legally be 0
+ * if xbzrle noticed the page was the same.
*
* @f: QEMUFile where to send the data
* @block: block that contains the page we want to send
@@ -668,12 +732,11 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
int pages = -1;
uint64_t bytes_xmit;
ram_addr_t current_addr;
- MemoryRegion *mr = block->mr;
uint8_t *p;
int ret;
bool send_async = true;
- p = memory_region_get_ram_ptr(mr) + offset;
+ p = block->host + offset;
/* In doubt sent page as normal */
bytes_xmit = 0;
@@ -744,7 +807,7 @@ static int do_compress_ram_page(CompressParam *param)
RAMBlock *block = param->block;
ram_addr_t offset = param->offset;
- p = memory_region_get_ram_ptr(block->mr) + (offset & TARGET_PAGE_MASK);
+ p = block->host + (offset & TARGET_PAGE_MASK);
bytes_sent = save_page_header(param->file, block, offset |
RAM_SAVE_FLAG_COMPRESS_PAGE);
@@ -852,11 +915,10 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
{
int pages = -1;
uint64_t bytes_xmit;
- MemoryRegion *mr = block->mr;
uint8_t *p;
int ret;
- p = memory_region_get_ram_ptr(mr) + offset;
+ p = block->host + offset;
bytes_xmit = 0;
ret = ram_control_save_page(f, block->offset,
@@ -909,6 +971,339 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
return pages;
}
+/*
+ * Find the next dirty page and update any state associated with
+ * the search process.
+ *
+ * Returns: True if a page is found
+ *
+ * @f: Current migration stream.
+ * @pss: Data about the state of the current dirty page scan.
+ * @*again: Set to false if the search has scanned the whole of RAM
+ * *ram_addr_abs: Pointer into which to store the address of the dirty page
+ * within the global ram_addr space
+ */
+static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
+ bool *again, ram_addr_t *ram_addr_abs)
+{
+ pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
+ ram_addr_abs);
+ if (pss->complete_round && pss->block == last_seen_block &&
+ pss->offset >= last_offset) {
+ /*
+ * We've been once around the RAM and haven't found anything.
+ * Give up.
+ */
+ *again = false;
+ return false;
+ }
+ if (pss->offset >= pss->block->used_length) {
+ /* Didn't find anything in this RAM Block */
+ pss->offset = 0;
+ pss->block = QLIST_NEXT_RCU(pss->block, next);
+ if (!pss->block) {
+ /* Hit the end of the list */
+ pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
+ /* Flag that we've looped */
+ pss->complete_round = true;
+ ram_bulk_stage = false;
+ if (migrate_use_xbzrle()) {
+ /* If xbzrle is on, stop using the data compression at this
+ * point. In theory, xbzrle can do better than compression.
+ */
+ flush_compressed_data(f);
+ compression_switch = false;
+ }
+ }
+ /* Didn't find anything this time, but try again on the new block */
+ *again = true;
+ return false;
+ } else {
+ /* Can go around again, but... */
+ *again = true;
+ /* We've found something so probably don't need to */
+ return true;
+ }
+}
+
+/*
+ * Helper for 'get_queued_page' - gets a page off the queue
+ * ms: MigrationState in
+ * *offset: Used to return the offset within the RAMBlock
+ * ram_addr_abs: global offset in the dirty/sent bitmaps
+ *
+ * Returns: block (or NULL if none available)
+ */
+static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
+ ram_addr_t *ram_addr_abs)
+{
+ RAMBlock *block = NULL;
+
+ qemu_mutex_lock(&ms->src_page_req_mutex);
+ if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
+ struct MigrationSrcPageRequest *entry =
+ QSIMPLEQ_FIRST(&ms->src_page_requests);
+ block = entry->rb;
+ *offset = entry->offset;
+ *ram_addr_abs = (entry->offset + entry->rb->offset) &
+ TARGET_PAGE_MASK;
+
+ if (entry->len > TARGET_PAGE_SIZE) {
+ entry->len -= TARGET_PAGE_SIZE;
+ entry->offset += TARGET_PAGE_SIZE;
+ } else {
+ memory_region_unref(block->mr);
+ QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
+ g_free(entry);
+ }
+ }
+ qemu_mutex_unlock(&ms->src_page_req_mutex);
+
+ return block;
+}
+
+/*
+ * Unqueue a page from the queue fed by postcopy page requests; skips pages
+ * that are already sent (!dirty)
+ *
+ * ms: MigrationState in
+ * pss: PageSearchStatus structure updated with found block/offset
+ * ram_addr_abs: global offset in the dirty/sent bitmaps
+ *
+ * Returns: true if a queued page is found
+ */
+static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
+ ram_addr_t *ram_addr_abs)
+{
+ RAMBlock *block;
+ ram_addr_t offset;
+ bool dirty;
+
+ do {
+ block = unqueue_page(ms, &offset, ram_addr_abs);
+ /*
+ * We're sending this page, and since it's postcopy nothing else
+ * will dirty it, and we must make sure it doesn't get sent again
+ * even if this queue request was received after the background
+ * search already sent it.
+ */
+ if (block) {
+ unsigned long *bitmap;
+ bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+ dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
+ if (!dirty) {
+ trace_get_queued_page_not_dirty(
+ block->idstr, (uint64_t)offset,
+ (uint64_t)*ram_addr_abs,
+ test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
+ atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
+ } else {
+ trace_get_queued_page(block->idstr,
+ (uint64_t)offset,
+ (uint64_t)*ram_addr_abs);
+ }
+ }
+
+ } while (block && !dirty);
+
+ if (block) {
+ /*
+         * As soon as we start servicing pages out of order, we have
+         * to kill the bulk stage, since the bulk stage assumes
+         * in (migration_bitmap_find_dirty) that every page is
+         * dirty; that's no longer true.
+ */
+ ram_bulk_stage = false;
+
+ /*
+ * We want the background search to continue from the queued page
+ * since the guest is likely to want other pages near to the page
+ * it just requested.
+ */
+ pss->block = block;
+ pss->offset = offset;
+ }
+
+ return !!block;
+}
+
+/**
+ * flush_page_queue: Flush any remaining pages in the ram request queue;
+ * it should be empty at the end anyway, but in error cases there may be
+ * some left.
+ *
+ * ms: MigrationState
+ */
+void flush_page_queue(MigrationState *ms)
+{
+ struct MigrationSrcPageRequest *mspr, *next_mspr;
+    /* This queue should generally be empty - but in the case of a failed
+     * migration it might have some entries left over.
+ */
+ rcu_read_lock();
+ QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
+ memory_region_unref(mspr->rb->mr);
+ QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
+ g_free(mspr);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * Queue the pages for transmission, e.g. a request from postcopy destination
+ * ms: MigrationState in which the queue is held
+ * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
+ * start: Offset from the start of the RAMBlock
+ * len: Length (in bytes) to send
+ * Return: 0 on success
+ */
+int ram_save_queue_pages(MigrationState *ms, const char *rbname,
+ ram_addr_t start, ram_addr_t len)
+{
+ RAMBlock *ramblock;
+
+ rcu_read_lock();
+ if (!rbname) {
+ /* Reuse last RAMBlock */
+ ramblock = ms->last_req_rb;
+
+ if (!ramblock) {
+ /*
+ * Shouldn't happen, we can't reuse the last RAMBlock if
+ * it's the 1st request.
+ */
+ error_report("ram_save_queue_pages no previous block");
+ goto err;
+ }
+ } else {
+ ramblock = qemu_ram_block_by_name(rbname);
+
+ if (!ramblock) {
+ /* We shouldn't be asked for a non-existent RAMBlock */
+ error_report("ram_save_queue_pages no block '%s'", rbname);
+ goto err;
+ }
+ ms->last_req_rb = ramblock;
+ }
+ trace_ram_save_queue_pages(ramblock->idstr, start, len);
+ if (start+len > ramblock->used_length) {
+ error_report("%s request overrun start=" RAM_ADDR_FMT " len="
+ RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
+ __func__, start, len, ramblock->used_length);
+ goto err;
+ }
+
+ struct MigrationSrcPageRequest *new_entry =
+ g_malloc0(sizeof(struct MigrationSrcPageRequest));
+ new_entry->rb = ramblock;
+ new_entry->offset = start;
+ new_entry->len = len;
+
+ memory_region_ref(ramblock->mr);
+ qemu_mutex_lock(&ms->src_page_req_mutex);
+ QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
+ qemu_mutex_unlock(&ms->src_page_req_mutex);
+ rcu_read_unlock();
+
+ return 0;
+
+err:
+ rcu_read_unlock();
+ return -1;
+}
+
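A minimal usage sketch (the block name and offset are made up for the example); this is the call a destination page request would eventually translate into on the source.

    /* Queue one target page of the (hypothetical) "pc.ram" block */
    if (ram_save_queue_pages(ms, "pc.ram", 0x200000, TARGET_PAGE_SIZE)) {
        error_report("could not queue page request");
    }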
+/**
+ * ram_save_target_page: Save one target page
+ *
+ *
+ * @f: QEMUFile where to send the data
+ * @block: pointer to block that contains the page we want to send
+ * @offset: offset inside the block for the page;
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
+ *
+ * Returns: Number of pages written.
+ */
+static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
+ RAMBlock *block, ram_addr_t offset,
+ bool last_stage,
+ uint64_t *bytes_transferred,
+ ram_addr_t dirty_ram_abs)
+{
+ int res = 0;
+
+    /* Check whether the page is dirty and, if it is, send it */
+ if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
+ unsigned long *unsentmap;
+ if (compression_switch && migrate_use_compression()) {
+ res = ram_save_compressed_page(f, block, offset,
+ last_stage,
+ bytes_transferred);
+ } else {
+ res = ram_save_page(f, block, offset, last_stage,
+ bytes_transferred);
+ }
+
+ if (res < 0) {
+ return res;
+ }
+ unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+ if (unsentmap) {
+ clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
+ }
+ /* Only update last_sent_block if a block was actually sent; xbzrle
+ * might have decided the page was identical so didn't bother writing
+ * to the stream.
+ */
+ if (res > 0) {
+ last_sent_block = block;
+ }
+ }
+
+ return res;
+}
+
+/**
+ * ram_save_host_page: Starting at *offset send pages up to the end
+ *                     of the current host page. It's valid for the initial
+ *                     offset to point into the middle of a host page,
+ *                     in which case the remainder of the host page is sent.
+ * Only dirty target pages are sent.
+ *
+ * Returns: Number of pages written.
+ *
+ * @f: QEMUFile where to send the data
+ * @block: pointer to block that contains the page we want to send
+ * @offset: offset inside the block for the page; updated to last target page
+ * sent
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
+ */
+static int ram_save_host_page(MigrationState *ms, QEMUFile *f, RAMBlock *block,
+ ram_addr_t *offset, bool last_stage,
+ uint64_t *bytes_transferred,
+ ram_addr_t dirty_ram_abs)
+{
+ int tmppages, pages = 0;
+ do {
+ tmppages = ram_save_target_page(ms, f, block, *offset, last_stage,
+ bytes_transferred, dirty_ram_abs);
+ if (tmppages < 0) {
+ return tmppages;
+ }
+
+ pages += tmppages;
+ *offset += TARGET_PAGE_SIZE;
+ dirty_ram_abs += TARGET_PAGE_SIZE;
+ } while (*offset & (qemu_host_page_size - 1));
+
+ /* The offset we leave with is the last one we looked at */
+ *offset -= TARGET_PAGE_SIZE;
+ return pages;
+}
+
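A worked example of the host-page/target-page arithmetic; the sizes are hypothetical.

    /* Hypothetical sizes: qemu_host_page_size = 64KiB, TARGET_PAGE_SIZE = 4KiB,
     * so there are 16 target pages per host page.  A call entering with
     * *offset = 0x10000 loops while (*offset & 0xffff) != 0, i.e. it covers the
     * target pages at 0x10000..0x1f000 (sending the dirty ones) and leaves
     * *offset on 0x1f000, the last target page it looked at. */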
/**
* ram_find_and_save_block: Finds a dirty page and sends it to f
*
@@ -920,61 +1315,47 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
* @f: QEMUFile where to send the data
* @last_stage: if we are at the completion stage
* @bytes_transferred: increase it with the number of transferred bytes
+ *
+ * On systems where host-page-size > target-page-size it will send all the
+ * pages in a host page that are dirty.
*/
static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
uint64_t *bytes_transferred)
{
- RAMBlock *block = last_seen_block;
- ram_addr_t offset = last_offset;
- bool complete_round = false;
+ PageSearchStatus pss;
+ MigrationState *ms = migrate_get_current();
int pages = 0;
- MemoryRegion *mr;
+ bool again, found;
+ ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
+ ram_addr_t space */
- if (!block)
- block = QLIST_FIRST_RCU(&ram_list.blocks);
+ pss.block = last_seen_block;
+ pss.offset = last_offset;
+ pss.complete_round = false;
- while (true) {
- mr = block->mr;
- offset = migration_bitmap_find_and_reset_dirty(mr, offset);
- if (complete_round && block == last_seen_block &&
- offset >= last_offset) {
- break;
+ if (!pss.block) {
+ pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
+ }
+
+ do {
+ again = true;
+ found = get_queued_page(ms, &pss, &dirty_ram_abs);
+
+ if (!found) {
+ /* priority queue empty, so just search for something dirty */
+ found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
}
- if (offset >= block->used_length) {
- offset = 0;
- block = QLIST_NEXT_RCU(block, next);
- if (!block) {
- block = QLIST_FIRST_RCU(&ram_list.blocks);
- complete_round = true;
- ram_bulk_stage = false;
- if (migrate_use_xbzrle()) {
- /* If xbzrle is on, stop using the data compression at this
- * point. In theory, xbzrle can do better than compression.
- */
- flush_compressed_data(f);
- compression_switch = false;
- }
- }
- } else {
- if (compression_switch && migrate_use_compression()) {
- pages = ram_save_compressed_page(f, block, offset, last_stage,
- bytes_transferred);
- } else {
- pages = ram_save_page(f, block, offset, last_stage,
- bytes_transferred);
- }
- /* if page is unmodified, continue to the next */
- if (pages > 0) {
- last_sent_block = block;
- break;
- }
+ if (found) {
+ pages = ram_save_host_page(ms, f, pss.block, &pss.offset,
+ last_stage, bytes_transferred,
+ dirty_ram_abs);
}
- }
+ } while (!pages && again);
- last_seen_block = block;
- last_offset = offset;
+ last_seen_block = pss.block;
+ last_offset = pss.offset;
return pages;
}
@@ -1024,17 +1405,23 @@ void free_xbzrle_decoded_buf(void)
xbzrle_decoded_buf = NULL;
}
-static void migration_end(void)
+static void migration_bitmap_free(struct BitmapRcu *bmap)
+{
+ g_free(bmap->bmap);
+ g_free(bmap->unsentmap);
+ g_free(bmap);
+}
+
+static void ram_migration_cleanup(void *opaque)
{
/* caller have hold iothread lock or is in a bh, so there is
* no writing race against this migration_bitmap
*/
- unsigned long *bitmap = migration_bitmap;
- atomic_rcu_set(&migration_bitmap, NULL);
+ struct BitmapRcu *bitmap = migration_bitmap_rcu;
+ atomic_rcu_set(&migration_bitmap_rcu, NULL);
if (bitmap) {
memory_global_dirty_log_stop();
- synchronize_rcu();
- g_free(bitmap);
+ call_rcu(bitmap, migration_bitmap_free, rcu);
}
XBZRLE_cache_lock();
@@ -1049,11 +1436,6 @@ static void migration_end(void)
XBZRLE_cache_unlock();
}
-static void ram_migration_cancel(void *opaque)
-{
- migration_end();
-}
-
static void reset_ram_globals(void)
{
last_seen_block = NULL;
@@ -1070,9 +1452,10 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
/* called in qemu main thread, so there is
* no writing race against this migration_bitmap
*/
- if (migration_bitmap) {
- unsigned long *old_bitmap = migration_bitmap, *bitmap;
- bitmap = bitmap_new(new);
+ if (migration_bitmap_rcu) {
+ struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
+ bitmap = g_new(struct BitmapRcu, 1);
+ bitmap->bmap = bitmap_new(new);
/* prevent migration_bitmap content from being set bit
* by migration_bitmap_sync_range() at the same time.
@@ -1080,16 +1463,410 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
* at the same time.
*/
qemu_mutex_lock(&migration_bitmap_mutex);
- bitmap_copy(bitmap, old_bitmap, old);
- bitmap_set(bitmap, old, new - old);
- atomic_rcu_set(&migration_bitmap, bitmap);
+ bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
+ bitmap_set(bitmap->bmap, old, new - old);
+
+        /* We don't have a way to safely extend the unsentmap
+         * with RCU, so mark it as missing; entry to postcopy
+         * will fail.
+ */
+ bitmap->unsentmap = NULL;
+
+ atomic_rcu_set(&migration_bitmap_rcu, bitmap);
qemu_mutex_unlock(&migration_bitmap_mutex);
migration_dirty_pages += new - old;
- synchronize_rcu();
- g_free(old_bitmap);
+ call_rcu(old_bitmap, migration_bitmap_free, rcu);
}
}
+/*
+ * 'expected' is the value you expect the bitmap mostly to be full
+ * of; it won't bother printing lines that are all this value.
+ * If 'todump' is null the migration bitmap is dumped.
+ */
+void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
+{
+ int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+
+ int64_t cur;
+ int64_t linelen = 128;
+ char linebuf[129];
+
+ if (!todump) {
+ todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+ }
+
+ for (cur = 0; cur < ram_pages; cur += linelen) {
+ int64_t curb;
+ bool found = false;
+ /*
+ * Last line; catch the case where the line length
+ * is longer than remaining ram
+ */
+ if (cur + linelen > ram_pages) {
+ linelen = ram_pages - cur;
+ }
+ for (curb = 0; curb < linelen; curb++) {
+ bool thisbit = test_bit(cur + curb, todump);
+ linebuf[curb] = thisbit ? '1' : '.';
+ found = found || (thisbit != expected);
+ }
+ if (found) {
+ linebuf[curb] = '\0';
+ fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
+ }
+ }
+}
+
+/* **** functions for postcopy ***** */
+
+/*
+ * Callback from postcopy_each_ram_send_discard for each RAMBlock
+ * Note: At this point the 'unsentmap' is the processed bitmap combined
+ * with the dirtymap; so a '1' means it's either dirty or unsent.
+ * start,length: Indexes into the bitmap for the first bit
+ * representing the named block and length in target-pages
+ */
+static int postcopy_send_discard_bm_ram(MigrationState *ms,
+ PostcopyDiscardState *pds,
+ unsigned long start,
+ unsigned long length)
+{
+ unsigned long end = start + length; /* one after the end */
+ unsigned long current;
+ unsigned long *unsentmap;
+
+ unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+ for (current = start; current < end; ) {
+ unsigned long one = find_next_bit(unsentmap, end, current);
+
+ if (one <= end) {
+ unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
+ unsigned long discard_length;
+
+ if (zero >= end) {
+ discard_length = end - one;
+ } else {
+ discard_length = zero - one;
+ }
+ postcopy_discard_send_range(ms, pds, one, discard_length);
+ current = one + discard_length;
+ } else {
+ current = one;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Utility for the outgoing postcopy code.
+ * Calls postcopy_send_discard_bm_ram for each RAMBlock
+ * passing it bitmap indexes and name.
+ * Returns: 0 on success
+ * (qemu_ram_foreach_block ends up passing unscaled lengths
+ * which would mean postcopy code would have to deal with target page)
+ */
+static int postcopy_each_ram_send_discard(MigrationState *ms)
+{
+ struct RAMBlock *block;
+ int ret;
+
+ QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+ unsigned long first = block->offset >> TARGET_PAGE_BITS;
+ PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
+ first,
+ block->idstr);
+
+ /*
+ * Postcopy sends chunks of bitmap over the wire, but it
+ * just needs indexes at this point, avoids it having
+ * target page specific code.
+ */
+ ret = postcopy_send_discard_bm_ram(ms, pds, first,
+ block->used_length >> TARGET_PAGE_BITS);
+ postcopy_discard_send_finish(ms, pds);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Helper for postcopy_chunk_hostpages; it's called twice to clean up
+ * the two bitmaps, which are similar, but one is inverted.
+ *
+ * We search for runs of target-pages that don't start or end on a
+ * host page boundary;
+ * unsent_pass=true: Cleans up partially unsent host pages by searching
+ * the unsentmap
+ * unsent_pass=false: Cleans up partially dirty host pages by searching
+ * the main migration bitmap
+ *
+ */
+static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
+ RAMBlock *block,
+ PostcopyDiscardState *pds)
+{
+ unsigned long *bitmap;
+ unsigned long *unsentmap;
+ unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
+ unsigned long first = block->offset >> TARGET_PAGE_BITS;
+ unsigned long len = block->used_length >> TARGET_PAGE_BITS;
+ unsigned long last = first + (len - 1);
+ unsigned long run_start;
+
+ bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+ unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+
+ if (unsent_pass) {
+ /* Find a sent page */
+ run_start = find_next_zero_bit(unsentmap, last + 1, first);
+ } else {
+ /* Find a dirty page */
+ run_start = find_next_bit(bitmap, last + 1, first);
+ }
+
+ while (run_start <= last) {
+ bool do_fixup = false;
+ unsigned long fixup_start_addr;
+ unsigned long host_offset;
+
+ /*
+ * If the start of this run of pages is in the middle of a host
+ * page, then we need to fixup this host page.
+ */
+ host_offset = run_start % host_ratio;
+ if (host_offset) {
+ do_fixup = true;
+ run_start -= host_offset;
+ fixup_start_addr = run_start;
+ /* For the next pass */
+ run_start = run_start + host_ratio;
+ } else {
+ /* Find the end of this run */
+ unsigned long run_end;
+ if (unsent_pass) {
+ run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
+ } else {
+ run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
+ }
+ /*
+ * If the end isn't at the start of a host page, then the
+ * run doesn't finish at the end of a host page
+ * and we need to discard.
+ */
+ host_offset = run_end % host_ratio;
+ if (host_offset) {
+ do_fixup = true;
+ fixup_start_addr = run_end - host_offset;
+ /*
+ * This host page has gone, the next loop iteration starts
+ * from after the fixup
+ */
+ run_start = fixup_start_addr + host_ratio;
+ } else {
+ /*
+ * No discards on this iteration, next loop starts from
+ * next sent/dirty page
+ */
+ run_start = run_end + 1;
+ }
+ }
+
+ if (do_fixup) {
+ unsigned long page;
+
+ /* Tell the destination to discard this page */
+ if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
+ /* For the unsent_pass we:
+ * discard partially sent pages
+ * For the !unsent_pass (dirty) we:
+ * discard partially dirty pages that were sent
+ * (any partially sent pages were already discarded
+ * by the previous unsent_pass)
+ */
+ postcopy_discard_send_range(ms, pds, fixup_start_addr,
+ host_ratio);
+ }
+
+ /* Clean up the bitmap */
+ for (page = fixup_start_addr;
+ page < fixup_start_addr + host_ratio; page++) {
+ /* All pages in this host page are now not sent */
+ set_bit(page, unsentmap);
+
+ /*
+ * Remark them as dirty, updating the count for any pages
+ * that weren't previously dirty.
+ */
+ migration_dirty_pages += !test_and_set_bit(page, bitmap);
+ }
+ }
+
+ if (unsent_pass) {
+ /* Find the next sent page for the next iteration */
+ run_start = find_next_zero_bit(unsentmap, last + 1,
+ run_start);
+ } else {
+ /* Find the next dirty page for the next iteration */
+ run_start = find_next_bit(bitmap, last + 1, run_start);
+ }
+ }
+}
+
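A worked example of the fixup; host_ratio and the page numbers are hypothetical.

    /* Hypothetical: host_ratio = 4 (e.g. 16KiB host pages, 4KiB target pages).
     * Suppose target pages 5 and 6 of a block were sent (unsentmap bits clear)
     * but 4 and 7 were not.  The unsent_pass finds the run of sent pages at 5,
     * sees it is not host-page aligned (5 % 4 == 1), rounds down to 4 and
     * discards target pages 4..7; all four are then marked unsent and dirty
     * again, so the whole host page is resent cleanly during postcopy. */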
+/*
+ * Utility for the outgoing postcopy code.
+ *
+ * Discard any partially sent host-page size chunks, mark any partially
+ * dirty host-page size chunks as all dirty.
+ *
+ * Returns: 0 on success
+ */
+static int postcopy_chunk_hostpages(MigrationState *ms)
+{
+ struct RAMBlock *block;
+
+ if (qemu_host_page_size == TARGET_PAGE_SIZE) {
+ /* Easy case - TPS==HPS - nothing to be done */
+ return 0;
+ }
+
+ /* Easiest way to make sure we don't resume in the middle of a host-page */
+ last_seen_block = NULL;
+ last_sent_block = NULL;
+ last_offset = 0;
+
+ QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+ unsigned long first = block->offset >> TARGET_PAGE_BITS;
+
+ PostcopyDiscardState *pds =
+ postcopy_discard_send_init(ms, first, block->idstr);
+
+ /* First pass: Discard all partially sent host pages */
+ postcopy_chunk_hostpages_pass(ms, true, block, pds);
+ /*
+ * Second pass: Ensure that all partially dirty host pages are made
+ * fully dirty.
+ */
+ postcopy_chunk_hostpages_pass(ms, false, block, pds);
+
+ postcopy_discard_send_finish(ms, pds);
+ } /* ram_list loop */
+
+ return 0;
+}
+
+/*
+ * Transmit the set of pages to be discarded after precopy to the target;
+ * these are pages that:
+ *     a) have been previously transmitted but are now dirty again
+ *     b) have never been transmitted; this ensures that
+ *        any pages on the destination that have been mapped by background
+ *        tasks get discarded (transparent huge pages are the specific concern)
+ * Hopefully this is pretty sparse
+ */
+int ram_postcopy_send_discard_bitmap(MigrationState *ms)
+{
+ int ret;
+ unsigned long *bitmap, *unsentmap;
+
+ rcu_read_lock();
+
+ /* This should be our last sync, the src is now paused */
+ migration_bitmap_sync();
+
+ unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+ if (!unsentmap) {
+        /* We don't have a safe way to resize the unsentmap, so
+ * if the bitmap was resized it will be NULL at this
+ * point.
+ */
+ error_report("migration ram resized during precopy phase");
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ /* Deal with TPS != HPS */
+ ret = postcopy_chunk_hostpages(ms);
+ if (ret) {
+ rcu_read_unlock();
+ return ret;
+ }
+
+ /*
+ * Update the unsentmap to be unsentmap = unsentmap | dirty
+ */
+ bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+ bitmap_or(unsentmap, unsentmap, bitmap,
+ last_ram_offset() >> TARGET_PAGE_BITS);
+
+
+ trace_ram_postcopy_send_discard_bitmap();
+#ifdef DEBUG_POSTCOPY
+ ram_debug_dump_bitmap(unsentmap, true);
+#endif
+
+ ret = postcopy_each_ram_send_discard(ms);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * At the start of the postcopy phase of migration, any now-dirty
+ * precopied pages are discarded.
+ *
+ * start, length describe a byte address range within the RAMBlock
+ *
+ * Returns 0 on success.
+ */
+int ram_discard_range(MigrationIncomingState *mis,
+ const char *block_name,
+ uint64_t start, size_t length)
+{
+ int ret = -1;
+
+ rcu_read_lock();
+ RAMBlock *rb = qemu_ram_block_by_name(block_name);
+
+ if (!rb) {
+ error_report("ram_discard_range: Failed to find block '%s'",
+ block_name);
+ goto err;
+ }
+
+ uint8_t *host_startaddr = rb->host + start;
+
+ if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
+ error_report("ram_discard_range: Unaligned start address: %p",
+ host_startaddr);
+ goto err;
+ }
+
+ if ((start + length) <= rb->used_length) {
+ uint8_t *host_endaddr = host_startaddr + length;
+ if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
+ error_report("ram_discard_range: Unaligned end address: %p",
+ host_endaddr);
+ goto err;
+ }
+ ret = postcopy_ram_discard_range(mis, host_startaddr, length);
+ } else {
+ error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
+ "/%zx/" RAM_ADDR_FMT")",
+ block_name, start, length, rb->used_length);
+ }
+
+err:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+
/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
* long-running RCU critical section. When rcu-reclaims in the code
* start to become numerous it will be necessary to reduce the
@@ -1101,7 +1878,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
RAMBlock *block;
int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
- mig_throttle_on = false;
dirty_rate_high_cnt = 0;
bitmap_sync_count = 0;
migration_bitmap_sync_init();
@@ -1145,8 +1921,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
reset_ram_globals();
ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
- migration_bitmap = bitmap_new(ram_bitmap_pages);
- bitmap_set(migration_bitmap, 0, ram_bitmap_pages);
+ migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
+ migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
+ bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
+
+ if (migrate_postcopy_ram()) {
+ migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
+ bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
+ }
/*
* Count the total number of pages used by ram blocks not including any
@@ -1206,7 +1988,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
}
pages_sent += pages;
acct_info.iterations++;
- check_guest_throttling();
+
/* we want to check in the 1st loop, just in case it was the 1st time
and we had to sync the dirty bitmap.
qemu_get_clock_ns() is a bit expensive, so we only check each some
@@ -1247,7 +2029,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
{
rcu_read_lock();
- migration_bitmap_sync();
+ if (!migration_in_postcopy(migrate_get_current())) {
+ migration_bitmap_sync();
+ }
ram_control_before_iterate(f, RAM_CONTROL_FINISH);
@@ -1269,19 +2053,21 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
rcu_read_unlock();
- migration_end();
qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
return 0;
}
-static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+ uint64_t *non_postcopiable_pending,
+ uint64_t *postcopiable_pending)
{
uint64_t remaining_size;
remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
- if (remaining_size < max_size) {
+ if (!migration_in_postcopy(migrate_get_current()) &&
+ remaining_size < max_size) {
qemu_mutex_lock_iothread();
rcu_read_lock();
migration_bitmap_sync();
@@ -1289,7 +2075,9 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
qemu_mutex_unlock_iothread();
remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
}
- return remaining_size;
+
+ /* We can do postcopy, and all the data is postcopiable */
+ *postcopiable_pending += remaining_size;
}
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
@@ -1330,6 +2118,14 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
/* Must be called from within a rcu critical section.
* Returns a pointer from within the RCU-protected ram_list.
*/
+/*
+ * Read a RAMBlock ID from the stream f, find the host address of the
+ * start of that block and add on 'offset'
+ *
+ * f: Stream to read from
+ * offset: Offset within the block
+ * flags: Page flags (mostly to see if it's a continuation of previous block)
+ */
static inline void *host_from_stream_offset(QEMUFile *f,
ram_addr_t offset,
int flags)
@@ -1344,21 +2140,19 @@ static inline void *host_from_stream_offset(QEMUFile *f,
return NULL;
}
- return memory_region_get_ram_ptr(block->mr) + offset;
+ return block->host + offset;
}
len = qemu_get_byte(f);
qemu_get_buffer(f, (uint8_t *)id, len);
id[len] = 0;
- QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
- if (!strncmp(id, block->idstr, sizeof(id)) &&
- block->max_length > offset) {
- return memory_region_get_ram_ptr(block->mr) + offset;
- }
+ block = qemu_ram_block_by_name(id);
+ if (block && block->max_length > offset) {
+ return block->host + offset;
}
- error_report("Can't find block %s!", id);
+ error_report("Can't find block %s", id);
return NULL;
}
@@ -1466,11 +2260,148 @@ static void decompress_data_with_multi_threads(uint8_t *compbuf,
}
}
+/*
+ * Allocate data structures etc needed by incoming migration with postcopy-ram;
+ * postcopy-ram's similarly named postcopy_ram_incoming_init does the work.
+ */
+int ram_postcopy_incoming_init(MigrationIncomingState *mis)
+{
+ size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+
+ return postcopy_ram_incoming_init(mis, ram_pages);
+}
+
+/*
+ * Called in postcopy mode by ram_load().
+ * rcu_read_lock is taken prior to this being called.
+ */
+static int ram_load_postcopy(QEMUFile *f)
+{
+ int flags = 0, ret = 0;
+ bool place_needed = false;
+ bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ /* Temporary page that is later 'placed' */
+ void *postcopy_host_page = postcopy_get_tmp_page(mis);
+ void *last_host = NULL;
+ bool all_zero = false;
+
+ while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+ ram_addr_t addr;
+ void *host = NULL;
+ void *page_buffer = NULL;
+ void *place_source = NULL;
+ uint8_t ch;
+
+ addr = qemu_get_be64(f);
+ flags = addr & ~TARGET_PAGE_MASK;
+ addr &= TARGET_PAGE_MASK;
+
+ trace_ram_load_postcopy_loop((uint64_t)addr, flags);
+ place_needed = false;
+ if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
+ host = host_from_stream_offset(f, addr, flags);
+ if (!host) {
+ error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
+ ret = -EINVAL;
+ break;
+ }
+ page_buffer = host;
+ /*
+ * Postcopy requires that we place whole host pages atomically.
+ * To make it atomic, the data is read into a temporary page
+ * that's moved into place later.
+ * The migration protocol uses, possibly smaller, target-pages
+ * however the source ensures it always sends all the components
+ * of a host page in order.
+ */
+ page_buffer = postcopy_host_page +
+ ((uintptr_t)host & ~qemu_host_page_mask);
+            /* If all target pages turn out zero the place can be optimised;
+             * assume zero at the start of each host page. */
+ if (!((uintptr_t)host & ~qemu_host_page_mask)) {
+ all_zero = true;
+ } else {
+ /* not the 1st TP within the HP */
+ if (host != (last_host + TARGET_PAGE_SIZE)) {
+ error_report("Non-sequential target page %p/%p\n",
+ host, last_host);
+ ret = -EINVAL;
+ break;
+ }
+ }
+
+
+ /*
+ * If it's the last part of a host page then we place the host
+ * page
+ */
+ place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
+ ~qemu_host_page_mask) == 0;
+ place_source = postcopy_host_page;
+ }
+ last_host = host;
+
+ switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
+ case RAM_SAVE_FLAG_COMPRESS:
+ ch = qemu_get_byte(f);
+ memset(page_buffer, ch, TARGET_PAGE_SIZE);
+ if (ch) {
+ all_zero = false;
+ }
+ break;
+
+ case RAM_SAVE_FLAG_PAGE:
+ all_zero = false;
+ if (!place_needed || !matching_page_sizes) {
+ qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
+ } else {
+ /* Avoids the qemu_file copy during postcopy, which is
+ * going to do a copy later; can only do it when we
+ * do this read in one go (matching page sizes)
+ */
+ qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
+ TARGET_PAGE_SIZE);
+ }
+ break;
+ case RAM_SAVE_FLAG_EOS:
+ /* normal exit */
+ break;
+ default:
+ error_report("Unknown combination of migration flags: %#x"
+ " (postcopy mode)", flags);
+ ret = -EINVAL;
+ }
+
+ if (place_needed) {
+ /* This gets called at the last target page in the host page */
+ if (all_zero) {
+ ret = postcopy_place_page_zero(mis,
+ host + TARGET_PAGE_SIZE -
+ qemu_host_page_size);
+ } else {
+ ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
+ qemu_host_page_size,
+ place_source);
+ }
+ }
+ if (!ret) {
+ ret = qemu_file_get_error(f);
+ }
+ }
+
+ return ret;
+}
+
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
int flags = 0, ret = 0;
static uint64_t seq_iter;
int len = 0;
+ /*
+ * If system is running in postcopy mode, page inserts to host memory must
+ * be atomic
+ */
+ bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
seq_iter++;
@@ -1484,15 +2415,30 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
* critical section.
*/
rcu_read_lock();
- while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+
+ if (postcopy_running) {
+ ret = ram_load_postcopy(f);
+ }
+
+ while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
ram_addr_t addr, total_ram_bytes;
- void *host;
+ void *host = NULL;
uint8_t ch;
addr = qemu_get_be64(f);
flags = addr & ~TARGET_PAGE_MASK;
addr &= TARGET_PAGE_MASK;
+ if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
+ RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
+ host = host_from_stream_offset(f, addr, flags);
+ if (!host) {
+ error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
+ ret = -EINVAL;
+ break;
+ }
+ }
+
switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
case RAM_SAVE_FLAG_MEM_SIZE:
/* Synchronize RAM block list */
@@ -1507,23 +2453,20 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
id[len] = 0;
length = qemu_get_be64(f);
- QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
- if (!strncmp(id, block->idstr, sizeof(id))) {
- if (length != block->used_length) {
- Error *local_err = NULL;
+ block = qemu_ram_block_by_name(id);
+ if (block) {
+ if (length != block->used_length) {
+ Error *local_err = NULL;
- ret = qemu_ram_resize(block->offset, length, &local_err);
- if (local_err) {
- error_report_err(local_err);
- }
+ ret = qemu_ram_resize(block->offset, length,
+ &local_err);
+ if (local_err) {
+ error_report_err(local_err);
}
- ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
- block->idstr);
- break;
}
- }
-
- if (!block) {
+ ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
+ block->idstr);
+ } else {
error_report("Unknown ramblock \"%s\", cannot "
"accept migration", id);
ret = -EINVAL;
@@ -1532,33 +2475,17 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
total_ram_bytes -= length;
}
break;
+
case RAM_SAVE_FLAG_COMPRESS:
- host = host_from_stream_offset(f, addr, flags);
- if (!host) {
- error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
- ret = -EINVAL;
- break;
- }
ch = qemu_get_byte(f);
ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
break;
+
case RAM_SAVE_FLAG_PAGE:
- host = host_from_stream_offset(f, addr, flags);
- if (!host) {
- error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
- ret = -EINVAL;
- break;
- }
qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
break;
- case RAM_SAVE_FLAG_COMPRESS_PAGE:
- host = host_from_stream_offset(f, addr, flags);
- if (!host) {
- error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
- ret = -EINVAL;
- break;
- }
+ case RAM_SAVE_FLAG_COMPRESS_PAGE:
len = qemu_get_be32(f);
if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
error_report("Invalid compressed data length: %d", len);
@@ -1568,13 +2495,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
qemu_get_buffer(f, compressed_data_buf, len);
decompress_data_with_multi_threads(compressed_data_buf, host, len);
break;
+
case RAM_SAVE_FLAG_XBZRLE:
- host = host_from_stream_offset(f, addr, flags);
- if (!host) {
- error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
- ret = -EINVAL;
- break;
- }
if (load_xbzrle(f, addr, host) < 0) {
error_report("Failed to decompress XBZRLE page at "
RAM_ADDR_FMT, addr);
@@ -1608,10 +2530,11 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
static SaveVMHandlers savevm_ram_handlers = {
.save_live_setup = ram_save_setup,
.save_live_iterate = ram_save_iterate,
- .save_live_complete = ram_save_complete,
+ .save_live_complete_postcopy = ram_save_complete,
+ .save_live_complete_precopy = ram_save_complete,
.save_live_pending = ram_save_pending,
.load_state = ram_load,
- .cancel = ram_migration_cancel,
+ .cleanup = ram_migration_cleanup,
};
void ram_mig_init(void)
@@ -1619,52 +2542,3 @@ void ram_mig_init(void)
qemu_mutex_init(&XBZRLE.lock);
register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
}
-/* Stub function that's gets run on the vcpu when its brought out of the
- VM to run inside qemu via async_run_on_cpu()*/
-
-static void mig_sleep_cpu(void *opq)
-{
- qemu_mutex_unlock_iothread();
- g_usleep(30*1000);
- qemu_mutex_lock_iothread();
-}
-
-/* To reduce the dirty rate explicitly disallow the VCPUs from spending
- much time in the VM. The migration thread will try to catchup.
- Workload will experience a performance drop.
-*/
-static void mig_throttle_guest_down(void)
-{
- CPUState *cpu;
-
- qemu_mutex_lock_iothread();
- CPU_FOREACH(cpu) {
- async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
- }
- qemu_mutex_unlock_iothread();
-}
-
-static void check_guest_throttling(void)
-{
- static int64_t t0;
- int64_t t1;
-
- if (!mig_throttle_on) {
- return;
- }
-
- if (!t0) {
- t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
- return;
- }
-
- t1 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-
- /* If it has been more than 40 ms since the last time the guest
- * was throttled then do it again.
- */
- if (40 < (t1-t0)/1000000) {
- mig_throttle_guest_down();
- t0 = t1;
- }
-}
diff --git a/migration/rdma.c b/migration/rdma.c
index 74876fd7ab..dcabb91005 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -19,7 +19,7 @@
#include "qemu/main-loop.h"
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
-#include "block/coroutine.h"
+#include "qemu/coroutine.h"
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
@@ -541,7 +541,7 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name,
RDMALocalBlock *block;
RDMALocalBlock *old = local->block;
- local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1));
+ local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
if (local->nb_blocks) {
int x;
@@ -572,12 +572,12 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name,
bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
block->unregister_bitmap = bitmap_new(block->nb_chunks);
bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
- block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t));
+ block->remote_keys = g_new0(uint32_t, block->nb_chunks);
block->is_ram_block = local->init ? false : true;
if (rdma->blockmap) {
- g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);
+ g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
}
trace_rdma_add_block(block_name, local->nb_blocks,
@@ -617,8 +617,8 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
memset(local, 0, sizeof *local);
qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
- rdma->dest_blocks = (RDMADestBlock *) g_malloc0(sizeof(RDMADestBlock) *
- rdma->local_ram_blocks.nb_blocks);
+ rdma->dest_blocks = g_new0(RDMADestBlock,
+ rdma->local_ram_blocks.nb_blocks);
local->init = true;
return 0;
}
@@ -677,8 +677,7 @@ static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
if (local->nb_blocks > 1) {
- local->block = g_malloc0(sizeof(RDMALocalBlock) *
- (local->nb_blocks - 1));
+ local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
if (block->index) {
memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
@@ -778,7 +777,7 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
*
* If the source VM connects with an IPv4 address without knowing that the
* destination has bound to '[::]' the migration will unconditionally fail
- * unless the management software is explicitly listening on the the IPv4
+ * unless the management software is explicitly listening on the IPv4
* address while using a RoCE-based device.
*
* If the source VM connects with an IPv6 address, then we're OK because we can
@@ -1164,7 +1163,7 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
/* allocate memory to store chunk MRs */
if (!block->pmr) {
- block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *));
+ block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
}
/*
@@ -2494,7 +2493,7 @@ static void *qemu_rdma_data_init(const char *host_port, Error **errp)
InetSocketAddress *addr;
if (host_port) {
- rdma = g_malloc0(sizeof(RDMAContext));
+ rdma = g_new0(RDMAContext, 1);
rdma->current_index = -1;
rdma->current_chunk = -1;
@@ -2519,8 +2518,8 @@ static void *qemu_rdma_data_init(const char *host_port, Error **errp)
* SEND messages for control only.
* VM's ram is handled with regular RDMA messages.
*/
-static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
- int64_t pos, int size)
+static ssize_t qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
+ int64_t pos, size_t size)
{
QEMUFileRDMA *r = opaque;
QEMUFile *f = r->file;
@@ -2547,7 +2546,8 @@ static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
r->len = MIN(remaining, RDMA_SEND_INCREMENT);
remaining -= r->len;
- head.len = r->len;
+ /* Guaranteed to fit due to RDMA_SEND_INCREMENT MIN above */
+ head.len = (uint32_t)r->len;
head.type = RDMA_CONTROL_QEMU_FILE;
ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
@@ -2564,7 +2564,7 @@ static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
}
static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
- int size, int idx)
+ size_t size, int idx)
{
size_t len = 0;
@@ -2585,8 +2585,8 @@ static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
* RDMA links don't use bytestreams, so we have to
* return bytes to QEMUFile opportunistically.
*/
-static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
- int64_t pos, int size)
+static ssize_t qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
+ int64_t pos, size_t size)
{
QEMUFileRDMA *r = opaque;
RDMAContext *rdma = r->rdma;
@@ -3399,7 +3399,7 @@ static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
return NULL;
}
- r = g_malloc0(sizeof(QEMUFileRDMA));
+ r = g_new0(QEMUFileRDMA, 1);
r->rdma = rdma;
if (mode[0] == 'w') {
diff --git a/migration/savevm.c b/migration/savevm.c
index 60712153fa..0ad1b93a8b 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -37,6 +37,7 @@
#include "qemu/timer.h"
#include "audio/audio.h"
#include "migration/migration.h"
+#include "migration/postcopy-ram.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "qemu/sockets.h"
@@ -45,6 +46,7 @@
#include "exec/memory.h"
#include "qmp-commands.h"
#include "trace.h"
+#include "qemu/bitops.h"
#include "qemu/iov.h"
#include "block/snapshot.h"
#include "block/qapi.h"
@@ -57,8 +59,26 @@
#define ARP_PTYPE_IP 0x0800
#define ARP_OP_REQUEST_REV 0x3
+const unsigned int postcopy_ram_discard_version = 0;
+
static bool skip_section_footers;
+static struct mig_cmd_args {
+ ssize_t len; /* -1 = variable */
+ const char *name;
+} mig_cmd_args[] = {
+ [MIG_CMD_INVALID] = { .len = -1, .name = "INVALID" },
+ [MIG_CMD_OPEN_RETURN_PATH] = { .len = 0, .name = "OPEN_RETURN_PATH" },
+ [MIG_CMD_PING] = { .len = sizeof(uint32_t), .name = "PING" },
+ [MIG_CMD_POSTCOPY_ADVISE] = { .len = 16, .name = "POSTCOPY_ADVISE" },
+ [MIG_CMD_POSTCOPY_LISTEN] = { .len = 0, .name = "POSTCOPY_LISTEN" },
+ [MIG_CMD_POSTCOPY_RUN] = { .len = 0, .name = "POSTCOPY_RUN" },
+ [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
+ .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
+ [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" },
+ [MIG_CMD_MAX] = { .len = -1, .name = "MAX" },
+};
+
static int announce_self_create(uint8_t *buf,
uint8_t *mac_addr)
{
@@ -138,14 +158,15 @@ static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
return qiov.size;
}
-static int block_put_buffer(void *opaque, const uint8_t *buf,
- int64_t pos, int size)
+static ssize_t block_put_buffer(void *opaque, const uint8_t *buf,
+ int64_t pos, size_t size)
{
bdrv_save_vmstate(opaque, buf, pos, size);
return size;
}
-static int block_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
+ size_t size)
{
return bdrv_load_vmstate(opaque, buf, pos, size);
}
@@ -480,7 +501,7 @@ int register_savevm_live(DeviceState *dev,
{
SaveStateEntry *se;
- se = g_malloc0(sizeof(SaveStateEntry));
+ se = g_new0(SaveStateEntry, 1);
se->version_id = version_id;
se->section_id = savevm_state.global_section_id++;
se->ops = ops;
@@ -498,7 +519,7 @@ int register_savevm_live(DeviceState *dev,
pstrcat(se->idstr, sizeof(se->idstr), "/");
g_free(id);
- se->compat = g_malloc0(sizeof(CompatEntry));
+ se->compat = g_new0(CompatEntry, 1);
pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr);
se->compat->instance_id = instance_id == -1 ?
calculate_compat_instance_id(idstr) : instance_id;
@@ -526,7 +547,7 @@ int register_savevm(DeviceState *dev,
LoadStateHandler *load_state,
void *opaque)
{
- SaveVMHandlers *ops = g_malloc0(sizeof(SaveVMHandlers));
+ SaveVMHandlers *ops = g_new0(SaveVMHandlers, 1);
ops->save_state = save_state;
ops->load_state = load_state;
return register_savevm_live(dev, idstr, instance_id, version_id,
@@ -551,9 +572,7 @@ void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque)
QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
- if (se->compat) {
- g_free(se->compat);
- }
+ g_free(se->compat);
g_free(se->ops);
g_free(se);
}
@@ -570,7 +589,7 @@ int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
/* If this triggers, alias support can be dropped for the vmsd. */
assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
- se = g_malloc0(sizeof(SaveStateEntry));
+ se = g_new0(SaveStateEntry, 1);
se->version_id = vmsd->version_id;
se->section_id = savevm_state.global_section_id++;
se->opaque = opaque;
@@ -584,7 +603,7 @@ int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
pstrcat(se->idstr, sizeof(se->idstr), "/");
g_free(id);
- se->compat = g_malloc0(sizeof(CompatEntry));
+ se->compat = g_new0(CompatEntry, 1);
pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
se->compat->instance_id = instance_id == -1 ?
calculate_compat_instance_id(vmsd->name) : instance_id;
@@ -612,9 +631,7 @@ void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd,
QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
if (se->vmsd == vmsd && se->opaque == opaque) {
QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
- if (se->compat) {
- g_free(se->compat);
- }
+ g_free(se->compat);
g_free(se);
}
}
@@ -697,6 +714,156 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
}
}
+/**
+ * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
+ * command and associated data.
+ *
+ * @f: File to send command on
+ * @command: Command type to send
+ * @len: Length of associated data
+ * @data: Data associated with command.
+ */
+void qemu_savevm_command_send(QEMUFile *f,
+ enum qemu_vm_cmd command,
+ uint16_t len,
+ uint8_t *data)
+{
+ trace_savevm_command_send(command, len);
+ qemu_put_byte(f, QEMU_VM_COMMAND);
+ qemu_put_be16(f, (uint16_t)command);
+ qemu_put_be16(f, len);
+ qemu_put_buffer(f, data, len);
+ qemu_fflush(f);
+}
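
The element written above is one section-type byte followed by two big-endian 16-bit
fields and then the raw payload. A standalone sketch of that encoding (not part of the
patch; the section-type byte and the PING command id are assumed values used only for
illustration):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>                 /* htons/htonl for big-endian fields */

    #define QEMU_VM_COMMAND_BYTE 0x08      /* assumed value, illustration only */

    static size_t encode_vm_command(uint8_t *out, uint16_t cmd,
                                    const uint8_t *data, uint16_t len)
    {
        size_t pos = 0;
        uint16_t be16;

        out[pos++] = QEMU_VM_COMMAND_BYTE; /* section type */
        be16 = htons(cmd);                 /* command id, big-endian */
        memcpy(out + pos, &be16, 2); pos += 2;
        be16 = htons(len);                 /* payload length, big-endian */
        memcpy(out + pos, &be16, 2); pos += 2;
        memcpy(out + pos, data, len);      /* payload bytes */
        return pos + len;
    }

    int main(void)
    {
        uint8_t buf[64];
        uint32_t ping = htonl(42);         /* a PING carries one be32 value */
        size_t n = encode_vm_command(buf, 2 /* assumed PING id */,
                                     (uint8_t *)&ping, sizeof(ping));
        printf("%zu bytes on the wire\n", n);   /* 1 + 2 + 2 + 4 = 9 */
        return 0;
    }
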
+
+void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
+{
+ uint32_t buf;
+
+ trace_savevm_send_ping(value);
+ buf = cpu_to_be32(value);
+ qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
+}
+
+void qemu_savevm_send_open_return_path(QEMUFile *f)
+{
+ trace_savevm_send_open_return_path();
+ qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
+}
+
+/* We have a buffer of data to send; we don't want that all to be loaded
+ * by the command itself, so the command contains just the length of the
+ * extra buffer that we then send straight after it.
+ * TODO: Must be a better way to organise that
+ *
+ * Returns:
+ * 0 on success
+ * -ve on error
+ */
+int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb)
+{
+ size_t cur_iov;
+ size_t len = qsb_get_length(qsb);
+ uint32_t tmp;
+
+ if (len > MAX_VM_CMD_PACKAGED_SIZE) {
+ error_report("%s: Unreasonably large packaged state: %zu",
+ __func__, len);
+ return -1;
+ }
+
+ tmp = cpu_to_be32(len);
+
+ trace_qemu_savevm_send_packaged();
+ qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
+
+ /* all the data follows (concatenating the iov's) */
+ for (cur_iov = 0; cur_iov < qsb->n_iov; cur_iov++) {
+ /* The iov entries are partially filled */
+ size_t towrite = MIN(qsb->iov[cur_iov].iov_len, len);
+ len -= towrite;
+
+ if (!towrite) {
+ break;
+ }
+
+ qemu_put_buffer(f, qsb->iov[cur_iov].iov_base, towrite);
+ }
+
+ return 0;
+}
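
The loop above clips the final iov entry so that exactly qsb_get_length() bytes go out.
A tiny standalone sketch of that accounting (not part of the patch; the iov sizes and
total length are made-up numbers):

    #include <stdio.h>
    #include <stddef.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        size_t iov_len[] = { 4096, 4096, 4096 };   /* partially filled buffers */
        size_t len = 9000;                         /* total payload to send */

        for (size_t i = 0; i < 3; i++) {
            size_t towrite = MIN(iov_len[i], len); /* same clipping as above */
            if (!towrite) {
                break;
            }
            len -= towrite;
            printf("iov[%zu]: write %zu bytes\n", i, towrite);
        }
        return 0;   /* prints 4096, 4096, 808 */
    }
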
+
+/* Send prior to any postcopy transfer */
+void qemu_savevm_send_postcopy_advise(QEMUFile *f)
+{
+ uint64_t tmp[2];
+ tmp[0] = cpu_to_be64(getpagesize());
+ tmp[1] = cpu_to_be64(1ul << qemu_target_page_bits());
+
+ trace_qemu_savevm_send_postcopy_advise();
+ qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp);
+}
+
+/* Sent prior to starting the destination running in postcopy; discard pages
+ * that have already been sent but were redirtied on the source.
+ * CMD_POSTCOPY_RAM_DISCARD consists of:
+ * byte version (0)
+ * byte Length of name field (not including 0)
+ * n x byte RAM block name
+ * byte 0 terminator (just for safety)
+ * n x Byte ranges within the named RAMBlock
+ * be64 Start of the range
+ * be64 Length
+ *
+ * name: RAMBlock name that these entries are part of
+ * len: Number of page entries
+ * start_list: 'len' addresses
+ * length_list: 'len' addresses
+ *
+ */
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+ uint16_t len,
+ uint64_t *start_list,
+ uint64_t *length_list)
+{
+ uint8_t *buf;
+ uint16_t tmplen;
+ uint16_t t;
+ size_t name_len = strlen(name);
+
+ trace_qemu_savevm_send_postcopy_ram_discard(name, len);
+ assert(name_len < 256);
+ buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
+ buf[0] = postcopy_ram_discard_version;
+ buf[1] = name_len;
+ memcpy(buf + 2, name, name_len);
+ tmplen = 2 + name_len;
+ buf[tmplen++] = '\0';
+
+ for (t = 0; t < len; t++) {
+ cpu_to_be64w((uint64_t *)(buf + tmplen), start_list[t]);
+ tmplen += 8;
+ cpu_to_be64w((uint64_t *)(buf + tmplen), length_list[t]);
+ tmplen += 8;
+ }
+ qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
+ g_free(buf);
+}
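
A worked example of the sizes involved (not part of the patch; the block name and the
number of ranges are made up): for a RAMBlock named "pc.ram" with two discard ranges
the payload is 1 + 1 + 6 + 1 + 2 * 16 = 41 bytes, and the receiver's check that the
remainder after the header is a whole number of 16-byte pairs holds:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *name = "pc.ram";    /* hypothetical RAMBlock name */
        unsigned ranges = 2;            /* hypothetical number of ranges */

        /* version + name-length byte + name + NUL + (start, length) pairs */
        size_t payload = 1 + 1 + strlen(name) + 1 + 16 * (size_t)ranges;
        printf("payload length = %zu\n", payload);            /* 41 */

        /* receiver side: strip the 3 header bytes plus the name; the rest
         * must be whole 16-byte (start, length) pairs */
        size_t remainder = payload - (3 + strlen(name));
        printf("range bytes = %zu, %%16 = %zu\n", remainder, remainder % 16);
        return 0;
    }
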
+
+/* Get the destination into a state where it can receive postcopy data. */
+void qemu_savevm_send_postcopy_listen(QEMUFile *f)
+{
+ trace_savevm_send_postcopy_listen();
+ qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
+}
+
+/* Kick the destination into running */
+void qemu_savevm_send_postcopy_run(QEMUFile *f)
+{
+ trace_savevm_send_postcopy_run();
+ qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
+}
+
bool qemu_savevm_state_blocked(Error **errp)
{
SaveStateEntry *se;
@@ -716,6 +883,12 @@ void qemu_savevm_state_header(QEMUFile *f)
trace_savevm_state_header();
qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
qemu_put_be32(f, QEMU_VM_FILE_VERSION);
+
+ if (!savevm_state.skip_configuration) {
+ qemu_put_byte(f, QEMU_VM_CONFIGURATION);
+ vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
+ }
+
}
void qemu_savevm_state_begin(QEMUFile *f,
@@ -732,11 +905,6 @@ void qemu_savevm_state_begin(QEMUFile *f,
se->ops->set_params(params, se->opaque);
}
- if (!savevm_state.skip_configuration) {
- qemu_put_byte(f, QEMU_VM_CONFIGURATION);
- vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
- }
-
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (!se->ops || !se->ops->save_live_setup) {
continue;
@@ -763,7 +931,7 @@ void qemu_savevm_state_begin(QEMUFile *f,
* 0 : We haven't finished, caller have to go again
* 1 : We have finished, we can go to complete phase
*/
-int qemu_savevm_state_iterate(QEMUFile *f)
+int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
{
SaveStateEntry *se;
int ret = 1;
@@ -778,6 +946,15 @@ int qemu_savevm_state_iterate(QEMUFile *f)
continue;
}
}
+ /*
+ * In the postcopy phase, any device that doesn't know how to
+ * do postcopy should have saved its state in the _complete
+ * call that's already run, it might get confused if we call
+ * iterate afterwards.
+ */
+ if (postcopy && !se->ops->save_live_complete_postcopy) {
+ continue;
+ }
if (qemu_file_rate_limit(f)) {
return 0;
}
@@ -806,24 +983,69 @@ int qemu_savevm_state_iterate(QEMUFile *f)
static bool should_send_vmdesc(void)
{
MachineState *machine = MACHINE(qdev_get_machine());
- return !machine->suppress_vmdesc;
+ bool in_postcopy = migration_in_postcopy(migrate_get_current());
+ return !machine->suppress_vmdesc && !in_postcopy;
}
-void qemu_savevm_state_complete(QEMUFile *f)
+/*
+ * Calls the save_live_complete_postcopy methods
+ * causing the last few pages to be sent immediately and doing any associated
+ * cleanup.
+ * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
+ * all the other devices, but that happens at the point we switch to postcopy.
+ */
+void qemu_savevm_state_complete_postcopy(QEMUFile *f)
+{
+ SaveStateEntry *se;
+ int ret;
+
+ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+ if (!se->ops || !se->ops->save_live_complete_postcopy) {
+ continue;
+ }
+ if (se->ops && se->ops->is_active) {
+ if (!se->ops->is_active(se->opaque)) {
+ continue;
+ }
+ }
+ trace_savevm_section_start(se->idstr, se->section_id);
+ /* Section type */
+ qemu_put_byte(f, QEMU_VM_SECTION_END);
+ qemu_put_be32(f, se->section_id);
+
+ ret = se->ops->save_live_complete_postcopy(f, se->opaque);
+ trace_savevm_section_end(se->idstr, se->section_id, ret);
+ save_section_footer(f, se);
+ if (ret < 0) {
+ qemu_file_set_error(f, ret);
+ return;
+ }
+ }
+
+ qemu_put_byte(f, QEMU_VM_EOF);
+ qemu_fflush(f);
+}
+
+void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only)
{
QJSON *vmdesc;
int vmdesc_len;
SaveStateEntry *se;
int ret;
+ bool in_postcopy = migration_in_postcopy(migrate_get_current());
- trace_savevm_state_complete();
+ trace_savevm_state_complete_precopy();
cpu_synchronize_all_states();
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (!se->ops || !se->ops->save_live_complete) {
+ if (!se->ops ||
+ (in_postcopy && se->ops->save_live_complete_postcopy) ||
+ (in_postcopy && !iterable_only) ||
+ !se->ops->save_live_complete_precopy) {
continue;
}
+
if (se->ops && se->ops->is_active) {
if (!se->ops->is_active(se->opaque)) {
continue;
@@ -833,7 +1055,7 @@ void qemu_savevm_state_complete(QEMUFile *f)
save_section_header(f, se, QEMU_VM_SECTION_END);
- ret = se->ops->save_live_complete(f, se->opaque);
+ ret = se->ops->save_live_complete_precopy(f, se->opaque);
trace_savevm_section_end(se->idstr, se->section_id, ret);
save_section_footer(f, se);
if (ret < 0) {
@@ -842,6 +1064,10 @@ void qemu_savevm_state_complete(QEMUFile *f)
}
}
+ if (iterable_only) {
+ return;
+ }
+
vmdesc = qjson_new();
json_prop_int(vmdesc, "page_size", TARGET_PAGE_SIZE);
json_start_array(vmdesc, "devices");
@@ -870,7 +1096,10 @@ void qemu_savevm_state_complete(QEMUFile *f)
save_section_footer(f, se);
}
- qemu_put_byte(f, QEMU_VM_EOF);
+ if (!in_postcopy) {
+ /* Postcopy stream will still be going */
+ qemu_put_byte(f, QEMU_VM_EOF);
+ }
json_end_array(vmdesc);
qjson_finish(vmdesc);
@@ -886,10 +1115,19 @@ void qemu_savevm_state_complete(QEMUFile *f)
qemu_fflush(f);
}
-uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
+/* Give an estimate of the amount left to be transferred,
+ * the result is split into the amount for units that can and
+ * for units that can't do postcopy.
+ */
+void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
+ uint64_t *res_non_postcopiable,
+ uint64_t *res_postcopiable)
{
SaveStateEntry *se;
- uint64_t ret = 0;
+
+ *res_non_postcopiable = 0;
+ *res_postcopiable = 0;
+
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (!se->ops || !se->ops->save_live_pending) {
@@ -900,19 +1138,19 @@ uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
continue;
}
}
- ret += se->ops->save_live_pending(f, se->opaque, max_size);
+ se->ops->save_live_pending(f, se->opaque, max_size,
+ res_non_postcopiable, res_postcopiable);
}
- return ret;
}
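
The shape of a handler under the new contract can be sketched as below (not part of
the patch; QEMUFile is stubbed out so the fragment stands alone, and the function name
and amounts are made up):

    #include <stdint.h>

    typedef struct QEMUFile QEMUFile;   /* opaque stub for illustration */

    static void demo_save_live_pending(QEMUFile *f, void *opaque,
                                       uint64_t max_size,
                                       uint64_t *non_postcopiable_pending,
                                       uint64_t *postcopiable_pending)
    {
        uint64_t remaining = 0;         /* would come from device state */

        (void)f; (void)opaque; (void)max_size;
        /* A device that cannot keep sending after the switch to postcopy adds
         * its estimate to the non-postcopiable bucket (as block_save_pending
         * now does); RAM adds to the postcopiable one instead. */
        *non_postcopiable_pending += remaining;
    }
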
-void qemu_savevm_state_cancel(void)
+void qemu_savevm_state_cleanup(void)
{
SaveStateEntry *se;
- trace_savevm_state_cancel();
+ trace_savevm_state_cleanup();
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (se->ops && se->ops->cancel) {
- se->ops->cancel(se->opaque);
+ if (se->ops && se->ops->cleanup) {
+ se->ops->cleanup(se->opaque);
}
}
}
@@ -924,6 +1162,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
.blk = 0,
.shared = 0
};
+ MigrationState *ms = migrate_init(&params);
+ ms->file = f;
if (qemu_savevm_state_blocked(errp)) {
return -EINVAL;
@@ -935,18 +1175,18 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
qemu_mutex_lock_iothread();
while (qemu_file_get_error(f) == 0) {
- if (qemu_savevm_state_iterate(f) > 0) {
+ if (qemu_savevm_state_iterate(f, false) > 0) {
break;
}
}
ret = qemu_file_get_error(f);
if (ret == 0) {
- qemu_savevm_state_complete(f);
+ qemu_savevm_state_complete_precopy(f, false);
ret = qemu_file_get_error(f);
}
+ qemu_savevm_state_cleanup();
if (ret != 0) {
- qemu_savevm_state_cancel();
error_setg_errno(errp, -ret, "Error while writing VM state");
}
return ret;
@@ -1004,6 +1244,420 @@ static SaveStateEntry *find_se(const char *idstr, int instance_id)
return NULL;
}
+enum LoadVMExitCodes {
+ /* Allow a command to quit all layers of nested loadvm loops */
+ LOADVM_QUIT = 1,
+};
+
+static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
+
+/* ------ incoming postcopy messages ------ */
+/* 'advise' arrives before any transfers just to tell us that a postcopy
+ * *might* happen - it might be skipped if precopy transferred everything
+ * quickly.
+ */
+static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
+{
+ PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+ uint64_t remote_hps, remote_tps;
+
+ trace_loadvm_postcopy_handle_advise();
+ if (ps != POSTCOPY_INCOMING_NONE) {
+ error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
+ return -1;
+ }
+
+ if (!postcopy_ram_supported_by_host()) {
+ return -1;
+ }
+
+ remote_hps = qemu_get_be64(mis->from_src_file);
+ if (remote_hps != getpagesize()) {
+ /*
+ * Some combinations of mismatch are probably possible but it gets
+ * a bit more complicated. In particular we need to place whole
+ * host pages on the dest at once, and we need to ensure that we
+ * handle dirtying to make sure we never end up sending part of
+ * a host page on its own.
+ */
+ error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
+ (int)remote_hps, getpagesize());
+ return -1;
+ }
+
+ remote_tps = qemu_get_be64(mis->from_src_file);
+ if (remote_tps != (1ul << qemu_target_page_bits())) {
+ /*
+ * Again, some differences could be dealt with, but for now keep it
+ * simple.
+ */
+ error_report("Postcopy needs matching target page sizes (s=%d d=%d)",
+ (int)remote_tps, 1 << qemu_target_page_bits());
+ return -1;
+ }
+
+ if (ram_postcopy_incoming_init(mis)) {
+ return -1;
+ }
+
+ postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+
+ return 0;
+}
+
+/* After postcopy we will be told to throw some pages away since they're
+ * dirty and will have to be demand fetched. Must happen before CPU is
+ * started.
+ * There can be 0..many of these messages, each encoding multiple pages.
+ */
+static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
+ uint16_t len)
+{
+ int tmp;
+ char ramid[256];
+ PostcopyState ps = postcopy_state_get();
+
+ trace_loadvm_postcopy_ram_handle_discard();
+
+ switch (ps) {
+ case POSTCOPY_INCOMING_ADVISE:
+ /* 1st discard */
+ tmp = postcopy_ram_prepare_discard(mis);
+ if (tmp) {
+ return tmp;
+ }
+ break;
+
+ case POSTCOPY_INCOMING_DISCARD:
+ /* Expected state */
+ break;
+
+ default:
+ error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
+ ps);
+ return -1;
+ }
+ /* We're expecting a
+ * Version (0)
+ * a RAM ID string (length byte, name, 0 term)
+ * then at least 1 16 byte chunk
+ */
+ if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
+ error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+ return -1;
+ }
+
+ tmp = qemu_get_byte(mis->from_src_file);
+ if (tmp != postcopy_ram_discard_version) {
+ error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
+ return -1;
+ }
+
+ if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
+ error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
+ return -1;
+ }
+ tmp = qemu_get_byte(mis->from_src_file);
+ if (tmp != 0) {
+ error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
+ return -1;
+ }
+
+ len -= 3 + strlen(ramid);
+ if (len % 16) {
+ error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+ return -1;
+ }
+ trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
+ while (len) {
+ uint64_t start_addr, block_length;
+ start_addr = qemu_get_be64(mis->from_src_file);
+ block_length = qemu_get_be64(mis->from_src_file);
+
+ len -= 16;
+ int ret = ram_discard_range(mis, ramid, start_addr,
+ block_length);
+ if (ret) {
+ return ret;
+ }
+ }
+ trace_loadvm_postcopy_ram_handle_discard_end();
+
+ return 0;
+}
+
+/*
+ * Triggered by a postcopy_listen command; this thread takes over reading
+ * the input stream, leaving the main thread free to carry on loading the rest
+ * of the device state (from RAM).
+ * (TODO: This could do with being in a postcopy file - but there again it's
+ * just another input loop, not that postcopy specific)
+ */
+static void *postcopy_ram_listen_thread(void *opaque)
+{
+ QEMUFile *f = opaque;
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ int load_res;
+
+ qemu_sem_post(&mis->listen_thread_sem);
+ trace_postcopy_ram_listen_thread_start();
+
+ /*
+ * Because we're a thread and not a coroutine we can't yield
+ * in qemu_file, and thus we must be blocking now.
+ */
+ qemu_file_set_blocking(f, true);
+ load_res = qemu_loadvm_state_main(f, mis);
+ /* And non-blocking again so we don't block in any cleanup */
+ qemu_file_set_blocking(f, false);
+
+ trace_postcopy_ram_listen_thread_exit();
+ if (load_res < 0) {
+ error_report("%s: loadvm failed: %d", __func__, load_res);
+ qemu_file_set_error(f, load_res);
+ } else {
+ /*
+ * This looks good, but it's possible that the device loading in the
+ * main thread hasn't finished yet, and so we might not be in 'RUN'
+ * state yet; wait for the end of the main thread.
+ */
+ qemu_event_wait(&mis->main_thread_load_event);
+ }
+ postcopy_ram_incoming_cleanup(mis);
+ /*
+ * If everything has worked fine, then the main thread has waited
+ * for us to start, and we're the last use of the mis.
+ * (If something broke then qemu will have to exit anyway since it's
+ * got a bad migration state).
+ */
+ migration_incoming_state_destroy();
+
+ if (load_res < 0) {
+ /*
+ * If something went wrong then we have a bad state so exit;
+ * depending how far we got it might be possible at this point
+ * to leave the guest running and fire MCEs for pages that never
+ * arrived as a desperate recovery step.
+ */
+ exit(EXIT_FAILURE);
+ }
+
+ return NULL;
+}
+
+/* After this message we must be able to immediately receive postcopy data */
+static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
+{
+ PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
+ trace_loadvm_postcopy_handle_listen();
+ if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
+ error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
+ return -1;
+ }
+ if (ps == POSTCOPY_INCOMING_ADVISE) {
+ /*
+ * A rare case, we entered listen without having to do any discards,
+ * so do the setup that's normally done at the time of the 1st discard.
+ */
+ postcopy_ram_prepare_discard(mis);
+ }
+
+ /*
+ * Sensitise RAM - we can now generate requests for blocks that don't exist.
+ * However, at this point the CPU shouldn't be running, and the I/O
+ * shouldn't be doing anything yet, so don't actually expect requests.
+ */
+ if (postcopy_ram_enable_notify(mis)) {
+ return -1;
+ }
+
+ if (mis->have_listen_thread) {
+ error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
+ return -1;
+ }
+
+ mis->have_listen_thread = true;
+ /* Start up the listening thread and wait for it to signal ready */
+ qemu_sem_init(&mis->listen_thread_sem, 0);
+ qemu_thread_create(&mis->listen_thread, "postcopy/listen",
+ postcopy_ram_listen_thread, mis->from_src_file,
+ QEMU_THREAD_JOINABLE);
+ qemu_sem_wait(&mis->listen_thread_sem);
+ qemu_sem_destroy(&mis->listen_thread_sem);
+
+ return 0;
+}
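
The start-up handshake above (init a semaphore, create the thread, wait for it to
post, then destroy the semaphore) is the usual pattern; a standalone sketch with plain
POSIX primitives instead of QEMU's wrappers (not part of the patch):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    static sem_t ready;

    static void *listener(void *opaque)
    {
        (void)opaque;
        sem_post(&ready);           /* tell the spawner we are up */
        /* ... would now loop reading the incoming migration stream ... */
        return NULL;
    }

    int main(void)
    {
        pthread_t th;

        sem_init(&ready, 0, 0);
        pthread_create(&th, NULL, listener, NULL);
        sem_wait(&ready);           /* don't continue until the thread is ready */
        sem_destroy(&ready);
        pthread_join(th, NULL);
        printf("listener started and finished\n");
        return 0;
    }
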
+
+/* After all discards we can start running and asking for pages */
+static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
+{
+ PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
+ Error *local_err = NULL;
+
+ trace_loadvm_postcopy_handle_run();
+ if (ps != POSTCOPY_INCOMING_LISTENING) {
+ error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
+ return -1;
+ }
+
+ /* TODO we should move all of this lot into postcopy_ram.c or shared code
+ * in migration.c
+ */
+ cpu_synchronize_all_post_init();
+
+ qemu_announce_self();
+
+ /* Make sure all file formats flush their mutable metadata */
+ bdrv_invalidate_cache_all(&local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ return -1;
+ }
+
+ trace_loadvm_postcopy_handle_run_cpu_sync();
+ cpu_synchronize_all_post_init();
+
+ trace_loadvm_postcopy_handle_run_vmstart();
+
+ if (autostart) {
+ /* Hold onto your hats, starting the CPU */
+ vm_start();
+ } else {
+ /* leave it paused and let management decide when to start the CPU */
+ runstate_set(RUN_STATE_PAUSED);
+ }
+
+ /* We need to finish reading the stream from the package
+ * and also stop reading anything more from the stream that loaded the
+ * package (since it's now being read by the listener thread).
+ * LOADVM_QUIT will quit all the layers of nested loadvm loops.
+ */
+ return LOADVM_QUIT;
+}
+
+/**
+ * Immediately following this command is a blob of data containing an embedded
+ * chunk of migration stream; read it and load it.
+ *
+ * @mis: Incoming state
+ * @length: Length of packaged data to read
+ *
+ * Returns: Negative values on error
+ *
+ */
+static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
+{
+ int ret;
+ uint8_t *buffer;
+ uint32_t length;
+ QEMUSizedBuffer *qsb;
+
+ length = qemu_get_be32(mis->from_src_file);
+ trace_loadvm_handle_cmd_packaged(length);
+
+ if (length > MAX_VM_CMD_PACKAGED_SIZE) {
+ error_report("Unreasonably large packaged state: %u", length);
+ return -1;
+ }
+ buffer = g_malloc0(length);
+ ret = qemu_get_buffer(mis->from_src_file, buffer, (int)length);
+ if (ret != length) {
+ g_free(buffer);
+ error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%d\n",
+ ret, length);
+ return (ret < 0) ? ret : -EAGAIN;
+ }
+ trace_loadvm_handle_cmd_packaged_received(ret);
+
+ /* Setup a dummy QEMUFile that actually reads from the buffer */
+ qsb = qsb_create(buffer, length);
+ g_free(buffer); /* Because qsb_create copies */
+ if (!qsb) {
+ error_report("Unable to create qsb");
+ return -1;
+ }
+ QEMUFile *packf = qemu_bufopen("r", qsb);
+
+ ret = qemu_loadvm_state_main(packf, mis);
+ trace_loadvm_handle_cmd_packaged_main(ret);
+ qemu_fclose(packf);
+ qsb_free(qsb);
+
+ return ret;
+}
+
+/*
+ * Process an incoming 'QEMU_VM_COMMAND'
+ * 0 just a normal return
+ * LOADVM_QUIT All good, but exit the loop
+ * <0 Error
+ */
+static int loadvm_process_command(QEMUFile *f)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ uint16_t cmd;
+ uint16_t len;
+ uint32_t tmp32;
+
+ cmd = qemu_get_be16(f);
+ len = qemu_get_be16(f);
+
+ trace_loadvm_process_command(cmd, len);
+ if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
+ error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
+ return -EINVAL;
+ }
+
+ if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
+ error_report("%s received with bad length - expecting %zu, got %d",
+ mig_cmd_args[cmd].name,
+ (size_t)mig_cmd_args[cmd].len, len);
+ return -ERANGE;
+ }
+
+ switch (cmd) {
+ case MIG_CMD_OPEN_RETURN_PATH:
+ if (mis->to_src_file) {
+ error_report("CMD_OPEN_RETURN_PATH called when RP already open");
+ /* Not really a problem, so don't give up */
+ return 0;
+ }
+ mis->to_src_file = qemu_file_get_return_path(f);
+ if (!mis->to_src_file) {
+ error_report("CMD_OPEN_RETURN_PATH failed");
+ return -1;
+ }
+ break;
+
+ case MIG_CMD_PING:
+ tmp32 = qemu_get_be32(f);
+ trace_loadvm_process_command_ping(tmp32);
+ if (!mis->to_src_file) {
+ error_report("CMD_PING (0x%x) received with no return path",
+ tmp32);
+ return -1;
+ }
+ migrate_send_rp_pong(mis, tmp32);
+ break;
+
+ case MIG_CMD_PACKAGED:
+ return loadvm_handle_cmd_packaged(mis);
+
+ case MIG_CMD_POSTCOPY_ADVISE:
+ return loadvm_postcopy_handle_advise(mis);
+
+ case MIG_CMD_POSTCOPY_LISTEN:
+ return loadvm_postcopy_handle_listen(mis);
+
+ case MIG_CMD_POSTCOPY_RUN:
+ return loadvm_postcopy_handle_run(mis);
+
+ case MIG_CMD_POSTCOPY_RAM_DISCARD:
+ return loadvm_postcopy_ram_handle_discard(mis, len);
+ }
+
+ return 0;
+}
+
struct LoadStateEntry {
QLIST_ENTRY(LoadStateEntry) entry;
SaveStateEntry *se;
@@ -1056,47 +1710,10 @@ void loadvm_free_handlers(MigrationIncomingState *mis)
}
}
-int qemu_loadvm_state(QEMUFile *f)
+static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
{
- MigrationIncomingState *mis = migration_incoming_get_current();
- Error *local_err = NULL;
uint8_t section_type;
- unsigned int v;
int ret;
- int file_error_after_eof = -1;
-
- if (qemu_savevm_state_blocked(&local_err)) {
- error_report_err(local_err);
- return -EINVAL;
- }
-
- v = qemu_get_be32(f);
- if (v != QEMU_VM_FILE_MAGIC) {
- error_report("Not a migration stream");
- return -EINVAL;
- }
-
- v = qemu_get_be32(f);
- if (v == QEMU_VM_FILE_VERSION_COMPAT) {
- error_report("SaveVM v2 format is obsolete and don't work anymore");
- return -ENOTSUP;
- }
- if (v != QEMU_VM_FILE_VERSION) {
- error_report("Unsupported migration stream version");
- return -ENOTSUP;
- }
-
- if (!savevm_state.skip_configuration) {
- if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
- error_report("Configuration section missing");
- return -EINVAL;
- }
- ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
-
- if (ret) {
- return ret;
- }
- }
while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
uint32_t instance_id, version_id, section_id;
@@ -1125,16 +1742,14 @@ int qemu_loadvm_state(QEMUFile *f)
if (se == NULL) {
error_report("Unknown savevm section or instance '%s' %d",
idstr, instance_id);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* Validate version */
if (version_id > se->version_id) {
error_report("savevm: unsupported version %d for '%s' v%d",
version_id, idstr, se->version_id);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* Add entry */
@@ -1149,11 +1764,10 @@ int qemu_loadvm_state(QEMUFile *f)
if (ret < 0) {
error_report("error while loading state for instance 0x%x of"
" device '%s'", instance_id, idstr);
- goto out;
+ return ret;
}
if (!check_section_footer(f, le)) {
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
break;
case QEMU_VM_SECTION_PART:
@@ -1168,29 +1782,88 @@ int qemu_loadvm_state(QEMUFile *f)
}
if (le == NULL) {
error_report("Unknown savevm section %d", section_id);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
ret = vmstate_load(f, le->se, le->version_id);
if (ret < 0) {
error_report("error while loading state section id %d(%s)",
section_id, le->se->idstr);
- goto out;
+ return ret;
}
if (!check_section_footer(f, le)) {
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
+ }
+ break;
+ case QEMU_VM_COMMAND:
+ ret = loadvm_process_command(f);
+ trace_qemu_loadvm_state_section_command(ret);
+ if ((ret < 0) || (ret & LOADVM_QUIT)) {
+ return ret;
}
break;
default:
error_report("Unknown savevm section type %d", section_type);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
}
- file_error_after_eof = qemu_file_get_error(f);
+ return 0;
+}
+
+int qemu_loadvm_state(QEMUFile *f)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ Error *local_err = NULL;
+ unsigned int v;
+ int ret;
+
+ if (qemu_savevm_state_blocked(&local_err)) {
+ error_report_err(local_err);
+ return -EINVAL;
+ }
+
+ v = qemu_get_be32(f);
+ if (v != QEMU_VM_FILE_MAGIC) {
+ error_report("Not a migration stream");
+ return -EINVAL;
+ }
+
+ v = qemu_get_be32(f);
+ if (v == QEMU_VM_FILE_VERSION_COMPAT) {
+ error_report("SaveVM v2 format is obsolete and don't work anymore");
+ return -ENOTSUP;
+ }
+ if (v != QEMU_VM_FILE_VERSION) {
+ error_report("Unsupported migration stream version");
+ return -ENOTSUP;
+ }
+
+ if (!savevm_state.skip_configuration) {
+ if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
+ error_report("Configuration section missing");
+ return -EINVAL;
+ }
+ ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
+
+ if (ret) {
+ return ret;
+ }
+ }
+
+ ret = qemu_loadvm_state_main(f, mis);
+ qemu_event_set(&mis->main_thread_load_event);
+
+ trace_qemu_loadvm_state_post_main(ret);
+
+ if (mis->have_listen_thread) {
+ /* Listen thread still going, can't clean up yet */
+ return ret;
+ }
+
+ if (ret == 0) {
+ ret = qemu_file_get_error(f);
+ }
/*
* Try to read in the VMDESC section as well, so that dumping tools that
@@ -1202,10 +1875,10 @@ int qemu_loadvm_state(QEMUFile *f)
* We also mustn't read data that isn't there; some transports (RDMA)
* will stall waiting for that data when the source has already closed.
*/
- if (should_send_vmdesc()) {
+ if (ret == 0 && should_send_vmdesc()) {
uint8_t *buf;
uint32_t size;
- section_type = qemu_get_byte(f);
+ uint8_t section_type = qemu_get_byte(f);
if (section_type != QEMU_VM_VMDESCRIPTION) {
error_report("Expected vmdescription section, but got %d",
@@ -1229,57 +1902,9 @@ int qemu_loadvm_state(QEMUFile *f)
cpu_synchronize_all_post_init();
- ret = 0;
-
-out:
- if (ret == 0) {
- /* We may not have a VMDESC section, so ignore relative errors */
- ret = file_error_after_eof;
- }
-
return ret;
}
-static BlockDriverState *find_vmstate_bs(void)
-{
- BlockDriverState *bs = NULL;
- while ((bs = bdrv_next(bs))) {
- if (bdrv_can_snapshot(bs)) {
- return bs;
- }
- }
- return NULL;
-}
-
-/*
- * Deletes snapshots of a given name in all opened images.
- */
-static int del_existing_snapshots(Monitor *mon, const char *name)
-{
- BlockDriverState *bs;
- QEMUSnapshotInfo sn1, *snapshot = &sn1;
- Error *err = NULL;
-
- bs = NULL;
- while ((bs = bdrv_next(bs))) {
- if (bdrv_can_snapshot(bs) &&
- bdrv_snapshot_find(bs, snapshot, name) >= 0) {
- bdrv_snapshot_delete_by_id_or_name(bs, name, &err);
- if (err) {
- monitor_printf(mon,
- "Error while deleting snapshot on device '%s':"
- " %s\n",
- bdrv_get_device_name(bs),
- error_get_pretty(err));
- error_free(err);
- return -1;
- }
- }
- }
-
- return 0;
-}
-
void hmp_savevm(Monitor *mon, const QDict *qdict)
{
BlockDriverState *bs, *bs1;
@@ -1292,27 +1917,29 @@ void hmp_savevm(Monitor *mon, const QDict *qdict)
struct tm tm;
const char *name = qdict_get_try_str(qdict, "name");
Error *local_err = NULL;
+ AioContext *aio_context;
- /* Verify if there is a device that doesn't support snapshots and is writable */
- bs = NULL;
- while ((bs = bdrv_next(bs))) {
-
- if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
- continue;
- }
+ if (!bdrv_all_can_snapshot(&bs)) {
+ monitor_printf(mon, "Device '%s' is writable but does not "
+ "support snapshots.\n", bdrv_get_device_name(bs));
+ return;
+ }
- if (!bdrv_can_snapshot(bs)) {
- monitor_printf(mon, "Device '%s' is writable but does not support snapshots.\n",
- bdrv_get_device_name(bs));
- return;
- }
+ /* Delete old snapshots of the same name */
+ if (name && bdrv_all_delete_snapshot(name, &bs1, &local_err) < 0) {
+ monitor_printf(mon,
+ "Error while deleting snapshot on device '%s': %s\n",
+ bdrv_get_device_name(bs1), error_get_pretty(local_err));
+ error_free(local_err);
+ return;
}
- bs = find_vmstate_bs();
- if (!bs) {
+ bs = bdrv_all_find_vmstate_bs();
+ if (bs == NULL) {
monitor_printf(mon, "No block device can accept snapshots\n");
return;
}
+ aio_context = bdrv_get_aio_context(bs);
saved_vm_running = runstate_is_running();
@@ -1323,6 +1950,8 @@ void hmp_savevm(Monitor *mon, const QDict *qdict)
}
vm_stop(RUN_STATE_SAVE_VM);
+ aio_context_acquire(aio_context);
+
memset(sn, 0, sizeof(*sn));
/* fill auxiliary fields */
@@ -1345,11 +1974,6 @@ void hmp_savevm(Monitor *mon, const QDict *qdict)
strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm);
}
- /* Delete old snapshots of the same name */
- if (name && del_existing_snapshots(mon, name) < 0) {
- goto the_end;
- }
-
/* save the VM state */
f = qemu_fopen_bdrv(bs, 1);
if (!f) {
@@ -1365,22 +1989,14 @@ void hmp_savevm(Monitor *mon, const QDict *qdict)
goto the_end;
}
- /* create the snapshots */
-
- bs1 = NULL;
- while ((bs1 = bdrv_next(bs1))) {
- if (bdrv_can_snapshot(bs1)) {
- /* Write VM state size only to the image that contains the state */
- sn->vm_state_size = (bs == bs1 ? vm_state_size : 0);
- ret = bdrv_snapshot_create(bs1, sn);
- if (ret < 0) {
- monitor_printf(mon, "Error while creating snapshot on '%s'\n",
- bdrv_get_device_name(bs1));
- }
- }
+ ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
+ if (ret < 0) {
+ monitor_printf(mon, "Error while creating snapshot on '%s'\n",
+ bdrv_get_device_name(bs));
}
the_end:
+ aio_context_release(aio_context);
if (saved_vm_running) {
vm_start();
}
@@ -1419,15 +2035,31 @@ int load_vmstate(const char *name)
QEMUSnapshotInfo sn;
QEMUFile *f;
int ret;
+ AioContext *aio_context;
- bs_vm_state = find_vmstate_bs();
+ if (!bdrv_all_can_snapshot(&bs)) {
+ error_report("Device '%s' is writable but does not support snapshots.",
+ bdrv_get_device_name(bs));
+ return -ENOTSUP;
+ }
+ ret = bdrv_all_find_snapshot(name, &bs);
+ if (ret < 0) {
+ error_report("Device '%s' does not have the requested snapshot '%s'",
+ bdrv_get_device_name(bs), name);
+ return ret;
+ }
+
+ bs_vm_state = bdrv_all_find_vmstate_bs();
if (!bs_vm_state) {
error_report("No block device supports snapshots");
return -ENOTSUP;
}
+ aio_context = bdrv_get_aio_context(bs_vm_state);
/* Don't even try to load empty VM states */
+ aio_context_acquire(aio_context);
ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
+ aio_context_release(aio_context);
if (ret < 0) {
return ret;
} else if (sn.vm_state_size == 0) {
@@ -1436,42 +2068,14 @@ int load_vmstate(const char *name)
return -EINVAL;
}
- /* Verify if there is any device that doesn't support snapshots and is
- writable and check if the requested snapshot is available too. */
- bs = NULL;
- while ((bs = bdrv_next(bs))) {
-
- if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
- continue;
- }
-
- if (!bdrv_can_snapshot(bs)) {
- error_report("Device '%s' is writable but does not support snapshots.",
- bdrv_get_device_name(bs));
- return -ENOTSUP;
- }
-
- ret = bdrv_snapshot_find(bs, &sn, name);
- if (ret < 0) {
- error_report("Device '%s' does not have the requested snapshot '%s'",
- bdrv_get_device_name(bs), name);
- return ret;
- }
- }
-
/* Flush all IO requests so they don't interfere with the new state. */
bdrv_drain_all();
- bs = NULL;
- while ((bs = bdrv_next(bs))) {
- if (bdrv_can_snapshot(bs)) {
- ret = bdrv_snapshot_goto(bs, name);
- if (ret < 0) {
- error_report("Error %d while activating snapshot '%s' on '%s'",
- ret, name, bdrv_get_device_name(bs));
- return ret;
- }
- }
+ ret = bdrv_all_goto_snapshot(name, &bs);
+ if (ret < 0) {
+ error_report("Error %d while activating snapshot '%s' on '%s'",
+ ret, name, bdrv_get_device_name(bs));
+ return ret;
}
/* restore the VM state */
@@ -1483,9 +2087,12 @@ int load_vmstate(const char *name)
qemu_system_reset(VMRESET_SILENT);
migration_incoming_state_new(f);
- ret = qemu_loadvm_state(f);
+ aio_context_acquire(aio_context);
+ ret = qemu_loadvm_state(f);
qemu_fclose(f);
+ aio_context_release(aio_context);
+
migration_incoming_state_destroy();
if (ret < 0) {
error_report("Error %d while loading VM state", ret);
@@ -1501,43 +2108,34 @@ void hmp_delvm(Monitor *mon, const QDict *qdict)
Error *err;
const char *name = qdict_get_str(qdict, "name");
- if (!find_vmstate_bs()) {
- monitor_printf(mon, "No block device supports snapshots\n");
- return;
- }
-
- bs = NULL;
- while ((bs = bdrv_next(bs))) {
- if (bdrv_can_snapshot(bs)) {
- err = NULL;
- bdrv_snapshot_delete_by_id_or_name(bs, name, &err);
- if (err) {
- monitor_printf(mon,
- "Error while deleting snapshot on device '%s':"
- " %s\n",
- bdrv_get_device_name(bs),
- error_get_pretty(err));
- error_free(err);
- }
- }
+ if (bdrv_all_delete_snapshot(name, &bs, &err) < 0) {
+ monitor_printf(mon,
+ "Error while deleting snapshot on device '%s': %s\n",
+ bdrv_get_device_name(bs), error_get_pretty(err));
+ error_free(err);
}
}
void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
{
BlockDriverState *bs, *bs1;
- QEMUSnapshotInfo *sn_tab, *sn, s, *sn_info = &s;
- int nb_sns, i, ret, available;
+ QEMUSnapshotInfo *sn_tab, *sn;
+ int nb_sns, i;
int total;
int *available_snapshots;
+ AioContext *aio_context;
- bs = find_vmstate_bs();
+ bs = bdrv_all_find_vmstate_bs();
if (!bs) {
monitor_printf(mon, "No available block device supports snapshots\n");
return;
}
+ aio_context = bdrv_get_aio_context(bs);
+ aio_context_acquire(aio_context);
nb_sns = bdrv_snapshot_list(bs, &sn_tab);
+ aio_context_release(aio_context);
+
if (nb_sns < 0) {
monitor_printf(mon, "bdrv_snapshot_list: error %d\n", nb_sns);
return;
@@ -1548,24 +2146,10 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
return;
}
- available_snapshots = g_malloc0(sizeof(int) * nb_sns);
+ available_snapshots = g_new0(int, nb_sns);
total = 0;
for (i = 0; i < nb_sns; i++) {
- sn = &sn_tab[i];
- available = 1;
- bs1 = NULL;
-
- while ((bs1 = bdrv_next(bs1))) {
- if (bdrv_can_snapshot(bs1) && bs1 != bs) {
- ret = bdrv_snapshot_find(bs1, sn_info, sn->id_str);
- if (ret < 0) {
- available = 0;
- break;
- }
- }
- }
-
- if (available) {
+ if (bdrv_all_find_snapshot(sn_tab[i].id_str, &bs1) == 0) {
available_snapshots[total] = i;
total++;
}