@@ -494,6 +494,7 @@ int migrate_init(MigrationState *s, Error **errp);
bool migration_is_blocked(Error **errp);
/* True if outgoing migration has entered postcopy phase */
bool migration_in_postcopy(void);
+bool migration_postcopy_is_alive(int state);
MigrationState *migrate_get_current(void);
uint64_t ram_get_total_transferred_pages(void);
@@ -534,8 +535,11 @@ void migration_populate_vfio_info(MigrationInfo *info);
void migration_reset_vfio_bytes_transferred(void);
void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
-/* Migration thread waiting for return path thread. */
-void migration_rp_wait(MigrationState *s);
+/*
+ * Migration thread waiting for return path thread. Return 0 if no error
+ * is detected, non-zero otherwise.
+ */
+int migration_rp_wait(MigrationState *s);
/*
* Kick the migration thread waiting for return path messages. NOTE: the
* name can be slightly confusing (when read as "kick the rp thread"), just
@@ -1393,6 +1393,17 @@ bool migration_in_postcopy(void)
}
}
+/*
+ * Return true if @state is MIGRATION_STATUS_POSTCOPY_ACTIVE or
+ * MIGRATION_STATUS_POSTCOPY_RECOVER, and false for any other value.
+ *
+ * The state is passed in by the caller rather than read from a global, so
+ * the same check can be applied to either the outgoing or the incoming
+ * migration state.
+ */
+bool migration_postcopy_is_alive(int state)
+{
+ switch (state) {
+ case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+ case MIGRATION_STATUS_POSTCOPY_RECOVER:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool migration_in_postcopy_after_devices(MigrationState *s)
{
return migration_in_postcopy() && s->postcopy_after_devices;
@@ -1673,8 +1684,15 @@ void qmp_migrate_pause(Error **errp)
MigrationIncomingState *mis = migration_incoming_get_current();
int ret = 0;
- if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+ if (migration_postcopy_is_alive(ms->state)) {
/* Source side, during postcopy */
+ Error *error = NULL;
+
+ /* Tell the core migration that we're pausing */
+ error_setg(&error, "Postcopy migration is paused by the user");
+ migrate_set_error(ms, error);
+ error_free(error);
+
qemu_mutex_lock(&ms->qemu_file_lock);
if (ms->to_dst_file) {
ret = qemu_file_shutdown(ms->to_dst_file);
@@ -1683,10 +1701,17 @@ void qmp_migrate_pause(Error **errp)
if (ret) {
error_setg(errp, "Failed to pause source migration");
}
+
+ /*
+ * Kick the migration thread out of any waiting windows (on behalf
+ * of the rp thread).
+ */
+ migration_rp_kick(ms);
+
return;
}
- if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+ if (migration_postcopy_is_alive(mis->state)) {
ret = qemu_file_shutdown(mis->from_src_file);
if (ret) {
error_setg(errp, "Failed to pause destination migration");
@@ -1695,7 +1720,7 @@ void qmp_migrate_pause(Error **errp)
}
error_setg(errp, "migrate-pause is currently only supported "
- "during postcopy-active state");
+ "during postcopy-active or postcopy-recover state");
}
bool migration_is_blocked(Error **errp)
@@ -1882,9 +1907,21 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp)
qemu_sem_post(&s->pause_sem);
}
-void migration_rp_wait(MigrationState *s)
+/*
+ * Wait on s->rp_state.rp_sem, unless a migration error is already set.
+ *
+ * Returns -1 if migrate_has_error(s) is true either before the wait or
+ * after waking up, and 0 otherwise. The visible callers loop on this
+ * function and return -1 as soon as it reports non-zero.
+ */
+int migration_rp_wait(MigrationState *s)
{
+ /* If migration has already failed, skip the wait entirely */
+ if (migrate_has_error(s)) {
+ return -1;
+ }
+
qemu_sem_wait(&s->rp_state.rp_sem);
+
+ /*
+ * Re-check after waking up: an error may have been set while we were
+ * waiting (e.g. qmp_migrate_pause() sets one and then kicks us).
+ */
+ if (migrate_has_error(s)) {
+ return -1;
+ }
+
+ return 0;
}
void migration_rp_kick(MigrationState *s)
@@ -2146,6 +2183,20 @@ out:
trace_source_return_path_thread_bad_end();
}
+ if (ms->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
+ /*
+ * This is extremely unlikely: we hit yet another network issue while
+ * recovering from the first network failure. During this period the
+ * main migration thread can be waiting on rp_sem for this thread to
+ * sync with the other side.
+ *
+ * When this happens, explicitly kick the migration thread out of
+ * RECOVER stage and back to PAUSED, so the admin can try
+ * everything again.
+ */
+ migration_rp_kick(ms);
+ }
+
trace_source_return_path_thread_end();
rcu_unregister_thread();
@@ -2611,7 +2662,9 @@ static int postcopy_resume_handshake(MigrationState *s)
qemu_savevm_send_postcopy_resume(s->to_dst_file);
while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
- migration_rp_wait(s);
+ if (migration_rp_wait(s)) {
+ return -1;
+ }
}
if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
@@ -4099,7 +4099,9 @@ static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
/* Wait until all the ramblocks' dirty bitmap synced */
while (qatomic_read(&rs->postcopy_bmap_sync_requested)) {
- migration_rp_wait(s);
+ if (migration_rp_wait(s)) {
+ return -1;
+ }
}
trace_ram_dirty_bitmap_sync_complete();