qemu: Remember failed post-copy migration in job

When post-copy migration fails, the domain stays running on the
destination with a VIR_DOMAIN_RUNNING_POSTCOPY_FAILED reason. Both the
state and the reason can later be rewritten in case the domain gets
paused for other reasons (such as an I/O error). Thus we need a separate
place to remember that the post-copy migration failed, so that we are
still able to resume the migration.

https://bugzilla.redhat.com/show_bug.cgi?id=2111948

Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
This commit is contained in:
Jiri Denemark 2022-12-15 14:12:43 +01:00
parent 49a5754063
commit 7050dad5f9
6 changed files with 55 additions and 12 deletions

View File

@ -27922,8 +27922,13 @@ virDomainObjGetState(virDomainObj *dom, int *reason)
bool
virDomainObjIsFailedPostcopy(virDomainObj *dom,
virDomainJobObj *job G_GNUC_UNUSED)
virDomainJobObj *job)
{
if (job && job->asyncPaused &&
(job->asyncJob == VIR_ASYNC_JOB_MIGRATION_IN ||
job->asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT))
return true;
return ((dom->state.state == VIR_DOMAIN_PAUSED &&
dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) ||
(dom->state.state == VIR_DOMAIN_RUNNING &&

View File

@ -174,6 +174,7 @@ virDomainObjResetAsyncJob(virDomainJobObj *job)
job->asyncOwner = 0;
g_clear_pointer(&job->asyncOwnerAPI, g_free);
job->asyncStarted = 0;
job->asyncPaused = false;
job->phase = 0;
job->mask = VIR_JOB_DEFAULT_MASK;
job->abortJob = false;

View File

@ -176,6 +176,7 @@ struct _virDomainJobObj {
unsigned long long asyncOwner; /* Thread which set current async job */
char *asyncOwnerAPI; /* The API which owns the async job */
unsigned long long asyncStarted; /* When the current async job started */
bool asyncPaused; /* The async job is paused */
int phase; /* Job phase (mainly for migrations) */
unsigned long long mask; /* Jobs allowed during async job */
virDomainJobData *current; /* async job progress data */

View File

@ -695,6 +695,8 @@ qemuDomainObjPrivateXMLFormatJob(virBuffer *buf,
if (vm->job->asyncJob != VIR_ASYNC_JOB_NONE) {
virBufferAsprintf(&attrBuf, " flags='0x%x'", vm->job->apiFlags);
virBufferAsprintf(&attrBuf, " asyncStarted='%llu'", vm->job->asyncStarted);
if (vm->job->asyncPaused)
virBufferAddLit(&attrBuf, " asyncPaused='yes'");
}
if (vm->job->cb &&
@ -732,6 +734,7 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm,
if ((tmp = virXPathString("string(@async)", ctxt))) {
int async;
virTristateBool paused;
if ((async = virDomainAsyncJobTypeFromString(tmp)) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
@ -757,6 +760,12 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm,
_("Invalid async job start"));
return -1;
}
if (virXMLPropTristateBool(ctxt->node, "asyncPaused", VIR_XML_PROP_NONE,
&paused) < 0)
return -1;
vm->job->asyncPaused = paused == VIR_TRISTATE_BOOL_YES;
}
if (virXMLPropUInt(ctxt->node, "flags", 16, VIR_XML_PROP_NONE,

View File

@ -1664,17 +1664,19 @@ qemuMigrationSrcPostcopyFailed(virDomainObj *vm)
state = virDomainObjGetState(vm, &reason);
VIR_DEBUG("%s/%s",
VIR_DEBUG("%s/%s, asyncPaused=%u",
virDomainStateTypeToString(state),
virDomainStateReasonToString(state, reason));
virDomainStateReasonToString(state, reason),
vm->job->asyncPaused);
if (state != VIR_DOMAIN_PAUSED ||
reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED)
virDomainObjIsFailedPostcopy(vm, vm->job))
return;
VIR_WARN("Migration of domain %s failed during post-copy; "
"leaving the domain paused", vm->def->name);
vm->job->asyncPaused = true;
virDomainObjSetState(vm, VIR_DOMAIN_PAUSED,
VIR_DOMAIN_PAUSED_POSTCOPY_FAILED);
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED,
@ -1694,21 +1696,31 @@ qemuMigrationDstPostcopyFailed(virDomainObj *vm)
state = virDomainObjGetState(vm, &reason);
VIR_DEBUG("%s/%s",
VIR_DEBUG("%s/%s, asyncPaused=%u",
virDomainStateTypeToString(state),
virDomainStateReasonToString(state, reason));
virDomainStateReasonToString(state, reason),
vm->job->asyncPaused);
if (state != VIR_DOMAIN_RUNNING ||
reason == VIR_DOMAIN_RUNNING_POSTCOPY_FAILED)
if ((state != VIR_DOMAIN_RUNNING && state != VIR_DOMAIN_PAUSED) ||
virDomainObjIsFailedPostcopy(vm, vm->job))
return;
VIR_WARN("Incoming migration of domain '%s' failed during post-copy; "
"leaving the domain running", vm->def->name);
virDomainObjSetState(vm, VIR_DOMAIN_RUNNING,
VIR_DOMAIN_RUNNING_POSTCOPY_FAILED);
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED,
VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED);
vm->job->asyncPaused = true;
if (state == VIR_DOMAIN_RUNNING) {
virDomainObjSetState(vm, VIR_DOMAIN_RUNNING,
VIR_DOMAIN_RUNNING_POSTCOPY_FAILED);
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED,
VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED);
} else {
/* The domain was paused for other reasons (I/O error, ...), so we keep
* the original reason untouched and only emit a postcopy-failed
* event. */
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED,
VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED);
}
virObjectEventStateQueue(driver->domainEventState, event);
}

View File

@ -712,6 +712,15 @@ qemuProcessHandleResume(qemuMonitor *mon G_GNUC_UNUSED,
vm->def->name, virDomainRunningReasonTypeToString(reason),
eventDetail);
/* When a domain is running in (failed) post-copy migration on the
* destination host, we need to make sure to set the appropriate reason
* here. */
if (virDomainObjIsPostcopy(vm, vm->job)) {
if (virDomainObjIsFailedPostcopy(vm, vm->job))
reason = VIR_DOMAIN_RUNNING_POSTCOPY_FAILED;
else
reason = VIR_DOMAIN_RUNNING_POSTCOPY;
}
virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
event = virDomainEventLifecycleNewFromObj(vm,
VIR_DOMAIN_EVENT_RESUMED,
@ -1491,6 +1500,7 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED,
vm->def->name,
virDomainStateTypeToString(state),
NULLSTR(virDomainStateReasonToString(state, reason)));
vm->job->asyncPaused = false;
virDomainObjSetState(vm, state, reason);
event = virDomainEventLifecycleNewFromObj(vm, eventType, eventDetail);
qemuDomainSaveStatus(vm);
@ -3420,6 +3430,7 @@ qemuProcessRestoreMigrationJob(virDomainObj *vm,
job->privateData = g_steal_pointer(&vm->job->privateData);
vm->job->privateData = jobPriv;
vm->job->apiFlags = job->apiFlags;
vm->job->asyncPaused = job->asyncPaused;
qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob);
}
@ -3645,6 +3656,7 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
if (migStatus == VIR_DOMAIN_JOB_STATUS_POSTCOPY) {
VIR_DEBUG("Post-copy migration of domain %s still running, it will be handled as unattended",
vm->def->name);
vm->job->asyncPaused = false;
return 0;
}
@ -3653,6 +3665,9 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
qemuMigrationSrcPostcopyFailed(vm);
else
qemuMigrationDstPostcopyFailed(vm);
/* Set the asyncPaused flag in case we're reconnecting to a domain
* started by an older libvirt. */
vm->job->asyncPaused = true;
return 0;
}