mirror of
https://gitlab.com/libvirt/libvirt.git
synced 2025-01-22 12:35:17 +00:00
qemu: Remember failed post-copy migration in job
When post-copy migration fails, the domain stays running on the destination with a VIR_DOMAIN_RUNNING_POSTCOPY_FAILED reason. Both the state and the reason can later be rewritten in case the domain gets paused for other reasons (such as an I/O error). Thus we need a separate place to remember that the post-copy migration failed, so that the migration can still be resumed.

https://bugzilla.redhat.com/show_bug.cgi?id=2111948

Signed-off-by: Jiri Denemark <jdenemar@redhat.com>
Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
This commit is contained in:
parent
49a5754063
commit
7050dad5f9
@ -27922,8 +27922,13 @@ virDomainObjGetState(virDomainObj *dom, int *reason)
|
||||
|
||||
bool
|
||||
virDomainObjIsFailedPostcopy(virDomainObj *dom,
|
||||
virDomainJobObj *job G_GNUC_UNUSED)
|
||||
virDomainJobObj *job)
|
||||
{
|
||||
if (job && job->asyncPaused &&
|
||||
(job->asyncJob == VIR_ASYNC_JOB_MIGRATION_IN ||
|
||||
job->asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT))
|
||||
return true;
|
||||
|
||||
return ((dom->state.state == VIR_DOMAIN_PAUSED &&
|
||||
dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) ||
|
||||
(dom->state.state == VIR_DOMAIN_RUNNING &&
|
||||
|
@ -174,6 +174,7 @@ virDomainObjResetAsyncJob(virDomainJobObj *job)
|
||||
job->asyncOwner = 0;
|
||||
g_clear_pointer(&job->asyncOwnerAPI, g_free);
|
||||
job->asyncStarted = 0;
|
||||
job->asyncPaused = false;
|
||||
job->phase = 0;
|
||||
job->mask = VIR_JOB_DEFAULT_MASK;
|
||||
job->abortJob = false;
|
||||
|
@ -176,6 +176,7 @@ struct _virDomainJobObj {
|
||||
unsigned long long asyncOwner; /* Thread which set current async job */
|
||||
char *asyncOwnerAPI; /* The API which owns the async job */
|
||||
unsigned long long asyncStarted; /* When the current async job started */
|
||||
bool asyncPaused; /* The async job is paused */
|
||||
int phase; /* Job phase (mainly for migrations) */
|
||||
unsigned long long mask; /* Jobs allowed during async job */
|
||||
virDomainJobData *current; /* async job progress data */
|
||||
|
@ -695,6 +695,8 @@ qemuDomainObjPrivateXMLFormatJob(virBuffer *buf,
|
||||
if (vm->job->asyncJob != VIR_ASYNC_JOB_NONE) {
|
||||
virBufferAsprintf(&attrBuf, " flags='0x%x'", vm->job->apiFlags);
|
||||
virBufferAsprintf(&attrBuf, " asyncStarted='%llu'", vm->job->asyncStarted);
|
||||
if (vm->job->asyncPaused)
|
||||
virBufferAddLit(&attrBuf, " asyncPaused='yes'");
|
||||
}
|
||||
|
||||
if (vm->job->cb &&
|
||||
@ -732,6 +734,7 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm,
|
||||
|
||||
if ((tmp = virXPathString("string(@async)", ctxt))) {
|
||||
int async;
|
||||
virTristateBool paused;
|
||||
|
||||
if ((async = virDomainAsyncJobTypeFromString(tmp)) < 0) {
|
||||
virReportError(VIR_ERR_INTERNAL_ERROR,
|
||||
@ -757,6 +760,12 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm,
|
||||
_("Invalid async job start"));
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (virXMLPropTristateBool(ctxt->node, "asyncPaused", VIR_XML_PROP_NONE,
|
||||
&paused) < 0)
|
||||
return -1;
|
||||
|
||||
vm->job->asyncPaused = paused == VIR_TRISTATE_BOOL_YES;
|
||||
}
|
||||
|
||||
if (virXMLPropUInt(ctxt->node, "flags", 16, VIR_XML_PROP_NONE,
|
||||
|
@ -1664,17 +1664,19 @@ qemuMigrationSrcPostcopyFailed(virDomainObj *vm)
|
||||
|
||||
state = virDomainObjGetState(vm, &reason);
|
||||
|
||||
VIR_DEBUG("%s/%s",
|
||||
VIR_DEBUG("%s/%s, asyncPaused=%u",
|
||||
virDomainStateTypeToString(state),
|
||||
virDomainStateReasonToString(state, reason));
|
||||
virDomainStateReasonToString(state, reason),
|
||||
vm->job->asyncPaused);
|
||||
|
||||
if (state != VIR_DOMAIN_PAUSED ||
|
||||
reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED)
|
||||
virDomainObjIsFailedPostcopy(vm, vm->job))
|
||||
return;
|
||||
|
||||
VIR_WARN("Migration of domain %s failed during post-copy; "
|
||||
"leaving the domain paused", vm->def->name);
|
||||
|
||||
vm->job->asyncPaused = true;
|
||||
virDomainObjSetState(vm, VIR_DOMAIN_PAUSED,
|
||||
VIR_DOMAIN_PAUSED_POSTCOPY_FAILED);
|
||||
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED,
|
||||
@ -1694,21 +1696,31 @@ qemuMigrationDstPostcopyFailed(virDomainObj *vm)
|
||||
|
||||
state = virDomainObjGetState(vm, &reason);
|
||||
|
||||
VIR_DEBUG("%s/%s",
|
||||
VIR_DEBUG("%s/%s, asyncPaused=%u",
|
||||
virDomainStateTypeToString(state),
|
||||
virDomainStateReasonToString(state, reason));
|
||||
virDomainStateReasonToString(state, reason),
|
||||
vm->job->asyncPaused);
|
||||
|
||||
if (state != VIR_DOMAIN_RUNNING ||
|
||||
reason == VIR_DOMAIN_RUNNING_POSTCOPY_FAILED)
|
||||
if ((state != VIR_DOMAIN_RUNNING && state != VIR_DOMAIN_PAUSED) ||
|
||||
virDomainObjIsFailedPostcopy(vm, vm->job))
|
||||
return;
|
||||
|
||||
VIR_WARN("Incoming migration of domain '%s' failed during post-copy; "
|
||||
"leaving the domain running", vm->def->name);
|
||||
|
||||
virDomainObjSetState(vm, VIR_DOMAIN_RUNNING,
|
||||
VIR_DOMAIN_RUNNING_POSTCOPY_FAILED);
|
||||
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED,
|
||||
VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED);
|
||||
vm->job->asyncPaused = true;
|
||||
if (state == VIR_DOMAIN_RUNNING) {
|
||||
virDomainObjSetState(vm, VIR_DOMAIN_RUNNING,
|
||||
VIR_DOMAIN_RUNNING_POSTCOPY_FAILED);
|
||||
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED,
|
||||
VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED);
|
||||
} else {
|
||||
/* The domain was paused for other reasons (I/O error, ...) so we don't
|
||||
* want to rewrite the original reason and just emit a postcopy-failed
|
||||
* event. */
|
||||
event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED,
|
||||
VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED);
|
||||
}
|
||||
virObjectEventStateQueue(driver->domainEventState, event);
|
||||
}
|
||||
|
||||
|
@ -712,6 +712,15 @@ qemuProcessHandleResume(qemuMonitor *mon G_GNUC_UNUSED,
|
||||
vm->def->name, virDomainRunningReasonTypeToString(reason),
|
||||
eventDetail);
|
||||
|
||||
/* When a domain is running in (failed) post-copy migration on the
|
||||
* destination host, we need to make sure to set the appropriate reason
|
||||
* here. */
|
||||
if (virDomainObjIsPostcopy(vm, vm->job)) {
|
||||
if (virDomainObjIsFailedPostcopy(vm, vm->job))
|
||||
reason = VIR_DOMAIN_RUNNING_POSTCOPY_FAILED;
|
||||
else
|
||||
reason = VIR_DOMAIN_RUNNING_POSTCOPY;
|
||||
}
|
||||
virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
|
||||
event = virDomainEventLifecycleNewFromObj(vm,
|
||||
VIR_DOMAIN_EVENT_RESUMED,
|
||||
@ -1491,6 +1500,7 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED,
|
||||
vm->def->name,
|
||||
virDomainStateTypeToString(state),
|
||||
NULLSTR(virDomainStateReasonToString(state, reason)));
|
||||
vm->job->asyncPaused = false;
|
||||
virDomainObjSetState(vm, state, reason);
|
||||
event = virDomainEventLifecycleNewFromObj(vm, eventType, eventDetail);
|
||||
qemuDomainSaveStatus(vm);
|
||||
@ -3420,6 +3430,7 @@ qemuProcessRestoreMigrationJob(virDomainObj *vm,
|
||||
job->privateData = g_steal_pointer(&vm->job->privateData);
|
||||
vm->job->privateData = jobPriv;
|
||||
vm->job->apiFlags = job->apiFlags;
|
||||
vm->job->asyncPaused = job->asyncPaused;
|
||||
|
||||
qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob);
|
||||
}
|
||||
@ -3645,6 +3656,7 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
|
||||
if (migStatus == VIR_DOMAIN_JOB_STATUS_POSTCOPY) {
|
||||
VIR_DEBUG("Post-copy migration of domain %s still running, it will be handled as unattended",
|
||||
vm->def->name);
|
||||
vm->job->asyncPaused = false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -3653,6 +3665,9 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
|
||||
qemuMigrationSrcPostcopyFailed(vm);
|
||||
else
|
||||
qemuMigrationDstPostcopyFailed(vm);
|
||||
/* Set the asyncPaused flag in case we're reconnecting to a domain
|
||||
* started by an older libvirt. */
|
||||
vm->job->asyncPaused = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user