mirror of
https://gitlab.com/libvirt/libvirt.git
synced 2025-01-23 04:55:18 +00:00
qemu: drop driver lock while trying to terminate qemu process
This patch is based on an earlier patch by Eric Blake which was never committed: https://www.redhat.com/archives/libvir-list/2011-November/msg00243.html Aside from rebasing, this patch only drops the driver lock once (prior to the first time the function sleeps), then leaves it dropped until it returns (Eric's patch would drop and re-acquire the lock around each call to sleep). At the time Eric sent his patch, the response (from Dan Berrange) was that, while it wasn't a good thing to be holding the driver lock while sleeping, we really need to rethink locking wrt the driver object, switching to a finer-grained approach that locks individual items within the driver object separately to allow for greater concurrency. This is a good plan, and at the time it made sense to not apply the patch because there was no known bug related to the driver lock being held in this function. However, we now know that the length of the wait in qemuProcessKill is sometimes too short to allow the qemu process to fully flush its disk cache before SIGKILL is sent, so we need to lengthen the timeout (in order to improve the situation with management applications until they can be updated to use the new VIR_DOMAIN_DESTROY_GRACEFUL flag added in commit 72f8a7f19753506ed957b78ad800c0f3892c9304). But, if we lengthen the timeout, we also lengthen the amount of time that all other threads in libvirtd are essentially blocked from doing anything (since just about everything needs to acquire the driver lock, if only for long enough to get a pointer to a domain). The solution is to modify qemuProcessKill to drop the driver lock while sleeping, as proposed in Eric's patch. Then we can increase the timeout with a clear conscience, and thus at least lower the chances that someone running with existing management software will suffer the consequence's of qemu's disk cache not being flushed. In the meantime, we still should work on Dan's proposal to make locking within the driver object more fine grained. (NB: although I couldn't find any instance where qemuProcessKill() was called with no jobs active for the domain (or some other guarantee that the current thread had at least one refcount on the domain object), this patch still follows Eric's method of temporarily adding a ref prior to unlocking the domain object, because I couldn't convince myself 100% that this was the case.)
This commit is contained in:
parent
5452e88c32
commit
595e26c086
@ -1791,13 +1791,13 @@ qemuDomainDestroyFlags(virDomainPtr dom,
|
||||
* it now means the job will be released
|
||||
*/
|
||||
if (flags & VIR_DOMAIN_DESTROY_GRACEFUL) {
|
||||
if (qemuProcessKill(vm, 0) < 0) {
|
||||
if (qemuProcessKill(driver, vm, 0) < 0) {
|
||||
qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
|
||||
_("failed to kill qemu process with SIGTERM"));
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_FORCE));
|
||||
ignore_value(qemuProcessKill(driver, vm, VIR_QEMU_PROCESS_KILL_FORCE));
|
||||
}
|
||||
|
||||
/* We need to prevent monitor EOF callback from doing our work (and sending
|
||||
|
@ -586,8 +586,10 @@ endjob:
|
||||
|
||||
cleanup:
|
||||
if (vm) {
|
||||
if (ret == -1)
|
||||
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_FORCE));
|
||||
if (ret == -1) {
|
||||
ignore_value(qemuProcessKill(driver, vm,
|
||||
VIR_QEMU_PROCESS_KILL_FORCE));
|
||||
}
|
||||
if (virDomainObjUnref(vm) > 0)
|
||||
virDomainObjUnlock(vm);
|
||||
}
|
||||
@ -612,12 +614,13 @@ qemuProcessShutdownOrReboot(struct qemud_driver *driver,
|
||||
qemuProcessFakeReboot,
|
||||
vm) < 0) {
|
||||
VIR_ERROR(_("Failed to create reboot thread, killing domain"));
|
||||
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_NOWAIT));
|
||||
ignore_value(qemuProcessKill(driver, vm,
|
||||
VIR_QEMU_PROCESS_KILL_NOWAIT));
|
||||
/* Safe to ignore value since ref count was incremented above */
|
||||
ignore_value(virDomainObjUnref(vm));
|
||||
}
|
||||
} else {
|
||||
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_NOWAIT));
|
||||
ignore_value(qemuProcessKill(driver, vm, VIR_QEMU_PROCESS_KILL_NOWAIT));
|
||||
}
|
||||
}
|
||||
|
||||
@ -3532,10 +3535,13 @@ cleanup:
|
||||
}
|
||||
|
||||
|
||||
int qemuProcessKill(virDomainObjPtr vm, unsigned int flags)
|
||||
int
|
||||
qemuProcessKill(struct qemud_driver *driver,
|
||||
virDomainObjPtr vm, unsigned int flags)
|
||||
{
|
||||
int i;
|
||||
int i, ret = -1;
|
||||
const char *signame = "TERM";
|
||||
bool driver_unlocked = false;
|
||||
|
||||
VIR_DEBUG("vm=%s pid=%d flags=%x",
|
||||
vm->def->name, vm->pid, flags);
|
||||
@ -3579,18 +3585,44 @@ int qemuProcessKill(virDomainObjPtr vm, unsigned int flags)
|
||||
VIR_WARN("Failed to terminate process %d with SIG%s: %s",
|
||||
vm->pid, signame,
|
||||
virStrerror(errno, ebuf, sizeof ebuf));
|
||||
return -1;
|
||||
goto cleanup;
|
||||
}
|
||||
return 0; /* process is dead */
|
||||
ret = 0;
|
||||
goto cleanup; /* process is dead */
|
||||
}
|
||||
|
||||
if (i == 0 && (flags & VIR_QEMU_PROCESS_KILL_NOWAIT))
|
||||
return 0;
|
||||
if (i == 0 && (flags & VIR_QEMU_PROCESS_KILL_NOWAIT)) {
|
||||
ret = 0;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (driver && !driver_unlocked) {
|
||||
/* THREADS.txt says we can't hold the driver lock while sleeping */
|
||||
qemuDriverUnlock(driver);
|
||||
driver_unlocked = true;
|
||||
}
|
||||
|
||||
usleep(200 * 1000);
|
||||
}
|
||||
VIR_WARN("Timed out waiting after SIG%s to process %d", signame, vm->pid);
|
||||
return -1;
|
||||
cleanup:
|
||||
if (driver_unlocked) {
|
||||
/* We had unlocked the driver, so re-lock it. THREADS.txt says
|
||||
* we can't have the domain locked when locking the driver, so
|
||||
* we must first unlock the domain. BUT, before we can unlock
|
||||
* the domain, we need to add a ref to it in case there aren't
|
||||
* any active jobs (analysis of all callers didn't reveal such
|
||||
* a case, but there are too many to maintain certainty, so we
|
||||
* will do this as a precaution).
|
||||
*/
|
||||
virDomainObjRef(vm);
|
||||
virDomainObjUnlock(vm);
|
||||
qemuDriverLock(driver);
|
||||
virDomainObjLock(vm);
|
||||
/* Safe to ignore value since ref count was incremented above */
|
||||
ignore_value(virDomainObjUnref(vm));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@ -3679,7 +3711,7 @@ void qemuProcessStop(struct qemud_driver *driver,
|
||||
}
|
||||
|
||||
/* shut it off for sure */
|
||||
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_FORCE));
|
||||
ignore_value(qemuProcessKill(driver, vm, VIR_QEMU_PROCESS_KILL_FORCE));
|
||||
|
||||
/* Stop autodestroy in case guest is restarted */
|
||||
qemuProcessAutoDestroyRemove(driver, vm);
|
||||
|
@ -73,7 +73,8 @@ typedef enum {
|
||||
VIR_QEMU_PROCESS_KILL_NOWAIT = 1 << 1,
|
||||
} virQemuProcessKillMode;
|
||||
|
||||
int qemuProcessKill(virDomainObjPtr vm, unsigned int flags);
|
||||
int qemuProcessKill(struct qemud_driver *driver,
|
||||
virDomainObjPtr vm, unsigned int flags);
|
||||
|
||||
int qemuProcessAutoDestroyInit(struct qemud_driver *driver);
|
||||
void qemuProcessAutoDestroyRun(struct qemud_driver *driver,
|
||||
|
Loading…
x
Reference in New Issue
Block a user