qemu: drop driver lock while trying to terminate qemu process

This patch is based on an earlier patch by Eric Blake which was never
committed:

https://www.redhat.com/archives/libvir-list/2011-November/msg00243.html

Aside from rebasing, this patch only drops the driver lock once (prior
to the first time the function sleeps), then leaves it dropped until
it returns (Eric's patch would drop and re-acquire the lock around
each call to sleep).

At the time Eric sent his patch, the response (from Dan Berrange) was
that, while it wasn't a good thing to be holding the driver lock while
sleeping, we really need to rethink locking wrt the driver object,
switching to a finer-grained approach that locks individual items
within the driver object separately to allow for greater concurrency.

This is a good plan, and at the time it made sense to not apply the
patch because there was no known bug related to the driver lock being
held in this function.

However, we now know that the length of the wait in qemuProcessKill is
sometimes too short to allow the qemu process to fully flush its disk
cache before SIGKILL is sent, so we need to lengthen the timeout (in
order to improve the situation with management applications until they
can be updated to use the new VIR_DOMAIN_DESTROY_GRACEFUL flag added
in commit 72f8a7f19753506ed957b78ad800c0f3892c9304). But, if we
lengthen the timeout, we also lengthen the amount of time that all
other threads in libvirtd are essentially blocked from doing anything
(since just about everything needs to acquire the driver lock, if only
for long enough to get a pointer to a domain).

The solution is to modify qemuProcessKill to drop the driver lock
while sleeping, as proposed in Eric's patch. Then we can increase the
timeout with a clear conscience, and thus at least lower the chances
that someone running with existing management software will suffer the
consequence's of qemu's disk cache not being flushed.

In the meantime, we still should work on Dan's proposal to make
locking within the driver object more fine grained.

(NB: although I couldn't find any instance where qemuProcessKill() was
called with no jobs active for the domain (or some other guarantee
that the current thread had at least one refcount on the domain
object), this patch still follows Eric's method of temporarily adding
a ref prior to unlocking the domain object, because I couldn't
convince myself 100% that this was the case.)
This commit is contained in:
Laine Stump 2012-02-07 11:13:57 -05:00
parent 5452e88c32
commit 595e26c086
3 changed files with 48 additions and 15 deletions

View File

@ -1791,13 +1791,13 @@ qemuDomainDestroyFlags(virDomainPtr dom,
* it now means the job will be released * it now means the job will be released
*/ */
if (flags & VIR_DOMAIN_DESTROY_GRACEFUL) { if (flags & VIR_DOMAIN_DESTROY_GRACEFUL) {
if (qemuProcessKill(vm, 0) < 0) { if (qemuProcessKill(driver, vm, 0) < 0) {
qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("failed to kill qemu process with SIGTERM")); _("failed to kill qemu process with SIGTERM"));
goto cleanup; goto cleanup;
} }
} else { } else {
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_FORCE)); ignore_value(qemuProcessKill(driver, vm, VIR_QEMU_PROCESS_KILL_FORCE));
} }
/* We need to prevent monitor EOF callback from doing our work (and sending /* We need to prevent monitor EOF callback from doing our work (and sending

View File

@ -586,8 +586,10 @@ endjob:
cleanup: cleanup:
if (vm) { if (vm) {
if (ret == -1) if (ret == -1) {
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_FORCE)); ignore_value(qemuProcessKill(driver, vm,
VIR_QEMU_PROCESS_KILL_FORCE));
}
if (virDomainObjUnref(vm) > 0) if (virDomainObjUnref(vm) > 0)
virDomainObjUnlock(vm); virDomainObjUnlock(vm);
} }
@ -612,12 +614,13 @@ qemuProcessShutdownOrReboot(struct qemud_driver *driver,
qemuProcessFakeReboot, qemuProcessFakeReboot,
vm) < 0) { vm) < 0) {
VIR_ERROR(_("Failed to create reboot thread, killing domain")); VIR_ERROR(_("Failed to create reboot thread, killing domain"));
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_NOWAIT)); ignore_value(qemuProcessKill(driver, vm,
VIR_QEMU_PROCESS_KILL_NOWAIT));
/* Safe to ignore value since ref count was incremented above */ /* Safe to ignore value since ref count was incremented above */
ignore_value(virDomainObjUnref(vm)); ignore_value(virDomainObjUnref(vm));
} }
} else { } else {
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_NOWAIT)); ignore_value(qemuProcessKill(driver, vm, VIR_QEMU_PROCESS_KILL_NOWAIT));
} }
} }
@ -3532,10 +3535,13 @@ cleanup:
} }
int qemuProcessKill(virDomainObjPtr vm, unsigned int flags) int
qemuProcessKill(struct qemud_driver *driver,
virDomainObjPtr vm, unsigned int flags)
{ {
int i; int i, ret = -1;
const char *signame = "TERM"; const char *signame = "TERM";
bool driver_unlocked = false;
VIR_DEBUG("vm=%s pid=%d flags=%x", VIR_DEBUG("vm=%s pid=%d flags=%x",
vm->def->name, vm->pid, flags); vm->def->name, vm->pid, flags);
@ -3579,18 +3585,44 @@ int qemuProcessKill(virDomainObjPtr vm, unsigned int flags)
VIR_WARN("Failed to terminate process %d with SIG%s: %s", VIR_WARN("Failed to terminate process %d with SIG%s: %s",
vm->pid, signame, vm->pid, signame,
virStrerror(errno, ebuf, sizeof ebuf)); virStrerror(errno, ebuf, sizeof ebuf));
return -1; goto cleanup;
} }
return 0; /* process is dead */ ret = 0;
goto cleanup; /* process is dead */
} }
if (i == 0 && (flags & VIR_QEMU_PROCESS_KILL_NOWAIT)) if (i == 0 && (flags & VIR_QEMU_PROCESS_KILL_NOWAIT)) {
return 0; ret = 0;
goto cleanup;
}
if (driver && !driver_unlocked) {
/* THREADS.txt says we can't hold the driver lock while sleeping */
qemuDriverUnlock(driver);
driver_unlocked = true;
}
usleep(200 * 1000); usleep(200 * 1000);
} }
VIR_WARN("Timed out waiting after SIG%s to process %d", signame, vm->pid); VIR_WARN("Timed out waiting after SIG%s to process %d", signame, vm->pid);
return -1; cleanup:
if (driver_unlocked) {
/* We had unlocked the driver, so re-lock it. THREADS.txt says
* we can't have the domain locked when locking the driver, so
* we must first unlock the domain. BUT, before we can unlock
* the domain, we need to add a ref to it in case there aren't
* any active jobs (analysis of all callers didn't reveal such
* a case, but there are too many to maintain certainty, so we
* will do this as a precaution).
*/
virDomainObjRef(vm);
virDomainObjUnlock(vm);
qemuDriverLock(driver);
virDomainObjLock(vm);
/* Safe to ignore value since ref count was incremented above */
ignore_value(virDomainObjUnref(vm));
}
return ret;
} }
@ -3679,7 +3711,7 @@ void qemuProcessStop(struct qemud_driver *driver,
} }
/* shut it off for sure */ /* shut it off for sure */
ignore_value(qemuProcessKill(vm, VIR_QEMU_PROCESS_KILL_FORCE)); ignore_value(qemuProcessKill(driver, vm, VIR_QEMU_PROCESS_KILL_FORCE));
/* Stop autodestroy in case guest is restarted */ /* Stop autodestroy in case guest is restarted */
qemuProcessAutoDestroyRemove(driver, vm); qemuProcessAutoDestroyRemove(driver, vm);

View File

@ -73,7 +73,8 @@ typedef enum {
VIR_QEMU_PROCESS_KILL_NOWAIT = 1 << 1, VIR_QEMU_PROCESS_KILL_NOWAIT = 1 << 1,
} virQemuProcessKillMode; } virQemuProcessKillMode;
int qemuProcessKill(virDomainObjPtr vm, unsigned int flags); int qemuProcessKill(struct qemud_driver *driver,
virDomainObjPtr vm, unsigned int flags);
int qemuProcessAutoDestroyInit(struct qemud_driver *driver); int qemuProcessAutoDestroyInit(struct qemud_driver *driver);
void qemuProcessAutoDestroyRun(struct qemud_driver *driver, void qemuProcessAutoDestroyRun(struct qemud_driver *driver,