From 39dcf00e72d51e3022f1c7a2bc3e40e413d74f89 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 20 May 2013 20:30:30 -0600 Subject: [PATCH] cgroup: be robust against cgroup movement races https://bugzilla.redhat.com/show_bug.cgi?id=965169 documents a problem starting domains when cgroups are enabled; I was able to reliably reproduce the race about 5% of the time when I added hooks to domain startup by 3 seconds (as that seemed to be about the length of time that qemu created and then closed a temporary thread, probably related to aio handling of initially opening a disk image). The problem has existed since we introduced virCgroupMoveTask in commit 9102829 (v0.10.0). There are some inherent TOCTTOU races when moving tasks between kernel cgroups, precisely because threads can be created or completed in the window between when we read a thread id from the source and when we write to the destination. As the goal of virCgroupMoveTask is merely to move ALL tasks into the new cgroup, it is sufficient to iterate until no more threads are being created in the old group, and ignoring any threads that die before we can move them. It would be nicer to start the threads in the right cgroup to begin with, but by default, all child threads are created in the same cgroup as their parent, and we don't want vcpu child threads in the emulator cgroup, so I don't see any good way of avoiding the move. It would also be nice if the kernel were to implement something like rename() as a way to atomically move a group of threads from one cgroup to another, instead of forcing a window where we have to read and parse the source, then format and write back into the destination. * src/util/vircgroup.c (virCgroupAddTaskStrController): Ignore ESRCH, because a thread ended between read and write attempts. (virCgroupMoveTask): Loop until all threads have moved. Signed-off-by: Eric Blake (cherry picked from commit 83e4c77547f5b721afad19a452f41c31daeee8c5) Conflicts: src/util/cgroup.c - refactoring in commit 56f27b3bb is too big to take in entirety; but I did inline its changes to the cleanup label --- src/util/cgroup.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/util/cgroup.c b/src/util/cgroup.c index baf40483cb..b1280c0035 100644 --- a/src/util/cgroup.c +++ b/src/util/cgroup.c @@ -844,7 +844,11 @@ static int virCgroupAddTaskStrController(virCgroupPtr group, goto cleanup; rc = virCgroupAddTaskController(group, p, controller); - if (rc != 0) + /* A thread that exits between when we first read the source + * tasks and now is not fatal. */ + if (rc == -ESRCH) + rc = 0; + else if (rc != 0) goto cleanup; next = strchr(cur, '\n'); @@ -873,7 +877,7 @@ cleanup: int virCgroupMoveTask(virCgroupPtr src_group, virCgroupPtr dest_group, int controller) { - int rc = 0, err = 0; + int rc = 0; char *content = NULL; if (controller < VIR_CGROUP_CONTROLLER_CPU || @@ -885,28 +889,25 @@ int virCgroupMoveTask(virCgroupPtr src_group, virCgroupPtr dest_group, return -EINVAL; } - rc = virCgroupGetValueStr(src_group, controller, "tasks", &content); - if (rc != 0) - return rc; + /* New threads are created in the same group as their parent; + * but if a thread is created after we first read we aren't + * aware that it needs to move. Therefore, we must iterate + * until content is empty. */ + while (1) { + rc = virCgroupGetValueStr(src_group, controller, "tasks", &content); + if (rc != 0) + return rc; - rc = virCgroupAddTaskStrController(dest_group, content, controller); - if (rc != 0) - goto cleanup; + rc = virCgroupAddTaskStrController(dest_group, content, controller); + if (rc != 0) + goto cleanup; - VIR_FREE(content); + VIR_FREE(content); + } return 0; cleanup: - /* - * We don't need to recover dest_cgroup because cgroup will make sure - * that one task only resides in one cgroup of the same controller. - */ - err = virCgroupAddTaskStrController(src_group, content, controller); - if (err != 0) - VIR_ERROR(_("Cannot recover cgroup %s from %s"), - src_group->controllers[controller].mountPoint, - dest_group->controllers[controller].mountPoint); VIR_FREE(content); return rc;