/* * Copyright IBM Corp. 2008 * Copyright Red Hat 2008 * * lxc_container.c: file description * * Authors: * David L. Leskovec * Daniel P. Berrange * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include /* Yes, we want linux private one, for _syscall2() macro */ #include /* For MS_MOVE */ #include #include "virterror_internal.h" #include "logging.h" #include "lxc_container.h" #include "util.h" #include "memory.h" #include "veth.h" #define VIR_FROM_THIS VIR_FROM_LXC /* * GLibc headers are behind the kernel, so we define these * constants if they're not present already. */ #ifndef CLONE_NEWPID #define CLONE_NEWPID 0x20000000 #endif #ifndef CLONE_NEWUTS #define CLONE_NEWUTS 0x04000000 #endif #ifndef CLONE_NEWUSER #define CLONE_NEWUSER 0x10000000 #endif #ifndef CLONE_NEWIPC #define CLONE_NEWIPC 0x08000000 #endif #ifndef CLONE_NEWNET #define CLONE_NEWNET 0x40000000 /* New network namespace */ #endif /* messages between parent and container */ typedef char lxc_message_t; #define LXC_CONTINUE_MSG 'c' typedef struct __lxc_child_argv lxc_child_argv_t; struct __lxc_child_argv { virDomainDefPtr config; unsigned int nveths; char **veths; int monitor; char *ttyPath; }; /** * lxcContainerExecInit: * @vmDef: Ptr to vm definition structure * * Exec the container init string. The container init will replace then * be running in the current process * * Does not return */ static int lxcContainerExecInit(virDomainDefPtr vmDef) { const char *const argv[] = { vmDef->os.init, NULL, }; return execve(argv[0], (char **)argv, NULL); } /** * lxcContainerSetStdio: * @control: the conrol FD * @ttyPath: Name of tty to set as the container console * * Sets the given tty as the primary conosole for the container as well as * stdout, stdin and stderr. * * Returns 0 on success or -1 in case of error */ static int lxcContainerSetStdio(int control, int ttyfd) { int rc = -1; int open_max, i; if (setsid() < 0) { virReportSystemError(NULL, errno, "%s", _("setsid failed")); goto cleanup; } if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) { virReportSystemError(NULL, errno, "%s", _("ioctl(TIOCSTTY) failed")); goto cleanup; } /* Just in case someone forget to set FD_CLOEXEC, explicitly * close all FDs before executing the container */ open_max = sysconf (_SC_OPEN_MAX); for (i = 0; i < open_max; i++) if (i != ttyfd && i != control) close(i); if (dup2(ttyfd, 0) < 0) { virReportSystemError(NULL, errno, "%s", _("dup2(stdin) failed")); goto cleanup; } if (dup2(ttyfd, 1) < 0) { virReportSystemError(NULL, errno, "%s", _("dup2(stdout) failed")); goto cleanup; } if (dup2(ttyfd, 2) < 0) { virReportSystemError(NULL, errno, "%s", _("dup2(stderr) failed")); goto cleanup; } rc = 0; cleanup: return rc; } /** * lxcContainerSendContinue: * @monitor: control FD to child * * Sends the continue message via the socket pair stored in the vm * structure. * * Returns 0 on success or -1 in case of error */ int lxcContainerSendContinue(int control) { int rc = -1; lxc_message_t msg = LXC_CONTINUE_MSG; int writeCount = 0; writeCount = safewrite(control, &msg, sizeof(msg)); if (writeCount != sizeof(msg)) { virReportSystemError(NULL, errno, "%s", _("unable to send container continue message")); goto error_out; } rc = 0; error_out: return rc; } /** * lxcContainerWaitForContinue: * @control: control FD from parent * * This function will wait for the container continue message from the * parent process. It will send this message on the socket pair stored in * the vm structure once it has completed the post clone container setup. * * Returns 0 on success or -1 in case of error */ static int lxcContainerWaitForContinue(int control) { lxc_message_t msg; int readLen; readLen = saferead(control, &msg, sizeof(msg)); if (readLen != sizeof(msg) || msg != LXC_CONTINUE_MSG) { virReportSystemError(NULL, errno, "%s", _("Failed to read the container continue message")); return -1; } close(control); DEBUG0("Received container continue message"); return 0; } /** * lxcEnableInterfaces: * @vm: Pointer to vm structure * * This function will enable the interfaces for this container. * * Returns 0 on success or nonzero in case of error */ static int lxcContainerEnableInterfaces(unsigned int nveths, char **veths) { int rc = 0; unsigned int i; for (i = 0 ; i < nveths ; i++) { DEBUG("Enabling %s", veths[i]); rc = vethInterfaceUpOrDown(veths[i], 1); if (0 != rc) { goto error_out; } } /* enable lo device only if there were other net devices */ if (veths) rc = vethInterfaceUpOrDown("lo", 1); error_out: return rc; } //_syscall2(int, pivot_root, char *, newroot, const char *, oldroot) extern int pivot_root(const char * new_root,const char * put_old); static int lxcContainerChildMountSort(const void *a, const void *b) { const char **sa = (const char**)a; const char **sb = (const char**)b; /* Delibrately reversed args - we need to unmount deepest children first */ return strcmp(*sb, *sa); } #ifndef MS_REC #define MS_REC 16384 #endif #ifndef MNT_DETACH #define MNT_DETACH 0x00000002 #endif #ifndef MS_PRIVATE #define MS_PRIVATE (1<<18) #endif #ifndef MS_SLAVE #define MS_SLAVE (1<<19) #endif static int lxcContainerPivotRoot(virDomainFSDefPtr root) { int rc, ret; char *oldroot = NULL, *newroot = NULL; ret = -1; /* root->parent must be private, so make / private. */ if (mount("", "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0) { virReportSystemError(NULL, errno, "%s", _("failed to make root private")); goto err; } if (virAsprintf(&oldroot, "%s/.oldroot", root->src) < 0) { virReportOOMError(NULL); goto err; } if ((rc = virFileMakePath(oldroot)) < 0) { virReportSystemError(NULL, rc, _("failed to create %s"), oldroot); goto err; } /* Create a tmpfs root since old and new roots must be * on separate filesystems */ if (mount("", oldroot, "tmpfs", 0, NULL) < 0) { virReportSystemError(NULL, errno, _("failed to mount empty tmpfs at %s"), oldroot); goto err; } /* Create a directory called 'new' in tmpfs */ if (virAsprintf(&newroot, "%s/new", oldroot) < 0) { virReportOOMError(NULL); goto err; } if ((rc = virFileMakePath(newroot)) < 0) { virReportSystemError(NULL, rc, _("failed to create %s"), newroot); goto err; } /* ... and mount our root onto it */ if (mount(root->src, newroot, NULL, MS_BIND|MS_REC, NULL) < 0) { virReportSystemError(NULL, errno, _("failed to bind new root %s into tmpfs"), root->src); goto err; } /* Now we chroot into the tmpfs, then pivot into the * root->src bind-mounted onto '/new' */ if (chroot(oldroot) < 0) { virReportSystemError(NULL, errno, "%s", _("failed to chroot into tmpfs")); goto err; } if (chdir("/new") < 0) { virReportSystemError(NULL, errno, "%s", _("failed to chdir into /new on tmpfs")); goto err; } /* The old root directory will live at /.oldroot after * this and will soon be unmounted completely */ if (pivot_root(".", ".oldroot") < 0) { virReportSystemError(NULL, errno, "%s", _("failed to pivot root")); goto err; } /* CWD is undefined after pivot_root, so go to / */ if (chdir("/") < 0) goto err; if (umount2(".oldroot", MNT_DETACH) < 0) { virReportSystemError(NULL, errno, "%s", _("failed to lazily unmount old root")); goto err; } ret = 0; err: VIR_FREE(oldroot); VIR_FREE(newroot); return ret; } static int lxcContainerPopulateDevices(void) { int i; int rc; const struct { int maj; int min; mode_t mode; const char *path; } devs[] = { { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/dev/null" }, { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/dev/zero" }, { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/dev/full" }, { LXC_DEV_MAJ_TTY, LXC_DEV_MIN_CONSOLE, 0600, "/dev/console" }, { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/dev/random" }, { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/dev/urandom" }, }; if ((rc = virFileMakePath("/dev")) < 0) { virReportSystemError(NULL, rc, "%s", _("cannot create /dev/")); return -1; } if (mount("none", "/dev", "tmpfs", 0, NULL) < 0) { virReportSystemError(NULL, errno, "%s", _("failed to mount /dev tmpfs")); return -1; } /* Move old devpts into container, since we have to connect to the master ptmx which was opened in the parent. XXX This sucks, we need to figure out how to get our own private devpts for isolation */ if ((rc = virFileMakePath("/dev/pts") < 0)) { virReportSystemError(NULL, rc, "%s", _("cannot create /dev/pts")); return -1; } if (mount("devpts", "/dev/pts", "devpts", 0, NULL) < 0) { virReportSystemError(NULL, errno, "%s", _("failed to mount /dev/pts in container")); return -1; } /* Populate /dev/ with a few important bits */ for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { dev_t dev = makedev(devs[i].maj, devs[i].min); if (mknod(devs[i].path, 0, dev) < 0 || chmod(devs[i].path, devs[i].mode)) { virReportSystemError(NULL, errno, _("failed to make device %s"), devs[i].path); return -1; } } return 0; } static int lxcContainerMountNewFS(virDomainDefPtr vmDef) { int i; /* Pull in rest of container's mounts */ for (i = 0 ; i < vmDef->nfss ; i++) { char *src; if (STREQ(vmDef->fss[i]->dst, "/")) continue; // XXX fix if (vmDef->fss[i]->type != VIR_DOMAIN_FS_TYPE_MOUNT) continue; if (virAsprintf(&src, "/.oldroot/%s", vmDef->fss[i]->src) < 0) { virReportOOMError(NULL); return -1; } if (virFileMakePath(vmDef->fss[i]->dst) < 0) { virReportSystemError(NULL, errno, _("failed to create %s"), vmDef->fss[i]->dst); VIR_FREE(src); return -1; } if (mount(src, vmDef->fss[i]->dst, NULL, MS_BIND, NULL) < 0) { VIR_FREE(src); virReportSystemError(NULL, errno, _("failed to mount %s at %s"), vmDef->fss[i]->src, vmDef->fss[i]->dst); return -1; } VIR_FREE(src); } return 0; } static int lxcContainerUnmountOldFS(void) { struct mntent mntent; char **mounts = NULL; int nmounts = 0; FILE *procmnt; int i; char mntbuf[1024]; if (!(procmnt = setmntent("/proc/mounts", "r"))) { virReportSystemError(NULL, errno, "%s", _("failed to read /proc/mounts")); return -1; } while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) { if (!STRPREFIX(mntent.mnt_dir, "/.oldroot")) continue; if (VIR_REALLOC_N(mounts, nmounts+1) < 0) { endmntent(procmnt); virReportOOMError(NULL); return -1; } if (!(mounts[nmounts++] = strdup(mntent.mnt_dir))) { endmntent(procmnt); virReportOOMError(NULL); return -1; } } endmntent(procmnt); qsort(mounts, nmounts, sizeof(mounts[0]), lxcContainerChildMountSort); for (i = 0 ; i < nmounts ; i++) { if (umount(mounts[i]) < 0) { virReportSystemError(NULL, errno, _("failed to unmount '%s'"), mounts[i]); return -1; } VIR_FREE(mounts[i]); } VIR_FREE(mounts); return 0; } /* Got a FS mapped to /, we're going the pivot_root * approach to do a better-chroot-than-chroot * this is based on this thread http://lkml.org/lkml/2008/3/5/29 */ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, virDomainFSDefPtr root) { if (lxcContainerPivotRoot(root) < 0) return -1; if (virFileMakePath("/proc") < 0 || mount("none", "/proc", "proc", 0, NULL) < 0) { virReportSystemError(NULL, errno, "%s", _("failed to mount /proc")); return -1; } if (lxcContainerPopulateDevices() < 0) return -1; if (lxcContainerMountNewFS(vmDef) < 0) return -1; if (lxcContainerUnmountOldFS() < 0) return -1; return 0; } /* Nothing mapped to /, we're using the main root, but with extra stuff mapped in */ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef) { int i; if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { virReportSystemError(NULL, errno, "%s", _("failed to make / slave")); return -1; } for (i = 0 ; i < vmDef->nfss ; i++) { // XXX fix to support other mount types if (vmDef->fss[i]->type != VIR_DOMAIN_FS_TYPE_MOUNT) continue; if (mount(vmDef->fss[i]->src, vmDef->fss[i]->dst, NULL, MS_BIND, NULL) < 0) { virReportSystemError(NULL, errno, _("failed to mount %s at %s"), vmDef->fss[i]->src, vmDef->fss[i]->dst); return -1; } } /* mount /proc */ if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) { virReportSystemError(NULL, errno, "%s", _("failed to mount /proc")); return -1; } return 0; } static int lxcContainerSetupMounts(virDomainDefPtr vmDef) { int i; virDomainFSDefPtr root = NULL; for (i = 0 ; i < vmDef->nfss ; i++) { if (vmDef->fss[i]->type != VIR_DOMAIN_FS_TYPE_MOUNT) continue; if (STREQ(vmDef->fss[i]->dst, "/")) root = vmDef->fss[i]; } if (root) return lxcContainerSetupPivotRoot(vmDef, root); else return lxcContainerSetupExtraMounts(vmDef); } /** * lxcChild: * @argv: Pointer to container arguments * * This function is run in the process clone()'d in lxcStartContainer. * Perform a number of container setup tasks: * Setup container file system * mount container /proca * Then exec's the container init * * Returns 0 on success or -1 in case of error */ static int lxcContainerChild( void *data ) { lxc_child_argv_t *argv = data; virDomainDefPtr vmDef = argv->config; int ttyfd; if (NULL == vmDef) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, "%s", _("lxcChild() passed invalid vm definition")); return -1; } if (lxcContainerSetupMounts(vmDef) < 0) return -1; ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY); if (ttyfd < 0) { virReportSystemError(NULL, errno, _("failed to open %s"), argv->ttyPath); return -1; } if (lxcContainerSetStdio(argv->monitor, ttyfd) < 0) { close(ttyfd); return -1; } close(ttyfd); /* Wait for interface devices to show up */ if (lxcContainerWaitForContinue(argv->monitor) < 0) return -1; /* enable interfaces */ if (lxcContainerEnableInterfaces(argv->nveths, argv->veths) < 0) return -1; /* this function will only return if an error occured */ return lxcContainerExecInit(vmDef); } static int userns_supported(void) { return lxcContainerAvailable(LXC_CONTAINER_FEATURE_USER) == 0; } /** * lxcContainerStart: * @driver: pointer to driver structure * @vm: pointer to virtual machine structure * * Starts a container process by calling clone() with the namespace flags * * Returns PID of container on success or -1 in case of error */ int lxcContainerStart(virDomainDefPtr def, unsigned int nveths, char **veths, int control, char *ttyPath) { pid_t pid; int flags; int stacksize = getpagesize() * 4; char *stack, *stacktop; lxc_child_argv_t args = { def, nveths, veths, control, ttyPath }; /* allocate a stack for the container */ if (VIR_ALLOC_N(stack, stacksize) < 0) { virReportOOMError(NULL); return -1; } stacktop = stack + stacksize; flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC|SIGCHLD; if (userns_supported()) flags |= CLONE_NEWUSER; if (def->nets != NULL) flags |= CLONE_NEWNET; pid = clone(lxcContainerChild, stacktop, flags, &args); VIR_FREE(stack); DEBUG("clone() returned, %d", pid); if (pid < 0) { virReportSystemError(NULL, errno, "%s", _("failed to run clone container")); return -1; } return pid; } static int lxcContainerDummyChild(void *argv ATTRIBUTE_UNUSED) { _exit(0); } int lxcContainerAvailable(int features) { int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS| CLONE_NEWIPC|SIGCHLD; int cpid; char *childStack; char *stack; int childStatus; if (features & LXC_CONTAINER_FEATURE_USER) flags |= CLONE_NEWUSER; if (features & LXC_CONTAINER_FEATURE_NET) flags |= CLONE_NEWNET; if (VIR_ALLOC_N(stack, getpagesize() * 4) < 0) { DEBUG0("Unable to allocate stack"); return -1; } childStack = stack + (getpagesize() * 4); cpid = clone(lxcContainerDummyChild, childStack, flags, NULL); VIR_FREE(stack); if (cpid < 0) { char ebuf[1024]; DEBUG("clone call returned %s, container support is not enabled", virStrerror(errno, ebuf, sizeof ebuf)); return -1; } else { waitpid(cpid, &childStatus, 0); } return 0; }