libvirt/src/lxc/lxc_container.c
Michal Privoznik 11b767d110 lxc_container: Increase stack size for lxcContainerChild()
When spawning a new container (via clone()) we allocate stack for
lxcContainerChild(). So far, we allocate 4 pages for the stack
and this used to be enough until we started rewriting everything
to glib. With glib we switched to g_strerror() which localizes
errno strings and thus increases stack usage, while the
previously used strerror_r() was more compact.

Fortunately, the solution is easy - just increase how much stack
the child can use (16 pages ought to be enough for anybody).

And while at it, let's use mmap() for the allocation, which offers some
nice features:

MAP_STACK - align allocation to be suitable for stack (even
            though, currently ignored on Linux),
MAP_GROWSDOWN - kernel guards out of bounds access from child

Resolves: https://gitlab.com/libvirt/libvirt/-/issues/511
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Reviewed-by: Martin Kletzander <mkletzan@redhat.com>
2023-08-08 07:37:34 +02:00

2242 lines
68 KiB
C

/*
* Copyright (C) 2008-2016 Red Hat, Inc.
* Copyright (C) 2008 IBM Corp.
* Copyright (c) 2015 SUSE LINUX Products GmbH, Nuernberg, Germany.
*
* lxc_container.c: Performs container setup tasks
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see
* <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <mntent.h>
#include <sys/reboot.h>
#include <linux/reboot.h>
/* Yes, we want linux private one, for _syscall2() macro */
#include <linux/unistd.h>
#if WITH_CAPNG
# include <cap-ng.h>
#endif
#if WITH_BLKID
# include <blkid.h>
#endif
#if WITH_SELINUX
# include <selinux/selinux.h>
#endif
#include "virerror.h"
#include "virlog.h"
#include "lxc_container.h"
#include "viralloc.h"
#include "viruuid.h"
#include "virfile.h"
#include "vircommand.h"
#include "virnetdevip.h"
#include "virprocess.h"
#include "virstring.h"
#include "virutil.h"
#define VIR_FROM_THIS VIR_FROM_LXC
VIR_LOG_INIT("lxc.lxc_container");
/* messages between parent and container */
/* A message is a single char; lxcContainerSendContinue() writes one and
 * lxcContainerWaitForContinue() reads one. */
typedef char lxc_message_t;
/* The only message value defined in this part of the file */
#define LXC_CONTINUE_MSG 'c'
typedef struct __lxc_child_argv lxc_child_argv_t;
/*
 * Bundle of values grouped into a single structure.
 *
 * NOTE(review): the consumer of this structure is not visible in this part
 * of the file; presumably it is handed to the cloned container entry point
 * as its sole argument - confirm. The n<name> members look like element
 * counts for the array member of the same name - confirm against the code
 * that fills them in.
 */
struct __lxc_child_argv {
virDomainDef *config;
virSecurityManager *securityDriver;
size_t nveths;
char **veths;
int monitor;
size_t npassFDs;
int *passFDs;
size_t nttyPaths;
char **ttyPaths;
int handshakefd;
int *nsInheritFDs;
};
/* Forward declaration: lxcContainerPrepareRoot() calls this function
 * before its definition appears further down in the file. */
static int lxcContainerMountFSBlock(virDomainFSDef *fs,
const char *srcprefix,
const char *sec_mount_options);
/*
 * lxcContainerRebootChild:
 * @argv: pointer to an int holding the reboot() command to issue
 *
 * Entry point of the probe child cloned by lxcContainerHasReboot().
 * Inside a child pid namespace reboot(LINUX_REBOOT_CMD_CAD_ON) fails
 * with EINVAL when container reboot support exists; otherwise it either
 * succeeds or fails with EPERM.
 *
 * Never returns: exits with status 1 when reboot() failed with EINVAL,
 * and with status 0 in every other case.
 */
G_GNUC_NORETURN static int
lxcContainerRebootChild(void *argv)
{
    int *rebootCmd = argv;
    int exitCode = 0;

    if (reboot(*rebootCmd) == -1 && errno == EINVAL)
        exitCode = 1;

    _exit(exitCode);
}
static
int lxcContainerHasReboot(void)
{
int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|
CLONE_NEWIPC|SIGCHLD;
int cpid;
char *childStack;
g_autofree char *stack = NULL;
g_autofree char *buf = NULL;
int cmd, v;
int status;
char *tmp;
int stacksize = getpagesize() * 4;
if (virFileReadAll("/proc/sys/kernel/ctrl-alt-del", 10, &buf) < 0)
return -1;
if ((tmp = strchr(buf, '\n')))
*tmp = '\0';
if (virStrToLong_i(buf, NULL, 10, &v) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Malformed ctrl-alt-del setting '%1$s'"), buf);
return -1;
}
cmd = v ? LINUX_REBOOT_CMD_CAD_ON : LINUX_REBOOT_CMD_CAD_OFF;
stack = g_new0(char, stacksize);
childStack = stack + stacksize;
cpid = clone(lxcContainerRebootChild, childStack, flags, &cmd);
if (cpid < 0) {
virReportSystemError(errno, "%s",
_("Unable to clone to check reboot support"));
return -1;
} else if (virProcessWait(cpid, &status, false) < 0) {
return -1;
}
if (status != 1) {
VIR_DEBUG("Containerized reboot support is missing "
"(kernel probably too old < 3.4)");
return 0;
}
VIR_DEBUG("Containerized reboot support is available");
return 1;
}
/**
 * lxcContainerBuildInitCmd:
 * @vmDef: pointer to vm definition structure
 * @ttyPaths: paths of the container TTY devices; each entry from index 1
 *            onwards must start with "/dev/"
 * @nttyPaths: number of entries in @ttyPaths
 *
 * Build a virCommand for launching the container 'init' process. The
 * command runs vmDef->os.init with the optional init arguments, a fixed
 * base environment, the container UUID/name, the optional kernel command
 * line and working directory, and any user supplied init environment.
 *
 * Returns a virCommand *, or NULL if a TTY path is not under /dev
 */
static virCommand *lxcContainerBuildInitCmd(virDomainDef *vmDef,
                                            char **ttyPaths,
                                            size_t nttyPaths)
{
    char uuidstr[VIR_UUID_STRING_BUFLEN];
    virCommand *cmd;
    g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
    size_t i;

    /* 'container_ptys' must exclude the PTY associated with
     * the /dev/console device, hence start at 1 not 0
     */
    for (i = 1; i < nttyPaths; i++) {
        if (!STRPREFIX(ttyPaths[i], "/dev/")) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Expected a /dev path for '%1$s'"),
                           ttyPaths[i]);
            return NULL;
        }
        /* Strip the leading "/dev/" (5 chars) and separate names by spaces */
        virBufferAdd(&buf, ttyPaths[i] + 5, -1);
        virBufferAddChar(&buf, ' ');
    }
    /* Drop the trailing separator */
    virBufferTrimLen(&buf, 1);

    virUUIDFormat(vmDef->uuid, uuidstr);

    cmd = virCommandNew(vmDef->os.init);

    if (vmDef->os.initargv && vmDef->os.initargv[0])
        virCommandAddArgSet(cmd, (const char **)vmDef->os.initargv);

    virCommandAddEnvString(cmd, "PATH=/bin:/sbin");
    virCommandAddEnvString(cmd, "TERM=linux");
    virCommandAddEnvString(cmd, "container=lxc-libvirt");
    virCommandAddEnvString(cmd, "HOME=/");
    virCommandAddEnvPair(cmd, "container_uuid", uuidstr);
    if (nttyPaths > 1)
        virCommandAddEnvPair(cmd, "container_ttys", virBufferCurrentContent(&buf));
    virCommandAddEnvPair(cmd, "LIBVIRT_LXC_UUID", uuidstr);
    virCommandAddEnvPair(cmd, "LIBVIRT_LXC_NAME", vmDef->name);
    if (vmDef->os.cmdline)
        virCommandAddEnvPair(cmd, "LIBVIRT_LXC_CMDLINE", vmDef->os.cmdline);
    if (vmDef->os.initdir)
        virCommandSetWorkingDirectory(cmd, vmDef->os.initdir);

    /* initenv is a NULL terminated array; tolerate it being absent
     * altogether, in the same way initargv is handled above */
    for (i = 0; vmDef->os.initenv && vmDef->os.initenv[i]; i++) {
        virCommandAddEnvPair(cmd, vmDef->os.initenv[i]->name,
                             vmDef->os.initenv[i]->value);
    }

    return cmd;
}
/**
 * lxcContainerSetupFDs:
 * @ttyfd: pointer to the FD of the tty to use as the container console;
 *         the FD is closed once it has been duplicated
 * @npassFDs: number of extra FDs
 * @passFDs: list of extra FDs; entries are rewritten/closed as they are
 *           renumbered
 *
 * Arrange the file descriptor table for the container init process.
 * @ttyfd becomes stdin, stdout and stderr. The FDs in @passFDs are then
 * dup()'d so that they are numbered consecutively from stderr+1, and
 * every FD above that range, up to sysconf(_SC_OPEN_MAX), is closed.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetupFDs(int *ttyfd,
                                size_t npassFDs, int *passFDs)
{
    int result = -1;
    int maxOpen;
    int lastKept;
    int curfd;
    size_t idx;
    size_t later;

    VIR_DEBUG("Logging from the container init will now cease "
              "as the FDs are about to be closed for exec of "
              "the container init process");

    if (dup2(*ttyfd, STDIN_FILENO) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stdin) failed"));
        goto cleanup;
    }

    if (dup2(*ttyfd, STDOUT_FILENO) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stdout) failed"));
        goto cleanup;
    }

    if (dup2(*ttyfd, STDERR_FILENO) < 0) {
        virReportSystemError(errno, "%s",
                             _("dup2(stderr) failed"));
        goto cleanup;
    }

    VIR_FORCE_CLOSE(*ttyfd);

    /* Renumber the passed FDs so that they sit, gap free, directly
     * above STDERR_FILENO */
    for (idx = 0; idx < npassFDs; idx++) {
        int wantfd;

        wantfd = STDERR_FILENO + idx + 1;

        VIR_DEBUG("Pass %d onto %d", passFDs[idx], wantfd);

        /* Already in the right slot, nothing to renumber */
        if (passFDs[idx] == wantfd)
            continue;

        /* A later entry may currently own the slot we want; relocate
         * any such entry to a fresh FD number first */
        for (later = idx + 1; later < npassFDs; later++) {
            int movedfd;

            if (passFDs[later] != wantfd)
                continue;

            movedfd = dup(passFDs[later]);
            VIR_DEBUG("Clash %zu", later);
            if (movedfd < 0) {
                virReportSystemError(errno,
                                     _("Cannot move fd %1$d out of the way"),
                                     passFDs[later]);
                goto cleanup;
            }

            /* The old number is deliberately left open so that no
             * later iteration can grab it again; the dup2() below
             * closes it anyway */
            VIR_DEBUG("Moved clash onto %d", movedfd);
            passFDs[later] = movedfd;
        }

        /* The slot is free now, so claim it */
        if (dup2(passFDs[idx], wantfd) < 0) {
            virReportSystemError(errno,
                                 _("Cannot duplicate fd %1$d onto fd %2$d"),
                                 passFDs[idx], wantfd);
            goto cleanup;
        }
        VIR_FORCE_CLOSE(passFDs[idx]);
    }

    lastKept = STDERR_FILENO + npassFDs;

    /* Guard against FDs that were never marked FD_CLOEXEC: close
     * everything above the range we just set up */
    maxOpen = sysconf(_SC_OPEN_MAX);
    if (maxOpen < 0) {
        virReportSystemError(errno, "%s",
                             _("sysconf(_SC_OPEN_MAX) failed"));
        goto cleanup;
    }

    for (curfd = lastKept + 1; curfd < maxOpen; curfd++) {
        int tmpfd = curfd;
        VIR_MASS_CLOSE(tmpfd);
    }

    result = 0;

 cleanup:
    VIR_DEBUG("rc=%d", result);
    return result;
}
/**
 * lxcContainerSendContinue:
 * @control: control FD to write the message to
 *
 * Writes a single LXC_CONTINUE_MSG to @control.
 *
 * Returns 0 on success or -1 if the message could not be written in full
 */
int lxcContainerSendContinue(int control)
{
    lxc_message_t msg = LXC_CONTINUE_MSG;
    int written;

    VIR_DEBUG("Send continue on fd %d", control);

    written = safewrite(control, &msg, sizeof(msg));

    return written == sizeof(msg) ? 0 : -1;
}
/**
 * lxcContainerWaitForContinue:
 * @control: control FD to read the message from
 *
 * Reads one message from @control and checks that it is
 * LXC_CONTINUE_MSG.
 *
 * Returns 0 on success or -1 in case of error. On a short read or EOF
 * errno is set to EIO; on an unexpected message value it is set to
 * EINVAL; on a read error errno is left as saferead() set it.
 */
int lxcContainerWaitForContinue(int control)
{
    lxc_message_t received;
    int nread;

    VIR_DEBUG("Wait continue on fd %d", control);

    nread = saferead(control, &received, sizeof(received));
    VIR_DEBUG("Got continue on fd %d %d", control, nread);

    if (nread == sizeof(received)) {
        if (received == LXC_CONTINUE_MSG)
            return 0;
        errno = EINVAL;
    } else if (nread >= 0) {
        errno = EIO;
    }

    return -1;
}
/**
 * lxcContainerSetID:
 * @def: domain definition whose idmap decides whether anything is done
 *
 * Switches to UID 0 / GID 0 when the definition has UID mappings, i.e.
 * when a user namespace is enabled for this container. Without UID
 * mappings the function is a no-op.
 *
 * Returns 0 on success or -1 in case of error
 */
static int lxcContainerSetID(virDomainDef *def)
{
    if (def->idmap.nuidmap) {
        VIR_DEBUG("Setting UID/GID to 0/0");

        if (virSetUIDGID(0, 0, NULL, 0) < 0) {
            virReportSystemError(errno, "%s",
                                 _("setuid or setgid failed"));
            return -1;
        }
    }

    return 0;
}
/*
 * lxcContainerGetNetDef:
 * @vmDef: domain definition to search
 * @devName: interface name to look for (may be NULL)
 *
 * Returns the first network definition whose ifname_guest_actual
 * equals @devName, or NULL when there is none.
 */
static virDomainNetDef *
lxcContainerGetNetDef(virDomainDef *vmDef, const char *devName)
{
    size_t idx = 0;

    while (idx < vmDef->nnets) {
        virDomainNetDef *candidate = vmDef->nets[idx++];

        if (STREQ_NULLABLE(candidate->ifname_guest_actual, devName))
            return candidate;
    }

    return NULL;
}
/**
 * lxcContainerRenameAndEnableInterfaces:
 * @vmDef: domain definition holding the network definitions
 * @nveths: number of interfaces
 * @veths: interface names
 *
 * For every name in @veths, look up its network definition, rename the
 * device to the configured guest-side name, bring it online when IP
 * addresses are configured or the link state is "up", and apply the
 * configured IP addresses and routes. Finally the "lo" device is
 * brought online if @veths is non-NULL or the private network feature
 * is switched on.
 *
 * Returns 0 on success or -1 in case of error
 */
static int
lxcContainerRenameAndEnableInterfaces(virDomainDef *vmDef,
                                      size_t nveths,
                                      char **veths)
{
    bool privNet = vmDef->features[VIR_DOMAIN_FEATURE_PRIVNET] ==
                   VIR_TRISTATE_SWITCH_ON;
    size_t idx;

    for (idx = 0; idx < nveths; idx++) {
        virDomainNetDef *net = lxcContainerGetNetDef(vmDef, veths[idx]);
        const char *guestName;
        bool wantOnline;

        if (!net)
            return -1;

        guestName = net->ifname_guest;
        if (!guestName) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Missing device name for container-side veth"));
            return -1;
        }

        VIR_DEBUG("Renaming %s to %s", veths[idx], guestName);
        if (virNetDevSetName(veths[idx], guestName) < 0)
            return -1;

        /* The device is only enabled when there is a reason for it:
         * at least one IP was given, or the config asks for link up */
        wantOnline = net->guestIP.nips ||
                     net->linkstate == VIR_DOMAIN_NET_INTERFACE_LINK_STATE_UP;
        if (wantOnline) {
            VIR_DEBUG("Enabling %s", guestName);
            if (virNetDevSetOnline(guestName, true) < 0)
                return -1;
        }

        /* set IP addresses and routes */
        if (virNetDevIPInfoAddToDev(guestName, &net->guestIP) < 0)
            return -1;
    }

    /* Loopback is left alone unless a veth list was passed or a
     * private network was requested */
    if (!veths && !privNet)
        return 0;

    if (virNetDevSetOnline("lo", true) < 0)
        return -1;

    return 0;
}
/*_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)*/
/* Prototype declared locally for pivot_root(), which
 * lxcContainerPivotRoot() calls.
 * NOTE(review): presumably declared here because no included header
 * provides it - confirm. */
extern int pivot_root(const char * new_root, const char * put_old);
/*
 * lxcContainerUnmountSubtree:
 * @prefix: path prefix selecting the mounts to remove
 * @isOldRootFS: whether the last mount in the list needs one more
 *               umount() after the detach
 *
 * Unmounts every mount listed by virFileGetMountReverseSubtree() for
 * @prefix, in the order returned. On the first failure the loop stops
 * and the last entry of the list is detached with MNT_DETACH instead;
 * if @isOldRootFS is set that entry is then unmounted once more.
 *
 * Returns 0 on success, -1 on error. Errors raised after a failed
 * umount() carry the errno of that first failure.
 */
static int lxcContainerUnmountSubtree(const char *prefix,
                                      bool isOldRootFS)
{
    g_auto(GStrv) mounts = NULL;
    size_t nmounts = 0;
    size_t idx;
    int umountErrno = 0;
    const char *stuck = NULL;
    const char *subtreeRoot;

    VIR_DEBUG("Unmount subtree from %s", prefix);

    if (virFileGetMountReverseSubtree("/proc/mounts", prefix,
                                      &mounts, &nmounts) < 0)
        return -1;

    for (idx = 0; idx < nmounts && !stuck; idx++) {
        VIR_DEBUG("Umount %s", mounts[idx]);
        if (umount(mounts[idx]) < 0) {
            stuck = mounts[idx];
            umountErrno = errno;
            VIR_WARN("Failed to unmount '%s', trying to detach subtree '%s': %s",
                     stuck, mounts[nmounts-1],
                     g_strerror(errno));
        }
    }

    if (!stuck)
        return 0;

    subtreeRoot = mounts[nmounts-1];

    /* This detaches the subtree */
    if (umount2(subtreeRoot, MNT_DETACH) < 0) {
        virReportSystemError(umountErrno,
                             _("Failed to unmount '%1$s' and could not detach subtree '%2$s'"),
                             stuck, subtreeRoot);
        return -1;
    }

    /* This unmounts the tmpfs on which the old root filesystem was hosted */
    if (isOldRootFS &&
        umount(subtreeRoot) < 0) {
        virReportSystemError(umountErrno,
                             _("Failed to unmount '%1$s' and could not unmount old root '%2$s'"),
                             stuck, subtreeRoot);
        return -1;
    }

    return 0;
}
/*
 * lxcContainerResolveSymlinks:
 * @fs: filesystem definition whose source path is to be resolved
 * @gentle: if true, an inaccessible or unresolvable path is skipped
 *          silently (it stays unresolved so a later call can retry)
 *
 * Replaces fs->src->path with the result of virFileResolveAllLinks()
 * and marks @fs as resolved. Does nothing when there is no source path
 * or the path was already resolved.
 *
 * Returns 0 on success or when skipped, -1 on error (only if !@gentle)
 */
static int lxcContainerResolveSymlinks(virDomainFSDef *fs, bool gentle)
{
    char *resolved;

    if (!fs->src || !fs->src->path || fs->symlinksResolved)
        return 0;

    if (access(fs->src->path, F_OK)) {
        if (!gentle) {
            virReportSystemError(errno,
                                 _("Failed to access '%1$s'"), fs->src->path);
            return -1;
        }

        /* Ignore the failure for now, a later pass tries again */
        VIR_DEBUG("Skipped unaccessible '%s'", fs->src->path);
        return 0;
    }

    VIR_DEBUG("Resolving '%s'", fs->src->path);
    if (virFileResolveAllLinks(fs->src->path, &resolved) < 0) {
        if (!gentle) {
            virReportSystemError(errno,
                                 _("Failed to resolve symlink at %1$s"),
                                 fs->src->path);
            return -1;
        }

        VIR_DEBUG("Skipped non-resolvable '%s'", fs->src->path);
        return 0;
    }

    /* Remember the outcome so the next call returns straight away */
    fs->symlinksResolved = true;

    VIR_DEBUG("Resolved '%s' to %s", fs->src->path, resolved);

    g_free(fs->src->path);
    fs->src->path = resolved;

    return 0;
}
/*
 * lxcContainerPrepareRoot:
 * @def: domain definition (its name is used to build the mount point)
 * @root: root filesystem definition
 * @sec_mount_options: security mount options handed to the block mount
 *
 * A root of type MOUNT needs no preparation. A root of type BLOCK is
 * mounted on LXC_STATE_DIR/<name>.root and @root is then rewritten into
 * a MOUNT type filesystem whose source is that directory. Every other
 * type is rejected.
 *
 * Returns 0 on success, -1 on error
 */
static int lxcContainerPrepareRoot(virDomainDef *def,
                                   virDomainFSDef *root,
                                   const char *sec_mount_options)
{
    g_autofree char *mountPoint = NULL;
    char *savedDst;
    int rc;

    VIR_DEBUG("Prepare root %d", root->type);

    if (root->type == VIR_DOMAIN_FS_TYPE_MOUNT)
        return 0;

    if (root->type == VIR_DOMAIN_FS_TYPE_FILE) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Unexpected root filesystem without loop device"));
        return -1;
    }

    if (root->type != VIR_DOMAIN_FS_TYPE_BLOCK) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Unsupported root filesystem type %1$s"),
                       virDomainFSTypeToString(root->type));
        return -1;
    }

    if (lxcContainerResolveSymlinks(root, false) < 0)
        return -1;

    mountPoint = g_strdup_printf("%s/%s.root", LXC_STATE_DIR, def->name);

    /* Point root->dst at the staging directory only for the duration
     * of the mount call; the configured target is put back either way */
    savedDst = root->dst;
    root->dst = mountPoint;
    rc = lxcContainerMountFSBlock(root, "", sec_mount_options);
    root->dst = savedDst;

    if (rc < 0)
        return -1;

    /* From here on the root is a plain directory to bind from */
    root->type = VIR_DOMAIN_FS_TYPE_MOUNT;
    g_free(root->src->path);
    root->src->path = g_steal_pointer(&mountPoint);

    return 0;
}
/*
 * lxcContainerPivotRoot:
 * @root: root filesystem definition; root->src->path is the directory
 *        that becomes the new '/'
 *
 * Makes '/' recursively private, mounts a tmpfs on <root>/.oldroot,
 * bind mounts the root into <root>/.oldroot/new (read-only if
 * requested), and pivots into it so that the previous root ends up at
 * /.oldroot.
 *
 * Returns 0 on success, -1 on error (an error is reported for every
 * failure path)
 */
static int lxcContainerPivotRoot(virDomainFSDef *root)
{
    g_autofree char *oldroot = NULL;
    g_autofree char *newroot = NULL;

    VIR_DEBUG("Pivot via %s", root->src->path);

    /* root->parent must be private, so make / private. */
    if (mount("", "/", "none", MS_PRIVATE|MS_REC, NULL) < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to make root private"));
        return -1;
    }

    oldroot = g_strdup_printf("%s/.oldroot", root->src->path);

    if (g_mkdir_with_parents(oldroot, 0777) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %1$s"),
                             oldroot);
        return -1;
    }

    /* Create a tmpfs root since old and new roots must be
     * on separate filesystems */
    if (mount("tmprootfs", oldroot, "tmpfs", 0, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount empty tmpfs at %1$s"),
                             oldroot);
        return -1;
    }

    /* Create a directory called 'new' in tmpfs */
    newroot = g_strdup_printf("%s/new", oldroot);

    if (g_mkdir_with_parents(newroot, 0777) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %1$s"),
                             newroot);
        return -1;
    }

    /* ... and mount our root onto it */
    if (mount(root->src->path, newroot, "none", MS_BIND|MS_REC, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to bind %1$s to new root %2$s"),
                             root->src->path, newroot);
        return -1;
    }

    if (root->readonly) {
        if (mount(root->src->path, newroot, "none", MS_BIND|MS_REC|MS_RDONLY|MS_REMOUNT, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to make new root %1$s readonly"),
                                 root->src->path);
            return -1;
        }
    }

    /* Now we chdir into the tmpfs, then pivot into the
     * root->src bind-mounted onto '/new' */
    if (chdir(newroot) < 0) {
        virReportSystemError(errno,
                             _("Failed to chdir into %1$s"), newroot);
        return -1;
    }

    /* The old root directory will live at /.oldroot after
     * this and will soon be unmounted completely */
    if (pivot_root(".", ".oldroot") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to pivot root"));
        return -1;
    }

    /* CWD is undefined after pivot_root, so go to /. Report the failure
     * like every other step does, so the caller does not end up with a
     * bare -1 and no error message. */
    if (chdir("/") < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to chdir into /"));
        return -1;
    }

    return 0;
}
/*
 * One entry of the table of filesystems that lxcContainerMountBasicFS()
 * sets up inside the container.
 */
typedef struct {
/* Source passed to mount(); lxcContainerMountBasicFS() substitutes
 * "/sys" for the "sysfs" entry when userns is on and netns is off */
const char *src;
/* Mount target; created with g_mkdir_with_parents() before mounting */
const char *dst;
/* Filesystem type passed to mount() */
const char *type;
/* Mount flags; MS_RDONLY is not applied on the initial mount but by a
 * follow-up bind re-mount */
int mflags;
/* Skip this entry when a user namespace is enabled */
bool skipUserNS;
/* Skip this entry when /.oldroot<dst> is not a mount point */
bool skipUnmounted;
/* Skip this entry when the network namespace is disabled */
bool skipNoNetns;
} virLXCBasicMountInfo;
/* Table walked in order by lxcContainerMountBasicFS(); its dst values
 * are also what lxcIsBasicMountLocation() matches against */
static const virLXCBasicMountInfo lxcBasicMounts[] = {
{ "proc", "/proc", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, false, false, false },
{ "/proc/sys", "/proc/sys", "none", MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, false, false, false },
{ "/.oldroot/proc/sys/net/ipv4", "/proc/sys/net/ipv4", "none", MS_BIND, false, false, true },
{ "/.oldroot/proc/sys/net/ipv6", "/proc/sys/net/ipv6", "none", MS_BIND, false, false, true },
{ "sysfs", "/sys", "sysfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, false, false, false },
{ "securityfs", "/sys/kernel/security", "securityfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, true, true, false },
#if WITH_SELINUX
{ SELINUX_MOUNT, SELINUX_MOUNT, "selinuxfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, true, true, false },
#endif
};
bool lxcIsBasicMountLocation(const char *path)
{
size_t i;
for (i = 0; i < G_N_ELEMENTS(lxcBasicMounts); i++) {
if (STREQ(path, lxcBasicMounts[i].dst))
return true;
}
return false;
}
/*
 * lxcContainerSetReadOnly:
 *
 * Re-mounts read-only every mount listed in /proc/mounts, except for
 * '/', '/.oldroot' and anything below it, and the basic mount
 * locations. The mounts are processed in the order produced by sorting
 * with virStringSortRevCompare.
 *
 * Returns 0 on success (including when there is nothing to do),
 * -1 on error
 */
static int lxcContainerSetReadOnly(void)
{
    FILE *mtab;
    struct mntent entry;
    char entbuf[1024];
    g_auto(GStrv) mounts = NULL;
    size_t nmounts = 0;
    size_t idx;

    mtab = setmntent("/proc/mounts", "r");
    if (!mtab) {
        virReportSystemError(errno, "%s",
                             _("Failed to read /proc/mounts"));
        return -1;
    }

    while (getmntent_r(mtab, &entry, entbuf, sizeof(entbuf)) != NULL) {
        char *copy;
        bool excluded = STREQ(entry.mnt_dir, "/") ||
                        STREQ(entry.mnt_dir, "/.oldroot") ||
                        STRPREFIX(entry.mnt_dir, "/.oldroot/") ||
                        lxcIsBasicMountLocation(entry.mnt_dir);

        if (excluded)
            continue;

        copy = g_strdup(entry.mnt_dir);
        VIR_APPEND_ELEMENT(mounts, nmounts, copy);
    }

    endmntent(mtab);

    if (!mounts)
        return 0;

    qsort(mounts, nmounts, sizeof(mounts[0]), virStringSortRevCompare);

    /* Grow by one slot and drop the count again, which leaves a
     * terminating entry behind so 'mounts' is a proper GStrv */
    VIR_EXPAND_N(mounts, nmounts, 1);
    nmounts--;

    for (idx = 0; idx < nmounts; idx++) {
        VIR_DEBUG("Bind readonly %s", mounts[idx]);
        if (mount(mounts[idx], mounts[idx], "none", MS_BIND|MS_REC|MS_RDONLY|MS_REMOUNT, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to make mount %1$s readonly"),
                                 mounts[idx]);
            return -1;
        }
    }

    return 0;
}
/*
 * lxcContainerMountBasicFS:
 * @userns_enabled: whether a user namespace is in use
 * @netns_disabled: whether the container shares the host network namespace
 *
 * Mounts every entry of lxcBasicMounts inside the container, honouring
 * the per-entry skip flags. Entries whose flags contain MS_RDONLY are
 * first mounted read-write and then re-mounted read-only via a bind
 * re-mount.
 *
 * Returns 0 on success, -1 on error
 */
static int lxcContainerMountBasicFS(bool userns_enabled,
                                    bool netns_disabled)
{
    size_t i;
    int mnt_mflags;

    VIR_DEBUG("Mounting basic filesystems");

    for (i = 0; i < G_N_ELEMENTS(lxcBasicMounts); i++) {
        g_autofree char *mnt_src = NULL;
        bool bindOverReadonly;
        int remount_flags;
        virLXCBasicMountInfo const *mnt = &lxcBasicMounts[i];

        /* When enable userns but disable netns, kernel will
         * forbid us doing a new fresh mount for sysfs.
         * So we had to do a bind mount for sysfs instead.
         */
        if (userns_enabled && netns_disabled &&
            STREQ(mnt->src, "sysfs")) {
            mnt_src = g_strdup("/sys");
            mnt_mflags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY|MS_BIND;
        } else {
            mnt_src = g_strdup(mnt->src);
            mnt_mflags = mnt->mflags;
        }

        VIR_DEBUG("Processing %s -> %s",
                  mnt_src, mnt->dst);

        if (mnt->skipUnmounted) {
            int ret;
            g_autofree char *hostdir = g_strdup_printf("/.oldroot%s",
                                                       mnt->dst);

            ret = virFileIsMountPoint(hostdir);
            if (ret < 0)
                return -1;

            if (ret == 0) {
                VIR_DEBUG("Skipping '%s' which isn't mounted in host",
                          mnt->dst);
                continue;
            }
        }

        if (mnt->skipUserNS && userns_enabled) {
            VIR_DEBUG("Skipping due to user ns enablement");
            continue;
        }

        /* Skip mounts with missing source without shouting: it may be a
         * missing folder in /proc due to the absence of a kernel feature */
        if (STRPREFIX(mnt_src, "/") && !virFileExists(mnt_src)) {
            VIR_DEBUG("Skipping due to missing source: %s", mnt_src);
            continue;
        }

        if (mnt->skipNoNetns && netns_disabled) {
            VIR_DEBUG("Skipping due to absence of network namespace");
            continue;
        }

        if (g_mkdir_with_parents(mnt->dst, 0777) < 0) {
            virReportSystemError(errno,
                                 _("Failed to mkdir %1$s"),
                                 mnt->dst);
            return -1;
        }

        /*
         * We can't immediately set the MS_RDONLY flag when mounting filesystems
         * because (in at least some kernel versions) this will propagate back
         * to the original mount in the host OS, turning it readonly too. Thus
         * we mount the filesystem in read-write mode initially, and then do a
         * separate read-only bind mount on top of that.
         */
        bindOverReadonly = !!(mnt_mflags & MS_RDONLY);

        VIR_DEBUG("Mount %s on %s type=%s flags=0x%x",
                  mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY);
        if (mount(mnt_src, mnt->dst, mnt->type, mnt_mflags & ~MS_RDONLY, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to mount %1$s on %2$s type %3$s flags=0x%4$x"),
                                 mnt_src, mnt->dst, NULLSTR(mnt->type),
                                 mnt_mflags & ~MS_RDONLY);
            return -1;
        }

        /* Computed once so that the error message below reports exactly
         * the flags that were handed to mount() */
        remount_flags = MS_BIND|MS_REMOUNT|mnt_mflags|MS_RDONLY;

        if (bindOverReadonly &&
            mount(mnt_src, mnt->dst, "none",
                  remount_flags, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to re-mount %1$s on %2$s flags=0x%3$x"),
                                 mnt_src, mnt->dst,
                                 remount_flags);
            return -1;
        }
    }

    return 0;
}
#if WITH_FUSE
/*
 * lxcContainerMountProcFuse:
 * @def: domain definition (its name selects the FUSE directory)
 * @stateDir: state directory, looked up below /.oldroot
 *
 * Bind mounts /.oldroot/<stateDir>/<name>.fuse/meminfo over
 * /proc/meminfo.
 *
 * Returns 0 on success, -1 on error
 */
static int lxcContainerMountProcFuse(virDomainDef *def,
                                     const char *stateDir)
{
    g_autofree char *fuseMeminfo = NULL;

    VIR_DEBUG("Mount /proc/meminfo stateDir=%s", stateDir);

    fuseMeminfo = g_strdup_printf("/.oldroot/%s/%s.fuse/meminfo",
                                  stateDir,
                                  def->name);

    if (mount(fuseMeminfo, "/proc/meminfo",
              NULL, MS_BIND, NULL) >= 0)
        return 0;

    virReportSystemError(errno,
                         _("Failed to mount %1$s on /proc/meminfo"),
                         fuseMeminfo);
    return -1;
}
#else
/* Built without FUSE support: nothing to mount, always succeeds */
static int lxcContainerMountProcFuse(virDomainDef *def G_GNUC_UNUSED,
                                     const char *stateDir G_GNUC_UNUSED)
{
    return 0;
}
#endif
/*
 * lxcContainerMountFSDev:
 * @def: domain definition (name and idmap are consulted)
 * @stateDir: state directory, looked up below /.oldroot
 *
 * Puts /.oldroot/<stateDir>/<name>.dev onto /dev, creating /dev first.
 * The mount is a bind mount when UID mappings are configured and a
 * move otherwise.
 *
 * Returns 0 on success, -1 on error
 */
static int lxcContainerMountFSDev(virDomainDef *def,
                                  const char *stateDir)
{
    g_autofree char *devSource = NULL;
    int mountFlags = MS_MOVE;

    if (def->idmap.nuidmap)
        mountFlags = MS_BIND;

    VIR_DEBUG("Mount /dev/ stateDir=%s", stateDir);

    devSource = g_strdup_printf("/.oldroot/%s/%s.dev", stateDir, def->name);

    if (g_mkdir_with_parents("/dev", 0777) < 0) {
        virReportSystemError(errno, "%s",
                             _("Cannot create /dev"));
        return -1;
    }

    VIR_DEBUG("Trying to %s %s to /dev", def->idmap.nuidmap ?
              "bind" : "move", devSource);

    if (mount(devSource, "/dev", "none", mountFlags, NULL) >= 0)
        return 0;

    virReportSystemError(errno,
                         _("Failed to mount %1$s on /dev"),
                         devSource);
    return -1;
}
/*
 * lxcContainerMountFSDevPTS:
 * @def: domain definition (name and idmap are consulted)
 * @stateDir: state directory, looked up below /.oldroot
 *
 * Puts /.oldroot/<stateDir>/<name>.devpts onto /dev/pts, creating
 * /dev/pts first. The mount is a bind mount when UID mappings are
 * configured and a move otherwise.
 *
 * Returns 0 on success, -1 on error
 */
static int lxcContainerMountFSDevPTS(virDomainDef *def,
                                     const char *stateDir)
{
    g_autofree char *ptsSource = NULL;
    int mountFlags = MS_MOVE;

    if (def->idmap.nuidmap)
        mountFlags = MS_BIND;

    VIR_DEBUG("Mount /dev/pts stateDir=%s", stateDir);

    ptsSource = g_strdup_printf("/.oldroot/%s/%s.devpts", stateDir, def->name);

    if (g_mkdir_with_parents("/dev/pts", 0777) < 0) {
        virReportSystemError(errno, "%s",
                             _("Cannot create /dev/pts"));
        return -1;
    }

    VIR_DEBUG("Trying to %s %s to /dev/pts", def->idmap.nuidmap ?
              "bind" : "move", ptsSource);

    if (mount(ptsSource, "/dev/pts", "none", mountFlags, NULL) >= 0)
        return 0;

    virReportSystemError(errno,
                         _("Failed to mount %1$s on /dev/pts"),
                         ptsSource);
    return -1;
}
static int lxcContainerSetupDevices(char **ttyPaths, size_t nttyPaths)
{
size_t i;
const struct {
const char *src;
const char *dst;
} links[] = {
{ "/proc/self/fd/0", "/dev/stdin" },
{ "/proc/self/fd/1", "/dev/stdout" },
{ "/proc/self/fd/2", "/dev/stderr" },
{ "/proc/self/fd", "/dev/fd" },
};
for (i = 0; i < G_N_ELEMENTS(links); i++) {
if (symlink(links[i].src, links[i].dst) < 0) {
virReportSystemError(errno,
_("Failed to symlink device %1$s to %2$s"),
links[i].dst, links[i].src);
return -1;
}
}
/* We have private devpts capability, so bind that */
if (virFileBindMountDevice("/dev/pts/ptmx", "/dev/ptmx") < 0)
return -1;
for (i = 0; i < nttyPaths; i++) {
g_autofree char *tty = g_strdup_printf("/dev/tty%zu", i + 1);
if (virFileBindMountDevice(ttyPaths[i], tty) < 0)
return -1;
if (i == 0 &&
virFileBindMountDevice(ttyPaths[i], "/dev/console") < 0)
return -1;
}
return 0;
}
/*
 * lxcContainerMountFSBind:
 * @fs: filesystem definition; fs->src->path is the source, fs->dst the
 *      target, fs->readonly requests a read-only re-mount
 * @srcprefix: prefix prepended to the source path
 *
 * Bind mounts <srcprefix><source> onto fs->dst. If the target does not
 * exist it is created first: as a directory when the source is a
 * directory, as an empty file otherwise. When fs->readonly is set the
 * bind mount is re-mounted read-only.
 *
 * Returns 0 on success, -1 on error. Failing to make the mount
 * read-only is an error too, so a filesystem requested read-only is
 * never silently left writable.
 */
static int lxcContainerMountFSBind(virDomainFSDef *fs,
                                   const char *srcprefix)
{
    g_autofree char *src = NULL;
    struct stat st;

    VIR_DEBUG("src=%s dst=%s", fs->src->path, fs->dst);

    src = g_strdup_printf("%s%s", srcprefix, fs->src->path);

    if (stat(fs->dst, &st) < 0) {
        if (errno != ENOENT) {
            virReportSystemError(errno, _("Unable to stat bind target %1$s"),
                                 fs->dst);
            return -1;
        }
        /* ENOENT => create the target dir or file */
        if (stat(src, &st) < 0) {
            virReportSystemError(errno, _("Unable to stat bind source %1$s"),
                                 src);
            return -1;
        }
        if (S_ISDIR(st.st_mode)) {
            if (g_mkdir_with_parents(fs->dst, 0777) < 0) {
                virReportSystemError(errno,
                                     _("Failed to create %1$s"),
                                     fs->dst);
                return -1;
            }
        } else {
            /* Create Empty file for target mount point */
            int fd = open(fs->dst, O_WRONLY|O_CREAT|O_NOCTTY|O_NONBLOCK, 0666);
            if (fd < 0) {
                if (errno != EEXIST) {
                    virReportSystemError(errno,
                                         _("Failed to create bind target %1$s"),
                                         fs->dst);
                    return -1;
                }
            }
            if (VIR_CLOSE(fd) < 0) {
                virReportSystemError(errno,
                                     _("Failed to close bind target %1$s"),
                                     fs->dst);
                return -1;
            }
        }
    }

    if (mount(src, fs->dst, "none", MS_BIND, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to bind mount directory %1$s to %2$s"),
                             src, fs->dst);
        return -1;
    }

    if (fs->readonly) {
        VIR_DEBUG("Binding %s readonly", fs->dst);
        if (mount(src, fs->dst, "none", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
            virReportSystemError(errno,
                                 _("Failed to make directory %1$s readonly"),
                                 fs->dst);
            /* Fail like lxcContainerMountFSTmpfs() does, instead of
             * reporting an error and then claiming success */
            return -1;
        }
    }

    return 0;
}
#ifdef WITH_BLKID
/*
 * lxcContainerMountDetectFilesystem:
 * @src: path of the device/image to probe
 * @type: filled with a newly allocated filesystem type name, or NULL
 *        when the probe found nothing
 *
 * Probes @src with libblkid for its superblock type.
 *
 * Returns 0 on success (with *@type possibly NULL), -1 on error
 */
static int
lxcContainerMountDetectFilesystem(const char *src, char **type)
{
    blkid_probe probe = NULL;
    const char *fstype = NULL;
    int result = -1;
    int srcfd;
    int probeRc;

    *type = NULL;

    srcfd = open(src, O_RDONLY);
    if (srcfd < 0) {
        virReportSystemError(errno,
                             _("Unable to open filesystem %1$s"), src);
        return -1;
    }

    probe = blkid_new_probe();
    if (!probe) {
        virReportSystemError(errno, "%s",
                             _("Unable to create blkid library handle"));
        goto cleanup;
    }

    if (blkid_probe_set_device(probe, srcfd, 0, 0) < 0) {
        virReportSystemError(EINVAL,
                             _("Unable to associate device %1$s with blkid library"),
                             src);
        goto cleanup;
    }

    blkid_probe_enable_superblocks(probe, 1);
    blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

    probeRc = blkid_do_safeprobe(probe);
    switch (probeRc) {
    case 0:
        break;

    case 1:
        /* Nothing found, return success with *type == NULL */
        goto done;

    case -2:
        virReportSystemError(EINVAL,
                             _("Too many filesystems detected for %1$s"),
                             src);
        goto cleanup;

    default:
        virReportSystemError(errno,
                             _("Unable to detect filesystem for %1$s"),
                             src);
        goto cleanup;
    }

    if (blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL) < 0) {
        virReportSystemError(ENOENT,
                             _("Unable to find filesystem type for %1$s"),
                             src);
        goto cleanup;
    }

    *type = g_strdup(fstype);

 done:
    result = 0;

 cleanup:
    VIR_FORCE_CLOSE(srcfd);
    if (probe)
        blkid_free_probe(probe);
    return result;
}
#else /* ! WITH_BLKID */
/* Built without libblkid: report success with no detected type */
static int
lxcContainerMountDetectFilesystem(const char *src G_GNUC_UNUSED,
                                  char **type)
{
    *type = NULL;
    return 0;
}
#endif /* ! WITH_BLKID */
/*
 * lxcContainerMountFSBlockAuto:
 * @fs: filesystem definition; only fs->dst (the mount target) is read
 * @fsflags: flags passed unchanged to every mount() attempt
 * @src: path of the device to mount
 * @srcprefix: prefix prepended to the filesystem list file paths
 * @sec_mount_options: passed as the data argument of mount()
 *
 * This function attempts to do automatic detection of filesystem
 * type following the same rules as the util-linux 'mount' binary:
 * each type listed in <srcprefix>/etc/filesystems is tried in turn,
 * and <srcprefix>/proc/filesystems is tried afterwards if the former
 * does not exist or contains a '*' line.
 *
 * This function itself does not use libblkid; its caller in this file,
 * lxcContainerMountFSBlockHelper(), only gets here when
 * lxcContainerMountDetectFilesystem() yielded no type.
 *
 * Returns 0 once a mount() succeeded, -1 otherwise
 */
static int lxcContainerMountFSBlockAuto(virDomainFSDef *fs,
int fsflags,
const char *src,
const char *srcprefix,
const char *sec_mount_options)
{
FILE *fp = NULL;
int ret = -1;
/* false: reading /etc/filesystems, true: reading /proc/filesystems */
bool tryProc = false;
/* set once a '*' line was seen in /etc/filesystems */
bool gotStar = false;
g_autofree char *fslist = NULL;
const char *type;
VIR_DEBUG("src=%s dst=%s srcprefix=%s", src, fs->dst, srcprefix);
/* First time around we use /etc/filesystems */
retry:
/* Release the path built on the previous pass before building a new one */
g_free(fslist);
fslist = g_strdup_printf("%s%s", srcprefix,
tryProc ? "/proc/filesystems" : "/etc/filesystems");
VIR_DEBUG("Open fslist %s", fslist);
if (!(fp = fopen(fslist, "r"))) {
/* If /etc/filesystems does not exist, then we need to retry
* with /proc/filesystems next
*/
if (errno == ENOENT &&
!tryProc) {
tryProc = true;
goto retry;
}
virReportSystemError(errno,
_("Unable to read %1$s"),
fslist);
goto cleanup;
}
while (!feof(fp)) {
g_autofree char *line = NULL;
/* NOTE(review): n is handed to getline() without being initialised
 * while line is NULL; initialising it to 0 would be the documented
 * convention - confirm and consider fixing */
size_t n;
if (getline(&line, &n, fp) <= 0) {
if (feof(fp))
break;
/* A read failure other than EOF ends with -1 and no error reported */
goto cleanup;
}
/* Lines containing "nodev" are never tried */
if (strstr(line, "nodev"))
continue;
/* Cut the line at the newline, then skip leading whitespace */
type = strchr(line, '\n');
if (type)
line[type-line] = '\0';
type = line;
virSkipSpaces(&type);
/*
* /etc/filesystems is only allowed to contain '*' on the last line
*/
if (gotStar && !tryProc) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("%1$s has unexpected '*' before last line"),
fslist);
goto cleanup;
}
/* An '*' on the last line in /etc/filesystems
* means try /proc/filesystems next. We don't
* jump immediately though, since we need to see
* if any more lines follow
*/
if (!tryProc &&
STREQ(type, "*"))
gotStar = true;
VIR_DEBUG("Trying mount '%s' on '%s' with '%s' opts '%s'",
src, fs->dst, type, sec_mount_options);
if (mount(src, fs->dst, type, fsflags, sec_mount_options) < 0) {
/* These errnos indicate a bogus filesystem type for
* the image we have, so skip to the next type
*/
if (errno == EINVAL || errno == ENODEV)
continue;
virReportSystemError(errno,
_("Failed to mount device %1$s to %2$s"),
src, fs->dst);
goto cleanup;
}
/* First successful mount wins */
ret = 0;
break;
}
/* We've got to the end of /etc/filesystems and saw
* a '*', so we must try /proc/filesystems next
*/
if (ret != 0 &&
!tryProc &&
gotStar) {
tryProc = true;
/* Close the current list before the retry opens the next one */
VIR_FORCE_FCLOSE(fp);
goto retry;
}
/* Every listed type was tried and none of them could be mounted */
if (ret != 0) {
virReportSystemError(ENODEV,
_("Failed to mount device %1$s to %2$s, unable to detect filesystem"),
src, fs->dst);
}
VIR_DEBUG("Done mounting filesystem ret=%d tryProc=%d", ret, tryProc);
cleanup:
VIR_FORCE_FCLOSE(fp);
return ret;
}
/*
 * lxcContainerMountFSBlockHelper:
 * @fs: filesystem definition; fs->dst is the target, fs->readonly
 *      selects MS_RDONLY
 * @src: path of the block device to mount
 * @srcprefix: prefix handed on to the automatic type probing
 * @sec_mount_options: passed as the data argument of mount()
 *
 * Mount a block device 'src' on fs->dst. The target directory is
 * created first. If lxcContainerMountDetectFilesystem() reports a
 * filesystem type it is used directly, otherwise the type is probed by
 * lxcContainerMountFSBlockAuto().
 *
 * Returns 0 on success, -1 on error
 */
static int lxcContainerMountFSBlockHelper(virDomainFSDef *fs,
                                          const char *src,
                                          const char *srcprefix,
                                          const char *sec_mount_options)
{
    g_autofree char *detected = NULL;
    int mountFlags = fs->readonly ? MS_RDONLY : 0;

    if (g_mkdir_with_parents(fs->dst, 0777) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %1$s"),
                             fs->dst);
        return -1;
    }

    if (lxcContainerMountDetectFilesystem(src, &detected) < 0)
        return -1;

    /* No type detected: fall back to trying the listed types */
    if (!detected)
        return lxcContainerMountFSBlockAuto(fs, mountFlags, src, srcprefix, sec_mount_options);

    VIR_DEBUG("Mount '%s' on '%s' with detected format '%s' opts '%s'",
              src, fs->dst, detected, sec_mount_options);
    if (mount(src, fs->dst, detected, mountFlags, sec_mount_options) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount device %1$s to %2$s as %3$s"),
                             src, fs->dst, detected);
        return -1;
    }

    return 0;
}
/*
 * lxcContainerMountFSBlock:
 * @fs: filesystem definition; fs->src->path names the block device
 * @srcprefix: prefix prepended to the device path
 * @sec_mount_options: security mount options handed on to the helper
 *
 * Builds the full device path and mounts it on fs->dst via
 * lxcContainerMountFSBlockHelper().
 *
 * Returns the helper's result: 0 on success, -1 on error
 */
static int lxcContainerMountFSBlock(virDomainFSDef *fs,
                                    const char *srcprefix,
                                    const char *sec_mount_options)
{
    g_autofree char *devPath = NULL;
    int rc;

    VIR_DEBUG("src=%s dst=%s", fs->src->path, fs->dst);

    devPath = g_strdup_printf("%s%s", srcprefix, fs->src->path);
    rc = lxcContainerMountFSBlockHelper(fs, devPath, srcprefix, sec_mount_options);

    VIR_DEBUG("Done mounting filesystem ret=%d", rc);
    return rc;
}
/*
 * Mount a tmpfs of size fs->usage on fs->dst, creating the mount point
 * when needed. 'sec_mount_options' is appended verbatim to the tmpfs
 * "size=" option string. A readonly filesystem is remounted read-only
 * via a bind remount once the tmpfs exists.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxcContainerMountFSTmpfs(virDomainFSDef *fs,
                                    char *sec_mount_options)
{
    g_autofree char *tmpfsOpts = NULL;

    VIR_DEBUG("usage=%lld sec=%s", fs->usage, sec_mount_options);

    tmpfsOpts = g_strdup_printf("size=%lld%s", fs->usage, sec_mount_options);

    if (g_mkdir_with_parents(fs->dst, 0777) < 0) {
        virReportSystemError(errno,
                             _("Failed to create %1$s"),
                             fs->dst);
        return -1;
    }

    if (mount("tmpfs", fs->dst, "tmpfs", MS_NOSUID|MS_NODEV, tmpfsOpts) < 0) {
        virReportSystemError(errno,
                             _("Failed to mount directory %1$s as tmpfs"),
                             fs->dst);
        return -1;
    }

    /* Writable tmpfs needs no further work */
    if (!fs->readonly)
        return 0;

    VIR_DEBUG("Binding %s readonly", fs->dst);
    if (mount(fs->dst, fs->dst, "none", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
        virReportSystemError(errno,
                             _("Failed to make directory %1$s readonly"),
                             fs->dst);
        return -1;
    }

    return 0;
}
/*
 * Dispatch the mount of a single filesystem according to fs->type.
 *
 * MOUNT sources are bind-mounted from beneath "/.oldroot", BIND sources
 * from the current root, BLOCK devices are looked up beneath
 * "/.oldroot", and RAM filesystems become a tmpfs. All other types are
 * rejected with an error.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxcContainerMountFS(virDomainFSDef *fs,
                               char *sec_mount_options)
{
    int rc = -1;

    switch (fs->type) {
    case VIR_DOMAIN_FS_TYPE_MOUNT:
        rc = lxcContainerMountFSBind(fs, "/.oldroot");
        break;

    case VIR_DOMAIN_FS_TYPE_BLOCK:
        rc = lxcContainerMountFSBlock(fs, "/.oldroot", sec_mount_options);
        break;

    case VIR_DOMAIN_FS_TYPE_RAM:
        rc = lxcContainerMountFSTmpfs(fs, sec_mount_options);
        break;

    case VIR_DOMAIN_FS_TYPE_BIND:
        rc = lxcContainerMountFSBind(fs, "");
        break;

    case VIR_DOMAIN_FS_TYPE_FILE:
        /* We do actually support this, but the lxc controller
         * should have associated the file with a loopback
         * device and changed this to TYPE_BLOCK for us */
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unexpected filesystem type %1$s"),
                       virDomainFSTypeToString(fs->type));
        break;

    case VIR_DOMAIN_FS_TYPE_TEMPLATE:
    case VIR_DOMAIN_FS_TYPE_VOLUME:
    case VIR_DOMAIN_FS_TYPE_LAST:
    default:
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Cannot mount filesystem type %1$s"),
                       virDomainFSTypeToString(fs->type));
        break;
    }

    /* Helpers signal failure with a negative value; normalise to -1 */
    return rc < 0 ? -1 : 0;
}
/*
 * Mount every filesystem from the domain definition except the one
 * targeting "/".
 *
 * For each entry the symlinks are resolved, anything already mounted
 * at the destination is unmounted (unless the source path lives
 * beneath the destination), and the filesystem is mounted.
 *
 * Returns 0 on success, -1 on the first failure.
 */
static int lxcContainerMountAllFS(virDomainDef *vmDef,
                                  char *sec_mount_options)
{
    size_t i;

    VIR_DEBUG("Mounting all non-root filesystems");

    /* Pull in rest of container's mounts */
    for (i = 0; i < vmDef->nfss; i++) {
        const char *srcpath;

        if (STREQ(vmDef->fss[i]->dst, "/"))
            continue;

        /* The source (or its path) may be absent, as the check further
         * down acknowledges, so never hand a NULL to '%s' */
        srcpath = vmDef->fss[i]->src ? vmDef->fss[i]->src->path : NULL;
        VIR_DEBUG("Mounting '%s' -> '%s'",
                  srcpath ? srcpath : "<null>", vmDef->fss[i]->dst);

        if (lxcContainerResolveSymlinks(vmDef->fss[i], false) < 0)
            return -1;

        /* Re-read the path: the call above receives the filesystem
         * definition and may have updated it */
        srcpath = vmDef->fss[i]->src ? vmDef->fss[i]->src->path : NULL;

        if (!(srcpath && STRPREFIX(srcpath, vmDef->fss[i]->dst)) &&
            lxcContainerUnmountSubtree(vmDef->fss[i]->dst, false) < 0)
            return -1;

        if (lxcContainerMountFS(vmDef->fss[i], sec_mount_options) < 0)
            return -1;
    }

    VIR_DEBUG("Mounted all non-root filesystems");
    return 0;
}
/*
 * Ensure the directory that will contain the device node 'dev' exists,
 * creating it (and any missing parents) with mode 0777.
 *
 * A path without any '/' or one located directly under the root
 * directory needs no directory to be created.
 *
 * Returns 0 on success, -1 on failure (an error has been reported).
 */
int lxcContainerSetupHostdevCapsMakePath(const char *dev)
{
    g_autofree char *dir = NULL;
    char *sep;

    dir = g_strdup(dev);

    /* Cutting at a leading '/' would leave an empty string, which
     * g_mkdir_with_parents() rejects with EINVAL even though the root
     * directory obviously exists. */
    if (!(sep = strrchr(dir, '/')) || sep == dir)
        return 0;

    *sep = '\0';
    if (g_mkdir_with_parents(dir, 0777) < 0) {
        virReportSystemError(errno,
                             _("Failed to create directory for '%1$s' dev '%2$s'"),
                             dir, dev);
        return -1;
    }

    return 0;
}
static int lxcContainerUnmountForSharedRoot(const char *stateDir,
const char *domain)
{
g_autofree char *tmp = NULL;
#if WITH_SELINUX
/* Some versions of Linux kernel don't let you overmount
* the selinux filesystem, so make sure we kill it first
*/
if (lxcContainerUnmountSubtree(SELINUX_MOUNT, false) < 0)
return -1;
#endif
/* These filesystems are created by libvirt temporarily, they
* shouldn't appear in container. */
tmp = g_strdup_printf("%s/%s.dev", stateDir, domain);
if (lxcContainerUnmountSubtree(tmp, false) < 0)
return -1;
g_free(tmp);
tmp = g_strdup_printf("%s/%s.devpts", stateDir, domain);
if (lxcContainerUnmountSubtree(tmp, false) < 0)
return -1;
#if WITH_FUSE
g_free(tmp);
tmp = g_strdup_printf("%s/%s.fuse", stateDir, domain);
if (lxcContainerUnmountSubtree(tmp, false) < 0)
return -1;
#endif
/* If we have the root source being '/', then we need to
* get rid of any existing stuff under /proc, /sys & /tmp.
* We need new namespace aware versions of those. We must
* do /proc last otherwise we won't find /proc/mounts :-) */
if (lxcContainerUnmountSubtree("/sys", false) < 0 ||
lxcContainerUnmountSubtree("/dev", false) < 0 ||
lxcContainerUnmountSubtree("/proc", false) < 0)
return -1;
return 0;
}
/*
 * Decide whether the container needs a network namespace of its own:
 * true when the domain has network interfaces, has the private network
 * feature switched on, or is assigned a capabilities-mode hostdev of
 * type net.
 */
static bool
lxcNeedNetworkNamespace(virDomainDef *def)
{
    size_t idx;

    if (def->nets != NULL ||
        def->features[VIR_DOMAIN_FEATURE_PRIVNET] == VIR_TRISTATE_SWITCH_ON)
        return true;

    for (idx = 0; idx < def->nhostdevs; idx++) {
        if (def->hostdevs[idx]->mode != VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES)
            continue;

        if (def->hostdevs[idx]->source.caps.type == VIR_DOMAIN_HOSTDEV_CAPS_TYPE_NET)
            return true;
    }

    return false;
}
/*
 * Build the container's filesystem view around a pivot_root(), which
 * acts as a better-chroot-than-chroot (approach based on the thread at
 * https://lkml.org/lkml/2008/3/5/29).
 *
 * The steps below are strictly ordered: the root is prepared and
 * pivoted, the basic filesystems, cgroups, /dev and /dev/pts are set
 * up, the remaining guest filesystems are mounted and finally the old
 * root is detached.
 *
 * Returns 0 on success, -1 as soon as any step fails.
 */
static int lxcContainerSetupPivotRoot(virDomainDef *vmDef,
                                      virDomainFSDef *root,
                                      char **ttyPaths,
                                      size_t nttyPaths,
                                      virSecurityManager *securityDriver)
{
    g_autoptr(virCgroup) selfCgroup = NULL;
    g_autofree char *mountOpts = NULL;
    g_autofree char *resolvedStateDir = NULL;
    bool hostRootShared;
    bool sharedNetNS;

    VIR_DEBUG("Setup pivot root");

    mountOpts = virSecurityManagerGetMountOptions(securityDriver, vmDef);
    if (!mountOpts)
        return -1;

    /* Before pivoting we need to identify any
     * cgroups controllers that are mounted */
    if (virCgroupNewSelf(&selfCgroup) < 0)
        return -1;

    if (virFileResolveAllLinks(LXC_STATE_DIR, &resolvedStateDir) < 0)
        return -1;

    /* Ensure the root filesystem is mounted */
    if (lxcContainerPrepareRoot(vmDef, root, mountOpts) < 0)
        return -1;

    /* Gives us a private root, leaving all parent OS mounts on /.oldroot */
    if (lxcContainerPivotRoot(root) < 0)
        return -1;

    /* FIXME: we should find a way to unmount these mounts for container
     * even user namespace is enabled. */
    hostRootShared = STREQ(root->src->path, "/") && (!vmDef->idmap.nuidmap);
    if (hostRootShared &&
        lxcContainerUnmountForSharedRoot(resolvedStateDir, vmDef->name) < 0)
        return -1;

    /* Mounts the core /proc, /sys, etc filesystems */
    sharedNetNS = !lxcNeedNetworkNamespace(vmDef);
    if (lxcContainerMountBasicFS(vmDef->idmap.nuidmap, sharedNetNS) < 0)
        return -1;

    /* Ensure entire root filesystem (except /.oldroot) is readonly */
    if (root->readonly && lxcContainerSetReadOnly() < 0)
        return -1;

    /* Mounts /proc/meminfo etc sysinfo */
    if (lxcContainerMountProcFuse(vmDef, resolvedStateDir) < 0)
        return -1;

    /* Now we can re-mount the cgroups controllers in the
     * same configuration as before */
    if (virCgroupBindMount(selfCgroup, "/.oldroot/", mountOpts) < 0)
        return -1;

    /* Mounts /dev */
    if (lxcContainerMountFSDev(vmDef, resolvedStateDir) < 0)
        return -1;

    /* Mounts /dev/pts */
    if (lxcContainerMountFSDevPTS(vmDef, resolvedStateDir) < 0)
        return -1;

    /* Setup device nodes in /dev/ */
    if (lxcContainerSetupDevices(ttyPaths, nttyPaths) < 0)
        return -1;

    /* Sets up any non-root mounts from guest config */
    if (lxcContainerMountAllFS(vmDef, mountOpts) < 0)
        return -1;

    /* Gets rid of all remaining mounts from host OS, including /.oldroot itself */
    if (lxcContainerUnmountSubtree("/.oldroot", true) < 0)
        return -1;

    return 0;
}
/*
 * Run a first, lenient symlink resolution pass over every filesystem
 * of the domain definition.
 *
 * Returns 0 on success, -1 if any filesystem fails to resolve.
 */
static int lxcContainerResolveAllSymlinks(virDomainDef *vmDef)
{
    size_t idx;

    VIR_DEBUG("Resolving symlinks");

    for (idx = 0; idx < vmDef->nfss; idx++) {
        /* In the first pass, be gentle as some files may
           depend on other filesystems to be mounted */
        if (lxcContainerResolveSymlinks(vmDef->fss[idx], true) < 0)
            return -1;
    }

    VIR_DEBUG("Resolved all filesystem symlinks");
    return 0;
}
/*
* lxcContainerDropCapabilities:
* @def: domain definition; supplies the capabilities policy and the
*       per-capability tristate settings
* @keepReboot: when true, CAP_SYS_BOOT is not dropped under the
*              default policy
*
* This is running as the 'init' process inside the container.
* It removes some capabilities that could be dangerous to
* host system, since they are not currently "containerized"
*
* The policy from def->features[VIR_DOMAIN_FEATURE_CAPABILITIES]
* selects the behaviour:
*  - DENY: every capability is cleared, then only those switched on
*    in def->caps_features are added back;
*  - DEFAULT: a fixed set of dangerous capabilities is dropped unless
*    switched on, any other capability is dropped only when switched
*    off;
*  - ALLOW: only capabilities explicitly switched off are dropped.
*
* Without libcap-ng support the function only logs a warning.
*
* Returns 0 on success, -1 on failure (an error has been reported).
*/
#if WITH_CAPNG
static int lxcContainerDropCapabilities(virDomainDef *def,
bool keepReboot)
{
int ret;
size_t i;
int policy = def->features[VIR_DOMAIN_FEATURE_CAPABILITIES];
/* Maps virDomainProcessCapsFeature to CAPS_* */
/* The table is indexed by the loop counter below, so its order has
 * to follow the enum's order */
static int capsMapping[] = {CAP_AUDIT_CONTROL,
CAP_AUDIT_WRITE,
CAP_BLOCK_SUSPEND,
CAP_CHOWN,
CAP_DAC_OVERRIDE,
CAP_DAC_READ_SEARCH,
CAP_FOWNER,
CAP_FSETID,
CAP_IPC_LOCK,
CAP_IPC_OWNER,
CAP_KILL,
CAP_LEASE,
CAP_LINUX_IMMUTABLE,
CAP_MAC_ADMIN,
CAP_MAC_OVERRIDE,
CAP_MKNOD,
CAP_NET_ADMIN,
CAP_NET_BIND_SERVICE,
CAP_NET_BROADCAST,
CAP_NET_RAW,
CAP_SETGID,
CAP_SETFCAP,
CAP_SETPCAP,
CAP_SETUID,
CAP_SYS_ADMIN,
CAP_SYS_BOOT,
CAP_SYS_CHROOT,
CAP_SYS_MODULE,
CAP_SYS_NICE,
CAP_SYS_PACCT,
CAP_SYS_PTRACE,
CAP_SYS_RAWIO,
CAP_SYS_RESOURCE,
CAP_SYS_TIME,
CAP_SYS_TTY_CONFIG,
CAP_SYSLOG,
CAP_WAKE_ALARM};
/* Load the current process capabilities as the starting state.
 * NOTE(review): the return value is not checked here */
capng_get_caps_process();
/* Make sure we drop everything if required by the user */
if (policy == VIR_DOMAIN_CAPABILITIES_POLICY_DENY)
capng_clear(CAPNG_SELECT_BOTH);
/* Apply all single capabilities changes */
for (i = 0; i < VIR_DOMAIN_PROCES_CAPS_FEATURE_LAST; i++) {
bool toDrop = false;
int state = def->caps_features[i];
/* Skip table entries that cap_valid() rejects */
if (!cap_valid(capsMapping[i]))
continue;
switch ((virDomainCapabilitiesPolicy) policy) {
case VIR_DOMAIN_CAPABILITIES_POLICY_DENY:
/* Everything was cleared above; add back what is switched on */
if (state == VIR_TRISTATE_SWITCH_ON &&
(ret = capng_update(CAPNG_ADD,
CAPNG_EFFECTIVE | CAPNG_PERMITTED |
CAPNG_INHERITABLE | CAPNG_BOUNDING_SET,
capsMapping[i])) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Failed to add capability %1$s: %2$d"),
virDomainProcessCapsFeatureTypeToString(i), ret);
return -1;
}
break;
case VIR_DOMAIN_CAPABILITIES_POLICY_DEFAULT:
/* Only decide here whether to drop; the drop itself is done by
 * the shared code in the ALLOW case reached via fallthrough */
switch (i) {
case VIR_DOMAIN_PROCES_CAPS_FEATURE_SYS_BOOT: /* No use of reboot */
toDrop = !keepReboot && (state != VIR_TRISTATE_SWITCH_ON);
break;
case VIR_DOMAIN_PROCES_CAPS_FEATURE_SYS_MODULE: /* No kernel module loading */
case VIR_DOMAIN_PROCES_CAPS_FEATURE_SYS_TIME: /* No changing the clock */
case VIR_DOMAIN_PROCES_CAPS_FEATURE_MKNOD: /* No creating device nodes */
case VIR_DOMAIN_PROCES_CAPS_FEATURE_AUDIT_CONTROL: /* No messing with auditing status */
case VIR_DOMAIN_PROCES_CAPS_FEATURE_MAC_ADMIN: /* No messing with LSM config */
toDrop = (state != VIR_TRISTATE_SWITCH_ON);
break;
default: /* User specified capabilities to drop */
toDrop = (state == VIR_TRISTATE_SWITCH_OFF);
}
G_GNUC_FALLTHROUGH;
case VIR_DOMAIN_CAPABILITIES_POLICY_ALLOW:
/* Under ALLOW the decision is made here; under DEFAULT the value
 * computed above is kept */
if (policy == VIR_DOMAIN_CAPABILITIES_POLICY_ALLOW)
toDrop = state == VIR_TRISTATE_SWITCH_OFF;
if (toDrop && (ret = capng_update(CAPNG_DROP,
CAPNG_EFFECTIVE | CAPNG_PERMITTED |
CAPNG_INHERITABLE | CAPNG_BOUNDING_SET,
capsMapping[i])) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Failed to remove capability %1$s: %2$d"),
virDomainProcessCapsFeatureTypeToString(i), ret);
return -1;
}
break;
case VIR_DOMAIN_CAPABILITIES_POLICY_LAST:
default:
virReportEnumRangeError(virDomainCapabilitiesPolicy, policy);
return -1;
}
}
/* Commit the accumulated changes to the process */
if ((ret = capng_apply(CAPNG_SELECT_BOTH)) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Failed to apply capabilities: %1$d"), ret);
return -1;
}
/* We do not need to call capng_lock() in this case. The bounding
* set restriction will prevent them reacquiring sys_boot/module/time,
* etc which is all that matters for the container. Once inside the
* container it is fine for SECURE_NOROOT / SECURE_NO_SETUID_FIXUP to
* be unmasked - they can never escape the bounding set. */
return 0;
}
#else
/* Fallback used when libcap-ng is not compiled in: nothing is dropped,
 * a warning is logged and success is returned */
static int lxcContainerDropCapabilities(virDomainDef *def G_GNUC_UNUSED,
bool keepReboot G_GNUC_UNUSED)
{
VIR_WARN("libcap-ng support not compiled in, unable to clear capabilities");
return 0;
}
#endif
/**
 * lxcAttachNS:
 * @ns_fd: array of namespace file descriptors to attach to, or NULL
 *
 * Joins the namespaces described by @ns_fd; the array is handed to
 * virProcessSetNamespaces() with VIR_LXC_DOMAIN_NAMESPACE_LAST as its
 * length. A NULL @ns_fd means there is nothing to attach.
 *
 * Returns 0 on success, -1 on failure
 */
static int lxcAttachNS(int *ns_fd)
{
    if (!ns_fd)
        return 0;

    if (virProcessSetNamespaces((size_t)VIR_LXC_DOMAIN_NAMESPACE_LAST,
                                ns_fd) < 0)
        return -1;

    return 0;
}
/**
 * lxcContainerSetUserGroup:
 * @cmd: command to update
 * @vmDef: domain definition for the container
 * @ttyPath: guest path to the tty
 *
 * Applies the init user and init group from the domain definition to
 * @cmd. Names are translated to numeric IDs here, which is why this
 * has to run after the pivot root is done.
 *
 * When an init user is configured, the tty at @ttyPath is handed over
 * to that user as well.
 *
 * Returns 0 on success, -1 on failure
 */
static int lxcContainerSetUserGroup(virCommand *cmd,
                                    virDomainDef *vmDef,
                                    const char *ttyPath)
{
    if (vmDef->os.inituser) {
        uid_t initUid;

        if (virGetUserID(vmDef->os.inituser, &initUid) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR, _("User %1$s doesn't exist"),
                           vmDef->os.inituser);
            return -1;
        }
        virCommandSetUID(cmd, initUid);

        /* Change the newly created tty owner to the inituid for
         * shells to have job control. */
        if (chown(ttyPath, initUid, -1) < 0) {
            virReportSystemError(errno,
                                 _("Failed to change ownership of tty %1$s"),
                                 ttyPath);
            return -1;
        }
    }

    if (vmDef->os.initgroup) {
        gid_t initGid;

        if (virGetGroupID(vmDef->os.initgroup, &initGid) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR, _("Group %1$s doesn't exist"),
                           vmDef->os.initgroup);
            return -1;
        }
        virCommandSetGID(cmd, initGid);
    }

    return 0;
}
/* Characters accepted in the hostname derived from the domain name */
static const char hostname_validchars[] =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789-";

/*
 * Set the container's hostname from the domain name.
 *
 * A copy of the name is filtered against hostname_validchars and any
 * leading hyphens are skipped, since a hostname must not begin with
 * one.
 *
 * Returns 0 on success, -1 if sethostname() fails.
 */
static int lxcContainerSetHostname(virDomainDef *def)
{
    g_autofree char *name = NULL;
    const char *hostname = NULL;

    /* Filter the VM name to get a valid hostname */
    name = g_strdup(def->name);

    /* RFC 1123 allows 0-9 digits as a first character in hostname */
    virStringFilterChars(name, hostname_validchars);

    /* Skip every leading '-', not just the first one: a name such as
     * "--web" must not end up as the hostname "-web" */
    hostname = name;
    while (*hostname == '-')
        hostname++;

    if (sethostname(hostname, strlen(hostname)) < 0) {
        virReportSystemError(errno, "%s", _("Failed to set hostname"));
        return -1;
    }

    return 0;
}
/**
* lxcContainerChild:
* @data: pointer to container arguments (an lxc_child_argv_t)
*
* This function is run in the process clone()'d in lxcContainerStart.
* Perform a number of container setup tasks:
* Join inherited namespaces and wait for the controller
* Setup container file system (pivot root, mounts)
* Configure user/group, interfaces, hostname and capabilities
* Notify the controller and set up security label and FDs
* Then exec's the container init
*
* On success the init binary is exec'd, so the function only returns
* when something went wrong.
*
* Returns -1 (or the failing virCommandExec() result) in case of error
*/
static int lxcContainerChild(void *data)
{
lxc_child_argv_t *argv = data;
virDomainDef *vmDef = argv->config;
int ttyfd = -1;
int ret = -1;
g_autofree char *ttyPath = NULL;
virDomainFSDef *root;
g_autoptr(virCommand) cmd = NULL;
int hasReboot;
g_autofree gid_t *groups = NULL;
int ngroups;
if (NULL == vmDef) {
virReportError(VIR_ERR_INTERNAL_ERROR,
"%s", _("lxcChild() passed invalid vm definition"));
goto cleanup;
}
/* NOTE(review): unlike every other failure path this one returns
 * directly, so the "Failure in libvirt_lxc startup" message printed
 * after the cleanup label is skipped - confirm this is intended */
if (lxcAttachNS(argv->nsInheritFDs) < 0) {
virReportError(VIR_ERR_SYSTEM_ERROR, "%s",
_("failed to attach the namespace"));
return -1;
}
/* Wait for controller to finish setup tasks, including
* things like move of network interfaces, uid/gid mapping
*/
if (lxcContainerWaitForContinue(argv->monitor) < 0) {
virReportSystemError(errno, "%s",
_("Failed to read the container continue message"));
goto cleanup;
}
VIR_DEBUG("Received container continue message");
/* The result is used further down to decide whether CAP_SYS_BOOT
 * may be kept */
if ((hasReboot = lxcContainerHasReboot()) < 0)
goto cleanup;
/* Build the init command now; it is only exec'd at the very end */
cmd = lxcContainerBuildInitCmd(vmDef,
argv->ttyPaths,
argv->nttyPaths);
virCommandWriteArgLog(cmd, 1);
if (lxcContainerSetID(vmDef) < 0)
goto cleanup;
root = virDomainGetFilesystemForTarget(vmDef, "/");
/* The first tty is opened through the per-domain devpts directory
 * below LXC_STATE_DIR, using the tty name with any "/dev/pts/"
 * prefix removed */
if (argv->nttyPaths) {
const char *tty = argv->ttyPaths[0];
if (STRPREFIX(tty, "/dev/pts/"))
tty += strlen("/dev/pts/");
ttyPath = g_strdup_printf("%s/%s.devpts/%s", LXC_STATE_DIR, vmDef->name,
tty);
} else {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("At least one tty is required"));
goto cleanup;
}
VIR_DEBUG("Container TTY path: %s", ttyPath);
ttyfd = open(ttyPath, O_RDWR);
if (ttyfd < 0) {
virReportSystemError(errno,
_("Failed to open tty %1$s"),
ttyPath);
goto cleanup;
}
if (lxcContainerResolveAllSymlinks(vmDef) < 0)
goto cleanup;
VIR_DEBUG("Setting up pivot");
if (lxcContainerSetupPivotRoot(vmDef, root,
argv->ttyPaths, argv->nttyPaths,
argv->securityDriver) < 0)
goto cleanup;
/* From here on paths are relative to the container's root */
if (!virFileExists(vmDef->os.init)) {
virReportSystemError(errno,
_("cannot find init path '%1$s' relative to container root"),
vmDef->os.init);
goto cleanup;
}
if (lxcContainerSetUserGroup(cmd, vmDef, argv->ttyPaths[0]) < 0)
goto cleanup;
/* rename and enable interfaces */
if (lxcContainerRenameAndEnableInterfaces(vmDef,
argv->nveths,
argv->veths) < 0) {
goto cleanup;
}
if (lxcContainerSetHostname(vmDef) < 0)
goto cleanup;
/* drop a set of root capabilities */
if (lxcContainerDropCapabilities(vmDef, !!hasReboot) < 0)
goto cleanup;
/* Tell the controller that setup inside the container is done */
if (lxcContainerSendContinue(argv->handshakefd) < 0) {
virReportSystemError(errno, "%s",
_("Failed to send continue signal to controller"));
goto cleanup;
}
VIR_DEBUG("Setting up security labeling");
if (virSecurityManagerSetProcessLabel(argv->securityDriver, vmDef) < 0)
goto cleanup;
VIR_DEBUG("Setting up inherited FDs");
/* The controller channels are no longer needed; close them before
 * the FDs for init are arranged */
VIR_FORCE_CLOSE(argv->handshakefd);
VIR_FORCE_CLOSE(argv->monitor);
if (lxcContainerSetupFDs(&ttyfd,
argv->npassFDs, argv->passFDs) < 0)
goto cleanup;
/* Make init process of the container the leader of the new session.
* That is needed when checkpointing container.
*/
if (setsid() < 0) {
virReportSystemError(errno, "%s",
_("Unable to become session leader"));
goto cleanup;
}
/* TODO is it safe to call it here or should this call be moved in
* front of the clone() as otherwise there might be a risk for a
* deadlock */
if ((ngroups = virGetGroupList(virCommandGetUID(cmd), virCommandGetGID(cmd),
&groups)) < 0)
goto cleanup;
ret = 0;
cleanup:
/* Reached on both success and failure */
VIR_FORCE_CLOSE(ttyfd);
VIR_FORCE_CLOSE(argv->monitor);
VIR_FORCE_CLOSE(argv->handshakefd);
if (ret == 0) {
VIR_DEBUG("Executing init binary");
/* this function will only return if an error occurred */
ret = virCommandExec(cmd, groups, ngroups);
}
/* Either setup failed or the exec above returned: report the last
 * libvirt error on stderr */
if (ret != 0) {
VIR_DEBUG("Tearing down container");
fprintf(stderr,
_("Failure in libvirt_lxc startup: %1$s\n"),
virGetLastErrorMessage());
}
return ret;
}
/* Non-zero when the domain defines both a UID map and a GID map */
static int userns_required(virDomainDef *def)
{
    return (def->idmap.uidmap != NULL) && (def->idmap.gidmap != NULL);
}
/*
 * Map a 64bit architecture to the 32bit architecture whose personality
 * it can also run. Returns VIR_ARCH_NONE when 'arch' has no such
 * alternative listed.
 */
virArch lxcContainerGetAlt32bitArch(virArch arch)
{
    /* Any Linux 64bit arch which has a 32bit
     * personality available should be listed here */
    static const struct {
        virArch wide;
        virArch narrow;
    } alt32[] = {
        { VIR_ARCH_X86_64,   VIR_ARCH_I686 },
        { VIR_ARCH_S390X,    VIR_ARCH_S390 },
        { VIR_ARCH_PPC64,    VIR_ARCH_PPC },
        { VIR_ARCH_PARISC64, VIR_ARCH_PARISC },
        { VIR_ARCH_SPARC64,  VIR_ARCH_SPARC },
        { VIR_ARCH_MIPS64,   VIR_ARCH_MIPS },
        { VIR_ARCH_MIPS64EL, VIR_ARCH_MIPSEL },
        { VIR_ARCH_AARCH64,  VIR_ARCH_ARMV7L },
    };
    size_t idx;

    for (idx = 0; idx < sizeof(alt32) / sizeof(alt32[0]); idx++) {
        if (alt32[idx].wide == arch)
            return alt32[idx].narrow;
    }

    return VIR_ARCH_NONE;
}
/**
 * lxcContainerStart:
 * @def: pointer to virtual machine structure
 * @securityDriver: security manager passed on to the container child
 * @nveths: number of interfaces
 * @veths: interface names
 * @npassFDs: number of entries in @passFDs
 * @passFDs: file descriptors passed on to the container child
 * @control: control FD to the container
 * @handshakefd: FD the child uses to send its continue message
 * @nsInheritFDs: namespace FDs to join rather than create, or NULL;
 *                an entry of -1 means a new namespace is created
 * @nttyPaths: number of entries in @ttyPaths
 * @ttyPaths: tty paths for the container, the first one being opened
 *            by the child
 *
 * Starts a container process by calling clone() with the namespace
 * flags. The child runs lxcContainerChild() on a dedicated stack of 16
 * pages, which is unmapped in this process again before returning.
 *
 * Returns PID of container on success or -1 in case of error
 */
int lxcContainerStart(virDomainDef *def,
                      virSecurityManager *securityDriver,
                      size_t nveths,
                      char **veths,
                      size_t npassFDs,
                      int *passFDs,
                      int control,
                      int handshakefd,
                      int *nsInheritFDs,
                      size_t nttyPaths,
                      char **ttyPaths)
{
    pid_t child;
    int cloneFlags = CLONE_NEWPID|CLONE_NEWNS|SIGCHLD;
    int stacksize = getpagesize() * 16;
    char *stack = NULL;
    int ret = -1;
    lxc_child_argv_t childArgs = {
        .config = def,
        .securityDriver = securityDriver,
        .nveths = nveths,
        .veths = veths,
        .npassFDs = npassFDs,
        .passFDs = passFDs,
        .monitor = control,
        .handshakefd = handshakefd,
        .nsInheritFDs = nsInheritFDs,
        .nttyPaths = nttyPaths,
        .ttyPaths = ttyPaths,
    };

    /* allocate a stack for the container */
    stack = mmap(NULL, stacksize, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN | MAP_STACK,
                 -1, 0);
    if (stack == MAP_FAILED) {
        virReportSystemError(errno, "%s",
                             _("Unable to allocate stack"));
        return -1;
    }

    if (userns_required(def)) {
        if (virProcessNamespaceAvailable(VIR_PROCESS_NAMESPACE_USER) < 0) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("Kernel doesn't support user namespace"));
            goto cleanup;
        }
        VIR_DEBUG("Enable user namespace");
        cloneFlags |= CLONE_NEWUSER;
    }

    /* Network: an inherited namespace cannot be combined with a
     * configuration that needs a private one */
    if (nsInheritFDs && nsInheritFDs[VIR_LXC_DOMAIN_NAMESPACE_SHARENET] != -1) {
        if (lxcNeedNetworkNamespace(def)) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("Config asks for inherit net namespace "
                             "as well as private network interfaces"));
            goto cleanup;
        }
        VIR_DEBUG("Inheriting a net namespace");
    } else if (lxcNeedNetworkNamespace(def)) {
        VIR_DEBUG("Enable network namespaces");
        cloneFlags |= CLONE_NEWNET;
    }

    /* IPC: fresh namespace unless one is inherited */
    if (nsInheritFDs && nsInheritFDs[VIR_LXC_DOMAIN_NAMESPACE_SHAREIPC] != -1) {
        VIR_DEBUG("Inheriting an IPC namespace");
    } else {
        cloneFlags |= CLONE_NEWIPC;
    }

    /* UTS: fresh namespace unless one is inherited */
    if (nsInheritFDs && nsInheritFDs[VIR_LXC_DOMAIN_NAMESPACE_SHAREUTS] != -1) {
        VIR_DEBUG("Inheriting a UTS namespace");
    } else {
        cloneFlags |= CLONE_NEWUTS;
    }

    VIR_DEBUG("Cloning container init process");
    /* clone() is handed the top of the mapping as the child stack */
    child = clone(lxcContainerChild, stack + stacksize, cloneFlags, &childArgs);
    VIR_DEBUG("clone() completed, new container PID is %d", child);

    if (child < 0) {
        virReportSystemError(errno, "%s",
                             _("Failed to run clone container"));
        goto cleanup;
    }

    ret = child;

 cleanup:
    if (munmap(stack, stacksize) < 0)
        VIR_WARN("Unable to munmap() stack: %s", g_strerror(errno));

    return ret;
}
/*
 * Change the owner of 'path' to the target UID/GID of the first entry
 * of the domain's UID and GID maps.
 *
 * Nothing is done unless both maps are present, which is the same
 * condition userns_required() uses to enable the user namespace.
 *
 * Returns 0 on success (or when there is nothing to do), -1 on failure.
 */
int lxcContainerChown(virDomainDef *def, const char *path)
{
    uid_t uid;
    gid_t gid;

    /* Both maps are dereferenced below, so both have to exist; checking
     * the UID map alone would crash on a definition lacking a GID map */
    if (!def->idmap.uidmap || !def->idmap.gidmap)
        return 0;

    uid = def->idmap.uidmap[0].target;
    gid = def->idmap.gidmap[0].target;

    if (chown(path, uid, gid) < 0) {
        virReportSystemError(errno,
                             _("Failed to change owner of %1$s to %2$u:%3$u"),
                             path, uid, gid);
        return -1;
    }

    return 0;
}