libvirt/src/lxc/lxc_controller.c
Daniel P. Berrange 5f5c6fde00 Allow use of file images for LXC container filesystems
A previous commit gave the LXC driver the ability to mount
block devices for the container filesystem. Through use of
the loopback device functionality, we can build on this to
support use of plain file images for LXC filesytems.

By setting the LO_FLAGS_AUTOCLEAR flag we can ensure that
the loop device automatically disappears when the container
dies / shuts down

* src/lxc/lxc_container.c: Raise error if we see a file
  based filesystem, since it should have been turned into
  a loopback device already
* src/lxc/lxc_controller.c: Rewrite any filesystems of
  type=file, into type=block, by binding the file image
  to a free loop device
2011-08-08 11:38:09 +01:00

1190 lines
34 KiB
C

/*
* Copyright (C) 2010-2011 Red Hat, Inc.
* Copyright IBM Corp. 2008
*
* lxc_controller.c: linux container process controller
*
* Authors:
* David L. Leskovec <dlesko at linux.vnet.ibm.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <config.h>
#include <sys/epoll.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/utsname.h>
#include <sys/personality.h>
#include <unistd.h>
#include <paths.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <getopt.h>
#include <sys/mount.h>
#include <locale.h>
#include <linux/loop.h>
#include <dirent.h>
#if HAVE_CAPNG
# include <cap-ng.h>
#endif
#include "virterror_internal.h"
#include "logging.h"
#include "util.h"
#include "lxc_conf.h"
#include "lxc_container.h"
#include "veth.h"
#include "memory.h"
#include "util.h"
#include "virfile.h"
#define VIR_FROM_THIS VIR_FROM_LXC
struct cgroup_device_policy {
char type;
int major;
int minor;
};
static int lxcGetLoopFD(char **devname)
{
int fd = -1;
DIR *dh = NULL;
struct dirent *de;
char *looppath;
struct loop_info64 lo;
VIR_DEBUG("Looking for loop devices in /dev");
if (!(dh = opendir("/dev"))) {
virReportSystemError(errno, "%s",
_("Unable to read /dev"));
goto cleanup;
}
while ((de = readdir(dh)) != NULL) {
if (!STRPREFIX(de->d_name, "loop"))
continue;
if (virAsprintf(&looppath, "/dev/%s", de->d_name) < 0) {
virReportOOMError();
goto cleanup;
}
VIR_DEBUG("Checking up on device %s", looppath);
if ((fd = open(looppath, O_RDWR)) < 0) {
virReportSystemError(errno,
_("Unable to open %s"), looppath);
goto cleanup;
}
if (ioctl(fd, LOOP_GET_STATUS64, &lo) < 0) {
/* Got a free device, return the fd */
if (errno == ENXIO)
goto cleanup;
VIR_FORCE_CLOSE(fd);
virReportSystemError(errno,
_("Unable to get loop status on %s"),
looppath);
goto cleanup;
}
/* Oh well, try the next device */
VIR_FORCE_CLOSE(fd);
VIR_FREE(looppath);
}
lxcError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Unable to find a free loop device in /dev"));
cleanup:
if (fd != -1) {
VIR_DEBUG("Got free loop device %s %d", looppath, fd);
*devname = looppath;
} else {
VIR_DEBUG("No free loop devices available");
VIR_FREE(looppath);
}
if (dh)
closedir(dh);
return fd;
}
static int lxcSetupLoopDevice(virDomainFSDefPtr fs)
{
int lofd = -1;
int fsfd = -1;
struct loop_info64 lo;
char *loname = NULL;
int ret = -1;
if ((lofd = lxcGetLoopFD(&loname)) < 0)
return -1;
memset(&lo, 0, sizeof(lo));
lo.lo_flags = LO_FLAGS_AUTOCLEAR;
if ((fsfd = open(fs->src, O_RDWR)) < 0) {
virReportSystemError(errno,
_("Unable to open %s"), fs->src);
goto cleanup;
}
if (ioctl(lofd, LOOP_SET_FD, fsfd) < 0) {
virReportSystemError(errno,
_("Unable to attach %s to loop device"),
fs->src);
goto cleanup;
}
if (ioctl(lofd, LOOP_SET_STATUS64, &lo) < 0) {
virReportSystemError(errno, "%s",
_("Unable to mark loop device as autoclear"));
if (ioctl(lofd, LOOP_CLR_FD, 0) < 0)
VIR_WARN("Unable to detach %s from loop device", fs->src);
goto cleanup;
}
VIR_DEBUG("Attached loop device %s %d to %s", fs->src, lofd, loname);
/*
* We now change it into a block device type, so that
* the rest of container setup 'just works'
*/
fs->type = VIR_DOMAIN_FS_TYPE_BLOCK;
VIR_FREE(fs->src);
fs->src = loname;
loname = NULL;
ret = 0;
cleanup:
VIR_FREE(loname);
VIR_FORCE_CLOSE(fsfd);
if (ret == -1)
VIR_FORCE_CLOSE(lofd);
return lofd;
}
static int lxcSetupLoopDevices(virDomainDefPtr def, size_t *nloopDevs, int **loopDevs)
{
size_t i;
int ret = -1;
for (i = 0 ; i < def->nfss ; i++) {
int fd;
if (def->fss[i]->type != VIR_DOMAIN_FS_TYPE_FILE)
continue;
fd = lxcSetupLoopDevice(def->fss[i]);
if (fd < 0)
goto cleanup;
VIR_DEBUG("Saving loop fd %d", fd);
if (VIR_REALLOC_N(*loopDevs, *nloopDevs+1) < 0) {
VIR_FORCE_CLOSE(fd);
virReportOOMError();
goto cleanup;
}
(*loopDevs)[*nloopDevs++] = fd;
}
VIR_DEBUG("Setup all loop devices");
ret = 0;
cleanup:
return ret;
}
/**
* lxcSetContainerResources
* @def: pointer to virtual machine structure
*
* Creates a cgroup for the container, moves the task inside,
* and sets resource limits
*
* Returns 0 on success or -1 in case of error
*/
static int lxcSetContainerResources(virDomainDefPtr def)
{
virCgroupPtr driver;
virCgroupPtr cgroup;
int rc = -1;
int i;
struct cgroup_device_policy devices[] = {
{'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL},
{'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO},
{'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL},
{'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM},
{'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM},
{'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_TTY},
{'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX},
{0, 0, 0}};
rc = virCgroupForDriver("lxc", &driver, 1, 0);
if (rc != 0) {
/* Skip all if no driver cgroup is configured */
if (rc == -ENXIO || rc == -ENOENT)
return 0;
virReportSystemError(-rc, "%s",
_("Unable to get cgroup for driver"));
return rc;
}
rc = virCgroupForDomain(driver, def->name, &cgroup, 1);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to create cgroup for domain %s"),
def->name);
goto cleanup;
}
if (def->blkio.weight) {
rc = virCgroupSetBlkioWeight(cgroup, def->blkio.weight);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to set Blkio weight for domain %s"),
def->name);
goto cleanup;
}
}
if (def->cputune.shares) {
rc = virCgroupSetCpuShares(cgroup, def->cputune.shares);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to set cpu shares for domain %s"),
def->name);
goto cleanup;
}
}
rc = virCgroupSetMemory(cgroup, def->mem.max_balloon);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to set memory limit for domain %s"),
def->name);
goto cleanup;
}
if (def->mem.hard_limit) {
rc = virCgroupSetMemoryHardLimit(cgroup, def->mem.hard_limit);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to set memory hard limit for domain %s"),
def->name);
goto cleanup;
}
}
if (def->mem.soft_limit) {
rc = virCgroupSetMemorySoftLimit(cgroup, def->mem.soft_limit);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to set memory soft limit for domain %s"),
def->name);
goto cleanup;
}
}
if (def->mem.swap_hard_limit) {
rc = virCgroupSetMemSwapHardLimit(cgroup, def->mem.swap_hard_limit);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to set swap hard limit for domain %s"),
def->name);
goto cleanup;
}
}
rc = virCgroupDenyAllDevices(cgroup);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to deny devices for domain %s"),
def->name);
goto cleanup;
}
for (i = 0; devices[i].type != 0; i++) {
struct cgroup_device_policy *dev = &devices[i];
rc = virCgroupAllowDevice(cgroup,
dev->type,
dev->major,
dev->minor,
VIR_CGROUP_DEVICE_RWM);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to allow device %c:%d:%d for domain %s"),
dev->type, dev->major, dev->minor, def->name);
goto cleanup;
}
}
for (i = 0 ; i < def->nfss ; i++) {
if (def->fss[i]->type != VIR_DOMAIN_FS_TYPE_BLOCK)
continue;
rc = virCgroupAllowDevicePath(cgroup,
def->fss[i]->src,
def->fss[i]->readonly ?
VIR_CGROUP_DEVICE_READ :
VIR_CGROUP_DEVICE_RW);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to allow device %s for domain %s"),
def->fss[i]->src, def->name);
goto cleanup;
}
}
rc = virCgroupAllowDeviceMajor(cgroup, 'c', LXC_DEV_MAJ_PTY,
VIR_CGROUP_DEVICE_RWM);
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to allow PTY devices for domain %s"),
def->name);
goto cleanup;
}
rc = virCgroupAddTask(cgroup, getpid());
if (rc != 0) {
virReportSystemError(-rc,
_("Unable to add task %d to cgroup for domain %s"),
getpid(), def->name);
}
cleanup:
virCgroupFree(&driver);
virCgroupFree(&cgroup);
return rc;
}
static char*lxcMonitorPath(virDomainDefPtr def)
{
char *sockpath;
if (virAsprintf(&sockpath, "%s/%s.sock",
LXC_STATE_DIR, def->name) < 0)
virReportOOMError();
return sockpath;
}
static int lxcMonitorServer(const char *sockpath)
{
int fd;
struct sockaddr_un addr;
if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) {
virReportSystemError(errno,
_("failed to create server socket '%s'"),
sockpath);
goto error;
}
unlink(sockpath);
memset(&addr, 0, sizeof(addr));
addr.sun_family = AF_UNIX;
if (virStrcpyStatic(addr.sun_path, sockpath) == NULL) {
lxcError(VIR_ERR_INTERNAL_ERROR,
_("Socket path %s too long for destination"), sockpath);
goto error;
}
if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
virReportSystemError(errno,
_("failed to bind server socket '%s'"),
sockpath);
goto error;
}
if (listen(fd, 30 /* backlog */ ) < 0) {
virReportSystemError(errno,
_("failed to listen server socket %s"),
sockpath);
goto error;
}
return fd;
error:
VIR_FORCE_CLOSE(fd);
return -1;
}
/**
* lxcFdForward:
* @readFd: file descriptor to read
* @writeFd: file desriptor to write
*
* Reads 1 byte of data from readFd and writes to writeFd.
*
* Returns 0 on success, EAGAIN if returned on read, or -1 in case of error
*/
static int lxcFdForward(int readFd, int writeFd)
{
int rc = -1;
char buf[2];
if (1 != (saferead(readFd, buf, 1))) {
if (EAGAIN == errno) {
rc = EAGAIN;
goto cleanup;
}
virReportSystemError(errno,
_("read of fd %d failed"),
readFd);
goto cleanup;
}
if (1 != (safewrite(writeFd, buf, 1))) {
virReportSystemError(errno,
_("write to fd %d failed"),
writeFd);
goto cleanup;
}
rc = 0;
cleanup:
return rc;
}
static int lxcControllerClearCapabilities(void)
{
#if HAVE_CAPNG
int ret;
capng_clear(CAPNG_SELECT_BOTH);
if ((ret = capng_apply(CAPNG_SELECT_BOTH)) < 0) {
lxcError(VIR_ERR_INTERNAL_ERROR,
_("failed to apply capabilities: %d"), ret);
return -1;
}
#else
VIR_WARN("libcap-ng support not compiled in, unable to clear capabilities");
#endif
return 0;
}
typedef struct _lxcTtyForwardFd_t {
int fd;
int active;
} lxcTtyForwardFd_t;
/* Return true if it is ok to ignore an accept-after-epoll syscall
that fails with the specified errno value. Else false. */
static bool
ignorable_epoll_accept_errno(int errnum)
{
return (errnum == EINVAL
|| errnum == ECONNABORTED
|| errnum == EAGAIN
|| errnum == EWOULDBLOCK);
}
static bool
lxcPidGone(pid_t container)
{
waitpid(container, NULL, WNOHANG);
if (kill(container, 0) < 0 &&
errno == ESRCH)
return true;
return false;
}
/**
* lxcControllerMain
* @monitor: server socket fd to accept client requests
* @client: initial client which is the libvirtd daemon
* @appPty: open fd for application facing Pty
* @contPty: open fd for container facing Pty
*
* Forwards traffic between fds. Data read from appPty will be written to contPty
* This process loops forever.
* This uses epoll in edge triggered mode to avoid a hard loop on POLLHUP
* events when the user disconnects the virsh console via ctrl-]
*
* Returns 0 on success or -1 in case of error
*/
static int lxcControllerMain(int monitor,
int client,
int appPty,
int contPty,
pid_t container)
{
int rc = -1;
int epollFd;
struct epoll_event epollEvent;
int numEvents;
int numActive = 0;
lxcTtyForwardFd_t fdArray[2];
int timeout = -1;
int curFdOff = 0;
int writeFdOff = 0;
fdArray[0].fd = appPty;
fdArray[0].active = 0;
fdArray[1].fd = contPty;
fdArray[1].active = 0;
VIR_DEBUG("monitor=%d client=%d appPty=%d contPty=%d",
monitor, client, appPty, contPty);
/* create the epoll fild descriptor */
epollFd = epoll_create(2);
if (0 > epollFd) {
virReportSystemError(errno, "%s",
_("epoll_create(2) failed"));
goto cleanup;
}
/* add the file descriptors the epoll fd */
memset(&epollEvent, 0x00, sizeof(epollEvent));
epollEvent.events = EPOLLIN|EPOLLET; /* edge triggered */
epollEvent.data.fd = appPty;
if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, appPty, &epollEvent)) {
virReportSystemError(errno, "%s",
_("epoll_ctl(appPty) failed"));
goto cleanup;
}
epollEvent.data.fd = contPty;
if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, contPty, &epollEvent)) {
virReportSystemError(errno, "%s",
_("epoll_ctl(contPty) failed"));
goto cleanup;
}
epollEvent.events = EPOLLIN;
epollEvent.data.fd = monitor;
if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, monitor, &epollEvent)) {
virReportSystemError(errno, "%s",
_("epoll_ctl(monitor) failed"));
goto cleanup;
}
epollEvent.events = EPOLLHUP;
epollEvent.data.fd = client;
if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
virReportSystemError(errno, "%s",
_("epoll_ctl(client) failed"));
goto cleanup;
}
while (1) {
/* if active fd's, return if no events, else wait forever */
timeout = (numActive > 0) ? 0 : -1;
numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout);
if (numEvents > 0) {
if (epollEvent.data.fd == monitor) {
int fd = accept(monitor, NULL, 0);
if (fd < 0) {
/* First reflex may be simply to declare accept failure
to be a fatal error. However, accept may fail when
a client quits between the above epoll_wait and here.
That case is not fatal, but rather to be expected,
if not common, so ignore it. */
if (ignorable_epoll_accept_errno(errno))
continue;
virReportSystemError(errno, "%s",
_("accept(monitor,...) failed"));
goto cleanup;
}
if (client != -1) { /* Already connected, so kick new one out */
VIR_FORCE_CLOSE(fd);
continue;
}
client = fd;
epollEvent.events = EPOLLHUP;
epollEvent.data.fd = client;
if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
virReportSystemError(errno, "%s",
_("epoll_ctl(client) failed"));
goto cleanup;
}
} else if (client != -1 && epollEvent.data.fd == client) {
if (0 > epoll_ctl(epollFd, EPOLL_CTL_DEL, client, &epollEvent)) {
virReportSystemError(errno, "%s",
_("epoll_ctl(client) failed"));
goto cleanup;
}
VIR_FORCE_CLOSE(client);
} else {
if (epollEvent.events & EPOLLIN) {
curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
if (!fdArray[curFdOff].active) {
fdArray[curFdOff].active = 1;
++numActive;
}
} else if (epollEvent.events & EPOLLHUP) {
if (lxcPidGone(container))
goto cleanup;
curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
if (fdArray[curFdOff].active) {
fdArray[curFdOff].active = 0;
--numActive;
}
continue;
} else {
lxcError(VIR_ERR_INTERNAL_ERROR,
_("error event %d"), epollEvent.events);
goto cleanup;
}
}
} else if (0 == numEvents) {
if (2 == numActive) {
/* both fds active, toggle between the two */
curFdOff ^= 1;
} else {
/* only one active, if current is active, use it, else it */
/* must be the other one (ie. curFd just went inactive) */
curFdOff = fdArray[curFdOff].active ? curFdOff : curFdOff ^ 1;
}
} else {
if (EINTR == errno) {
continue;
}
/* error */
virReportSystemError(errno, "%s",
_("epoll_wait() failed"));
goto cleanup;
}
if (0 < numActive) {
writeFdOff = curFdOff ^ 1;
rc = lxcFdForward(fdArray[curFdOff].fd, fdArray[writeFdOff].fd);
if (EAGAIN == rc) {
/* this fd no longer has data, set it as inactive */
--numActive;
fdArray[curFdOff].active = 0;
} else if (-1 == rc) {
if (lxcPidGone(container))
goto cleanup;
continue;
}
}
}
rc = 0;
cleanup:
VIR_FORCE_CLOSE(appPty);
VIR_FORCE_CLOSE(contPty);
VIR_FORCE_CLOSE(epollFd);
return rc;
}
/**
* lxcControllerMoveInterfaces
* @nveths: number of interfaces
* @veths: interface names
* @container: pid of container
*
* Moves network interfaces into a container's namespace
*
* Returns 0 on success or -1 in case of error
*/
static int lxcControllerMoveInterfaces(unsigned int nveths,
char **veths,
pid_t container)
{
unsigned int i;
for (i = 0 ; i < nveths ; i++)
if (moveInterfaceToNetNs(veths[i], container) < 0)
return -1;
return 0;
}
/**
* lxcCleanupInterfaces:
* @nveths: number of interfaces
* @veths: interface names
*
* Cleans up the container interfaces by deleting the veth device pairs.
*
* Returns 0 on success or -1 in case of error
*/
static int lxcControllerCleanupInterfaces(unsigned int nveths,
char **veths)
{
unsigned int i;
for (i = 0 ; i < nveths ; i++)
vethDelete(veths[i]);
return 0;
}
static int lxcSetPersonality(virDomainDefPtr def)
{
struct utsname utsname;
const char *altArch;
uname(&utsname);
altArch = lxcContainerGetAlt32bitArch(utsname.machine);
if (altArch &&
STREQ(def->os.arch, altArch)) {
if (personality(PER_LINUX32) < 0) {
virReportSystemError(errno, _("Unable to request personality for %s on %s"),
altArch, utsname.machine);
return -1;
}
}
return 0;
}
#ifndef MS_REC
# define MS_REC 16384
#endif
#ifndef MS_SLAVE
# define MS_SLAVE (1<<19)
#endif
static int
lxcControllerRun(virDomainDefPtr def,
unsigned int nveths,
char **veths,
int monitor,
int client,
int appPty,
int handshakefd)
{
int rc = -1;
int control[2] = { -1, -1};
int containerhandshake[2] = { -1, -1 };
int containerPty = -1;
char *containerPtyPath = NULL;
pid_t container = -1;
virDomainFSDefPtr root;
char *devpts = NULL;
char *devptmx = NULL;
size_t nloopDevs = 0;
int *loopDevs = NULL;
size_t i;
if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
virReportSystemError(errno, "%s",
_("sockpair failed"));
goto cleanup;
}
if (socketpair(PF_UNIX, SOCK_STREAM, 0, containerhandshake) < 0) {
virReportSystemError(errno, "%s",
_("socketpair failed"));
goto cleanup;
}
if (lxcSetupLoopDevices(def, &nloopDevs, &loopDevs) < 0)
goto cleanup;
root = virDomainGetRootFilesystem(def);
if (lxcSetContainerResources(def) < 0)
goto cleanup;
/*
* If doing a chroot style setup, we need to prepare
* a private /dev/pts for the child now, which they
* will later move into position.
*
* This is complex because 'virsh console' needs to
* use /dev/pts from the host OS, and the guest OS
* needs to use /dev/pts from the guest.
*
* This means that we (libvirt_lxc) need to see and
* use both /dev/pts instances. We're running in the
* host OS context though and don't want to expose
* the guest OS /dev/pts there.
*
* Thus we call unshare(CLONE_NS) so that we can see
* the guest's new /dev/pts, without it becoming
* visible to the host OS. We also put the root FS
* into slave mode, just in case it was currently
* marked as shared
*/
if (root) {
VIR_DEBUG("Setting up private /dev/pts");
if (!virFileExists(root->src)) {
virReportSystemError(errno,
_("root source %s does not exist"),
root->src);
goto cleanup;
}
if (unshare(CLONE_NEWNS) < 0) {
virReportSystemError(errno, "%s",
_("Cannot unshare mount namespace"));
goto cleanup;
}
if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
virReportSystemError(errno, "%s",
_("Failed to switch root mount into slave mode"));
goto cleanup;
}
if (virAsprintf(&devpts, "%s/dev/pts", root->src) < 0 ||
virAsprintf(&devptmx, "%s/dev/pts/ptmx", root->src) < 0) {
virReportOOMError();
goto cleanup;
}
if (virFileMakePath(devpts) < 0) {
virReportSystemError(errno,
_("Failed to make path %s"),
devpts);
goto cleanup;
}
VIR_DEBUG("Mouting 'devpts' on %s", devpts);
if (mount("devpts", devpts, "devpts", 0,
"newinstance,ptmxmode=0666,mode=0620,gid=5") < 0) {
virReportSystemError(errno,
_("Failed to mount devpts on %s"),
devpts);
goto cleanup;
}
if (access(devptmx, R_OK) < 0) {
VIR_WARN("Kernel does not support private devpts, using shared devpts");
VIR_FREE(devptmx);
}
}
if (devptmx) {
VIR_DEBUG("Opening tty on private %s", devptmx);
if (virFileOpenTtyAt(devptmx,
&containerPty,
&containerPtyPath,
0) < 0) {
virReportSystemError(errno, "%s",
_("Failed to allocate tty"));
goto cleanup;
}
} else {
VIR_DEBUG("Opening tty on shared /dev/ptmx");
if (virFileOpenTty(&containerPty,
&containerPtyPath,
0) < 0) {
virReportSystemError(errno, "%s",
_("Failed to allocate tty"));
goto cleanup;
}
}
if (lxcSetPersonality(def) < 0)
goto cleanup;
if ((container = lxcContainerStart(def,
nveths,
veths,
control[1],
containerhandshake[1],
containerPtyPath)) < 0)
goto cleanup;
VIR_FORCE_CLOSE(control[1]);
VIR_FORCE_CLOSE(containerhandshake[1]);
if (lxcControllerMoveInterfaces(nveths, veths, container) < 0)
goto cleanup;
if (lxcContainerSendContinue(control[0]) < 0) {
virReportSystemError(errno, "%s",
_("Unable to send container continue message"));
goto cleanup;
}
if (lxcContainerWaitForContinue(containerhandshake[0]) < 0) {
virReportSystemError(errno, "%s",
_("error receiving signal from container"));
goto cleanup;
}
/* Now the container is fully setup... */
/* ...we can close the loop devices... */
for (i = 0 ; i < nloopDevs ; i++)
VIR_FORCE_CLOSE(loopDevs[i]);
/* ...and reduce our privileges */
if (lxcControllerClearCapabilities() < 0)
goto cleanup;
if (lxcContainerSendContinue(handshakefd) < 0) {
virReportSystemError(errno, "%s",
_("error sending continue signal to parent"));
goto cleanup;
}
VIR_FORCE_CLOSE(handshakefd);
rc = lxcControllerMain(monitor, client, appPty, containerPty, container);
cleanup:
VIR_FREE(devptmx);
VIR_FREE(devpts);
VIR_FORCE_CLOSE(control[0]);
VIR_FORCE_CLOSE(control[1]);
VIR_FREE(containerPtyPath);
VIR_FORCE_CLOSE(containerPty);
VIR_FORCE_CLOSE(handshakefd);
VIR_FORCE_CLOSE(containerhandshake[0]);
VIR_FORCE_CLOSE(containerhandshake[1]);
for (i = 0 ; i < nloopDevs ; i++)
VIR_FORCE_CLOSE(loopDevs[i]);
VIR_FREE(loopDevs);
if (container > 1) {
int status;
kill(container, SIGTERM);
if (!(waitpid(container, &status, WNOHANG) == 0 &&
WIFEXITED(status)))
kill(container, SIGKILL);
waitpid(container, NULL, 0);
}
return rc;
}
int main(int argc, char *argv[])
{
pid_t pid;
int rc = 1;
int client;
char *name = NULL;
int nveths = 0;
char **veths = NULL;
int monitor = -1;
int appPty = -1;
int handshakefd = -1;
int bg = 0;
virCapsPtr caps = NULL;
virDomainDefPtr def = NULL;
char *configFile = NULL;
char *sockpath = NULL;
const struct option options[] = {
{ "background", 0, NULL, 'b' },
{ "name", 1, NULL, 'n' },
{ "veth", 1, NULL, 'v' },
{ "console", 1, NULL, 'c' },
{ "handshakefd", 1, NULL, 's' },
{ "help", 0, NULL, 'h' },
{ 0, 0, 0, 0 },
};
if (setlocale(LC_ALL, "") == NULL ||
bindtextdomain(PACKAGE, LOCALEDIR) == NULL ||
textdomain(PACKAGE) == NULL) {
fprintf(stderr, _("%s: initialization failed\n"), argv[0]);
exit(EXIT_FAILURE);
}
while (1) {
int c;
c = getopt_long(argc, argv, "dn:v:m:c:s:h",
options, NULL);
if (c == -1)
break;
switch (c) {
case 'b':
bg = 1;
break;
case 'n':
if ((name = strdup(optarg)) == NULL) {
virReportOOMError();
goto cleanup;
}
break;
case 'v':
if (VIR_REALLOC_N(veths, nveths+1) < 0) {
virReportOOMError();
goto cleanup;
}
if ((veths[nveths++] = strdup(optarg)) == NULL) {
virReportOOMError();
goto cleanup;
}
break;
case 'c':
if (virStrToLong_i(optarg, NULL, 10, &appPty) < 0) {
fprintf(stderr, "malformed --console argument '%s'", optarg);
goto cleanup;
}
break;
case 's':
if (virStrToLong_i(optarg, NULL, 10, &handshakefd) < 0) {
fprintf(stderr, "malformed --handshakefd argument '%s'",
optarg);
goto cleanup;
}
break;
case 'h':
case '?':
fprintf(stderr, "\n");
fprintf(stderr, "syntax: %s [OPTIONS]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "Options\n");
fprintf(stderr, "\n");
fprintf(stderr, " -b, --background\n");
fprintf(stderr, " -n NAME, --name NAME\n");
fprintf(stderr, " -c FD, --console FD\n");
fprintf(stderr, " -v VETH, --veth VETH\n");
fprintf(stderr, " -s FD, --handshakefd FD\n");
fprintf(stderr, " -h, --help\n");
fprintf(stderr, "\n");
goto cleanup;
}
}
if (name == NULL) {
fprintf(stderr, "%s: missing --name argument for configuration\n", argv[0]);
goto cleanup;
}
if (appPty < 0) {
fprintf(stderr, "%s: missing --console argument for container PTY\n", argv[0]);
goto cleanup;
}
if (handshakefd < 0) {
fprintf(stderr, "%s: missing --handshake argument for container PTY\n",
argv[0]);
goto cleanup;
}
if (getuid() != 0) {
fprintf(stderr, "%s: must be run as the 'root' user\n", argv[0]);
goto cleanup;
}
if ((caps = lxcCapsInit()) == NULL)
goto cleanup;
if ((configFile = virDomainConfigFile(LXC_STATE_DIR,
name)) == NULL)
goto cleanup;
if ((def = virDomainDefParseFile(caps, configFile,
1 << VIR_DOMAIN_VIRT_LXC,
VIR_DOMAIN_XML_INACTIVE)) == NULL)
goto cleanup;
if (def->nnets != nveths) {
fprintf(stderr, "%s: expecting %d veths, but got %d\n",
argv[0], def->nnets, nveths);
goto cleanup;
}
if ((sockpath = lxcMonitorPath(def)) == NULL)
goto cleanup;
if ((monitor = lxcMonitorServer(sockpath)) < 0)
goto cleanup;
if (bg) {
if ((pid = fork()) < 0)
goto cleanup;
if (pid > 0) {
if ((rc = virFileWritePid(LXC_STATE_DIR, name, pid)) < 0) {
virReportSystemError(-rc,
_("Unable to write pid file '%s/%s.pid'"),
LXC_STATE_DIR, name);
_exit(1);
}
/* First child now exits, allowing original caller
* (ie libvirtd's LXC driver to complete their
* waitpid & continue */
_exit(0);
}
/* Don't hold onto any cwd we inherit from libvirtd either */
if (chdir("/") < 0) {
virReportSystemError(errno, "%s",
_("Unable to change to root dir"));
goto cleanup;
}
if (setsid() < 0) {
virReportSystemError(errno, "%s",
_("Unable to become session leader"));
goto cleanup;
}
}
/* Initialize logging */
virLogSetFromEnv();
/* Accept initial client which is the libvirtd daemon */
if ((client = accept(monitor, NULL, 0)) < 0) {
virReportSystemError(errno, "%s",
_("Failed to accept a connection from driver"));
goto cleanup;
}
rc = lxcControllerRun(def, nveths, veths, monitor, client, appPty,
handshakefd);
cleanup:
if (def)
virFileDeletePid(LXC_STATE_DIR, def->name);
lxcControllerCleanupInterfaces(nveths, veths);
if (sockpath)
unlink(sockpath);
VIR_FREE(sockpath);
return rc ? EXIT_FAILURE : EXIT_SUCCESS;
}