libvirt/src/qemu/qemu_namespace.c

1632 lines
41 KiB
C
Raw Normal View History

/*
* qemu_namespace.c: QEMU domain namespace helpers
*
* Copyright (C) 2006-2020 Red Hat, Inc.
* Copyright (C) 2006 Daniel P. Berrange
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see
* <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#ifdef __linux__
# include <sys/sysmacros.h>
#endif
#if defined(WITH_SYS_MOUNT_H)
# include <sys/mount.h>
#endif
#ifdef WITH_SELINUX
# include <selinux/selinux.h>
#endif
#include "qemu_namespace.h"
#include "qemu_domain.h"
#include "qemu_cgroup.h"
#include "qemu_security.h"
#include "qemu_hostdev.h"
#include "viralloc.h"
#include "virlog.h"
#include "virstring.h"
#include "virdevmapper.h"
#include "virglibutil.h"
#define VIR_FROM_THIS VIR_FROM_QEMU
VIR_LOG_INIT("qemu.qemu_domain");
VIR_ENUM_IMPL(qemuDomainNamespace,
QEMU_DOMAIN_NS_LAST,
"mount",
);
/**
* qemuDomainGetPreservedMountPath:
* @cfg: driver configuration data
* @vm: domain object
* @mountpoint: mount point path to convert
*
* For given @mountpoint return new path where the mount point
* should be moved temporarily whilst building the namespace.
*
* Returns: allocated string on success which the caller must free,
* NULL on failure.
*/
static char *
qemuDomainGetPreservedMountPath(virQEMUDriverConfig *cfg,
virDomainObj *vm,
const char *mountpoint)
{
char *path = NULL;
char *tmp;
const char *suffix = mountpoint + strlen(QEMU_DEVPREFIX);
g_autofree char *domname = virDomainDefGetShortName(vm->def);
size_t off;
if (!domname)
return NULL;
if (STREQ(mountpoint, "/dev"))
suffix = "dev";
path = g_strdup_printf("%s/%s.%s", cfg->stateDir, domname, suffix);
/* Now consider that @mountpoint is "/dev/blah/blah2".
* @suffix then points to "blah/blah2". However, caller
* expects all the @paths to be the same depth. The
* caller doesn't always do `mkdir -p` but sometimes bare
* `touch`. Therefore fix all the suffixes. */
off = strlen(path) - strlen(suffix);
tmp = path + off;
while (*tmp) {
if (*tmp == '/')
*tmp = '.';
tmp++;
}
return path;
}
/**
* qemuDomainGetPreservedMounts:
*
* Process list of mounted filesystems and:
* a) save all FSs mounted under /dev to @devPath
* b) generate backup path for all the entries in a)
*
* Any of the return pointers can be NULL. Both arrays are NULL-terminated.
*
* Returns 0 on success, -1 otherwise (with error reported)
*/
static int
qemuDomainGetPreservedMounts(virQEMUDriverConfig *cfg,
virDomainObj *vm,
char ***devPath,
char ***devSavePath,
size_t *ndevPath)
{
g_auto(GStrv) mounts = NULL;
size_t nmounts = 0;
g_auto(GStrv) paths = NULL;
g_auto(GStrv) savePaths = NULL;
size_t i;
if (ndevPath)
*ndevPath = 0;
if (virFileGetMountSubtree(QEMU_PROC_MOUNTS, "/dev", &mounts, &nmounts) < 0)
return -1;
if (nmounts == 0)
return 0;
/* There can be nested mount points. For instance
* /dev/shm/blah can be a mount point and /dev/shm too. It
* doesn't make much sense to return the former path because
* caller preserves the latter (and with that the former
* too). Therefore prune nested mount points.
* NB mounts[0] is "/dev". Should we start the outer loop
* from the beginning of the array all we'd be left with is
* just the first element. Think about it.
*/
for (i = 1; i < nmounts; i++) {
size_t j = i + 1;
while (j < nmounts) {
char *c = STRSKIP(mounts[j], mounts[i]);
if (c && (*c == '/' || *c == '\0')) {
VIR_DEBUG("Dropping path %s because of %s", mounts[j], mounts[i]);
VIR_DELETE_ELEMENT(mounts, j, nmounts);
} else {
j++;
}
}
}
/* mounts may not be NULL-terminated at this point, but we convert it into
* 'paths' which is NULL-terminated */
paths = g_new0(char *, nmounts + 1);
for (i = 0; i < nmounts; i++)
paths[i] = g_steal_pointer(&mounts[i]);
if (devSavePath) {
savePaths = g_new0(char *, nmounts + 1);
for (i = 0; i < nmounts; i++) {
if (!(savePaths[i] = qemuDomainGetPreservedMountPath(cfg, vm, paths[i])))
return -1;
}
}
if (devPath)
*devPath = g_steal_pointer(&paths);
if (devSavePath)
*devSavePath = g_steal_pointer(&savePaths);
if (ndevPath)
*ndevPath = nmounts;
return 0;
}
static int
qemuDomainPopulateDevices(virQEMUDriverConfig *cfg,
GSList **paths)
{
const char *const *devices = (const char *const *) cfg->cgroupDeviceACL;
size_t i;
if (!devices)
devices = defaultDeviceACL;
for (i = 0; devices[i]; i++) {
*paths = g_slist_prepend(*paths, g_strdup(devices[i]));
}
return 0;
}
static int
qemuDomainSetupDev(virSecurityManager *mgr,
virDomainObj *vm,
const char *path)
{
g_autofree char *mount_options = NULL;
g_autofree char *opts = NULL;
VIR_DEBUG("Setting up /dev/ for domain %s", vm->def->name);
mount_options = qemuSecurityGetMountOptions(mgr, vm->def);
if (!mount_options)
mount_options = g_strdup("");
/*
* tmpfs is limited to 64kb, since we only have device nodes in there
* and don't want to DOS the entire OS RAM usage
*/
opts = g_strdup_printf("mode=755,size=65536%s", mount_options);
if (virFileSetupDev(path, opts) < 0)
return -1;
return 0;
}
static int
qemuDomainSetupDisk(virStorageSource *src,
GSList **paths)
{
virStorageSource *next;
bool hasNVMe = false;
for (next = src; virStorageSourceIsBacking(next); next = next->backingStore) {
g_autofree char *tmpPath = NULL;
if (next->type == VIR_STORAGE_TYPE_NVME) {
hasNVMe = true;
if (!(tmpPath = virPCIDeviceAddressGetIOMMUGroupDev(&next->nvme->pciAddr)))
return -1;
} else {
g_auto(GStrv) targetPaths = NULL;
GStrv tmp;
if (virStorageSourceIsEmpty(next) ||
!virStorageSourceIsLocalStorage(next)) {
/* Not creating device. Just continue. */
continue;
}
tmpPath = g_strdup(next->path);
if (virDevMapperGetTargets(next->path, &targetPaths) < 0 &&
errno != ENOSYS) {
virReportSystemError(errno,
_("Unable to get devmapper targets for %s"),
next->path);
return -1;
}
if (targetPaths) {
for (tmp = targetPaths; *tmp; tmp++)
*paths = g_slist_prepend(*paths, g_steal_pointer(tmp));
}
}
*paths = g_slist_prepend(*paths, g_steal_pointer(&tmpPath));
}
/* qemu-pr-helper might require access to /dev/mapper/control. */
if (src->pr)
*paths = g_slist_prepend(*paths, g_strdup(QEMU_DEVICE_MAPPER_CONTROL_PATH));
if (hasNVMe)
*paths = g_slist_prepend(*paths, g_strdup(QEMU_DEV_VFIO));
return 0;
}
static int
qemuDomainSetupAllDisks(virDomainObj *vm,
GSList **paths)
{
size_t i;
VIR_DEBUG("Setting up disks");
for (i = 0; i < vm->def->ndisks; i++) {
if (qemuDomainSetupDisk(vm->def->disks[i]->src,
paths) < 0)
return -1;
}
VIR_DEBUG("Setup all disks");
return 0;
}
static int
qemuDomainSetupHostdev(virDomainObj *vm,
virDomainHostdevDef *hostdev,
bool hotplug,
GSList **paths)
{
g_autofree char *path = NULL;
if (qemuDomainGetHostdevPath(hostdev, &path, NULL) < 0)
return -1;
if (path)
*paths = g_slist_prepend(*paths, g_steal_pointer(&path));
if (qemuHostdevNeedsVFIO(hostdev) &&
(!hotplug || !qemuDomainNeedsVFIO(vm->def)))
*paths = g_slist_prepend(*paths, g_strdup(QEMU_DEV_VFIO));
return 0;
}
static int
qemuDomainSetupAllHostdevs(virDomainObj *vm,
GSList **paths)
{
size_t i;
VIR_DEBUG("Setting up hostdevs");
for (i = 0; i < vm->def->nhostdevs; i++) {
if (qemuDomainSetupHostdev(vm,
vm->def->hostdevs[i],
false,
paths) < 0)
return -1;
}
VIR_DEBUG("Setup all hostdevs");
return 0;
}
static int
qemuDomainSetupMemory(virDomainMemoryDef *mem,
GSList **paths)
{
if (mem->model != VIR_DOMAIN_MEMORY_MODEL_NVDIMM &&
mem->model != VIR_DOMAIN_MEMORY_MODEL_VIRTIO_PMEM)
return 0;
*paths = g_slist_prepend(*paths, g_strdup(mem->nvdimmPath));
return 0;
}
static int
qemuDomainSetupAllMemories(virDomainObj *vm,
GSList **paths)
{
size_t i;
VIR_DEBUG("Setting up memories");
for (i = 0; i < vm->def->nmems; i++) {
if (qemuDomainSetupMemory(vm->def->mems[i],
paths) < 0)
return -1;
}
VIR_DEBUG("Setup all memories");
return 0;
}
static int
qemuDomainSetupChardev(virDomainDef *def G_GNUC_UNUSED,
virDomainChrDef *dev,
void *opaque)
{
GSList **paths = opaque;
const char *path = NULL;
if (!(path = virDomainChrSourceDefGetPath(dev->source)))
return 0;
/* Socket created by qemu. It doesn't exist upfront. */
if (dev->source->type == VIR_DOMAIN_CHR_TYPE_UNIX &&
dev->source->data.nix.listen)
return 0;
*paths = g_slist_prepend(*paths, g_strdup(path));
return 0;
}
static int
qemuDomainSetupAllChardevs(virDomainObj *vm,
GSList **paths)
{
VIR_DEBUG("Setting up chardevs");
if (virDomainChrDefForeach(vm->def,
true,
qemuDomainSetupChardev,
paths) < 0)
return -1;
VIR_DEBUG("Setup all chardevs");
return 0;
}
static int
qemuDomainSetupTPM(virDomainTPMDef *dev,
GSList **paths)
{
switch (dev->type) {
case VIR_DOMAIN_TPM_TYPE_PASSTHROUGH:
*paths = g_slist_prepend(*paths, g_strdup(dev->data.passthrough.source.data.file.path));
break;
case VIR_DOMAIN_TPM_TYPE_EMULATOR:
case VIR_DOMAIN_TPM_TYPE_LAST:
/* nada */
break;
}
return 0;
}
static int
qemuDomainSetupAllTPMs(virDomainObj *vm,
GSList **paths)
{
size_t i;
VIR_DEBUG("Setting up TPMs");
for (i = 0; i < vm->def->ntpms; i++) {
if (qemuDomainSetupTPM(vm->def->tpms[i], paths) < 0)
return -1;
}
VIR_DEBUG("Setup all TPMs");
return 0;
}
static int
qemuDomainSetupGraphics(virDomainGraphicsDef *gfx,
GSList **paths)
{
const char *rendernode = virDomainGraphicsGetRenderNode(gfx);
if (!rendernode)
return 0;
*paths = g_slist_prepend(*paths, g_strdup(rendernode));
return 0;
}
static int
qemuDomainSetupAllGraphics(virDomainObj *vm,
GSList **paths)
{
size_t i;
VIR_DEBUG("Setting up graphics");
for (i = 0; i < vm->def->ngraphics; i++) {
if (qemuDomainSetupGraphics(vm->def->graphics[i],
paths) < 0)
return -1;
}
VIR_DEBUG("Setup all graphics");
return 0;
}
static int
qemuDomainSetupInput(virDomainInputDef *input,
GSList **paths)
{
const char *path = virDomainInputDefGetPath(input);
if (!path)
return 0;
*paths = g_slist_prepend(*paths, g_strdup(path));
return 0;
}
static int
qemuDomainSetupAllInputs(virDomainObj *vm,
GSList **paths)
{
size_t i;
VIR_DEBUG("Setting up inputs");
for (i = 0; i < vm->def->ninputs; i++) {
if (qemuDomainSetupInput(vm->def->inputs[i],
paths) < 0)
return -1;
}
VIR_DEBUG("Setup all inputs");
return 0;
}
static int
qemuDomainSetupRNG(virDomainRNGDef *rng,
GSList **paths)
{
switch ((virDomainRNGBackend) rng->backend) {
case VIR_DOMAIN_RNG_BACKEND_RANDOM:
*paths = g_slist_prepend(*paths, g_strdup(rng->source.file));
break;
case VIR_DOMAIN_RNG_BACKEND_EGD:
case VIR_DOMAIN_RNG_BACKEND_BUILTIN:
case VIR_DOMAIN_RNG_BACKEND_LAST:
/* nada */
break;
}
return 0;
}
static int
qemuDomainSetupAllRNGs(virDomainObj *vm,
GSList **paths)
{
size_t i;
VIR_DEBUG("Setting up RNGs");
for (i = 0; i < vm->def->nrngs; i++) {
if (qemuDomainSetupRNG(vm->def->rngs[i],
paths) < 0)
return -1;
}
VIR_DEBUG("Setup all RNGs");
return 0;
}
static int
qemuDomainSetupLoader(virDomainObj *vm,
GSList **paths)
{
virDomainLoaderDef *loader = vm->def->os.loader;
VIR_DEBUG("Setting up loader");
if (loader) {
switch ((virDomainLoader) loader->type) {
case VIR_DOMAIN_LOADER_TYPE_ROM:
*paths = g_slist_prepend(*paths, g_strdup(loader->path));
break;
case VIR_DOMAIN_LOADER_TYPE_PFLASH:
*paths = g_slist_prepend(*paths, g_strdup(loader->path));
if (loader->nvram)
*paths = g_slist_prepend(*paths, g_strdup(loader->nvram));
break;
case VIR_DOMAIN_LOADER_TYPE_NONE:
case VIR_DOMAIN_LOADER_TYPE_LAST:
break;
}
}
VIR_DEBUG("Setup loader");
return 0;
}
static int
qemuDomainSetupLaunchSecurity(virDomainObj *vm,
GSList **paths)
{
virDomainSEVDef *sev = vm->def->sev;
if (!sev || sev->sectype != VIR_DOMAIN_LAUNCH_SECURITY_SEV)
return 0;
VIR_DEBUG("Setting up launch security");
*paths = g_slist_prepend(*paths, g_strdup(QEMU_DEV_SEV));
VIR_DEBUG("Set up launch security");
return 0;
}
static int
qemuNamespaceMknodPaths(virDomainObj *vm,
GSList *paths);
int
qemuDomainBuildNamespace(virQEMUDriverConfig *cfg,
virDomainObj *vm)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT)) {
VIR_DEBUG("namespaces disabled for domain %s", vm->def->name);
return 0;
}
if (qemuDomainPopulateDevices(cfg, &paths) < 0)
return -1;
if (qemuDomainSetupAllDisks(vm, &paths) < 0)
return -1;
if (qemuDomainSetupAllHostdevs(vm, &paths) < 0)
return -1;
if (qemuDomainSetupAllMemories(vm, &paths) < 0)
return -1;
if (qemuDomainSetupAllChardevs(vm, &paths) < 0)
return -1;
if (qemuDomainSetupAllTPMs(vm, &paths) < 0)
return -1;
if (qemuDomainSetupAllGraphics(vm, &paths) < 0)
return -1;
if (qemuDomainSetupAllInputs(vm, &paths) < 0)
return -1;
if (qemuDomainSetupAllRNGs(vm, &paths) < 0)
return -1;
if (qemuDomainSetupLoader(vm, &paths) < 0)
return -1;
if (qemuDomainSetupLaunchSecurity(vm, &paths) < 0)
return -1;
if (qemuNamespaceMknodPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainUnshareNamespace(virQEMUDriverConfig *cfg,
virSecurityManager *mgr,
virDomainObj *vm)
{
const char *devPath = NULL;
char **devMountsPath = NULL, **devMountsSavePath = NULL;
size_t ndevMountsPath = 0, i;
int ret = -1;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT)) {
ret = 0;
goto cleanup;
}
if (qemuDomainGetPreservedMounts(cfg, vm,
&devMountsPath, &devMountsSavePath,
&ndevMountsPath) < 0)
goto cleanup;
for (i = 0; i < ndevMountsPath; i++) {
if (STREQ(devMountsPath[i], "/dev")) {
devPath = devMountsSavePath[i];
break;
}
}
if (!devPath) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Unable to find any /dev mount"));
goto cleanup;
}
if (virProcessSetupPrivateMountNS() < 0)
goto cleanup;
if (qemuDomainSetupDev(mgr, vm, devPath) < 0)
goto cleanup;
/* Save some mount points because we want to share them with the host */
for (i = 0; i < ndevMountsPath; i++) {
struct stat sb;
if (devMountsSavePath[i] == devPath)
continue;
if (stat(devMountsPath[i], &sb) < 0) {
virReportSystemError(errno,
_("Unable to stat: %s"),
devMountsPath[i]);
goto cleanup;
}
/* At this point, devMountsPath is either:
* a file (regular or special), or
* a directory. */
if ((S_ISDIR(sb.st_mode) && g_mkdir_with_parents(devMountsSavePath[i], 0777) < 0) ||
(!S_ISDIR(sb.st_mode) && virFileTouch(devMountsSavePath[i], sb.st_mode) < 0)) {
virReportSystemError(errno,
_("Failed to create %s"),
devMountsSavePath[i]);
goto cleanup;
}
if (virFileMoveMount(devMountsPath[i], devMountsSavePath[i]) < 0)
goto cleanup;
}
if (virFileMoveMount(devPath, "/dev") < 0)
goto cleanup;
for (i = 0; i < ndevMountsPath; i++) {
struct stat sb;
if (devMountsSavePath[i] == devPath)
continue;
if (stat(devMountsSavePath[i], &sb) < 0) {
virReportSystemError(errno,
_("Unable to stat: %s"),
devMountsSavePath[i]);
goto cleanup;
}
if (S_ISDIR(sb.st_mode)) {
if (g_mkdir_with_parents(devMountsPath[i], 0777) < 0) {
virReportSystemError(errno, _("Cannot create %s"),
devMountsPath[i]);
goto cleanup;
}
} else {
if (virFileMakeParentPath(devMountsPath[i]) < 0 ||
virFileTouch(devMountsPath[i], sb.st_mode) < 0) {
virReportSystemError(errno, _("Cannot create %s"),
devMountsPath[i]);
goto cleanup;
}
}
if (virFileMoveMount(devMountsSavePath[i], devMountsPath[i]) < 0)
goto cleanup;
}
ret = 0;
cleanup:
for (i = 0; i < ndevMountsPath; i++) {
#if defined(__linux__)
umount(devMountsSavePath[i]);
#endif /* defined(__linux__) */
/* The path can be either a regular file or a dir. */
if (virFileIsDir(devMountsSavePath[i]))
virFileDeleteTree(devMountsSavePath[i]);
else
unlink(devMountsSavePath[i]);
}
virStringListFreeCount(devMountsPath, ndevMountsPath);
virStringListFreeCount(devMountsSavePath, ndevMountsPath);
return ret;
}
bool
qemuDomainNamespaceEnabled(virDomainObj *vm,
qemuDomainNamespace ns)
{
qemuDomainObjPrivate *priv = vm->privateData;
return priv->namespaces &&
virBitmapIsBitSet(priv->namespaces, ns);
}
int
qemuDomainEnableNamespace(virDomainObj *vm,
qemuDomainNamespace ns)
{
qemuDomainObjPrivate *priv = vm->privateData;
if (!priv->namespaces)
priv->namespaces = virBitmapNew(QEMU_DOMAIN_NS_LAST);
if (virBitmapSetBit(priv->namespaces, ns) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Unable to enable namespace: %s"),
qemuDomainNamespaceTypeToString(ns));
return -1;
}
return 0;
}
static void
qemuDomainDisableNamespace(virDomainObj *vm,
qemuDomainNamespace ns)
{
qemuDomainObjPrivate *priv = vm->privateData;
if (priv->namespaces) {
ignore_value(virBitmapClearBit(priv->namespaces, ns));
if (virBitmapIsAllClear(priv->namespaces)) {
virBitmapFree(priv->namespaces);
priv->namespaces = NULL;
}
}
}
void
qemuDomainDestroyNamespace(virQEMUDriver *driver G_GNUC_UNUSED,
virDomainObj *vm)
{
if (qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
qemuDomainDisableNamespace(vm, QEMU_DOMAIN_NS_MOUNT);
}
bool
qemuDomainNamespaceAvailable(qemuDomainNamespace ns G_GNUC_UNUSED)
{
#if !defined(__linux__)
/* Namespaces are Linux specific. */
return false;
#else /* defined(__linux__) */
switch (ns) {
case QEMU_DOMAIN_NS_MOUNT:
# if !defined(WITH_LIBACL) || !defined(WITH_SELINUX)
/* We can't create the exact copy of paths if either of
* these is not available. */
return false;
# else
if (virProcessNamespaceAvailable(VIR_PROCESS_NAMESPACE_MNT) < 0)
return false;
# endif
break;
case QEMU_DOMAIN_NS_LAST:
break;
}
return true;
#endif /* defined(__linux__) */
}
typedef struct _qemuNamespaceMknodItem qemuNamespaceMknodItem;
struct _qemuNamespaceMknodItem {
char *file;
char *target;
bool bindmounted;
GStatBuf sb;
void *acl;
char *tcon;
};
typedef struct _qemuNamespaceMknodData qemuNamespaceMknodData;
struct _qemuNamespaceMknodData {
virQEMUDriver *driver;
virDomainObj *vm;
qemuNamespaceMknodItem *items;
size_t nitems;
};
static void
qemuNamespaceMknodItemClear(qemuNamespaceMknodItem *item)
{
VIR_FREE(item->file);
VIR_FREE(item->target);
virFileFreeACLs(&item->acl);
#ifdef WITH_SELINUX
freecon(item->tcon);
#endif
}
G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(qemuNamespaceMknodItem, qemuNamespaceMknodItemClear);
static void
qemuNamespaceMknodDataClear(qemuNamespaceMknodData *data)
{
size_t i;
for (i = 0; i < data->nitems; i++) {
qemuNamespaceMknodItem *item = &data->items[i];
qemuNamespaceMknodItemClear(item);
}
VIR_FREE(data->items);
}
/* Our way of creating devices is highly linux specific */
#if defined(__linux__)
static int
qemuNamespaceMknodOne(qemuNamespaceMknodItem *data)
{
int ret = -1;
bool delDevice = false;
bool isLink = S_ISLNK(data->sb.st_mode);
bool isDev = S_ISCHR(data->sb.st_mode) || S_ISBLK(data->sb.st_mode);
bool isReg = S_ISREG(data->sb.st_mode) || S_ISFIFO(data->sb.st_mode) || S_ISSOCK(data->sb.st_mode);
bool isDir = S_ISDIR(data->sb.st_mode);
if (virFileMakeParentPath(data->file) < 0) {
virReportSystemError(errno,
_("Unable to create %s"), data->file);
goto cleanup;
}
if (isLink) {
VIR_DEBUG("Creating symlink %s -> %s", data->file, data->target);
/* First, unlink the symlink target. Symlinks change and
* therefore we have no guarantees that pre-existing
* symlink is still valid. */
if (unlink(data->file) < 0 &&
errno != ENOENT) {
virReportSystemError(errno,
_("Unable to remove symlink %s"),
data->file);
goto cleanup;
}
if (symlink(data->target, data->file) < 0) {
virReportSystemError(errno,
_("Unable to create symlink %s (pointing to %s)"),
data->file, data->target);
goto cleanup;
} else {
delDevice = true;
}
} else if (isDev) {
VIR_DEBUG("Creating dev %s (%d,%d)",
data->file, major(data->sb.st_rdev), minor(data->sb.st_rdev));
unlink(data->file);
if (mknod(data->file, data->sb.st_mode, data->sb.st_rdev) < 0) {
virReportSystemError(errno,
_("Unable to create device %s"),
data->file);
goto cleanup;
} else {
delDevice = true;
}
} else if (isReg || isDir) {
/* We are not cleaning up disks on virDomainDetachDevice
* because disk might be still in use by different disk
* as its backing chain. This might however clash here.
* Therefore do the cleanup here. */
if (umount(data->file) < 0 &&
errno != ENOENT && errno != EINVAL) {
virReportSystemError(errno,
_("Unable to umount %s"),
data->file);
goto cleanup;
}
if ((isReg && virFileTouch(data->file, data->sb.st_mode) < 0) ||
(isDir && g_mkdir_with_parents(data->file, data->sb.st_mode) < 0))
goto cleanup;
delDevice = true;
/* Just create the file here so that code below sets
* proper owner and mode. Move the mount only after that. */
} else {
virReportError(VIR_ERR_OPERATION_UNSUPPORTED,
_("unsupported device type %s 0%o"),
data->file, data->sb.st_mode);
goto cleanup;
}
if (lchown(data->file, data->sb.st_uid, data->sb.st_gid) < 0) {
virReportSystemError(errno,
_("Failed to chown device %s"),
data->file);
goto cleanup;
}
/* Symlinks don't have mode */
if (!isLink &&
chmod(data->file, data->sb.st_mode) < 0) {
virReportSystemError(errno,
_("Failed to set permissions for device %s"),
data->file);
goto cleanup;
}
/* Symlinks don't have ACLs. */
if (!isLink &&
virFileSetACLs(data->file, data->acl) < 0 &&
errno != ENOTSUP) {
virReportSystemError(errno,
_("Unable to set ACLs on %s"), data->file);
goto cleanup;
}
# ifdef WITH_SELINUX
if (data->tcon &&
lsetfilecon_raw(data->file, (const char *)data->tcon) < 0) {
VIR_WARNINGS_NO_WLOGICALOP_EQUAL_EXPR
if (errno != EOPNOTSUPP && errno != ENOTSUP) {
VIR_WARNINGS_RESET
virReportSystemError(errno,
_("Unable to set SELinux label on %s"),
data->file);
goto cleanup;
}
}
# endif
/* Finish mount process started earlier. */
if ((isReg || isDir) &&
virFileMoveMount(data->target, data->file) < 0)
goto cleanup;
ret = 0;
cleanup:
if (ret < 0 && delDevice) {
if (isDir)
virFileDeleteTree(data->file);
else
unlink(data->file);
}
return ret;
}
static bool
qemuNamespaceMknodItemNeedsBindMount(mode_t st_mode)
{
/* A block device S_ISBLK() or a chardev S_ISCHR() is intentionally not
* handled. We want to mknod() it instead of passing in through bind
* mounting. */
return S_ISREG(st_mode) || S_ISFIFO(st_mode) ||
S_ISSOCK(st_mode) || S_ISDIR(st_mode);
}
static int
qemuNamespaceMknodHelper(pid_t pid G_GNUC_UNUSED,
void *opaque)
{
qemuNamespaceMknodData *data = opaque;
size_t i;
int ret = -1;
qemuSecurityPostFork(data->driver->securityManager);
for (i = 0; i < data->nitems; i++) {
if (qemuNamespaceMknodOne(&data->items[i]) < 0)
goto cleanup;
}
ret = 0;
cleanup:
qemuNamespaceMknodDataClear(data);
return ret;
}
static int
qemuNamespaceMknodItemInit(qemuNamespaceMknodItem *item,
virQEMUDriverConfig *cfg,
virDomainObj *vm,
const char *file)
{
g_autofree char *target = NULL;
bool isLink;
bool needsBindMount;
item->file = g_strdup(file);
if (g_lstat(file, &item->sb) < 0) {
qemu_namespace: Be tolerant to non-existent files when populating /dev In 6.7.0 release I've changed how domain namespace is built and populated. Previously it used to be done from a pre-exec hook (ran in the forked off child, just before dropping all privileges and exec()-ing QEMU), which not only meant we had to have two different code paths for creating a node in domain's namespace (one for this pre-exec hook, the other for hotplug ran from the daemon), it also proved problematic because it was leaking FDs into QEMU process. To mitigate this problem, we've not only ditched libdevmapper from the NS population process, I've also dropped the pre-exec code and let the NS be populated from the daemon (using the hotplug code). But, I was not careful when doing so, because the pre-exec code was tolerant to files that doesn't exist, while this new code isn't. For instance, the very first thing that is done when the new NS is created is it's populated with @defaultDeviceACL which contain files like /dev/null, /dev/zero, /dev/random and /dev/kvm (and others). While the rest will probably exist every time, /dev/kvm might not and thus the new code I wrote has to be tolerant to that. Of course, users can override the @defaultDeviceACL (by setting cgroup_device_acl in qemu.conf) and remove /dev/kvm (which is acceptable workaround), but we definitely want libvirt to work out of the box even on hosts without KVM. Fixes: 9048dc4e627ddf33996084167bece7b5fb83b0bc Reported-by: Daniel P. Berrangé <berrange@redhat.com> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> Reviewed-by: Ján Tomko <jtomko@redhat.com>
2020-09-03 16:07:43 +00:00
if (errno == ENOENT)
return -2;
virReportSystemError(errno,
_("Unable to access %s"), file);
return -1;
}
isLink = S_ISLNK(item->sb.st_mode);
needsBindMount = qemuNamespaceMknodItemNeedsBindMount(item->sb.st_mode);
if (needsBindMount && STRPREFIX(file, QEMU_DEVPREFIX)) {
if (!(target = qemuDomainGetPreservedMountPath(cfg, vm, file)))
return -1;
item->target = g_steal_pointer(&target);
} else if (isLink) {
g_autoptr(GError) gerr = NULL;
if (!(target = g_file_read_link(file, &gerr))) {
virReportError(VIR_ERR_SYSTEM_ERROR,
_("failed to resolve symlink %s: %s"), file, gerr->message);
return -1;
}
if (!g_path_is_absolute(target)) {
g_autofree char *fileTmp = g_strdup(file);
char *c = NULL;
char *tmp = NULL;
if ((c = strrchr(fileTmp, '/')))
*(c + 1) = '\0';
tmp = g_strdup_printf("%s%s", fileTmp, target);
VIR_FREE(target);
target = g_steal_pointer(&tmp);
}
item->target = g_steal_pointer(&target);
}
/* Symlinks don't have ACLs. */
if (!isLink &&
virFileGetACLs(file, &item->acl) < 0 &&
errno != ENOTSUP) {
virReportSystemError(errno,
_("Unable to get ACLs on %s"), file);
return -1;
}
# ifdef WITH_SELINUX
if (lgetfilecon_raw(file, &item->tcon) < 0 &&
(errno != ENOTSUP && errno != ENODATA)) {
virReportSystemError(errno,
_("Unable to get SELinux label from %s"), file);
return -1;
}
# endif
return 0;
}
static int
qemuNamespacePrepareOneItem(qemuNamespaceMknodData *data,
virQEMUDriverConfig *cfg,
virDomainObj *vm,
const char *file,
char * const *devMountsPath,
size_t ndevMountsPath)
{
long ttl = sysconf(_SC_SYMLOOP_MAX);
g_autofree char *next = g_strdup(file);
size_t i;
while (1) {
g_auto(qemuNamespaceMknodItem) item = { 0 };
bool isLink;
bool addToData = false;
qemu_namespace: Be tolerant to non-existent files when populating /dev In 6.7.0 release I've changed how domain namespace is built and populated. Previously it used to be done from a pre-exec hook (ran in the forked off child, just before dropping all privileges and exec()-ing QEMU), which not only meant we had to have two different code paths for creating a node in domain's namespace (one for this pre-exec hook, the other for hotplug ran from the daemon), it also proved problematic because it was leaking FDs into QEMU process. To mitigate this problem, we've not only ditched libdevmapper from the NS population process, I've also dropped the pre-exec code and let the NS be populated from the daemon (using the hotplug code). But, I was not careful when doing so, because the pre-exec code was tolerant to files that doesn't exist, while this new code isn't. For instance, the very first thing that is done when the new NS is created is it's populated with @defaultDeviceACL which contain files like /dev/null, /dev/zero, /dev/random and /dev/kvm (and others). While the rest will probably exist every time, /dev/kvm might not and thus the new code I wrote has to be tolerant to that. Of course, users can override the @defaultDeviceACL (by setting cgroup_device_acl in qemu.conf) and remove /dev/kvm (which is acceptable workaround), but we definitely want libvirt to work out of the box even on hosts without KVM. Fixes: 9048dc4e627ddf33996084167bece7b5fb83b0bc Reported-by: Daniel P. Berrangé <berrange@redhat.com> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> Reviewed-by: Ján Tomko <jtomko@redhat.com>
2020-09-03 16:07:43 +00:00
int rc;
qemu_namespace: Be tolerant to non-existent files when populating /dev In 6.7.0 release I've changed how domain namespace is built and populated. Previously it used to be done from a pre-exec hook (ran in the forked off child, just before dropping all privileges and exec()-ing QEMU), which not only meant we had to have two different code paths for creating a node in domain's namespace (one for this pre-exec hook, the other for hotplug ran from the daemon), it also proved problematic because it was leaking FDs into QEMU process. To mitigate this problem, we've not only ditched libdevmapper from the NS population process, I've also dropped the pre-exec code and let the NS be populated from the daemon (using the hotplug code). But, I was not careful when doing so, because the pre-exec code was tolerant to files that doesn't exist, while this new code isn't. For instance, the very first thing that is done when the new NS is created is it's populated with @defaultDeviceACL which contain files like /dev/null, /dev/zero, /dev/random and /dev/kvm (and others). While the rest will probably exist every time, /dev/kvm might not and thus the new code I wrote has to be tolerant to that. Of course, users can override the @defaultDeviceACL (by setting cgroup_device_acl in qemu.conf) and remove /dev/kvm (which is acceptable workaround), but we definitely want libvirt to work out of the box even on hosts without KVM. Fixes: 9048dc4e627ddf33996084167bece7b5fb83b0bc Reported-by: Daniel P. Berrangé <berrange@redhat.com> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> Reviewed-by: Ján Tomko <jtomko@redhat.com>
2020-09-03 16:07:43 +00:00
rc = qemuNamespaceMknodItemInit(&item, cfg, vm, next);
if (rc == -2) {
/* @file doesn't exist. We can break here. */
break;
} else if (rc < 0) {
/* Some other (critical) error. */
return -1;
qemu_namespace: Be tolerant to non-existent files when populating /dev In 6.7.0 release I've changed how domain namespace is built and populated. Previously it used to be done from a pre-exec hook (ran in the forked off child, just before dropping all privileges and exec()-ing QEMU), which not only meant we had to have two different code paths for creating a node in domain's namespace (one for this pre-exec hook, the other for hotplug ran from the daemon), it also proved problematic because it was leaking FDs into QEMU process. To mitigate this problem, we've not only ditched libdevmapper from the NS population process, I've also dropped the pre-exec code and let the NS be populated from the daemon (using the hotplug code). But, I was not careful when doing so, because the pre-exec code was tolerant to files that doesn't exist, while this new code isn't. For instance, the very first thing that is done when the new NS is created is it's populated with @defaultDeviceACL which contain files like /dev/null, /dev/zero, /dev/random and /dev/kvm (and others). While the rest will probably exist every time, /dev/kvm might not and thus the new code I wrote has to be tolerant to that. Of course, users can override the @defaultDeviceACL (by setting cgroup_device_acl in qemu.conf) and remove /dev/kvm (which is acceptable workaround), but we definitely want libvirt to work out of the box even on hosts without KVM. Fixes: 9048dc4e627ddf33996084167bece7b5fb83b0bc Reported-by: Daniel P. Berrangé <berrange@redhat.com> Signed-off-by: Michal Privoznik <mprivozn@redhat.com> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> Reviewed-by: Ján Tomko <jtomko@redhat.com>
2020-09-03 16:07:43 +00:00
}
isLink = S_ISLNK(item.sb.st_mode);
if (STRPREFIX(next, QEMU_DEVPREFIX)) {
for (i = 0; i < ndevMountsPath; i++) {
if (STREQ(devMountsPath[i], "/dev"))
continue;
if (STRPREFIX(next, devMountsPath[i]))
break;
}
if (i == ndevMountsPath)
addToData = true;
}
g_free(next);
next = g_strdup(item.target);
if (addToData &&
VIR_APPEND_ELEMENT(data->items, data->nitems, item) < 0)
return -1;
if (!isLink)
break;
if (ttl-- == 0) {
virReportSystemError(ELOOP,
_("Too many levels of symbolic links: %s"),
next);
return -1;
}
}
return 0;
}
static int
qemuNamespaceMknodPaths(virDomainObj *vm,
GSList *paths)
{
qemuDomainObjPrivate *priv = vm->privateData;
virQEMUDriver *driver = priv->driver;
g_autoptr(virQEMUDriverConfig) cfg = NULL;
char **devMountsPath = NULL;
size_t ndevMountsPath = 0;
qemuNamespaceMknodData data = { 0 };
size_t i;
int ret = -1;
GSList *next;
if (!paths)
return 0;
cfg = virQEMUDriverGetConfig(driver);
if (qemuDomainGetPreservedMounts(cfg, vm,
&devMountsPath, NULL,
&ndevMountsPath) < 0)
return -1;
data.driver = driver;
data.vm = vm;
for (next = paths; next; next = next->next) {
const char *path = next->data;
if (qemuNamespacePrepareOneItem(&data, cfg, vm, path,
devMountsPath, ndevMountsPath) < 0)
goto cleanup;
}
for (i = 0; i < data.nitems; i++) {
qemuNamespaceMknodItem *item = &data.items[i];
if (item->target &&
qemuNamespaceMknodItemNeedsBindMount(item->sb.st_mode)) {
if (virFileBindMountDevice(item->file, item->target) < 0)
goto cleanup;
item->bindmounted = true;
}
}
if (qemuSecurityPreFork(driver->securityManager) < 0)
goto cleanup;
if (virProcessRunInMountNamespace(vm->pid,
qemuNamespaceMknodHelper,
&data) < 0) {
qemuSecurityPostFork(driver->securityManager);
goto cleanup;
}
qemuSecurityPostFork(driver->securityManager);
ret = 0;
cleanup:
for (i = 0; i < data.nitems; i++) {
if (data.items[i].bindmounted &&
umount(data.items[i].target) < 0) {
VIR_WARN("Unable to unmount %s", data.items[i].target);
}
}
qemuNamespaceMknodDataClear(&data);
virStringListFreeCount(devMountsPath, ndevMountsPath);
return ret;
}
#else /* !defined(__linux__) */
static int
qemuNamespaceMknodPaths(virDomainObj *vm G_GNUC_UNUSED,
GSList *paths G_GNUC_UNUSED)
{
virReportSystemError(ENOSYS, "%s",
_("Namespaces are not supported on this platform."));
return -1;
}
#endif /* !defined(__linux__) */
static int
qemuNamespaceUnlinkHelper(pid_t pid G_GNUC_UNUSED,
void *opaque)
{
g_autoptr(virGSListString) paths = opaque;
GSList *next;
for (next = paths; next; next = next->next) {
const char *path = next->data;
VIR_DEBUG("Unlinking %s", path);
if (unlink(path) < 0 && errno != ENOENT) {
virReportSystemError(errno,
_("Unable to remove device %s"), path);
return -1;
}
}
return 0;
}
static int
qemuNamespaceUnlinkPaths(virDomainObj *vm,
GSList *paths)
{
qemuDomainObjPrivate *priv = vm->privateData;
virQEMUDriver *driver = priv->driver;
g_autoptr(virQEMUDriverConfig) cfg = NULL;
g_auto(GStrv) devMountsPath = NULL;
g_autoptr(virGSListString) unlinkPaths = NULL;
GSList *next;
if (!paths)
return 0;
cfg = virQEMUDriverGetConfig(driver);
if (qemuDomainGetPreservedMounts(cfg, vm, &devMountsPath, NULL, NULL) < 0)
return -1;
for (next = paths; next; next = next->next) {
const char *path = next->data;
if (STRPREFIX(path, QEMU_DEVPREFIX)) {
GStrv mount;
bool inSubmount = false;
for (mount = devMountsPath; *mount; mount++) {
if (STREQ(*mount, "/dev"))
continue;
if (STRPREFIX(path, *mount)) {
inSubmount = true;
break;
}
}
if (!inSubmount)
unlinkPaths = g_slist_prepend(unlinkPaths, g_strdup(path));
}
}
if (unlinkPaths &&
virProcessRunInMountNamespace(vm->pid,
qemuNamespaceUnlinkHelper,
unlinkPaths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceSetupDisk(virDomainObj *vm,
virStorageSource *src)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupDisk(src, &paths) < 0)
return -1;
if (qemuNamespaceMknodPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceTeardownDisk(virDomainObj *vm G_GNUC_UNUSED,
virStorageSource *src G_GNUC_UNUSED)
{
/* While in hotplug case we create the whole backing chain,
* here we must limit ourselves. The disk we want to remove
* might be a part of backing chain of another disk.
* If you are reading these lines and have some spare time
* you can come up with and algorithm that checks for that.
* I don't, therefore: */
return 0;
}
/**
* qemuDomainNamespaceSetupHostdev:
* @vm: domain object
* @hostdev: hostdev to create in @vm's namespace
*
* For given @hostdev, create its devfs representation (if it has one) in
* domain namespace. Note, @hostdev must not be in @vm's definition.
*
* Returns: 0 on success,
* -1 otherwise.
*/
int
qemuDomainNamespaceSetupHostdev(virDomainObj *vm,
virDomainHostdevDef *hostdev)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupHostdev(vm,
hostdev,
true,
&paths) < 0)
return -1;
if (qemuNamespaceMknodPaths(vm, paths) < 0)
return -1;
return 0;
}
/**
* qemuDomainNamespaceTeardownHostdev:
* @vm: domain object
* @hostdev: hostdev to remove in @vm's namespace
*
* For given @hostdev, remove its devfs representation (if it has one) in
* domain namespace. Note, @hostdev must not be in @vm's definition.
*
* Returns: 0 on success,
* -1 otherwise.
*/
int
qemuDomainNamespaceTeardownHostdev(virDomainObj *vm,
virDomainHostdevDef *hostdev)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupHostdev(vm,
hostdev,
true,
&paths) < 0)
return -1;
if (qemuNamespaceUnlinkPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceSetupMemory(virDomainObj *vm,
virDomainMemoryDef *mem)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupMemory(mem, &paths) < 0)
return -1;
if (qemuNamespaceMknodPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceTeardownMemory(virDomainObj *vm,
virDomainMemoryDef *mem)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupMemory(mem, &paths) < 0)
return -1;
if (qemuNamespaceUnlinkPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceSetupChardev(virDomainObj *vm,
virDomainChrDef *chr)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupChardev(vm->def, chr, &paths) < 0)
return -1;
if (qemuNamespaceMknodPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceTeardownChardev(virDomainObj *vm,
virDomainChrDef *chr)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupChardev(vm->def, chr, &paths) < 0)
return -1;
if (qemuNamespaceUnlinkPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceSetupRNG(virDomainObj *vm,
virDomainRNGDef *rng)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupRNG(rng, &paths) < 0)
return -1;
if (qemuNamespaceMknodPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceTeardownRNG(virDomainObj *vm,
virDomainRNGDef *rng)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupRNG(rng, &paths) < 0)
return -1;
if (qemuNamespaceUnlinkPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceSetupInput(virDomainObj *vm,
virDomainInputDef *input)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupInput(input, &paths) < 0)
return -1;
if (qemuNamespaceMknodPaths(vm, paths) < 0)
return -1;
return 0;
}
int
qemuDomainNamespaceTeardownInput(virDomainObj *vm,
virDomainInputDef *input)
{
g_autoptr(virGSListString) paths = NULL;
if (!qemuDomainNamespaceEnabled(vm, QEMU_DOMAIN_NS_MOUNT))
return 0;
if (qemuDomainSetupInput(input, &paths) < 0)
return -1;
if (qemuNamespaceUnlinkPaths(vm, paths) < 0)
return -1;
return 0;
}