/* * Copyright (C) 2010-2014 Red Hat, Inc. * Copyright IBM Corp. 2008 * * lxc_cgroup.c: LXC cgroup helpers * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library. If not, see * . */ #include #include "lxc_cgroup.h" #include "lxc_container.h" #include "domain_cgroup.h" #include "virfile.h" #include "virerror.h" #include "virlog.h" #include "virstring.h" #include "virsystemd.h" #include "virutil.h" #define VIR_FROM_THIS VIR_FROM_LXC VIR_LOG_INIT("lxc.lxc_cgroup"); static int virLXCCgroupSetupCpuTune(virDomainDef *def, virCgroup *cgroup) { if (def->cputune.sharesSpecified) { if (virCgroupSetCpuShares(cgroup, def->cputune.shares) < 0) return -1; } return virCgroupSetupCpuPeriodQuota(cgroup, def->cputune.period, def->cputune.quota); } static int virLXCCgroupSetupCpusetTune(virDomainDef *def, virCgroup *cgroup, virBitmap *nodemask) { g_autofree char *mask = NULL; virDomainNumatuneMemMode mode; if (def->placement_mode != VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO && def->cpumask && virCgroupSetupCpusetCpus(cgroup, def->cpumask) < 0) { return -1; } if (virDomainNumatuneGetMode(def->numa, -1, &mode) < 0 || mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { return 0; } if (virDomainNumatuneMaybeFormatNodeset(def->numa, nodemask, &mask, -1) < 0) return -1; if (mask && virCgroupSetCpusetMems(cgroup, mask) < 0) return -1; return 0; } static int virLXCCgroupSetupBlkioTune(virDomainDef *def, virCgroup *cgroup) { return virDomainCgroupSetupBlkio(cgroup, def->blkio); } static int virLXCCgroupSetupMemTune(virDomainDef *def, virCgroup *cgroup) { if (virCgroupSetMemory(cgroup, virDomainDefGetMemoryInitial(def)) < 0) return -1; return virDomainCgroupSetupMemtune(cgroup, def->mem); } static int virLXCCgroupGetMemSwapUsage(virCgroup *cgroup, struct virLXCMeminfo *meminfo) { return virCgroupGetMemSwapUsage(cgroup, &meminfo->swapusage); } static int virLXCCgroupGetMemSwapTotal(virCgroup *cgroup, struct virLXCMeminfo *meminfo) { return virCgroupGetMemSwapHardLimit(cgroup, &meminfo->swaptotal); } static int virLXCCgroupGetMemUsage(virCgroup *cgroup, struct virLXCMeminfo *meminfo) { int ret; unsigned long memUsage; ret = virCgroupGetMemoryUsage(cgroup, &memUsage); meminfo->memusage = (unsigned long long)memUsage; return ret; } static int virLXCCgroupGetMemTotal(virCgroup *cgroup, struct virLXCMeminfo *meminfo) { return virCgroupGetMemoryHardLimit(cgroup, &meminfo->memtotal); } static int virLXCCgroupGetMemStat(virCgroup *cgroup, struct virLXCMeminfo *meminfo) { return virCgroupGetMemoryStat(cgroup, &meminfo->cached, &meminfo->inactive_anon, &meminfo->active_anon, &meminfo->inactive_file, &meminfo->active_file, &meminfo->unevictable); } int virLXCCgroupGetMeminfo(struct virLXCMeminfo *meminfo) { g_autoptr(virCgroup) cgroup = NULL; if (virCgroupNewSelf(&cgroup) < 0) return -1; if (virLXCCgroupGetMemStat(cgroup, meminfo) < 0) return -1; if (virLXCCgroupGetMemTotal(cgroup, meminfo) < 0) return -1; if (virLXCCgroupGetMemUsage(cgroup, meminfo) < 0) return -1; if (virLXCCgroupGetMemSwapTotal(cgroup, meminfo) < 0) return -1; if (virLXCCgroupGetMemSwapUsage(cgroup, meminfo) < 0) return -1; return 0; } typedef struct _virLXCCgroupDevicePolicy virLXCCgroupDevicePolicy; struct _virLXCCgroupDevicePolicy { char type; int major; int minor; }; int virLXCSetupHostUSBDeviceCgroup(virUSBDevice *dev G_GNUC_UNUSED, const char *path, void *opaque) { virCgroup *cgroup = opaque; VIR_DEBUG("Process path '%s' for USB device", path); if (virCgroupAllowDevicePath(cgroup, path, VIR_CGROUP_DEVICE_RWM, false) < 0) return -1; return 0; } int virLXCTeardownHostUSBDeviceCgroup(virUSBDevice *dev G_GNUC_UNUSED, const char *path, void *opaque) { virCgroup *cgroup = opaque; VIR_DEBUG("Process path '%s' for USB device", path); if (virCgroupDenyDevicePath(cgroup, path, VIR_CGROUP_DEVICE_RWM, false) < 0) return -1; return 0; } static int virLXCCgroupSetupDeviceACL(virDomainDef *def, virCgroup *cgroup) { int capMknod = def->caps_features[VIR_DOMAIN_PROCES_CAPS_FEATURE_MKNOD]; size_t i; static virLXCCgroupDevicePolicy devices[] = { {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL}, {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO}, {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL}, {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM}, {'c', LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM}, {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_TTY}, {'c', LXC_DEV_MAJ_TTY, LXC_DEV_MIN_PTMX}, {'c', LXC_DEV_MAJ_FUSE, LXC_DEV_MIN_FUSE}, {0, 0, 0}}; if (virCgroupDenyAllDevices(cgroup) < 0) return -1; /* white list mknod if CAP_MKNOD has to be kept */ if (capMknod == VIR_TRISTATE_SWITCH_ON) { if (virCgroupAllowAllDevices(cgroup, VIR_CGROUP_DEVICE_MKNOD) < 0) return -1; } for (i = 0; devices[i].type != 0; i++) { virLXCCgroupDevicePolicy *dev = &devices[i]; if (virCgroupAllowDevice(cgroup, dev->type, dev->major, dev->minor, VIR_CGROUP_DEVICE_RWM) < 0) return -1; } VIR_DEBUG("Allowing any disk block devs"); for (i = 0; i < def->ndisks; i++) { if (virStorageSourceIsEmpty(def->disks[i]->src) || !virStorageSourceIsBlockLocal(def->disks[i]->src)) continue; if (virCgroupAllowDevicePath(cgroup, virDomainDiskGetSource(def->disks[i]), (def->disks[i]->src->readonly ? VIR_CGROUP_DEVICE_READ : VIR_CGROUP_DEVICE_RW) | VIR_CGROUP_DEVICE_MKNOD, false) < 0) return -1; } VIR_DEBUG("Allowing any filesystem block devs"); for (i = 0; i < def->nfss; i++) { if (def->fss[i]->type != VIR_DOMAIN_FS_TYPE_BLOCK) continue; if (virCgroupAllowDevicePath(cgroup, def->fss[i]->src->path, def->fss[i]->readonly ? VIR_CGROUP_DEVICE_READ : VIR_CGROUP_DEVICE_RW, false) < 0) return -1; } VIR_DEBUG("Allowing any hostdev block devs"); for (i = 0; i < def->nhostdevs; i++) { virDomainHostdevDef *hostdev = def->hostdevs[i]; virDomainHostdevSubsysUSB *usbsrc = &hostdev->source.subsys.u.usb; virUSBDevice *usb; switch (hostdev->mode) { case VIR_DOMAIN_HOSTDEV_MODE_SUBSYS: if (hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB) continue; if (hostdev->missing) continue; if ((usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device, NULL)) == NULL) return -1; if (virUSBDeviceFileIterate(usb, virLXCSetupHostUSBDeviceCgroup, cgroup) < 0) { virUSBDeviceFree(usb); return -1; } virUSBDeviceFree(usb); break; case VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES: switch (hostdev->source.caps.type) { case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_STORAGE: if (virCgroupAllowDevicePath(cgroup, hostdev->source.caps.u.storage.block, VIR_CGROUP_DEVICE_RW | VIR_CGROUP_DEVICE_MKNOD, false) < 0) return -1; break; case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_MISC: if (virCgroupAllowDevicePath(cgroup, hostdev->source.caps.u.misc.chardev, VIR_CGROUP_DEVICE_RW | VIR_CGROUP_DEVICE_MKNOD, false) < 0) return -1; break; default: break; } default: break; } } if (virCgroupAllowDevice(cgroup, 'c', LXC_DEV_MAJ_PTY, -1, VIR_CGROUP_DEVICE_RWM) < 0) return -1; VIR_DEBUG("Allowing timers char devices"); /* Sync'ed with Host clock */ for (i = 0; i < def->clock.ntimers; i++) { virDomainTimerDef *timer = def->clock.timers[i]; const char *dev = NULL; /* Check if "present" is set to "no" otherwise enable it. */ if (!timer->present) continue; switch ((virDomainTimerNameType)timer->name) { case VIR_DOMAIN_TIMER_NAME_PLATFORM: case VIR_DOMAIN_TIMER_NAME_TSC: case VIR_DOMAIN_TIMER_NAME_KVMCLOCK: case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK: case VIR_DOMAIN_TIMER_NAME_PIT: case VIR_DOMAIN_TIMER_NAME_ARMVTIMER: case VIR_DOMAIN_TIMER_NAME_LAST: break; case VIR_DOMAIN_TIMER_NAME_RTC: dev = "/dev/rtc0"; break; case VIR_DOMAIN_TIMER_NAME_HPET: dev = "/dev/hpet"; break; } if (!dev) continue; if (!virFileExists(dev)) { VIR_DEBUG("Ignoring non-existent device %s", dev); continue; } if (virCgroupAllowDevicePath(cgroup, dev, VIR_CGROUP_DEVICE_READ, false) < 0) return -1; } VIR_DEBUG("Device ACL setup complete"); return 0; } virCgroup *virLXCCgroupCreate(virDomainDef *def, pid_t initpid, size_t nnicindexes, int *nicindexes) { virCgroup *cgroup = NULL; g_autofree char *machineName = virLXCDomainGetMachineName(def, 0); if (!machineName) return NULL; if (!g_path_is_absolute(def->resource->partition)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, _("Resource partition '%s' must start with '/'"), def->resource->partition); return NULL; } if (virCgroupNewMachine(machineName, "lxc", def->uuid, NULL, initpid, true, nnicindexes, nicindexes, def->resource->partition, -1, 0, &cgroup) < 0) return NULL; /* setup control group permissions for user namespace */ if (def->idmap.uidmap) { if (virCgroupSetOwner(cgroup, def->idmap.uidmap[0].target, def->idmap.gidmap[0].target, (1 << VIR_CGROUP_CONTROLLER_SYSTEMD)) < 0) { virCgroupFree(cgroup); return NULL; } } return cgroup; } int virLXCCgroupSetup(virDomainDef *def, virCgroup *cgroup, virBitmap *nodemask) { if (virLXCCgroupSetupCpuTune(def, cgroup) < 0) return -1; if (virLXCCgroupSetupCpusetTune(def, cgroup, nodemask) < 0) return -1; if (virLXCCgroupSetupBlkioTune(def, cgroup) < 0) return -1; if (virLXCCgroupSetupMemTune(def, cgroup) < 0) return -1; if (virLXCCgroupSetupDeviceACL(def, cgroup) < 0) return -1; return 0; }