ch: methods for cgroup mgmt in ch driver

Signed-off-by: Vineeth Pillai <viremana@linux.microsoft.com>
Signed-off-by: Praveen K Paladugu <prapal@linux.microsoft.com>
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
This commit is contained in:
Vineeth Pillai 2022-01-25 16:19:54 +00:00 committed by Michal Privoznik
parent 788e2b58cb
commit da6d4a2afc
8 changed files with 490 additions and 18 deletions

View File

@ -125,6 +125,8 @@ virCHDriverConfigNew(bool privileged)
if (!(cfg = virObjectNew(virCHDriverConfigClass)))
return NULL;
cfg->cgroupControllers = -1; /* Auto detect */
if (privileged) {
if (virGetUserID(CH_USER, &cfg->user) < 0)
return NULL;

View File

@ -36,10 +36,14 @@ struct _virCHDriverConfig {
char *stateDir;
char *logDir;
int cgroupControllers;
uid_t user;
gid_t group;
};
G_DEFINE_AUTOPTR_CLEANUP_FUNC(virCHDriverConfig, virObjectUnref);
struct _virCHDriver
{
virMutex lock;

View File

@ -319,6 +319,41 @@ chValidateDomainDeviceDef(const virDomainDeviceDef *dev,
_("Serial can only be enabled for a PTY"));
return -1;
}
return 0;
}
/**
 * virCHDomainRefreshThreadInfo:
 * @vm: domain object
 *
 * Refresh the cached thread info from the monitor and record the host
 * thread id (tid) of every vCPU thread into the per-vCPU private data.
 *
 * Returns 0 (current failures are only logged, not fatal).
 */
int
virCHDomainRefreshThreadInfo(virDomainObj *vm)
{
    size_t maxvcpus = virDomainDefGetVcpusMax(vm->def);
    virCHMonitorThreadInfo *info = NULL;
    size_t nthreads;
    size_t ncpus = 0;
    size_t i;

    /* Force a refresh so tids reflect the live process. */
    nthreads = virCHMonitorGetThreadInfo(virCHDomainGetMonitor(vm),
                                         true, &info);

    for (i = 0; i < nthreads; i++) {
        virCHDomainVcpuPrivate *vcpupriv;
        virDomainVcpuDef *vcpu;
        virCHMonitorCPUInfo *vcpuInfo;

        if (info[i].type != virCHThreadTypeVcpu)
            continue;

        /* TODO: hotplug support */
        vcpuInfo = &info[i].vcpuInfo;
        vcpu = virDomainDefGetVcpu(vm->def, vcpuInfo->cpuid);
        if (!vcpu) {
            /* cpuid was parsed from a thread name; guard against an
             * index beyond the configured maximum. */
            VIR_WARN("vcpu index %d out of range (max %zu)",
                     vcpuInfo->cpuid, maxvcpus);
            continue;
        }
        vcpupriv = CH_DOMAIN_VCPU_PRIVATE(vcpu);
        vcpupriv->tid = vcpuInfo->tid;
        ncpus++;
    }

    /* TODO: Remove the warning when hotplug is implemented. */
    if (ncpus != maxvcpus)
        VIR_WARN("Mismatch in the number of cpus, expected: %zu, actual: %zu",
                 maxvcpus, ncpus);

    return 0;
}

View File

@ -58,6 +58,8 @@ struct _virCHDomainObjPrivate {
virCHMonitor *monitor;
char *machineName;
virBitmap *autoCpuset;
virBitmap *autoNodeset;
virCgroup *cgroup;
};
#define CH_DOMAIN_PRIVATE(vm) \
@ -87,7 +89,8 @@ void
virCHDomainObjEndJob(virDomainObj *obj);
int
virCHDomainRefreshVcpuInfo(virDomainObj *vm);
virCHDomainRefreshThreadInfo(virDomainObj *vm);
pid_t
virCHDomainGetVcpuPid(virDomainObj *vm,
unsigned int vcpuid);

View File

@ -41,6 +41,7 @@ VIR_LOG_INIT("ch.ch_monitor");
static virClass *virCHMonitorClass;
static void virCHMonitorDispose(void *obj);
static void virCHMonitorThreadInfoFree(virCHMonitor *mon);
static int virCHMonitorOnceInit(void)
{
@ -578,6 +579,7 @@ static void virCHMonitorDispose(void *opaque)
virCHMonitor *mon = opaque;
VIR_DEBUG("mon=%p", mon);
virCHMonitorThreadInfoFree(mon);
virObjectUnref(mon->vm);
}
@ -743,6 +745,98 @@ virCHMonitorGet(virCHMonitor *mon, const char *endpoint, virJSONValue **response
return ret;
}
/* Drop the thread info cached on @mon, leaving it empty. */
static void
virCHMonitorThreadInfoFree(virCHMonitor *mon)
{
    g_free(mon->threads);
    mon->threads = NULL;
    mon->nthreads = 0;
}
/**
 * virCHMonitorRefreshThreadInfo:
 * @mon: monitor object
 *
 * Rebuild @mon's cached thread list by walking /proc/<pid>/task of the
 * cloud-hypervisor process and classifying each thread by its comm name.
 * Entries are written compactly: a thread whose comm cannot be read or
 * whose "vcpu<N>" index fails to parse is skipped without leaving a
 * zero-filled hole inside the first nthreads entries.
 *
 * Returns the number of threads recorded (0 on failure to list tasks).
 */
static size_t
virCHMonitorRefreshThreadInfo(virCHMonitor *mon)
{
    virCHMonitorThreadInfo *info = NULL;
    g_autofree pid_t *tids = NULL;
    virDomainObj *vm = mon->vm;
    size_t ntids = 0;
    size_t i;

    virCHMonitorThreadInfoFree(mon);
    if (virProcessGetPids(vm->pid, &ntids, &tids) < 0)
        return 0;

    info = g_new0(virCHMonitorThreadInfo, ntids);
    for (i = 0; i < ntids; i++) {
        /* Write cursor: skipped tids must not leave gaps in [0, nthreads). */
        virCHMonitorThreadInfo *cur = &info[mon->nthreads];
        g_autofree char *proc = NULL;
        g_autofree char *data = NULL;

        proc = g_strdup_printf("/proc/%d/task/%d/comm",
                               (int)vm->pid, (int)tids[i]);

        /* Thread may have exited between listing and reading; skip it. */
        if (virFileReadAll(proc, (1 << 16), &data) < 0)
            continue;

        VIR_DEBUG("VM PID: %d, TID %d, COMM: %s",
                  (int)vm->pid, (int)tids[i], data);
        if (STRPREFIX(data, "vcpu")) {
            int cpuid;
            char *tmp;

            if (virStrToLong_i(data + 4, &tmp, 0, &cpuid) < 0) {
                VIR_WARN("Index is not specified correctly");
                continue;
            }
            cur->type = virCHThreadTypeVcpu;
            cur->vcpuInfo.tid = tids[i];
            cur->vcpuInfo.online = true;
            cur->vcpuInfo.cpuid = cpuid;
            VIR_DEBUG("vcpu%d -> tid: %d", cpuid, tids[i]);
        } else if (STRPREFIX(data, "_disk") || STRPREFIX(data, "_net") ||
                   STRPREFIX(data, "_rng")) {
            /* Prefixes used by cloud-hypervisor for IO Threads are captured at
             * https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/vmm/src/device_manager.rs */
            cur->type = virCHThreadTypeIO;
            cur->ioInfo.tid = tids[i];
            virStrcpy(cur->ioInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
        } else {
            /* Anything else is treated as an emulator thread. */
            cur->type = virCHThreadTypeEmulator;
            cur->emuInfo.tid = tids[i];
            virStrcpy(cur->emuInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
        }

        mon->nthreads++;
    }
    mon->threads = info;

    return mon->nthreads;
}
/**
 * virCHMonitorGetThreadInfo:
 * @mon: Pointer to the monitor
 * @refresh: Refresh thread info or not
 * @threads: filled with a pointer to the thread array (owned by @mon,
 *           caller must not free it)
 *
 * Retrieve thread info and store to @threads
 *
 * Returns count of threads on success.
 */
size_t
virCHMonitorGetThreadInfo(virCHMonitor *mon,
                          bool refresh,
                          virCHMonitorThreadInfo **threads)
{
    if (refresh)
        virCHMonitorRefreshThreadInfo(mon);

    *threads = mon->threads;

    /* Return the cached count even when not refreshing; the old code
     * returned 0 for refresh==false despite handing back cached threads. */
    return mon->nthreads;
}
int
virCHMonitorShutdownVMM(virCHMonitor *mon)
{

View File

@ -37,6 +37,50 @@
#define URL_VM_RESUME "vm.resume"
#define URL_VM_INFO "vm.info"
#define VIRCH_THREAD_NAME_LEN 16
/* Classification of a cloud-hypervisor thread, derived from its
 * /proc/<pid>/task/<tid>/comm name. */
typedef enum {
    virCHThreadTypeEmulator, /* fallback for any unrecognized thread name */
    virCHThreadTypeVcpu,     /* "vcpu<N>" threads */
    virCHThreadTypeIO,       /* "_disk"/"_net"/"_rng" IO threads */
    virCHThreadTypeMax
} virCHThreadType;

/* Per-vCPU thread details. */
typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
struct _virCHMonitorCPUInfo {
    int cpuid;   /* vCPU index parsed from the "vcpu<N>" comm name */
    pid_t tid;   /* host thread id backing this vCPU */
    bool online;
};

/* Emulator (non-vCPU, non-IO) thread details. */
typedef struct _virCHMonitorEmuThreadInfo virCHMonitorEmuThreadInfo;
struct _virCHMonitorEmuThreadInfo {
    char thrName[VIRCH_THREAD_NAME_LEN]; /* thread comm name (truncated) */
    pid_t tid;
};

/* IO thread details. */
typedef struct _virCHMonitorIOThreadInfo virCHMonitorIOThreadInfo;
struct _virCHMonitorIOThreadInfo {
    char thrName[VIRCH_THREAD_NAME_LEN]; /* thread comm name (truncated) */
    pid_t tid;
};

/* Tagged union describing one thread of the VMM process; which union
 * member is valid depends on @type. */
typedef struct _virCHMonitorThreadInfo virCHMonitorThreadInfo;
struct _virCHMonitorThreadInfo {
    virCHThreadType type;
    union {
        virCHMonitorCPUInfo vcpuInfo;
        virCHMonitorEmuThreadInfo emuInfo;
        virCHMonitorIOThreadInfo ioInfo;
    };
};
typedef struct _virCHMonitor virCHMonitor;
struct _virCHMonitor {
@ -49,6 +93,9 @@ struct _virCHMonitor {
pid_t pid;
virDomainObj *vm;
size_t nthreads;
virCHMonitorThreadInfo *threads;
};
virCHMonitor *virCHMonitorNew(virDomainObj *vm, const char *socketdir);
@ -66,12 +113,9 @@ int virCHMonitorSuspendVM(virCHMonitor *mon);
int virCHMonitorResumeVM(virCHMonitor *mon);
int virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info);
typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
struct _virCHMonitorCPUInfo {
pid_t tid;
bool online;
};
void virCHMonitorCPUInfoFree(virCHMonitorCPUInfo *cpus);
int virCHMonitorGetCPUInfo(virCHMonitor *mon,
virCHMonitorCPUInfo **vcpus,
size_t maxvcpus);
size_t virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
virCHMonitorThreadInfo **threads);

View File

@ -26,6 +26,8 @@
#include "ch_domain.h"
#include "ch_monitor.h"
#include "ch_process.h"
#include "domain_cgroup.h"
#include "virnuma.h"
#include "viralloc.h"
#include "virerror.h"
#include "virjson.h"
@ -131,6 +133,251 @@ virCHProcessUpdateInfo(virDomainObj *vm)
return 0;
}
/*
 * virCHProcessGetAllCpuAffinity:
 * @cpumapRet: filled with a bitmap of all online host CPUs, or NULL
 *             when the host provides no CPU bitmap.
 *
 * Returns 0 on success (including the no-bitmap case), -1 on error.
 */
static int
virCHProcessGetAllCpuAffinity(virBitmap **cpumapRet)
{
    virBitmap *onlineMap = NULL;

    *cpumapRet = NULL;

    if (!virHostCPUHasBitmap())
        return 0;

    onlineMap = virHostCPUGetOnlineBitmap();
    if (!onlineMap)
        return -1;

    *cpumapRet = onlineMap;
    return 0;
}
#if defined(WITH_SCHED_GETAFFINITY) || defined(WITH_BSD_CPU_AFFINITY)
/*
 * virCHProcessInitCpuAffinity:
 * @vm: domain object (vm->pid must already be set)
 *
 * Pin the freshly spawned cloud-hypervisor process before its vCPU
 * threads are tuned individually. Pinning priority, in order:
 * strict NUMA nodeset, explicit <emulatorpin>, then all online host
 * CPUs.
 *
 * Returns 0 on success, -1 on error.
 */
static int
virCHProcessInitCpuAffinity(virDomainObj *vm)
{
    g_autoptr(virBitmap) cpumapToSet = NULL;
    virDomainNumatuneMemMode mem_mode;
    virCHDomainObjPrivate *priv = vm->privateData;

    if (!vm->pid) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot setup CPU affinity until process is started"));
        return -1;
    }

    /* Strict NUMA placement: derive the CPU set from the configured
     * (or auto-placed) nodeset.
     * NOTE(review): the <= 1 node-count guard looks inverted for a
     * multi-node check — confirm against the qemu driver's equivalent. */
    if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
        virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
        virBitmap *nodeset = NULL;

        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
                                             priv->autoNodeset,
                                             &nodeset, -1) < 0)
            return -1;

        if (virNumaNodesetToCPUset(nodeset, &cpumapToSet) < 0)
            return -1;
    } else if (vm->def->cputune.emulatorpin) {
        /* Explicit <emulatorpin> from the domain config. */
        if (!(cpumapToSet = virBitmapNewCopy(vm->def->cputune.emulatorpin)))
            return -1;
    } else {
        /* Default: all online host CPUs (may stay NULL when the host
         * has no CPU bitmap support). */
        if (virCHProcessGetAllCpuAffinity(&cpumapToSet) < 0)
            return -1;
    }

    if (cpumapToSet && virProcessSetAffinity(vm->pid, cpumapToSet, false) < 0) {
        return -1;
    }

    return 0;
}
#else /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
static int
virCHProcessInitCpuAffinity(virDomainObj *vm G_GNUC_UNUSED)
{
    /* No affinity API available on this platform; nothing to do. */
    return 0;
}
#endif /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
/**
 * virCHProcessSetupPid:
 *
 * This function sets resource properties (affinity, cgroups,
 * scheduler) for any PID associated with a domain. It should be used
 * to set up emulator PIDs as well as vCPU and I/O thread pids to
 * ensure they are all handled the same way.
 *
 * @cpumask may be NULL, in which case the domain's auto/placement or
 * default cpumask is used; @sched may be NULL to skip scheduler setup.
 * On failure the per-thread cgroup created here (if any) is removed.
 *
 * Returns 0 on success, -1 on error.
 */
static int
virCHProcessSetupPid(virDomainObj *vm,
                     pid_t pid,
                     virCgroupThreadName nameval,
                     int id,
                     virBitmap *cpumask,
                     unsigned long long period,
                     long long quota,
                     virDomainThreadSchedParam *sched)
{
    virCHDomainObjPrivate *priv = vm->privateData;
    virDomainNumatuneMemMode mem_mode;
    g_autoptr(virCgroup) cgroup = NULL;
    virBitmap *use_cpumask = NULL;
    virBitmap *affinity_cpumask = NULL;
    g_autoptr(virBitmap) hostcpumap = NULL;
    g_autofree char *mem_mask = NULL;
    int ret = -1;

    /* CPU bandwidth tuning is impossible without the cpu controller. */
    if ((period || quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        goto cleanup;
    }

    /* Infer which cpumask shall be used: explicit per-thread mask,
     * then auto placement, then the domain-wide cpumask. */
    if (cpumask) {
        use_cpumask = cpumask;
    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
        use_cpumask = priv->autoCpuset;
    } else if (vm->def->cpumask) {
        use_cpumask = vm->def->cpumask;
    } else {
        /* we can't assume cloud-hypervisor itself is running on all pCPUs,
         * so we need to explicitly set the spawned instance to all pCPUs. */
        if (virCHProcessGetAllCpuAffinity(&hostcpumap) < 0)
            goto cleanup;
        affinity_cpumask = hostcpumap;
    }

    /*
     * If CPU cgroup controller is not initialized here, then we need
     * neither period nor quota settings. And if CPUSET controller is
     * not initialized either, then there's nothing to do anyway.
     */
    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {

        /* Format the NUMA memory mask only for strict placement. */
        if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
            mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
            virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
                                                priv->autoNodeset,
                                                &mem_mask, -1) < 0)
            goto cleanup;

        /* Create (or look up) the per-thread sub-cgroup. */
        if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0)
            goto cleanup;

        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
            if (use_cpumask &&
                virDomainCgroupSetupCpusetCpus(cgroup, use_cpumask) < 0)
                goto cleanup;

            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
                goto cleanup;
        }

        if (virDomainCgroupSetupVcpuBW(cgroup, period, quota) < 0)
            goto cleanup;

        /* Move the thread to the sub dir */
        VIR_INFO("Adding pid %d to cgroup", pid);
        if (virCgroupAddThread(cgroup, pid) < 0)
            goto cleanup;
    }

    if (!affinity_cpumask)
        affinity_cpumask = use_cpumask;

    /* Setup legacy affinity. */
    if (affinity_cpumask
        && virProcessSetAffinity(pid, affinity_cpumask, false) < 0)
        goto cleanup;

    /* Set scheduler type and priority, but not for the main thread. */
    if (sched &&
        nameval != VIR_CGROUP_THREAD_EMULATOR &&
        virProcessSetScheduler(pid, sched->policy, sched->priority) < 0)
        goto cleanup;

    ret = 0;
 cleanup:
    /* Undo the per-thread cgroup on any failure after its creation. */
    if (ret < 0 && cgroup)
        virCgroupRemove(cgroup);

    return ret;
}
/**
 * virCHProcessSetupVcpu:
 * @vm: domain object
 * @vcpuid: id of VCPU to set defaults
 *
 * This function sets resource properties (cgroups, affinity, scheduler) for a
 * vCPU. This function expects that the vCPU is online and the vCPU pids were
 * correctly detected at the point when it's called.
 *
 * Returns 0 on success, -1 on error.
 */
int
virCHProcessSetupVcpu(virDomainObj *vm,
                      unsigned int vcpuid)
{
    virDomainVcpuDef *vcpudef = virDomainDefGetVcpu(vm->def, vcpuid);
    pid_t tid = virCHDomainGetVcpuPid(vm, vcpuid);

    return virCHProcessSetupPid(vm, tid, VIR_CGROUP_THREAD_VCPU, vcpuid,
                                vcpudef->cpumask,
                                vm->def->cputune.period,
                                vm->def->cputune.quota,
                                &vcpudef->sched);
}
/*
 * virCHProcessSetupVcpus:
 * @vm: domain object
 *
 * Apply cgroup/affinity/scheduler tuning to every online vCPU. When no
 * vCPU pids were detected, per-vCPU pinning cannot be applied, so any
 * vCPU whose cpumask differs from the domain-wide one is rejected.
 *
 * Returns 0 on success, -1 on error.
 */
static int
virCHProcessSetupVcpus(virDomainObj *vm)
{
    virCHDomainObjPrivate *priv = vm->privateData;
    unsigned int maxvcpus = virDomainDefGetVcpusMax(vm->def);
    size_t i;

    /* Bandwidth tuning needs the cpu cgroup controller. */
    if ((vm->def->cputune.period || vm->def->cputune.quota) &&
        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("cgroup cpu is required for scheduler tuning"));
        return -1;
    }

    if (!virCHDomainHasVcpuPids(vm)) {
        /* If any CPU has custom affinity that differs from the
         * VM default affinity, we must reject it */
        for (i = 0; i < maxvcpus; i++) {
            virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);

            if (!vcpu->online)
                continue;

            if (vcpu->cpumask &&
                !virBitmapEqual(vm->def->cpumask, vcpu->cpumask)) {
                virReportError(VIR_ERR_OPERATION_INVALID, "%s",
                               _("cpu affinity is not supported"));
                return -1;
            }
        }

        return 0;
    }

    for (i = 0; i < maxvcpus; i++) {
        virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);

        if (!vcpu->online)
            continue;

        if (virCHProcessSetupVcpu(vm, i) < 0)
            return -1;
    }

    return 0;
}
/**
* virCHProcessStart:
* @driver: pointer to driver structure
@ -141,12 +388,14 @@ virCHProcessUpdateInfo(virDomainObj *vm)
*
* Returns 0 on success or -1 in case of error
*/
int virCHProcessStart(virCHDriver *driver,
virDomainObj *vm,
virDomainRunningReason reason)
int
virCHProcessStart(virCHDriver *driver,
virDomainObj *vm,
virDomainRunningReason reason)
{
int ret = -1;
virCHDomainObjPrivate *priv = vm->privateData;
g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
g_autofree int *nicindexes = NULL;
size_t nnicindexes = 0;
@ -154,30 +403,53 @@ int virCHProcessStart(virCHDriver *driver,
/* And we can get the first monitor connection now too */
if (!(priv->monitor = virCHProcessConnectMonitor(driver, vm))) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("failed to create connection to CH socket"));
_("failed to create connection to CH socket"));
goto cleanup;
}
if (virCHMonitorCreateVM(priv->monitor,
&nnicindexes, &nicindexes) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("failed to create guest VM"));
_("failed to create guest VM"));
goto cleanup;
}
}
vm->pid = priv->monitor->pid;
vm->def->id = vm->pid;
priv->machineName = virCHDomainGetMachineName(vm);
if (virDomainCgroupSetupCgroup("ch", vm,
nnicindexes, nicindexes,
&priv->cgroup,
cfg->cgroupControllers,
0, /*maxThreadsPerProc*/
priv->driver->privileged,
priv->machineName) < 0)
goto cleanup;
if (virCHProcessInitCpuAffinity(vm) < 0)
goto cleanup;
if (virCHMonitorBootVM(priv->monitor) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("failed to boot guest VM"));
goto cleanup;
}
priv->machineName = virCHDomainGetMachineName(vm);
vm->pid = priv->monitor->pid;
vm->def->id = vm->pid;
virCHDomainRefreshThreadInfo(vm);
VIR_DEBUG("Setting global CPU cgroup (if required)");
if (virDomainCgroupSetupGlobalCpuCgroup(vm,
priv->cgroup,
priv->autoNodeset) < 0)
goto cleanup;
VIR_DEBUG("Setting vCPU tuning/settings");
if (virCHProcessSetupVcpus(vm) < 0)
goto cleanup;
virCHProcessUpdateInfo(vm);
virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
return 0;
@ -189,10 +461,13 @@ int virCHProcessStart(virCHDriver *driver,
return ret;
}
int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
virDomainObj *vm,
virDomainShutoffReason reason)
int
virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
virDomainObj *vm,
virDomainShutoffReason reason)
{
int ret;
int retries = 0;
virCHDomainObjPrivate *priv = vm->privateData;
VIR_DEBUG("Stopping VM name=%s pid=%d reason=%d",
@ -203,6 +478,18 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
priv->monitor = NULL;
}
retry:
if ((ret = virDomainCgroupRemoveCgroup(vm,
priv->cgroup,
priv->machineName)) < 0) {
if (ret == -EBUSY && (retries++ < 5)) {
g_usleep(200*1000);
goto retry;
}
VIR_WARN("Failed to remove cgroup for %s",
vm->def->name);
}
vm->pid = -1;
vm->def->id = -1;
g_clear_pointer(&priv->machineName, g_free);

View File

@ -29,3 +29,6 @@ int virCHProcessStart(virCHDriver *driver,
int virCHProcessStop(virCHDriver *driver,
virDomainObj *vm,
virDomainShutoffReason reason);
int virCHProcessSetupVcpu(virDomainObj *vm,
unsigned int vcpuid);