libvirt/src/util/virresctrl.c
Peter Krempa 98f6f2081d util: alloc: Reimplement VIR_APPEND_ELEMENT using virAppendElement
Use virAppendElement instead of virInsertElementsN to implement
VIR_APPEND_ELEMENT which allows us to remove error handling as the
only relevant errors were removed when switching to aborting memory
allocation functions.

Signed-off-by: Peter Krempa <pkrempa@redhat.com>
Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
2021-08-06 08:53:25 +02:00

2666 lines
74 KiB
C

/*
* virresctrl.c:
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see
* <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <sys/file.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#define LIBVIRT_VIRRESCTRLPRIV_H_ALLOW
#include "virresctrlpriv.h"
#include "viralloc.h"
#include "virbuffer.h"
#include "virfile.h"
#include "virlog.h"
#include "virobject.h"
#include "virstring.h"
#define VIR_FROM_THIS VIR_FROM_RESCTRL
VIR_LOG_INIT("util.virresctrl");
/* Resctrl is short for Resource Control. It might be implemented for various
* resources. Currently this supports cache allocation technology (aka CAT),
* memory bandwidth allocation (aka MBA) and cache monitoring technology (aka
* CMT). More resources technologies may be added in the future.
*/
/* Common definitions */
#define SYSFS_RESCTRL_PATH "/sys/fs/resctrl"
/* Following are three different enum implementations for the same enum. Each
* one of them helps translating to/from strings for different interfaces. The
* delimiter must be VIR_CACHE_TYPE_LAST for all of them in order to stay
* consistent in between all of them. */
/* Cache name mapping for Linux kernel naming. */
VIR_ENUM_IMPL(virCacheKernel,
VIR_CACHE_TYPE_LAST,
"Unified",
"Instruction",
"Data",
);
/* Cache name mapping for our XML naming. */
VIR_ENUM_IMPL(virCache,
VIR_CACHE_TYPE_LAST,
"both",
"code",
"data",
);
/* Cache name mapping for resctrl interface naming.
 *
 * virResctrlGetCacheInfo() feeds the part of an "info/L<level><suffix>"
 * directory name that follows the level number into
 * virResctrlTypeFromString(), so the empty first entry matches a directory
 * that has no suffix at all (e.g. "L3"). */
VIR_ENUM_DECL(virResctrl);
VIR_ENUM_IMPL(virResctrl,
VIR_CACHE_TYPE_LAST,
"",
"CODE",
"DATA",
);
/* Monitor feature name prefix mapping for monitor naming.
 *
 * virResctrlInfoGetMonitorPrefix() compares its @prefix argument against
 * these strings and then uses the prefix to select the matching entries
 * from the list of monitor features. */
VIR_ENUM_IMPL(virResctrlMonitorPrefix,
VIR_RESCTRL_MONITOR_TYPE_LAST,
"__unsupported__",
"llc_",
"mbm_",
);
/* All private typedefs so that they exist for all later definitions. This way
* structs can be included in one or another without reorganizing the code every
* time. */
typedef struct _virResctrlInfoPerType virResctrlInfoPerType;
typedef struct _virResctrlInfoPerLevel virResctrlInfoPerLevel;
typedef struct _virResctrlInfoMemBW virResctrlInfoMemBW;
typedef struct _virResctrlInfoMongrp virResctrlInfoMongrp;
typedef struct _virResctrlAllocPerType virResctrlAllocPerType;
typedef struct _virResctrlAllocPerLevel virResctrlAllocPerLevel;
typedef struct _virResctrlAllocMemBW virResctrlAllocMemBW;
/* Class definitions and initializations */
/* Class handles passed to virObjectNew(); presumably assigned by the
 * VIR_CLASS_NEW() calls in virResctrlOnceInit() -- confirm against the
 * macro definition. */
static virClass *virResctrlInfoClass;
static virClass *virResctrlAllocClass;
static virClass *virResctrlMonitorClass;
/* virResctrlInfo */
/* Cache allocation capabilities for one (cache level, cache type) pair, as
 * parsed from one "info/L<level><type>" directory by
 * virResctrlGetCacheInfo(). */
struct _virResctrlInfoPerType {
/* Kernel-provided information */
unsigned int min_cbm_bits; /* value of the "min_cbm_bits" file */
/* Our computed information from the above */
unsigned int bits; /* number of bits set in the "cbm_mask" file */
unsigned int max_cache_id; /* bumped by virResctrlInfoGetCache() for every
                            * further cache reported at this level */
/* In order to be self-sufficient we need size information per cache.
* Funnily enough, one of the outcomes of the resctrl design is that it
* does not account for different sizes per cache on the same level. So
* for the sake of easiness, let's copy that, for now. */
unsigned long long size; /* 0 until virResctrlInfoGetCache() records it */
/* Information that we will return upon request (this is public struct) as
* until now all the above is internal to this module */
virResctrlInfoPerCache control;
};
/* Capabilities of one cache level. */
struct _virResctrlInfoPerLevel {
/* Array of VIR_CACHE_TYPE_LAST pointers indexed by cache type; an entry is
 * NULL when no info directory was found for that type. */
virResctrlInfoPerType **types;
};
/* Information about memory bandwidth allocation, filled in by
 * virResctrlGetMemoryBandwidthInfo() from the "info/MB" directory. */
struct _virResctrlInfoMemBW {
/* minimum memory bandwidth allowed */
unsigned int min_bandwidth;
/* bandwidth granularity */
unsigned int bandwidth_granularity;
/* Maximum number of simultaneous allocations */
unsigned int max_allocation;
/* level number of last level cache; updated by virResctrlInfoGetCache()
 * whenever it is called with a higher level than seen so far */
unsigned int last_level_cache;
/* max id of last level cache, this is used to track
* how many last level cache available in host system,
* the number of memory bandwidth allocation controller
* is identical with last level cache. */
unsigned int max_id;
};
/* Monitoring capabilities, filled in by virResctrlGetMonitorInfo() from the
 * "info/L3_MON" directory. */
struct _virResctrlInfoMongrp {
/* Maximum number of simultaneous monitors */
unsigned int max_monitor;
/* NULL-terminated string list of monitor features */
char **features;
/* Number of monitor features */
size_t nfeatures;
/* Last level cache related information */
/* This adjustable value affects the final reuse of resources used by
* monitor. After the action of removing a monitor, the kernel may not
* release all hardware resources that monitor used immediately if the
* cache occupancy value associated with 'removed' monitor is above this
* threshold. Once the cache occupancy is below this threshold, the
* underlying hardware resource will be reclaimed and be put into the
* resource pool for next reusing.*/
unsigned int cache_reuse_threshold;
/* The cache 'level' that has the monitor capability */
unsigned int cache_level;
};
/* Host resctrl capabilities; instances are created by virResctrlInfoNew()
 * and torn down by virResctrlInfoDispose(). */
struct _virResctrlInfo {
virObject parent;
/* Sparse array indexed by cache level; NULL entries mean that no
 * allocation info exists for that level. */
virResctrlInfoPerLevel **levels;
size_t nlevels;
/* NULL when no memory bandwidth allocation info was found */
virResctrlInfoMemBW *membw_info;
/* NULL when no monitor info was found */
virResctrlInfoMongrp *monitor_info;
};
/* Dispose callback for virResctrlInfo: releases every per-level/per-type
 * record, the monitor feature list and the top-level arrays. The object
 * itself is not freed here. */
static void
virResctrlInfoDispose(void *obj)
{
    virResctrlInfo *info = obj;
    size_t lvl;

    for (lvl = 0; lvl < info->nlevels; lvl++) {
        virResctrlInfoPerLevel *per_level = info->levels[lvl];
        size_t t;

        /* The level array is sparse */
        if (per_level == NULL)
            continue;

        for (t = 0; per_level->types != NULL && t < VIR_CACHE_TYPE_LAST; t++)
            g_free(per_level->types[t]);

        g_free(per_level->types);
        g_free(per_level);
    }

    /* The feature strings are owned by monitor_info, so drop them before
     * the container itself goes away. */
    if (info->monitor_info != NULL)
        g_strfreev(info->monitor_info->features);

    g_free(info->membw_info);
    g_free(info->levels);
    g_free(info->monitor_info);
}
/* Free a virResctrlInfoMon together with its feature string list.
 * Passing NULL is a no-op. */
void
virResctrlInfoMonFree(virResctrlInfoMon *mon)
{
    if (mon != NULL) {
        g_strfreev(mon->features);
        g_free(mon);
    }
}
/* virResctrlAlloc and virResctrlMonitor */
/*
* virResctrlAlloc and virResctrlMonitor are representing a resource control
* group (in XML under cputune/cachetune and consequently a directory under
* /sys/fs/resctrl). virResctrlAlloc is the data structure for resource
* allocation, while the virResctrlMonitor represents the resource monitoring
* part.
*
* virResctrlAlloc represents one allocation. Since it can have multiple
* parts of multiple caches allocated it is represented as bunch of nested
* sparse arrays (by sparse I mean array of pointers so that each might be NULL
* in case there is no allocation for that particular cache allocation (level,
* cache, ...) or memory allocation for particular node).
*
* Allocation corresponding to root directory, /sys/fs/resctrl/, defines the
* default resource allocating policy, which is created immediately after
* mounting, and owns all the tasks and cpus in the system. Cache or memory
* bandwidth resource will be shared for tasks in this allocation.
*
* =====Cache allocation technology (CAT)=====
*
* Since one allocation can be made for caches on different levels, the first
* nested sparse array is of types virResctrlAllocPerLevel. For example if you
* have allocation for level 3 cache, there will be three NULL pointers and then
* allocated pointer to virResctrlAllocPerLevel. That way you can access it by
* `alloc[level]` as O(1) is desired instead of crawling through normal arrays
* or lists in three nested loops. The code uses a lot of direct accesses.
*
* Each virResctrlAllocPerLevel can have allocations for different cache
* allocation types. You can allocate instruction cache (VIR_CACHE_TYPE_CODE),
* data cache (VIR_CACHE_TYPE_DATA) or unified cache (VIR_CACHE_TYPE_BOTH).
* Those allocations are kept in sparse array of virResctrlAllocPerType pointers.
*
* For each virResctrlAllocPerType users can request some size of the cache to
* be allocated. That's what the sparse array `sizes` is for. Non-NULL
* pointers represent requested size allocations. The array is indexed by host
* cache id (gotten from `/sys/devices/system/cpu/cpuX/cache/indexY/id`). Users
* can see this information e.g. in the output of `virsh capabilities` (for that
* information there's the other struct, namely `virResctrlInfo`).
*
* When allocation is being created we need to find unused part of the cache for
* all of them. While doing that we store the bitmask in a sparse array of
* virBitmaps named `masks` indexed the same way as `sizes`. The upper bounds
* of the sparse arrays are stored in nmasks or nsizes, respectively.
*
* =====Memory Bandwidth allocation technology (MBA)=====
*
* The memory bandwidth allocation support in virResctrlAlloc works in the
* same fashion as CAT. However, memory bandwidth controller doesn't have a
* hierarchy organization as cache, each node have one memory bandwidth
* controller to memory bandwidth distribution. The number of memory bandwidth
* controller is identical with number of last level cache. So MBA also employs
* a sparse array to represent whether a memory bandwidth allocation happens
* on corresponding node. The available memory controller number is collected
* in 'virResctrlInfo'.
*
* =====Cache monitoring technology (CMT)=====
*
* Cache monitoring technology is used to perceive how many cache the process
* is using actually. virResctrlMonitor represents the resource control
* monitoring group, it is supported to monitor resource utilization
* information on granularity of vcpu.
*
* From a hardware perspective, cache monitoring technology (CMT), memory
* bandwidth technology (MBM), as well as the CAT and MBA, are all orthogonal
* features. The monitor will be created under the scope of default resctrl
* group if no specific CAT or MBA entries are provided for the guest.
*/
/* Requested sizes and computed masks for one (cache level, cache type)
 * pair of an allocation. Both arrays are sparse and are grown on demand by
 * virResctrlAllocUpdateSize() / virResctrlAllocUpdateMask(). */
struct _virResctrlAllocPerType {
/* There could be bool saying whether this is set or not, but since everything
* in virResctrlAlloc (and most of libvirt) goes with pointer arrays we would
* have to have one more level of allocation anyway, so this stays faithful to
* the concept */
unsigned long long **sizes; /* indexed by cache id; NULL = no request */
size_t nsizes;
/* Mask for each cache */
virBitmap **masks; /* indexed by cache id; NULL = no mask stored */
size_t nmasks;
};
/* Allocations for one cache level; created lazily by
 * virResctrlAllocGetType(). */
struct _virResctrlAllocPerLevel {
virResctrlAllocPerType **types; /* Indexed with enum virCacheType */
/* There is no `ntypes` member variable as it is always allocated for
* VIR_CACHE_TYPE_LAST number of items */
};
/*
* virResctrlAllocMemBW represents one memory bandwidth allocation.
* Since it can have several last level caches in a NUMA system, it is
* also represented as a nested sparse array, like virResctrlAllocPerLevel.
*/
struct _virResctrlAllocMemBW {
/* Sparse array indexed by node id; a NULL entry means that no bandwidth
 * was set for that node (see virResctrlAllocSetMemoryBandwidth()). */
unsigned int **bandwidths;
size_t nbandwidths;
};
/* One resource allocation; created by virResctrlAllocNew() and torn down
 * by virResctrlAllocDispose(). */
struct _virResctrlAlloc {
virObject parent;
/* Sparse array indexed by cache level */
virResctrlAllocPerLevel **levels;
size_t nlevels;
/* NULL until the first memory bandwidth value is set */
virResctrlAllocMemBW *mem_bw;
/* The identifier (any unique string for now) */
char *id;
/* libvirt-generated path in /sys/fs/resctrl for this particular
* allocation */
char *path;
};
/*
* virResctrlMonitor is the data structure for resctrl monitor. Resctrl
* monitor represents a resctrl monitoring group, which can be used to
* monitor the resource utilization information for either cache or
* memory bandwidth.
*/
struct _virResctrlMonitor {
virObject parent;
/* Each virResctrlMonitor is associated with one specific allocation,
* either the root directory allocation under /sys/fs/resctrl or a
* specific allocation defined under the root directory.
* This pointer points to the allocation this monitor is associated with.
* virResctrlMonitorDispose() drops one reference on it.
*/
virResctrlAlloc *alloc;
/* The monitor identifier. For a monitor has the same @path name as its
* @alloc, the @id will be set to the same value as it is in @alloc->id.
*/
char *id;
/* libvirt-generated path in /sys/fs/resctrl for this particular
* monitor */
char *path;
};
/* Dispose callback for virResctrlAlloc: walks the nested sparse arrays
 * (level -> type -> per-cache sizes/masks), then the memory bandwidth
 * array, and finally releases the identifier, path and level array. */
static void
virResctrlAllocDispose(void *obj)
{
    virResctrlAlloc *self = obj;
    virResctrlAllocMemBW *bw = self->mem_bw;
    size_t lvl;
    size_t idx;

    for (lvl = 0; lvl < self->nlevels; lvl++) {
        virResctrlAllocPerLevel *per_level = self->levels[lvl];
        size_t t;

        if (per_level == NULL)
            continue;

        for (t = 0; t < VIR_CACHE_TYPE_LAST; t++) {
            virResctrlAllocPerType *per_type = per_level->types[t];

            if (per_type == NULL)
                continue;

            for (idx = 0; idx < per_type->nsizes; idx++)
                g_free(per_type->sizes[idx]);
            g_free(per_type->sizes);

            for (idx = 0; idx < per_type->nmasks; idx++)
                virBitmapFree(per_type->masks[idx]);
            g_free(per_type->masks);

            g_free(per_type);
        }

        g_free(per_level->types);
        g_free(per_level);
    }

    if (bw != NULL) {
        for (idx = 0; idx < bw->nbandwidths; idx++)
            g_free(bw->bandwidths[idx]);
        g_free(bw->bandwidths);
        g_free(bw);
    }

    g_free(self->id);
    g_free(self->path);
    g_free(self->levels);
}
/* Dispose callback for virResctrlMonitor: drops the reference held on the
 * associated allocation and frees the identifier and path strings. */
static void
virResctrlMonitorDispose(void *obj)
{
    virResctrlMonitor *mon = obj;

    virObjectUnref(mon->alloc);

    g_free(mon->id);
    g_free(mon->path);
}
/* Global initialization for classes */
/* Registers the three object classes used in this file, in the order
 * info, alloc, monitor, stopping at the first failure.
 *
 * Returns 0 on success, -1 if any class could not be created. */
static int
virResctrlOnceInit(void)
{
    if (!VIR_CLASS_NEW(virResctrlInfo, virClassForObject()) ||
        !VIR_CLASS_NEW(virResctrlAlloc, virClassForObject()) ||
        !VIR_CLASS_NEW(virResctrlMonitor, virClassForObject())) {
        return -1;
    }

    return 0;
}
VIR_ONCE_GLOBAL_INIT(virResctrl);
/* Common functions */
#ifndef WIN32
/* Take an exclusive flock() on the resctrl mount point.
 *
 * Returns the open, locked file descriptor on success (to be handed to
 * virResctrlUnlock() later), or -1 with an error reported. */
static int
virResctrlLock(void)
{
    int lockfd = open(SYSFS_RESCTRL_PATH, O_RDONLY | O_CLOEXEC);

    if (lockfd < 0) {
        virReportSystemError(errno, "%s", _("Cannot open resctrl"));
        return -1;
    }

    if (flock(lockfd, LOCK_EX) >= 0)
        return lockfd;

    virReportSystemError(errno, "%s", _("Cannot lock resctrl"));
    VIR_FORCE_CLOSE(lockfd);
    return -1;
}
/* Release the lock taken by virResctrlLock() and close its descriptor.
 *
 * @fd: descriptor returned by virResctrlLock(); -1 is accepted and ignored.
 *
 * The lock is dropped explicitly before the descriptor is closed. The
 * previous implementation only tried flock(LOCK_UN) after a failed close,
 * i.e. on a descriptor that had already been through VIR_CLOSE() and could
 * no longer be used, so that fallback could never release anything.
 *
 * Returns 0 on success (or for fd == -1), -1 with an error reported if the
 * descriptor could not be closed.
 */
static int
virResctrlUnlock(int fd)
{
    int unlock_errno = 0;

    if (fd == -1)
        return 0;

    /* Closing the fd releases the lock as well, but in case the close fails
     * we want the lock to be gone already. Remember the errno, it is only
     * worth reporting if the close fails too. */
    if (flock(fd, LOCK_UN) < 0)
        unlock_errno = errno;

    if (VIR_CLOSE(fd) < 0) {
        virReportSystemError(errno, "%s", _("Cannot close resctrl"));

        if (unlock_errno != 0)
            virReportSystemError(unlock_errno, "%s", _("Cannot unlock resctrl"));

        return -1;
    }

    return 0;
}
#else /* WIN32 */
/* Stub for platforms without resctrl locking support: always reports
 * ENOSYS and returns -1. */
static int
virResctrlLock(void)
{
    virReportSystemError(ENOSYS, "%s",
                         _("resctrl locking is not supported on this platform"));
    return -1;
}
/* Stub for platforms without resctrl locking support: @fd is ignored, the
 * call always reports ENOSYS and returns -1. */
static int
virResctrlUnlock(int fd G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, "%s",
                         _("resctrl locking is not supported on this platform"));
    return -1;
}
#endif /* WIN32 */
/* virResctrlInfo-related definitions */
/*
 * virResctrlGetCacheInfo:
 * @resctrl: info object whose per-level/per-type cache records get filled
 * @dirp: open directory handle for SYSFS_RESCTRL_PATH "/info"
 *
 * Walk the info directory and, for every entry named "L<level><suffix>"
 * whose suffix is a known resctrl cache type, read num_closids, cbm_mask
 * and min_cbm_bits and store them in @resctrl->levels[level]->types[type].
 * Entries with other names are ignored, and so are entries that lack a
 * num_closids file.
 *
 * Returns 0 on success, -1 on failure (including a failure to read the
 * directory itself).
 */
static int
virResctrlGetCacheInfo(virResctrlInfo *resctrl,
                       DIR *dirp)
{
    int rv = -1;
    struct dirent *ent = NULL;

    while ((rv = virDirRead(dirp, &ent, SYSFS_RESCTRL_PATH "/info")) > 0) {
        g_autofree char *cbm_mask_str = NULL;
        g_autoptr(virBitmap) cbm_mask_map = NULL;
        char *endptr = NULL;
        int type = 0;
        unsigned int level = 0;
        virResctrlInfoPerLevel *i_level = NULL;
        g_autofree virResctrlInfoPerType *i_type = NULL;

        VIR_DEBUG("Parsing info type '%s'", ent->d_name);

        if (ent->d_name[0] != 'L')
            continue;

        if (virStrToLong_uip(ent->d_name + 1, &endptr, 10, &level) < 0) {
            VIR_DEBUG("Cannot parse resctrl cache info level '%s'", ent->d_name + 1);
            continue;
        }

        /* Whatever follows the level number selects the cache type */
        type = virResctrlTypeFromString(endptr);
        if (type < 0) {
            VIR_DEBUG("Ignoring resctrl cache info with suffix '%s'", endptr);
            continue;
        }

        i_type = g_new0(virResctrlInfoPerType, 1);
        i_type->control.scope = type;

        rv = virFileReadValueUint(&i_type->control.max_allocation,
                                  SYSFS_RESCTRL_PATH "/info/%s/num_closids",
                                  ent->d_name);
        if (rv == -2) {
            /* The file doesn't exist, so it's unusable for us,
             * but we can scan further */
            VIR_WARN("The path '" SYSFS_RESCTRL_PATH "/info/%s/num_closids' "
                     "does not exist",
                     ent->d_name);
            /* Skip this entry; i_type is released automatically */
            continue;
        } else if (rv < 0) {
            /* Other failures are fatal, so just quit */
            return -1;
        }

        rv = virFileReadValueString(&cbm_mask_str,
                                    SYSFS_RESCTRL_PATH
                                    "/info/%s/cbm_mask",
                                    ent->d_name);
        if (rv == -2) {
            /* If the previous file exists, so should this one. Hence -2 is
             * fatal in this case as well (errors out in next condition) - the
             * kernel interface might've changed too much or something else is
             * wrong. */
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Cannot get cbm_mask from resctrl cache info"));
        }
        if (rv < 0)
            return -1;

        virStringTrimOptionalNewline(cbm_mask_str);

        if (!(cbm_mask_map = virBitmapNewString(cbm_mask_str))) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Cannot parse cbm_mask from resctrl cache info"));
            return -1;
        }

        i_type->bits = virBitmapCountBits(cbm_mask_map);

        rv = virFileReadValueUint(&i_type->min_cbm_bits,
                                  SYSFS_RESCTRL_PATH "/info/%s/min_cbm_bits",
                                  ent->d_name);
        if (rv == -2)
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Cannot get min_cbm_bits from resctrl cache info"));
        if (rv < 0)
            return -1;

        /* Grow the sparse level array so that levels[level] is addressable */
        if (resctrl->nlevels <= level)
            VIR_EXPAND_N(resctrl->levels, resctrl->nlevels,
                         level - resctrl->nlevels + 1);

        if (!resctrl->levels[level]) {
            virResctrlInfoPerType **types = NULL;

            types = g_new0(virResctrlInfoPerType *, VIR_CACHE_TYPE_LAST);

            resctrl->levels[level] = g_new0(virResctrlInfoPerLevel, 1);
            resctrl->levels[level]->types = types;
        }

        i_level = resctrl->levels[level];

        if (i_level->types[type]) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Duplicate cache type in resctrl for level %u"),
                           level);
            return -1;
        }

        i_level->types[type] = g_steal_pointer(&i_type);
    }

    /* The loop ends on 0 (end of directory) or on a negative value; the
     * latter must not be reported as success. */
    if (rv < 0)
        return -1;

    return 0;
}
/*
 * virResctrlGetMemoryBandwidthInfo:
 * @resctrl: info object to receive the memory bandwidth capabilities
 *
 * Read bandwidth_gran, min_bandwidth and num_closids from the "info/MB"
 * directory. When bandwidth_gran does not exist, memory bandwidth
 * allocation is treated as unsupported and @resctrl->membw_info stays NULL.
 *
 * Returns 0 on success (including the unsupported case), -1 on failure.
 */
static int
virResctrlGetMemoryBandwidthInfo(virResctrlInfo *resctrl)
{
    int rv = -1;
    g_autofree virResctrlInfoMemBW *i_membw = NULL;

    /* query memory bandwidth allocation info */
    i_membw = g_new0(virResctrlInfoMemBW, 1);
    rv = virFileReadValueUint(&i_membw->bandwidth_granularity,
                              SYSFS_RESCTRL_PATH "/info/MB/bandwidth_gran");
    if (rv == -2) {
        /* The file doesn't exist, so it's unusable for us,
         * probably memory bandwidth allocation unsupported */
        VIR_INFO("The path '" SYSFS_RESCTRL_PATH "/info/MB/bandwidth_gran' "
                 "does not exist");
        return 0;
    } else if (rv < 0) {
        /* Other failures are fatal, so just quit */
        return -1;
    }

    rv = virFileReadValueUint(&i_membw->min_bandwidth,
                              SYSFS_RESCTRL_PATH "/info/MB/min_bandwidth");
    if (rv == -2) {
        /* If the previous file exists, so should this one. Hence -2 is
         * fatal in this case (errors out in next condition) - the kernel
         * interface might've changed too much or something else is wrong. */
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot get min bandwidth from resctrl memory info"));
    }
    if (rv < 0)
        return -1;

    rv = virFileReadValueUint(&i_membw->max_allocation,
                              SYSFS_RESCTRL_PATH "/info/MB/num_closids");
    if (rv == -2) {
        /* Similar reasoning to min_bandwidth above. */
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot get max allocation from resctrl memory info"));
    }
    if (rv < 0)
        return -1;

    /* Everything was read; hand ownership over to @resctrl */
    resctrl->membw_info = g_steal_pointer(&i_membw);
    return 0;
}
/*
* Retrieve monitor capability from the resource control file system.
*
* The monitor capability is exposed through "SYSFS_RESCTRL_PATH/info/L3_MON"
* directory under the resource control file system. The monitor capability is
* parsed by reading the interface files and stored in the structure
* 'virResctrlInfoMongrp'.
*
* Not all host supports the resource monitor, leave the pointer
* @resctrl->monitor_info empty if not supported.
*/
/* Fill @resctrl->monitor_info from the "info/L3_MON" directory.
 *
 * Returns 0 on success, or when num_rmids does not exist (in which case
 * @resctrl->monitor_info is left untouched), -1 on failure. */
static int
virResctrlGetMonitorInfo(virResctrlInfo *resctrl)
{
    int rv = -1;
    g_autofree char *featurestr = NULL;
    g_autofree virResctrlInfoMongrp *info_monitor = NULL;

    info_monitor = g_new0(virResctrlInfoMongrp, 1);

    /* For now, monitor only exists in level 3 cache */
    info_monitor->cache_level = 3;

    rv = virFileReadValueUint(&info_monitor->max_monitor,
                              SYSFS_RESCTRL_PATH "/info/L3_MON/num_rmids");
    if (rv == -2) {
        /* The file doesn't exist, so it's unusable for us, probably resource
         * monitor unsupported */
        VIR_INFO("The file '" SYSFS_RESCTRL_PATH "/info/L3_MON/num_rmids' "
                 "does not exist");
        return 0;
    } else if (rv < 0) {
        /* Other failures are fatal, so just quit */
        return -1;
    }

    rv = virFileReadValueUint(&info_monitor->cache_reuse_threshold,
                              SYSFS_RESCTRL_PATH
                              "/info/L3_MON/max_threshold_occupancy");
    if (rv == -2) {
        /* If CMT is not supported, then 'max_threshold_occupancy' file
         * will not exist. */
        VIR_DEBUG("File '" SYSFS_RESCTRL_PATH
                  "/info/L3_MON/max_threshold_occupancy' does not exist");
    } else if (rv < 0) {
        return -1;
    }

    rv = virFileReadValueString(&featurestr,
                                SYSFS_RESCTRL_PATH
                                "/info/L3_MON/mon_features");
    if (rv == -2)
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Cannot get mon_features from resctrl"));
    if (rv < 0)
        return -1;

    if (!*featurestr) {
        /* If no feature found in "/info/L3_MON/mon_features",
         * some error happens */
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Got empty feature list from resctrl"));
        return -1;
    }

    /* One feature name per line */
    info_monitor->features = g_strsplit(featurestr, "\n", 0);
    info_monitor->nfeatures = g_strv_length(info_monitor->features);
    /* nfeatures is a size_t, hence %zu */
    VIR_DEBUG("Resctrl supported %zu monitoring features", info_monitor->nfeatures);

    resctrl->monitor_info = g_steal_pointer(&info_monitor);

    return 0;
}
/* Populate @resctrl from SYSFS_RESCTRL_PATH "/info".
 *
 * The result of virDirOpenIfExists() is returned unchanged when it is
 * zero or negative. Otherwise memory bandwidth, cache and monitor
 * information are gathered in that order.
 *
 * Returns 0 on success, -1 if any of the three steps fails. */
static int
virResctrlGetInfo(virResctrlInfo *resctrl)
{
    g_autoptr(DIR) infodir = NULL;
    int rc;

    rc = virDirOpenIfExists(&infodir, SYSFS_RESCTRL_PATH "/info");
    if (rc <= 0)
        return rc;

    if (virResctrlGetMemoryBandwidthInfo(resctrl) < 0 ||
        virResctrlGetCacheInfo(resctrl, infodir) < 0 ||
        virResctrlGetMonitorInfo(resctrl) < 0) {
        return -1;
    }

    return 0;
}
/* Create a virResctrlInfo object and fill it with the host's resctrl
 * capabilities.
 *
 * Returns the new object, or NULL if class initialization, object creation
 * or gathering the information failed. */
virResctrlInfo *
virResctrlInfoNew(void)
{
    virResctrlInfo *info = NULL;

    if (virResctrlInitialize() < 0)
        return NULL;

    if (!(info = virObjectNew(virResctrlInfoClass)))
        return NULL;

    if (virResctrlGetInfo(info) >= 0)
        return info;

    virObjectUnref(info);
    return NULL;
}
/* Tell whether @resctrl carries any information at all.
 *
 * Returns true for a NULL pointer or when there is neither memory
 * bandwidth info, nor monitor info, nor any per-type cache record. */
static bool
virResctrlInfoIsEmpty(virResctrlInfo *resctrl)
{
    size_t lvl;
    size_t t;

    if (resctrl == NULL)
        return true;

    if (resctrl->membw_info != NULL || resctrl->monitor_info != NULL)
        return false;

    for (lvl = 0; lvl < resctrl->nlevels; lvl++) {
        virResctrlInfoPerLevel *per_level = resctrl->levels[lvl];

        /* Sparse array: the inner loop is skipped for missing levels */
        for (t = 0; per_level != NULL && t < VIR_CACHE_TYPE_LAST; t++) {
            if (per_level->types[t] != NULL)
                return false;
        }
    }

    return true;
}
/* Copy the memory bandwidth capabilities into @control, provided that
 * @level is the recorded last level cache.
 *
 * Returns 1 when @control was filled, 0 when there is no memory bandwidth
 * information or @level is not the last level cache. */
int
virResctrlInfoGetMemoryBandwidth(virResctrlInfo *resctrl,
                                 unsigned int level,
                                 virResctrlInfoMemBWPerNode *control)
{
    virResctrlInfoMemBW *bw = resctrl->membw_info;

    if (bw == NULL || bw->last_level_cache != level)
        return 0;

    control->min = bw->min_bandwidth;
    control->granularity = bw->bandwidth_granularity;
    control->max_allocation = bw->max_allocation;

    return 1;
}
/*
 * virResctrlInfoGetCache:
 * @resctrl: host resctrl information (may be NULL or empty)
 * @level: cache level being queried
 * @size: size of the cache being queried
 * @ncontrols: number of entries in @controls, updated on return
 * @controls: array that receives one newly allocated copy of the control
 *            data per cache type known for @level
 *
 * Apart from filling @controls, the call updates internal bookkeeping: the
 * last level cache / max id counters of the memory bandwidth info, and the
 * cache size, granularity, minimum and max cache id of every type at
 * @level.
 *
 * Returns 0 on success (also when there is nothing to report), -1 on
 * failure, in which case @controls is freed and @ncontrols is reset to 0.
 */
int
virResctrlInfoGetCache(virResctrlInfo *resctrl,
                       unsigned int level,
                       unsigned long long size,
                       size_t *ncontrols,
                       virResctrlInfoPerCache ***controls)
{
    virResctrlInfoPerLevel *i_level = NULL;
    virResctrlInfoPerType *i_type = NULL;
    size_t i = 0;

    if (virResctrlInfoIsEmpty(resctrl))
        return 0;

    /* Let's take the opportunity to update the number of last level
     * cache. This number of memory bandwidth controller is same with
     * last level cache */
    if (resctrl->membw_info) {
        virResctrlInfoMemBW *membw_info = resctrl->membw_info;

        if (level > membw_info->last_level_cache) {
            membw_info->last_level_cache = level;
            membw_info->max_id = 0;
        } else if (membw_info->last_level_cache == level) {
            membw_info->max_id++;
        }
    }

    if (level >= resctrl->nlevels)
        return 0;

    i_level = resctrl->levels[level];
    if (!i_level)
        return 0;

    for (i = 0; i < VIR_CACHE_TYPE_LAST; i++) {
        i_type = i_level->types[i];
        if (!i_type)
            continue;

        /* Let's take the opportunity to update our internal information about
         * the cache size */
        if (!i_type->size) {
            /* The granularity is size / bits; a capacity bitmask without
             * any bit set would make that a division by zero. */
            if (i_type->bits == 0) {
                virReportError(VIR_ERR_INTERNAL_ERROR,
                               _("level %u cache has no bits set in its "
                                 "capacity bitmask"),
                               level);
                goto error;
            }

            i_type->size = size;
            i_type->control.granularity = size / i_type->bits;
            if (i_type->min_cbm_bits != 1)
                i_type->control.min = i_type->min_cbm_bits * i_type->control.granularity;
        } else {
            if (i_type->size != size) {
                virReportError(VIR_ERR_INTERNAL_ERROR,
                               _("level %u cache size %llu does not match "
                                 "expected size %llu"),
                               level, i_type->size, size);
                goto error;
            }
            i_type->max_cache_id++;
        }

        /* Append a private copy of the control data for the caller */
        VIR_EXPAND_N(*controls, *ncontrols, 1);
        (*controls)[*ncontrols - 1] = g_new0(virResctrlInfoPerCache, 1);
        memcpy((*controls)[*ncontrols - 1], &i_type->control, sizeof(i_type->control));
    }

    return 0;

 error:
    while (*ncontrols)
        VIR_FREE((*controls)[--*ncontrols]);
    VIR_FREE(*controls);
    return -1;
}
/* virResctrlInfoGetMonitorPrefix
*
* @resctrl: Pointer to virResctrlInfo
* @prefix: Monitor prefix name for monitor looking for.
* @monitor: Returns the capability information for target monitor if the
* monitor with @prefix is supported by host.
*
* Return monitor capability information for @prefix through @monitor.
* If monitor with @prefix is not supported in system, @monitor will be
* cleared to NULL.
*
* Returns 0 if @monitor is created or monitor type with @prefix is not
* supported by host, -1 on failure with error message set.
*/
/* Build the capability record for the monitor type named by @prefix.
 *
 * @resctrl: host resctrl information (may be NULL or empty)
 * @prefix: one of the virResctrlMonitorPrefix strings
 * @monitor: in/out; whatever it pointed to is freed once @prefix has been
 *           validated and monitor information is available. On return it
 *           holds the new record, or NULL if no host feature name starts
 *           with @prefix.
 *
 * Returns 0 on success (also when monitoring is not available at all, in
 * which case @monitor is left untouched), -1 with an error reported when
 * @prefix is NULL or unknown.
 */
int
virResctrlInfoGetMonitorPrefix(virResctrlInfo *resctrl,
                               const char *prefix,
                               virResctrlInfoMon **monitor)
{
    size_t i = 0;
    virResctrlInfoMongrp *mongrp_info = NULL;
    virResctrlInfoMon *mon = NULL;

    if (!prefix) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Empty prefix name for resctrl monitor"));
        return -1;
    }

    if (virResctrlInfoIsEmpty(resctrl))
        return 0;

    mongrp_info = resctrl->monitor_info;

    if (!mongrp_info) {
        VIR_INFO("Monitor is not supported in host");
        return 0;
    }

    /* Map the prefix string back to its monitor type */
    for (i = 0; i < VIR_RESCTRL_MONITOR_TYPE_LAST; i++) {
        if (STREQ(prefix, virResctrlMonitorPrefixTypeToString(i))) {
            mon = g_new0(virResctrlInfoMon, 1);
            mon->type = i;
            break;
        }
    }

    if (!mon) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Bad prefix name '%s' for resctrl monitor"),
                       prefix);
        return -1;
    }

    mon->max_monitor = mongrp_info->max_monitor;

    if (mon->type == VIR_RESCTRL_MONITOR_TYPE_CACHE) {
        mon->cache_reuse_threshold = mongrp_info->cache_reuse_threshold;
        mon->cache_level = mongrp_info->cache_level;
    }

    /* Collect the features carrying this prefix; the array is sized for
     * the worst case first and shrunk afterwards, always keeping room for
     * the terminating NULL. */
    mon->features = g_new0(char *, mongrp_info->nfeatures + 1);

    for (i = 0; i < mongrp_info->nfeatures; i++) {
        if (STRPREFIX(mongrp_info->features[i], prefix))
            mon->features[mon->nfeatures++] = g_strdup(mongrp_info->features[i]);
    }

    mon->features = g_renew(char *, mon->features, mon->nfeatures + 1);

    /* In case *monitor is pointed to some monitor, clean it. The pointer
     * must be reset as well, otherwise the caller is left with a dangling
     * pointer when no feature matched below. */
    virResctrlInfoMonFree(*monitor);
    *monitor = NULL;

    if (mon->nfeatures == 0) {
        /* No feature found for current monitor, means host does not support
         * monitor type with @prefix name.
         * Telling caller this monitor is supported by hardware specification,
         * but not supported by this host. */
        VIR_INFO("No resctrl monitor features using prefix '%s' found", prefix);
        virResctrlInfoMonFree(mon);
        return 0;
    }

    *monitor = mon;
    return 0;
}
/* virResctrlAlloc-related definitions */
/* Create an empty allocation object.
 *
 * Returns the new object, or NULL if class initialization or object
 * creation failed. */
virResctrlAlloc *
virResctrlAllocNew(void)
{
    if (virResctrlInitialize() >= 0)
        return virObjectNew(virResctrlAllocClass);

    return NULL;
}
/* Tell whether @alloc holds any allocation.
 *
 * Returns true for a NULL pointer or when there is neither a memory
 * bandwidth record nor any non-NULL size or mask entry at any level. */
bool
virResctrlAllocIsEmpty(virResctrlAlloc *alloc)
{
    size_t lvl;
    size_t t;
    size_t n;

    if (alloc == NULL)
        return true;

    if (alloc->mem_bw != NULL)
        return false;

    for (lvl = 0; lvl < alloc->nlevels; lvl++) {
        virResctrlAllocPerLevel *per_level = alloc->levels[lvl];

        if (per_level == NULL)
            continue;

        for (t = 0; t < VIR_CACHE_TYPE_LAST; t++) {
            virResctrlAllocPerType *per_type = per_level->types[t];

            if (per_type == NULL)
                continue;

            for (n = 0; n < per_type->nsizes; n++) {
                if (per_type->sizes[n] != NULL)
                    return false;
            }

            for (n = 0; n < per_type->nmasks; n++) {
                if (per_type->masks[n] != NULL)
                    return false;
            }
        }
    }

    return true;
}
/* Return the per-type record for @level/@type of @alloc, creating the
 * level slot, the level record, its type array and the type record itself
 * as needed. */
static virResctrlAllocPerType *
virResctrlAllocGetType(virResctrlAlloc *alloc,
                       unsigned int level,
                       virCacheType type)
{
    virResctrlAllocPerLevel *per_level = NULL;

    /* Make sure levels[level] is addressable */
    if (level >= alloc->nlevels)
        VIR_EXPAND_N(alloc->levels, alloc->nlevels, level - alloc->nlevels + 1);

    per_level = alloc->levels[level];
    if (per_level == NULL) {
        per_level = g_new0(virResctrlAllocPerLevel, 1);
        per_level->types = g_new0(virResctrlAllocPerType *, VIR_CACHE_TYPE_LAST);
        alloc->levels[level] = per_level;
    }

    if (per_level->types[type] == NULL)
        per_level->types[type] = g_new0(virResctrlAllocPerType, 1);

    return per_level->types[type];
}
/* Store a copy of @mask for cache @cache of @level/@type in @alloc,
 * replacing any mask stored there before.
 *
 * Returns 0 on success, -1 if the per-type record is unavailable. */
static int
virResctrlAllocUpdateMask(virResctrlAlloc *alloc,
                          unsigned int level,
                          virCacheType type,
                          unsigned int cache,
                          virBitmap *mask)
{
    virResctrlAllocPerType *per_type = virResctrlAllocGetType(alloc, level, type);

    if (per_type == NULL)
        return -1;

    /* Make sure masks[cache] is addressable */
    if (cache >= per_type->nmasks)
        VIR_EXPAND_N(per_type->masks, per_type->nmasks,
                     cache - per_type->nmasks + 1);

    /* The slot may be empty; virResctrlAllocDispose() likewise hands
     * possibly-NULL slots to virBitmapFree(). */
    virBitmapFree(per_type->masks[cache]);
    per_type->masks[cache] = virBitmapNewCopy(mask);

    return 0;
}
/* Record the requested @size for cache @cache of @level/@type in @alloc,
 * allocating the slot on first use and overwriting it otherwise.
 *
 * Returns 0 on success, -1 if the per-type record is unavailable. */
static int
virResctrlAllocUpdateSize(virResctrlAlloc *alloc,
                          unsigned int level,
                          virCacheType type,
                          unsigned int cache,
                          unsigned long long size)
{
    virResctrlAllocPerType *per_type = virResctrlAllocGetType(alloc, level, type);
    unsigned long long *slot = NULL;

    if (per_type == NULL)
        return -1;

    /* Make sure sizes[cache] is addressable */
    if (cache >= per_type->nsizes)
        VIR_EXPAND_N(per_type->sizes, per_type->nsizes,
                     cache - per_type->nsizes + 1);

    slot = per_type->sizes[cache];
    if (slot == NULL) {
        slot = g_new0(unsigned long long, 1);
        per_type->sizes[cache] = slot;
    }

    *slot = size;
    return 0;
}
/*
* Check if there is an allocation for this level/type/cache already. Called
* before updating the structure. VIR_CACHE_TYPE_BOTH collides with any type,
* the other types collide with itself. This code basically checks if either:
* `alloc[level]->types[type]->sizes[cache]`
* or
* `alloc[level]->types[VIR_CACHE_TYPE_BOTH]->sizes[cache]`
* is non-NULL. All the fuzz around it is checking for NULL pointers along
* the way.
*/
static bool
virResctrlAllocCheckCollision(virResctrlAlloc *alloc,
unsigned int level,
virCacheType type,
unsigned int cache)
{
virResctrlAllocPerLevel *a_level = NULL;
virResctrlAllocPerType *a_type = NULL;
if (!alloc)
return false;
if (alloc->nlevels <= level)
return false;
a_level = alloc->levels[level];
if (!a_level)
return false;
a_type = a_level->types[VIR_CACHE_TYPE_BOTH];
/* If there is an allocation for type 'both', there can be no other
* allocation for the same cache */
if (a_type && a_type->nsizes > cache && a_type->sizes[cache])
return true;
if (type == VIR_CACHE_TYPE_BOTH) {
a_type = a_level->types[VIR_CACHE_TYPE_CODE];
if (a_type && a_type->nsizes > cache && a_type->sizes[cache])
return true;
a_type = a_level->types[VIR_CACHE_TYPE_DATA];
if (a_type && a_type->nsizes > cache && a_type->sizes[cache])
return true;
} else {
a_type = a_level->types[type];
if (a_type && a_type->nsizes > cache && a_type->sizes[cache])
return true;
}
return false;
}
/* Request @size for cache @cache of @level/@type in @alloc.
 *
 * Returns 0 on success, -1 with an XML error reported when the request
 * collides with an allocation already recorded, or whatever failure
 * virResctrlAllocUpdateSize() returns. */
int
virResctrlAllocSetCacheSize(virResctrlAlloc *alloc,
                            unsigned int level,
                            virCacheType type,
                            unsigned int cache,
                            unsigned long long size)
{
    if (!virResctrlAllocCheckCollision(alloc, level, type, cache))
        return virResctrlAllocUpdateSize(alloc, level, type, cache, size);

    virReportError(VIR_ERR_XML_ERROR,
                   _("Colliding cache allocations for cache "
                     "level '%u' id '%u', type '%s'"),
                   level, cache, virCacheTypeToString(type));
    return -1;
}
/* Call @cb for every size recorded in @alloc, in ascending order of level,
 * type and cache id, passing @opaque through.
 *
 * Returns 0 when all callbacks succeeded (or @alloc is NULL); otherwise
 * the first negative value returned by @cb, at which point iteration
 * stops. */
int
virResctrlAllocForeachCache(virResctrlAlloc *alloc,
                            virResctrlAllocForeachCacheCallback cb,
                            void *opaque)
{
    unsigned int level;
    unsigned int type;
    unsigned int cache;

    if (alloc == NULL)
        return 0;

    for (level = 0; level < alloc->nlevels; level++) {
        virResctrlAllocPerLevel *per_level = alloc->levels[level];

        if (per_level == NULL)
            continue;

        for (type = 0; type < VIR_CACHE_TYPE_LAST; type++) {
            virResctrlAllocPerType *per_type = per_level->types[type];

            if (per_type == NULL)
                continue;

            for (cache = 0; cache < per_type->nsizes; cache++) {
                int rc;

                if (per_type->sizes[cache] == NULL)
                    continue;

                rc = cb(level, type, cache, *per_type->sizes[cache], opaque);
                if (rc < 0)
                    return rc;
            }
        }
    }

    return 0;
}
/* virResctrlAllocSetMemoryBandwidth
* @alloc: Pointer to an active allocation
* @id: node id of MBA to be set
* @memory_bandwidth: new memory bandwidth value
*
* Set the @memory_bandwidth for the node @id entry in the @alloc.
*
* Returns 0 on success, -1 on failure with error message set.
*/
/* Values above 100 and nodes that already have a value are rejected with
 * an XML error; the memory bandwidth record and the node slot are created
 * on demand. */
int
virResctrlAllocSetMemoryBandwidth(virResctrlAlloc *alloc,
                                  unsigned int id,
                                  unsigned int memory_bandwidth)
{
    virResctrlAllocMemBW *bw = alloc->mem_bw;
    unsigned int *slot = NULL;

    if (memory_bandwidth > 100) {
        virReportError(VIR_ERR_XML_ERROR, "%s",
                       _("Memory Bandwidth value exceeding 100 is invalid."));
        return -1;
    }

    if (bw == NULL) {
        bw = g_new0(virResctrlAllocMemBW, 1);
        alloc->mem_bw = bw;
    }

    /* Make sure bandwidths[id] is addressable */
    if (id >= bw->nbandwidths)
        VIR_EXPAND_N(bw->bandwidths, bw->nbandwidths,
                     id - bw->nbandwidths + 1);

    if (bw->bandwidths[id] != NULL) {
        virReportError(VIR_ERR_XML_ERROR,
                       _("Memory Bandwidth already defined for node %u"),
                       id);
        return -1;
    }

    slot = g_new0(unsigned int, 1);
    *slot = memory_bandwidth;
    bw->bandwidths[id] = slot;

    return 0;
}
/* virResctrlAllocForeachMemory
 * @alloc: Pointer to an active allocation, may be NULL
 * @cb: Callback function
 * @opaque: Opaque data to be passed to @cb
 *
 * Call @cb for every node that has a memory bandwidth value defined in
 * @alloc. Nothing is done when @alloc is NULL or has no memory bandwidth
 * data.
 *
 * Returns 0 on success, -1 as soon as @cb returns a negative value.
 */
int
virResctrlAllocForeachMemory(virResctrlAlloc *alloc,
                             virResctrlAllocForeachMemoryCallback cb,
                             void *opaque)
{
    virResctrlAllocMemBW *bw = alloc ? alloc->mem_bw : NULL;
    size_t node;

    if (!bw)
        return 0;

    for (node = 0; node < bw->nbandwidths; node++) {
        unsigned int *value = bw->bandwidths[node];

        /* Skip nodes without a defined bandwidth */
        if (!value)
            continue;

        if (cb(node, *value, opaque) < 0)
            return -1;
    }

    return 0;
}
/* virResctrlSetID
 * @resctrlid: location of the id string to fill in
 * @id: new id, must not be NULL
 *
 * Stores a copy of @id in @resctrlid. An already set id is never
 * overwritten.
 *
 * Returns 0 on success, -1 with an error reported when @id is NULL or
 * @resctrlid already holds a value.
 */
static int
virResctrlSetID(char **resctrlid,
                const char *id)
{
    if (id == NULL) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("New resctrl 'id' cannot be NULL"));
        return -1;
    }

    if (*resctrlid != NULL) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Attempt to overwrite resctrlid='%s' with id='%s'"),
                       *resctrlid, id);
        return -1;
    }

    *resctrlid = g_strdup(id);

    return 0;
}
/* virResctrlAllocSetID
 * @alloc: allocation whose id is being set
 * @id: new id
 *
 * Sets the id of @alloc through virResctrlSetID().
 *
 * Returns the result of virResctrlSetID().
 */
int
virResctrlAllocSetID(virResctrlAlloc *alloc,
                     const char *id)
{
    char **slot = &alloc->id;

    return virResctrlSetID(slot, id);
}
/* virResctrlAllocGetID
 * @alloc: allocation to query
 *
 * Returns the id string stored in @alloc; the string stays owned
 * by @alloc.
 */
const char *
virResctrlAllocGetID(virResctrlAlloc *alloc)
{
    const char *id = alloc->id;

    return id;
}
/* Format the Memory Bandwidth Allocation line that will be found in
 * the schemata files. The line should start with "MB:" and be
 * followed by "id=value" pairs separated by a semi-colon such as:
 *
 *     MB:0=100;1=100
 *
 * which indicates node id 0 has 100 percent bandwidth and node id 1
 * has 100 percent bandwidth. A trailing semi-colon is not formatted.
 *
 * Nothing is added to @buf when @alloc carries no memory bandwidth data.
 *
 * Always returns 0.
 */
static int
virResctrlAllocMemoryBandwidthFormat(virResctrlAlloc *alloc,
                                     virBuffer *buf)
{
    size_t i;

    if (!alloc->mem_bw)
        return 0;

    virBufferAddLit(buf, "MB:");

    for (i = 0; i < alloc->mem_bw->nbandwidths; i++) {
        if (alloc->mem_bw->bandwidths[i]) {
            /* @i is a size_t, hence the unsigned %zu conversion */
            virBufferAsprintf(buf, "%zu=%u;", i,
                              *(alloc->mem_bw->bandwidths[i]));
        }
    }

    /* Drop the separator left behind by the last pair */
    virBufferTrim(buf, ";");
    virBufferAddChar(buf, '\n');

    return 0;
}
/* virResctrlAllocParseProcessMemoryBandwidth
 * @resctrl: host resctrl information (membw_info must be set by the caller)
 * @alloc: allocation to fill in (mem_bw must be allocated by the caller)
 * @mem_bw: single "id=value" token; it is modified in place
 *
 * Parses one "id=value" pair of an MB: schemata line and stores the
 * bandwidth for the node in @alloc, growing the bandwidth array as
 * needed. Tokens without '=' are silently ignored.
 *
 * Returns 0 on success (or ignored token), -1 with an error reported when
 * either number cannot be parsed or the values are inconsistent with the
 * host information.
 */
static int
virResctrlAllocParseProcessMemoryBandwidth(virResctrlInfo *resctrl,
                                           virResctrlAlloc *alloc,
                                           char *mem_bw)
{
    unsigned int bandwidth = 0;
    unsigned int id = 0;
    char *tmp = NULL;

    tmp = strchr(mem_bw, '=');
    if (!tmp)
        return 0;

    /* Split the token into the id (@mem_bw) and the value (@tmp) */
    *tmp = '\0';
    tmp++;

    /* On parse failure the numeric output is not set, so report the
     * offending text rather than the (unset) number. */
    if (virStrToLong_uip(mem_bw, NULL, 10, &id) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid node id '%s'"), mem_bw);
        return -1;
    }

    if (virStrToLong_uip(tmp, NULL, 10, &bandwidth) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid bandwidth '%s'"), tmp);
        return -1;
    }

    if (bandwidth < resctrl->membw_info->min_bandwidth ||
        id > resctrl->membw_info->max_id) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Missing or inconsistent resctrl info for "
                         "memory bandwidth node '%u'"), id);
        return -1;
    }

    if (alloc->mem_bw->nbandwidths <= id) {
        VIR_EXPAND_N(alloc->mem_bw->bandwidths, alloc->mem_bw->nbandwidths,
                     id - alloc->mem_bw->nbandwidths + 1);
    }

    if (!alloc->mem_bw->bandwidths[id])
        alloc->mem_bw->bandwidths[id] = g_new0(unsigned int, 1);

    *(alloc->mem_bw->bandwidths[id]) = bandwidth;

    return 0;
}
/* Parse a schemata formatted MB: entry. Format details are described in
 * virResctrlAllocMemoryBandwidthFormat.
 *
 * Lines that do not start with "MB" (after optional leading spaces) are
 * ignored. For MB lines the host memory bandwidth information must be
 * present and non-zero, otherwise an error is reported.
 *
 * Returns 0 on success or when the line is ignored, -1 on error.
 */
static int
virResctrlAllocParseMemoryBandwidthLine(virResctrlInfo *resctrl,
                                        virResctrlAlloc *alloc,
                                        char *line)
{
    g_auto(GStrv) pairs = NULL;
    char *values = NULL;
    size_t i;

    /* For no reason there can be spaces */
    virSkipSpaces((const char **) &line);

    if (STRNEQLEN(line, "MB", 2))
        return 0;

    if (!resctrl || !resctrl->membw_info ||
        !resctrl->membw_info->min_bandwidth ||
        !resctrl->membw_info->bandwidth_granularity) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Missing or inconsistent resctrl info for "
                         "memory bandwidth allocation"));
        return -1;
    }

    if (!alloc->mem_bw)
        alloc->mem_bw = g_new0(virResctrlAllocMemBW, 1);

    /* Everything after the colon is the list of id=value pairs */
    values = strchr(line, ':');
    if (!values)
        return 0;
    values++;

    pairs = g_strsplit(values, ";", 0);
    for (i = 0; pairs[i]; i++) {
        if (virResctrlAllocParseProcessMemoryBandwidth(resctrl, alloc,
                                                       pairs[i]) < 0)
            return -1;
    }

    return 0;
}
/* virResctrlAllocFormatCache
 * @alloc: allocation to format
 * @buf: output buffer
 *
 * Appends one line per existing level/type of @alloc to @buf in the form
 * "L<level><type>:<id>=<mask>;<id>=<mask>" followed by a newline, where
 * only caches with a mask set are listed.
 *
 * Returns 0 on success, -1 if a mask could not be converted to a string.
 */
static int
virResctrlAllocFormatCache(virResctrlAlloc *alloc,
                           virBuffer *buf)
{
    unsigned int lvl;

    for (lvl = 0; lvl < alloc->nlevels; lvl++) {
        virResctrlAllocPerLevel *per_level = alloc->levels[lvl];
        unsigned int t;

        if (!per_level)
            continue;

        for (t = 0; t < VIR_CACHE_TYPE_LAST; t++) {
            virResctrlAllocPerType *per_type = per_level->types[t];
            unsigned int id;

            if (!per_type)
                continue;

            virBufferAsprintf(buf, "L%u%s:", lvl, virResctrlTypeToString(t));

            for (id = 0; id < per_type->nmasks; id++) {
                g_autofree char *str = NULL;

                if (!per_type->masks[id])
                    continue;

                if (!(str = virBitmapToString(per_type->masks[id])))
                    return -1;

                virBufferAsprintf(buf, "%u=%s;", id, str);
            }

            /* No trailing separator at the end of the line */
            virBufferTrim(buf, ";");
            virBufferAddChar(buf, '\n');
        }
    }

    return 0;
}
/* virResctrlAllocFormat
 * @alloc: allocation to format, may be NULL
 *
 * Formats the cache lines followed by the memory bandwidth line of
 * @alloc into a newly allocated string.
 *
 * Returns the formatted string, or NULL when @alloc is NULL or
 * formatting failed.
 */
char *
virResctrlAllocFormat(virResctrlAlloc *alloc)
{
    g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;

    if (!alloc)
        return NULL;

    if (virResctrlAllocFormatCache(alloc, &buf) < 0 ||
        virResctrlAllocMemoryBandwidthFormat(alloc, &buf) < 0)
        return NULL;

    return virBufferContentAndReset(&buf);
}
/* virResctrlAllocParseProcessCache
 * @resctrl: host resctrl information
 * @alloc: allocation to update
 * @level: cache level the token belongs to
 * @type: cache type the token belongs to
 * @cache: single "id=mask" token; it is modified in place
 *
 * Parses one "id=mask" pair of a cache schemata line, shrinks the mask to
 * the number of bits the host reports for @level/@type and stores it in
 * @alloc. Tokens without '=' are silently ignored.
 *
 * Returns 0 on success (or ignored token), -1 on error.
 */
static int
virResctrlAllocParseProcessCache(virResctrlInfo *resctrl,
                                 virResctrlAlloc *alloc,
                                 unsigned int level,
                                 virCacheType type,
                                 char *cache)
{
    g_autoptr(virBitmap) mask = NULL;
    unsigned int cache_id = 0;
    char *mask_str = strchr(cache, '=');

    if (!mask_str)
        return 0;

    /* Terminate the id part and step over the '=' */
    *mask_str++ = '\0';

    if (virStrToLong_uip(cache, NULL, 10, &cache_id) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid cache id '%s'"), cache);
        return -1;
    }

    if (!(mask = virBitmapNewString(mask_str)))
        return -1;

    if (!resctrl ||
        level >= resctrl->nlevels ||
        !resctrl->levels[level] ||
        !resctrl->levels[level]->types[type]) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Missing or inconsistent resctrl info for "
                         "level '%u' type '%s'"),
                       level, virCacheTypeToString(type));
        return -1;
    }

    virBitmapShrink(mask, resctrl->levels[level]->types[type]->bits);

    return virResctrlAllocUpdateMask(alloc, level, type, cache_id, mask) < 0 ? -1 : 0;
}
/* virResctrlAllocParseCacheLine
 * @resctrl: host resctrl information
 * @alloc: allocation to update
 * @line: one schemata line; it is modified in place
 *
 * Parses a cache schemata line of the form "L<level><type>:<id>=<mask>;..."
 * and stores every mask in @alloc. Lines not starting with 'L' (after
 * optional leading spaces) or lacking a ':' are ignored.
 *
 * Returns 0 on success or when the line is ignored, -1 with an error
 * reported when the level, the type or one of the masks cannot be parsed.
 */
static int
virResctrlAllocParseCacheLine(virResctrlInfo *resctrl,
                              virResctrlAlloc *alloc,
                              char *line)
{
    g_auto(GStrv) caches = NULL;
    GStrv next;
    char *tmp = NULL;
    char *level_str = NULL;
    char *type_str = NULL;
    unsigned int level = 0;
    int type = -1;

    /* For no reason there can be spaces */
    virSkipSpaces((const char **) &line);

    /* Skip lines that don't concern caches, e.g. MB: etc. */
    if (line[0] != 'L')
        return 0;

    /* And lines that we can't parse too */
    tmp = strchr(line, ':');
    if (!tmp)
        return 0;

    *tmp = '\0';
    tmp++;

    /* The level number follows the leading 'L'; whatever comes after the
     * digits is the type suffix. Separate pointers are kept so that the
     * error messages show exactly the text that failed to parse. */
    level_str = line + 1;
    if (virStrToLong_uip(level_str, &type_str, 10, &level) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Cannot parse resctrl schema level '%s'"),
                       level_str);
        return -1;
    }

    type = virResctrlTypeFromString(type_str);
    if (type < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Cannot parse resctrl schema type '%s'"),
                       type_str);
        return -1;
    }

    caches = g_strsplit(tmp, ";", 0);
    if (!caches)
        return 0;

    for (next = caches; *next; next++) {
        if (virResctrlAllocParseProcessCache(resctrl, alloc, level, type, *next) < 0)
            return -1;
    }

    return 0;
}
/* virResctrlAllocParse
 * @resctrl: host resctrl information
 * @alloc: allocation to fill in
 * @schemata: content of a schemata file
 *
 * Splits @schemata into lines and feeds each of them to both the cache
 * line parser and the memory bandwidth line parser; each parser ignores
 * lines it is not responsible for.
 *
 * Returns 0 on success, -1 as soon as one parser fails.
 */
static int
virResctrlAllocParse(virResctrlInfo *resctrl,
                     virResctrlAlloc *alloc,
                     const char *schemata)
{
    g_auto(GStrv) lines = g_strsplit(schemata, "\n", 0);
    size_t i;

    for (i = 0; lines[i]; i++) {
        if (virResctrlAllocParseCacheLine(resctrl, alloc, lines[i]) < 0 ||
            virResctrlAllocParseMemoryBandwidthLine(resctrl, alloc, lines[i]) < 0)
            return -1;
    }

    return 0;
}
/* virResctrlAllocGetGroup
 * @resctrl: host resctrl information
 * @groupname: directory name of the group below SYSFS_RESCTRL_PATH
 * @alloc: filled with a new allocation on success, NULL otherwise
 *
 * Reads the schemata file of @groupname and parses it into a freshly
 * created allocation.
 *
 * Returns 0 on success, the negative value from virFileReadValueString()
 * when the file could not be read, and -1 when creating or parsing the
 * allocation failed.
 */
static int
virResctrlAllocGetGroup(virResctrlInfo *resctrl,
                        const char *groupname,
                        virResctrlAlloc **alloc)
{
    g_autofree char *schemata = NULL;
    int rv = virFileReadValueString(&schemata,
                                    SYSFS_RESCTRL_PATH "/%s/schemata",
                                    groupname);

    *alloc = NULL;

    if (rv < 0)
        return rv;

    *alloc = virResctrlAllocNew();

    if (!*alloc ||
        virResctrlAllocParse(resctrl, *alloc, schemata) < 0) {
        virObjectUnref(*alloc);
        *alloc = NULL;
        return -1;
    }

    return 0;
}
/* virResctrlAllocGetDefault
 * @resctrl: host resctrl information
 *
 * Loads the allocation of the default group, i.e. the schemata file
 * directly under SYSFS_RESCTRL_PATH.
 *
 * Returns the allocation or NULL on failure. An error is reported here
 * only for the -2 result of virResctrlAllocGetGroup().
 */
static virResctrlAlloc *
virResctrlAllocGetDefault(virResctrlInfo *resctrl)
{
    virResctrlAlloc *def = NULL;

    if (virResctrlAllocGetGroup(resctrl, ".", &def) == -2) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Could not read schemata file for the default group"));
    }

    return def;
}
/* virResctrlAllocSubtractPerType
 * @dst: per-type data to subtract from, may be NULL
 * @src: per-type data to subtract, may be NULL
 *
 * For every cache id present in both @dst and @src, subtracts the mask
 * in @src from the mask in @dst.
 */
static void
virResctrlAllocSubtractPerType(virResctrlAllocPerType *dst,
                               virResctrlAllocPerType *src)
{
    size_t id;

    if (!dst || !src)
        return;

    for (id = 0; id < dst->nmasks && id < src->nmasks; id++) {
        if (!dst->masks[id] || !src->masks[id])
            continue;

        virBitmapSubtract(dst->masks[id], src->masks[id]);
    }
}
/* virResctrlAllocSubtract
 * @dst: allocation to subtract from
 * @src: allocation to subtract, may be NULL
 *
 * Subtracts the masks of @src from the masks of @dst for every level
 * that exists in both allocations, one cache type at a time.
 */
static void
virResctrlAllocSubtract(virResctrlAlloc *dst,
                        virResctrlAlloc *src)
{
    size_t lvl;

    if (!src)
        return;

    for (lvl = 0; lvl < dst->nlevels && lvl < src->nlevels; lvl++) {
        size_t t;

        if (!dst->levels[lvl] || !src->levels[lvl])
            continue;

        for (t = 0; t < VIR_CACHE_TYPE_LAST; t++) {
            virResctrlAllocSubtractPerType(dst->levels[lvl]->types[t],
                                           src->levels[lvl]->types[t]);
        }
    }
}
/* virResctrlAllocNewFromInfo
 * @info: host resctrl information
 *
 * Creates an allocation that covers everything described by @info: for
 * each level/type a mask with all bits set is stored for every cache id
 * up to max_cache_id, and, when memory bandwidth information is present,
 * every node up to max_id gets a bandwidth of 100.
 *
 * Returns the new allocation or NULL on failure.
 */
static virResctrlAlloc *
virResctrlAllocNewFromInfo(virResctrlInfo *info)
{
    g_autoptr(virResctrlAlloc) ret = virResctrlAllocNew();
    size_t lvl;

    if (!ret)
        return NULL;

    for (lvl = 0; lvl < info->nlevels; lvl++) {
        virResctrlInfoPerLevel *i_level = info->levels[lvl];
        size_t t;

        if (!i_level)
            continue;

        for (t = 0; t < VIR_CACHE_TYPE_LAST; t++) {
            virResctrlInfoPerType *i_type = i_level->types[t];
            g_autoptr(virBitmap) full = NULL;
            size_t id;

            if (!i_type)
                continue;

            full = virBitmapNew(i_type->bits);
            virBitmapSetAll(full);

            for (id = 0; id <= i_type->max_cache_id; id++) {
                if (virResctrlAllocUpdateMask(ret, lvl, t, id, full) < 0)
                    return NULL;
            }
        }
    }

    /* set default free memory bandwidth to 100% */
    if (info->membw_info) {
        virResctrlAllocMemBW *bw = g_new0(virResctrlAllocMemBW, 1);
        size_t node;

        ret->mem_bw = bw;

        VIR_EXPAND_N(bw->bandwidths, bw->nbandwidths,
                     info->membw_info->max_id + 1);

        for (node = 0; node < bw->nbandwidths; node++) {
            bw->bandwidths[node] = g_new0(unsigned int, 1);
            *bw->bandwidths[node] = 100;
        }
    }

    return g_steal_pointer(&ret);
}
/*
 * This function creates an allocation that represents all unused parts of all
 * caches in the system. It uses virResctrlInfo for creating a new full
 * allocation with all bits set (using virResctrlAllocNewFromInfo()) and then
 * scans for all allocations under /sys/fs/resctrl and subtracts each one of
 * them from it. That way it can then return an allocation with the only bits
 * set being those that are not mentioned in any other allocation. It is used
 * for two things, a) calculating the masks when creating allocations and
 * b) from tests.
 *
 * MBA (Memory Bandwidth Allocation) is not taken into account as it is a
 * limiting setting, not an allocating one. The way it works is also vastly
 * different from CAT.
 *
 * Returns the allocation of unused resources, or NULL on error.
 */
virResctrlAlloc *
virResctrlAllocGetUnused(virResctrlInfo *resctrl)
{
    g_autoptr(virResctrlAlloc) unused = NULL;
    g_autoptr(virResctrlAlloc) alloc_default = NULL;
    g_autoptr(DIR) dirp = NULL;
    struct dirent *ent = NULL;
    int dir_rv = -1;

    if (virResctrlInfoIsEmpty(resctrl)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Resource control is not supported on this host"));
        return NULL;
    }

    if (!(unused = virResctrlAllocNewFromInfo(resctrl)))
        return NULL;

    if (!(alloc_default = virResctrlAllocGetDefault(resctrl)))
        return NULL;

    virResctrlAllocSubtract(unused, alloc_default);

    if (virDirOpen(&dirp, SYSFS_RESCTRL_PATH) < 0)
        return NULL;

    while ((dir_rv = virDirRead(dirp, &ent, SYSFS_RESCTRL_PATH)) > 0) {
        g_autoptr(virResctrlAlloc) group = NULL;
        int group_rv;

        if (STREQ(ent->d_name, "info"))
            continue;

        group_rv = virResctrlAllocGetGroup(resctrl, ent->d_name, &group);

        /* -2 is not treated as an error: the entry is simply skipped */
        if (group_rv == -2)
            continue;

        if (group_rv < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Could not read schemata file for group %s"),
                           ent->d_name);
            return NULL;
        }

        virResctrlAllocSubtract(unused, group);
    }

    /* Reading the directory itself failed */
    if (dir_rv < 0)
        return NULL;

    return g_steal_pointer(&unused);
}
/*
 * Given the information about requested allocation type `a_type`, the host
 * cache for a particular type `i_type` and unused bits in the system `f_type`
 * this function tries to find the smallest free space in which the allocation
 * for cache id `cache` would fit. We're looking for the smallest place in
 * order to minimize fragmentation and maximize the possibility of succeeding.
 *
 * Per-cache allocation for the @level, @type and @cache must already be
 * allocated for @alloc (does not have to exist though).
 *
 * Returns 0 on success (including when no size is requested for @cache),
 * -1 with an error reported otherwise.
 */
static int
virResctrlAllocFindUnused(virResctrlAlloc *alloc,
                          virResctrlInfoPerType *i_type,
                          virResctrlAllocPerType *f_type,
                          unsigned int level,
                          unsigned int type,
                          unsigned int cache)
{
    unsigned long long *size = alloc->levels[level]->types[type]->sizes[cache];
    g_autoptr(virBitmap) a_mask = NULL;
    virBitmap *f_mask = NULL;
    unsigned long long need_bits;
    size_t bit = 0;
    ssize_t start = -1;
    ssize_t best_bits = 0;
    ssize_t best_pos = -1;

    /* Nothing was requested for this cache */
    if (!size)
        return 0;

    if (cache >= f_type->nmasks) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Cache with id %u does not exists for level %d"),
                       cache, level);
        return -1;
    }

    if (!(f_mask = f_type->masks[cache])) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Cache level %d id %u does not support tuning for "
                         "scope type '%s'"),
                       level, cache, virCacheTypeToString(type));
        return -1;
    }

    if (*size == i_type->size) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Cache allocation for the whole cache is not "
                         "possible, specify size smaller than %llu"),
                       i_type->size);
        return -1;
    }

    /* The request is expressed in units of the control granularity */
    need_bits = *size / i_type->control.granularity;

    if (*size % i_type->control.granularity) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Cache allocation of size %llu is not "
                         "divisible by granularity %llu"),
                       *size, i_type->control.granularity);
        return -1;
    }

    if (need_bits < i_type->min_cbm_bits) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Cache allocation of size %llu is smaller "
                         "than the minimum allowed allocation %llu"),
                       *size,
                       i_type->control.granularity * i_type->min_cbm_bits);
        return -1;
    }

    /* Walk over every run of consecutive free bits in @f_mask */
    while ((start = virBitmapNextSetBit(f_mask, start)) >= 0) {
        ssize_t end = virBitmapNextClearBit(f_mask, start);
        ssize_t run;

        /* The run extends to the end of the bitmap */
        if (end < 0)
            end = virBitmapSize(f_mask);

        run = end - start;

        /* This fits perfectly, no need to look any further */
        if (run == need_bits) {
            best_pos = start;
            break;
        }

        /* Runs that are too short are skipped; among the bigger ones
         * remember the smallest seen so far */
        if (run > need_bits &&
            (best_pos < 0 || (best_bits && run < best_bits))) {
            best_bits = run;
            best_pos = start;
        }

        start = end;
    }

    if (best_pos < 0) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Not enough room for allocation of "
                         "%llu bytes for level %u cache %u "
                         "scope type '%s'"),
                       *size, level, cache,
                       virCacheTypeToString(type));
        return -1;
    }

    /* Build the mask with @need_bits bits set starting at the chosen spot */
    a_mask = virBitmapNew(i_type->bits);

    for (bit = best_pos; bit < best_pos + need_bits; bit++)
        ignore_value(virBitmapSetBit(a_mask, bit));

    if (virResctrlAllocUpdateMask(alloc, level, type, cache, a_mask) < 0)
        return -1;

    return 0;
}
/* virResctrlAllocMemoryBandwidth
 * @resctrl: host resctrl information
 * @alloc: allocation to validate
 *
 * Validates the memory bandwidth values requested in @alloc against the
 * host information: each defined value must be a multiple of the
 * bandwidth granularity, must not be smaller than the minimum bandwidth
 * and its node id must not exceed the maximum controller id.
 *
 * Returns 0 when @alloc has no memory bandwidth data or all values are
 * valid, -1 with an error reported otherwise.
 */
static int
virResctrlAllocMemoryBandwidth(virResctrlInfo *resctrl,
                               virResctrlAlloc *alloc)
{
    size_t i;
    virResctrlAllocMemBW *mem_bw_alloc = alloc->mem_bw;
    virResctrlInfoMemBW *mem_bw_info = resctrl->membw_info;

    if (!mem_bw_alloc)
        return 0;

    if (!mem_bw_info) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("RDT Memory Bandwidth allocation unsupported"));
        return -1;
    }

    for (i = 0; i < mem_bw_alloc->nbandwidths; i++) {
        /* Nodes without a requested value need no validation */
        if (!mem_bw_alloc->bandwidths[i])
            continue;

        if (*(mem_bw_alloc->bandwidths[i]) % mem_bw_info->bandwidth_granularity) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Memory Bandwidth allocation of size "
                             "%u is not divisible by granularity %u"),
                           *(mem_bw_alloc->bandwidths[i]),
                           mem_bw_info->bandwidth_granularity);
            return -1;
        }

        if (*(mem_bw_alloc->bandwidths[i]) < mem_bw_info->min_bandwidth) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Memory Bandwidth allocation of size "
                             "%u is smaller than the minimum "
                             "allowed allocation %u"),
                           *(mem_bw_alloc->bandwidths[i]),
                           mem_bw_info->min_bandwidth);
            return -1;
        }

        if (i > mem_bw_info->max_id) {
            /* @i is a size_t, hence the unsigned %zu conversion */
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("bandwidth controller id %zu does not "
                             "exist, max controller id %u"),
                           i, mem_bw_info->max_id);
            return -1;
        }
    }

    return 0;
}
/* virResctrlAllocCopyMemBW
 * @dst: allocation to copy into
 * @src: allocation to copy from
 *
 * Copies memory bandwidth values from @src into @dst for every node that
 * has a value in @src but none yet in @dst; values already present in
 * @dst are kept. The bandwidth storage of @dst is created and grown as
 * needed.
 *
 * Always returns 0.
 */
static int
virResctrlAllocCopyMemBW(virResctrlAlloc *dst,
                         virResctrlAlloc *src)
{
    size_t i = 0;
    virResctrlAllocMemBW *dst_bw = NULL;
    virResctrlAllocMemBW *src_bw = src->mem_bw;

    if (!src->mem_bw)
        return 0;

    if (!dst->mem_bw)
        dst->mem_bw = g_new0(virResctrlAllocMemBW, 1);

    dst_bw = dst->mem_bw;

    if (src_bw->nbandwidths > dst_bw->nbandwidths)
        VIR_EXPAND_N(dst_bw->bandwidths, dst_bw->nbandwidths,
                     src_bw->nbandwidths - dst_bw->nbandwidths);

    for (i = 0; i < src_bw->nbandwidths; i++) {
        /* The bandwidth array may be sparse (slots are only filled for
         * node ids that were actually seen), so there may be nothing to
         * copy for this node; also never overwrite a value in @dst. */
        if (!src_bw->bandwidths[i] || dst_bw->bandwidths[i])
            continue;

        dst_bw->bandwidths[i] = g_new0(unsigned int, 1);
        *dst_bw->bandwidths[i] = *src_bw->bandwidths[i];
    }

    return 0;
}
/* virResctrlAllocCopyMasks
 * @dst: allocation to copy into
 * @src: allocation to copy from
 *
 * For every level/type present in @src makes sure the matching entry
 * exists in @dst (via virResctrlAllocGetType()) and then passes every
 * mask set in @src to virResctrlAllocUpdateMask() for @dst.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
virResctrlAllocCopyMasks(virResctrlAlloc *dst,
                         virResctrlAlloc *src)
{
    unsigned int lvl;

    for (lvl = 0; lvl < src->nlevels; lvl++) {
        virResctrlAllocPerLevel *s_level = src->levels[lvl];
        unsigned int t;

        if (!s_level)
            continue;

        for (t = 0; t < VIR_CACHE_TYPE_LAST; t++) {
            virResctrlAllocPerType *s_type = s_level->types[t];
            unsigned int id;

            if (!s_type)
                continue;

            if (!virResctrlAllocGetType(dst, lvl, t))
                return -1;

            for (id = 0; id < s_type->nmasks; id++) {
                if (!s_type->masks[id])
                    continue;

                if (virResctrlAllocUpdateMask(dst, lvl, t, id,
                                              s_type->masks[id]) < 0)
                    return -1;
            }
        }
    }

    return 0;
}
/*
 * This function is called when creating an allocation in the system.
 * What it does is that it gets all the unused resources using
 * virResctrlAllocGetUnused and then tries to find a proper space for
 * every requested allocation effectively transforming `sizes` into `masks`.
 *
 * Before the search the requested memory bandwidth values are validated
 * and the masks and memory bandwidth values of the default group are
 * copied into @alloc.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
virResctrlAllocAssign(virResctrlInfo *resctrl,
                      virResctrlAlloc *alloc)
{
    g_autoptr(virResctrlAlloc) alloc_free = NULL;
    g_autoptr(virResctrlAlloc) alloc_default = NULL;
    unsigned int lvl;

    if (!(alloc_free = virResctrlAllocGetUnused(resctrl)))
        return -1;

    if (!(alloc_default = virResctrlAllocGetDefault(resctrl)))
        return -1;

    if (virResctrlAllocMemoryBandwidth(resctrl, alloc) < 0)
        return -1;

    if (virResctrlAllocCopyMasks(alloc, alloc_default) < 0)
        return -1;

    if (virResctrlAllocCopyMemBW(alloc, alloc_default) < 0)
        return -1;

    for (lvl = 0; lvl < alloc->nlevels; lvl++) {
        virResctrlAllocPerLevel *a_level = alloc->levels[lvl];
        virResctrlAllocPerLevel *f_level = NULL;
        unsigned int t;

        if (!a_level)
            continue;

        if (lvl < alloc_free->nlevels)
            f_level = alloc_free->levels[lvl];

        if (!f_level) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Cache level %d does not support tuning"),
                           lvl);
            return -1;
        }

        for (t = 0; t < VIR_CACHE_TYPE_LAST; t++) {
            virResctrlAllocPerType *a_type = a_level->types[t];
            virResctrlAllocPerType *f_type = f_level->types[t];
            unsigned int id;

            if (!a_type)
                continue;

            if (!f_type) {
                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                               _("Cache level %d does not support tuning for "
                                 "scope type '%s'"),
                               lvl, virCacheTypeToString(t));
                return -1;
            }

            for (id = 0; id < a_type->nsizes; id++) {
                virResctrlInfoPerType *i_type = resctrl->levels[lvl]->types[t];

                if (virResctrlAllocFindUnused(alloc, i_type, f_type,
                                              lvl, t, id) < 0)
                    return -1;
            }
        }
    }

    return 0;
}
/* virResctrlDeterminePath
 * @parentpath: directory the group lives in
 * @prefix: first part of the group directory name
 * @id: resctrl id, must not be NULL
 *
 * Builds the path "<parentpath>/<prefix>-<id>".
 *
 * Returns a newly allocated path, or NULL with an error reported when
 * @id is NULL.
 */
static char *
virResctrlDeterminePath(const char *parentpath,
                        const char *prefix,
                        const char *id)
{
    if (id)
        return g_strdup_printf("%s/%s-%s", parentpath, prefix, id);

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("Resctrl ID must be set before determining resctrl "
                     "parentpath='%s' prefix='%s'"), parentpath, prefix);
    return NULL;
}
/* virResctrlAllocDeterminePath
 * @alloc: allocation whose path is to be set
 * @machinename: name used as the prefix of the group directory
 *
 * Sets the path of @alloc. An empty allocation uses SYSFS_RESCTRL_PATH
 * itself, any other allocation gets a directory below it named after
 * @machinename and the id of @alloc.
 *
 * Returns 0 on success, -1 with an error reported when the path is
 * already set or cannot be determined.
 */
int
virResctrlAllocDeterminePath(virResctrlAlloc *alloc,
                             const char *machinename)
{
    if (alloc->path) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Resctrl allocation path is already set to '%s'"),
                       alloc->path);
        return -1;
    }

    if (!virResctrlAllocIsEmpty(alloc)) {
        alloc->path = virResctrlDeterminePath(SYSFS_RESCTRL_PATH,
                                              machinename, alloc->id);
        return alloc->path ? 0 : -1;
    }

    /* If the allocation is empty, then the path will be SYSFS_RESCTRL_PATH */
    alloc->path = g_strdup(SYSFS_RESCTRL_PATH);
    return 0;
}
/* Make sure the resctrl directory @path exists in the resource control
 * file system, creating it (and missing parents) when it does not.
 *
 * Returns 0 if the directory already exists or was created, -1 with an
 * error reported otherwise. */
static int
virResctrlCreateGroupPath(const char *path)
{
    /* Only try to create the directory when it is not there yet */
    if (!virFileExists(path) &&
        g_mkdir_with_parents(path, 0777) < 0) {
        virReportSystemError(errno,
                             _("Cannot create resctrl directory '%s'"),
                             path);
        return -1;
    }

    return 0;
}
/* This checks if the directory for the alloc exists. If not it tries to create
 * it and apply appropriate alloc settings.
 *
 * @resctrl: host resctrl information
 * @alloc: allocation to create, may be NULL (nothing is done then)
 * @machinename: name used as the prefix of the group directory
 *
 * When the allocation resolves to the system/default path nothing needs
 * to be created. Otherwise, with the resctrl lock held, the masks are
 * assigned, the group directory is created and the formatted schemata is
 * written into it. If writing the schemata fails, removal of the group
 * directory is attempted.
 *
 * Returns 0 on success, -1 on failure with an error reported. */
int
virResctrlAllocCreate(virResctrlInfo *resctrl,
                      virResctrlAlloc *alloc,
                      const char *machinename)
{
    g_autofree char *schemata_path = NULL;
    g_autofree char *alloc_str = NULL;
    int ret = -1;
    int lockfd = -1;

    if (!alloc)
        return 0;

    if (virResctrlInfoIsEmpty(resctrl)) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Resource control is not supported on this host"));
        return -1;
    }

    if (virResctrlAllocDeterminePath(alloc, machinename) < 0)
        return -1;

    /* If using the system/default path for the allocation, then we're done */
    if (STREQ(alloc->path, SYSFS_RESCTRL_PATH))
        return 0;

    lockfd = virResctrlLock();
    if (lockfd < 0)
        goto cleanup;

    if (virResctrlAllocAssign(resctrl, alloc) < 0)
        goto cleanup;

    if (virResctrlCreateGroupPath(alloc->path) < 0)
        goto cleanup;

    alloc_str = virResctrlAllocFormat(alloc);
    if (!alloc_str)
        goto cleanup;

    schemata_path = g_strdup_printf("%s/schemata", alloc->path);

    VIR_DEBUG("Writing resctrl schemata '%s' into '%s'", alloc_str, schemata_path);
    if (virFileWriteStr(schemata_path, alloc_str, 0) < 0) {
        /* rmdir() may overwrite errno, so remember why the write failed
         * before cleaning up the directory. */
        int saved_errno = errno;

        rmdir(alloc->path);
        virReportSystemError(saved_errno,
                             _("Cannot write into schemata file '%s'"),
                             schemata_path);
        goto cleanup;
    }

    ret = 0;
 cleanup:
    virResctrlUnlock(lockfd);
    return ret;
}
static int
virResctrlAddPID(const char *path,
pid_t pid)
{
g_autofree char *tasks = NULL;
g_autofree char *pidstr = NULL;
if (!path) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Cannot add pid to non-existing resctrl group"));
return -1;
}
tasks = g_strdup_printf("%s/tasks", path);
pidstr = g_strdup_printf("%lld", (long long int)pid);
if (virFileWriteStr(tasks, pidstr, 0) < 0) {
virReportSystemError(errno,
_("Cannot write pid in tasks file '%s'"),
tasks);
return -1;
}
return 0;
}
/* virResctrlAllocAddPID
 * @alloc: allocation the process should be added to
 * @pid: process id to add
 *
 * Adds @pid to the group of @alloc. If the allocation is empty it has no
 * 'tasks' file of its own, so nothing is done in that case.
 *
 * Returns 0 on success or for an empty allocation, -1 on failure.
 */
int
virResctrlAllocAddPID(virResctrlAlloc *alloc,
                      pid_t pid)
{
    return virResctrlAllocIsEmpty(alloc) ? 0 : virResctrlAddPID(alloc->path, pid);
}
/* virResctrlAllocRemove
 * @alloc: allocation whose group directory should be removed
 *
 * Removes the directory of @alloc. Nothing is done when no path is set
 * or when the path is the system/default one. A directory that does not
 * exist is not treated as a failure.
 *
 * Returns 0 on success, negative errno value when rmdir() fails.
 */
int
virResctrlAllocRemove(virResctrlAlloc *alloc)
{
    int rc;

    /* Do not destroy if path is the system/default path for the allocation */
    if (!alloc->path || STREQ(alloc->path, SYSFS_RESCTRL_PATH))
        return 0;

    VIR_DEBUG("Removing resctrl allocation %s", alloc->path);

    if (rmdir(alloc->path) == 0 || errno == ENOENT)
        return 0;

    rc = -errno;
    VIR_ERROR(_("Unable to remove %s (%d)"), alloc->path, errno);
    return rc;
}
/* virResctrlMonitor-related definitions */
/* virResctrlMonitorNew
 *
 * Creates a new monitor object after making sure the resctrl classes
 * are initialized.
 *
 * Returns the new object, or NULL if initialization or creation failed.
 */
virResctrlMonitor *
virResctrlMonitorNew(void)
{
    return virResctrlInitialize() < 0 ? NULL : virObjectNew(virResctrlMonitorClass);
}
/*
 * virResctrlMonitorDeterminePath
 *
 * @monitor: Pointer to a resctrl monitor
 * @machinename: Name string of the VM
 *
 * Determines the directory path that the underlying resctrl group will be
 * created with.
 *
 * A monitor represents a directory under resource control file system.
 * If @monitor->alloc is not empty and has the same id as @monitor, the
 * monitor shares the path of @monitor->alloc. Otherwise the path is a
 * directory under 'mon_groups' of @monitor->alloc, named after
 * @machinename and the monitor id.
 *
 * @monitor and @monitor->alloc must be set and the monitor path must not
 * be set yet.
 *
 * Returns 0 on success, -1 on error.
 */
int
virResctrlMonitorDeterminePath(virResctrlMonitor *monitor,
                               const char *machinename)
{
    g_autofree char *mon_groups = NULL;

    if (!monitor) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Invalid resctrl monitor"));
        return -1;
    }

    if (!monitor->alloc) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Missing resctrl monitor alloc"));
        return -1;
    }

    if (monitor->path) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Resctrl monitor path is already set to '%s'"),
                       monitor->path);
        return -1;
    }

    /* A dedicated monitoring group is needed unless the monitor can
     * share the directory of its (non-empty) allocation */
    if (virResctrlAllocIsEmpty(monitor->alloc) ||
        !STREQ_NULLABLE(monitor->id, monitor->alloc->id)) {
        mon_groups = g_strdup_printf("%s/mon_groups", monitor->alloc->path);

        monitor->path = virResctrlDeterminePath(mon_groups, machinename,
                                                monitor->id);
        return monitor->path ? 0 : -1;
    }

    monitor->path = g_strdup(monitor->alloc->path);
    return 0;
}
/* virResctrlMonitorAddPID
 * @monitor: monitor the process should be added to
 * @pid: process id to add
 *
 * Adds @pid to the group located at the path of @monitor.
 *
 * Returns the result of virResctrlAddPID().
 */
int
virResctrlMonitorAddPID(virResctrlMonitor *monitor,
                        pid_t pid)
{
    const char *group_path = monitor->path;

    return virResctrlAddPID(group_path, pid);
}
/* virResctrlMonitorCreate
 * @monitor: monitor to create, may be NULL (nothing is done then)
 * @machinename: Name string of the VM
 *
 * Determines the path of @monitor and creates its directory while
 * holding the resctrl lock.
 *
 * Returns 0 on success, -1 on failure.
 */
int
virResctrlMonitorCreate(virResctrlMonitor *monitor,
                        const char *machinename)
{
    int lockfd;
    int rc;

    if (!monitor)
        return 0;

    if (virResctrlMonitorDeterminePath(monitor, machinename) < 0)
        return -1;

    if ((lockfd = virResctrlLock()) < 0)
        return -1;

    rc = virResctrlCreateGroupPath(monitor->path);

    virResctrlUnlock(lockfd);
    return rc;
}
/* virResctrlMonitorSetID
 * @monitor: monitor whose id is being set
 * @id: new id
 *
 * Sets the id of @monitor through virResctrlSetID().
 *
 * Returns the result of virResctrlSetID().
 */
int
virResctrlMonitorSetID(virResctrlMonitor *monitor,
                       const char *id)
{
    char **slot = &monitor->id;

    return virResctrlSetID(slot, id);
}
/* virResctrlMonitorGetID
 * @monitor: monitor to query
 *
 * Returns the id string stored in @monitor; the string stays owned
 * by @monitor.
 */
const char *
virResctrlMonitorGetID(virResctrlMonitor *monitor)
{
    const char *id = monitor->id;

    return id;
}
/* virResctrlMonitorSetAlloc
 * @monitor: monitor to update
 * @alloc: allocation the monitor is associated with
 *
 * Stores the value returned by virObjectRef(@alloc) in @monitor.
 */
void
virResctrlMonitorSetAlloc(virResctrlMonitor *monitor,
                          virResctrlAlloc *alloc)
{
    virResctrlAlloc *ref = virObjectRef(alloc);

    monitor->alloc = ref;
}
/* virResctrlMonitorRemove
 * @monitor: monitor whose directory should be removed
 *
 * Removes the directory of @monitor. Nothing is done when no path is set
 * or when the monitor shares its path with its allocation. A directory
 * that does not exist is not treated as a failure.
 *
 * Returns 0 on success, negative errno value when rmdir() fails.
 */
int
virResctrlMonitorRemove(virResctrlMonitor *monitor)
{
    int rc;

    if (!monitor->path)
        return 0;

    /* The directory belongs to the allocation, leave it alone */
    if (STREQ(monitor->path, monitor->alloc->path))
        return 0;

    VIR_DEBUG("Removing resctrl monitor path=%s", monitor->path);

    if (rmdir(monitor->path) == 0 || errno == ENOENT)
        return 0;

    rc = -errno;
    VIR_ERROR(_("Unable to remove %s (%d)"), monitor->path, errno);
    return rc;
}
static int
virResctrlMonitorStatsSorter(const void *a,
const void *b)
{
return (*(virResctrlMonitorStats **)a)->id
- (*(virResctrlMonitorStats **)b)->id;
}
/*
 * virResctrlMonitorGetStats
 *
 * @monitor: The monitor that the statistic data will be retrieved from.
 * @resources: A NULL-terminated string list of the monitor feature names.
 * @stats: Pointer to a virResctrlMonitorStats * array for holding cache or
 *         memory bandwidth usage data.
 * @nstats: A size_t pointer to hold the returned array length of @stats
 *
 * Get cache or memory bandwidth utilization information. Every
 * "mon_L<name>_<id>" directory below the 'mon_data' directory of @monitor
 * yields one entry in @stats holding the value of each requested
 * resource; the entries are sorted by ascending id.
 *
 * On failure @stats may already contain the entries collected so far.
 *
 * Returns 0 on success, -1 on error.
 */
int
virResctrlMonitorGetStats(virResctrlMonitor *monitor,
                          const char **resources,
                          virResctrlMonitorStats ***stats,
                          size_t *nstats)
{
    int rv = -1;
    int dir_rv = -1;
    int ret = -1;
    size_t i = 0;
    unsigned long long val = 0;
    g_autoptr(DIR) dirp = NULL;
    g_autofree char *datapath = NULL;
    struct dirent *ent = NULL;
    virResctrlMonitorStats *stat = NULL;
    size_t nresources = g_strv_length((char **) resources);

    if (!monitor) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("Invalid resctrl monitor"));
        return -1;
    }

    datapath = g_strdup_printf("%s/mon_data", monitor->path);
    if (virDirOpen(&dirp, datapath) < 0)
        goto cleanup;

    *nstats = 0;
    while ((dir_rv = virDirRead(dirp, &ent, datapath)) > 0) {
        g_autofree char *filepath = NULL;
        char *node_id = NULL;

        /* Looking for directory that contains resource utilization
         * information file. The directory name is arranged in format
         * "mon_<node_name>_<node_id>". For example, "mon_L3_00" and
         * "mon_L3_01" are two target directories for a two nodes system
         * with resource utilization data file for each node respectively.
         */
        filepath = g_strdup_printf("%s/%s", datapath, ent->d_name);

        if (!virFileIsDir(filepath))
            continue;

        /* Looking for directory has a prefix 'mon_L' */
        if (!(node_id = STRSKIP(ent->d_name, "mon_L")))
            continue;

        /* Looking for directory has another '_' */
        node_id = strchr(node_id, '_');
        if (!node_id)
            continue;

        /* Skip the character '_' */
        if (!(node_id = STRSKIP(node_id, "_")))
            continue;

        stat = g_new0(virResctrlMonitorStats, 1);

        /* One extra slot keeps the feature list NULL-terminated */
        stat->features = g_new0(char *, nresources + 1);

        /* The node ID number should be here, parsing it. */
        if (virStrToLong_uip(node_id, NULL, 0, &stat->id) < 0)
            goto cleanup;

        for (i = 0; resources[i]; i++) {
            rv = virFileReadValueUllong(&val, "%s/%s/%s", datapath,
                                        ent->d_name, resources[i]);
            if (rv == -2) {
                virReportError(VIR_ERR_INTERNAL_ERROR,
                               _("File '%s/%s/%s' does not exist."),
                               datapath, ent->d_name, resources[i]);
            }
            if (rv < 0)
                goto cleanup;

            VIR_APPEND_ELEMENT(stat->vals, stat->nvals, val);

            stat->features[i] = g_strdup(resources[i]);
        }

        /* NOTE(review): VIR_APPEND_ELEMENT is expected to move @stat into
         * the array and reset the variable, so the cleanup path only frees
         * an entry that was not handed over - confirm against viralloc.h */
        VIR_APPEND_ELEMENT(*stats, *nstats, stat);
    }

    /* Do not report success when reading the directory itself failed */
    if (dir_rv < 0)
        goto cleanup;

    /* Sort in id's ascending order */
    if (*nstats)
        qsort(*stats, *nstats, sizeof(**stats), virResctrlMonitorStatsSorter);

    ret = 0;
 cleanup:
    virResctrlMonitorStatsFree(stat);
    return ret;
}
/* virResctrlMonitorStatsFree
 * @stat: statistics entry to free, may be NULL
 *
 * Frees @stat together with its feature name list and value array.
 */
void
virResctrlMonitorStatsFree(virResctrlMonitorStats *stat)
{
    if (stat) {
        g_strfreev(stat->features);
        g_free(stat->vals);
        g_free(stat);
    }
}