libvirt/src/conf/capabilities.c

/*
* capabilities.c: hypervisor capabilities
*
* Copyright (C) 2006-2015 Red Hat, Inc.
* Copyright (C) 2006-2008 Daniel P. Berrange
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see
* <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <unistd.h>
#include "capabilities.h"
#include "cpu_conf.h"
#include "domain_conf.h"
#include "storage_conf.h"
#include "viralloc.h"
#include "virarch.h"
#include "virbuffer.h"
#include "virerror.h"
#include "virfile.h"
#include "virhostcpu.h"
#include "virhostmem.h"
#include "virlog.h"
#include "virnuma.h"
#include "virstring.h"
#include "viruuid.h"
#include "virenum.h"
#include "virutil.h"
#define VIR_FROM_THIS VIR_FROM_CAPABILITIES
#define SYSFS_SYSTEM_PATH "/sys/devices/system"
VIR_LOG_INIT("conf.capabilities");
VIR_ENUM_DECL(virCapsHostPMTarget);
VIR_ENUM_IMPL(virCapsHostPMTarget,
VIR_NODE_SUSPEND_TARGET_LAST,
"suspend_mem", "suspend_disk", "suspend_hybrid",
);
static virClass *virCapsClass;
static void virCapsDispose(void *obj);
static int virCapabilitiesOnceInit(void)
{
if (!VIR_CLASS_NEW(virCaps, virClassForObject()))
return -1;
return 0;
}
VIR_ONCE_GLOBAL_INIT(virCapabilities);
/**
* virCapabilitiesNew:
* @hostarch: host machine architecture
* @offlineMigrate: true if offline migration is available
* @liveMigrate: true if live migration is available
*
* Allocate a new capabilities object
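*
* A minimal usage sketch (illustrative only; the object is reference
* counted, so the caller releases it with virObjectUnref()):
*
*   virCaps *caps = virCapabilitiesNew(virArchFromHost(), false, false);
*   if (!caps)
*       return NULL;
*   ...
*   virObjectUnref(caps);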
*/
virCaps *
virCapabilitiesNew(virArch hostarch,
bool offlineMigrate,
bool liveMigrate)
{
virCaps *caps;
if (virCapabilitiesInitialize() < 0)
return NULL;
if (!(caps = virObjectNew(virCapsClass)))
return NULL;
caps->host.arch = hostarch;
caps->host.offlineMigrate = offlineMigrate;
caps->host.liveMigrate = liveMigrate;
return caps;
}
void
virCapabilitiesClearHostNUMACellCPUTopology(virCapsHostNUMACellCPU *cpus,
size_t ncpus)
{
size_t i;
if (!cpus)
return;
for (i = 0; i < ncpus; i++)
g_clear_pointer(&cpus[i].siblings, virBitmapFree);
}
static void
virCapabilitiesFreeHostNUMACell(virCapsHostNUMACell *cell)
{
if (cell == NULL)
return;
virCapabilitiesClearHostNUMACellCPUTopology(cell->cpus, cell->ncpus);
g_free(cell->cpus);
g_free(cell->distances);
g_free(cell->pageinfo);
if (cell->caches)
g_array_unref(cell->caches);
g_free(cell);
}
static void
virCapabilitiesFreeGuestMachine(virCapsGuestMachine *machine)
{
if (machine == NULL)
return;
g_free(machine->name);
g_free(machine->canonical);
g_free(machine);
}
static void
virCapabilitiesFreeGuestDomain(virCapsGuestDomain *dom)
{
size_t i;
if (dom == NULL)
return;
g_free(dom->info.emulator);
g_free(dom->info.loader);
for (i = 0; i < dom->info.nmachines; i++)
virCapabilitiesFreeGuestMachine(dom->info.machines[i]);
g_free(dom->info.machines);
g_free(dom);
}
void
virCapabilitiesFreeGuest(virCapsGuest *guest)
{
size_t i;
if (guest == NULL)
return;
g_free(guest->arch.defaultInfo.emulator);
g_free(guest->arch.defaultInfo.loader);
for (i = 0; i < guest->arch.defaultInfo.nmachines; i++)
virCapabilitiesFreeGuestMachine(guest->arch.defaultInfo.machines[i]);
g_free(guest->arch.defaultInfo.machines);
for (i = 0; i < guest->arch.ndomains; i++)
virCapabilitiesFreeGuestDomain(guest->arch.domains[i]);
g_free(guest->arch.domains);
g_free(guest);
}
static void
virCapabilitiesFreeStoragePool(virCapsStoragePool *pool)
{
if (!pool)
return;
g_free(pool);
}
void
virCapabilitiesHostNUMAUnref(virCapsHostNUMA *caps)
{
if (!caps)
return;
if (g_atomic_int_dec_and_test(&caps->refs)) {
g_ptr_array_unref(caps->cells);
if (caps->interconnects)
g_array_unref(caps->interconnects);
g_free(caps);
}
}
void
virCapabilitiesHostNUMARef(virCapsHostNUMA *caps)
{
g_atomic_int_inc(&caps->refs);
}
static void
virCapsHostMemBWNodeFree(virCapsHostMemBWNode *ptr)
{
if (!ptr)
return;
virBitmapFree(ptr->cpus);
g_free(ptr);
}
static void
virCapabilitiesClearSecModel(virCapsHostSecModel *secmodel)
{
size_t i;
for (i = 0; i < secmodel->nlabels; i++) {
VIR_FREE(secmodel->labels[i].type);
VIR_FREE(secmodel->labels[i].label);
}
VIR_FREE(secmodel->labels);
VIR_FREE(secmodel->model);
VIR_FREE(secmodel->doi);
}
static void
virCapsDispose(void *object)
{
virCaps *caps = object;
size_t i;
for (i = 0; i < caps->npools; i++)
virCapabilitiesFreeStoragePool(caps->pools[i]);
g_free(caps->pools);
for (i = 0; i < caps->nguests; i++)
virCapabilitiesFreeGuest(caps->guests[i]);
g_free(caps->guests);
for (i = 0; i < caps->host.nfeatures; i++)
g_free(caps->host.features[i]);
g_free(caps->host.features);
if (caps->host.numa)
virCapabilitiesHostNUMAUnref(caps->host.numa);
for (i = 0; i < caps->host.nmigrateTrans; i++)
g_free(caps->host.migrateTrans[i]);
g_free(caps->host.migrateTrans);
for (i = 0; i < caps->host.nsecModels; i++)
virCapabilitiesClearSecModel(&caps->host.secModels[i]);
g_free(caps->host.secModels);
for (i = 0; i < caps->host.cache.nbanks; i++)
virCapsHostCacheBankFree(caps->host.cache.banks[i]);
virResctrlInfoMonFree(caps->host.cache.monitor);
g_free(caps->host.cache.banks);
for (i = 0; i < caps->host.memBW.nnodes; i++)
virCapsHostMemBWNodeFree(caps->host.memBW.nodes[i]);
virResctrlInfoMonFree(caps->host.memBW.monitor);
g_free(caps->host.memBW.nodes);
g_free(caps->host.netprefix);
g_free(caps->host.pagesSize);
virCPUDefFree(caps->host.cpu);
virObjectUnref(caps->host.resctrl);
}
/**
* virCapabilitiesAddHostFeature:
* @caps: capabilities to extend
* @name: name of new feature
*
* Registers a new host CPU feature, e.g. 'pae' or 'vmx'
*/
int
virCapabilitiesAddHostFeature(virCaps *caps,
const char *name)
{
VIR_RESIZE_N(caps->host.features, caps->host.nfeatures_max,
caps->host.nfeatures, 1);
caps->host.features[caps->host.nfeatures] = g_strdup(name);
caps->host.nfeatures++;
return 0;
}
/**
* virCapabilitiesAddHostMigrateTransport:
* @caps: capabilities to extend
* @name: name of migration transport
*
* Registers a new domain migration transport URI
*/
int
virCapabilitiesAddHostMigrateTransport(virCaps *caps,
const char *name)
{
VIR_RESIZE_N(caps->host.migrateTrans, caps->host.nmigrateTrans_max,
caps->host.nmigrateTrans, 1);
caps->host.migrateTrans[caps->host.nmigrateTrans] = g_strdup(name);
caps->host.nmigrateTrans++;
return 0;
}
/**
* virCapabilitiesSetNetPrefix:
* @caps: capabilities to extend
* @name: prefix for host generated network interfaces
*
* Registers the prefix that is used for generated network interfaces
*/
int
virCapabilitiesSetNetPrefix(virCaps *caps,
const char *prefix)
{
caps->host.netprefix = g_strdup(prefix);
return 0;
}
/**
* virCapabilitiesHostNUMAAddCell:
* @caps: capabilities to extend
* @num: ID number of NUMA cell
* @mem: Total size of memory in the NUMA node (in KiB)
* @ncpus: number of CPUs in cell
* @cpus: array of CPU definition structures
* @ndistances: number of sibling NUMA nodes
* @distances: NUMA distances to other nodes
* @npageinfo: number of pages at node @num
* @pageinfo: info on each single memory page
* @caches: info on memory side caches
*
* Registers a new NUMA cell for a host, passing in an array of
* CPU IDs belonging to the cell, distances to other NUMA nodes
* and info on hugepages on the node.
*
* All pointers are stolen.
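*
* Illustrative sketch (hypothetical values; the double pointers are
* cleared via g_steal_pointer(), so the caller must not free them
* afterwards):
*
*   virCapsHostNUMACellCPU *cpus = g_new0(virCapsHostNUMACellCPU, 4);
*   ... fill in cpus[0..3] ...
*   virCapabilitiesHostNUMAAddCell(caps, 0, 8 * 1024 * 1024, 4, &cpus,
*                                  0, NULL, 0, NULL, NULL);
*   (cpus is now NULL; the cell owns the array)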
*/
void
virCapabilitiesHostNUMAAddCell(virCapsHostNUMA *caps,
int num,
unsigned long long mem,
int ncpus,
virCapsHostNUMACellCPU **cpus,
int ndistances,
virNumaDistance **distances,
int npageinfo,
virCapsHostNUMACellPageInfo **pageinfo,
GArray **caches)
{
virCapsHostNUMACell *cell = g_new0(virCapsHostNUMACell, 1);
cell->num = num;
cell->mem = mem;
if (cpus) {
cell->ncpus = ncpus;
cell->cpus = g_steal_pointer(cpus);
}
if (distances) {
cell->ndistances = ndistances;
cell->distances = g_steal_pointer(distances);
}
if (pageinfo) {
cell->npageinfo = npageinfo;
cell->pageinfo = g_steal_pointer(pageinfo);
}
if (caches) {
cell->caches = g_steal_pointer(caches);
}
g_ptr_array_add(caps->cells, cell);
}
/**
* virCapabilitiesAllocMachines:
* @names: NULL-terminated list of machine names for the emulator ('pc', 'isapc', etc.)
* @nnames: filled with the number of machine names
*
* Allocate a table of virCapsGuestMachine * from the supplied table
* of machine names.
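*
* For example (illustrative machine names):
*
*   const char *const names[] = { "pc", "isapc", NULL };
*   int nmachines;
*   virCapsGuestMachine **machines =
*       virCapabilitiesAllocMachines(names, &nmachines);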
*/
virCapsGuestMachine **
virCapabilitiesAllocMachines(const char *const *names,
int *nnames)
{
virCapsGuestMachine **machines;
size_t i;
*nnames = g_strv_length((gchar **)names);
machines = g_new0(virCapsGuestMachine *, *nnames);
for (i = 0; i < *nnames; i++) {
machines[i] = g_new0(virCapsGuestMachine, 1);
machines[i]->name = g_strdup(names[i]);
}
return machines;
}
/**
* virCapabilitiesAddGuest:
* @caps: capabilities to extend
* @ostype: guest operating system type, of enum VIR_DOMAIN_OSTYPE
* @arch: guest CPU architecture
* @wordsize: number of bits in CPU word
* @emulator: path to default device emulator for arch/ostype
* @loader: path to default BIOS loader for arch/ostype
* @nmachines: number of machine variants for emulator
* @machines: machine variants for emulator ('pc', or 'isapc', etc)
*
* Registers a new guest operating system. This should be
* followed by registration of at least one domain for
* running the guest
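*
* A typical registration sketch (illustrative paths and values):
*
*   virCapsGuest *guest =
*       virCapabilitiesAddGuest(caps, VIR_DOMAIN_OSTYPE_HVM, VIR_ARCH_X86_64,
*                               "/usr/bin/qemu-system-x86_64", NULL,
*                               nmachines, machines);
*   virCapabilitiesAddGuestDomain(guest, VIR_DOMAIN_VIRT_KVM,
*                                 NULL, NULL, 0, NULL);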
*/
virCapsGuest *
virCapabilitiesAddGuest(virCaps *caps,
int ostype,
virArch arch,
const char *emulator,
const char *loader,
int nmachines,
virCapsGuestMachine **machines)
{
virCapsGuest *guest;
guest = g_new0(virCapsGuest, 1);
guest->ostype = ostype;
guest->arch.id = arch;
guest->arch.wordsize = virArchGetWordSize(arch);
guest->arch.defaultInfo.emulator = g_strdup(emulator);
guest->arch.defaultInfo.loader = g_strdup(loader);
VIR_RESIZE_N(caps->guests, caps->nguests_max, caps->nguests, 1);
caps->guests[caps->nguests++] = guest;
if (nmachines) {
guest->arch.defaultInfo.nmachines = nmachines;
guest->arch.defaultInfo.machines = machines;
}
return guest;
}
/**
* virCapabilitiesAddGuestDomain:
* @guest: guest to support
* @hvtype: hypervisor type ('xen', 'qemu', 'kvm')
* @emulator: specialized device emulator for domain
* @loader: specialized BIOS loader for domain
* @nmachines: number of machine variants for emulator
* @machines: specialized machine variants for emulator
*
* Registers a virtual domain capable of running a
* guest operating system
*/
virCapsGuestDomain *
virCapabilitiesAddGuestDomain(virCapsGuest *guest,
int hvtype,
const char *emulator,
const char *loader,
int nmachines,
virCapsGuestMachine **machines)
{
virCapsGuestDomain *dom;
dom = g_new0(virCapsGuestDomain, 1);
dom->type = hvtype;
dom->info.emulator = g_strdup(emulator);
dom->info.loader = g_strdup(loader);
VIR_RESIZE_N(guest->arch.domains, guest->arch.ndomains_max,
guest->arch.ndomains, 1);
guest->arch.domains[guest->arch.ndomains] = dom;
guest->arch.ndomains++;
if (nmachines) {
dom->info.nmachines = nmachines;
dom->info.machines = machines;
}
return dom;
}
struct virCapsGuestFeatureInfo {
const char *name;
bool togglesRequired;
};
static const struct virCapsGuestFeatureInfo virCapsGuestFeatureInfos[VIR_CAPS_GUEST_FEATURE_TYPE_LAST] = {
[VIR_CAPS_GUEST_FEATURE_TYPE_PAE] = { "pae", false },
[VIR_CAPS_GUEST_FEATURE_TYPE_NONPAE] = { "nonpae", false },
[VIR_CAPS_GUEST_FEATURE_TYPE_IA64_BE] = { "ia64_be", false },
[VIR_CAPS_GUEST_FEATURE_TYPE_ACPI] = { "acpi", true },
[VIR_CAPS_GUEST_FEATURE_TYPE_APIC] = { "apic", true },
[VIR_CAPS_GUEST_FEATURE_TYPE_CPUSELECTION] = { "cpuselection", false },
[VIR_CAPS_GUEST_FEATURE_TYPE_DEVICEBOOT] = { "deviceboot", false },
[VIR_CAPS_GUEST_FEATURE_TYPE_DISKSNAPSHOT] = { "disksnapshot", true },
[VIR_CAPS_GUEST_FEATURE_TYPE_HAP] = { "hap", true },
};
static void
virCapabilitiesAddGuestFeatureInternal(virCapsGuest *guest,
virCapsGuestFeatureType feature,
bool defaultOn,
bool toggle)
{
guest->features[feature].present = true;
if (virCapsGuestFeatureInfos[feature].togglesRequired) {
guest->features[feature].defaultOn = virTristateSwitchFromBool(defaultOn);
guest->features[feature].toggle = virTristateBoolFromBool(toggle);
}
}
/**
* virCapabilitiesAddGuestFeature:
* @guest: guest to associate feature with
* @feature: feature to add
*
* Registers a feature for a guest domain.
*/
void
virCapabilitiesAddGuestFeature(virCapsGuest *guest,
virCapsGuestFeatureType feature)
{
virCapabilitiesAddGuestFeatureInternal(guest, feature, false, false);
}
/**
* virCapabilitiesAddGuestFeatureWithToggle:
* @guest: guest to associate feature with
* @feature: feature to add
* @defaultOn: true if it defaults to on
* @toggle: true if its state can be toggled
*
* Registers a feature with toggles for a guest domain.
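*
* For instance (illustrative):
*
*   virCapabilitiesAddGuestFeatureWithToggle(guest,
*                                            VIR_CAPS_GUEST_FEATURE_TYPE_ACPI,
*                                            true, true);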
*/
void
virCapabilitiesAddGuestFeatureWithToggle(virCapsGuest *guest,
virCapsGuestFeatureType feature,
bool defaultOn,
bool toggle)
{
virCapabilitiesAddGuestFeatureInternal(guest, feature, defaultOn, toggle);
}
/**
* virCapabilitiesHostSecModelAddBaseLabel
* @secmodel: Security model to add a base label for
* @type: virtualization type
* @label: base label
*
* Returns non-zero on error.
*/
extern int
virCapabilitiesHostSecModelAddBaseLabel(virCapsHostSecModel *secmodel,
const char *type,
const char *label)
{
if (type == NULL || label == NULL)
return -1;
VIR_EXPAND_N(secmodel->labels, secmodel->nlabels, 1);
secmodel->labels[secmodel->nlabels - 1].type = g_strdup(type);
secmodel->labels[secmodel->nlabels - 1].label = g_strdup(label);
return 0;
}
static virCapsDomainData *
virCapabilitiesDomainDataLookupInternal(virCaps *caps,
int ostype,
virArch arch,
virDomainVirtType domaintype,
const char *emulator,
const char *machinetype)
{
virCapsGuest *foundguest = NULL;
virCapsGuestDomain *founddomain = NULL;
virCapsGuestMachine *foundmachine = NULL;
virCapsDomainData *ret = NULL;
size_t i, j, k;
VIR_DEBUG("Lookup ostype=%d arch=%d domaintype=%d emulator=%s machine=%s",
ostype, arch, domaintype, NULLSTR(emulator), NULLSTR(machinetype));
for (i = 0; i < caps->nguests; i++) {
virCapsGuest *guest = caps->guests[i];
if (ostype != -1 && guest->ostype != ostype) {
VIR_DEBUG("Skip os type want=%d vs got=%d", ostype, guest->ostype);
continue;
}
VIR_DEBUG("Match os type %d", ostype);
if ((arch != VIR_ARCH_NONE) && (guest->arch.id != arch)) {
VIR_DEBUG("Skip arch want=%d vs got=%d", arch, guest->arch.id);
continue;
}
VIR_DEBUG("Match arch %d", arch);
for (j = 0; j < guest->arch.ndomains; j++) {
virCapsGuestDomain *domain = guest->arch.domains[j];
virCapsGuestMachine **machinelist;
int nmachines;
const char *check_emulator = NULL;
if (domaintype != VIR_DOMAIN_VIRT_NONE &&
(domain->type != domaintype)) {
VIR_DEBUG("Skip domain type want=%d vs got=%d", domaintype, domain->type);
continue;
}
VIR_DEBUG("Match domain type %d", domaintype);
check_emulator = domain->info.emulator;
if (!check_emulator)
check_emulator = guest->arch.defaultInfo.emulator;
if (emulator && STRNEQ_NULLABLE(check_emulator, emulator)) {
VIR_DEBUG("Skip emulator got=%s vs want=%s",
emulator, NULLSTR(check_emulator));
continue;
}
VIR_DEBUG("Match emulator %s", NULLSTR(emulator));
if (domain->info.nmachines) {
nmachines = domain->info.nmachines;
machinelist = domain->info.machines;
} else {
nmachines = guest->arch.defaultInfo.nmachines;
machinelist = guest->arch.defaultInfo.machines;
}
for (k = 0; k < nmachines; k++) {
virCapsGuestMachine *machine = machinelist[k];
if (machinetype &&
STRNEQ(machine->name, machinetype) &&
STRNEQ_NULLABLE(machine->canonical, machinetype)) {
VIR_DEBUG("Skip machine type want=%s vs got=%s got=%s",
machinetype, machine->name, NULLSTR(machine->canonical));
continue;
}
VIR_DEBUG("Match machine type machine %s", NULLSTR(machinetype));
foundmachine = machine;
break;
}
if (!foundmachine && nmachines)
continue;
founddomain = domain;
break;
}
if (!founddomain)
continue;
foundguest = guest;
break;
}
/* XXX check default_emulator, see how it uses this */
if (!foundguest) {
g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
if (ostype)
virBufferAsprintf(&buf, "ostype=%s ",
virDomainOSTypeToString(ostype));
if (arch)
virBufferAsprintf(&buf, "arch=%s ", virArchToString(arch));
if (domaintype > VIR_DOMAIN_VIRT_NONE)
virBufferAsprintf(&buf, "domaintype=%s ",
virDomainVirtTypeToString(domaintype));
if (emulator)
virBufferEscapeString(&buf, "emulator=%s ", emulator);
if (machinetype)
virBufferEscapeString(&buf, "machine=%s ", machinetype);
if (virBufferCurrentContent(&buf) &&
!virBufferCurrentContent(&buf)[0])
virBufferAsprintf(&buf, "%s", _("any configuration"));
virReportError(VIR_ERR_INVALID_ARG,
_("could not find capabilities for %1$s"),
virBufferCurrentContent(&buf));
return ret;
}
ret = g_new0(virCapsDomainData, 1);
ret->ostype = foundguest->ostype;
ret->arch = foundguest->arch.id;
if (founddomain) {
ret->domaintype = founddomain->type;
ret->emulator = founddomain->info.emulator;
}
if (!ret->emulator)
ret->emulator = foundguest->arch.defaultInfo.emulator;
if (foundmachine)
ret->machinetype = foundmachine->name;
return ret;
}
/**
* virCapabilitiesDomainDataLookup:
* @caps: capabilities to query
* @ostype: guest operating system type, of enum VIR_DOMAIN_OSTYPE
* @arch: Architecture to search for
* @domaintype: domain type to search for, of enum virDomainVirtType
* @emulator: Emulator path to search for
* @machinetype: Machine type to search for
*
* Search capabilities for the passed values, and if found return
* a virCapsDomainData filled in with the default values
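*
* Illustrative lookup (the caller frees only the returned struct; the
* strings inside it are borrowed from @caps):
*
*   g_autofree virCapsDomainData *data =
*       virCapabilitiesDomainDataLookup(caps, VIR_DOMAIN_OSTYPE_HVM,
*                                       VIR_ARCH_NONE, VIR_DOMAIN_VIRT_QEMU,
*                                       NULL, NULL);
*   if (!data)
*       return -1;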
*/
virCapsDomainData *
virCapabilitiesDomainDataLookup(virCaps *caps,
int ostype,
virArch arch,
int domaintype,
const char *emulator,
const char *machinetype)
{
virCapsDomainData *ret;
if (arch == VIR_ARCH_NONE) {
/* Prefer host arch if it's available */
ret = virCapabilitiesDomainDataLookupInternal(caps, ostype,
caps->host.arch,
domaintype,
emulator, machinetype);
if (ret)
return ret;
}
return virCapabilitiesDomainDataLookupInternal(caps, ostype,
arch, domaintype,
emulator, machinetype);
}
bool
virCapabilitiesDomainSupported(virCaps *caps,
int ostype,
virArch arch,
int virttype)
{
g_autofree virCapsDomainData *capsdata = NULL;
capsdata = virCapabilitiesDomainDataLookup(caps, ostype,
arch,
virttype,
NULL, NULL);
return capsdata != NULL;
}
int
virCapabilitiesAddStoragePool(virCaps *caps,
int poolType)
{
virCapsStoragePool *pool;
pool = g_new0(virCapsStoragePool, 1);
pool->type = poolType;
VIR_RESIZE_N(caps->pools, caps->npools_max, caps->npools, 1);
caps->pools[caps->npools++] = pool;
return 0;
}
static int
virCapsHostNUMACellCPUFormat(virBuffer *buf,
const virCapsHostNUMACellCPU *cpus,
int ncpus)
{
g_auto(virBuffer) attrBuf = VIR_BUFFER_INITIALIZER;
g_auto(virBuffer) childBuf = VIR_BUFFER_INIT_CHILD(buf);
size_t j;
virBufferAsprintf(&attrBuf, " num='%d'", ncpus);
for (j = 0; j < ncpus; j++) {
virBufferAsprintf(&childBuf, "<cpu id='%d'", cpus[j].id);
if (cpus[j].siblings) {
g_autofree char *siblings = NULL;
if (!(siblings = virBitmapFormat(cpus[j].siblings)))
return -1;
virBufferAsprintf(&childBuf,
" socket_id='%d' die_id='%d' core_id='%d' siblings='%s'",
cpus[j].socket_id,
cpus[j].die_id,
cpus[j].core_id,
siblings);
}
virBufferAddLit(&childBuf, "/>\n");
}
virXMLFormatElement(buf, "cpus", &attrBuf, &childBuf);
return 0;
}
static int
virCapabilitiesHostNUMAFormat(virBuffer *buf,
virCapsHostNUMA *caps)
{
size_t i;
if (!caps)
return 0;
virBufferAddLit(buf, "<topology>\n");
virBufferAdjustIndent(buf, 2);
virBufferAsprintf(buf, "<cells num='%d'>\n", caps->cells->len);
virBufferAdjustIndent(buf, 2);
for (i = 0; i < caps->cells->len; i++) {
virCapsHostNUMACell *cell = g_ptr_array_index(caps->cells, i);
size_t j;
virBufferAsprintf(buf, "<cell id='%d'>\n", cell->num);
virBufferAdjustIndent(buf, 2);
/* Print out the numacell memory total if it is available */
if (cell->mem)
virBufferAsprintf(buf, "<memory unit='KiB'>%llu</memory>\n",
cell->mem);
for (j = 0; j < cell->npageinfo; j++) {
virBufferAsprintf(buf, "<pages unit='KiB' size='%u'>%llu</pages>\n",
cell->pageinfo[j].size,
cell->pageinfo[j].avail);
}
virNumaDistanceFormat(buf, cell->distances, cell->ndistances);
if (cell->caches) {
virNumaCache *caches = &g_array_index(cell->caches, virNumaCache, 0);
virNumaCacheFormat(buf, caches, cell->caches->len);
}
if (virCapsHostNUMACellCPUFormat(buf, cell->cpus, cell->ncpus) < 0)
return -1;
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</cell>\n");
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</cells>\n");
if (caps->interconnects) {
const virNumaInterconnect *interconnects;
interconnects = &g_array_index(caps->interconnects, virNumaInterconnect, 0);
virNumaInterconnectFormat(buf, interconnects, caps->interconnects->len);
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</topology>\n");
return 0;
}
static int
virCapabilitiesFormatResctrlMonitor(virBuffer *buf,
virResctrlInfoMon *monitor)
{
size_t i = 0;
g_auto(virBuffer) childrenBuf = VIR_BUFFER_INIT_CHILD(buf);
/* monitor not supported, no capability */
if (!monitor)
return 0;
/* no feature found in monitor means no capability, return */
if (monitor->nfeatures == 0)
return 0;
virBufferAddLit(buf, "<monitor ");
/* CMT might not be enabled; if it is, show the related attributes. */
if (monitor->type == VIR_RESCTRL_MONITOR_TYPE_CACHE)
virBufferAsprintf(buf,
"level='%u' reuseThreshold='%u' ",
monitor->cache_level,
monitor->cache_reuse_threshold);
virBufferAsprintf(buf,
"maxMonitors='%u'>\n",
monitor->max_monitor);
for (i = 0; i < monitor->nfeatures; i++) {
virBufferAsprintf(&childrenBuf,
"<feature name='%s'/>\n",
monitor->features[i]);
}
virBufferAddBuffer(buf, &childrenBuf);
virBufferAddLit(buf, "</monitor>\n");
return 0;
}
static int
virCapabilitiesFormatCaches(virBuffer *buf,
virCapsHostCache *cache)
{
size_t i = 0;
size_t j = 0;
if (!cache->nbanks && !cache->monitor)
return 0;
virBufferAddLit(buf, "<cache>\n");
virBufferAdjustIndent(buf, 2);
for (i = 0; i < cache->nbanks; i++) {
g_auto(virBuffer) attrBuf = VIR_BUFFER_INITIALIZER;
g_auto(virBuffer) childrenBuf = VIR_BUFFER_INIT_CHILD(buf);
virCapsHostCacheBank *bank = cache->banks[i];
g_autofree char *cpus_str = virBitmapFormat(bank->cpus);
const char *unit = NULL;
unsigned long long short_size = virFormatIntPretty(bank->size, &unit);
if (!cpus_str)
return -1;
/*
* Let's just *hope* the size is aligned to KiBs so that it does not
* bite us back in the future
*/
virBufferAsprintf(&attrBuf,
" id='%u' level='%u' type='%s' "
"size='%llu' unit='%s' cpus='%s'",
bank->id, bank->level,
virCacheTypeToString(bank->type),
short_size, unit, cpus_str);
for (j = 0; j < bank->ncontrols; j++) {
const char *min_unit;
virResctrlInfoPerCache *controls = bank->controls[j];
unsigned long long gran_short_size = controls->granularity;
unsigned long long min_short_size = controls->min;
gran_short_size = virFormatIntPretty(gran_short_size, &unit);
min_short_size = virFormatIntPretty(min_short_size, &min_unit);
/* Only use the smaller unit if they are different */
if (min_short_size) {
unsigned long long gran_div;
unsigned long long min_div;
gran_div = controls->granularity / gran_short_size;
min_div = controls->min / min_short_size;
if (min_div > gran_div) {
min_short_size *= min_div / gran_div;
} else if (min_div < gran_div) {
unit = min_unit;
gran_short_size *= gran_div / min_div;
}
}
virBufferAsprintf(&childrenBuf,
"<control granularity='%llu'",
gran_short_size);
if (min_short_size)
virBufferAsprintf(&childrenBuf, " min='%llu'", min_short_size);
virBufferAsprintf(&childrenBuf,
" unit='%s' type='%s' maxAllocs='%u'/>\n",
unit,
virCacheTypeToString(controls->scope),
controls->max_allocation);
}
virXMLFormatElement(buf, "bank", &attrBuf, &childrenBuf);
}
if (virCapabilitiesFormatResctrlMonitor(buf, cache->monitor) < 0)
return -1;
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</cache>\n");
return 0;
}
static int
virCapabilitiesFormatMemoryBandwidth(virBuffer *buf,
virCapsHostMemBW *memBW)
{
size_t i = 0;
if (!memBW->nnodes && !memBW->monitor)
return 0;
virBufferAddLit(buf, "<memory_bandwidth>\n");
virBufferAdjustIndent(buf, 2);
for (i = 0; i < memBW->nnodes; i++) {
g_auto(virBuffer) attrBuf = VIR_BUFFER_INITIALIZER;
g_auto(virBuffer) childrenBuf = VIR_BUFFER_INIT_CHILD(buf);
virCapsHostMemBWNode *node = memBW->nodes[i];
virResctrlInfoMemBWPerNode *control = &node->control;
g_autofree char *cpus_str = virBitmapFormat(node->cpus);
if (!cpus_str)
return -1;
virBufferAsprintf(&attrBuf,
" id='%u' cpus='%s'",
node->id, cpus_str);
virBufferAsprintf(&childrenBuf,
"<control granularity='%u' min='%u' "
"maxAllocs='%u'/>\n",
control->granularity, control->min,
control->max_allocation);
virXMLFormatElement(buf, "node", &attrBuf, &childrenBuf);
}
if (virCapabilitiesFormatResctrlMonitor(buf, memBW->monitor) < 0)
return -1;
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</memory_bandwidth>\n");
return 0;
}
static int
virCapabilitiesFormatHostXML(virCapsHost *host,
virBuffer *buf)
{
size_t i, j;
char host_uuid[VIR_UUID_STRING_BUFLEN];
/* The lack of some data means we have nothing
* minimally to format, so just return. */
if (!virUUIDIsValid(host->host_uuid) &&
!host->arch && !host->powerMgmt && !host->iommu)
return 0;
virBufferAddLit(buf, "<host>\n");
virBufferAdjustIndent(buf, 2);
if (virUUIDIsValid(host->host_uuid)) {
virUUIDFormat(host->host_uuid, host_uuid);
virBufferAsprintf(buf, "<uuid>%s</uuid>\n", host_uuid);
}
virBufferAddLit(buf, "<cpu>\n");
virBufferAdjustIndent(buf, 2);
if (host->arch)
virBufferAsprintf(buf, "<arch>%s</arch>\n",
virArchToString(host->arch));
if (host->nfeatures) {
virBufferAddLit(buf, "<features>\n");
virBufferAdjustIndent(buf, 2);
for (i = 0; i < host->nfeatures; i++) {
virBufferAsprintf(buf, "<%s/>\n",
host->features[i]);
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</features>\n");
}
virCPUDefFormatBuf(buf, host->cpu);
for (i = 0; i < host->nPagesSize; i++) {
virBufferAsprintf(buf, "<pages unit='KiB' size='%u'/>\n",
host->pagesSize[i]);
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</cpu>\n");
/* The PM query was successful. */
if (host->powerMgmt) {
/* The host supports some PM features. */
unsigned int pm = host->powerMgmt;
virBufferAddLit(buf, "<power_management>\n");
virBufferAdjustIndent(buf, 2);
while (pm) {
int bit = __builtin_ffs(pm) - 1;
virBufferAsprintf(buf, "<%s/>\n",
virCapsHostPMTargetTypeToString(bit));
pm &= ~(1U << bit);
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</power_management>\n");
} else {
/* The host does not support any PM feature. */
virBufferAddLit(buf, "<power_management/>\n");
}
virBufferAsprintf(buf, "<iommu support='%s'/>\n",
host->iommu ? "yes" : "no");
if (host->offlineMigrate) {
virBufferAddLit(buf, "<migration_features>\n");
virBufferAdjustIndent(buf, 2);
if (host->liveMigrate)
virBufferAddLit(buf, "<live/>\n");
if (host->nmigrateTrans) {
virBufferAddLit(buf, "<uri_transports>\n");
virBufferAdjustIndent(buf, 2);
for (i = 0; i < host->nmigrateTrans; i++) {
virBufferAsprintf(buf, "<uri_transport>%s</uri_transport>\n",
host->migrateTrans[i]);
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</uri_transports>\n");
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</migration_features>\n");
}
if (host->netprefix)
virBufferAsprintf(buf, "<netprefix>%s</netprefix>\n",
host->netprefix);
if (virCapabilitiesHostNUMAFormat(buf, host->numa) < 0)
return -1;
if (virCapabilitiesFormatCaches(buf, &host->cache) < 0)
return -1;
if (virCapabilitiesFormatMemoryBandwidth(buf, &host->memBW) < 0)
return -1;
for (i = 0; i < host->nsecModels; i++) {
virBufferAddLit(buf, "<secmodel>\n");
virBufferAdjustIndent(buf, 2);
virBufferAsprintf(buf, "<model>%s</model>\n",
host->secModels[i].model);
virBufferAsprintf(buf, "<doi>%s</doi>\n",
host->secModels[i].doi);
for (j = 0; j < host->secModels[i].nlabels; j++) {
virBufferAsprintf(buf, "<baselabel type='%s'>%s</baselabel>\n",
host->secModels[i].labels[j].type,
host->secModels[i].labels[j].label);
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</secmodel>\n");
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</host>\n\n");
return 0;
}
static void
virCapabilitiesFormatGuestFeatures(virCapsGuest *guest,
virBuffer *buf)
{
g_auto(virBuffer) childBuf = VIR_BUFFER_INIT_CHILD(buf);
size_t i;
for (i = 0; i < VIR_CAPS_GUEST_FEATURE_TYPE_LAST; i++) {
virCapsGuestFeature *feature = guest->features + i;
if (!feature->present)
continue;
virBufferAsprintf(&childBuf, "<%s", virCapsGuestFeatureInfos[i].name);
if (feature->defaultOn) {
virBufferAsprintf(&childBuf, " default='%s'",
virTristateSwitchTypeToString(feature->defaultOn));
}
if (feature->toggle) {
virBufferAsprintf(&childBuf, " toggle='%s'",
virTristateBoolTypeToString(feature->toggle));
}
virBufferAddLit(&childBuf, "/>\n");
}
virXMLFormatElement(buf, "features", NULL, &childBuf);
}
static void
virCapabilitiesFormatGuestXML(virCapsGuest **guests,
size_t nguests,
virBuffer *buf)
{
size_t i, j, k;
for (i = 0; i < nguests; i++) {
virBufferAddLit(buf, "<guest>\n");
virBufferAdjustIndent(buf, 2);
virBufferAsprintf(buf, "<os_type>%s</os_type>\n",
virDomainOSTypeToString(guests[i]->ostype));
if (guests[i]->arch.id)
virBufferAsprintf(buf, "<arch name='%s'>\n",
virArchToString(guests[i]->arch.id));
virBufferAdjustIndent(buf, 2);
virBufferAsprintf(buf, "<wordsize>%d</wordsize>\n",
guests[i]->arch.wordsize);
if (guests[i]->arch.defaultInfo.emulator)
virBufferAsprintf(buf, "<emulator>%s</emulator>\n",
guests[i]->arch.defaultInfo.emulator);
if (guests[i]->arch.defaultInfo.loader)
virBufferAsprintf(buf, "<loader>%s</loader>\n",
guests[i]->arch.defaultInfo.loader);
for (j = 0; j < guests[i]->arch.defaultInfo.nmachines; j++) {
virCapsGuestMachine *machine = guests[i]->arch.defaultInfo.machines[j];
virBufferAddLit(buf, "<machine");
if (machine->canonical)
virBufferAsprintf(buf, " canonical='%s'", machine->canonical);
if (machine->maxCpus > 0)
virBufferAsprintf(buf, " maxCpus='%d'", machine->maxCpus);
if (machine->deprecated)
virBufferAddLit(buf, " deprecated='yes'");
virBufferAsprintf(buf, ">%s</machine>\n", machine->name);
}
for (j = 0; j < guests[i]->arch.ndomains; j++) {
virBufferAsprintf(buf, "<domain type='%s'",
virDomainVirtTypeToString(guests[i]->arch.domains[j]->type));
if (!guests[i]->arch.domains[j]->info.emulator &&
!guests[i]->arch.domains[j]->info.loader &&
!guests[i]->arch.domains[j]->info.nmachines) {
virBufferAddLit(buf, "/>\n");
continue;
}
virBufferAddLit(buf, ">\n");
virBufferAdjustIndent(buf, 2);
if (guests[i]->arch.domains[j]->info.emulator)
virBufferAsprintf(buf, "<emulator>%s</emulator>\n",
guests[i]->arch.domains[j]->info.emulator);
if (guests[i]->arch.domains[j]->info.loader)
virBufferAsprintf(buf, "<loader>%s</loader>\n",
guests[i]->arch.domains[j]->info.loader);
for (k = 0; k < guests[i]->arch.domains[j]->info.nmachines; k++) {
virCapsGuestMachine *machine = guests[i]->arch.domains[j]->info.machines[k];
virBufferAddLit(buf, "<machine");
if (machine->canonical)
virBufferAsprintf(buf, " canonical='%s'", machine->canonical);
if (machine->maxCpus > 0)
virBufferAsprintf(buf, " maxCpus='%d'", machine->maxCpus);
virBufferAsprintf(buf, ">%s</machine>\n", machine->name);
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</domain>\n");
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</arch>\n");
virCapabilitiesFormatGuestFeatures(guests[i], buf);
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</guest>\n\n");
}
}
static void
virCapabilitiesFormatStoragePoolXML(virCapsStoragePool **pools,
size_t npools,
virBuffer *buf)
{
size_t i;
if (npools == 0)
return;
virBufferAddLit(buf, "<pool>\n");
virBufferAdjustIndent(buf, 2);
virBufferAddLit(buf, "<enum name='type'>\n");
virBufferAdjustIndent(buf, 2);
for (i = 0; i < npools; i++)
virBufferAsprintf(buf, "<value>%s</value>\n",
virStoragePoolTypeToString(pools[i]->type));
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</enum>\n");
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</pool>\n\n");
}
/**
* virCapabilitiesFormatXML:
* @caps: capabilities to format
*
* Convert the capabilities object into an XML representation
*
* Returns the XML document as a string
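*
* Illustrative use (the caller owns and frees the returned string):
*
*   g_autofree char *xml = virCapabilitiesFormatXML(caps);
*   if (!xml)
*       return -1;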
*/
char *
virCapabilitiesFormatXML(virCaps *caps)
{
g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
virBufferAddLit(&buf, "<capabilities>\n\n");
virBufferAdjustIndent(&buf, 2);
if (virCapabilitiesFormatHostXML(&caps->host, &buf) < 0)
return NULL;
virCapabilitiesFormatGuestXML(caps->guests, caps->nguests, &buf);
virCapabilitiesFormatStoragePoolXML(caps->pools, caps->npools, &buf);
virBufferAdjustIndent(&buf, -2);
virBufferAddLit(&buf, "</capabilities>\n");
return virBufferContentAndReset(&buf);
}
/* get the maximum ID of cpus in the host */
static unsigned int
virCapabilitiesHostNUMAGetMaxcpu(virCapsHostNUMA *caps)
{
unsigned int maxcpu = 0;
size_t node;
size_t cpu;
for (node = 0; node < caps->cells->len; node++) {
virCapsHostNUMACell *cell = g_ptr_array_index(caps->cells, node);
for (cpu = 0; cpu < cell->ncpus; cpu++) {
if (cell->cpus[cpu].id > maxcpu)
maxcpu = cell->cpus[cpu].id;
}
}
return maxcpu;
}
/* set cpus of a numa node in the bitmask */
static int
virCapabilitiesHostNUMAGetCellCpus(virCapsHostNUMA *caps,
size_t node,
virBitmap *cpumask)
{
virCapsHostNUMACell *cell = NULL;
size_t cpu;
size_t i;
/* The numa node numbers can be non-contiguous. Ex: 0,1,16,17. */
for (i = 0; i < caps->cells->len; i++) {
cell = g_ptr_array_index(caps->cells, i);
if (cell->num == node)
break;
cell = NULL;
}
for (cpu = 0; cell && cpu < cell->ncpus; cpu++) {
if (virBitmapSetBit(cpumask, cell->cpus[cpu].id) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Cpu '%1$u' in node '%2$zu' is out of range of the provided bitmap"),
cell->cpus[cpu].id, node);
return -1;
}
}
return 0;
}
virBitmap *
virCapabilitiesHostNUMAGetCpus(virCapsHostNUMA *caps,
virBitmap *nodemask)
{
g_autoptr(virBitmap) ret = NULL;
unsigned int maxcpu = virCapabilitiesHostNUMAGetMaxcpu(caps);
ssize_t node = -1;
ret = virBitmapNew(maxcpu + 1);
while ((node = virBitmapNextSetBit(nodemask, node)) >= 0) {
if (virCapabilitiesHostNUMAGetCellCpus(caps, node, ret) < 0)
return NULL;
}
return g_steal_pointer(&ret);
}
int
virCapabilitiesHostNUMAGetMaxNode(virCapsHostNUMA *caps)
{
virCapsHostNUMACell *cell = g_ptr_array_index(caps->cells, caps->cells->len - 1);
return cell->num;
}
int
virCapabilitiesGetNodeInfo(virNodeInfoPtr nodeinfo)
{
virArch hostarch = virArchFromHost();
unsigned long long memorybytes;
memset(nodeinfo, 0, sizeof(*nodeinfo));
if (virStrcpyStatic(nodeinfo->model, virArchToString(hostarch)) < 0)
return -1;
if (virHostMemGetInfo(&memorybytes, NULL) < 0)
return -1;
nodeinfo->memory = memorybytes / 1024;
if (virHostCPUGetInfo(hostarch,
&nodeinfo->cpus, &nodeinfo->mhz,
&nodeinfo->nodes, &nodeinfo->sockets,
&nodeinfo->cores, &nodeinfo->threads) < 0)
return -1;
return 0;
}
/* returns 1 on success, 0 if the detection failed and -1 on hard error */
static int
virCapabilitiesFillCPUInfo(int cpu_id G_GNUC_UNUSED,
virCapsHostNUMACellCPU *cpu G_GNUC_UNUSED)
{
#ifdef __linux__
cpu->id = cpu_id;
if (virHostCPUGetSocket(cpu_id, &cpu->socket_id) < 0 ||
virHostCPUGetDie(cpu_id, &cpu->die_id) < 0 ||
virHostCPUGetCore(cpu_id, &cpu->core_id) < 0)
return -1;
if (!(cpu->siblings = virHostCPUGetSiblingsList(cpu_id)))
return -1;
return 0;
#else
virReportError(VIR_ERR_NO_SUPPORT, "%s",
_("node cpu info not implemented on this platform"));
return -1;
#endif
}
static int
virCapabilitiesGetNUMADistances(int node,
virNumaDistance **distancesRet,
int *ndistancesRet)
{
g_autofree virNumaDistance *tmp = NULL;
int tmp_size = 0;
g_autofree int *distances = NULL;
int ndistances = 0;
size_t i;
if (virNumaGetDistances(node, &distances, &ndistances) < 0)
return -1;
if (!distances) {
*distancesRet = NULL;
*ndistancesRet = 0;
return 0;
}
tmp = g_new0(virNumaDistance, ndistances);
for (i = 0; i < ndistances; i++) {
if (!distances[i])
continue;
tmp[tmp_size].cellid = i;
tmp[tmp_size].value = distances[i];
tmp_size++;
}
VIR_REALLOC_N(tmp, tmp_size);
*ndistancesRet = tmp_size;
*distancesRet = g_steal_pointer(&tmp);
tmp_size = 0;
return 0;
}
static int
virCapabilitiesGetNUMAPagesInfo(int node,
virCapsHostNUMACellPageInfo **pageinfo,
int *npageinfo)
{
g_autofree unsigned int *pages_size = NULL;
g_autofree unsigned long long *pages_avail = NULL;
size_t npages, i;
if (virNumaGetPages(node, &pages_size, &pages_avail, NULL, &npages) < 0)
return -1;
*pageinfo = g_new0(virCapsHostNUMACellPageInfo, npages);
*npageinfo = npages;
for (i = 0; i < npages; i++) {
(*pageinfo)[i].size = pages_size[i];
(*pageinfo)[i].avail = pages_avail[i];
}
return 0;
}
static int
virCapabilitiesGetNodeCacheReadFileUint(const char *prefix,
const char *dir,
const char *file,
unsigned int *value)
{
g_autofree char *path = g_build_filename(prefix, dir, file, NULL);
int rv = virFileReadValueUint(value, "%s", path);
if (rv < 0) {
if (rv == -2) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("File '%1$s' does not exist"),
path);
}
return -1;
}
return 0;
}
static int
virCapabilitiesGetNodeCacheReadFileUllong(const char *prefix,
const char *dir,
const char *file,
unsigned long long *value)
{
g_autofree char *path = g_build_filename(prefix, dir, file, NULL);
int rv = virFileReadValueUllong(value, "%s", path);
if (rv < 0) {
if (rv == -2) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("File '%1$s' does not exist"),
path);
}
return -1;
}
return 0;
}
static int
virCapsHostNUMACellCacheComparator(const void *a,
const void *b)
{
const virNumaCache *aa = a;
const virNumaCache *bb = b;
return aa->level - bb->level;
}
static int
virCapabilitiesGetNodeCache(int node,
GArray **cachesRet)
{
g_autoptr(DIR) dir = NULL;
int direrr = 0;
struct dirent *entry;
g_autofree char *path = NULL;
g_autoptr(GArray) caches = g_array_new(FALSE, FALSE, sizeof(virNumaCache));
path = g_strdup_printf(SYSFS_SYSTEM_PATH "/node/node%d/memory_side_cache", node);
if (virDirOpenIfExists(&dir, path) < 0)
return -1;
while (dir && (direrr = virDirRead(dir, &entry, path)) > 0) {
const char *dname = STRSKIP(entry->d_name, "index");
virNumaCache cache = { 0 };
unsigned int indexing;
unsigned int write_policy;
if (!dname)
continue;
if (virStrToLong_ui(dname, NULL, 10, &cache.level) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("unable to parse %1$s"),
entry->d_name);
return -1;
}
if (virCapabilitiesGetNodeCacheReadFileUllong(path, entry->d_name,
"size", &cache.size) < 0)
return -1;
cache.size >>= 10; /* read in bytes but stored in kibibytes */
if (virCapabilitiesGetNodeCacheReadFileUint(path, entry->d_name,
"line_size", &cache.line) < 0)
return -1;
if (virCapabilitiesGetNodeCacheReadFileUint(path, entry->d_name,
"indexing", &indexing) < 0)
return -1;
/* see enum cache_indexing in kernel */
switch (indexing) {
case 0: cache.associativity = VIR_NUMA_CACHE_ASSOCIATIVITY_DIRECT; break;
case 1: cache.associativity = VIR_NUMA_CACHE_ASSOCIATIVITY_FULL; break;
case 2: cache.associativity = VIR_NUMA_CACHE_ASSOCIATIVITY_NONE; break;
default:
virReportError(VIR_ERR_INTERNAL_ERROR,
_("unknown indexing value '%1$u'"),
indexing);
return -1;
}
if (virCapabilitiesGetNodeCacheReadFileUint(path, entry->d_name,
"write_policy", &write_policy) < 0)
return -1;
/* see enum cache_write_policy in kernel */
switch (write_policy) {
case 0: cache.policy = VIR_NUMA_CACHE_POLICY_WRITEBACK; break;
case 1: cache.policy = VIR_NUMA_CACHE_POLICY_WRITETHROUGH; break;
case 2: cache.policy = VIR_NUMA_CACHE_POLICY_NONE; break;
default:
virReportError(VIR_ERR_INTERNAL_ERROR,
_("unknown write_policy value '%1$u'"),
write_policy);
return -1;
}
g_array_append_val(caches, cache);
}
if (direrr < 0)
return -1;
if (caches->len > 0) {
g_array_sort(caches, virCapsHostNUMACellCacheComparator);
*cachesRet = g_steal_pointer(&caches);
} else {
*cachesRet = NULL;
}
return 0;
}
static int
virCapabilitiesHostNUMAInitFake(virCapsHostNUMA *caps)
{
virNodeInfo nodeinfo;
virCapsHostNUMACellCPU *cpus;
int ncpus;
int n, s, c, t;
int id, cid;
int onlinecpus G_GNUC_UNUSED;
bool tmp;
if (virCapabilitiesGetNodeInfo(&nodeinfo) < 0)
return -1;
ncpus = VIR_NODEINFO_MAXCPUS(nodeinfo);
id = 0;
for (n = 0; n < nodeinfo.nodes; n++) {
int nodecpus = nodeinfo.sockets * nodeinfo.cores * nodeinfo.threads;
cid = 0;
cpus = g_new0(virCapsHostNUMACellCPU, nodecpus);
for (s = 0; s < nodeinfo.sockets; s++) {
for (c = 0; c < nodeinfo.cores; c++) {
g_autoptr(virBitmap) siblings = virBitmapNew(ncpus);
for (t = 0; t < nodeinfo.threads; t++)
ignore_value(virBitmapSetBit(siblings, id + t));
for (t = 0; t < nodeinfo.threads; t++) {
if (virHostCPUGetOnline(id, &tmp) < 0)
goto error;
if (tmp) {
cpus[cid].id = id;
cpus[cid].die_id = 0;
cpus[cid].socket_id = s;
cpus[cid].core_id = c;
cpus[cid].siblings = virBitmapNewCopy(siblings);
cid++;
}
id++;
}
}
}
virCapabilitiesHostNUMAAddCell(caps, 0,
nodeinfo.memory,
cid, &cpus,
0, NULL,
0, NULL,
NULL);
}
return 0;
error:
for (; cid >= 0; cid--)
virBitmapFree(cpus[cid].siblings);
VIR_FREE(cpus);
return -1;
}
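/* Illustrative example (hypothetical topology, not taken from a real host):
 * with nodes=2, sockets=1, cores=2, threads=2 the loops above assign logical
 * CPU ids 0-3 to the first cell and 4-7 to the second, with sibling maps
 * '0-1', '2-3', '4-5' and '6-7'. Every cell is added with id 0 and the full
 * nodeinfo.memory value, so the reported cell ids and per-cell memory are
 * only approximations of the real host layout. */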
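/* Append the four interconnect records describing the link from @initiator
 * to @target: read bandwidth, write bandwidth, read latency and write
 * latency. The cache member is left at 0, i.e. the values describe access
 * to the target node's main memory rather than a memory-side cache. */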
static void
virCapabilitiesHostInsertHMAT(GArray *interconnects,
unsigned int initiator,
unsigned int target,
unsigned int read_bandwidth,
unsigned int write_bandwidth,
unsigned int read_latency,
unsigned int write_latency)
{
virNumaInterconnect ni;
ni = (virNumaInterconnect) { VIR_NUMA_INTERCONNECT_TYPE_BANDWIDTH,
initiator, target, 0, VIR_MEMORY_LATENCY_READ, read_bandwidth};
g_array_append_val(interconnects, ni);
ni = (virNumaInterconnect) { VIR_NUMA_INTERCONNECT_TYPE_BANDWIDTH,
initiator, target, 0, VIR_MEMORY_LATENCY_WRITE, write_bandwidth};
g_array_append_val(interconnects, ni);
ni = (virNumaInterconnect) { VIR_NUMA_INTERCONNECT_TYPE_LATENCY,
initiator, target, 0, VIR_MEMORY_LATENCY_READ, read_latency};
g_array_append_val(interconnects, ni);
ni = (virNumaInterconnect) { VIR_NUMA_INTERCONNECT_TYPE_LATENCY,
initiator, target, 0, VIR_MEMORY_LATENCY_WRITE, write_latency};
g_array_append_val(interconnects, ni);
}
static int
virCapabilitiesHostNUMAInitInterconnectsNode(GArray *interconnects,
unsigned int node)
{
g_autofree char *path = NULL;
g_autofree char *initPath = NULL;
g_autoptr(DIR) dir = NULL;
int direrr = 0;
struct dirent *entry;
unsigned int read_bandwidth;
unsigned int write_bandwidth;
unsigned int read_latency;
unsigned int write_latency;
/* Unfortunately, the kernel does not expose the full HMAT table in a
* parseable form. It is available in binary form under
* /sys/firmware/acpi/tables/HMAT, but we don't want to parse that.
* Some important info is still exposed, though, under the "access0" and
* "access1" directories. The former contains the best interconnect to a
* given node including CPUs and devices that might do I/O (such as GPUs
* and NICs); the latter considers only CPUs. Stick with access1 until
* sysfs exposes the full table in a sensible way.
* NB: on most systems access0 and access1 contain the same values. */
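/* For illustration, the sysfs layout consumed below (documented in the
 * kernel's Documentation/admin-guide/mm/numaperf.rst) looks like:
 *   /sys/devices/system/node/node1/access1/initiators/node0 -> ../../node0
 *   /sys/devices/system/node/node1/access1/initiators/read_bandwidth
 *   /sys/devices/system/node/node1/access1/initiators/read_latency
 *   /sys/devices/system/node/node1/access1/initiators/write_bandwidth
 *   /sys/devices/system/node/node1/access1/initiators/write_latency
 * i.e. node0 is an initiator for target node1 and the four value files
 * hold the interconnect attributes read by this function. */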
path = g_strdup_printf(SYSFS_SYSTEM_PATH "/node/node%d/access1", node);
if (!virFileExists(path))
return 0;
if (virCapabilitiesGetNodeCacheReadFileUint(path, "initiators",
"read_bandwidth",
&read_bandwidth) < 0)
return -1;
if (virCapabilitiesGetNodeCacheReadFileUint(path, "initiators",
"write_bandwidth",
&write_bandwidth) < 0)
return -1;
/* Bandwidths are read in MiB but stored in KiB */
read_bandwidth <<= 10;
write_bandwidth <<= 10;
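/* e.g. a reported read_bandwidth of 20480 MiB becomes 20971520 KiB after the shift */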
if (virCapabilitiesGetNodeCacheReadFileUint(path, "initiators",
"read_latency",
&read_latency) < 0)
return -1;
if (virCapabilitiesGetNodeCacheReadFileUint(path, "initiators",
"write_latency",
&write_latency) < 0)
return -1;
initPath = g_strdup_printf("%s/initiators", path);
if (virDirOpen(&dir, initPath) < 0)
return -1;
while ((direrr = virDirRead(dir, &entry, path)) > 0) {
const char *dname = STRSKIP(entry->d_name, "node");
unsigned int initNode;
if (!dname)
continue;
if (virStrToLong_ui(dname, NULL, 10, &initNode) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("unable to parse %1$s"),
entry->d_name);
return -1;
}
virCapabilitiesHostInsertHMAT(interconnects,
initNode, node,
read_bandwidth,
write_bandwidth,
read_latency,
write_latency);
}
return 0;
}
static int
virCapsHostNUMAInterconnectComparator(const void *a,
const void *b)
{
const virNumaInterconnect *aa = a;
const virNumaInterconnect *bb = b;
if (aa->type != bb->type)
return aa->type - bb->type;
if (aa->initiator != bb->initiator)
return aa->initiator - bb->initiator;
if (aa->target != bb->target)
return aa->target - bb->target;
if (aa->cache != bb->cache)
return aa->cache - bb->cache;
if (aa->accessType != bb->accessType)
return aa->accessType - bb->accessType;
return aa->value - bb->value;
}
static int
virCapabilitiesHostNUMAInitInterconnects(virCapsHostNUMA *caps)
{
g_autoptr(DIR) dir = NULL;
int direrr = 0;
struct dirent *entry;
const char *path = SYSFS_SYSTEM_PATH "/node/";
g_autoptr(GArray) interconnects = g_array_new(FALSE, FALSE, sizeof(virNumaInterconnect));
if (virDirOpenIfExists(&dir, path) < 0)
return -1;
while (dir && (direrr = virDirRead(dir, &entry, path)) > 0) {
const char *dname = STRSKIP(entry->d_name, "node");
unsigned int node;
if (!dname)
continue;
if (virStrToLong_ui(dname, NULL, 10, &node) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("unable to parse %1$s"),
entry->d_name);
return -1;
}
if (virCapabilitiesHostNUMAInitInterconnectsNode(interconnects, node) < 0)
return -1;
}
if (interconnects->len > 0) {
g_array_sort(interconnects, virCapsHostNUMAInterconnectComparator);
caps->interconnects = g_steal_pointer(&interconnects);
}
return 0;
}
static int
virCapabilitiesHostNUMAInitReal(virCapsHostNUMA *caps)
{
int n;
virCapsHostNUMACellCPU *cpus = NULL;
int ret = -1;
int ncpus = 0;
int max_node;
if ((max_node = virNumaGetMaxNode()) < 0)
goto cleanup;
for (n = 0; n <= max_node; n++) {
g_autoptr(virBitmap) cpumap = NULL;
g_autofree virNumaDistance *distances = NULL;
int ndistances = 0;
g_autofree virCapsHostNUMACellPageInfo *pageinfo = NULL;
int npageinfo = 0;
unsigned long long memory;
g_autoptr(GArray) caches = NULL;
int cpu;
size_t i;
if ((ncpus = virNumaGetNodeCPUs(n, &cpumap)) < 0) {
if (ncpus == -2)
continue;
ncpus = 0;
goto cleanup;
}
cpus = g_new0(virCapsHostNUMACellCPU, ncpus);
cpu = 0;
for (i = 0; i < virBitmapSize(cpumap); i++) {
if (virBitmapIsBitSet(cpumap, i)) {
if (virCapabilitiesFillCPUInfo(i, cpus + cpu++) < 0)
goto cleanup;
}
}
if (virCapabilitiesGetNUMADistances(n, &distances, &ndistances) < 0)
goto cleanup;
if (virCapabilitiesGetNUMAPagesInfo(n, &pageinfo, &npageinfo) < 0)
goto cleanup;
if (virCapabilitiesGetNodeCache(n, &caches) < 0)
goto cleanup;
/* Detect the amount of memory in the NUMA cell; virNumaGetNodeMemory reports bytes, so shift to get KiB */
virNumaGetNodeMemory(n, &memory, NULL);
memory >>= 10;
virCapabilitiesHostNUMAAddCell(caps, n, memory,
ncpus, &cpus,
ndistances, &distances,
npageinfo, &pageinfo,
&caches);
}
if (virCapabilitiesHostNUMAInitInterconnects(caps) < 0)
goto cleanup;
ret = 0;
cleanup:
virCapabilitiesClearHostNUMACellCPUTopology(cpus, ncpus);
VIR_FREE(cpus);
return ret;
}
virCapsHostNUMA *
virCapabilitiesHostNUMANew(void)
{
virCapsHostNUMA *caps = NULL;
caps = g_new0(virCapsHostNUMA, 1);
caps->refs = 1;
caps->cells = g_ptr_array_new_with_free_func(
(GDestroyNotify)virCapabilitiesFreeHostNUMACell);
return caps;
}
virCapsHostNUMA *
virCapabilitiesHostNUMANewHost(void)
{
virCapsHostNUMA *caps = virCapabilitiesHostNUMANew();
if (virNumaIsAvailable()) {
if (virCapabilitiesHostNUMAInitReal(caps) == 0)
return caps;
virCapabilitiesHostNUMAUnref(caps);
caps = virCapabilitiesHostNUMANew();
VIR_WARN("Failed to query host NUMA topology, faking single NUMA node");
}
if (virCapabilitiesHostNUMAInitFake(caps) < 0) {
virCapabilitiesHostNUMAUnref(caps);
return NULL;
}
return caps;
}
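/* Typical usage (illustrative sketch, not a definitive call site): a driver
 * building its capabilities would do something like
 *
 *     if (!(caps->host.numa = virCapabilitiesHostNUMANewHost()))
 *         return -1;
 *
 * i.e. real host NUMA data is preferred and the fake topology derived from
 * virNodeInfo is only a fallback when NUMA is unavailable or querying the
 * host fails. */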
int
virCapabilitiesInitPages(virCaps *caps)
{
g_autofree unsigned int *pages_size = NULL;
size_t npages;
if (virNumaGetPages(-1 /* Magic constant for overall info */,
&pages_size, NULL, NULL, &npages) < 0)
return -1;
caps->host.pagesSize = g_steal_pointer(&pages_size);
caps->host.nPagesSize = npages;
npages = 0;
return 0;
}
bool
virCapsHostCacheBankEquals(virCapsHostCacheBank *a,
virCapsHostCacheBank *b)
{
return (a->id == b->id &&
a->level == b->level &&
a->type == b->type &&
a->size == b->size &&
virBitmapEqual(a->cpus, b->cpus));
}
void
virCapsHostCacheBankFree(virCapsHostCacheBank *ptr)
{
size_t i;
if (!ptr)
return;
virBitmapFree(ptr->cpus);
for (i = 0; i < ptr->ncontrols; i++)
g_free(ptr->controls[i]);
g_free(ptr->controls);
g_free(ptr);
}
static int
virCapsHostCacheBankSorter(const void *a,
const void *b)
{
virCapsHostCacheBank *ca = *(virCapsHostCacheBank **)a;
virCapsHostCacheBank *cb = *(virCapsHostCacheBank **)b;
if (ca->level < cb->level)
return -1;
if (ca->level > cb->level)
return 1;
return ca->id - cb->id;
}
static int
virCapabilitiesInitResctrl(virCaps *caps)
{
if (caps->host.resctrl)
return 0;
caps->host.resctrl = virResctrlInfoNew();
if (!caps->host.resctrl)
return -1;
return 0;
}
static int
virCapabilitiesInitResctrlMemory(virCaps *caps)
{
virCapsHostMemBWNode *node = NULL;
size_t i = 0;
int ret = -1;
const virResctrlMonitorType montype = VIR_RESCTRL_MONITOR_TYPE_MEMBW;
const char *prefix = virResctrlMonitorPrefixTypeToString(montype);
for (i = 0; i < caps->host.cache.nbanks; i++) {
virCapsHostCacheBank *bank = caps->host.cache.banks[i];
node = g_new0(virCapsHostMemBWNode, 1);
if (virResctrlInfoGetMemoryBandwidth(caps->host.resctrl,
bank->level, &node->control) > 0) {
node->id = bank->id;
node->cpus = virBitmapNewCopy(bank->cpus);
VIR_APPEND_ELEMENT(caps->host.memBW.nodes, caps->host.memBW.nnodes, node);
}
g_clear_pointer(&node, virCapsHostMemBWNodeFree);
}
if (virResctrlInfoGetMonitorPrefix(caps->host.resctrl, prefix,
&caps->host.memBW.monitor) < 0)
goto cleanup;
ret = 0;
cleanup:
virCapsHostMemBWNodeFree(node);
return ret;
}
int
virCapabilitiesInitCaches(virCaps *caps)
{
size_t i = 0;
g_autoptr(virBitmap) cpus = NULL;
ssize_t pos = -1;
struct dirent *ent = NULL;
const virResctrlMonitorType montype = VIR_RESCTRL_MONITOR_TYPE_CACHE;
const char *prefix = virResctrlMonitorPrefixTypeToString(montype);
/* Minimum level to expose in capabilities. Can be lowered or removed (with
* the appropriate code below), but should not be increased, because we'd
* lose information. */
const int cache_min_level = 3;
if (virCapabilitiesInitResctrl(caps) < 0)
return -1;
/* offline CPUs don't provide cache info */
if (virFileReadValueBitmap(&cpus, "%s/cpu/online", SYSFS_SYSTEM_PATH) < 0)
return -1;
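/* The loop below walks the sysfs cache hierarchy; for illustration, the
 * files consumed per bank are:
 *   /sys/devices/system/cpu/cpuN/cache/indexM/level
 *   /sys/devices/system/cpu/cpuN/cache/indexM/id
 *   /sys/devices/system/cpu/cpuN/cache/indexM/type
 *   /sys/devices/system/cpu/cpuN/cache/indexM/size
 *   /sys/devices/system/cpu/cpuN/cache/indexM/shared_cpu_list
 * where N is the online CPU number and M the cache index entry. */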
while ((pos = virBitmapNextSetBit(cpus, pos)) >= 0) {
int rv = -1;
g_autoptr(DIR) dirp = NULL;
g_autofree char *path = g_strdup_printf("%s/cpu/cpu%zd/cache/", SYSFS_SYSTEM_PATH, pos);
rv = virDirOpenIfExists(&dirp, path);
if (rv < 0)
return -1;
if (!dirp)
continue;
while ((rv = virDirRead(dirp, &ent, path)) > 0) {
g_autofree char *type = NULL;
g_autoptr(virCapsHostCacheBank) bank = NULL;
int kernel_type;
unsigned int level;
int ret;
if (!STRPREFIX(ent->d_name, "index"))
continue;
if (virFileReadValueUint(&level,
"%s/cpu/cpu%zd/cache/%s/level",
SYSFS_SYSTEM_PATH, pos, ent->d_name) < 0)
return -1;
if (level < cache_min_level)
continue;
bank = g_new0(virCapsHostCacheBank, 1);
bank->level = level;
ret = virFileReadValueUint(&bank->id,
"%s/cpu/cpu%zd/cache/%s/id",
SYSFS_SYSTEM_PATH, pos, ent->d_name);
if (ret == -2) {
VIR_DEBUG("CPU %zd cache %s 'id' missing", pos, ent->d_name);
continue;
}
if (ret < 0)
return -1;
ret = virFileReadValueUint(&bank->level,
"%s/cpu/cpu%zd/cache/%s/level",
SYSFS_SYSTEM_PATH, pos, ent->d_name);
if (ret == -2) {
VIR_DEBUG("CPU %zd cache %s 'level' missing", pos, ent->d_name);
continue;
}
if (ret < 0)
return -1;
ret = virFileReadValueString(&type,
"%s/cpu/cpu%zd/cache/%s/type",
SYSFS_SYSTEM_PATH, pos, ent->d_name);
if (ret == -2) {
VIR_DEBUG("CPU %zd cache %s 'type' missing", pos, ent->d_name);
continue;
}
if (ret < 0)
return -1;
ret = virFileReadValueScaledInt(&bank->size,
"%s/cpu/cpu%zd/cache/%s/size",
SYSFS_SYSTEM_PATH, pos, ent->d_name);
if (ret == -2) {
VIR_DEBUG("CPU %zd cache %s 'size' missing", pos, ent->d_name);
continue;
}
if (ret < 0)
return -1;
ret = virFileReadValueBitmap(&bank->cpus,
"%s/cpu/cpu%zd/cache/%s/shared_cpu_list",
SYSFS_SYSTEM_PATH, pos, ent->d_name);
if (ret == -2) {
VIR_DEBUG("CPU %zd cache %s 'shared_cpu_list' missing", pos, ent->d_name);
continue;
}
if (ret < 0)
return -1;
kernel_type = virCacheKernelTypeFromString(type);
if (kernel_type < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Unknown cache type '%1$s'"), type);
return -1;
}
bank->type = kernel_type;
for (i = 0; i < caps->host.cache.nbanks; i++) {
if (virCapsHostCacheBankEquals(bank, caps->host.cache.banks[i]))
break;
}
if (i == caps->host.cache.nbanks) {
/* If it is a new cache, then update its resctrl information. */
if (virResctrlInfoGetCache(caps->host.resctrl,
bank->level,
bank->size,
&bank->ncontrols,
&bank->controls) < 0)
return -1;
VIR_APPEND_ELEMENT(caps->host.cache.banks, caps->host.cache.nbanks, bank);
}
}
if (rv < 0)
return -1;
}
/* Sort the array in order for the tests to be predictable. This way we can
* still traverse the directory instead of guessing names (in case there is
* 'index1' and 'index3' but no 'index2'). */
if (caps->host.cache.banks) {
qsort(caps->host.cache.banks, caps->host.cache.nbanks,
sizeof(*caps->host.cache.banks), virCapsHostCacheBankSorter);
}
if (virCapabilitiesInitResctrlMemory(caps) < 0)
return -1;
if (virResctrlInfoGetMonitorPrefix(caps->host.resctrl, prefix,
&caps->host.cache.monitor) < 0)
return -1;
return 0;
}
void
virCapabilitiesHostInitIOMMU(virCaps *caps)
{
caps->host.iommu = virHostHasIOMMU();
}