mirror of
https://gitlab.com/libvirt/libvirt.git
synced 2025-01-18 18:45:16 +00:00
d18a5f716a
By making use of GNU C's cleanup attribute handled by the VIR_AUTOPTR macro for declaring aggregate pointer variables, majority of the calls to *Free functions can be dropped, which in turn leads to getting rid of most of our cleanup sections. Signed-off-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com> Reviewed-by: Erik Skultety <eskultet@redhat.com>
998 lines
27 KiB
C
998 lines
27 KiB
C
/*
|
|
* virnuma.c: helper APIs for managing numa
|
|
*
|
|
* Copyright (C) 2011-2014 Red Hat, Inc.
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this library. If not, see
|
|
* <http://www.gnu.org/licenses/>.
|
|
*
|
|
*/
|
|
|
|
#include <config.h>
|
|
|
|
#define NUMA_MAX_N_CPUS 4096
|
|
|
|
#if WITH_NUMACTL
|
|
# define NUMA_VERSION1_COMPATIBILITY 1
|
|
# include <numa.h>
|
|
|
|
# if LIBNUMA_API_VERSION > 1
|
|
# undef NUMA_MAX_N_CPUS
|
|
# define NUMA_MAX_N_CPUS (numa_all_cpus_ptr->size)
|
|
# endif
|
|
|
|
#endif /* WITH_NUMACTL */
|
|
|
|
#include <sys/types.h>
|
|
#include <dirent.h>
|
|
|
|
#include "virnuma.h"
|
|
#include "vircommand.h"
|
|
#include "virerror.h"
|
|
#include "virlog.h"
|
|
#include "viralloc.h"
|
|
#include "virbitmap.h"
|
|
#include "virstring.h"
|
|
#include "virfile.h"
|
|
#include "virhostmem.h"
|
|
|
|
#define VIR_FROM_THIS VIR_FROM_NONE
|
|
|
|
VIR_LOG_INIT("util.numa");
|
|
|
|
|
|
#if HAVE_NUMAD
|
|
char *
|
|
virNumaGetAutoPlacementAdvice(unsigned short vcpus,
|
|
unsigned long long balloon)
|
|
{
|
|
VIR_AUTOPTR(virCommand) cmd = NULL;
|
|
char *output = NULL;
|
|
|
|
cmd = virCommandNewArgList(NUMAD, "-w", NULL);
|
|
virCommandAddArgFormat(cmd, "%d:%llu", vcpus,
|
|
VIR_DIV_UP(balloon, 1024));
|
|
|
|
virCommandSetOutputBuffer(cmd, &output);
|
|
|
|
if (virCommandRun(cmd, NULL) < 0)
|
|
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
|
|
_("Failed to query numad for the "
|
|
"advisory nodeset"));
|
|
|
|
return output;
|
|
}
|
|
#else /* !HAVE_NUMAD */
|
|
char *
|
|
virNumaGetAutoPlacementAdvice(unsigned short vcpus ATTRIBUTE_UNUSED,
|
|
unsigned long long balloon ATTRIBUTE_UNUSED)
|
|
{
|
|
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
|
|
_("numad is not available on this host"));
|
|
return NULL;
|
|
}
|
|
#endif /* !HAVE_NUMAD */
|
|
|
|
#if WITH_NUMACTL
|
|
int
|
|
virNumaSetupMemoryPolicy(virDomainNumatuneMemMode mode,
|
|
virBitmapPtr nodeset)
|
|
{
|
|
nodemask_t mask;
|
|
int node = -1;
|
|
int ret = -1;
|
|
int bit = 0;
|
|
size_t i;
|
|
int maxnode = 0;
|
|
|
|
if (!nodeset)
|
|
return 0;
|
|
|
|
if (!virNumaNodesetIsAvailable(nodeset))
|
|
return -1;
|
|
|
|
maxnode = numa_max_node();
|
|
maxnode = maxnode < NUMA_NUM_NODES ? maxnode : NUMA_NUM_NODES;
|
|
|
|
/* Convert nodemask to NUMA bitmask. */
|
|
nodemask_zero(&mask);
|
|
bit = -1;
|
|
while ((bit = virBitmapNextSetBit(nodeset, bit)) >= 0) {
|
|
if (bit > maxnode) {
|
|
virReportError(VIR_ERR_INTERNAL_ERROR,
|
|
_("NUMA node %d is out of range"), bit);
|
|
return -1;
|
|
}
|
|
nodemask_set(&mask, bit);
|
|
}
|
|
|
|
switch (mode) {
|
|
case VIR_DOMAIN_NUMATUNE_MEM_STRICT:
|
|
numa_set_bind_policy(1);
|
|
numa_set_membind(&mask);
|
|
numa_set_bind_policy(0);
|
|
break;
|
|
|
|
case VIR_DOMAIN_NUMATUNE_MEM_PREFERRED:
|
|
{
|
|
int nnodes = 0;
|
|
for (i = 0; i < NUMA_NUM_NODES; i++) {
|
|
if (nodemask_isset(&mask, i)) {
|
|
node = i;
|
|
nnodes++;
|
|
}
|
|
}
|
|
|
|
if (nnodes != 1) {
|
|
virReportError(VIR_ERR_INTERNAL_ERROR,
|
|
"%s", _("NUMA memory tuning in 'preferred' mode "
|
|
"only supports single node"));
|
|
goto cleanup;
|
|
}
|
|
|
|
numa_set_bind_policy(0);
|
|
numa_set_preferred(node);
|
|
}
|
|
break;
|
|
|
|
case VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE:
|
|
numa_set_interleave_mask(&mask);
|
|
break;
|
|
|
|
case VIR_DOMAIN_NUMATUNE_MEM_LAST:
|
|
break;
|
|
}
|
|
ret = 0;
|
|
|
|
cleanup:
|
|
return ret;
|
|
}
|
|
|
|
/**
 * virNumaIsAvailable:
 *
 * Returns true unless numa_available() yields -1.
 */
bool
virNumaIsAvailable(void)
{
    int rc = numa_available();

    return rc != -1;
}
|
|
|
|
|
|
/**
|
|
* virNumaGetMaxNode:
|
|
* Get the highest node number available on the current system.
|
|
* (See the node numbers in /sys/devices/system/node/ ).
|
|
*
|
|
* Returns the highest NUMA node id on success, -1 on error.
|
|
*/
|
|
int
|
|
virNumaGetMaxNode(void)
|
|
{
|
|
int ret;
|
|
|
|
if (!virNumaIsAvailable()) {
|
|
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
|
|
_("NUMA isn't available on this host"));
|
|
return -1;
|
|
}
|
|
|
|
if ((ret = numa_max_node()) < 0) {
|
|
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
|
|
_("Failed to request maximum NUMA node id"));
|
|
return -1;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
/**
 * virNumaGetNodeMemory:
 * @node: identifier of the requested NUMA node
 * @memsize: if non-NULL, filled with the total memory size of @node
 * @memfree: if non-NULL, filled with the free memory of @node
 *
 * Queries numa_node_size64() for @node. Both output arguments are
 * optional and are zeroed before the query, so they read as 0 when
 * the call fails. The caller has to guarantee that @node is in range
 * (see virNumaGetMaxNode).
 *
 * Returns 0 on success, -1 on error. Does not report errors.
 */
int
virNumaGetNodeMemory(int node,
                     unsigned long long *memsize,
                     unsigned long long *memfree)
{
    long long total;
    long long avail;

    if (memsize)
        *memsize = 0;
    if (memfree)
        *memfree = 0;

    total = numa_node_size64(node, &avail);
    if (total < 0)
        return -1;

    if (memsize)
        *memsize = total;
    if (memfree)
        *memfree = avail;

    return 0;
}
|
|
|
|
|
|
/**
|
|
* virNumaGetNodeCPUs:
|
|
* @node: identifier of the requested NUMA node
|
|
* @cpus: returns a bitmap of CPUs in @node
|
|
*
|
|
* Returns count of CPUs in the selected node and sets the map of the cpus to
|
|
* @cpus. On error if the @node doesn't exist in the system this function
|
|
* returns -2 and sets @cpus to NULL. On other errors -1 is returned, @cpus
|
|
* is set to NULL and an error is reported.
|
|
*/
|
|
|
|
# define n_bits(var) (8 * sizeof(var))
|
|
# define MASK_CPU_ISSET(mask, cpu) \
|
|
(((mask)[((cpu) / n_bits(*(mask)))] >> ((cpu) % n_bits(*(mask)))) & 1)
|
|
int
|
|
virNumaGetNodeCPUs(int node,
|
|
virBitmapPtr *cpus)
|
|
{
|
|
int ncpus = 0;
|
|
int max_n_cpus = virNumaGetMaxCPUs();
|
|
int mask_n_bytes = max_n_cpus / 8;
|
|
size_t i;
|
|
VIR_AUTOFREE(unsigned long *) mask = NULL;
|
|
VIR_AUTOFREE(unsigned long *) allonesmask = NULL;
|
|
VIR_AUTOPTR(virBitmap) cpumap = NULL;
|
|
|
|
*cpus = NULL;
|
|
|
|
if (VIR_ALLOC_N(mask, mask_n_bytes / sizeof(*mask)) < 0)
|
|
return -1;
|
|
|
|
if (VIR_ALLOC_N(allonesmask, mask_n_bytes / sizeof(*mask)) < 0)
|
|
return -1;
|
|
|
|
memset(allonesmask, 0xff, mask_n_bytes);
|
|
|
|
/* The first time this returns -1, ENOENT if node doesn't exist... */
|
|
if (numa_node_to_cpus(node, mask, mask_n_bytes) < 0) {
|
|
VIR_WARN("NUMA topology for cell %d is not available, ignoring", node);
|
|
return -2;
|
|
}
|
|
|
|
/* second, third... times it returns an all-1's mask */
|
|
if (memcmp(mask, allonesmask, mask_n_bytes) == 0) {
|
|
VIR_DEBUG("NUMA topology for cell %d is invalid, ignoring", node);
|
|
return -2;
|
|
}
|
|
|
|
if (!(cpumap = virBitmapNew(max_n_cpus)))
|
|
return -1;
|
|
|
|
for (i = 0; i < max_n_cpus; i++) {
|
|
if (MASK_CPU_ISSET(mask, i)) {
|
|
ignore_value(virBitmapSetBit(cpumap, i));
|
|
ncpus++;
|
|
}
|
|
}
|
|
|
|
VIR_STEAL_PTR(*cpus, cpumap);
|
|
return ncpus;
|
|
}
|
|
# undef MASK_CPU_ISSET
|
|
# undef n_bits
|
|
|
|
#else /* !WITH_NUMACTL */
|
|
|
|
int
|
|
virNumaSetupMemoryPolicy(virDomainNumatuneMemMode mode ATTRIBUTE_UNUSED,
|
|
virBitmapPtr nodeset)
|
|
{
|
|
if (!virNumaNodesetIsAvailable(nodeset))
|
|
return -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
 * virNumaIsAvailable:
 *
 * Variant compiled without WITH_NUMACTL.
 *
 * Returns false, always.
 */
bool
virNumaIsAvailable(void)
{
    const bool available = false;

    return available;
}
|
|
|
|
|
|
int
|
|
virNumaGetMaxNode(void)
|
|
{
|
|
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
|
|
_("NUMA isn't available on this host"));
|
|
return -1;
|
|
}
|
|
|
|
|
|
int
|
|
virNumaGetNodeMemory(int node ATTRIBUTE_UNUSED,
|
|
unsigned long long *memsize,
|
|
unsigned long long *memfree)
|
|
{
|
|
if (memsize)
|
|
*memsize = 0;
|
|
|
|
if (memfree)
|
|
*memfree = 0;
|
|
|
|
VIR_DEBUG("NUMA isn't available on this host");
|
|
return -1;
|
|
}
|
|
|
|
|
|
int
|
|
virNumaGetNodeCPUs(int node ATTRIBUTE_UNUSED,
|
|
virBitmapPtr *cpus)
|
|
{
|
|
*cpus = NULL;
|
|
|
|
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
|
|
_("NUMA isn't available on this host"));
|
|
return -1;
|
|
}
|
|
#endif /* !WITH_NUMACTL */
|
|
|
|
/**
|
|
* virNumaGetMaxCPUs:
|
|
*
|
|
* Get the maximum count of CPUs supportable in the host.
|
|
*
|
|
* Returns the count of CPUs supported.
|
|
*/
|
|
unsigned int
|
|
virNumaGetMaxCPUs(void)
|
|
{
|
|
return NUMA_MAX_N_CPUS;
|
|
}
|
|
|
|
|
|
#if WITH_NUMACTL && HAVE_NUMA_BITMASK_ISBITSET
|
|
/**
|
|
* virNumaNodeIsAvailable:
|
|
* @node: node to check
|
|
*
|
|
* On some hosts the set of NUMA nodes isn't continuous.
|
|
* Use this function to test if the @node is available.
|
|
*
|
|
* Returns: true if @node is available,
|
|
* false if @node doesn't exist
|
|
*/
|
|
bool
|
|
virNumaNodeIsAvailable(int node)
|
|
{
|
|
return numa_bitmask_isbitset(numa_nodes_ptr, node);
|
|
}
|
|
|
|
|
|
/**
 * virNumaGetDistances:
 * @node: identifier of the requested NUMA node
 * @distances: array of distances to sibling nodes
 * @ndistances: size of @distances
 *
 * Get array of distances to sibling nodes from @node. If a
 * distances[x] equals to zero, the node x is not enabled or
 * doesn't exist. As a special case, if @node itself refers to
 * disabled or nonexistent NUMA node, then @distances and
 * @ndistances are set to NULL and zero respectively.
 *
 * The distances are a bit of magic. For a local node the value
 * is 10, for remote it's typically 20 meaning that time penalty
 * for accessing a remote node is two times bigger than when
 * accessing a local node.
 *
 * On success @distances is allocated with VIR_ALLOC_N and holds
 * one entry per node id from 0 up to virNumaGetMaxNode().
 *
 * Returns 0 on success, -1 otherwise.
 */
int
virNumaGetDistances(int node,
                    int **distances,
                    int *ndistances)
{
    int max_node;
    size_t i;

    if (!virNumaNodeIsAvailable(node)) {
        VIR_DEBUG("Node %d does not exist", node);
        *distances = NULL;
        *ndistances = 0;
        return 0;
    }

    if ((max_node = virNumaGetMaxNode()) < 0)
        return -1;

    if (VIR_ALLOC_N(*distances, max_node + 1) < 0)
        return -1;

    *ndistances = max_node + 1;

    for (i = 0; i <= max_node; i++) {
        /* Skip holes in the node numbering: the sibling @i (not @node,
         * which was already validated above) is what must exist. Its slot
         * is left at the freshly allocated value (expected to be zero). */
        if (!virNumaNodeIsAvailable(i))
            continue;

        (*distances)[i] = numa_distance(node, i);
    }

    return 0;
}
|
|
|
|
#else /* !(WITH_NUMACTL && HAVE_NUMA_BITMASK_ISBITSET) */
|
|
|
|
/**
 * virNumaNodeIsAvailable:
 * @node: node to check
 *
 * Fallback used when libnuma's node bitmask cannot be queried:
 * @node counts as available when it lies within 0 .. virNumaGetMaxNode().
 *
 * Returns true if @node is in that range, false otherwise (including
 * when the maximum node cannot be determined).
 */
bool
virNumaNodeIsAvailable(int node)
{
    int highest = virNumaGetMaxNode();

    if (highest < 0 || node < 0)
        return false;

    /* Do we have anything better? */
    return node <= highest;
}
|
|
|
|
|
|
int
|
|
virNumaGetDistances(int node ATTRIBUTE_UNUSED,
|
|
int **distances,
|
|
int *ndistances)
|
|
{
|
|
*distances = NULL;
|
|
*ndistances = 0;
|
|
VIR_DEBUG("NUMA distance information isn't available on this host");
|
|
return 0;
|
|
}
|
|
#endif /* !(WITH_NUMACTL && HAVE_NUMA_BITMASK_ISBITSET) */
|
|
|
|
|
|
/* currently all the huge page stuff below is linux only */
|
|
#ifdef __linux__
|
|
|
|
# define HUGEPAGES_NUMA_PREFIX "/sys/devices/system/node/"
|
|
# define HUGEPAGES_SYSTEM_PREFIX "/sys/kernel/mm/hugepages/"
|
|
# define HUGEPAGES_PREFIX "hugepages-"
|
|
|
|
static int
|
|
virNumaGetHugePageInfoPath(char **path,
|
|
int node,
|
|
unsigned int page_size,
|
|
const char *suffix)
|
|
{
|
|
int ret;
|
|
|
|
if (node == -1) {
|
|
/* We are aiming at overall system info */
|
|
ret = virAsprintf(path,
|
|
HUGEPAGES_SYSTEM_PREFIX HUGEPAGES_PREFIX "%ukB/%s",
|
|
page_size, suffix ? suffix : "");
|
|
} else {
|
|
/* We are aiming on specific NUMA node */
|
|
ret = virAsprintf(path,
|
|
HUGEPAGES_NUMA_PREFIX "node%d/hugepages/"
|
|
HUGEPAGES_PREFIX "%ukB/%s",
|
|
node, page_size, suffix ? suffix : "");
|
|
}
|
|
|
|
if (ret >= 0 && !virFileExists(*path)) {
|
|
ret = -1;
|
|
if (node != -1) {
|
|
if (!virNumaNodeIsAvailable(node)) {
|
|
virReportError(VIR_ERR_OPERATION_FAILED,
|
|
_("NUMA node %d is not available"),
|
|
node);
|
|
} else {
|
|
virReportError(VIR_ERR_OPERATION_FAILED,
|
|
_("page size %u is not available on node %d"),
|
|
page_size, node);
|
|
}
|
|
} else {
|
|
virReportError(VIR_ERR_OPERATION_FAILED,
|
|
_("page size %u is not available"),
|
|
page_size);
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int
|
|
virNumaGetHugePageInfoDir(char **path, int node)
|
|
{
|
|
if (node == -1) {
|
|
return VIR_STRDUP(*path, HUGEPAGES_SYSTEM_PREFIX);
|
|
} else {
|
|
return virAsprintf(path,
|
|
HUGEPAGES_NUMA_PREFIX "node%d/hugepages/",
|
|
node);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* virNumaGetHugePageInfo:
|
|
* @node: NUMA node id
|
|
* @page_size: which huge page are we interested in
|
|
* @page_avail: total number of huge pages in the pool
|
|
* @page_free: the number of free huge pages in the pool
|
|
*
|
|
* For given NUMA node and huge page size fetch information on
|
|
* total number of huge pages in the pool (both free and taken)
|
|
* and count for free huge pages in the pool.
|
|
*
|
|
* If you're interested in just one bit, pass NULL to the other one.
|
|
*
|
|
* As a special case, if @node == -1, overall info is fetched
|
|
* from the system.
|
|
*
|
|
* Returns 0 on success, -1 otherwise (with error reported).
|
|
*/
|
|
static int
|
|
virNumaGetHugePageInfo(int node,
|
|
unsigned int page_size,
|
|
unsigned long long *page_avail,
|
|
unsigned long long *page_free)
|
|
{
|
|
char *end;
|
|
VIR_AUTOFREE(char *) path = NULL;
|
|
VIR_AUTOFREE(char *) buf = NULL;
|
|
|
|
if (page_avail) {
|
|
if (virNumaGetHugePageInfoPath(&path, node,
|
|
page_size, "nr_hugepages") < 0)
|
|
return -1;
|
|
|
|
if (virFileReadAll(path, 1024, &buf) < 0)
|
|
return -1;
|
|
|
|
if (virStrToLong_ull(buf, &end, 10, page_avail) < 0 ||
|
|
*end != '\n') {
|
|
virReportError(VIR_ERR_INTERNAL_ERROR,
|
|
_("unable to parse: %s"),
|
|
buf);
|
|
return -1;
|
|
}
|
|
VIR_FREE(buf);
|
|
VIR_FREE(path);
|
|
}
|
|
|
|
if (page_free) {
|
|
if (virNumaGetHugePageInfoPath(&path, node,
|
|
page_size, "free_hugepages") < 0)
|
|
return -1;
|
|
|
|
if (virFileReadAll(path, 1024, &buf) < 0)
|
|
return -1;
|
|
|
|
if (virStrToLong_ull(buf, &end, 10, page_free) < 0 ||
|
|
*end != '\n') {
|
|
virReportError(VIR_ERR_INTERNAL_ERROR,
|
|
_("unable to parse: %s"),
|
|
buf);
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
 * virNumaGetPageInfo:
 * @node: NUMA node id
 * @page_size: which page size are we interested in (in KiB)
 * @huge_page_sum: the sum of memory taken by huge pages (in
 * bytes)
 * @page_avail: total number of pages in the pool
 * @page_free: the number of free pages in the pool
 *
 * For the given NUMA node and page size, fetch the total number of
 * pages in the pool (both free and taken) and the number of free
 * pages. When @page_size matches the system page size the numbers
 * are derived from the memory totals; any other size is looked up
 * as a huge page pool.
 *
 * The @huge_page_sum parameter exists due to the Linux kernel
 * limitation. The problem is, if there are some huge pages
 * allocated, they are accounted under the 'MemUsed' field in the
 * meminfo file instead of being subtracted from the 'MemTotal'.
 * We must do the subtraction ourselves.
 * If unsure, pass 0.
 *
 * If you're interested in just one bit, pass NULL to the other one.
 *
 * As a special case, if @node == -1, overall info is fetched
 * from the system.
 *
 * Returns 0 on success, -1 otherwise (with error reported).
 */
int
virNumaGetPageInfo(int node,
                   unsigned int page_size,
                   unsigned long long huge_page_sum,
                   unsigned long long *page_avail,
                   unsigned long long *page_free)
{
    long system_page_size = virGetSystemPageSize();
    unsigned long long memsize, memfree;
    int rc;

    /* The system page size is in bytes,
     * the @page_size is however in kibibytes */
    if (page_size != system_page_size / 1024) {
        if (virNumaGetHugePageInfo(node, page_size, page_avail, page_free) < 0)
            return -1;
        return 0;
    }

    /* TODO: come up with better algorithm that takes huge pages into
     * account. The problem is huge pages cut off regular memory. */
    if (node == -1)
        rc = virHostMemGetInfo(&memsize, &memfree);
    else
        rc = virNumaGetNodeMemory(node, &memsize, &memfree);

    if (rc < 0)
        return -1;

    /* Huge pages are counted inside the total; take them out again. */
    memsize -= huge_page_sum;

    if (page_avail)
        *page_avail = memsize / system_page_size;

    if (page_free)
        *page_free = memfree / system_page_size;

    return 0;
}
|
|
|
|
|
|
/**
|
|
* virNumaGetPages:
|
|
* @node: NUMA node id
|
|
* @pages_size: list of pages supported on @node
|
|
* @pages_avail: list of the pool sizes on @node
|
|
* @pages_free: list of free pages on @node
|
|
* @npages: the lists size
|
|
*
|
|
* For given NUMA node fetch info on pages. The size of pages
|
|
* (e.g. 4K, 2M, 1G) is stored into @pages_size, the size of the
|
|
* pool is then stored into @pages_avail and the number of free
|
|
* pages in the pool is stored into @pages_free.
|
|
*
|
|
* If you're interested only in some lists, pass NULL to the
|
|
* other ones.
|
|
*
|
|
* As a special case, if @node == -1, overall info is fetched
|
|
* from the system.
|
|
*
|
|
* Returns 0 on success, -1 otherwise.
|
|
*/
|
|
int
|
|
virNumaGetPages(int node,
|
|
unsigned int **pages_size,
|
|
unsigned long long **pages_avail,
|
|
unsigned long long **pages_free,
|
|
size_t *npages)
|
|
{
|
|
int ret = -1;
|
|
DIR *dir = NULL;
|
|
int direrr = 0;
|
|
struct dirent *entry;
|
|
unsigned int ntmp = 0;
|
|
size_t i;
|
|
bool exchange;
|
|
long system_page_size;
|
|
unsigned long long huge_page_sum = 0;
|
|
VIR_AUTOFREE(char *) path = NULL;
|
|
VIR_AUTOFREE(unsigned int *) tmp_size = NULL;
|
|
VIR_AUTOFREE(unsigned long long *) tmp_avail = NULL;
|
|
VIR_AUTOFREE(unsigned long long *) tmp_free = NULL;
|
|
|
|
/* sysconf() returns page size in bytes,
|
|
* but we are storing the page size in kibibytes. */
|
|
system_page_size = virGetSystemPageSizeKB();
|
|
|
|
/* Query huge pages at first.
|
|
* On Linux systems, the huge pages pool cuts off the available memory and
|
|
* is always shown as used memory. Here, however, we want to report
|
|
* slightly different information. So we take the total memory on a node
|
|
* and subtract memory taken by the huge pages. */
|
|
if (virNumaGetHugePageInfoDir(&path, node) < 0)
|
|
goto cleanup;
|
|
|
|
/* It's okay if the @path doesn't exist. Maybe we are running on
|
|
* system without huge pages support where the path may not exist. */
|
|
if (virDirOpenIfExists(&dir, path) < 0)
|
|
goto cleanup;
|
|
|
|
while (dir && (direrr = virDirRead(dir, &entry, path)) > 0) {
|
|
const char *page_name = entry->d_name;
|
|
unsigned int page_size;
|
|
unsigned long long page_avail = 0;
|
|
unsigned long long page_free = 0;
|
|
char *end;
|
|
|
|
/* Just to give you a hint, we're dealing with this:
|
|
* hugepages-2048kB/ or hugepages-1048576kB/ */
|
|
if (!STRPREFIX(entry->d_name, HUGEPAGES_PREFIX))
|
|
continue;
|
|
|
|
page_name += strlen(HUGEPAGES_PREFIX);
|
|
|
|
if (virStrToLong_ui(page_name, &end, 10, &page_size) < 0 ||
|
|
STRCASENEQ(end, "kB")) {
|
|
virReportError(VIR_ERR_INTERNAL_ERROR,
|
|
_("unable to parse %s"),
|
|
entry->d_name);
|
|
goto cleanup;
|
|
}
|
|
|
|
if (virNumaGetHugePageInfo(node, page_size,
|
|
&page_avail, &page_free) < 0)
|
|
goto cleanup;
|
|
|
|
if (VIR_REALLOC_N(tmp_size, ntmp + 1) < 0 ||
|
|
VIR_REALLOC_N(tmp_avail, ntmp + 1) < 0 ||
|
|
VIR_REALLOC_N(tmp_free, ntmp + 1) < 0)
|
|
goto cleanup;
|
|
|
|
tmp_size[ntmp] = page_size;
|
|
tmp_avail[ntmp] = page_avail;
|
|
tmp_free[ntmp] = page_free;
|
|
ntmp++;
|
|
|
|
/* page_size is in kibibytes while we want huge_page_sum
|
|
* in just bytes. */
|
|
huge_page_sum += 1024 * page_size * page_avail;
|
|
}
|
|
|
|
if (direrr < 0)
|
|
goto cleanup;
|
|
|
|
/* Now append the ordinary system pages */
|
|
if (VIR_REALLOC_N(tmp_size, ntmp + 1) < 0 ||
|
|
VIR_REALLOC_N(tmp_avail, ntmp + 1) < 0 ||
|
|
VIR_REALLOC_N(tmp_free, ntmp + 1) < 0)
|
|
goto cleanup;
|
|
|
|
if (virNumaGetPageInfo(node, system_page_size, huge_page_sum,
|
|
&tmp_avail[ntmp], &tmp_free[ntmp]) < 0)
|
|
goto cleanup;
|
|
tmp_size[ntmp] = system_page_size;
|
|
ntmp++;
|
|
|
|
/* Just to produce nice output, sort the arrays by increasing page size */
|
|
do {
|
|
exchange = false;
|
|
for (i = 0; i < ntmp -1; i++) {
|
|
if (tmp_size[i] > tmp_size[i + 1]) {
|
|
exchange = true;
|
|
SWAP(tmp_size[i], tmp_size[i + 1]);
|
|
SWAP(tmp_avail[i], tmp_avail[i + 1]);
|
|
SWAP(tmp_free[i], tmp_free[i + 1]);
|
|
}
|
|
}
|
|
} while (exchange);
|
|
|
|
if (pages_size) {
|
|
*pages_size = tmp_size;
|
|
tmp_size = NULL;
|
|
}
|
|
if (pages_avail) {
|
|
*pages_avail = tmp_avail;
|
|
tmp_avail = NULL;
|
|
}
|
|
if (pages_free) {
|
|
*pages_free = tmp_free;
|
|
tmp_free = NULL;
|
|
}
|
|
*npages = ntmp;
|
|
ret = 0;
|
|
cleanup:
|
|
VIR_DIR_CLOSE(dir);
|
|
return ret;
|
|
}
|
|
|
|
|
|
int
|
|
virNumaSetPagePoolSize(int node,
|
|
unsigned int page_size,
|
|
unsigned long long page_count,
|
|
bool add)
|
|
{
|
|
char *end;
|
|
unsigned long long nr_count;
|
|
VIR_AUTOFREE(char *) nr_path = NULL;
|
|
VIR_AUTOFREE(char *) nr_buf = NULL;
|
|
|
|
if (page_size == virGetSystemPageSizeKB()) {
|
|
/* Special case as kernel handles system pages
|
|
* differently to huge pages. */
|
|
virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
|
|
_("system pages pool can't be modified"));
|
|
return -1;
|
|
}
|
|
|
|
if (virNumaGetHugePageInfoPath(&nr_path, node, page_size, "nr_hugepages") < 0)
|
|
return -1;
|
|
|
|
/* Firstly check, if there's anything for us to do */
|
|
if (virFileReadAll(nr_path, 1024, &nr_buf) < 0)
|
|
return -1;
|
|
|
|
if (virStrToLong_ull(nr_buf, &end, 10, &nr_count) < 0 ||
|
|
*end != '\n') {
|
|
virReportError(VIR_ERR_OPERATION_FAILED,
|
|
_("invalid number '%s' in '%s'"),
|
|
nr_buf, nr_path);
|
|
return -1;
|
|
}
|
|
|
|
if (add) {
|
|
if (!page_count) {
|
|
VIR_DEBUG("Nothing left to do: add = true page_count = 0");
|
|
return 0;
|
|
}
|
|
page_count += nr_count;
|
|
} else {
|
|
if (nr_count == page_count) {
|
|
VIR_DEBUG("Nothing left to do: nr_count = page_count = %llu",
|
|
page_count);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/* Okay, page pool adjustment must be done in two steps. In
|
|
* first we write the desired number into nr_hugepages file.
|
|
* Kernel then starts to allocate the pages (return from
|
|
* write should be postponed until the kernel is finished).
|
|
* However, kernel may have not been successful and reserved
|
|
* all the pages we wanted. So do the second read to check.
|
|
*/
|
|
VIR_FREE(nr_buf);
|
|
if (virAsprintf(&nr_buf, "%llu", page_count) < 0)
|
|
return -1;
|
|
|
|
if (virFileWriteStr(nr_path, nr_buf, 0) < 0) {
|
|
virReportSystemError(errno,
|
|
_("Unable to write to: %s"), nr_path);
|
|
return -1;
|
|
}
|
|
|
|
/* And now do the check. */
|
|
|
|
VIR_FREE(nr_buf);
|
|
if (virFileReadAll(nr_path, 1024, &nr_buf) < 0)
|
|
return -1;
|
|
|
|
if (virStrToLong_ull(nr_buf, &end, 10, &nr_count) < 0 ||
|
|
*end != '\n') {
|
|
virReportError(VIR_ERR_OPERATION_FAILED,
|
|
_("invalid number '%s' in '%s'"),
|
|
nr_buf, nr_path);
|
|
return -1;
|
|
}
|
|
|
|
if (nr_count != page_count) {
|
|
virReportError(VIR_ERR_OPERATION_FAILED,
|
|
_("Unable to allocate %llu pages. Allocated only %llu"),
|
|
page_count, nr_count);
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
#else /* #ifdef __linux__ */
|
|
int
|
|
virNumaGetPageInfo(int node ATTRIBUTE_UNUSED,
|
|
unsigned int page_size ATTRIBUTE_UNUSED,
|
|
unsigned long long huge_page_sum ATTRIBUTE_UNUSED,
|
|
unsigned long long *page_avail ATTRIBUTE_UNUSED,
|
|
unsigned long long *page_free ATTRIBUTE_UNUSED)
|
|
{
|
|
virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
|
|
_("page info is not supported on this platform"));
|
|
return -1;
|
|
}
|
|
|
|
|
|
int
|
|
virNumaGetPages(int node ATTRIBUTE_UNUSED,
|
|
unsigned int **pages_size ATTRIBUTE_UNUSED,
|
|
unsigned long long **pages_avail ATTRIBUTE_UNUSED,
|
|
unsigned long long **pages_free ATTRIBUTE_UNUSED,
|
|
size_t *npages ATTRIBUTE_UNUSED)
|
|
{
|
|
virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
|
|
_("page info is not supported on this platform"));
|
|
return -1;
|
|
}
|
|
|
|
|
|
int
|
|
virNumaSetPagePoolSize(int node ATTRIBUTE_UNUSED,
|
|
unsigned int page_size ATTRIBUTE_UNUSED,
|
|
unsigned long long page_count ATTRIBUTE_UNUSED,
|
|
bool add ATTRIBUTE_UNUSED)
|
|
{
|
|
virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
|
|
_("page pool allocation is not supported on this platform"));
|
|
return -1;
|
|
}
|
|
#endif /* #ifdef __linux__ */
|
|
|
|
bool
|
|
virNumaNodesetIsAvailable(virBitmapPtr nodeset)
|
|
{
|
|
ssize_t bit = -1;
|
|
|
|
if (!nodeset)
|
|
return true;
|
|
|
|
while ((bit = virBitmapNextSetBit(nodeset, bit)) >= 0) {
|
|
if (virNumaNodeIsAvailable(bit))
|
|
continue;
|
|
|
|
virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
|
|
_("NUMA node %zd is unavailable"), bit);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
/**
|
|
* virNumaGetHostMemoryNodeset:
|
|
*
|
|
* Returns a bitmap of guest numa node ids that contain memory.
|
|
*/
|
|
virBitmapPtr
|
|
virNumaGetHostMemoryNodeset(void)
|
|
{
|
|
int maxnode = virNumaGetMaxNode();
|
|
unsigned long long nodesize;
|
|
size_t i = 0;
|
|
virBitmapPtr nodeset = NULL;
|
|
|
|
if (maxnode < 0)
|
|
return NULL;
|
|
|
|
if (!(nodeset = virBitmapNew(maxnode + 1)))
|
|
return NULL;
|
|
|
|
for (i = 0; i <= maxnode; i++) {
|
|
if (!virNumaNodeIsAvailable(i))
|
|
continue;
|
|
|
|
/* if we can't detect NUMA node size assume that it's present */
|
|
if (virNumaGetNodeMemory(i, &nodesize, NULL) < 0 || nodesize > 0)
|
|
ignore_value(virBitmapSetBit(nodeset, i));
|
|
}
|
|
|
|
return nodeset;
|
|
}
|