/*
* Copyright (C) 2009-2015 Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see
* <http://www.gnu.org/licenses/>.
*/
#include
#include
#include "virnetdevbandwidth.h"
#include "vircommand.h"
#include "viralloc.h"
#include "virerror.h"
#include "virlog.h"
#include "virutil.h"
#define VIR_FROM_THIS VIR_FROM_NONE
VIR_LOG_INIT("util.netdevbandwidth");
/**
 * virNetDevBandwidthFree:
 * @def: bandwidth definition to release (may be NULL)
 *
 * Frees the inbound and outbound rate structures referenced by
 * @def and then @def itself. A NULL @def is silently ignored.
 */
void
virNetDevBandwidthFree(virNetDevBandwidth *def)
{
    if (def != NULL) {
        g_free(def->in);
        g_free(def->out);
        g_free(def);
    }
}
/*
 * virNetDevBandwidthCmdAddOptimalQuantum:
 * @cmd: tc command that is being assembled
 * @rate: rate whose 'average' the quantum is derived from
 *
 * Appends the two arguments "quantum" and "<value>" to @cmd, where
 * <value> is @rate->average * 1024 / 8 / 1500, clamped to be at
 * least 1.
 */
static void
virNetDevBandwidthCmdAddOptimalQuantum(virCommand *cmd,
                                       const virNetDevBandwidthRate *rate)
{
    const unsigned long long mtu = 1500;
    unsigned long long quantum = rate->average * 1024 / 8 / mtu;

    /* When two or more classes compete for unused bandwidth, each of
     * them is given some number of bytes before another competing
     * class is served. That number is called the quantum, and the HTB
     * docs advise that it should be equal to the MTU. HTB computes the
     * class quantum from the class rate divided by the global r2q
     * parameter; if the rate is too small that default will not
     * suffice, hence an explicit value is always passed here. */
    if (quantum == 0)
        quantum = 1;

    virCommandAddArg(cmd, "quantum");
    virCommandAddArgFormat(cmd, "%llu", quantum);
}
/**
 * virNetDevBandwidthManipulateFilter:
 * @ifname: interface on which the filter lives
 * @ifmac_ptr: MAC address the filter matches on (read only when
 *             @create_new is true)
 * @id: filter ID, unique per @ifname
 * @class_id: class the matching traffic is directed to (used only
 *            when @create_new is true)
 * @remove_old: delete the existing filter with this @id
 * @create_new: add a filter with this @id
 *
 * TC filters are as crucial for traffic shaping as QDiscs. While
 * QDiscs act like black boxes deciding which packets should be
 * held up and which should be sent immediately, it is the filter
 * that places a packet into the box. So a whole set of filters may
 * end up on a single device (e.g. a bridge), sorting traffic into
 * QDiscs based on the originating vNET device.
 *
 * In short: @ifname is the interface where the filter is managed,
 * @ifmac_ptr is the MAC address the filter is created for (usually
 * different from the MAC address of @ifname), @id identifies the
 * filter and @class_id says where matching traffic is placed.
 *
 * The function can remove a stale filter (@remove_old), create a
 * new one (@create_new), or do both in that order within a single
 * call. Requesting neither is reported as an internal error.
 *
 * Returns: 0 on success,
 *         -1 otherwise (with error reported).
 */
static int ATTRIBUTE_NONNULL(1)
virNetDevBandwidthManipulateFilter(const char *ifname,
                                   const virMacAddr *ifmac_ptr,
                                   unsigned int id,
                                   const char *class_id,
                                   bool remove_old,
                                   bool create_new)
{
    g_autofree char *filter_id = NULL;

    if (!remove_old && !create_new) {
        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                       _("filter creation API error"));
        return -1;
    }

    /* u32 filters must have 800:: prefix. Don't ask. Furthermore, handles
     * start at 800. Therefore, we want the filter ID to look like this:
     *   800::(800 + id) */
    filter_id = g_strdup_printf("800::%u", 800 + id);

    if (remove_old) {
        g_autoptr(virCommand) del_cmd = virCommandNew(TC);
        int del_status = 0; /* captured, but deliberately not inspected */

        virCommandAddArgList(del_cmd, "filter", "del", "dev", ifname,
                             "prio", "2", "handle", filter_id, "u32", NULL);

        if (virCommandRun(del_cmd, &del_status) < 0)
            return -1;
    }

    if (create_new) {
        g_autoptr(virCommand) add_cmd = virCommandNew(TC);
        g_autofree char *mac_low = NULL;  /* bytes 2..5 of the MAC */
        g_autofree char *mac_high = NULL; /* bytes 0..1 of the MAC */
        unsigned char raw[VIR_MAC_BUFLEN];

        virMacAddrGetRaw(ifmac_ptr, raw);

        mac_low = g_strdup_printf("0x%02x%02x%02x%02x", raw[2],
                                  raw[3], raw[4], raw[5]);
        mac_high = g_strdup_printf("0x%02x%02x", raw[0], raw[1]);

        /* Okay, this not nice. But since libvirt does not necessarily track
         * interface IP address(es), and tc fw filter simply refuse to use
         * ebtables marks, we need to use u32 selector to match MAC address.
         * If libvirt will ever know something, remove this FIXME
         */
        virCommandAddArgList(add_cmd, "filter", "add", "dev", ifname,
                             "protocol", "ip",
                             "prio", "2", "handle", filter_id, "u32",
                             "match", "u16", "0x0800", "0xffff", "at", "-2",
                             "match", "u32", mac_low, "0xffffffff", "at", "-12",
                             "match", "u16", mac_high, "0xffff", "at", "-14",
                             "flowid", class_id, NULL);

        if (virCommandRun(add_cmd, NULL) < 0)
            return -1;
    }

    return 0;
}
/**
* virNetDevBandwidthSet:
* @ifname: on which interface
* @bandwidth: rates to set (may be NULL)
* @hierarchical_class: whether to create hierarchical class
* @swapped: true if IN/OUT should be set contrariwise
*
* This function enables QoS on specified interface
* and set given traffic limits for both, incoming
* and outgoing traffic. Any previous setting get
* overwritten. If @hierarchical_class is TRUE, create
* hierarchical class. It is used to guarantee minimal
* throughput ('floor' attribute in NIC).
*
* If @swapped is set, the IN part of @bandwidth is set on
* @ifname's TX, and vice versa. If it is not set, IN is set on
* RX and OUT on TX. This is because for some types of interfaces
* domain and the host live on the same side of the interface (so
* domain's RX/TX is host's RX/TX), and for some it's swapped
* (domain's RX/TX is hosts's TX/RX).
*
* Return 0 on success, -1 otherwise.
*/
int
virNetDevBandwidthSet(const char *ifname,
const virNetDevBandwidth *bandwidth,
bool hierarchical_class,
bool swapped)
{
int ret = -1;
virNetDevBandwidthRate *rx = NULL; /* From domain POV */
virNetDevBandwidthRate *tx = NULL; /* From domain POV */
virCommand *cmd = NULL;
char *average = NULL;
char *peak = NULL;
char *burst = NULL;
if (!bandwidth) {
/* nothing to be enabled */
ret = 0;
goto cleanup;
}
if (geteuid() != 0) {
virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
_("Network bandwidth tuning is not available"
" in session mode"));
return -1;
}
if (!ifname) {
virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
_("Unable to set bandwidth for interface because "
"device name is unknown"));
return -1;
}
if (swapped) {
rx = bandwidth->out;
tx = bandwidth->in;
} else {
rx = bandwidth->in;
tx = bandwidth->out;
}
virNetDevBandwidthClear(ifname);
if (tx && tx->average) {
average = g_strdup_printf("%llukbps", tx->average);
if (tx->peak)
peak = g_strdup_printf("%llukbps", tx->peak);
if (tx->burst)
burst = g_strdup_printf("%llukb", tx->burst);
cmd = virCommandNew(TC);
virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "root",
"handle", "1:", "htb", "default",
hierarchical_class ? "2" : "1", NULL);
if (virCommandRun(cmd, NULL) < 0)
goto cleanup;
/* If we are creating a hierarchical class, all non guaranteed traffic
* goes to the 1:2 class which will adjust 'rate' dynamically as NICs
* with guaranteed throughput are plugged and unplugged. Class 1:1
* exists so we don't exceed the maximum limit for the network. For each
* NIC with guaranteed throughput a separate classid will be created.
* NB '1:' is just a shorter notation of '1:0'.
*
* To get a picture how this works:
*
* +-----+ +---------+ +-----------+ +-----------+ +-----+
* | | | qdisc | | class 1:1 | | class 1:2 | | |
* | NIC | | def 1:2 | | rate | | rate | | sfq |
* | | --> | | --> | peak | -+-> | peak | --> | |
* +-----+ +---------+ +-----------+ | +-----------+ +-----+
* |
* | +-----------+ +-----+
* | | class 1:3 | | |
* | | rate | | sfq |
* +-> | peak | --> | |
* | +-----------+ +-----+
* ...
* | +-----------+ +-----+
* | | class 1:n | | |
* | | rate | | sfq |
* +-> | peak | --> | |
* +-----------+ +-----+
*
* After the routing decision, when is it clear a packet is to be sent
* via a particular NIC, it is sent to the root qdisc (queuing
* discipline). In this case HTB (Hierarchical Token Bucket). It has
* only one direct child class (with id 1:1) which shapes the overall
* rate that is sent through the NIC. This class has at least one child
* (1:2) which is meant for all non-privileged (non guaranteed) traffic
* from all domains. Then, for each interface with guaranteed
* throughput, a separate class (1:n) is created. Imagine a class is a
* box. Whenever a packet ends up in a class it is stored in this box
* until the kernel sends it, then it is removed from box. Packets are
* placed into boxes based on rules (filters) - e.g. depending on
* destination IP/MAC address. If there is no rule to be applied, the
* root qdisc has a default where such packets go (1:2 in this case).
* Packets come in over and over again and boxes get filled more and
* more. Imagine that kernel sends packets just once a second. So it
* starts to traverse through this tree. It starts with the root qdisc
* and through 1:1 it gets to 1:2. It sends packets up to 1:2's 'rate'.
* Then it moves to 1:3 and again sends packets up to 1:3's 'rate'. The
* whole process is repeated until 1:n is processed. So now we have
* ensured each class its guaranteed bandwidth. If the sum of sent data
* doesn't exceed the 'rate' in 1:1 class, we can go further and send
* more packets. The rest of available bandwidth is distributed to the
* 1:2,1:3...1:n classes by ratio of their 'rate'. As soon as the root
* 'rate' limit is reached or there are no more packets to send, we stop
* sending and wait another second. Each class has an SFQ qdisc which
* shuffles packets in boxes stochastically, so one sender cannot
* starve others.
*
* Therefore, whenever we want to plug in a new guaranteed interface, we
* need to create a new class and adjust the 'rate' of the 1:2 class.
* When unplugging we do the exact opposite - remove the associated
* class, and adjust the 'rate'.
*
* This description is rather long, but it is still a good idea to read
* it before you dig into the code.
*/
if (hierarchical_class) {
virCommandFree(cmd);
cmd = virCommandNew(TC);
virCommandAddArgList(cmd, "class", "add", "dev", ifname, "parent",
"1:", "classid", "1:1", "htb", "rate", average,
"ceil", peak ? peak : average, NULL);
virNetDevBandwidthCmdAddOptimalQuantum(cmd, tx);
if (virCommandRun(cmd, NULL) < 0)
goto cleanup;
}
virCommandFree(cmd);
cmd = virCommandNew(TC);
virCommandAddArgList(cmd, "class", "add", "dev", ifname, "parent",
hierarchical_class ? "1:1" : "1:", "classid",
hierarchical_class ? "1:2" : "1:1", "htb",
"rate", average, NULL);
if (peak)
virCommandAddArgList(cmd, "ceil", peak, NULL);
if (burst)
virCommandAddArgList(cmd, "burst", burst, NULL);
virNetDevBandwidthCmdAddOptimalQuantum(cmd, tx);
if (virCommandRun(cmd, NULL) < 0)
goto cleanup;
virCommandFree(cmd);
cmd = virCommandNew(TC);
virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "parent",
hierarchical_class ? "1:2" : "1:1",
"handle", "2:", "sfq", "perturb",
"10", NULL);
if (virCommandRun(cmd, NULL) < 0)
goto cleanup;
virCommandFree(cmd);
cmd = virCommandNew(TC);
virCommandAddArgList(cmd, "filter", "add", "dev", ifname, "parent",
"1:0", "protocol", "all", "prio", "1", "handle",
"1", "fw", "flowid", "1", NULL);
if (virCommandRun(cmd, NULL) < 0)
goto cleanup;
VIR_FREE(average);
VIR_FREE(peak);
VIR_FREE(burst);
}
if (rx) {
average = g_strdup_printf("%llukbps", rx->average);
if (rx->burst) {
burst = g_strdup_printf("%llukb", rx->burst);
} else {
/* Internally, tc uses uint to store burst size (in bytes).
* Therefore, the largest value we can set is UINT_MAX bytes.
* We're outputting the vale in KiB though. */
unsigned long long avg = MIN(rx->average, UINT_MAX / 1024);
burst = g_strdup_printf("%llukb", avg);
}
virCommandFree(cmd);
cmd = virCommandNew(TC);
virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname,
"ingress", NULL);
if (virCommandRun(cmd, NULL) < 0)
goto cleanup;
virCommandFree(cmd);
cmd = virCommandNew(TC);
/* Set filter to match all ingress traffic */
virCommandAddArgList(cmd, "filter", "add", "dev", ifname, "parent",
"ffff:", "protocol", "all", "u32", "match", "u32",
"0", "0", "police", "rate", average,
"burst", burst, "mtu", "64kb", "drop", "flowid",
":1", NULL);
if (virCommandRun(cmd, NULL) < 0)
goto cleanup;
}
ret = 0;
cleanup:
virCommandFree(cmd);
VIR_FREE(average);
VIR_FREE(peak);
VIR_FREE(burst);
return ret;
}
/**
* virNetDevBandwidthClear:
* @ifname: on which interface
*
* This function tries to disable QoS on specified interface
* by deleting root and ingress qdisc. However, this may fail
* if we try to remove the default one.
*
* Return 0 on success, -1 otherwise.
*/
int
virNetDevBandwidthClear(const char *ifname)
{
int ret = 0;
int dummy; /* for ignoring the exit status */
g_autoptr(virCommand) rootcmd = NULL;
g_autoptr(virCommand) ingresscmd = NULL;
if (!ifname)
return 0;
rootcmd = virCommandNew(TC);
virCommandAddArgList(rootcmd, "qdisc", "del", "dev", ifname, "root", NULL);
if (virCommandRun(rootcmd, &dummy) < 0)
ret = -1;
ingresscmd = virCommandNew(TC);
virCommandAddArgList(ingresscmd, "qdisc", "del", "dev", ifname, "ingress", NULL);
if (virCommandRun(ingresscmd, &dummy) < 0)
ret = -1;
return ret;
}
/*
 * virNetDevBandwidthCopy:
 * @dest: where to store the newly allocated copy
 * @src: source (may be NULL)
 *
 * Makes a deep copy of @src: the top-level structure and, where
 * present, its 'in' and 'out' rates are duplicated. If @src is
 * NULL, *@dest is set to NULL.
 *
 * Returns 0; no path through this function returns an error.
 */
int
virNetDevBandwidthCopy(virNetDevBandwidth **dest,
                       const virNetDevBandwidth *src)
{
    virNetDevBandwidth *copy = NULL;

    *dest = NULL;

    /* nothing to be copied */
    if (src == NULL)
        return 0;

    copy = g_new0(virNetDevBandwidth, 1);

    if (src->in != NULL) {
        copy->in = g_new0(virNetDevBandwidthRate, 1);
        memcpy(copy->in, src->in, sizeof(*copy->in));
    }

    if (src->out != NULL) {
        copy->out = g_new0(virNetDevBandwidthRate, 1);
        memcpy(copy->out, src->out, sizeof(*copy->out));
    }

    *dest = copy;
    return 0;
}
/*
 * virNetDevBandwidthEqual:
 * @a: first bandwidth definition (may be NULL)
 * @b: second bandwidth definition (may be NULL)
 *
 * Two definitions are equal when both are NULL, or when both are
 * non-NULL and, for each direction ('in' and 'out'), either both
 * lack that rate or both have it with identical average, peak,
 * floor and burst values.
 *
 * Returns true if @a and @b are equal, false otherwise.
 */
bool
virNetDevBandwidthEqual(const virNetDevBandwidth *a,
                        const virNetDevBandwidth *b)
{
    /* If either side is missing, they match only when both are. */
    if (!a || !b)
        return !a && !b;

    /* in */
    if (!a->in != !b->in)
        return false;

    if (a->in &&
        (a->in->average != b->in->average ||
         a->in->peak != b->in->peak ||
         a->in->floor != b->in->floor ||
         a->in->burst != b->in->burst))
        return false;

    /* out */
    if (!a->out != !b->out)
        return false;

    if (a->out &&
        (a->out->average != b->out->average ||
         a->out->peak != b->out->peak ||
         a->out->floor != b->out->floor ||
         a->out->burst != b->out->burst))
        return false;

    return true;
}
/*
 * virNetDevBandwidthPlug:
 * @brname: name of the bridge
 * @net_bandwidth: QoS settings on @brname
 * @ifmac_ptr: MAC of interface
 * @bandwidth: QoS settings for interface
 * @id: unique ID (MUST be greater than 2)
 *
 * Set bridge part of interface QoS settings, e.g. guaranteed
 * bandwidth. @id is a unique ID (among @brname) from which
 * other identifiers for class, qdisc and filter are derived.
 * However, two classes were already set up (by
 * virNetDevBandwidthSet). That's why this @id MUST be greater
 * than 2. You may want to keep passed @id, as it is used later
 * by virNetDevBandwidthUnplug.
 *
 * Both @net_bandwidth and @bandwidth must carry an 'in' rate:
 * the class 'rate' is taken from @bandwidth->in->floor and the
 * 'ceil' from @net_bandwidth->in->peak (falling back to its
 * average when peak is 0). A missing 'in' part on either one is
 * reported as an error rather than dereferenced.
 *
 * Returns:
 *  0 if QoS set successfully
 * -1 otherwise (with error reported).
 */
int
virNetDevBandwidthPlug(const char *brname,
                       virNetDevBandwidth *net_bandwidth,
                       const virMacAddr *ifmac_ptr,
                       virNetDevBandwidth *bandwidth,
                       unsigned int id)
{
    g_autoptr(virCommand) cmd1 = NULL;
    g_autoptr(virCommand) cmd2 = NULL;
    g_autofree char *class_id = NULL;
    g_autofree char *qdisc_id = NULL;
    g_autofree char *floor = NULL;
    g_autofree char *ceil = NULL;
    char ifmacStr[VIR_MAC_STRING_BUFLEN];

    if (id <= 2) {
        /* @id is unsigned, so it has to be printed with %u */
        virReportError(VIR_ERR_INTERNAL_ERROR, _("Invalid class ID %u"), id);
        return -1;
    }

    virMacAddrFormat(ifmac_ptr, ifmacStr);

    if (!net_bandwidth || !net_bandwidth->in) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Bridge '%s' has no QoS set, therefore "
                         "unable to set 'floor' on '%s'"),
                       brname, ifmacStr);
        return -1;
    }

    /* The 'floor' is read from the interface's inbound settings below;
     * refuse to continue instead of dereferencing a NULL pointer. */
    if (!bandwidth || !bandwidth->in) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Missing inbound QoS settings for interface '%s'"),
                       ifmacStr);
        return -1;
    }

    class_id = g_strdup_printf("1:%x", id);
    qdisc_id = g_strdup_printf("%x:", id);
    floor = g_strdup_printf("%llukbps", bandwidth->in->floor);
    ceil = g_strdup_printf("%llukbps", net_bandwidth->in->peak ?
                           net_bandwidth->in->peak :
                           net_bandwidth->in->average);

    /* Dedicated HTB class for this interface, child of class 1:1 */
    cmd1 = virCommandNew(TC);
    virCommandAddArgList(cmd1, "class", "add", "dev", brname, "parent", "1:1",
                         "classid", class_id, "htb", "rate", floor,
                         "ceil", ceil, NULL);
    virNetDevBandwidthCmdAddOptimalQuantum(cmd1, bandwidth->in);

    if (virCommandRun(cmd1, NULL) < 0)
        return -1;

    /* SFQ qdisc attached to the new class */
    cmd2 = virCommandNew(TC);
    virCommandAddArgList(cmd2, "qdisc", "add", "dev", brname, "parent",
                         class_id, "handle", qdisc_id, "sfq", "perturb",
                         "10", NULL);

    if (virCommandRun(cmd2, NULL) < 0)
        return -1;

    /* Filter that steers traffic matching the interface's MAC into
     * the new class */
    if (virNetDevBandwidthManipulateFilter(brname, ifmac_ptr, id,
                                           class_id, false, true) < 0)
        return -1;

    return 0;
}
/*
 * virNetDevBandwidthUnplug:
 * @brname: from which bridge are we unplugging
 * @id: unique identifier (MUST be greater than 2)
 *
 * Remove QoS settings from bridge: the qdisc, the filter and the
 * class derived from @id are deleted, in that order. The exit
 * status of the tc commands is collected but not inspected, so
 * that as much as possible gets removed.
 *
 * Returns 0 on success, -1 otherwise.
 */
int
virNetDevBandwidthUnplug(const char *brname,
                         unsigned int id)
{
    int cmd_ret = 0;
    g_autoptr(virCommand) cmd1 = NULL;
    g_autoptr(virCommand) cmd2 = NULL;
    g_autofree char *class_id = NULL;
    g_autofree char *qdisc_id = NULL;

    if (id <= 2) {
        /* @id is unsigned, so it has to be printed with %u */
        virReportError(VIR_ERR_INTERNAL_ERROR, _("Invalid class ID %u"), id);
        return -1;
    }

    class_id = g_strdup_printf("1:%x", id);
    qdisc_id = g_strdup_printf("%x:", id);

    cmd1 = virCommandNew(TC);
    virCommandAddArgList(cmd1, "qdisc", "del", "dev", brname,
                         "handle", qdisc_id, NULL);

    /* Don't treat tc errors as fatal, but
     * try to remove as much as possible */
    if (virCommandRun(cmd1, &cmd_ret) < 0)
        return -1;

    /* Only removal is requested, so no MAC or class ID is passed */
    if (virNetDevBandwidthManipulateFilter(brname, NULL, id,
                                           NULL, true, false) < 0)
        return -1;

    cmd2 = virCommandNew(TC);
    virCommandAddArgList(cmd2, "class", "del", "dev", brname,
                         "classid", class_id, NULL);

    if (virCommandRun(cmd2, &cmd_ret) < 0)
        return -1;

    return 0;
}
/**
 * virNetDevBandwidthUpdateRate:
 * @ifname: interface name
 * @id: unique identifier
 * @bandwidth: used to derive 'ceil' of class with @id
 * @new_rate: new rate
 *
 * Changes the 'rate' attribute of the HTB class 1:@id on @ifname
 * to @new_rate. The 'ceil' is set to @bandwidth->in->peak, or to
 * @bandwidth->in->average when peak is 0. It can be used whenever
 * a new interface is plugged to a bridge to adjust average
 * throughput of non guaranteed NICs.
 *
 * Returns 0 on success, -1 otherwise.
 */
int
virNetDevBandwidthUpdateRate(const char *ifname,
                             unsigned int id,
                             virNetDevBandwidth *bandwidth,
                             unsigned long long new_rate)
{
    g_autoptr(virCommand) tc_cmd = NULL;
    g_autofree char *classid_str = g_strdup_printf("1:%x", id);
    g_autofree char *rate_str = g_strdup_printf("%llukbps", new_rate);
    g_autofree char *ceil_str = NULL;

    /* Prefer the configured peak; fall back to the average. */
    if (bandwidth->in->peak)
        ceil_str = g_strdup_printf("%llukbps", bandwidth->in->peak);
    else
        ceil_str = g_strdup_printf("%llukbps", bandwidth->in->average);

    tc_cmd = virCommandNew(TC);
    virCommandAddArgList(tc_cmd, "class", "change", "dev", ifname,
                         "classid", classid_str, "htb", "rate", rate_str,
                         "ceil", ceil_str, NULL);
    virNetDevBandwidthCmdAddOptimalQuantum(tc_cmd, bandwidth->in);

    return virCommandRun(tc_cmd, NULL);
}
/**
* virNetDevBandwidthUpdateFilter:
* @ifname: interface to operate on
* @ifmac_ptr: new MAC to update the filter with
* @id: filter ID
*
* Sometimes the host environment is so dynamic, that even a
* guest's MAC addresses change on the fly. When that happens we
* must update our QoS hierarchy so that the guest's traffic is
* placed into the correct QDiscs. This function updates the
* filter for the interface @ifname with the unique identifier
* @id so that it uses the new MAC address of the guest interface
* @ifmac_ptr.
*
* Returns: 0 on success,
* -1 on failure (with error reported).
*/
int
virNetDevBandwidthUpdateFilter(const char *ifname,
const virMacAddr *ifmac_ptr,
unsigned int id)
{
int ret = -1;
char *class_id = NULL;
class_id = g_strdup_printf("1:%x", id);
if (virNetDevBandwidthManipulateFilter(ifname, ifmac_ptr, id,
class_id, true, true) < 0)
goto cleanup;
ret = 0;
cleanup:
VIR_FREE(class_id);
return ret;
}
/**
* virNetDevBandwidthSetRootQDisc:
* @ifname: the interface name
* @qdisc: queueing discipline to set
*
* For given interface @ifname set its root queueing discipline
* to @qdisc. This can be used to replace the default qdisc
* (usually pfifo_fast or whatever is set in
* /proc/sys/net/core/default_qdisc) with different qdisc.
*
* Returns: 0 on success,
* -1 if failed to exec tc (with error reported)
* -2 if tc failed (with no error reported)
*/
int
virNetDevBandwidthSetRootQDisc(const char *ifname,
const char *qdisc)
{
g_autoptr(virCommand) cmd = NULL;
g_autofree char *outbuf = NULL;
g_autofree char *errbuf = NULL;
int status;
/* Ideally, we would have a netlink implementation and just
* call it here. But honestly, I tried and failed miserably.
* Fallback to spawning tc. */
cmd = virCommandNewArgList(TC, "qdisc", "add", "dev", ifname,
"root", "handle", "0:", qdisc,
NULL);
virCommandAddEnvString(cmd, "LC_ALL=C");
virCommandSetOutputBuffer(cmd, &outbuf);
virCommandSetErrorBuffer(cmd, &errbuf);
if (virCommandRun(cmd, &status) < 0)
return -1;
if (status != 0) {
VIR_DEBUG("Setting qdisc failed: output='%s' err='%s'", outbuf, errbuf);
return -2;
}
return 0;
}