libvirt/src/util/virnetdevtap.c

1021 lines
31 KiB
C
Raw Normal View History

/*
util: use virDirRead API In making the conversion to the new API, I fixed a couple bugs: virSCSIDeviceGetSgName would leak memory if a directory unexpectedly contained multiple entries; virNetDevTapGetRealDeviceName could report a spurious error from a stale errno inherited before starting the readdir search. The decision on whether to store the result of virDirRead into a variable is based on whether the end of the loop falls through to cleanup code automatically. In some cases, we have loops that are documented to return NULL on failure, and which raise an error on most failure paths but not in the case where the directory was unexpectedly empty; it may be worth a followup patch to explicitly report an error if readdir was successful but the directory was empty, so that a NULL return always has an error set. * src/util/vircgroup.c (virCgroupRemoveRecursively): Use new interface. (virCgroupKillRecursiveInternal, virCgroupSetOwner): Report readdir failures. * src/util/virfile.c (virFileLoopDeviceOpenSearch) (virFileNBDDeviceFindUnused, virFileDeleteTree): Use new interface. * src/util/virnetdevtap.c (virNetDevTapGetRealDeviceName): Properly check readdir errors. * src/util/virpci.c (virPCIDeviceIterDevices) (virPCIDeviceFileIterate, virPCIGetNetName): Report readdir failures. (virPCIDeviceAddressIOMMUGroupIterate): Use new interface. * src/util/virscsi.c (virSCSIDeviceGetSgName): Report readdir failures, and avoid memory leak. (virSCSIDeviceGetDevName): Report readdir failures. * src/util/virusb.c (virUSBDeviceSearch): Report readdir failures. * src/util/virutil.c (virGetFCHostNameByWWN) (virFindFCHostCapableVport): Report readdir failures. Signed-off-by: Eric Blake <eblake@redhat.com>
2014-04-25 20:45:49 +00:00
* Copyright (C) 2007-2014 Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see
* <http://www.gnu.org/licenses/>.
*/
#include <config.h>
util: centralize tap device MAC address 1st byte "0xFE" modification When a tap device for a domain is created and attached to a bridge, the first byte of the tap device MAC address is set to 0xFE, while the rest is set to match the MAC address that will be presented to the guest as its network device MAC address. Setting this high value in the tap's MAC address discourages the bridge from using the tap device's MAC address as the bridge's own MAC address (Linux bridges always take on the lowest numbered MAC address of all attached devices as their own). In one case within libvirt, a tap device is created and attached to the bridge with the intent that its MAC address be taken on by the bridge as its own (this is used to assure that the bridge has a fixed MAC address to prevent network outages created by the bridge MAC address "flapping" as guests are started and stopped). In this case, the first byte of the mac address is *not* altered to 0xFE. In the current code, callers to virNetDevTapCreateInBridgePort each make the MAC address modification themselves before calling, which leads to code duplication, and also prevents lower level functions from knowing the real MAC address being used by the guest. The problem here is that openvswitch bridges must be informed about this MAC address, or they will be unable to pass traffic to/from the guest. This patch centralizes the location of the MAC address "0xFE fixup" into virNetDevTapCreateInBridgePort(), meaning 1) callers of this function no longer need the extra strange bit of code, and 2) bitNetDevTapCreateBridgeInPort itself now is called with the guest's unaltered MAC address, and can pass it on, unmodified, to virNetDevOpenvswitchAddPort. There is no other behavioral change created by this patch.
2012-02-16 23:49:57 +00:00
#include "virmacaddr.h"
#include "virnetdevtap.h"
#include "virnetdev.h"
#include "virnetdevbridge.h"
#include "virnetdevmidonet.h"
#include "virnetdevopenvswitch.h"
#include "virerror.h"
#include "virfile.h"
2012-12-12 18:06:53 +00:00
#include "viralloc.h"
2012-12-12 17:59:27 +00:00
#include "virlog.h"
#include "virstring.h"
#include "datatypes.h"
#include <unistd.h>
#include <sys/types.h>
#ifndef WIN32
# include <sys/ioctl.h>
#endif
#ifdef HAVE_NET_IF_H
# include <net/if.h>
#endif
#include <fcntl.h>
#ifdef __linux__
# include <linux/if_tun.h> /* IFF_TUN, IFF_NO_PI */
#elif defined(__FreeBSD__)
# include <net/if_mib.h>
# include <sys/sysctl.h>
#endif
#if defined(HAVE_GETIFADDRS) && defined(AF_LINK)
# include <ifaddrs.h>
#endif
util: assign tap device names using a monotonically increasing integer When creating a standard tap device, if provided with an ifname that contains "%d", rather than taking that literally as the name to use for the new device, the kernel will instead use that string as a template, and search for the lowest number that could be put in place of %d and produce an otherwise unused and unique name for the new device. For example, if there is no tap device name given in the XML, libvirt will always send "vnet%d" as the device name, and the kernel will create new devices named "vnet0", "vnet1", etc. If one of those devices is deleted, creating a "hole" in the name list, the kernel will always attempt to reuse the name in the hole first before using a name with a higher number (i.e. it finds the lowest possible unused number). The problem with this, as described in the previous patch dealing with macvtap device naming, is that it makes "immediate reuse" of a newly freed tap device name *much* more common, and in the aftermath of deleting a tap device, there is some other necessary cleanup of things which are named based on the device name (nwfilter rules, bandwidth rules, OVS switch ports, to name a few) that could end up stomping over the top of the setup of a new device of the same name for a different guest. Since the kernel "create a name based on a template" functionality for tap devices doesn't exist for macvtap, this patch for standard tap devices is a bit different from the previous patch for macvtap - in particular there was no previous "bitmap ID reservation system" or overly-complex retry loop that needed to be removed. We simply find and unused name, and pass that name on to the kernel instead of "vnet%d". This counter is also wrapped when either it gets to INT_MAX or if the full name would overflow IFNAMSIZ-1 characters. In the case of "vnet%d" and a 32 bit int, we would reach INT_MAX first, but possibly someday someone will change the name from vnet to something else. (NB: It is still possible for a user to provide their own parameterized template name (e.g. "mytap%d") in the XML, and libvirt will just pass that through to the kernel as it always has.) Signed-off-by: Laine Stump <laine@redhat.com> Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
2020-08-24 01:20:13 +00:00
#include <math.h>
#define VIR_FROM_THIS VIR_FROM_NONE
VIR_LOG_INIT("util.netdevtap");
util: assign tap device names using a monotonically increasing integer When creating a standard tap device, if provided with an ifname that contains "%d", rather than taking that literally as the name to use for the new device, the kernel will instead use that string as a template, and search for the lowest number that could be put in place of %d and produce an otherwise unused and unique name for the new device. For example, if there is no tap device name given in the XML, libvirt will always send "vnet%d" as the device name, and the kernel will create new devices named "vnet0", "vnet1", etc. If one of those devices is deleted, creating a "hole" in the name list, the kernel will always attempt to reuse the name in the hole first before using a name with a higher number (i.e. it finds the lowest possible unused number). The problem with this, as described in the previous patch dealing with macvtap device naming, is that it makes "immediate reuse" of a newly freed tap device name *much* more common, and in the aftermath of deleting a tap device, there is some other necessary cleanup of things which are named based on the device name (nwfilter rules, bandwidth rules, OVS switch ports, to name a few) that could end up stomping over the top of the setup of a new device of the same name for a different guest. Since the kernel "create a name based on a template" functionality for tap devices doesn't exist for macvtap, this patch for standard tap devices is a bit different from the previous patch for macvtap - in particular there was no previous "bitmap ID reservation system" or overly-complex retry loop that needed to be removed. We simply find and unused name, and pass that name on to the kernel instead of "vnet%d". This counter is also wrapped when either it gets to INT_MAX or if the full name would overflow IFNAMSIZ-1 characters. In the case of "vnet%d" and a 32 bit int, we would reach INT_MAX first, but possibly someday someone will change the name from vnet to something else. (NB: It is still possible for a user to provide their own parameterized template name (e.g. "mytap%d") in the XML, and libvirt will just pass that through to the kernel as it always has.) Signed-off-by: Laine Stump <laine@redhat.com> Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
2020-08-24 01:20:13 +00:00
virMutex virNetDevTapCreateMutex = VIR_MUTEX_INITIALIZER;
static int virNetDevTapLastID = -1; /* not "unsigned" because callers use %d */
/**
* virNetDevTapReserveName:
* @name: name of an existing tap device
*
* Set the value of virNetDevTapLastID to assure that any new tap
* device created with an autogenerated name will use a number higher
* than the number in the given tap device name.
*
* Returns nothing.
*/
void
virNetDevTapReserveName(const char *name)
{
unsigned int id;
const char *idstr = NULL;
if (STRPREFIX(name, VIR_NET_GENERATED_TAP_PREFIX)) {
VIR_INFO("marking device in use: '%s'", name);
idstr = name + strlen(VIR_NET_GENERATED_TAP_PREFIX);
if (virStrToLong_ui(idstr, NULL, 10, &id) >= 0) {
virMutexLock(&virNetDevTapCreateMutex);
if (virNetDevTapLastID < (int)id)
virNetDevTapLastID = id;
virMutexUnlock(&virNetDevTapCreateMutex);
}
}
}
/**
* virNetDevTapGetName:
* @tapfd: a tun/tap file descriptor
* @ifname: a pointer that will receive the interface name
*
* Retrieve the interface name given a file descriptor for a tun/tap
* interface.
*
* Returns 0 if the interface name is successfully queried, -1 otherwise
*/
int
virNetDevTapGetName(int tapfd G_GNUC_UNUSED, char **ifname G_GNUC_UNUSED)
{
#ifdef TUNGETIFF
struct ifreq ifr;
if (ioctl(tapfd, TUNGETIFF, &ifr) < 0) {
virReportSystemError(errno, "%s",
_("Unable to query tap interface name"));
return -1;
}
*ifname = g_strdup(ifr.ifr_name);
return 0;
#else
return -1;
#endif
}
/**
* virNetDevTapGetRealDeviceName:
* @ifname: the interface name
*
* Lookup real interface name (i.e. name of the device entry in /dev),
* because e.g. on FreeBSD if we rename tap device to vnetN its device
* entry still remains unchanged (/dev/tapX), but bhyve needs a name
* that matches /dev entry.
*
* Returns the proper interface name or NULL if no corresponding interface
* found.
*/
char*
virNetDevTapGetRealDeviceName(char *ifname G_GNUC_UNUSED)
{
#ifdef IFDATA_DRIVERNAME
int ifindex = 0;
int name[6];
size_t len = 0;
char *ret = NULL;
if ((ifindex = if_nametoindex(ifname)) == 0) {
virReportSystemError(errno,
_("Unable to get interface index for '%s'"),
ifname);
return NULL;
}
name[0] = CTL_NET;
name[1] = PF_LINK;
name[2] = NETLINK_GENERIC;
name[3] = IFMIB_IFDATA;
name[4] = ifindex;
name[5] = IFDATA_DRIVERNAME;
if (sysctl(name, 6, NULL, &len, 0, 0) < 0) {
virReportSystemError(errno,
_("Unable to get driver name for '%s'"),
ifname);
return NULL;
}
if (VIR_ALLOC_N(ret, len) < 0)
return NULL;
if (sysctl(name, 6, ret, &len, 0, 0) < 0) {
virReportSystemError(errno,
_("Unable to get driver name for '%s'"),
ifname);
VIR_FREE(ret);
return NULL;
}
return ret;
#else
return NULL;
#endif
}
/**
* virNetDevProbeVnetHdr:
* @tapfd: a tun/tap file descriptor
*
* Check whether it is safe to enable the IFF_VNET_HDR flag on the
* tap interface.
*
* Setting IFF_VNET_HDR enables QEMU's virtio_net driver to allow
* guests to pass larger (GSO) packets, with partial checksums, to
* the host. This greatly increases the achievable throughput.
*
* It is only useful to enable this when we're setting up a virtio
* interface. And it is only *safe* to enable it when we know for
* sure that a) qemu has support for IFF_VNET_HDR and b) the running
* kernel implements the TUNGETIFF ioctl(), which qemu needs to query
* the supplied tapfd.
*
* Returns 1 if VnetHdr is supported, 0 if not supported
*/
#ifdef IFF_VNET_HDR
static int
virNetDevProbeVnetHdr(int tapfd)
{
# if defined(IFF_VNET_HDR) && defined(TUNGETFEATURES) && defined(TUNGETIFF)
unsigned int features;
struct ifreq dummy;
if (ioctl(tapfd, TUNGETFEATURES, &features) != 0) {
VIR_INFO("Not enabling IFF_VNET_HDR; "
"TUNGETFEATURES ioctl() not implemented");
return 0;
}
if (!(features & IFF_VNET_HDR)) {
VIR_INFO("Not enabling IFF_VNET_HDR; "
"TUNGETFEATURES ioctl() reports no IFF_VNET_HDR");
return 0;
}
/* The kernel will always return -1 at this point.
* If TUNGETIFF is not implemented then errno == EBADFD.
*/
if (ioctl(tapfd, TUNGETIFF, &dummy) != -1 || errno != EBADFD) {
VIR_INFO("Not enabling IFF_VNET_HDR; "
"TUNGETIFF ioctl() not implemented");
return 0;
}
VIR_INFO("Enabling IFF_VNET_HDR");
return 1;
# else
(void) tapfd;
VIR_INFO("Not enabling IFF_VNET_HDR; disabled at build time");
return 0;
# endif
}
#endif
#ifdef TUNSETIFF
util: assign tap device names using a monotonically increasing integer When creating a standard tap device, if provided with an ifname that contains "%d", rather than taking that literally as the name to use for the new device, the kernel will instead use that string as a template, and search for the lowest number that could be put in place of %d and produce an otherwise unused and unique name for the new device. For example, if there is no tap device name given in the XML, libvirt will always send "vnet%d" as the device name, and the kernel will create new devices named "vnet0", "vnet1", etc. If one of those devices is deleted, creating a "hole" in the name list, the kernel will always attempt to reuse the name in the hole first before using a name with a higher number (i.e. it finds the lowest possible unused number). The problem with this, as described in the previous patch dealing with macvtap device naming, is that it makes "immediate reuse" of a newly freed tap device name *much* more common, and in the aftermath of deleting a tap device, there is some other necessary cleanup of things which are named based on the device name (nwfilter rules, bandwidth rules, OVS switch ports, to name a few) that could end up stomping over the top of the setup of a new device of the same name for a different guest. Since the kernel "create a name based on a template" functionality for tap devices doesn't exist for macvtap, this patch for standard tap devices is a bit different from the previous patch for macvtap - in particular there was no previous "bitmap ID reservation system" or overly-complex retry loop that needed to be removed. We simply find and unused name, and pass that name on to the kernel instead of "vnet%d". This counter is also wrapped when either it gets to INT_MAX or if the full name would overflow IFNAMSIZ-1 characters. In the case of "vnet%d" and a 32 bit int, we would reach INT_MAX first, but possibly someday someone will change the name from vnet to something else. (NB: It is still possible for a user to provide their own parameterized template name (e.g. "mytap%d") in the XML, and libvirt will just pass that through to the kernel as it always has.) Signed-off-by: Laine Stump <laine@redhat.com> Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
2020-08-24 01:20:13 +00:00
/**
* virNetDevTapGenerateName:
* @ifname: pointer to pointer to string containing template
*
* generate a new (currently unused) name for a new tap device based
* on the templace string in @ifname - replace %d with
* ++virNetDevTapLastID, and keep trying new values until one is found
* that doesn't already exist, or we've tried 10000 different
* names. Once a usable name is found, replace the template with the
* actual name.
*
* Returns 0 on success, -1 on failure.
*/
static int
virNetDevTapGenerateName(char **ifname)
{
int id;
double maxIDd = pow(10, IFNAMSIZ - 1 - strlen(VIR_NET_GENERATED_TAP_PREFIX));
int maxID = INT_MAX;
int attempts = 0;
if (maxIDd <= (double)INT_MAX)
maxID = (int)maxIDd;
do {
g_autofree char *try = NULL;
id = ++virNetDevTapLastID;
/* reset before overflow */
if (virNetDevTapLastID >= maxID)
virNetDevTapLastID = -1;
try = g_strdup_printf(*ifname, id);
if (!virNetDevExists(try)) {
g_free(*ifname);
*ifname = g_steal_pointer(&try);
return 0;
}
} while (++attempts < 10000);
virReportError(VIR_ERR_INTERNAL_ERROR,
_("no unused %s names available"),
VIR_NET_GENERATED_TAP_PREFIX);
return -1;
}
/**
* virNetDevTapCreate:
* @ifname: the interface name
* @tunpath: path to the tun device (if NULL, /dev/net/tun is used)
* @tapfds: array of file descriptors return value for the new tap device
* @tapfdSize: number of file descriptors in @tapfd
* @flags: OR of virNetDevTapCreateFlags. Only one flag is recognized:
*
* VIR_NETDEV_TAP_CREATE_VNET_HDR
* - Enable IFF_VNET_HDR on the tap device
network: fix dnsmasq/radvd binding to IPv6 on recent kernels I hit this problem recently when trying to create a bridge with an IPv6 address on a 3.2 kernel: dnsmasq (and, further, radvd) would not bind to the given address, waiting 20s and then giving up with -EADDRNOTAVAIL (resp. exiting immediately with "error parsing or activating the config file", without libvirt noticing it, BTW). This can be reproduced with (I think) any kernel >= 2.6.39 and the following XML (to be used with "virsh net-create"): <network> <name>test-bridge</name> <bridge name='testbr0' /> <ip family='ipv6' address='fd00::1' prefix='64'> </ip> </network> (it happens even when you have an IPv4, too) The problem is that since commit [1] (which, ironically, was made to “help IPv6 autoconfiguration”) the linux bridge code makes bridges behave like “real” devices regarding carrier detection. This makes the bridges created by libvirt, which are started without any up devices, stay with the NO-CARRIER flag set, and thus prevents DAD (Duplicate address detection) from happening, thus letting the IPv6 address flagged as “tentative”. Such addresses cannot be bound to (see RFC 2462), so dnsmasq fails binding to it (for radvd, it detects that "interface XXX is not RUNNING", thus that "interface XXX does not exist, ignoring the interface" (sic)). It seems that this behavior was enhanced somehow with commit [2] by avoiding setting NO-CARRIER on empty bridges, but I couldn't reproduce this behavior on my kernel. Anyway, with the “dummy tap to set MAC address” trick, this wouldn't work. To fix this, the idea is to get the bridge's attached device to be up so that DAD can happen (deactivating DAD altogether is not a good idea, I think). Currently, libvirt creates a dummy TAP device to set the MAC address of the bridge, keeping it down. But even if we set this device up, it is not RUNNING as soon as the tap file descriptor attached to it is closed, thus still preventing DAD. So, we must modify the API a bit, so that we can get the fd, keep the tap device persistent, run the daemons, and close it after DAD has taken place. After that, the bridge will be flagged NO-CARRIER again, but the daemons will be running, even if not happy about the device's state (but we don't really care about the bridge's daemons doing anything when no up interface is connected to it). Other solutions that I envisioned were: * Keeping the *-nic interface up: this would waste an fd for each bridge during all its life. May be acceptable, I don't really know. * Stop using the dummy tap trick, and set the MAC address directly on the bridge: it is possible since quite some time it seems, even if then there is the problem of the bridge not being RUNNING when empty, contrary to what [2] says, so this will need fixing (and this fix only happened in 3.1, so it wouldn't work for 2.6.39) * Using the --interface option of dnsmasq, but I saw somewhere that it's not used by libvirt for backward compatibility. I am not sure this would solve this problem, though, as I don't know how dnsmasq binds itself to it with this option. This is why this patch does what's described earlier. This patch also makes radvd start even if the interface is “missing” (i.e. it is not RUNNING), as it daemonizes before binding to it, and thus sometimes does it after the interface has been brought down by us (by closing the tap fd), and then originally stops. This also makes it stop yelling about it in the logs when the interface is down at a later time. [1] http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=1faa4356a3bd89ea11fb92752d897cff3a20ec0e [2] http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=b64b73d7d0c480f75684519c6134e79d50c1b341
2012-09-26 19:02:20 +00:00
* VIR_NETDEV_TAP_CREATE_PERSIST
* - The device will persist after the file descriptor is closed
*
* Creates a tap interface. The caller must use virNetDevTapDelete to
* remove a persistent TAP device when it is no longer needed. In case
* @tapfdSize is greater than one, multiqueue extension is requested
* from kernel.
*
* Returns 0 in case of success or -1 on failure.
*/
int virNetDevTapCreate(char **ifname,
const char *tunpath,
int *tapfd,
size_t tapfdSize,
network: fix dnsmasq/radvd binding to IPv6 on recent kernels I hit this problem recently when trying to create a bridge with an IPv6 address on a 3.2 kernel: dnsmasq (and, further, radvd) would not bind to the given address, waiting 20s and then giving up with -EADDRNOTAVAIL (resp. exiting immediately with "error parsing or activating the config file", without libvirt noticing it, BTW). This can be reproduced with (I think) any kernel >= 2.6.39 and the following XML (to be used with "virsh net-create"): <network> <name>test-bridge</name> <bridge name='testbr0' /> <ip family='ipv6' address='fd00::1' prefix='64'> </ip> </network> (it happens even when you have an IPv4, too) The problem is that since commit [1] (which, ironically, was made to “help IPv6 autoconfiguration”) the linux bridge code makes bridges behave like “real” devices regarding carrier detection. This makes the bridges created by libvirt, which are started without any up devices, stay with the NO-CARRIER flag set, and thus prevents DAD (Duplicate address detection) from happening, thus letting the IPv6 address flagged as “tentative”. Such addresses cannot be bound to (see RFC 2462), so dnsmasq fails binding to it (for radvd, it detects that "interface XXX is not RUNNING", thus that "interface XXX does not exist, ignoring the interface" (sic)). It seems that this behavior was enhanced somehow with commit [2] by avoiding setting NO-CARRIER on empty bridges, but I couldn't reproduce this behavior on my kernel. Anyway, with the “dummy tap to set MAC address” trick, this wouldn't work. To fix this, the idea is to get the bridge's attached device to be up so that DAD can happen (deactivating DAD altogether is not a good idea, I think). Currently, libvirt creates a dummy TAP device to set the MAC address of the bridge, keeping it down. But even if we set this device up, it is not RUNNING as soon as the tap file descriptor attached to it is closed, thus still preventing DAD. So, we must modify the API a bit, so that we can get the fd, keep the tap device persistent, run the daemons, and close it after DAD has taken place. After that, the bridge will be flagged NO-CARRIER again, but the daemons will be running, even if not happy about the device's state (but we don't really care about the bridge's daemons doing anything when no up interface is connected to it). Other solutions that I envisioned were: * Keeping the *-nic interface up: this would waste an fd for each bridge during all its life. May be acceptable, I don't really know. * Stop using the dummy tap trick, and set the MAC address directly on the bridge: it is possible since quite some time it seems, even if then there is the problem of the bridge not being RUNNING when empty, contrary to what [2] says, so this will need fixing (and this fix only happened in 3.1, so it wouldn't work for 2.6.39) * Using the --interface option of dnsmasq, but I saw somewhere that it's not used by libvirt for backward compatibility. I am not sure this would solve this problem, though, as I don't know how dnsmasq binds itself to it with this option. This is why this patch does what's described earlier. This patch also makes radvd start even if the interface is “missing” (i.e. it is not RUNNING), as it daemonizes before binding to it, and thus sometimes does it after the interface has been brought down by us (by closing the tap fd), and then originally stops. This also makes it stop yelling about it in the logs when the interface is down at a later time. [1] http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=1faa4356a3bd89ea11fb92752d897cff3a20ec0e [2] http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=b64b73d7d0c480f75684519c6134e79d50c1b341
2012-09-26 19:02:20 +00:00
unsigned int flags)
{
util: assign tap device names using a monotonically increasing integer When creating a standard tap device, if provided with an ifname that contains "%d", rather than taking that literally as the name to use for the new device, the kernel will instead use that string as a template, and search for the lowest number that could be put in place of %d and produce an otherwise unused and unique name for the new device. For example, if there is no tap device name given in the XML, libvirt will always send "vnet%d" as the device name, and the kernel will create new devices named "vnet0", "vnet1", etc. If one of those devices is deleted, creating a "hole" in the name list, the kernel will always attempt to reuse the name in the hole first before using a name with a higher number (i.e. it finds the lowest possible unused number). The problem with this, as described in the previous patch dealing with macvtap device naming, is that it makes "immediate reuse" of a newly freed tap device name *much* more common, and in the aftermath of deleting a tap device, there is some other necessary cleanup of things which are named based on the device name (nwfilter rules, bandwidth rules, OVS switch ports, to name a few) that could end up stomping over the top of the setup of a new device of the same name for a different guest. Since the kernel "create a name based on a template" functionality for tap devices doesn't exist for macvtap, this patch for standard tap devices is a bit different from the previous patch for macvtap - in particular there was no previous "bitmap ID reservation system" or overly-complex retry loop that needed to be removed. We simply find and unused name, and pass that name on to the kernel instead of "vnet%d". This counter is also wrapped when either it gets to INT_MAX or if the full name would overflow IFNAMSIZ-1 characters. In the case of "vnet%d" and a 32 bit int, we would reach INT_MAX first, but possibly someday someone will change the name from vnet to something else. (NB: It is still possible for a user to provide their own parameterized template name (e.g. "mytap%d") in the XML, and libvirt will just pass that through to the kernel as it always has.) Signed-off-by: Laine Stump <laine@redhat.com> Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
2020-08-24 01:20:13 +00:00
size_t i = 0;
struct ifreq ifr;
int ret = -1;
util: assign tap device names using a monotonically increasing integer When creating a standard tap device, if provided with an ifname that contains "%d", rather than taking that literally as the name to use for the new device, the kernel will instead use that string as a template, and search for the lowest number that could be put in place of %d and produce an otherwise unused and unique name for the new device. For example, if there is no tap device name given in the XML, libvirt will always send "vnet%d" as the device name, and the kernel will create new devices named "vnet0", "vnet1", etc. If one of those devices is deleted, creating a "hole" in the name list, the kernel will always attempt to reuse the name in the hole first before using a name with a higher number (i.e. it finds the lowest possible unused number). The problem with this, as described in the previous patch dealing with macvtap device naming, is that it makes "immediate reuse" of a newly freed tap device name *much* more common, and in the aftermath of deleting a tap device, there is some other necessary cleanup of things which are named based on the device name (nwfilter rules, bandwidth rules, OVS switch ports, to name a few) that could end up stomping over the top of the setup of a new device of the same name for a different guest. Since the kernel "create a name based on a template" functionality for tap devices doesn't exist for macvtap, this patch for standard tap devices is a bit different from the previous patch for macvtap - in particular there was no previous "bitmap ID reservation system" or overly-complex retry loop that needed to be removed. We simply find and unused name, and pass that name on to the kernel instead of "vnet%d". This counter is also wrapped when either it gets to INT_MAX or if the full name would overflow IFNAMSIZ-1 characters. In the case of "vnet%d" and a 32 bit int, we would reach INT_MAX first, but possibly someday someone will change the name from vnet to something else. (NB: It is still possible for a user to provide their own parameterized template name (e.g. "mytap%d") in the XML, and libvirt will just pass that through to the kernel as it always has.) Signed-off-by: Laine Stump <laine@redhat.com> Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
2020-08-24 01:20:13 +00:00
int fd = 0;
virMutexLock(&virNetDevTapCreateMutex);
/* if ifname is "vnet%d", then auto-generate a name for the new
* device (the kernel could do this for us, but has a bad habit of
* immediately re-using names that have just been released, which
* can lead to race conditions).
*/
if (STREQ(*ifname, VIR_NET_GENERATED_TAP_PREFIX "%d") &&
virNetDevTapGenerateName(ifname) < 0) {
goto cleanup;
}
if (!tunpath)
tunpath = "/dev/net/tun";
memset(&ifr, 0, sizeof(ifr));
for (i = 0; i < tapfdSize; i++) {
if ((fd = open(tunpath, O_RDWR)) < 0) {
virReportSystemError(errno,
_("Unable to open %s, is tun module loaded?"),
tunpath);
goto cleanup;
}
memset(&ifr, 0, sizeof(ifr));
ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
/* If tapfdSize is greater than one, request multiqueue */
if (tapfdSize > 1) {
# ifdef IFF_MULTI_QUEUE
ifr.ifr_flags |= IFF_MULTI_QUEUE;
# else
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
_("Multiqueue devices are not supported on this system"));
goto cleanup;
# endif
}
# ifdef IFF_VNET_HDR
if ((flags & VIR_NETDEV_TAP_CREATE_VNET_HDR) &&
virNetDevProbeVnetHdr(fd))
ifr.ifr_flags |= IFF_VNET_HDR;
# endif
if (virStrcpyStatic(ifr.ifr_name, *ifname) < 0) {
virReportSystemError(ERANGE,
_("Network interface name '%s' is too long"),
*ifname);
goto cleanup;
}
if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
virReportSystemError(errno,
_("Unable to create tap device %s"),
NULLSTR(*ifname));
goto cleanup;
}
if (i == 0) {
/* In case we are looping more than once, set other
* TAPs to have the same name */
VIR_FREE(*ifname);
*ifname = g_strdup(ifr.ifr_name);
}
if ((flags & VIR_NETDEV_TAP_CREATE_PERSIST) &&
ioctl(fd, TUNSETPERSIST, 1) < 0) {
virReportSystemError(errno,
_("Unable to set tap device %s to persistent"),
NULLSTR(*ifname));
goto cleanup;
}
tapfd[i] = fd;
}
util: assign tap device names using a monotonically increasing integer When creating a standard tap device, if provided with an ifname that contains "%d", rather than taking that literally as the name to use for the new device, the kernel will instead use that string as a template, and search for the lowest number that could be put in place of %d and produce an otherwise unused and unique name for the new device. For example, if there is no tap device name given in the XML, libvirt will always send "vnet%d" as the device name, and the kernel will create new devices named "vnet0", "vnet1", etc. If one of those devices is deleted, creating a "hole" in the name list, the kernel will always attempt to reuse the name in the hole first before using a name with a higher number (i.e. it finds the lowest possible unused number). The problem with this, as described in the previous patch dealing with macvtap device naming, is that it makes "immediate reuse" of a newly freed tap device name *much* more common, and in the aftermath of deleting a tap device, there is some other necessary cleanup of things which are named based on the device name (nwfilter rules, bandwidth rules, OVS switch ports, to name a few) that could end up stomping over the top of the setup of a new device of the same name for a different guest. Since the kernel "create a name based on a template" functionality for tap devices doesn't exist for macvtap, this patch for standard tap devices is a bit different from the previous patch for macvtap - in particular there was no previous "bitmap ID reservation system" or overly-complex retry loop that needed to be removed. We simply find and unused name, and pass that name on to the kernel instead of "vnet%d". This counter is also wrapped when either it gets to INT_MAX or if the full name would overflow IFNAMSIZ-1 characters. In the case of "vnet%d" and a 32 bit int, we would reach INT_MAX first, but possibly someday someone will change the name from vnet to something else. (NB: It is still possible for a user to provide their own parameterized template name (e.g. "mytap%d") in the XML, and libvirt will just pass that through to the kernel as it always has.) Signed-off-by: Laine Stump <laine@redhat.com> Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
2020-08-24 01:20:13 +00:00
VIR_INFO("created device: '%s'", *ifname);
ret = 0;
cleanup:
util: assign tap device names using a monotonically increasing integer When creating a standard tap device, if provided with an ifname that contains "%d", rather than taking that literally as the name to use for the new device, the kernel will instead use that string as a template, and search for the lowest number that could be put in place of %d and produce an otherwise unused and unique name for the new device. For example, if there is no tap device name given in the XML, libvirt will always send "vnet%d" as the device name, and the kernel will create new devices named "vnet0", "vnet1", etc. If one of those devices is deleted, creating a "hole" in the name list, the kernel will always attempt to reuse the name in the hole first before using a name with a higher number (i.e. it finds the lowest possible unused number). The problem with this, as described in the previous patch dealing with macvtap device naming, is that it makes "immediate reuse" of a newly freed tap device name *much* more common, and in the aftermath of deleting a tap device, there is some other necessary cleanup of things which are named based on the device name (nwfilter rules, bandwidth rules, OVS switch ports, to name a few) that could end up stomping over the top of the setup of a new device of the same name for a different guest. Since the kernel "create a name based on a template" functionality for tap devices doesn't exist for macvtap, this patch for standard tap devices is a bit different from the previous patch for macvtap - in particular there was no previous "bitmap ID reservation system" or overly-complex retry loop that needed to be removed. We simply find and unused name, and pass that name on to the kernel instead of "vnet%d". This counter is also wrapped when either it gets to INT_MAX or if the full name would overflow IFNAMSIZ-1 characters. In the case of "vnet%d" and a 32 bit int, we would reach INT_MAX first, but possibly someday someone will change the name from vnet to something else. (NB: It is still possible for a user to provide their own parameterized template name (e.g. "mytap%d") in the XML, and libvirt will just pass that through to the kernel as it always has.) Signed-off-by: Laine Stump <laine@redhat.com> Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
2020-08-24 01:20:13 +00:00
virMutexUnlock(&virNetDevTapCreateMutex);
if (ret < 0) {
VIR_FORCE_CLOSE(fd);
while (i--)
VIR_FORCE_CLOSE(tapfd[i]);
}
return ret;
}
int virNetDevTapDelete(const char *ifname,
const char *tunpath)
{
struct ifreq try;
int fd;
int ret = -1;
if (!tunpath)
tunpath = "/dev/net/tun";
if ((fd = open(tunpath, O_RDWR)) < 0) {
virReportSystemError(errno,
_("Unable to open %s, is tun module loaded?"),
tunpath);
return -1;
}
memset(&try, 0, sizeof(struct ifreq));
try.ifr_flags = IFF_TAP|IFF_NO_PI;
if (virStrcpyStatic(try.ifr_name, ifname) < 0) {
virReportSystemError(ERANGE,
_("Network interface name '%s' is too long"),
ifname);
goto cleanup;
}
if (ioctl(fd, TUNSETIFF, &try) < 0) {
virReportSystemError(errno, "%s",
_("Unable to associate TAP device"));
goto cleanup;
}
if (ioctl(fd, TUNSETPERSIST, 0) < 0) {
virReportSystemError(errno, "%s",
_("Unable to make TAP device non-persistent"));
goto cleanup;
}
util: assign tap device names using a monotonically increasing integer When creating a standard tap device, if provided with an ifname that contains "%d", rather than taking that literally as the name to use for the new device, the kernel will instead use that string as a template, and search for the lowest number that could be put in place of %d and produce an otherwise unused and unique name for the new device. For example, if there is no tap device name given in the XML, libvirt will always send "vnet%d" as the device name, and the kernel will create new devices named "vnet0", "vnet1", etc. If one of those devices is deleted, creating a "hole" in the name list, the kernel will always attempt to reuse the name in the hole first before using a name with a higher number (i.e. it finds the lowest possible unused number). The problem with this, as described in the previous patch dealing with macvtap device naming, is that it makes "immediate reuse" of a newly freed tap device name *much* more common, and in the aftermath of deleting a tap device, there is some other necessary cleanup of things which are named based on the device name (nwfilter rules, bandwidth rules, OVS switch ports, to name a few) that could end up stomping over the top of the setup of a new device of the same name for a different guest. Since the kernel "create a name based on a template" functionality for tap devices doesn't exist for macvtap, this patch for standard tap devices is a bit different from the previous patch for macvtap - in particular there was no previous "bitmap ID reservation system" or overly-complex retry loop that needed to be removed. We simply find and unused name, and pass that name on to the kernel instead of "vnet%d". This counter is also wrapped when either it gets to INT_MAX or if the full name would overflow IFNAMSIZ-1 characters. In the case of "vnet%d" and a 32 bit int, we would reach INT_MAX first, but possibly someday someone will change the name from vnet to something else. (NB: It is still possible for a user to provide their own parameterized template name (e.g. "mytap%d") in the XML, and libvirt will just pass that through to the kernel as it always has.) Signed-off-by: Laine Stump <laine@redhat.com> Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
2020-08-24 01:20:13 +00:00
VIR_INFO("delete device: '%s'", ifname);
ret = 0;
cleanup:
VIR_FORCE_CLOSE(fd);
return ret;
}
#elif defined(SIOCIFCREATE2) && defined(SIOCIFDESTROY) && defined(IF_MAXUNIT)
int virNetDevTapCreate(char **ifname,
const char *tunpath G_GNUC_UNUSED,
int *tapfd,
size_t tapfdSize,
unsigned int flags G_GNUC_UNUSED)
{
int s;
struct ifreq ifr;
int ret = -1;
char *newifname = NULL;
if (tapfdSize > 1) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
_("Multiqueue devices are not supported on this system"));
goto cleanup;
}
/* As FreeBSD determines interface type by name,
* we have to create 'tap' interface first and
* then rename it to 'vnet'
*/
if ((s = virNetDevSetupControl("tap", &ifr)) < 0)
return -1;
if (ioctl(s, SIOCIFCREATE2, &ifr) < 0) {
virReportSystemError(errno, "%s",
_("Unable to create tap device"));
goto cleanup;
}
/* In case we were given exact interface name (e.g. 'vnetN'),
* we just rename to it. If we have format string like
* 'vnet%d', we need to find the first available name that
* matches this pattern
*/
if (strstr(*ifname, "%d") != NULL) {
size_t i;
for (i = 0; i <= IF_MAXUNIT; i++) {
g_autofree char *newname = NULL;
newname = g_strdup_printf(*ifname, i);
if (virNetDevExists(newname) == 0) {
newifname = g_steal_pointer(&newname);
break;
}
}
if (newifname) {
VIR_FREE(*ifname);
*ifname = newifname;
} else {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Failed to generate new name for interface %s"),
ifr.ifr_name);
goto cleanup;
}
}
if (tapfd) {
g_autofree char *dev_path = NULL;
dev_path = g_strdup_printf("/dev/%s", ifr.ifr_name);
if ((*tapfd = open(dev_path, O_RDWR)) < 0) {
virReportSystemError(errno,
_("Unable to open %s"),
dev_path);
goto cleanup;
}
}
if (virNetDevSetName(ifr.ifr_name, *ifname) == -1)
goto cleanup;
ret = 0;
cleanup:
VIR_FORCE_CLOSE(s);
return ret;
}
int virNetDevTapDelete(const char *ifname,
const char *tunpath G_GNUC_UNUSED)
{
int s;
struct ifreq ifr;
int ret = -1;
if ((s = virNetDevSetupControl(ifname, &ifr)) < 0)
return -1;
if (ioctl(s, SIOCIFDESTROY, &ifr) < 0) {
virReportSystemError(errno,
_("Unable to remove tap device %s"),
ifname);
goto cleanup;
}
ret = 0;
cleanup:
VIR_FORCE_CLOSE(s);
return ret;
}
#else
int virNetDevTapCreate(char **ifname G_GNUC_UNUSED,
const char *tunpath G_GNUC_UNUSED,
int *tapfd G_GNUC_UNUSED,
size_t tapfdSize G_GNUC_UNUSED,
unsigned int flags G_GNUC_UNUSED)
{
virReportSystemError(ENOSYS, "%s",
_("Unable to create TAP devices on this platform"));
return -1;
}
int virNetDevTapDelete(const char *ifname G_GNUC_UNUSED,
const char *tunpath G_GNUC_UNUSED)
{
virReportSystemError(ENOSYS, "%s",
_("Unable to delete TAP devices on this platform"));
return -1;
}
#endif
/**
* virNetDevTapAttachBridge:
* @tapname: the tap interface name (or name template)
* @brname: the bridge name
* @macaddr: desired MAC address
* @virtPortProfile: bridge/port specific configuration
* @virtVlan: vlan tag info
* @mtu: requested MTU for port (or 0 for "default")
* @actualMTU: MTU actually set for port (after accounting for bridge's MTU)
*
* This attaches an existing tap device (@tapname) to a bridge
* (@brname).
*
* Returns 0 in case of success or -1 on failure
*/
int
virNetDevTapAttachBridge(const char *tapname,
const char *brname,
const virMacAddr *macaddr,
const unsigned char *vmuuid,
const virNetDevVPortProfile *virtPortProfile,
const virNetDevVlan *virtVlan,
virTristateBool isolatedPort,
unsigned int mtu,
unsigned int *actualMTU)
{
/* If an MTU is specified for the new device, set it before
* attaching the device to the bridge, as it may affect the MTU of
* the bridge (in particular if it is the first device attached to
* the bridge, or if it is smaller than the current MTU of the
* bridge). If MTU isn't specified for the new device (i.e. 0),
* we need to set the interface MTU to the current MTU of the
* bridge (to avoid inadvertently changing the bridge's MTU).
*/
if (mtu > 0) {
if (virNetDevSetMTU(tapname, mtu) < 0)
return -1;
} else {
if (virNetDevSetMTUFromDevice(tapname, brname) < 0)
return -1;
}
if (actualMTU) {
int retMTU = virNetDevGetMTU(tapname);
if (retMTU < 0)
return -1;
*actualMTU = retMTU;
}
if (virtPortProfile) {
if (virtPortProfile->virtPortType == VIR_NETDEV_VPORT_PROFILE_MIDONET) {
if (virNetDevMidonetBindPort(tapname, virtPortProfile) < 0)
return -1;
} else if (virtPortProfile->virtPortType == VIR_NETDEV_VPORT_PROFILE_OPENVSWITCH) {
if (virNetDevOpenvswitchAddPort(brname, tapname, macaddr, vmuuid,
virtPortProfile, virtVlan) < 0)
return -1;
}
} else {
if (virNetDevBridgeAddPort(brname, tapname) < 0)
return -1;
if (isolatedPort == VIR_TRISTATE_BOOL_YES &&
virNetDevBridgePortSetIsolated(brname, tapname, true) < 0) {
virErrorPtr err;
virErrorPreserveLast(&err);
ignore_value(virNetDevBridgeRemovePort(brname, tapname));
virErrorRestore(&err);
return -1;
}
}
return 0;
}
/**
* virNetDevTapReattachBridge:
* @tapname: the tap interface name (or name template)
* @brname: the bridge name
* @macaddr: desired MAC address
* @virtPortProfile: bridge/port specific configuration
* @virtVlan: vlan tag info
* @mtu: requested MTU for port (or 0 for "default")
* @actualMTU: MTU actually set for port (after accounting for bridge's MTU)
*
* Ensures that the tap device (@tapname) is connected to the bridge
* (@brname), potentially removing it from any existing bridge that
* does not match.
*
* Returns 0 in case of success or -1 on failure
*/
int
virNetDevTapReattachBridge(const char *tapname,
const char *brname,
const virMacAddr *macaddr,
const unsigned char *vmuuid,
const virNetDevVPortProfile *virtPortProfile,
const virNetDevVlan *virtVlan,
virTristateBool isolatedPort,
unsigned int mtu,
unsigned int *actualMTU)
{
bool useOVS = false;
g_autofree char *master = NULL;
if (virNetDevGetMaster(tapname, &master) < 0)
return -1;
/* IFLA_MASTER for a tap on an OVS switch is always "ovs-system" */
if (STREQ_NULLABLE(master, "ovs-system")) {
useOVS = true;
if (virNetDevOpenvswitchInterfaceGetMaster(tapname, &master) < 0)
return -1;
}
/* Nothing more todo if we're on the right bridge already */
if (STREQ_NULLABLE(brname, master))
return 0;
/* disconnect from current (incorrect) bridge, if any */
if (master) {
int ret;
VIR_INFO("Removing %s from %s", tapname, master);
if (useOVS)
ret = virNetDevOpenvswitchRemovePort(master, tapname);
else
ret = virNetDevBridgeRemovePort(master, tapname);
if (ret < 0)
return -1;
}
VIR_INFO("Attaching %s to %s", tapname, brname);
if (virNetDevTapAttachBridge(tapname, brname,
macaddr, vmuuid,
virtPortProfile,
virtVlan,
isolatedPort,
mtu, actualMTU) < 0)
return -1;
return 0;
}
/**
* virNetDevTapCreateInBridgePort:
* @brname: the bridge name
* @ifname: the interface name (or name template)
* @macaddr: desired MAC address
* @tunpath: path to the tun device (if NULL, /dev/net/tun is used)
* @tapfd: array of file descriptor return value for the new tap device
* @tapfdSize: number of file descriptors in @tapfd
* @virtPortProfile: bridge/port specific configuration
* @coalesce: optional coalesce parameters
* @mtu: requested MTU for port (or 0 for "default")
* @actualMTU: MTU actually set for port (after accounting for bridge's MTU)
* @flags: OR of virNetDevTapCreateFlags:
* VIR_NETDEV_TAP_CREATE_IFUP
* - Bring the interface up
* VIR_NETDEV_TAP_CREATE_VNET_HDR
* - Enable IFF_VNET_HDR on the tap device
* VIR_NETDEV_TAP_CREATE_USE_MAC_FOR_BRIDGE
* - Set this interface's MAC as the bridge's MAC address
network: fix dnsmasq/radvd binding to IPv6 on recent kernels I hit this problem recently when trying to create a bridge with an IPv6 address on a 3.2 kernel: dnsmasq (and, further, radvd) would not bind to the given address, waiting 20s and then giving up with -EADDRNOTAVAIL (resp. exiting immediately with "error parsing or activating the config file", without libvirt noticing it, BTW). This can be reproduced with (I think) any kernel >= 2.6.39 and the following XML (to be used with "virsh net-create"): <network> <name>test-bridge</name> <bridge name='testbr0' /> <ip family='ipv6' address='fd00::1' prefix='64'> </ip> </network> (it happens even when you have an IPv4, too) The problem is that since commit [1] (which, ironically, was made to “help IPv6 autoconfiguration”) the linux bridge code makes bridges behave like “real” devices regarding carrier detection. This makes the bridges created by libvirt, which are started without any up devices, stay with the NO-CARRIER flag set, and thus prevents DAD (Duplicate address detection) from happening, thus letting the IPv6 address flagged as “tentative”. Such addresses cannot be bound to (see RFC 2462), so dnsmasq fails binding to it (for radvd, it detects that "interface XXX is not RUNNING", thus that "interface XXX does not exist, ignoring the interface" (sic)). It seems that this behavior was enhanced somehow with commit [2] by avoiding setting NO-CARRIER on empty bridges, but I couldn't reproduce this behavior on my kernel. Anyway, with the “dummy tap to set MAC address” trick, this wouldn't work. To fix this, the idea is to get the bridge's attached device to be up so that DAD can happen (deactivating DAD altogether is not a good idea, I think). Currently, libvirt creates a dummy TAP device to set the MAC address of the bridge, keeping it down. But even if we set this device up, it is not RUNNING as soon as the tap file descriptor attached to it is closed, thus still preventing DAD. So, we must modify the API a bit, so that we can get the fd, keep the tap device persistent, run the daemons, and close it after DAD has taken place. After that, the bridge will be flagged NO-CARRIER again, but the daemons will be running, even if not happy about the device's state (but we don't really care about the bridge's daemons doing anything when no up interface is connected to it). Other solutions that I envisioned were: * Keeping the *-nic interface up: this would waste an fd for each bridge during all its life. May be acceptable, I don't really know. * Stop using the dummy tap trick, and set the MAC address directly on the bridge: it is possible since quite some time it seems, even if then there is the problem of the bridge not being RUNNING when empty, contrary to what [2] says, so this will need fixing (and this fix only happened in 3.1, so it wouldn't work for 2.6.39) * Using the --interface option of dnsmasq, but I saw somewhere that it's not used by libvirt for backward compatibility. I am not sure this would solve this problem, though, as I don't know how dnsmasq binds itself to it with this option. This is why this patch does what's described earlier. This patch also makes radvd start even if the interface is “missing” (i.e. it is not RUNNING), as it daemonizes before binding to it, and thus sometimes does it after the interface has been brought down by us (by closing the tap fd), and then originally stops. This also makes it stop yelling about it in the logs when the interface is down at a later time. [1] http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=1faa4356a3bd89ea11fb92752d897cff3a20ec0e [2] http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=b64b73d7d0c480f75684519c6134e79d50c1b341
2012-09-26 19:02:20 +00:00
* VIR_NETDEV_TAP_CREATE_PERSIST
* - The device will persist after the file descriptor is closed
*
* This function creates a new tap device on a bridge. @ifname can be either
* a fixed name or a name template with '%d' for dynamic name allocation.
* in either case the final name for the bridge will be stored in @ifname.
network: fix dnsmasq/radvd binding to IPv6 on recent kernels I hit this problem recently when trying to create a bridge with an IPv6 address on a 3.2 kernel: dnsmasq (and, further, radvd) would not bind to the given address, waiting 20s and then giving up with -EADDRNOTAVAIL (resp. exiting immediately with "error parsing or activating the config file", without libvirt noticing it, BTW). This can be reproduced with (I think) any kernel >= 2.6.39 and the following XML (to be used with "virsh net-create"): <network> <name>test-bridge</name> <bridge name='testbr0' /> <ip family='ipv6' address='fd00::1' prefix='64'> </ip> </network> (it happens even when you have an IPv4, too) The problem is that since commit [1] (which, ironically, was made to “help IPv6 autoconfiguration”) the linux bridge code makes bridges behave like “real” devices regarding carrier detection. This makes the bridges created by libvirt, which are started without any up devices, stay with the NO-CARRIER flag set, and thus prevents DAD (Duplicate address detection) from happening, thus letting the IPv6 address flagged as “tentative”. Such addresses cannot be bound to (see RFC 2462), so dnsmasq fails binding to it (for radvd, it detects that "interface XXX is not RUNNING", thus that "interface XXX does not exist, ignoring the interface" (sic)). It seems that this behavior was enhanced somehow with commit [2] by avoiding setting NO-CARRIER on empty bridges, but I couldn't reproduce this behavior on my kernel. Anyway, with the “dummy tap to set MAC address” trick, this wouldn't work. To fix this, the idea is to get the bridge's attached device to be up so that DAD can happen (deactivating DAD altogether is not a good idea, I think). Currently, libvirt creates a dummy TAP device to set the MAC address of the bridge, keeping it down. But even if we set this device up, it is not RUNNING as soon as the tap file descriptor attached to it is closed, thus still preventing DAD. So, we must modify the API a bit, so that we can get the fd, keep the tap device persistent, run the daemons, and close it after DAD has taken place. After that, the bridge will be flagged NO-CARRIER again, but the daemons will be running, even if not happy about the device's state (but we don't really care about the bridge's daemons doing anything when no up interface is connected to it). Other solutions that I envisioned were: * Keeping the *-nic interface up: this would waste an fd for each bridge during all its life. May be acceptable, I don't really know. * Stop using the dummy tap trick, and set the MAC address directly on the bridge: it is possible since quite some time it seems, even if then there is the problem of the bridge not being RUNNING when empty, contrary to what [2] says, so this will need fixing (and this fix only happened in 3.1, so it wouldn't work for 2.6.39) * Using the --interface option of dnsmasq, but I saw somewhere that it's not used by libvirt for backward compatibility. I am not sure this would solve this problem, though, as I don't know how dnsmasq binds itself to it with this option. This is why this patch does what's described earlier. This patch also makes radvd start even if the interface is “missing” (i.e. it is not RUNNING), as it daemonizes before binding to it, and thus sometimes does it after the interface has been brought down by us (by closing the tap fd), and then originally stops. This also makes it stop yelling about it in the logs when the interface is down at a later time. [1] http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=1faa4356a3bd89ea11fb92752d897cff3a20ec0e [2] http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=b64b73d7d0c480f75684519c6134e79d50c1b341
2012-09-26 19:02:20 +00:00
* If the @tapfd parameter is supplied, the open tap device file descriptor
* will be returned, otherwise the TAP device will be closed. The caller must
* use virNetDevTapDelete to remove a persistent TAP device when it is no
* longer needed.
*
* Returns 0 in case of success or -1 on failure
*/
int virNetDevTapCreateInBridgePort(const char *brname,
char **ifname,
maint: avoid 'const fooPtr' in virnet files 'const fooPtr' is the same as 'foo * const' (the pointer won't change, but it's contents can). But in general, if an interface is trying to be const-correct, it should be using 'const foo *' (the pointer is to data that can't be changed). Fix up remaining offenders in src/util. * src/util/virnetdev.h (virNetDevSetMAC) (virNetDevReplaceMacAddress, virNetDevValidateConfig) (virNetDevReplaceNetConfig): Use intended type. * src/util/virnetdevbandwidth.h (virNetDevBandwidthCopy) (virNetDevBandwidthPlug): Likewise. * src/util/virnetdevmacvlan.h (virNetDevMacVLanCreate) (virNetDevMacVLanCreateWithVPortProfile) (virNetDevMacVLanDeleteWithVPortProfile) (virNetDevMacVLanRestartWithVPortProfile) (virNetDevMacVLanVPortProfileRegisterCallback): Likewise. * src/util/virnetdevopenvswitch.h (virNetDevOpenvswitchAddPort): Likewise. * src/util/virnetdevtap.h (virNetDevTapCreateInBridgePort): Likewise. * src/util/virnetdevvlan.h (virNetDevVlanEqual) (virNetDevVlanCopy): Likewise. * src/util/virnetdevvportprofile.h (virNetDevVPortProfileAssociate) (virNetDevVPortProfileDisassociate): Likewise. * src/util/virnetlink.h (virNetlinkEventRemoveCallback) (virNetlinkEventAddClient, virNetlinkEventRemoveClient): Likewise. * src/util/virnetdev.c (virNetDevSetMAC) (virNetDevReplaceMacAddress, virNetDevValidateConfig) (virNetDevReplaceNetConfig): Fix fallout. * src/util/virnetdevbandwidth.c (virNetDevBandwidthCopy) (virNetDevBandwidthPlug): Likewise. * src/util/virnetdevmacvlan.c (virNetDevMacVLanCreate) (virNetDevMacVLanCreateWithVPortProfile) (virNetDevMacVLanDeleteWithVPortProfile) (virNetDevMacVLanRestartWithVPortProfile) (virNetDevMacVLanVPortProfileRegisterCallback): Likewise. * src/util/virnetdevopenvswitch.c (virNetDevOpenvswitchAddPort): Likewise. * src/util/virnetdevtap.c (virNetDevTapCreateInBridgePort): Likewise. * src/util/virnetdevvlan.c (virNetDevVlanEqual) (virNetDevVlanCopy): Likewise. * src/util/virnetdevvportprofile.c (virNetDevVPortProfileAssociate) (virNetDevVPortProfileDisassociate) (virNetDevVPortProfileOpSetLink, virNetDevVPortProfileOpCommon) (virNetDevVPortProfileOp8021Qbg, virNetDevVPortProfileOp8021Qbh): Likewise. * src/util/virnetlink.c (virNetlinkEventRemoveCallback) (virNetlinkEventAddClient, virNetlinkEventRemoveClient): Likewise. Signed-off-by: Eric Blake <eblake@redhat.com>
2013-10-05 19:41:44 +00:00
const virMacAddr *macaddr,
const unsigned char *vmuuid,
const char *tunpath,
int *tapfd,
size_t tapfdSize,
const virNetDevVPortProfile *virtPortProfile,
const virNetDevVlan *virtVlan,
virTristateBool isolatedPort,
virNetDevCoalescePtr coalesce,
unsigned int mtu,
unsigned int *actualMTU,
unsigned int flags)
{
virMacAddr tapmac;
size_t i;
util: centralize tap device MAC address 1st byte "0xFE" modification When a tap device for a domain is created and attached to a bridge, the first byte of the tap device MAC address is set to 0xFE, while the rest is set to match the MAC address that will be presented to the guest as its network device MAC address. Setting this high value in the tap's MAC address discourages the bridge from using the tap device's MAC address as the bridge's own MAC address (Linux bridges always take on the lowest numbered MAC address of all attached devices as their own). In one case within libvirt, a tap device is created and attached to the bridge with the intent that its MAC address be taken on by the bridge as its own (this is used to assure that the bridge has a fixed MAC address to prevent network outages created by the bridge MAC address "flapping" as guests are started and stopped). In this case, the first byte of the mac address is *not* altered to 0xFE. In the current code, callers to virNetDevTapCreateInBridgePort each make the MAC address modification themselves before calling, which leads to code duplication, and also prevents lower level functions from knowing the real MAC address being used by the guest. The problem here is that openvswitch bridges must be informed about this MAC address, or they will be unable to pass traffic to/from the guest. This patch centralizes the location of the MAC address "0xFE fixup" into virNetDevTapCreateInBridgePort(), meaning 1) callers of this function no longer need the extra strange bit of code, and 2) bitNetDevTapCreateBridgeInPort itself now is called with the guest's unaltered MAC address, and can pass it on, unmodified, to virNetDevOpenvswitchAddPort. There is no other behavioral change created by this patch.
2012-02-16 23:49:57 +00:00
if (virNetDevTapCreate(ifname, tunpath, tapfd, tapfdSize, flags) < 0)
return -1;
/* We need to set the interface MAC before adding it
* to the bridge, because the bridge assumes the lowest
* MAC of all enslaved interfaces & we don't want it
* seeing the kernel allocate random MAC for the TAP
* device before we set our static MAC.
*/
virMacAddrSet(&tapmac, macaddr);
if (!(flags & VIR_NETDEV_TAP_CREATE_USE_MAC_FOR_BRIDGE)) {
util: allow tap-based guest interfaces to have MAC address prefix 0xFE Back in July 2010, commit 6ea90b84 (meant to resolve https://bugzilla.redhat.com/571991 ) added code to set the MAC address of any tap device to the associated guest interface's MAC, but with the first byte replaced with 0xFE. This was done in order to assure that 1) the tap MAC and guest interface MAC were different (otherwise L2 forwarding through the tap would not work, and the kernel would repeatedly issue a warning stating as much). 2) any bridge device that had one of these taps attached would *not* take on the MAC of the tap (leading to network instability as guests started and stopped) A couple years later, https://bugzilla.redhat.com/798467 was filed, complaining that a user could configure a tap-based guest interface to have a MAC address that itself had a first byte of 0xFE, silently (other than the kernel warning messages) resulting in a non-working configuration. This was fixed by commit 5d571045, which logged an error and failed the guest start / interface attach if the MAC's first byte was 0xFE. Although this restriction only reduces the potential pool of MAC addresses from 2^46 (last two bits of byte 1 must be set to 10) by 2^32 (still 4 orders of magnitude larger than the entire IPv4 address space), it also means that management software that autogenerates MAC addresses must have special code to avoid an 0xFE prefix. Now after 7 years, someone has noticed this restriction and requested that we remove it. So instead of failing when 0xFE is found as the first byte, this patch removes the restriction by just replacing the first byte in the tap device MAC with 0xFA if the first byte in the guest interface is 0xFE. 0xFA is the next-highest value that still has 10 as the lowest two bits, and still 2) meets the requirement of "tap MAC must be different from guest interface MAC", and 3) is high enough that there should never be an issue of the attached bridge device taking on the MAC of the tap. The result is that *any* MAC can be chosen by management software (although it would still not work correctly if a multicast MAC (lowest bit of first byte set to 1) was chosen), but that's a different issue). Signed-off-by: Laine Stump <laine@redhat.com> Reviewed-by: Michal Privoznik <mprivozn@redhat.com
2019-08-11 20:21:42 +00:00
/* The tap device's MAC address cannot match the MAC address
* used by the guest. This results in "received packet on
* vnetX with own address as source address" error logs from
* the kernel. Making the tap address as high as possible
* discourages the bridge from using this tap's MAC as its own
* (a Linux host bridge will take on the lowest numbered MAC
* of all devices attached to it).
*/
if (tapmac.addr[0] == 0xFE)
tapmac.addr[0] = 0xFA;
else
tapmac.addr[0] = 0xFE;
}
util: centralize tap device MAC address 1st byte "0xFE" modification When a tap device for a domain is created and attached to a bridge, the first byte of the tap device MAC address is set to 0xFE, while the rest is set to match the MAC address that will be presented to the guest as its network device MAC address. Setting this high value in the tap's MAC address discourages the bridge from using the tap device's MAC address as the bridge's own MAC address (Linux bridges always take on the lowest numbered MAC address of all attached devices as their own). In one case within libvirt, a tap device is created and attached to the bridge with the intent that its MAC address be taken on by the bridge as its own (this is used to assure that the bridge has a fixed MAC address to prevent network outages created by the bridge MAC address "flapping" as guests are started and stopped). In this case, the first byte of the mac address is *not* altered to 0xFE. In the current code, callers to virNetDevTapCreateInBridgePort each make the MAC address modification themselves before calling, which leads to code duplication, and also prevents lower level functions from knowing the real MAC address being used by the guest. The problem here is that openvswitch bridges must be informed about this MAC address, or they will be unable to pass traffic to/from the guest. This patch centralizes the location of the MAC address "0xFE fixup" into virNetDevTapCreateInBridgePort(), meaning 1) callers of this function no longer need the extra strange bit of code, and 2) bitNetDevTapCreateBridgeInPort itself now is called with the guest's unaltered MAC address, and can pass it on, unmodified, to virNetDevOpenvswitchAddPort. There is no other behavioral change created by this patch.
2012-02-16 23:49:57 +00:00
if (virNetDevSetMAC(*ifname, &tapmac) < 0)
goto error;
if (virNetDevTapAttachBridge(*ifname, brname, macaddr, vmuuid,
virtPortProfile, virtVlan,
isolatedPort, mtu, actualMTU) < 0) {
goto error;
}
if (virNetDevSetOnline(*ifname, !!(flags & VIR_NETDEV_TAP_CREATE_IFUP)) < 0)
goto error;
if (virNetDevSetCoalesce(*ifname, coalesce, false) < 0)
goto error;
return 0;
error:
for (i = 0; i < tapfdSize && tapfd[i] >= 0; i++)
VIR_FORCE_CLOSE(tapfd[i]);
return -1;
}
/*-------------------- interface stats --------------------*/
/**
* virNetDevTapInterfaceStats:
* @ifname: interface
* @stats: where to store statistics
* @swapped: whether to swap RX/TX fields
*
* Fetch RX/TX statistics for given named interface (@ifname) and
* store them at @stats. The returned statistics are always from
* domain POV. Because in some cases this means swapping RX/TX in
* the stats and in others this means no swapping (consider TAP
* vs macvtap) caller might choose if the returned stats should
* be @swapped or not.
*
* Returns 0 on success, -1 otherwise (with error reported).
*/
#ifdef __linux__
int
virNetDevTapInterfaceStats(const char *ifname,
virDomainInterfaceStatsPtr stats,
bool swapped)
{
int ifname_len;
FILE *fp;
char line[256], *colon;
if (!ifname) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Interface name not provided"));
return -1;
}
fp = fopen("/proc/net/dev", "r");
if (!fp) {
virReportSystemError(errno, "%s",
_("Could not open /proc/net/dev"));
return -1;
}
ifname_len = strlen(ifname);
while (fgets(line, sizeof(line), fp)) {
long long dummy;
long long rx_bytes;
long long rx_packets;
long long rx_errs;
long long rx_drop;
long long tx_bytes;
long long tx_packets;
long long tx_errs;
long long tx_drop;
/* The line looks like:
* " eth0:..."
* Split it at the colon.
*/
colon = strchr(line, ':');
if (!colon) continue;
*colon = '\0';
if (colon-ifname_len >= line &&
STREQ(colon-ifname_len, ifname)) {
if (sscanf(colon+1,
"%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld",
&rx_bytes, &rx_packets, &rx_errs, &rx_drop,
&dummy, &dummy, &dummy, &dummy,
&tx_bytes, &tx_packets, &tx_errs, &tx_drop,
&dummy, &dummy, &dummy, &dummy) != 16)
continue;
if (swapped) {
stats->rx_bytes = tx_bytes;
stats->rx_packets = tx_packets;
stats->rx_errs = tx_errs;
stats->rx_drop = tx_drop;
stats->tx_bytes = rx_bytes;
stats->tx_packets = rx_packets;
stats->tx_errs = rx_errs;
stats->tx_drop = rx_drop;
} else {
stats->rx_bytes = rx_bytes;
stats->rx_packets = rx_packets;
stats->rx_errs = rx_errs;
stats->rx_drop = rx_drop;
stats->tx_bytes = tx_bytes;
stats->tx_packets = tx_packets;
stats->tx_errs = tx_errs;
stats->tx_drop = tx_drop;
}
VIR_FORCE_FCLOSE(fp);
return 0;
}
}
VIR_FORCE_FCLOSE(fp);
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("/proc/net/dev: Interface not found"));
return -1;
}
#elif defined(HAVE_GETIFADDRS) && defined(AF_LINK)
int
virNetDevTapInterfaceStats(const char *ifname,
virDomainInterfaceStatsPtr stats,
bool swapped)
{
struct ifaddrs *ifap, *ifa;
struct if_data *ifd;
int ret = -1;
if (!ifname) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Interface name not provided"));
return -1;
}
if (getifaddrs(&ifap) < 0) {
virReportSystemError(errno, "%s",
_("Could not get interface list"));
return -1;
}
for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
util: check ifa_addr pointer before accessing its elements Reported by Rafał Wojciechowski <it@rafalwojciechowski.pl>. Thread 1 (Thread 0x7f194b99d700 (LWP 5631)): 0 virNetDevGetifaddrsAddress (addr=0x7f194b99c7c0, ifname=0x7f193400e2b0 "ovirtmgmt") at util/virnetdevip.c:738 1 virNetDevIPAddrGet (ifname=0x7f193400e2b0 "ovirtmgmt", addr=addr@entry=0x7f194b99c7c0) at util/virnetdevip.c:795 2 0x00007f19467800d6 in networkGetNetworkAddress (netname=<optimized out>, netaddr=netaddr@entry=0x7f1924013f18) at network/bridge_driver.c:4780 3 0x00007f193e43a33c in qemuProcessGraphicsSetupNetworkAddress (listenAddr=0x7f19340f7650 "127.0.0.1", glisten=0x7f1924013f10) at qemu/qemu_process.c:4062 4 qemuProcessGraphicsSetupListen (vm=<optimized out>, graphics=0x7f1924014f10, cfg=0x7f1934119f00) at qemu/qemu_process.c:4133 5 qemuProcessSetupGraphics (flags=17, vm=0x7f19240155d0, driver=0x7f193411f1d0) at qemu/qemu_process.c:4196 6 qemuProcessPrepareDomain (conn=conn@entry=0x7f192c00ab50, driver=driver@entry=0x7f193411f1d0, vm=vm@entry=0x7f19240155d0, flags=flags@entry=17) at qemu/qemu_process.c:4969 7 0x00007f193e4417c0 in qemuProcessStart (conn=conn@entry=0x7f192c00ab50, driver=driver@entry=0x7f193411f1d0, vm=0x7f19240155d0,asyncJob=asyncJob@entry=QEMU_ASYNC_JOB_START, migrateFrom=migrateFrom@entry=0x0, migrateFd=migrateFd@entry=-1, migratePath=migratePath@entry=0x0,snapshot=snapshot@entry=0x0, vmop=vmop@entry=VIR_NETDEV_VPORT_PROFILE_OP_CREATE, flags=17, flags@entry=1) at qemu/qemu_process.c:5553 Man page for getifaddrs also states that the "ifa_addr" may contain a null pointer which happens if there is an existing network interface on the host without IP address. Signed-off-by: Pavel Hrdina <phrdina@redhat.com>
2017-04-21 08:50:12 +00:00
if (!ifa->ifa_addr)
continue;
if (ifa->ifa_addr->sa_family != AF_LINK)
continue;
if (STREQ(ifa->ifa_name, ifname)) {
ifd = (struct if_data *)ifa->ifa_data;
if (swapped) {
stats->tx_bytes = ifd->ifi_ibytes;
stats->tx_packets = ifd->ifi_ipackets;
stats->tx_errs = ifd->ifi_ierrors;
stats->tx_drop = ifd->ifi_iqdrops;
stats->rx_bytes = ifd->ifi_obytes;
stats->rx_packets = ifd->ifi_opackets;
stats->rx_errs = ifd->ifi_oerrors;
# ifdef HAVE_STRUCT_IF_DATA_IFI_OQDROPS
stats->rx_drop = ifd->ifi_oqdrops;
# else
stats->rx_drop = 0;
# endif
} else {
stats->tx_bytes = ifd->ifi_obytes;
stats->tx_packets = ifd->ifi_opackets;
stats->tx_errs = ifd->ifi_oerrors;
# ifdef HAVE_STRUCT_IF_DATA_IFI_OQDROPS
stats->tx_drop = ifd->ifi_oqdrops;
# else
stats->tx_drop = 0;
# endif
stats->rx_bytes = ifd->ifi_ibytes;
stats->rx_packets = ifd->ifi_ipackets;
stats->rx_errs = ifd->ifi_ierrors;
stats->rx_drop = ifd->ifi_iqdrops;
}
ret = 0;
break;
}
}
if (ret < 0)
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Interface not found"));
freeifaddrs(ifap);
return ret;
}
#else
int
virNetDevTapInterfaceStats(const char *ifname G_GNUC_UNUSED,
virDomainInterfaceStatsPtr stats G_GNUC_UNUSED,
bool swapped G_GNUC_UNUSED)
{
virReportError(VIR_ERR_OPERATION_INVALID, "%s",
_("interface stats not implemented on this platform"));
return -1;
}
#endif /* __linux__ */