1
0
mirror of https://gitlab.com/libvirt/libvirt.git synced 2025-01-12 07:42:56 +00:00

network: add tc filter rule to nftables backend to fix checksum of DHCP responses

Please see the commit log for commit v10.9.0-rc1-1-g42ab0148dd for the
history and explanation of the problem that this patch is fixing.

A shorter explanation is that when a guest is connected to a libvirt
virtual network using a virtio-net adapter with in-kernel "vhost-net"
packet processing enabled, it will fail to acquire an IP address from
a DHCP seever running on the host.

In commit v10.9.0-rc1-1-g42ab0148dd we tried fixing this by *zeroing
out* the checksums of these packets with an nftables rule (nftables
can't recompute the checksum, but it can set it to 0) . This
*appeared* to work initially, but it turned out that zeroing the
checksum ends up breaking dhcp packets on *non* virtio/vhost-net guest
interfaces. That attempt was reverted in commit v10.9.0-rc2.

Fortunately, there is an existing way to recompute the checksum of a
packet as it leaves an interface - the "tc" (traffic control) utility
that libvirt already uses for bandwidth management. This patch uses a
tc filter rule to match dhcp response packets on the bridge and
recompute their checksum.

The filter rule must be attached to a tc qdisc, which may also have a
filter attached for bandwidth management (in the <bandwidth> element
of the network config). Not only must we add the qdisc only once
(which was already handled by the patch two prior to this one), but
also the filter rule for checksum fixing and the filter rule for
bandwidth management must be different priorities so they don't clash;
this is solved by adding the checksum-fix filter with "priority 2",
while the bandwidth management filter remains "priority 1" (both will
always be evaluated anyway, it's just a matter of which is evaluated
first).

So far this method has worked with every different guest we could
throw at it, including several that failed with the previous method.

Fixes: b89c4991daa0ee9371f10937fab3b03c5ffdabc6
Reported-by: Rich Jones <rjones@redhat.com>
Reported-by: Andrea Bolognani <abologna@redhat.com>
Fix-Suggested-by: Eric Garver <egarver@redhat.com>
Fix-Suggested-by: Phil Sutter <psutter@redhat.com>
Signed-off-by: Laine Stump <laine@redhat.com>
Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
This commit is contained in:
Laine Stump 2024-11-25 22:24:49 -05:00 committed by Michal Privoznik
parent 6412c2cb51
commit e9e5ebe6a6
12 changed files with 508 additions and 0 deletions

@ -29,6 +29,7 @@
#include "internal.h" #include "internal.h"
#include "virfirewalld.h" #include "virfirewalld.h"
#include "vircommand.h"
#include "virerror.h" #include "virerror.h"
#include "virlog.h" #include "virlog.h"
#include "virhash.h" #include "virhash.h"
@ -924,6 +925,67 @@ nftablesAddIPSpecificFirewallRules(virFirewall *fw,
} }
/**
* nftablesAddUdpChecksumFixWithTC:
*
* Add a tc filter rule to @ifname (the bridge device of this network)
* that will recompute the checksum of udp packets output from @iface with
* destination port @port.
*
* Normally the checksum should be filled by some part of the basic
* network stack, but there are cases (e.g. DHCP response packets sent
* from virtualization host to a QEMU guest when the guest NIC uses
* vhost-net packet processing) when the host (sender) thinks that
* packet checksums will be computed elsewhere (and so leaves a
* partially computed checksum in the packet header) while the guest
* (receiver) thinks that the checksum has already been fully
* computed; in the meantime none of the code in between has actually
* finished computing the checksum.
*
* An example of this is DHCP response packets from host to guest. If
* the checksum of each of these packets isn't properly computed, then
* many guests (e.g. FreeBSD) will drop them with reason BAD CHECKSUM;
* this tc filter rule will fix the ip and udp checksums, and the
* FreeBSD dhcp client will happily accept the packet.
*
* (NB: if you're wondering how the tc qdisc and filter are removed
* when the network is destroyed, the answer is that the kernel
* automatically (and properly) removes them for us, so we don't need
* to worry about keeping track/deleting as we do with nftables rules)
*/
static int
nftablesAddUdpChecksumFixWithTC(virFirewall *fw,
const char *iface,
int port)
{
g_autofree char *portstr = g_strdup_printf("%d", port);
/* this will add the qdisc (that the filter below is attached to)
* unless it already exists
*/
if (virNetDevBandWidthAddTxFilterParentQdisc(iface, true) < 0)
return -1;
/* add a filter to catch all udp packets with dst "port" and
* recompute their checksum
*/
virFirewallAddCmd(fw, VIR_FIREWALL_LAYER_TC,
"filter", "add", "dev", iface,
"prio", "2", "protocol", "ip", "parent", "1:",
"u32", "match", "ip", "dport", portstr, "ffff",
"action", "csum", "ip", "and", "udp",
NULL);
virFirewallAddRollbackCmd(fw, VIR_FIREWALL_LAYER_TC,
"filter", "del", "dev", iface,
"prio", "2", "protocol", "ip", "parent", "1:",
"u32", "match", "ip", "dport", portstr, "ffff",
"action", "csum", "ip", "and", "udp",
NULL);
return 0;
}
/* nftablesAddFirewallrules: /* nftablesAddFirewallrules:
* *
* @def - the network that needs an nftables firewall added * @def - the network that needs an nftables firewall added
@ -944,6 +1006,12 @@ nftablesAddFirewallRules(virNetworkDef *def, virFirewall **fwRemoval)
virFirewallStartTransaction(fw, VIR_FIREWALL_TRANSACTION_AUTO_ROLLBACK); virFirewallStartTransaction(fw, VIR_FIREWALL_TRANSACTION_AUTO_ROLLBACK);
/* add the tc filter rule needed to fixup the checksum of dhcp
* response packets going from host to guest.
*/
if (nftablesAddUdpChecksumFixWithTC(fw, def->bridge, 68) < 0)
return -1;
nftablesAddGeneralFirewallRules(fw, def); nftablesAddGeneralFirewallRules(fw, def);
for (i = 0; for (i = 0;

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \

@ -1,3 +1,43 @@
tc \
qdisc \
show \
dev \
virbr0 \
handle \
1:
tc \
qdisc \
add \
dev \
virbr0 \
root \
handle \
1: \
htb \
default \
2
tc \
filter \
add \
dev \
virbr0 \
prio \
2 \
protocol \
ip \
parent \
1: \
u32 \
match \
ip \
dport \
68 \
ffff \
action \
csum \
ip \
and \
udp
nft \ nft \
-ae insert \ -ae insert \
rule \ rule \