2024-08-02 18:10:36 +02:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
* Copyright Red Hat
|
|
|
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
|
|
|
*
|
|
|
|
* UDP flow tracking functions
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <errno.h>
|
|
|
|
#include <fcntl.h>
|
|
|
|
#include <sys/uio.h>
|
2024-08-20 00:22:55 +02:00
|
|
|
#include <unistd.h>
|
2024-08-02 18:10:36 +02:00
|
|
|
|
|
|
|
#include "util.h"
|
|
|
|
#include "passt.h"
|
|
|
|
#include "flow_table.h"
|
|
|
|
|
|
|
|
/* udp_flow_timer() closes a flow once its timestamp is more than this many
 * seconds old
 */
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
|
|
|
|
|
|
|
|
/**
|
|
|
|
* udp_at_sidx() - Get UDP specific flow at given sidx
|
|
|
|
* @sidx: Flow and side to retrieve
|
|
|
|
*
|
|
|
|
* Return: UDP specific flow at @sidx, or NULL of @sidx is invalid. Asserts if
|
|
|
|
* the flow at @sidx is not FLOW_UDP.
|
|
|
|
*/
|
|
|
|
struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
|
|
|
|
{
|
|
|
|
union flow *flow = flow_at_sidx(sidx);
|
|
|
|
|
|
|
|
if (!flow)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
ASSERT(flow->f.type == FLOW_UDP);
|
|
|
|
return &flow->udp;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* udp_flow_close() - Close and clean up UDP flow
|
|
|
|
* @c: Execution context
|
|
|
|
* @uflow: UDP flow
|
|
|
|
*/
|
2024-09-06 15:17:06 +10:00
|
|
|
void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
|
2024-08-02 18:10:36 +02:00
|
|
|
{
|
2024-09-06 15:17:06 +10:00
|
|
|
if (uflow->closed)
|
|
|
|
return; /* Nothing to do */
|
|
|
|
|
2024-08-02 18:10:36 +02:00
|
|
|
if (uflow->s[INISIDE] >= 0) {
|
|
|
|
/* The listening socket needs to stay in epoll */
|
|
|
|
close(uflow->s[INISIDE]);
|
|
|
|
uflow->s[INISIDE] = -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (uflow->s[TGTSIDE] >= 0) {
|
|
|
|
/* But the flow specific one needs to be removed */
|
|
|
|
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, uflow->s[TGTSIDE], NULL);
|
|
|
|
close(uflow->s[TGTSIDE]);
|
|
|
|
uflow->s[TGTSIDE] = -1;
|
|
|
|
}
|
|
|
|
flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
|
|
|
|
if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
|
|
|
|
flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
|
2024-09-06 15:17:06 +10:00
|
|
|
|
|
|
|
uflow->closed = true;
|
2024-08-02 18:10:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* udp_flow_new() - Common setup for a new UDP flow
|
|
|
|
* @c: Execution context
|
|
|
|
* @flow: Initiated flow
|
|
|
|
* @s_ini: Initiating socket (or -1)
|
|
|
|
* @now: Timestamp
|
|
|
|
*
|
|
|
|
* Return: UDP specific flow, if successful, NULL on failure
|
|
|
|
*/
|
|
|
|
static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
|
|
|
|
int s_ini, const struct timespec *now)
|
|
|
|
{
|
|
|
|
struct udp_flow *uflow = NULL;
|
|
|
|
const struct flowside *tgt;
|
|
|
|
uint8_t tgtpif;
|
|
|
|
|
|
|
|
if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
|
|
|
|
goto cancel;
|
|
|
|
tgtpif = flow->f.pif[TGTSIDE];
|
|
|
|
|
|
|
|
uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
|
|
|
|
uflow->ts = now->tv_sec;
|
|
|
|
uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
|
|
|
|
|
|
|
|
if (s_ini >= 0) {
|
|
|
|
/* When using auto port-scanning the listening port could go
|
|
|
|
* away, so we need to duplicate the socket
|
|
|
|
*/
|
|
|
|
uflow->s[INISIDE] = fcntl(s_ini, F_DUPFD_CLOEXEC, 0);
|
|
|
|
if (uflow->s[INISIDE] < 0) {
|
|
|
|
flow_err(uflow,
|
|
|
|
"Couldn't duplicate listening socket: %s",
|
treewide: Dodge dynamic memory allocation in strerror() from glibc > 2.40
With glibc commit 25a5eb4010df ("string: strerror, strsignal cannot
use buffer after dlmopen (bug 32026)"), strerror() now needs, at least
on x86, the getrandom() and brk() system calls, in order to fill in
the locale-translated error message. But getrandom() and brk() are not
allowed by our seccomp profiles.
This became visible on Fedora Rawhide with the "podman login and
logout" Podman tests, defined at test/e2e/login_logout_test.go in the
Podman source tree, where pasta would terminate upon printing error
descriptions (at least the ones related to the SO_ERROR queue for
spliced connections).
Avoid dynamic memory allocation by calling strerrordesc_np() instead,
which is a GNU function returning a static, untranslated version of
the error description. If it's not available, keep calling strerror(),
which at that point should be simple enough as to be usable (at least,
that's currently the case for musl).
Reported-by: Paul Holzinger <pholzing@redhat.com>
Link: https://github.com/containers/podman/issues/24804
Analysed-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: Paul Holzinger <pholzing@redhat.com>
2024-12-11 00:13:39 +01:00
|
|
|
strerror_(errno));
|
2024-08-02 18:10:36 +02:00
|
|
|
goto cancel;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (pif_is_socket(tgtpif)) {
|
|
|
|
struct mmsghdr discard[UIO_MAXIOV] = { 0 };
|
|
|
|
union {
|
|
|
|
flow_sidx_t sidx;
|
|
|
|
uint32_t data;
|
|
|
|
} fref = {
|
|
|
|
.sidx = FLOW_SIDX(flow, TGTSIDE),
|
|
|
|
};
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
uflow->s[TGTSIDE] = flowside_sock_l4(c, EPOLL_TYPE_UDP_REPLY,
|
|
|
|
tgtpif, tgt, fref.data);
|
|
|
|
if (uflow->s[TGTSIDE] < 0) {
|
|
|
|
flow_dbg(uflow,
|
|
|
|
"Couldn't open socket for spliced flow: %s",
|
treewide: Dodge dynamic memory allocation in strerror() from glibc > 2.40
With glibc commit 25a5eb4010df ("string: strerror, strsignal cannot
use buffer after dlmopen (bug 32026)"), strerror() now needs, at least
on x86, the getrandom() and brk() system calls, in order to fill in
the locale-translated error message. But getrandom() and brk() are not
allowed by our seccomp profiles.
This became visible on Fedora Rawhide with the "podman login and
logout" Podman tests, defined at test/e2e/login_logout_test.go in the
Podman source tree, where pasta would terminate upon printing error
descriptions (at least the ones related to the SO_ERROR queue for
spliced connections).
Avoid dynamic memory allocation by calling strerrordesc_np() instead,
which is a GNU function returning a static, untranslated version of
the error description. If it's not available, keep calling strerror(),
which at that point should be simple enough as to be usable (at least,
that's currently the case for musl).
Reported-by: Paul Holzinger <pholzing@redhat.com>
Link: https://github.com/containers/podman/issues/24804
Analysed-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: Paul Holzinger <pholzing@redhat.com>
2024-12-11 00:13:39 +01:00
|
|
|
strerror_(errno));
|
2024-08-02 18:10:36 +02:00
|
|
|
goto cancel;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (flowside_connect(c, uflow->s[TGTSIDE], tgtpif, tgt) < 0) {
|
|
|
|
flow_dbg(uflow,
|
|
|
|
"Couldn't connect flow socket: %s",
|
treewide: Dodge dynamic memory allocation in strerror() from glibc > 2.40
With glibc commit 25a5eb4010df ("string: strerror, strsignal cannot
use buffer after dlmopen (bug 32026)"), strerror() now needs, at least
on x86, the getrandom() and brk() system calls, in order to fill in
the locale-translated error message. But getrandom() and brk() are not
allowed by our seccomp profiles.
This became visible on Fedora Rawhide with the "podman login and
logout" Podman tests, defined at test/e2e/login_logout_test.go in the
Podman source tree, where pasta would terminate upon printing error
descriptions (at least the ones related to the SO_ERROR queue for
spliced connections).
Avoid dynamic memory allocation by calling strerrordesc_np() instead,
which is a GNU function returning a static, untranslated version of
the error description. If it's not available, keep calling strerror(),
which at that point should be simple enough as to be usable (at least,
that's currently the case for musl).
Reported-by: Paul Holzinger <pholzing@redhat.com>
Link: https://github.com/containers/podman/issues/24804
Analysed-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: Paul Holzinger <pholzing@redhat.com>
2024-12-11 00:13:39 +01:00
|
|
|
strerror_(errno));
|
2024-08-02 18:10:36 +02:00
|
|
|
goto cancel;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* It's possible, if unlikely, that we could receive some
|
|
|
|
* unrelated packets in between the bind() and connect() of this
|
|
|
|
* socket. For now we just discard these. We could consider
|
|
|
|
* trying to redirect these to an appropriate handler, if we
|
|
|
|
* need to.
|
|
|
|
*/
|
|
|
|
rc = recvmmsg(uflow->s[TGTSIDE], discard, ARRAY_SIZE(discard),
|
|
|
|
MSG_DONTWAIT, NULL);
|
|
|
|
if (rc >= ARRAY_SIZE(discard)) {
|
|
|
|
flow_dbg(uflow,
|
|
|
|
"Too many (%d) spurious reply datagrams", rc);
|
|
|
|
goto cancel;
|
|
|
|
} else if (rc > 0) {
|
|
|
|
flow_trace(uflow,
|
|
|
|
"Discarded %d spurious reply datagrams", rc);
|
|
|
|
} else if (errno != EAGAIN) {
|
|
|
|
flow_err(uflow,
|
|
|
|
"Unexpected error discarding datagrams: %s",
|
treewide: Dodge dynamic memory allocation in strerror() from glibc > 2.40
With glibc commit 25a5eb4010df ("string: strerror, strsignal cannot
use buffer after dlmopen (bug 32026)"), strerror() now needs, at least
on x86, the getrandom() and brk() system calls, in order to fill in
the locale-translated error message. But getrandom() and brk() are not
allowed by our seccomp profiles.
This became visible on Fedora Rawhide with the "podman login and
logout" Podman tests, defined at test/e2e/login_logout_test.go in the
Podman source tree, where pasta would terminate upon printing error
descriptions (at least the ones related to the SO_ERROR queue for
spliced connections).
Avoid dynamic memory allocation by calling strerrordesc_np() instead,
which is a GNU function returning a static, untranslated version of
the error description. If it's not available, keep calling strerror(),
which at that point should be simple enough as to be usable (at least,
that's currently the case for musl).
Reported-by: Paul Holzinger <pholzing@redhat.com>
Link: https://github.com/containers/podman/issues/24804
Analysed-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: Paul Holzinger <pholzing@redhat.com>
2024-12-11 00:13:39 +01:00
|
|
|
strerror_(errno));
|
2024-08-02 18:10:36 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
|
|
|
|
|
|
|
|
/* If the target side is a socket, it will be a reply socket that knows
|
|
|
|
* its own flowside. But if it's tap, then we need to look it up by
|
|
|
|
* hash.
|
|
|
|
*/
|
|
|
|
if (!pif_is_socket(tgtpif))
|
|
|
|
flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
|
|
|
|
FLOW_ACTIVATE(uflow);
|
|
|
|
|
|
|
|
return FLOW_SIDX(uflow, TGTSIDE);
|
|
|
|
|
|
|
|
cancel:
|
|
|
|
if (uflow)
|
|
|
|
udp_flow_close(c, uflow);
|
|
|
|
flow_alloc_cancel(flow);
|
|
|
|
return FLOW_SIDX_NONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* udp_flow_from_sock() - Find or create UDP flow for "listening" socket
|
|
|
|
* @c: Execution context
|
|
|
|
* @ref: epoll reference of the receiving socket
|
|
|
|
* @s_in: Source socket address, filled in by recvmmsg()
|
|
|
|
* @now: Timestamp
|
|
|
|
*
|
treewide: Allow additional system calls for i386/i686
I haven't tested i386 for a long time (after playing with some
openSUSE i586 image a couple of years ago). It turns out that a number
of system calls we actually need were denied by the seccomp filter,
and not even basic functionality works.
Add some system calls that glibc started using with the 64-bit time
("t64") transition, see also:
https://wiki.debian.org/ReleaseGoals/64bit-time
that is: clock_gettime64, timerfd_gettime64, fcntl64, and
recvmmsg_time64.
Add further system calls that are needed regardless of time_t width,
that is, mmap2 (valgrind profile only), _llseek and sigreturn (common
outside x86_64), and socketcall (same as s390x).
I validated this against an almost full run of the test suite, with
just a few selected tests skipped. Fixes needed to run most tests on
i386/i686, and other assorted fixes for tests, are included in
upcoming patches.
Reported-by: Uroš Knupleš <uros@knuples.net>
Analysed-by: Faidon Liambotis <paravoid@debian.org>
Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1078981
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
2024-08-19 23:42:30 +02:00
|
|
|
* #syscalls fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
|
2024-08-02 18:10:36 +02:00
|
|
|
*
|
|
|
|
* Return: sidx for the destination side of the flow for this packet, or
|
|
|
|
* FLOW_SIDX_NONE if we couldn't find or create a flow.
|
|
|
|
*/
|
|
|
|
flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
|
|
|
|
const union sockaddr_inany *s_in,
|
|
|
|
const struct timespec *now)
|
|
|
|
{
|
2024-12-05 15:26:01 +11:00
|
|
|
const struct flowside *ini;
|
2024-08-02 18:10:36 +02:00
|
|
|
struct udp_flow *uflow;
|
|
|
|
union flow *flow;
|
|
|
|
flow_sidx_t sidx;
|
|
|
|
|
|
|
|
ASSERT(ref.type == EPOLL_TYPE_UDP_LISTEN);
|
|
|
|
|
|
|
|
sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, s_in, ref.udp.port);
|
|
|
|
if ((uflow = udp_at_sidx(sidx))) {
|
|
|
|
uflow->ts = now->tv_sec;
|
|
|
|
return flow_sidx_opposite(sidx);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(flow = flow_alloc())) {
|
|
|
|
char sastr[SOCKADDR_STRLEN];
|
|
|
|
|
|
|
|
debug("Couldn't allocate flow for UDP datagram from %s %s",
|
|
|
|
pif_name(ref.udp.pif),
|
|
|
|
sockaddr_ntop(s_in, sastr, sizeof(sastr)));
|
|
|
|
return FLOW_SIDX_NONE;
|
|
|
|
}
|
|
|
|
|
2024-12-05 15:26:01 +11:00
|
|
|
ini = flow_initiate_sa(flow, ref.udp.pif, s_in, ref.udp.port);
|
|
|
|
|
|
|
|
if (!inany_is_unicast(&ini->eaddr) ||
|
|
|
|
ini->eport == 0 || ini->oport == 0) {
|
2024-12-20 12:40:29 +01:00
|
|
|
/* In principle ini->oddr also must be specified, but when we've
|
2024-12-05 15:26:01 +11:00
|
|
|
* been initiated from a socket bound to 0.0.0.0 or ::, we don't
|
|
|
|
* know our address, so we have to leave it unpopulated.
|
|
|
|
*/
|
|
|
|
flow_err(flow, "Invalid endpoint on UDP recvfrom()");
|
|
|
|
flow_alloc_cancel(flow);
|
|
|
|
return FLOW_SIDX_NONE;
|
|
|
|
}
|
|
|
|
|
2024-08-02 18:10:36 +02:00
|
|
|
return udp_flow_new(c, flow, ref.fd, now);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* udp_flow_from_tap() - Find or create UDP flow for tap packets
|
|
|
|
* @c: Execution context
|
|
|
|
* @pif: pif on which the packet is arriving
|
|
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
|
|
* @saddr: Source address on guest side
|
|
|
|
* @daddr: Destination address guest side
|
|
|
|
* @srcport: Source port on guest side
|
|
|
|
* @dstport: Destination port on guest side
|
|
|
|
*
|
|
|
|
* Return: sidx for the destination side of the flow for this packet, or
|
|
|
|
* FLOW_SIDX_NONE if we couldn't find or create a flow.
|
|
|
|
*/
|
|
|
|
flow_sidx_t udp_flow_from_tap(const struct ctx *c,
|
|
|
|
uint8_t pif, sa_family_t af,
|
|
|
|
const void *saddr, const void *daddr,
|
|
|
|
in_port_t srcport, in_port_t dstport,
|
|
|
|
const struct timespec *now)
|
|
|
|
{
|
2024-12-05 15:26:01 +11:00
|
|
|
const struct flowside *ini;
|
2024-08-02 18:10:36 +02:00
|
|
|
struct udp_flow *uflow;
|
|
|
|
union flow *flow;
|
|
|
|
flow_sidx_t sidx;
|
|
|
|
|
|
|
|
ASSERT(pif == PIF_TAP);
|
|
|
|
|
|
|
|
sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr,
|
|
|
|
srcport, dstport);
|
|
|
|
if ((uflow = udp_at_sidx(sidx))) {
|
|
|
|
uflow->ts = now->tv_sec;
|
|
|
|
return flow_sidx_opposite(sidx);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(flow = flow_alloc())) {
|
|
|
|
char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN];
|
|
|
|
|
|
|
|
debug("Couldn't allocate flow for UDP datagram from %s %s:%hu -> %s:%hu",
|
|
|
|
pif_name(pif),
|
|
|
|
inet_ntop(af, saddr, sstr, sizeof(sstr)), srcport,
|
|
|
|
inet_ntop(af, daddr, dstr, sizeof(dstr)), dstport);
|
|
|
|
return FLOW_SIDX_NONE;
|
|
|
|
}
|
|
|
|
|
2024-12-05 15:26:01 +11:00
|
|
|
ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport,
|
|
|
|
daddr, dstport);
|
|
|
|
|
2024-12-20 12:40:29 +01:00
|
|
|
if (inany_is_unspecified(&ini->eaddr) || ini->eport == 0 ||
|
|
|
|
inany_is_unspecified(&ini->oaddr) || ini->oport == 0) {
|
2024-12-05 15:26:01 +11:00
|
|
|
flow_dbg(flow, "Invalid endpoint on UDP packet");
|
|
|
|
flow_alloc_cancel(flow);
|
|
|
|
return FLOW_SIDX_NONE;
|
|
|
|
}
|
2024-08-02 18:10:36 +02:00
|
|
|
|
|
|
|
return udp_flow_new(c, flow, -1, now);
|
|
|
|
}
|
|
|
|
|
2024-09-06 15:17:06 +10:00
|
|
|
/**
|
|
|
|
* udp_flow_defer() - Deferred per-flow handling (clean up aborted flows)
|
|
|
|
* @uflow: Flow to handle
|
|
|
|
*
|
|
|
|
* Return: true if the connection is ready to free, false otherwise
|
|
|
|
*/
|
|
|
|
bool udp_flow_defer(const struct udp_flow *uflow)
|
|
|
|
{
|
|
|
|
return uflow->closed;
|
|
|
|
}
|
|
|
|
|
2024-08-02 18:10:36 +02:00
|
|
|
/**
|
|
|
|
* udp_flow_timer() - Handler for timed events related to a given flow
|
|
|
|
* @c: Execution context
|
|
|
|
* @uflow: UDP flow
|
|
|
|
* @now: Current timestamp
|
|
|
|
*
|
|
|
|
* Return: true if the flow is ready to free, false otherwise
|
|
|
|
*/
|
|
|
|
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
|
|
|
|
const struct timespec *now)
|
|
|
|
{
|
|
|
|
if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
udp_flow_close(c, uflow);
|
|
|
|
return true;
|
|
|
|
}
|