passt: Relicense to GPL 2.0, or any later version
In practical terms, passt doesn't benefit from the additional
protection offered by the AGPL over the GPL, because it's not
suitable to be executed over a computer network.
Further, restricting the distribution under the version 3 of the GPL
wouldn't provide any practical advantage either, as far as the passt
codebase is concerned, and might cause unnecessary compatibility
dilemmas.
Change licensing terms to the GNU General Public License Version 2,
or any later version, with written permission from all current and
past contributors, namely: myself, David Gibson, Laine Stump, Andrea
Bolognani, Paul Holzinger, Richard W.M. Jones, Chris Kuhn, Florian
Weimer, Giuseppe Scrivano, Stefan Hajnoczi, and Vasiliy Ulyanov.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2023-04-05 20:11:44 +02:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later
|
2022-11-17 16:58:43 +11:00
|
|
|
* Copyright Red Hat
|
|
|
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
|
|
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
|
|
|
*
|
|
|
|
* TCP connection tracking data structures, used by tcp.c and
|
|
|
|
* tcp_splice.c. Shouldn't be included in non-TCP code.
|
|
|
|
*/
|
|
|
|
#ifndef TCP_CONN_H
|
|
|
|
#define TCP_CONN_H
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
|
2023-11-30 13:02:08 +11:00
|
|
|
* @f: Generic flow information
|
2023-08-22 15:29:58 +10:00
|
|
|
* @in_epoll: Is the connection in the epoll set?
|
2024-07-18 15:26:29 +10:00
|
|
|
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
|
|
|
|
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
|
|
|
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
2022-11-17 16:58:43 +11:00
|
|
|
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
|
|
|
* @sock: Socket descriptor number
|
|
|
|
* @events: Connection events, implying connection states
|
|
|
|
* @timer: timerfd descriptor for timeout events
|
|
|
|
* @flags: Connection flags representing internal attributes
|
|
|
|
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
|
|
|
|
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
|
|
|
|
* @wnd_from_tap: Last window size from tap, unscaled (as received)
|
|
|
|
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
|
|
|
|
* @seq_to_tap: Next sequence for packets to tap
|
|
|
|
* @seq_ack_from_tap: Last ACK number received from tap
|
|
|
|
* @seq_from_tap: Next sequence for packets from tap (not actually sent)
|
|
|
|
* @seq_ack_to_tap: Last ACK number sent to tap
|
|
|
|
* @seq_init_from_tap: Initial sequence number from tap
|
|
|
|
*/
|
|
|
|
struct tcp_tap_conn {
|
2023-11-30 13:02:08 +11:00
|
|
|
/* Must be first element */
|
|
|
|
struct flow_common f;
|
2022-11-17 16:58:44 +11:00
|
|
|
|
2023-08-22 15:29:58 +10:00
|
|
|
bool in_epoll :1;
|
2022-11-17 16:58:43 +11:00
|
|
|
|
|
|
|
#define TCP_RETRANS_BITS 3
|
|
|
|
unsigned int retrans :TCP_RETRANS_BITS;
|
2023-02-27 03:30:01 +01:00
|
|
|
#define TCP_MAX_RETRANS MAX_FROM_BITS(TCP_RETRANS_BITS)
|
2022-11-17 16:58:43 +11:00
|
|
|
|
|
|
|
#define TCP_WS_BITS 4 /* RFC 7323 */
|
|
|
|
#define TCP_WS_MAX 14
|
|
|
|
unsigned int ws_from_tap :TCP_WS_BITS;
|
|
|
|
unsigned int ws_to_tap :TCP_WS_BITS;
|
|
|
|
|
2024-07-18 15:26:29 +10:00
|
|
|
#define TCP_MSS_BITS 14
|
|
|
|
unsigned int tap_mss :TCP_MSS_BITS;
|
|
|
|
#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
|
|
|
|
#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS))
|
2022-11-17 16:58:43 +11:00
|
|
|
|
2023-08-11 15:12:21 +10:00
|
|
|
int sock :FD_REF_BITS;
|
2022-11-17 16:58:43 +11:00
|
|
|
|
|
|
|
uint8_t events;
|
|
|
|
#define CLOSED 0
|
|
|
|
#define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */
|
|
|
|
#define TAP_SYN_RCVD BIT(1) /* implies socket connecting */
|
|
|
|
#define TAP_SYN_ACK_SENT BIT( 3) /* implies socket connected */
|
|
|
|
#define ESTABLISHED BIT(2)
|
|
|
|
#define SOCK_FIN_RCVD BIT( 3)
|
|
|
|
#define SOCK_FIN_SENT BIT( 4)
|
|
|
|
#define TAP_FIN_RCVD BIT( 5)
|
|
|
|
#define TAP_FIN_SENT BIT( 6)
|
|
|
|
#define TAP_FIN_ACKED BIT( 7)
|
|
|
|
|
|
|
|
#define CONN_STATE_BITS /* Setting these clears other flags */ \
|
|
|
|
(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
|
|
|
|
|
|
|
|
|
2023-08-11 15:12:21 +10:00
|
|
|
int timer :FD_REF_BITS;
|
2022-11-17 16:58:43 +11:00
|
|
|
|
|
|
|
uint8_t flags;
|
|
|
|
#define STALLED BIT(0)
|
|
|
|
#define LOCAL BIT(1)
|
tcp: Don't use TCP_WINDOW_CLAMP
On the L2 tap side, we see TCP headers and know the TCP window that the
ultimate receiver is advertising. In order to avoid unnecessary buffering
within passt/pasta (or by the kernel on passt/pasta's behalf) we attempt
to advertise that window back to the original sock-side sender using
TCP_WINDOW_CLAMP.
However, TCP_WINDOW_CLAMP just doesn't work like this. Prior to kernel
commit 3aa7857fe1d7 ("tcp: enable mid stream window clamp"), it simply
had no effect on established sockets. After that commit, it does affect
established sockets but doesn't behave the way we need:
* It appears to be designed only to shrink the window, not to allow it to
re-expand.
* More importantly, that commit has a serious bug where if the
setsockopt() is made when the existing kernel advertised window for the
socket happens to be zero, it will now become locked at zero, stopping
any further data from being received on the socket.
Since this has never worked as intended, simply remove it. It might be
possible to re-implement the intended behaviour by manipulating SO_RCVBUF,
so we leave a comment to that effect.
This kernel bug is the underlying cause of both the linked passt bug and
the linked podman bug. We attempted to fix this before with passt commit
d3192f67 ("tcp: Force TCP_WINDOW_CLAMP before resetting STALLED flag").
However while that commit masked the bug for some cases, it didn't really
address the problem.
Fixes: d3192f67c492 ("tcp: Force TCP_WINDOW_CLAMP before resetting STALLED flag")
Link: https://github.com/containers/podman/issues/20170
Link: https://bugs.passt.top/show_bug.cgi?id=74
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2023-11-09 20:54:00 +11:00
|
|
|
#define ACTIVE_CLOSE BIT(2)
|
|
|
|
#define ACK_TO_TAP_DUE BIT(3)
|
|
|
|
#define ACK_FROM_TAP_DUE BIT(4)
|
tcp: Mask EPOLLIN altogether if we're blocked waiting on an ACK from the guest
There are pretty much two cases of the (misnomer) STALLED: in one
case, we could send more data to the guest if it becomes available,
and in another case, we can't, because we filled the window.
If, in this second case, we keep EPOLLIN enabled, but never read from
the socket, we get short but CPU-annoying storms of EPOLLIN events,
upon which we reschedule the ACK timeout handler, never read from the
socket, go back to epoll_wait(), and so on:
timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0
epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1
timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0
epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1
timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0
epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1
also known as:
29.1517: Flow 2 (TCP connection): timer expires in 2.000s
29.1517: Flow 2 (TCP connection): timer expires in 2.000s
29.1517: Flow 2 (TCP connection): timer expires in 2.000s
which, for some reason, becomes very visible with muvm and aria2c
downloading from a server nearby in parallel chunks.
That's because EPOLLIN isn't cleared if we don't read from the socket,
and even with EPOLLET, epoll_wait() will repeatedly wake us up until
we actually read something.
In this case, we don't want to subscribe to EPOLLIN at all: all we're
waiting for is an ACK segment from the guest. Differentiate this case
with a new connection flag, ACK_FROM_TAP_BLOCKS, which doesn't just
indicate that we're waiting for an ACK from the guest
(ACK_FROM_TAP_DUE), but also that we're blocked waiting for it.
If this flag is set before we set STALLED, EPOLLIN will be masked
while we set EPOLLET because of STALLED. Whenever we clear STALLED,
we also clear this flag.
This is definitely not elegant, but it's a minimal fix.
We can probably simplify this at a later point by having a category
of connection flags directly corresponding to epoll flags, and
dropping STALLED altogether, or, perhaps, always using EPOLLET (but
we need a mechanism to re-check sockets for pending data if we can't
temporarily write to the guest).
I suspect that this might also be implied in
https://github.com/containers/podman/issues/23686, hence the Link:
tag. It doesn't necessarily mean I'm fixing it (I can't reproduce
that).
Link: https://github.com/containers/podman/issues/23686
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2025-01-16 20:47:00 +01:00
|
|
|
#define ACK_FROM_TAP_BLOCKS BIT(5)
|
2022-11-17 16:58:43 +11:00
|
|
|
|
|
|
|
#define SNDBUF_BITS 24
|
|
|
|
unsigned int sndbuf :SNDBUF_BITS;
|
|
|
|
#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
|
|
|
|
#define SNDBUF_GET(conn) (conn->sndbuf << (32 - SNDBUF_BITS))
|
|
|
|
|
|
|
|
uint8_t seq_dup_ack_approx;
|
|
|
|
|
|
|
|
uint16_t wnd_from_tap;
|
|
|
|
uint16_t wnd_to_tap;
|
|
|
|
|
|
|
|
uint32_t seq_to_tap;
|
|
|
|
uint32_t seq_ack_from_tap;
|
|
|
|
uint32_t seq_from_tap;
|
|
|
|
uint32_t seq_ack_to_tap;
|
|
|
|
uint32_t seq_init_from_tap;
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
|
2023-11-30 13:02:08 +11:00
|
|
|
* @f: Generic flow information
|
2023-11-07 13:42:46 +11:00
|
|
|
* @s: File descriptor for sockets
|
|
|
|
* @pipe: File descriptors for pipes
|
|
|
|
* @read: Bytes read (not fully written to other side in one shot)
|
|
|
|
* @written: Bytes written (not fully written from one other side read)
|
2024-07-18 15:26:29 +10:00
|
|
|
* @events: Events observed/actions performed on connection
|
|
|
|
* @flags: Connection flags (attributes, not events)
|
|
|
|
* @in_epoll: Is the connection in the epoll set?
|
|
|
|
*/
|
2022-11-17 16:58:43 +11:00
|
|
|
struct tcp_splice_conn {
|
2023-11-30 13:02:08 +11:00
|
|
|
/* Must be first element */
|
|
|
|
struct flow_common f;
|
2022-11-17 16:58:44 +11:00
|
|
|
|
2023-11-07 13:42:46 +11:00
|
|
|
int s[SIDES];
|
|
|
|
int pipe[SIDES][2];
|
2022-11-17 16:58:43 +11:00
|
|
|
|
2024-07-18 15:26:29 +10:00
|
|
|
uint32_t read[SIDES];
|
|
|
|
uint32_t written[SIDES];
|
|
|
|
|
2022-11-17 16:58:43 +11:00
|
|
|
uint8_t events;
|
|
|
|
#define SPLICE_CLOSED 0
|
|
|
|
#define SPLICE_CONNECT BIT(0)
|
|
|
|
#define SPLICE_ESTABLISHED BIT(1)
|
2024-07-17 14:52:21 +10:00
|
|
|
#define OUT_WAIT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
|
|
|
|
#define FIN_RCVD(sidei_) ((sidei_) ? BIT(5) : BIT(4))
|
|
|
|
#define FIN_SENT(sidei_) ((sidei_) ? BIT(7) : BIT(6))
|
2022-11-17 16:58:43 +11:00
|
|
|
|
|
|
|
uint8_t flags;
|
2024-07-18 15:26:33 +10:00
|
|
|
#define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0))
|
|
|
|
#define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
|
|
|
|
#define CLOSING BIT(4)
|
2022-11-17 16:58:43 +11:00
|
|
|
|
2024-07-18 15:26:29 +10:00
|
|
|
bool in_epoll :1;
|
2022-11-17 16:58:43 +11:00
|
|
|
};
|
|
|
|
|
2023-02-14 10:48:21 +11:00
|
|
|
/* Socket pools */
#define TCP_SOCK_POOL_SIZE		32

/* Declared only: storage for both pools is defined outside this header.
 * NOTE(review): going by the names and by the sa_family_t arguments below,
 * these are presumably the IPv4 and IPv6 pools -- confirm against the
 * definitions.
 */
extern int	init_sock_pool4		[TCP_SOCK_POOL_SIZE];
extern int	init_sock_pool6		[TCP_SOCK_POOL_SIZE];

/* Per-connection handlers, one set for each connection descriptor type */
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);

/* Socket and socket pool helpers */
int tcp_conn_pool_sock(int pool[]);
int tcp_conn_sock(const struct ctx *c, sa_family_t af);
int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af);
void tcp_splice_refill(const struct ctx *c);
|
2022-11-17 16:58:46 +11:00
|
|
|
|
2022-11-17 16:58:43 +11:00
|
|
|
#endif /* TCP_CONN_H */
|