diff --git a/Makefile b/Makefile index cc4f014..a9b4f79 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,8 @@ MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h icmp.h \ isolation.h lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h \ - pcap.h port_fwd.h siphash.h tap.h tcp.h tcp_splice.h udp.h util.h + pcap.h port_fwd.h siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h \ + util.h HEADERS = $(PASST_HEADERS) seccomp.h # On gcc 11 and 12, with -O2 and -flto, tcp_hash() and siphash_20b(), if diff --git a/tcp.c b/tcp.c index 34d7d45..189041c 100644 --- a/tcp.c +++ b/tcp.c @@ -98,7 +98,7 @@ * Connection tracking and storage * ------------------------------- * - * Connections are tracked by the @tc array of struct tcp_conn, containing + * Connections are tracked by the @tc array of struct tcp_tap_conn, containing * addresses, ports, TCP states and parameters. This is statically allocated and * indexed by an arbitrary connection number. The array is compacted whenever a * connection is closed, by remapping the highest connection index in use to the @@ -301,6 +301,8 @@ #include "tcp_splice.h" #include "log.h" +#include "tcp_conn.h" + #define TCP_FRAMES_MEM 128 #define TCP_FRAMES \ (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) @@ -308,7 +310,6 @@ #define TCP_FILE_PRESSURE 30 /* % of c->nofile */ #define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */ -#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1) #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \ TCP_HASH_TABLE_LOAD) @@ -402,117 +403,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ #define OPT_SACK 5 #define OPT_TS 8 -/** - * struct tcp_conn - Descriptor for a TCP connection (not spliced) - * @next_index: Connection index of next item in hash chain, -1 for none - * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS - * @sock: Socket descriptor number - * @events: Connection events, implying connection states - * @timer: timerfd descriptor for timeout events - * @flags: Connection flags representing internal attributes - * @hash_bucket: Bucket index in connection lookup hash table - * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT - * @ws_from_tap: Window scaling factor advertised from tap/guest - * @ws_to_tap: Window scaling factor advertised to tap/guest - * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS - * @seq_dup_ack_approx: Last duplicate ACK number sent to tap - * @a.a6: IPv6 remote address, can be IPv4-mapped - * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 - * @a.a4.one: Ones prefix for IPv4-mapped - * @a.a4.a: IPv4 address - * @tap_port: Guest-facing tap port - * @sock_port: Remote, socket-facing port - * @wnd_from_tap: Last window size from tap, unscaled (as received) - * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) - * @seq_to_tap: Next sequence for packets to tap - * @seq_ack_from_tap: Last ACK number received from tap - * @seq_from_tap: Next sequence for packets from tap (not actually sent) - * @seq_ack_to_tap: Last ACK number sent to tap - * @seq_init_from_tap: Initial sequence number from tap - */ -struct tcp_conn { - int next_index 
:TCP_CONN_INDEX_BITS + 2; - -#define TCP_RETRANS_BITS 3 - unsigned int retrans :TCP_RETRANS_BITS; -#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) - -#define TCP_WS_BITS 4 /* RFC 7323 */ -#define TCP_WS_MAX 14 - unsigned int ws_from_tap :TCP_WS_BITS; - unsigned int ws_to_tap :TCP_WS_BITS; - - - int sock :SOCKET_REF_BITS; - - uint8_t events; -#define CLOSED 0 -#define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */ -#define TAP_SYN_RCVD BIT(1) /* implies socket connecting */ -#define TAP_SYN_ACK_SENT BIT( 3) /* implies socket connected */ -#define ESTABLISHED BIT(2) -#define SOCK_FIN_RCVD BIT( 3) -#define SOCK_FIN_SENT BIT( 4) -#define TAP_FIN_RCVD BIT( 5) -#define TAP_FIN_SENT BIT( 6) -#define TAP_FIN_ACKED BIT( 7) - -#define CONN_STATE_BITS /* Setting these clears other flags */ \ - (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) - - - int timer :SOCKET_REF_BITS; - - uint8_t flags; -#define STALLED BIT(0) -#define LOCAL BIT(1) -#define WND_CLAMPED BIT(2) -#define IN_EPOLL BIT(3) -#define ACTIVE_CLOSE BIT(4) -#define ACK_TO_TAP_DUE BIT(5) -#define ACK_FROM_TAP_DUE BIT(6) - - - unsigned int hash_bucket :TCP_HASH_BUCKET_BITS; - -#define TCP_MSS_BITS 14 - unsigned int tap_mss :TCP_MSS_BITS; -#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS))) -#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS)) - - -#define SNDBUF_BITS 24 - unsigned int sndbuf :SNDBUF_BITS; -#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS))) -#define SNDBUF_GET(conn) (conn->sndbuf << (32 - SNDBUF_BITS)) - - uint8_t seq_dup_ack_approx; - - - union { - struct in6_addr a6; - struct { - uint8_t zero[10]; - uint8_t one[2]; - struct in_addr a; - } a4; - } a; #define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6) #define CONN_V6(conn) (!CONN_V4(conn)) - - in_port_t tap_port; - in_port_t sock_port; - - uint16_t wnd_from_tap; - uint16_t wnd_to_tap; - - uint32_t seq_to_tap; - uint32_t seq_ack_from_tap; - uint32_t seq_from_tap; - uint32_t 
seq_ack_to_tap; - uint32_t seq_init_from_tap; -}; - #define CONN_IS_CLOSING(conn) \ ((conn->events & ESTABLISHED) && \ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) @@ -695,7 +587,7 @@ static unsigned int tcp6_l2_flags_buf_used; static size_t tcp6_l2_flags_buf_bytes; /* TCP connections */ -static struct tcp_conn tc[TCP_MAX_CONNS]; +static struct tcp_tap_conn tc[TCP_MAX_CONNS]; #define CONN(index) (tc + (index)) #define CONN_IDX(conn) ((conn) - tc) @@ -705,7 +597,7 @@ static struct tcp_conn tc[TCP_MAX_CONNS]; * * Return: pointer to connection, or NULL if @index is out of bounds */ -static inline struct tcp_conn *conn_at_idx(int index) +static inline struct tcp_tap_conn *conn_at_idx(int index) { if ((index < 0) || (index >= TCP_MAX_CONNS)) return NULL; @@ -713,7 +605,7 @@ static inline struct tcp_conn *conn_at_idx(int index) } /* Table for lookup from remote address, local port, remote port */ -static struct tcp_conn *tc_hash[TCP_HASH_TABLE_SIZE]; +static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE]; /* Pools for pre-opened sockets */ int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; @@ -749,7 +641,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) return EPOLLRDHUP; } -static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn, +static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, unsigned long flag); #define conn_flag(c, conn, flag) \ do { \ @@ -764,7 +656,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn, * * Return: 0 on success, negative error code on failure (not on deletion) */ -static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn) +static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { int m = (conn->flags & IN_EPOLL) ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, @@ -809,7 +701,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn) * * #syscalls timerfd_create timerfd_settime */ -static void tcp_timer_ctl(const struct ctx *c, struct tcp_conn *conn) +static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { struct itimerspec it = { { 0 }, { 0 } }; @@ -865,7 +757,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_conn *conn) * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset */ -static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn, +static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, unsigned long flag) { if (flag & (flag - 1)) { @@ -903,7 +795,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_conn *conn, * @conn: Connection pointer * @event: Connection event */ -static void conn_event_do(const struct ctx *c, struct tcp_conn *conn, +static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, unsigned long event) { int prev, new, num = fls(event); @@ -963,7 +855,7 @@ static void conn_event_do(const struct ctx *c, struct tcp_conn *conn, * * Return: 1 if destination is in low RTT table, 0 otherwise */ -static int tcp_rtt_dst_low(const struct tcp_conn *conn) +static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) { int i; @@ -979,7 +871,7 @@ static int tcp_rtt_dst_low(const struct tcp_conn *conn) * @conn: Connection pointer * @tinfo: Pointer to struct tcp_info for socket */ -static void tcp_rtt_dst_check(const struct tcp_conn *conn, +static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, const struct tcp_info *tinfo) { #ifdef HAS_MIN_RTT @@ -1016,7 +908,7 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn, * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage) * @conn: Connection pointer */ -static void tcp_get_sndbuf(struct tcp_conn *conn) +static void 
tcp_get_sndbuf(struct tcp_tap_conn *conn) { int s = conn->sock, sndbuf; socklen_t sl; @@ -1290,7 +1182,8 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, * * Return: 1 on match, 0 otherwise */ -static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr, +static int tcp_hash_match(const struct tcp_tap_conn *conn, + int af, const void *addr, in_port_t tap_port, in_port_t sock_port) { if (af == AF_INET && CONN_V4(conn) && @@ -1356,7 +1249,7 @@ static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr, * @af: Address family, AF_INET or AF_INET6 * @addr: Remote address, pointer to in_addr or in6_addr */ -static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn, +static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn, int af, const void *addr) { int b; @@ -1374,9 +1267,9 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn, * tcp_hash_remove() - Drop connection from hash table, chain unlink * @conn: Connection pointer */ -static void tcp_hash_remove(const struct tcp_conn *conn) +static void tcp_hash_remove(const struct tcp_tap_conn *conn) { - struct tcp_conn *entry, *prev = NULL; + struct tcp_tap_conn *entry, *prev = NULL; int b = conn->hash_bucket; for (entry = tc_hash[b]; entry; @@ -1400,9 +1293,9 @@ static void tcp_hash_remove(const struct tcp_conn *conn) * @old: Old connection pointer * @new: New connection pointer */ -static void tcp_hash_update(struct tcp_conn *old, struct tcp_conn *new) +static void tcp_hash_update(struct tcp_tap_conn *old, struct tcp_tap_conn *new) { - struct tcp_conn *entry, *prev = NULL; + struct tcp_tap_conn *entry, *prev = NULL; int b = old->hash_bucket; for (entry = tc_hash[b]; entry; @@ -1431,12 +1324,13 @@ static void tcp_hash_update(struct tcp_conn *old, struct tcp_conn *new) * * Return: connection pointer, if found, -ENOENT otherwise */ -static struct tcp_conn *tcp_hash_lookup(const struct ctx *c, int af, - const void 
*addr, - in_port_t tap_port, in_port_t sock_port) +static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, + int af, const void *addr, + in_port_t tap_port, + in_port_t sock_port) { int b = tcp_hash(c, af, addr, tap_port, sock_port); - struct tcp_conn *conn; + struct tcp_tap_conn *conn; for (conn = tc_hash[b]; conn; conn = conn_at_idx(conn->next_index)) { if (tcp_hash_match(conn, af, addr, tap_port, sock_port)) @@ -1451,9 +1345,9 @@ static struct tcp_conn *tcp_hash_lookup(const struct ctx *c, int af, * @c: Execution context * @hole: Pointer to recently closed connection */ -static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole) +static void tcp_table_compact(struct ctx *c, struct tcp_tap_conn *hole) { - struct tcp_conn *from, *to; + struct tcp_tap_conn *from, *to; if (CONN_IDX(hole) == --c->tcp.conn_count) { debug("TCP: hash table compaction: maximum index was %li (%p)", @@ -1482,7 +1376,7 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole) * @c: Execution context * @conn: Connection pointer */ -static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn) +static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn) { close(conn->sock); if (conn->timer != -1) @@ -1492,7 +1386,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn) tcp_table_compact(c, conn); } -static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn); +static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); #define tcp_rst(c, conn) \ do { \ debug("TCP: index %li, reset at %s:%i", CONN_IDX(conn), \ @@ -1627,7 +1521,7 @@ void tcp_defer_handler(struct ctx *c) { int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE; int max_files = c->nofile / 100 * TCP_FILE_PRESSURE; - struct tcp_conn *conn; + struct tcp_tap_conn *conn; tcp_l2_flags_buf_flush(c); tcp_l2_data_buf_flush(c); @@ -1656,7 +1550,7 @@ void tcp_defer_handler(struct ctx *c) * Return: 802.3 length, host order */ static size_t 
tcp_l2_buf_fill_headers(const struct ctx *c, - const struct tcp_conn *conn, + const struct tcp_tap_conn *conn, void *p, size_t plen, const uint16_t *check, uint32_t seq) { @@ -1738,7 +1632,7 @@ do { \ * * Return: 1 if sequence or window were updated, 0 otherwise */ -static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_conn *conn, +static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, int force_seq, struct tcp_info *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; @@ -1824,7 +1718,7 @@ out: * * Return: negative error code on connection reset, 0 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) +static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; @@ -1971,7 +1865,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) * @c: Execution context * @conn: Connection pointer */ -static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn) +static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) { if (conn->events == CLOSED) return; @@ -1986,7 +1880,7 @@ static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn) * @opts: Pointer to start of TCP options * @optlen: Bytes in options: caller MUST ensure available length */ -static void tcp_get_tap_ws(struct tcp_conn *conn, +static void tcp_get_tap_ws(struct tcp_tap_conn *conn, const char *opts, size_t optlen) { int ws = tcp_opt_get(opts, optlen, OPT_WS, NULL, NULL); @@ -2003,7 +1897,7 @@ static void tcp_get_tap_ws(struct tcp_conn *conn, * @conn: Connection pointer * @window: Window value, host order, unscaled */ -static void tcp_clamp_window(const struct ctx *c, struct tcp_conn *conn, +static void tcp_clamp_window(const struct ctx *c, struct tcp_tap_conn *conn, unsigned wnd) { uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap; @@ -2125,7 +2019,7 @@ static int 
tcp_conn_new_sock(const struct ctx *c, sa_family_t af) * Return: clamped MSS value */ static uint16_t tcp_conn_tap_mss(const struct ctx *c, - const struct tcp_conn *conn, + const struct tcp_tap_conn *conn, const char *opts, size_t optlen) { unsigned int mss; @@ -2172,7 +2066,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, .sin6_addr = *(struct in6_addr *)addr, }; const struct sockaddr *sa; - struct tcp_conn *conn; + struct tcp_tap_conn *conn; socklen_t sl; int s, mss; @@ -2280,7 +2174,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, * * Return: 0 on success, negative error code from recv() on failure */ -static int tcp_sock_consume(struct tcp_conn *conn, uint32_t ack_seq) +static int tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq) { /* Simply ignore out-of-order ACKs: we already consumed the data we * needed from the buffer, and we won't rewind back to a lower ACK @@ -2307,7 +2201,7 @@ static int tcp_sock_consume(struct tcp_conn *conn, uint32_t ack_seq) * @seq: Sequence number to be sent * @now: Current timestamp */ -static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, +static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, ssize_t plen, int no_csum, uint32_t seq) { struct iovec *iov; @@ -2344,7 +2238,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn) +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; @@ -2475,7 +2369,7 @@ zero_len: * * #syscalls sendmsg */ -static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, +static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, const struct pool *p) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; @@ -2675,7 +2569,7 @@ out: * 
@opts: Pointer to start of options * @optlen: Bytes in options: caller MUST ensure available length */ -static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, +static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, const struct tcphdr *th, const char *opts, size_t optlen) { @@ -2714,7 +2608,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, int tcp_tap_handler(struct ctx *c, int af, const void *addr, const struct pool *p, const struct timespec *now) { - struct tcp_conn *conn; + struct tcp_tap_conn *conn; size_t optlen, len; struct tcphdr *th; int ack_due = 0; @@ -2829,7 +2723,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *addr, * @c: Execution context * @conn: Connection pointer */ -static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn) +static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) { socklen_t sl; int so; @@ -2857,7 +2751,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, const struct timespec *now) { struct sockaddr_storage sa; - struct tcp_conn *conn; + struct tcp_tap_conn *conn; socklen_t sl; int s; @@ -2949,7 +2843,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, */ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) { - struct tcp_conn *conn = conn_at_idx(ref.r.p.tcp.tcp.index); + struct tcp_tap_conn *conn = conn_at_idx(ref.r.p.tcp.tcp.index); struct itimerspec check_armed = { { 0 }, { 0 } }; if (!conn) @@ -3012,7 +2906,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - struct tcp_conn *conn; + struct tcp_tap_conn *conn; if (ref.r.p.tcp.tcp.timer) { tcp_timer_handler(c, ref); @@ -3510,7 +3404,7 @@ static int tcp_port_rebind(void *arg) void tcp_timer(struct ctx *c, const struct timespec *ts) { struct tcp_sock_refill_arg refill_arg = { c, 0 }; - struct 
tcp_conn *conn; + struct tcp_tap_conn *conn; (void)ts; diff --git a/tcp_conn.h b/tcp_conn.h new file mode 100644 index 0000000..db4c2d9 --- /dev/null +++ b/tcp_conn.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: AGPL-3.0-or-later + * Copyright Red Hat + * Author: Stefano Brivio + * Author: David Gibson + * + * TCP connection tracking data structures, used by tcp.c and + * tcp_splice.c. Shouldn't be included in non-TCP code. + */ +#ifndef TCP_CONN_H +#define TCP_CONN_H + +#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1) + +/** + * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) + * @next_index: Connection index of next item in hash chain, -1 for none + * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS + * @sock: Socket descriptor number + * @events: Connection events, implying connection states + * @timer: timerfd descriptor for timeout events + * @flags: Connection flags representing internal attributes + * @hash_bucket: Bucket index in connection lookup hash table + * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT + * @ws_from_tap: Window scaling factor advertised from tap/guest + * @ws_to_tap: Window scaling factor advertised to tap/guest + * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS + * @seq_dup_ack_approx: Last duplicate ACK number sent to tap + * @a.a6: IPv6 remote address, can be IPv4-mapped + * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 + * @a.a4.one: Ones prefix for IPv4-mapped + * @a.a4.a: IPv4 address + * @tap_port: Guest-facing tap port + * @sock_port: Remote, socket-facing port + * @wnd_from_tap: Last window size from tap, unscaled (as received) + * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) + * @seq_to_tap: Next sequence for packets to tap + * @seq_ack_from_tap: Last ACK number received from tap + * @seq_from_tap: Next sequence for packets from tap (not actually sent) + * @seq_ack_to_tap: Last ACK number sent to tap + * 
@seq_init_from_tap: Initial sequence number from tap + */ +struct tcp_tap_conn { + int next_index :TCP_CONN_INDEX_BITS + 2; + +#define TCP_RETRANS_BITS 3 + unsigned int retrans :TCP_RETRANS_BITS; +#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) + +#define TCP_WS_BITS 4 /* RFC 7323 */ +#define TCP_WS_MAX 14 + unsigned int ws_from_tap :TCP_WS_BITS; + unsigned int ws_to_tap :TCP_WS_BITS; + + + int sock :SOCKET_REF_BITS; + + uint8_t events; +#define CLOSED 0 +#define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */ +#define TAP_SYN_RCVD BIT(1) /* implies socket connecting */ +#define TAP_SYN_ACK_SENT BIT( 3) /* implies socket connected */ +#define ESTABLISHED BIT(2) +#define SOCK_FIN_RCVD BIT( 3) +#define SOCK_FIN_SENT BIT( 4) +#define TAP_FIN_RCVD BIT( 5) +#define TAP_FIN_SENT BIT( 6) +#define TAP_FIN_ACKED BIT( 7) + +#define CONN_STATE_BITS /* Setting these clears other flags */ \ + (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) + + + int timer :SOCKET_REF_BITS; + + uint8_t flags; +#define STALLED BIT(0) +#define LOCAL BIT(1) +#define WND_CLAMPED BIT(2) +#define IN_EPOLL BIT(3) +#define ACTIVE_CLOSE BIT(4) +#define ACK_TO_TAP_DUE BIT(5) +#define ACK_FROM_TAP_DUE BIT(6) + + + unsigned int hash_bucket :TCP_HASH_BUCKET_BITS; + +#define TCP_MSS_BITS 14 + unsigned int tap_mss :TCP_MSS_BITS; +#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS))) +#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS)) + + +#define SNDBUF_BITS 24 + unsigned int sndbuf :SNDBUF_BITS; +#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS))) +#define SNDBUF_GET(conn) (conn->sndbuf << (32 - SNDBUF_BITS)) + + uint8_t seq_dup_ack_approx; + + + union { + struct in6_addr a6; + struct { + uint8_t zero[10]; + uint8_t one[2]; + struct in_addr a; + } a4; + } a; + + in_port_t tap_port; + in_port_t sock_port; + + uint16_t wnd_from_tap; + uint16_t wnd_to_tap; + + uint32_t seq_to_tap; + uint32_t seq_ack_from_tap; + uint32_t seq_from_tap; + uint32_t 
seq_ack_to_tap; + uint32_t seq_init_from_tap; +}; + +/** + * struct tcp_splice_conn - Descriptor for a spliced TCP connection + * @a: File descriptor number of socket for accepted connection + * @pipe_a_b: Pipe ends for splice() from @a to @b + * @b: File descriptor number of peer connected socket + * @pipe_b_a: Pipe ends for splice() from @b to @a + * @events: Events observed/actions performed on connection + * @flags: Connection flags (attributes, not events) + * @a_read: Bytes read from @a (not fully written to @b in one shot) + * @a_written: Bytes written to @a (not fully written from one @b read) + * @b_read: Bytes read from @b (not fully written to @a in one shot) + * @b_written: Bytes written to @b (not fully written from one @a read) +*/ +struct tcp_splice_conn { + int a; + int pipe_a_b[2]; + int b; + int pipe_b_a[2]; + + uint8_t events; +#define SPLICE_CLOSED 0 +#define SPLICE_CONNECT BIT(0) +#define SPLICE_ESTABLISHED BIT(1) +#define A_OUT_WAIT BIT(2) +#define B_OUT_WAIT BIT(3) +#define A_FIN_RCVD BIT(4) +#define B_FIN_RCVD BIT(5) +#define A_FIN_SENT BIT(6) +#define B_FIN_SENT BIT(7) + + uint8_t flags; +#define SPLICE_V6 BIT(0) +#define SPLICE_IN_EPOLL BIT(1) +#define RCVLOWAT_SET_A BIT(2) +#define RCVLOWAT_SET_B BIT(3) +#define RCVLOWAT_ACT_A BIT(4) +#define RCVLOWAT_ACT_B BIT(5) +#define CLOSING BIT(6) + + uint32_t a_read; + uint32_t a_written; + uint32_t b_read; + uint32_t b_written; +}; + +#endif /* TCP_CONN_H */ diff --git a/tcp_splice.c b/tcp_splice.c index 4cc4ad2..cbfab01 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -21,12 +21,12 @@ * * - SPLICE_CONNECT: connection accepted, connecting to target * - SPLICE_ESTABLISHED: connection to target established - * - SPLICE_A_OUT_WAIT: pipe to accepted socket full, wait for EPOLLOUT - * - SPLICE_B_OUT_WAIT: pipe to target socket full, wait for EPOLLOUT - * - SPLICE_A_FIN_RCVD: FIN (EPOLLRDHUP) seen from accepted socket - * - SPLICE_B_FIN_RCVD: FIN (EPOLLRDHUP) seen from target socket - * - 
SPLICE_A_FIN_RCVD: FIN (write shutdown) sent to accepted socket - * - SPLICE_B_FIN_RCVD: FIN (write shutdown) sent to target socket + * - A_OUT_WAIT: pipe to accepted socket full, wait for EPOLLOUT + * - B_OUT_WAIT: pipe to target socket full, wait for EPOLLOUT + * - A_FIN_RCVD: FIN (EPOLLRDHUP) seen from accepted socket + * - B_FIN_RCVD: FIN (EPOLLRDHUP) seen from target socket + * - A_FIN_SENT: FIN (write shutdown) sent to accepted socket + * - B_FIN_SENT: FIN (write shutdown) sent to target socket * * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64 */ @@ -52,6 +52,8 @@ #include "log.h" #include "tcp_splice.h" +#include "tcp_conn.h" + #define MAX_PIPE_SIZE (8UL * 1024 * 1024) #define TCP_SPLICE_PIPE_POOL_SIZE 16 #define TCP_SPLICE_CONN_PRESSURE 30 /* % of splice_conn_count */ @@ -66,52 +68,7 @@ extern int ns_sock_pool6 [TCP_SOCK_POOL_SIZE]; /* Pool of pre-opened pipes */ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2]; -/** - * struct tcp_splice_conn - Descriptor for a spliced TCP connection - * @a: File descriptor number of socket for accepted connection - * @pipe_a_b: Pipe ends for splice() from @a to @b - * @b: File descriptor number of peer connected socket - * @pipe_b_a: Pipe ends for splice() from @b to @a - * @events: Events observed/actions performed on connection - * @flags: Connection flags (attributes, not events) - * @a_read: Bytes read from @a (not fully written to @b in one shot) - * @a_written: Bytes written to @a (not fully written from one @b read) - * @b_read: Bytes read from @b (not fully written to @a in one shot) - * @b_written: Bytes written to @b (not fully written from one @a read) -*/ -struct tcp_splice_conn { - int a; - int pipe_a_b[2]; - int b; - int pipe_b_a[2]; - - uint8_t events; -#define CLOSED 0 -#define CONNECT BIT(0) -#define ESTABLISHED BIT(1) -#define A_OUT_WAIT BIT(2) -#define B_OUT_WAIT BIT(3) -#define A_FIN_RCVD BIT(4) -#define B_FIN_RCVD BIT(5) -#define A_FIN_SENT BIT(6) -#define 
B_FIN_SENT BIT(7) - - uint8_t flags; -#define SOCK_V6 BIT(0) -#define IN_EPOLL BIT(1) -#define RCVLOWAT_SET_A BIT(2) -#define RCVLOWAT_SET_B BIT(3) -#define RCVLOWAT_ACT_A BIT(4) -#define RCVLOWAT_ACT_B BIT(5) -#define CLOSING BIT(6) - - uint32_t a_read; - uint32_t a_written; - uint32_t b_read; - uint32_t b_written; -}; - -#define CONN_V6(x) (x->flags & SOCK_V6) +#define CONN_V6(x) (x->flags & SPLICE_V6) #define CONN_V4(x) (!CONN_V6(x)) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) #define CONN(index) (tc_splice + (index)) @@ -122,13 +79,13 @@ static struct tcp_splice_conn tc_splice[TCP_SPLICE_MAX_CONNS]; /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { - "CONNECT", "ESTABLISHED", "A_OUT_WAIT", "B_OUT_WAIT", + "SPLICE_CONNECT", "SPLICE_ESTABLISHED", "A_OUT_WAIT", "B_OUT_WAIT", "A_FIN_RCVD", "B_FIN_RCVD", "A_FIN_SENT", "B_FIN_SENT", }; /* Display strings for connection flags */ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = { - "SOCK_V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B", + "SPLICE_V6", "SPLICE_IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B", "RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING", }; @@ -143,12 +100,12 @@ static void tcp_splice_conn_epoll_events(uint16_t events, { *a = *b = 0; - if (events & ESTABLISHED) { + if (events & SPLICE_ESTABLISHED) { if (!(events & B_FIN_SENT)) *a = EPOLLIN | EPOLLRDHUP; if (!(events & A_FIN_SENT)) *b = EPOLLIN | EPOLLRDHUP; - } else if (events & CONNECT) { + } else if (events & SPLICE_CONNECT) { *b = EPOLLOUT; } @@ -210,7 +167,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, static int tcp_splice_epoll_ctl(const struct ctx *c, struct tcp_splice_conn *conn) { - int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = (conn->flags & SPLICE_IN_EPOLL) ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a, .r.p.tcp.tcp.splice = 1, .r.p.tcp.tcp.index = CONN_IDX(conn), @@ -234,7 +191,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, epoll_ctl(c->epollfd, m, conn->b, &ev_b)) goto delete; - conn->flags |= IN_EPOLL; /* No need to log this */ + conn->flags |= SPLICE_IN_EPOLL; /* No need to log this */ return 0; @@ -323,7 +280,7 @@ static void tcp_table_splice_compact(struct ctx *c, */ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) { - if (conn->events & ESTABLISHED) { + if (conn->events & SPLICE_ESTABLISHED) { /* Flushing might need to block: don't recycle them. */ if (conn->pipe_a_b[0] != -1) { close(conn->pipe_a_b[0]); @@ -337,7 +294,7 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) } } - if (conn->events & CONNECT) { + if (conn->events & SPLICE_CONNECT) { close(conn->b); conn->b = -1; } @@ -346,7 +303,7 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) conn->a = -1; conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0; - conn->events = CLOSED; + conn->events = SPLICE_CLOSED; conn->flags = 0; debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn)); @@ -397,8 +354,8 @@ static int tcp_splice_connect_finish(const struct ctx *c, } } - if (!(conn->events & ESTABLISHED)) - conn_event(c, conn, ESTABLISHED); + if (!(conn->events & SPLICE_ESTABLISHED)) + conn_event(c, conn, SPLICE_ESTABLISHED); return 0; } @@ -466,9 +423,9 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, close(sock_conn); return ret; } - conn_event(c, conn, CONNECT); + conn_event(c, conn, SPLICE_CONNECT); } else { - conn_event(c, conn, ESTABLISHED); + conn_event(c, conn, SPLICE_ESTABLISHED); return tcp_splice_connect_finish(c, conn); } @@ -598,7 +555,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, conn = CONN(c->tcp.splice_conn_count++); conn->a = s; 
- conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0; + conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0; if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index, ref.r.p.tcp.tcp.outbound)) @@ -609,13 +566,13 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, conn = CONN(ref.r.p.tcp.tcp.index); - if (conn->events == CLOSED) + if (conn->events == SPLICE_CLOSED) return; if (events & EPOLLERR) goto close; - if (conn->events == CONNECT) { + if (conn->events == SPLICE_CONNECT) { if (!(events & EPOLLOUT)) goto close; if (tcp_splice_connect_finish(c, conn))