1
0
mirror of https://passt.top/passt synced 2025-01-03 11:25:24 +00:00

tcp: Replace TCP buffer structure by an iovec array

To be able to provide pointers to TCP headers and IP headers without
worrying about alignment in the structure, split the structure into
several arrays and point to each part of the frame using an iovec array.

Using iovec also allows us to simply ignore the first entry when the
vnet length header is not needed.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Laurent Vivier 2024-04-15 19:01:37 +02:00 committed by Stefano Brivio
parent 27f1c762b1
commit 95601237ef
2 changed files with 266 additions and 270 deletions

533
tcp.c
View File

@ -318,39 +318,14 @@
/* MSS rounding: see SET_MSS() */ /* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536 #define MSS_DEFAULT 536
#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ sizeof(struct tcphdr) - \
#ifdef __AVX2__ sizeof(struct iphdr), \
uint8_t pad[26]; sizeof(uint32_t))
#else #define MSS6 ROUND_DOWN(IP_MAX_MTU - \
uint8_t pad[2]; sizeof(struct tcphdr) - \
#endif sizeof(struct ipv6hdr), \
struct tap_hdr taph; sizeof(uint32_t))
struct iphdr iph;
struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#ifdef __AVX2__
uint8_t pad[14];
#else
uint8_t pad[2];
#endif
struct tap_hdr taph;
struct ipv6hdr ip6h;
struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
#define WINDOW_DEFAULT 14600 /* RFC 6928 */ #define WINDOW_DEFAULT 14600 /* RFC 6928 */
#ifdef HAS_SND_WND #ifdef HAS_SND_WND
@ -445,133 +420,107 @@ struct tcp_buf_seq_update {
}; };
/* Static buffers */ /* Static buffers */
/**
* struct tcp_payload_t - TCP header and data to send segments with payload
* @th: TCP header
* @data: TCP data
*/
struct tcp_payload_t {
struct tcphdr th;
uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif
/** /**
* tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections * struct tcp_flags_t - TCP header and data to send zero-length
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only * segments (flags)
* @taph: Tap-level headers (partially pre-filled) * @th: TCP header
* @iph: Pre-filled IP header (except for tot_len and saddr) * @opts TCP options
* @uh: Headroom for TCP header
* @data: Storage for TCP payload
*/ */
static struct tcp4_l2_buf_t { struct tcp_flags_t {
struct tcphdr th;
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__ #ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */ } __attribute__ ((packed, aligned(32)));
#else #else
uint8_t pad[2]; /* align iph to 4 bytes 0 */ } __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif #endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */
struct tcphdr th; /* 64 40 */
uint8_t data[MSS4]; /* 84 60 */
/* 65536 65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_FRAMES_MEM];
static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; /* Ethernet header for IPv4 frames */
static struct ethhdr tcp4_eth_src;
static unsigned int tcp4_l2_buf_used; static uint32_t tcp4_payload_vnet_len[TCP_FRAMES_MEM];
/* IPv4 headers */
static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
/* TCP segments with payload for IPv4 frames */
static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
/** static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
* tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
* @taph: Tap-level headers (partially pre-filled)
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
* @th: Headroom for TCP header
* @data: Storage for TCP payload
*/
struct tcp6_l2_buf_t {
#ifdef __AVX2__
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
#else
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 14 2 */
struct ipv6hdr ip6h; /* 32 20 */
struct tcphdr th; /* 72 60 */
uint8_t data[MSS6]; /* 92 80 */
/* 65536 65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_FRAMES_MEM];
static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp4_payload_used;
static unsigned int tcp6_l2_buf_used; static uint32_t tcp4_flags_vnet_len[TCP_FRAMES_MEM];
/* IPv4 headers for TCP segment without payload */
static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
/* TCP segments without payload for IPv4 frames */
static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
static unsigned int tcp4_flags_used;
/* Ethernet header for IPv6 frames */
static struct ethhdr tcp6_eth_src;
static uint32_t tcp6_payload_vnet_len[TCP_FRAMES_MEM];
/* IPv6 headers */
static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
/* TCP headers and data for IPv6 frames */
static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
static unsigned int tcp6_payload_used;
static uint32_t tcp6_flags_vnet_len[TCP_FRAMES_MEM];
/* IPv6 headers for TCP segment without payload */
static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
/* TCP segment without payload for IPv6 frames */
static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
static unsigned int tcp6_flags_used;
/* recvmsg()/sendmsg() data for tap */ /* recvmsg()/sendmsg() data for tap */
static char tcp_buf_discard [MAX_WINDOW]; static char tcp_buf_discard [MAX_WINDOW];
static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; /*
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; * enum tcp_iov_parts - I/O vector parts for one TCP frame
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; * @TCP_IOV_VLEN virtio net header
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; * @TCP_IOV_ETH Ethernet header
* @TCP_IOV_IP IP (v4/v6) header
* @TCP_IOV_PAYLOAD IP payload (TCP header + data)
* @TCP_NUM_IOVS the number of entries in the iovec array
*/
enum tcp_iov_parts {
TCP_IOV_VLEN = 0,
TCP_IOV_ETH = 1,
TCP_IOV_IP = 2,
TCP_IOV_PAYLOAD = 3,
TCP_NUM_IOVS
};
static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
/* sendmsg() to socket */ /* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV]; static struct iovec tcp_iov [UIO_MAXIOV];
/**
* tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
* @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr)
* @th: Headroom for TCP header
* @opts: Headroom for TCP options
*/
static struct tcp4_l2_flags_buf_t {
#ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */
#else
uint8_t pad[2]; /* align iph to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */
struct tcphdr th; /* 64 40 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_flags_buf[TCP_FRAMES_MEM];
static unsigned int tcp4_l2_flags_buf_used;
/**
* tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
* @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
* @taph: Tap-level headers (partially pre-filled)
* @ip6h: Pre-filled IP header (except for payload_len and addresses)
* @th: Headroom for TCP header
* @opts: Headroom for TCP options
*/
static struct tcp6_l2_flags_buf_t {
#ifdef __AVX2__
uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
#else
uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
#endif
struct tap_hdr taph; /* 14 2 */
struct ipv6hdr ip6h; /* 32 20 */
struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_flags_buf[TCP_FRAMES_MEM];
static unsigned int tcp6_l2_flags_buf_used;
#define CONN(idx) (&(FLOW(idx)->tcp)) #define CONN(idx) (&(FLOW(idx)->tcp))
/* Table for lookup from remote address, local port, remote port */ /* Table for lookup from remote address, local port, remote port */
@ -967,25 +916,14 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
} }
/** /**
* tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
* @eth_d: Ethernet destination address, NULL if unchanged * @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged
*/ */
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
{ {
int i; eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
for (i = 0; i < TCP_FRAMES_MEM; i++) {
struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
eth_update_mac(&b4->taph.eh, eth_d, eth_s);
eth_update_mac(&b6->taph.eh, eth_d, eth_s);
eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
}
} }
/** /**
@ -998,26 +936,45 @@ static void tcp_sock4_iov_init(const struct ctx *c)
struct iovec *iov; struct iovec *iov;
int i; int i;
for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IP), for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
.iph = iph, tcp4_payload_ip[i] = iph;
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
}; tcp4_payload[i].th.ack = 1;
} }
for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { tcp4_flags_ip[i] = iph;
.taph = TAP_HDR_INIT(ETH_P_IP), tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
.iph = L2_BUF_IP4_INIT(IPPROTO_TCP) tcp4_flags[i].th.ack = 1;
};
} }
for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph); iov = tcp4_l2_iov[i];
for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) iov[TCP_IOV_VLEN].iov_base = &tcp4_payload_vnet_len[i];
iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph); iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 :
sizeof(tcp4_payload_vnet_len[i]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(tcp4_payload_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp4_l2_flags_iov[i];
iov[TCP_IOV_VLEN].iov_base = &tcp4_flags_vnet_len[i];
iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 :
sizeof(tcp4_flags_vnet_len[i]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src);
iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(tcp4_flags_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
}
} }
/** /**
@ -1026,29 +983,49 @@ static void tcp_sock4_iov_init(const struct ctx *c)
*/ */
static void tcp_sock6_iov_init(const struct ctx *c) static void tcp_sock6_iov_init(const struct ctx *c)
{ {
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
struct iovec *iov; struct iovec *iov;
int i; int i;
for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IPV6), for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), tcp6_payload_ip[i] = ip6;
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
}; tcp6_payload[i].th.ack = 1;
} }
for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { tcp6_flags_ip[i] = ip6;
.taph = TAP_HDR_INIT(ETH_P_IPV6), tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) tcp6_flags[i].th .ack = 1;
};
} }
for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph); iov = tcp6_l2_iov[i];
for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) iov[TCP_IOV_VLEN].iov_base = &tcp6_payload_vnet_len[i];
iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph); iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 :
sizeof(tcp6_payload_vnet_len[i]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(tcp6_payload_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
}
for (i = 0; i < TCP_FRAMES_MEM; i++) {
iov = tcp6_l2_flags_iov[i];
iov[TCP_IOV_VLEN].iov_base = &tcp6_flags_vnet_len[i];
iov[TCP_IOV_VLEN].iov_len = c->mode == MODE_PASTA ? 0 :
sizeof(tcp6_flags_vnet_len[i]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src);
iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i];
iov[TCP_IOV_IP].iov_len = sizeof(tcp6_flags_ip[i]);
iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
}
} }
/** /**
@ -1284,36 +1261,40 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
} while (0) } while (0)
/** /**
* tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags) * tcp_flags_flush() - Send out buffers for segments with no data (flags)
* @c: Execution context * @c: Execution context
*/ */
static void tcp_l2_flags_buf_flush(const struct ctx *c) static void tcp_flags_flush(const struct ctx *c)
{ {
tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used); tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
tcp6_l2_flags_buf_used = 0; tcp6_flags_used);
tcp6_flags_used = 0;
tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used); tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
tcp4_l2_flags_buf_used = 0; tcp4_flags_used);
tcp4_flags_used = 0;
} }
/** /**
* tcp_l2_data_buf_flush() - Send out buffers for segments with data * tcp_payload_flush() - Send out buffers for segments with data
* @c: Execution context * @c: Execution context
*/ */
static void tcp_l2_data_buf_flush(const struct ctx *c) static void tcp_payload_flush(const struct ctx *c)
{ {
unsigned i; unsigned i;
size_t m; size_t m;
m = tap_send_frames(c, tcp6_l2_iov, 1, tcp6_l2_buf_used); m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
tcp6_payload_used);
for (i = 0; i < m; i++) for (i = 0; i < m; i++)
*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; *tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
tcp6_l2_buf_used = 0; tcp6_payload_used = 0;
m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used); m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
tcp4_payload_used);
for (i = 0; i < m; i++) for (i = 0; i < m; i++)
*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; *tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
tcp4_l2_buf_used = 0; tcp4_payload_used = 0;
} }
/** /**
@ -1323,8 +1304,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
void tcp_defer_handler(struct ctx *c) void tcp_defer_handler(struct ctx *c)
{ {
tcp_l2_flags_buf_flush(c); tcp_flags_flush(c);
tcp_l2_data_buf_flush(c); tcp_payload_flush(c);
} }
/** /**
@ -1433,35 +1414,31 @@ static size_t tcp_fill_headers6(const struct ctx *c,
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
* @c: Execution context * @c: Execution context
* @conn: Connection pointer * @conn: Connection pointer
* @p: Pointer to any type of TCP pre-cooked buffer * @iov: Pointer to an array of iovec of TCP pre-cooked buffers
* @plen: Payload length (including TCP header options) * @plen: Payload length (including TCP header options)
* @check: Checksum, if already known * @check: Checksum, if already known
* @seq: Sequence number for this segment * @seq: Sequence number for this segment
* *
* Return: frame length including L2 headers, host order * Return: IP payload length, host order
*/ */
static size_t tcp_l2_buf_fill_headers(const struct ctx *c, static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
const struct tcp_tap_conn *conn, const struct tcp_tap_conn *conn,
void *p, size_t plen, struct iovec *iov, size_t plen,
const uint16_t *check, uint32_t seq) const uint16_t *check, uint32_t seq)
{ {
const struct in_addr *a4 = inany_v4(&conn->faddr); const struct in_addr *a4 = inany_v4(&conn->faddr);
size_t ip_len, tlen; size_t ip_len, tlen;
if (a4) { if (a4) {
struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, plen,
ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
check, seq); check, seq);
tlen = ip_len - sizeof(struct iphdr);
tlen = tap_frame_len(c, &b->taph, ip_len);
} else { } else {
struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, plen,
ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
seq); seq);
tlen = ip_len - sizeof(struct ipv6hdr);
tlen = tap_frame_len(c, &b->taph, ip_len);
} }
return tlen; return tlen;
@ -1593,16 +1570,16 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
*/ */
static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{ {
struct tcp4_l2_flags_buf_t *b4 = NULL; struct tcp_flags_t *payload;
struct tcp6_l2_flags_buf_t *b6 = NULL;
struct tcp_info tinfo = { 0 }; struct tcp_info tinfo = { 0 };
socklen_t sl = sizeof(tinfo); socklen_t sl = sizeof(tinfo);
int s = conn->sock; int s = conn->sock;
uint32_t vnet_len;
size_t optlen = 0; size_t optlen = 0;
struct iovec *iov;
struct tcphdr *th; struct tcphdr *th;
struct iovec *iov;
size_t ip_len;
char *data; char *data;
void *p;
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
!flags && conn->wnd_to_tap) !flags && conn->wnd_to_tap)
@ -1625,19 +1602,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
return 0; return 0;
if (CONN_V4(conn)) { if (CONN_V4(conn)) {
iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; iov = tcp4_l2_flags_iov[tcp4_flags_used++];
p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr);
th = &b4->th;
/* gcc 11.2 would complain on data = (char *)(th + 1); */
data = b4->opts;
} else { } else {
iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; iov = tcp6_l2_flags_iov[tcp6_flags_used++];
p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
th = &b6->th;
data = b6->opts;
} }
payload = iov[TCP_IOV_PAYLOAD].iov_base;
th = &payload->th;
data = payload->opts;
if (flags & SYN) { if (flags & SYN) {
int mss; int mss;
@ -1683,8 +1658,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
th->syn = !!(flags & SYN); th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN); th->fin = !!(flags & FIN);
iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, ip_len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
NULL, conn->seq_to_tap); conn->seq_to_tap);
iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
*(uint32_t *)iov[TCP_IOV_VLEN].iov_base = htonl(vnet_len + ip_len);
if (th->ack) { if (th->ack) {
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
@ -1700,24 +1678,27 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (th->fin || th->syn) if (th->fin || th->syn)
conn->seq_to_tap++; conn->seq_to_tap++;
if (flags & DUP_ACK) {
struct iovec *dup_iov;
int i;
if (CONN_V4(conn))
dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
else
dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
for (i = 0; i < TCP_NUM_IOVS; i++)
memcpy(dup_iov[i].iov_base, iov[i].iov_base,
iov[i].iov_len);
dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
}
if (CONN_V4(conn)) { if (CONN_V4(conn)) {
if (flags & DUP_ACK) { if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
memcpy(b4 + 1, b4, sizeof(*b4)); tcp_flags_flush(c);
(iov + 1)->iov_len = iov->iov_len;
tcp4_l2_flags_buf_used++;
}
if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
tcp_l2_flags_buf_flush(c);
} else { } else {
if (flags & DUP_ACK) { if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
memcpy(b6 + 1, b6, sizeof(*b6)); tcp_flags_flush(c);
(iov + 1)->iov_len = iov->iov_len;
tcp6_l2_flags_buf_used++;
}
if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
tcp_l2_flags_buf_flush(c);
} }
return 0; return 0;
@ -2164,30 +2145,42 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
{ {
uint32_t *seq_update = &conn->seq_to_tap; uint32_t *seq_update = &conn->seq_to_tap;
struct iovec *iov; struct iovec *iov;
size_t ip_len;
uint32_t vnet_len;
if (CONN_V4(conn)) { if (CONN_V4(conn)) {
struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; const uint16_t *check = NULL;
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; if (no_csum) {
tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
check = &iph->check;
}
iov = tcp4_l2_iov + tcp4_l2_buf_used++; tcp4_seq_update[tcp4_payload_used].seq = seq_update;
iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, tcp4_seq_update[tcp4_payload_used].len = plen;
check, seq);
if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) iov = tcp4_l2_iov[tcp4_payload_used++];
tcp_l2_data_buf_flush(c); ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check,
seq);
iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr) +
ip_len;
*(uint32_t *)iov[TCP_IOV_VLEN].iov_base = htonl(vnet_len);
if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
} else if (CONN_V6(conn)) { } else if (CONN_V6(conn)) {
struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; tcp6_seq_update[tcp6_payload_used].seq = seq_update;
tcp6_seq_update[tcp6_payload_used].len = plen;
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; iov = tcp6_l2_iov[tcp6_payload_used++];
tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
iov[TCP_IOV_PAYLOAD].iov_len = ip_len;
iov = tcp6_l2_iov + tcp6_l2_buf_used++; vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr) +
iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, ip_len;
NULL, seq); *(uint32_t *)iov[TCP_IOV_VLEN].iov_base = htonl(vnet_len);
if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
tcp_l2_data_buf_flush(c); tcp_payload_flush(c);
} }
} }
@ -2242,19 +2235,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
iov_sock[0].iov_base = tcp_buf_discard; iov_sock[0].iov_base = tcp_buf_discard;
iov_sock[0].iov_len = already_sent; iov_sock[0].iov_len = already_sent;
if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
(!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
tcp_l2_data_buf_flush(c); tcp_payload_flush(c);
/* Silence Coverity CWE-125 false positive */ /* Silence Coverity CWE-125 false positive */
tcp4_l2_buf_used = tcp6_l2_buf_used = 0; tcp4_payload_used = tcp6_payload_used = 0;
} }
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
if (v4) if (v4)
iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
else else
iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
iov->iov_len = mss; iov->iov_len = mss;
} }
if (iov_rem) if (iov_rem)
@ -2299,7 +2292,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
plen = mss; plen = mss;
seq = conn->seq_to_tap; seq = conn->seq_to_tap;
for (i = 0; i < send_bufs; i++) { for (i = 0; i < send_bufs; i++) {
int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
if (i == send_bufs - 1) if (i == send_bufs - 1)
plen = last_len; plen = last_len;

3
util.h
View File

@ -32,6 +32,9 @@
#ifndef ETH_MIN_MTU #ifndef ETH_MIN_MTU
#define ETH_MIN_MTU 68 #define ETH_MIN_MTU 68
#endif #endif
#ifndef IP_MAX_MTU
#define IP_MAX_MTU USHRT_MAX
#endif
#ifndef MIN #ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MIN(x, y) (((x) < (y)) ? (x) : (y))