mirror of
https://passt.top/passt
synced 2025-01-02 19:05:48 +00:00
a6348cad51
We have different versions of this function for IPv4 and IPv6, but the caller already requires some IP version specific code to get the right header pointers. Instead, have a common function that fills either an IPv4 or an IPv6 header based on which header pointer it is passed. This allows us to remove a small amount of code duplication and make a few slightly ugly conditionals. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
412 lines
11 KiB
C
412 lines
11 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
/* PASST - Plug A Simple Socket Transport
|
|
* for qemu/UNIX domain socket mode
|
|
*
|
|
* PASTA - Pack A Subtle Tap Abstraction
|
|
* for network namespace/tap device mode
|
|
*
|
|
* tcp_buf.c - TCP L2 buffer management functions
|
|
*
|
|
* Copyright Red Hat
|
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
|
*/
|
|
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#include <limits.h>
|
|
#include <string.h>
|
|
#include <errno.h>
|
|
|
|
#include <netinet/ip.h>
|
|
|
|
#include <netinet/tcp.h>
|
|
|
|
#include "util.h"
|
|
#include "ip.h"
|
|
#include "iov.h"
|
|
#include "passt.h"
|
|
#include "tap.h"
|
|
#include "siphash.h"
|
|
#include "inany.h"
|
|
#include "tcp_conn.h"
|
|
#include "tcp_internal.h"
|
|
#include "tcp_buf.h"
|
|
|
|
/* Number of frame slots statically allocated for queueing TCP frames to tap */
#define TCP_FRAMES_MEM			128

/* Frames to fill per socket read: one in MODE_PASTA, otherwise all slots.
 * NOTE: expands to an expression using a variable named 'c' that must be in
 * scope at the point of use.
 */
#define TCP_FRAMES							\
	(c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
|
|
|
|
/* Static buffers */
|
|
|
|
/* Ethernet header for IPv4 and IPv6 frames */
|
|
static struct ethhdr tcp4_eth_src;
|
|
static struct ethhdr tcp6_eth_src;
|
|
|
|
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
|
|
|
|
/* IP headers for IPv4 and IPv6 */
|
|
struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
|
|
struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
|
|
|
|
/* TCP segments with payload for IPv4 and IPv6 frames */
|
|
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM];
|
|
|
|
static_assert(MSS4 <= sizeof(tcp_payload[0].data), "MSS4 is greater than 65516");
|
|
static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516");
|
|
|
|
/* References tracking the owner connection of frames in the tap outqueue */
|
|
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
|
|
static unsigned int tcp_payload_used;
|
|
|
|
/* recvmsg()/sendmsg() data for tap */
|
|
static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
|
|
|
|
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
|
|
|
|
/**
|
|
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
|
|
* @eth_d: Ethernet destination address, NULL if unchanged
|
|
* @eth_s: Ethernet source address, NULL if unchanged
|
|
*/
|
|
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
|
|
{
|
|
eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
|
|
eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
|
|
}
|
|
|
|
/**
|
|
* tcp_sock_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
|
|
* @c: Execution context
|
|
*/
|
|
void tcp_sock_iov_init(const struct ctx *c)
|
|
{
|
|
struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
|
|
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
|
|
int i;
|
|
|
|
tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
|
|
tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
|
|
|
|
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
|
|
tcp6_payload_ip[i] = ip6;
|
|
tcp4_payload_ip[i] = iph;
|
|
}
|
|
|
|
for (i = 0; i < TCP_FRAMES_MEM; i++) {
|
|
struct iovec *iov = tcp_l2_iov[i];
|
|
|
|
iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp_payload_tap_hdr[i]);
|
|
iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
|
|
iov[TCP_IOV_PAYLOAD].iov_base = &tcp_payload[i];
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
|
|
* @ctx: Execution context
|
|
* @conns: Array of connection pointers corresponding to queued frames
|
|
* @frames: Two-dimensional array containing queued frames with sub-iovs
|
|
* @num_frames: Number of entries in the two arrays to be compared
|
|
*/
|
|
static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
|
|
struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < num_frames; i++) {
|
|
const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
|
|
struct tcp_tap_conn *conn = conns[i];
|
|
uint32_t seq = ntohl(th->seq);
|
|
uint32_t peek_offset;
|
|
|
|
if (SEQ_LE(conn->seq_to_tap, seq))
|
|
continue;
|
|
|
|
conn->seq_to_tap = seq;
|
|
peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
|
|
if (tcp_set_peek_offset(conn->sock, peek_offset))
|
|
tcp_rst(c, conn);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tcp_payload_flush() - Send out buffers for segments with data or flags
|
|
* @c: Execution context
|
|
*/
|
|
void tcp_payload_flush(const struct ctx *c)
|
|
{
|
|
size_t m;
|
|
|
|
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
|
|
tcp_payload_used);
|
|
if (m != tcp_payload_used) {
|
|
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
|
|
tcp_payload_used - m);
|
|
}
|
|
tcp_payload_used = 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
|
|
* @conn: Connection pointer
|
|
* @iov: Pointer to an array of iovec of TCP pre-cooked buffers
|
|
* @check: Checksum, if already known
|
|
* @seq: Sequence number for this segment
|
|
* @no_tcp_csum: Do not set TCP checksum
|
|
*/
|
|
static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
|
|
struct iovec *iov, const uint16_t *check,
|
|
uint32_t seq, bool no_tcp_csum)
|
|
{
|
|
struct iov_tail tail = IOV_TAIL(&iov[TCP_IOV_PAYLOAD], 1, 0);
|
|
struct tcphdr *th = IOV_REMOVE_HEADER(&tail, struct tcphdr);
|
|
struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
|
|
const struct flowside *tapside = TAPFLOW(conn);
|
|
const struct in_addr *a4 = inany_v4(&tapside->oaddr);
|
|
struct ipv6hdr *ip6h = NULL;
|
|
struct iphdr *ip4h = NULL;
|
|
|
|
if (a4)
|
|
ip4h = iov[TCP_IOV_IP].iov_base;
|
|
else
|
|
ip6h = iov[TCP_IOV_IP].iov_base;
|
|
|
|
tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
|
|
check, seq, no_tcp_csum);
|
|
}
|
|
|
|
/**
|
|
* tcp_buf_send_flag() - Send segment with flags to tap (no payload)
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @flags: TCP flags: if not set, send segment only if ACK is due
|
|
*
|
|
* Return: negative error code on connection reset, 0 otherwise
|
|
*/
|
|
int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
|
{
|
|
struct tcp_payload_t *payload;
|
|
struct iovec *iov;
|
|
size_t optlen;
|
|
size_t l4len;
|
|
uint32_t seq;
|
|
int ret;
|
|
|
|
iov = tcp_l2_iov[tcp_payload_used];
|
|
if (CONN_V4(conn)) {
|
|
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
|
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
|
} else {
|
|
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
|
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
|
}
|
|
|
|
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
|
seq = conn->seq_to_tap;
|
|
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
|
|
(struct tcp_syn_opts *)&payload->data, &optlen);
|
|
if (ret <= 0)
|
|
return ret;
|
|
|
|
tcp_payload_used++;
|
|
l4len = optlen + sizeof(struct tcphdr);
|
|
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
|
tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
|
|
|
|
if (flags & DUP_ACK) {
|
|
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
|
|
|
|
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
|
|
iov[TCP_IOV_TAP].iov_len);
|
|
dup_iov[TCP_IOV_ETH].iov_base = iov[TCP_IOV_ETH].iov_base;
|
|
dup_iov[TCP_IOV_IP] = iov[TCP_IOV_IP];
|
|
memcpy(dup_iov[TCP_IOV_PAYLOAD].iov_base,
|
|
iov[TCP_IOV_PAYLOAD].iov_base, l4len);
|
|
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
|
}
|
|
|
|
if (tcp_payload_used > TCP_FRAMES_MEM - 2)
|
|
tcp_payload_flush(c);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
* @dlen: TCP payload length
|
|
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
|
|
* @seq: Sequence number to be sent
|
|
*/
|
|
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
|
ssize_t dlen, int no_csum, uint32_t seq)
|
|
{
|
|
struct tcp_payload_t *payload;
|
|
const uint16_t *check = NULL;
|
|
struct iovec *iov;
|
|
|
|
conn->seq_to_tap = seq + dlen;
|
|
tcp_frame_conns[tcp_payload_used] = conn;
|
|
iov = tcp_l2_iov[tcp_payload_used];
|
|
if (CONN_V4(conn)) {
|
|
if (no_csum) {
|
|
struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
|
|
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
|
|
|
|
check = &iph->check;
|
|
}
|
|
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
|
|
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
|
|
} else if (CONN_V6(conn)) {
|
|
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
|
|
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
|
|
}
|
|
payload = iov[TCP_IOV_PAYLOAD].iov_base;
|
|
payload->th.th_off = sizeof(struct tcphdr) / 4;
|
|
payload->th.th_x2 = 0;
|
|
payload->th.th_flags = 0;
|
|
payload->th.ack = 1;
|
|
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
|
|
tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
|
|
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
|
|
tcp_payload_flush(c);
|
|
}
|
|
|
|
/**
|
|
* tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
|
|
* @c: Execution context
|
|
* @conn: Connection pointer
|
|
*
|
|
* Return: negative on connection reset, 0 otherwise
|
|
*
|
|
* #syscalls recvmsg
|
|
*/
|
|
int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|
{
|
|
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
|
|
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
|
|
int len, dlen, i, s = conn->sock;
|
|
struct msghdr mh_sock = { 0 };
|
|
uint16_t mss = MSS_GET(conn);
|
|
uint32_t already_sent, seq;
|
|
struct iovec *iov;
|
|
|
|
/* How much have we read/sent since last received ack ? */
|
|
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
|
|
|
|
if (SEQ_LT(already_sent, 0)) {
|
|
/* RFC 761, section 2.1. */
|
|
flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
|
|
conn->seq_ack_from_tap, conn->seq_to_tap);
|
|
conn->seq_to_tap = conn->seq_ack_from_tap;
|
|
already_sent = 0;
|
|
if (tcp_set_peek_offset(s, 0)) {
|
|
tcp_rst(c, conn);
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
|
conn_flag(c, conn, STALLED);
|
|
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
|
return 0;
|
|
}
|
|
|
|
/* Set up buffer descriptors we'll fill completely and partially. */
|
|
fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
|
|
if (fill_bufs > TCP_FRAMES) {
|
|
fill_bufs = TCP_FRAMES;
|
|
iov_rem = 0;
|
|
} else {
|
|
iov_rem = (wnd_scaled - already_sent) % mss;
|
|
}
|
|
|
|
/* Prepare iov according to kernel capability */
|
|
if (!peek_offset_cap) {
|
|
mh_sock.msg_iov = iov_sock;
|
|
iov_sock[0].iov_base = tcp_buf_discard;
|
|
iov_sock[0].iov_len = already_sent;
|
|
mh_sock.msg_iovlen = fill_bufs + 1;
|
|
} else {
|
|
mh_sock.msg_iov = &iov_sock[1];
|
|
mh_sock.msg_iovlen = fill_bufs;
|
|
}
|
|
|
|
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
|
|
tcp_payload_flush(c);
|
|
|
|
/* Silence Coverity CWE-125 false positive */
|
|
tcp_payload_used = 0;
|
|
}
|
|
|
|
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
|
|
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
|
|
iov->iov_len = mss;
|
|
}
|
|
if (iov_rem)
|
|
iov_sock[fill_bufs].iov_len = iov_rem;
|
|
|
|
/* Receive into buffers, don't dequeue until acknowledged by guest. */
|
|
do
|
|
len = recvmsg(s, &mh_sock, MSG_PEEK);
|
|
while (len < 0 && errno == EINTR);
|
|
|
|
if (len < 0) {
|
|
if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
|
tcp_rst(c, conn);
|
|
return -errno;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
if (!len) {
|
|
if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
|
|
int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
|
|
if (ret) {
|
|
tcp_rst(c, conn);
|
|
return ret;
|
|
}
|
|
|
|
conn_event(c, conn, TAP_FIN_SENT);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
if (!peek_offset_cap)
|
|
len -= already_sent;
|
|
|
|
if (len <= 0) {
|
|
conn_flag(c, conn, STALLED);
|
|
return 0;
|
|
}
|
|
|
|
conn_flag(c, conn, ~STALLED);
|
|
|
|
send_bufs = DIV_ROUND_UP(len, mss);
|
|
last_len = len - (send_bufs - 1) * mss;
|
|
|
|
/* Likely, some new data was acked too. */
|
|
tcp_update_seqack_wnd(c, conn, false, NULL);
|
|
|
|
/* Finally, queue to tap */
|
|
dlen = mss;
|
|
seq = conn->seq_to_tap;
|
|
for (i = 0; i < send_bufs; i++) {
|
|
int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
|
|
|
|
if (i == send_bufs - 1)
|
|
dlen = last_len;
|
|
|
|
tcp_data_to_tap(c, conn, dlen, no_csum, seq);
|
|
seq += dlen;
|
|
}
|
|
|
|
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
|
|
|
return 0;
|
|
}
|