diff --git a/Makefile b/Makefile index fb4494a..2f48c35 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,20 @@ CFLAGS += -Wall -Wextra -pedantic +CFLAGS += -DRLIMIT_STACK_VAL=$(shell ulimit -s) -all: passt qrap +all: passt pasta passt4netns qrap passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h dhcpv6.c dhcpv6.h pcap.c pcap.h ndp.c ndp.h siphash.c siphash.h tap.c tap.h icmp.c icmp.h tcp.c tcp.h udp.c udp.h util.c util.h $(CC) $(CFLAGS) passt.c arp.c dhcp.c dhcpv6.c pcap.c ndp.c siphash.c tap.c icmp.c tcp.c udp.c util.c -o passt +pasta: passt + ln -s passt pasta + +passt4netns: passt + ln -s passt passt4netns + qrap: qrap.c passt.h $(CC) $(CFLAGS) -DARCH=\"$(shell uname -m)\" qrap.c -o qrap .PHONY: clean clean: - -${RM} passt *.o qrap + -${RM} passt *.o qrap pasta passt4netns diff --git a/arp.c b/arp.c index 20f08b2..547057c 100644 --- a/arp.c +++ b/arp.c @@ -1,12 +1,15 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * arp.c - ARP implementation * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio - * */ #include @@ -22,9 +25,9 @@ #include #include +#include "util.h" #include "passt.h" #include "dhcp.h" -#include "util.h" #include "tap.h" #include "arp.h" @@ -66,7 +69,7 @@ int arp(struct ctx *c, struct ethhdr *eh, size_t len) memcpy(eh->h_dest, eh->h_source, ETH_ALEN); memcpy(eh->h_source, c->mac, ETH_ALEN); - if (tap_send(c->fd_unix, eh, len, 0) < 0) + if (tap_send(c, eh, len, 0) < 0) perror("ARP: send"); return 1; diff --git a/dhcp.c b/dhcp.c index 6448e51..337463a 100644 --- a/dhcp.c +++ b/dhcp.c @@ -1,12 +1,15 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * dhcp.c - Minimalistic DHCP server for PASST * * Copyright 
(c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio - * */ #include @@ -21,9 +24,9 @@ #include #include +#include "util.h" #include "passt.h" #include "dhcp.h" -#include "util.h" #include "tap.h" /** @@ -322,7 +325,7 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len) memcpy(eh->h_dest, eh->h_source, ETH_ALEN); memcpy(eh->h_source, c->mac, ETH_ALEN); - if (tap_send(c->fd_unix, eh, len, 0) < 0) + if (tap_send(c, eh, len, 0) < 0) perror("DHCP: send"); return 1; diff --git a/dhcpv6.c b/dhcpv6.c index 4ce7a87..be6d9b1 100644 --- a/dhcpv6.c +++ b/dhcpv6.c @@ -1,12 +1,15 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * dhcpv6.c - Minimalistic DHCPv6 server for PASST * * Copyright (c) 2021 Red Hat GmbH * Author: Stefano Brivio - * */ #include @@ -23,9 +26,9 @@ #include #include +#include "util.h" #include "passt.h" #include "tap.h" -#include "util.h" /** * struct opt_hdr - DHCPv6 option header diff --git a/icmp.c b/icmp.c index 378e787..8f2fdb2 100644 --- a/icmp.c +++ b/icmp.c @@ -1,12 +1,15 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * icmp.c - ICMP/ICMPv6 echo proxy * * Copyright (c) 2021 Red Hat GmbH * Author: Stefano Brivio - * */ #include @@ -28,57 +31,91 @@ #include #include +#include "util.h" #include "passt.h" #include "tap.h" -#include "util.h" #include "icmp.h" +#define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ + +/** + * struct icmp_id - Tracking information for single ICMP echo identifier + * @sock: Bound socket for identifier + * @ts: Last associated activity from tap, seconds + * @seq: Last sequence number sent to tap, host order + */ +struct icmp_id { + int sock; + time_t ts; + uint16_t seq; 
+}; + /* Indexed by ICMP echo identifier */ -static int icmp_s_v4[USHRT_MAX]; -static int icmp_s_v6[USHRT_MAX]; +static struct icmp_id icmp_id_map [IP_VERSIONS][USHRT_MAX]; + +/* Bitmaps, activity monitoring needed for identifier */ +static uint8_t icmp_act [IP_VERSIONS][USHRT_MAX / 8]; /** * icmp_sock_handler() - Handle new data from socket * @c: Execution context - * @s: File descriptor number for socket + * @ref: epoll reference * @events: epoll events bitmap - * @pkt_buf: Buffer to receive packets, currently unused * @now: Current timestamp, unused */ -void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, +void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now) { struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0, 0, 0, 0 } }; - struct sockaddr_storage sr, sl; - socklen_t slen = sizeof(sr); + struct sockaddr_storage sr; + socklen_t sl = sizeof(sr); char buf[USHRT_MAX]; + uint16_t seq, id; ssize_t n; (void)events; - (void)pkt_buf; (void)now; - n = recvfrom(s, buf, sizeof(buf), MSG_DONTWAIT, - (struct sockaddr *)&sr, &slen); + n = recvfrom(ref.s, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl); if (n < 0) return; - if (getsockname(s, (struct sockaddr *)&sl, &slen)) - return; + if (ref.icmp.v6) { + struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr; + struct icmp6hdr *ih = (struct icmp6hdr *)buf; - if (sl.ss_family == AF_INET) { + /* In PASTA mode, we'll get any reply we send, discard them. 
*/ + if (c->mode == MODE_PASTA) { + seq = ntohs(ih->icmp6_sequence); + id = ntohs(ih->icmp6_identifier); + + if (icmp_id_map[V6][id].seq == seq) + return; + + icmp_id_map[V6][id].seq = seq; + } + + tap_ip_send(c, &sr6->sin6_addr, IPPROTO_ICMPV6, buf, n); + } else { struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr; + struct icmphdr *ih = (struct icmphdr *)buf; + + if (c->mode == MODE_PASTA) { + seq = ntohs(ih->un.echo.sequence); + id = ntohs(ih->un.echo.id); + + if (icmp_id_map[V4][id].seq == seq) + return; + + icmp_id_map[V4][id].seq = seq; + } memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr)); tap_ip_send(c, &a6, IPPROTO_ICMP, buf, n); - } else if (sl.ss_family == AF_INET6) { - struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr; - - tap_ip_send(c, &sr6->sin6_addr, IPPROTO_ICMPV6, buf, n); } } @@ -86,101 +123,131 @@ void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, * icmp_tap_handler() - Handle packets from tap * @c: Execution context * @af: Address family, AF_INET or AF_INET6 + * @ * @msg: Input message * @count: Message count (always 1 for ICMP) - * @now: Current timestamp, unused + * @now: Current timestamp * * Return: count of consumed packets (always 1, even if malformed) */ int icmp_tap_handler(struct ctx *c, int af, void *addr, struct tap_msg *msg, int count, struct timespec *now) { - int s; - (void)count; - (void)now; - (void)c; if (af == AF_INET) { struct icmphdr *ih = (struct icmphdr *)msg[0].l4h; + union icmp_epoll_ref iref = { .v6 = 0 }; struct sockaddr_in sa = { .sin_family = AF_INET, .sin_addr = { .s_addr = INADDR_ANY }, .sin_port = ih->un.echo.id, }; + int id, s; if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO) return 1; - if ((s = icmp_s_v4[ntohs(ih->un.echo.id)]) < 0) - return 1; + id = ntohs(ih->un.echo.id); - bind(s, (struct sockaddr *)&sa, sizeof(sa)); + if ((s = icmp_id_map[V4][id].sock) <= 0) { + s = sock_l4(c, AF_INET, IPPROTO_ICMP, id, 0, iref.u32); + if (s < 0) + goto fail_sock; + + 
icmp_id_map[V4][id].sock = s; + } + icmp_id_map[V4][id].ts = now->tv_sec; + bitmap_set(icmp_act[V4], id); sa.sin_addr = *(struct in_addr *)addr; - sendto(s, msg[0].l4h, msg[0].l4_len, - MSG_DONTWAIT | MSG_NOSIGNAL, + sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL, (struct sockaddr *)&sa, sizeof(sa)); } else if (af == AF_INET6) { struct icmp6hdr *ih = (struct icmp6hdr *)msg[0].l4h; + union icmp_epoll_ref iref = { .v6 = 1 }; struct sockaddr_in6 sa = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = ih->icmp6_identifier, }; + int id, s; if (msg[0].l4_len < sizeof(*ih) || (ih->icmp6_type != 128 && ih->icmp6_type != 129)) return 1; - if ((s = icmp_s_v6[ntohs(ih->icmp6_identifier)]) < 0) - return 1; + id = ntohs(ih->icmp6_identifier); + if ((s = icmp_id_map[V6][id].sock) <= 0) { + s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, id, 0, + iref.u32); + if (s < 0) + goto fail_sock; - bind(s, (struct sockaddr *)&sa, sizeof(sa)); + icmp_id_map[V6][id].sock = s; + } + icmp_id_map[V6][id].ts = now->tv_sec; + bitmap_set(icmp_act[V6], id); sa.sin6_addr = *(struct in6_addr *)addr; - sendto(s, msg[0].l4h, msg[0].l4_len, - MSG_DONTWAIT | MSG_NOSIGNAL, + sendto(s, msg[0].l4h, msg[0].l4_len, MSG_NOSIGNAL, (struct sockaddr *)&sa, sizeof(sa)); } return 1; + +fail_sock: + warn("Cannot open \"ping\" socket. 
You might need to:"); + warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\""); + warn("...echo requests/replies will fail."); + return 1; } /** - * icmp_sock_init() - Create ICMP, ICMPv6 sockets for echo requests and replies + * icmp_timer_one() - Handler for timed events related to a given identifier * @c: Execution context - * - * Return: 0 on success, -1 on failure + * @v6: Set for IPv6 echo identifier bindings + * @id: Echo identifier, host order + * @ts: Timestamp from caller */ -int icmp_sock_init(struct ctx *c) +static void icmp_timer_one(struct ctx *c, int v6, uint16_t id, + struct timespec *ts) { - int i, fail = 0; + struct icmp_id *id_map = &icmp_id_map[v6 ? V6 : V4][id]; - c->icmp.fd_min = INT_MAX; - c->icmp.fd_max = 0; + if (ts->tv_sec - id_map->ts <= ICMP_ECHO_TIMEOUT) + return; - if (c->v4) { - for (i = 0; i < USHRT_MAX; i++) { - icmp_s_v4[i] = sock_l4(c, AF_INET, IPPROTO_ICMP, i); - if (icmp_s_v4[i] < 0) - fail = 1; - } - } + bitmap_clear(icmp_act[v6 ? V6 : V4], id); - if (c->v6) { - for (i = 0; i < USHRT_MAX; i++) { - icmp_s_v6[i] = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, i); - if (icmp_s_v6[i] < 0) - fail = 1; - } - } - - if (fail) { - warn("Cannot open some \"ping\" sockets. You might need to:"); - warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\""); - warn("...echo requests/replies might fail."); - } - - return 0; + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_map->sock, NULL); + close(id_map->sock); + id_map->sock = 0; +} + +/** + * icmp_timer() - Scan activity bitmap for identifiers with timed events + * @c: Execution context + * @ts: Timestamp from caller + */ +void icmp_timer(struct ctx *c, struct timespec *ts) +{ + long *word, tmp; + unsigned int i; + int n, v6 = 0; + +v6: + word = (long *)icmp_act[v6 ? 
V6 : V4]; + for (i = 0; i < sizeof(icmp_act[0]) / sizeof(long); i++, word++) { + tmp = *word; + while ((n = ffsl(tmp))) { + tmp &= ~(1UL << (n - 1)); + icmp_timer_one(c, v6, i * sizeof(long) * 8 + n - 1, ts); + } + } + + if (!v6) { + v6 = 1; + goto v6; + } } diff --git a/icmp.h b/icmp.h index d04eb8c..12547b7 100644 --- a/icmp.h +++ b/icmp.h @@ -1,22 +1,34 @@ #ifndef ICMP_H #define ICMP_H +#define ICMP_TIMER_INTERVAL 1000 /* ms */ + struct ctx; -void icmp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, +void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now); int icmp_tap_handler(struct ctx *c, int af, void *addr, struct tap_msg *msg, int count, struct timespec *now); -int icmp_sock_init(struct ctx *c); +void icmp_timer(struct ctx *c, struct timespec *ts); + +/** + * union icmp_epoll_ref - epoll reference portion for ICMP tracking + * @v6: Set for IPv6 sockets or connections + * @u32: Opaque u32 value of reference + */ +union icmp_epoll_ref { + struct { + uint32_t v6:1; + }; + uint32_t u32; +}; /** * struct icmp_ctx - Execution context for ICMP routines - * @fd_min: Lowest file descriptor number for ICMP/ICMPv6 ever used - * @fd_max: Highest file descriptor number for ICMP/ICMPv6 ever used + * @timer_run: Timestamp of most recent timer run */ struct icmp_ctx { - int fd_min; - int fd_max; + struct timespec timer_run; }; #endif /* ICMP_H */ diff --git a/ndp.c b/ndp.c index 1d2a2d3..a7360aa 100644 --- a/ndp.c +++ b/ndp.c @@ -1,6 +1,10 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * ndp.c - NDP support for PASST * @@ -23,8 +27,8 @@ #include #include -#include "passt.h" #include "util.h" +#include "passt.h" #include "tap.h" #define RS 133 @@ -175,7 +179,7 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len) memcpy(ehr->h_source, c->mac, 
ETH_ALEN); ehr->h_proto = htons(ETH_P_IPV6); - if (tap_send(c->fd_unix, ehr, len, 0) < 0) + if (tap_send(c, ehr, len, 0) < 0) perror("NDP: send"); return 1; diff --git a/passt.c b/passt.c index 46eb5f6..ee721df 100644 --- a/passt.c +++ b/passt.c @@ -1,18 +1,26 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * passt.c - Daemon implementation * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio * - * Grab Ethernet frames via AF_UNIX socket, build SOCK_DGRAM/SOCK_STREAM sockets - * for each 5-tuple from TCP, UDP packets, perform connection tracking and - * forward them. Forward packets received on sockets back to the UNIX domain - * socket (typically, a socket virtio_net file descriptor from qemu). + * Grab Ethernet frames from AF_UNIX socket (in "passt" mode) or tap device (in + * "pasta" mode), build SOCK_DGRAM/SOCK_STREAM sockets for each 5-tuple from + * TCP, UDP packets, perform connection tracking and forward them. Forward + * packets received on sockets back to the UNIX domain socket (typically, a + * socket virtio_net file descriptor from qemu) or to the tap device (typically, + * created in a separate network namespace). 
*/ +#define _GNU_SOURCE +#include #include #include #include @@ -44,91 +52,32 @@ #include #include -#include "passt.h" -#include "arp.h" -#include "dhcp.h" -#include "ndp.h" -#include "dhcpv6.h" #include "util.h" +#include "passt.h" +#include "dhcpv6.h" #include "icmp.h" #include "tcp.h" #include "udp.h" #include "pcap.h" +#include "tap.h" #define EPOLL_EVENTS 10 -#define TAP_BUF_BYTES (ETH_MAX_MTU * 8) -#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t)) -#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1) +#define __TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL) +#define TIMER_INTERVAL MIN(__TIMER_INTERVAL, ICMP_TIMER_INTERVAL) -#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, SOCK_BUF_BYTES) -static char pkt_buf [PKT_BUF_BYTES]; - -#define TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL) +char pkt_buf [PKT_BUF_BYTES]; #ifdef DEBUG -static char *ip_proto_str[IPPROTO_SCTP + 1] = { +char *ip_proto_str[IPPROTO_SCTP + 1] = { [IPPROTO_ICMP] = "ICMP", [IPPROTO_TCP] = "TCP", [IPPROTO_UDP] = "UDP", [IPPROTO_ICMPV6] = "ICMPV6", [IPPROTO_SCTP] = "SCTP", }; - -#define IP_PROTO_STR(n) \ - (((n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ? 
ip_proto_str[(n)] : "?") - #endif -/** - * sock_unix() - Create and bind AF_UNIX socket, add to epoll list - * @index: Index used in socket path, filled on success - * - * Return: newly created socket, doesn't return on error - */ -static int sock_unix(int *index) -{ - int fd = socket(AF_UNIX, SOCK_STREAM, 0), ex; - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - int i, ret; - - if (fd < 0) { - perror("UNIX socket"); - exit(EXIT_FAILURE); - } - - for (i = 1; i < UNIX_SOCK_MAX; i++) { - snprintf(addr.sun_path, UNIX_PATH_MAX, UNIX_SOCK_PATH, i); - - ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); - ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); - if (!ret || (errno != ENOENT && errno != ECONNREFUSED)) { - close(ex); - continue; - } - close(ex); - - unlink(addr.sun_path); - if (!bind(fd, (const struct sockaddr *)&addr, sizeof(addr))) - break; - } - - if (i == UNIX_SOCK_MAX) { - perror("UNIX socket bind"); - exit(EXIT_FAILURE); - } - - info("UNIX domain socket bound at %s\n", addr.sun_path); - chmod(addr.sun_path, - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); - - *index = i; - - return fd; -} - /** * struct nl_request - Netlink request filled and sent by get_routes() * @nlh: Netlink message header @@ -365,362 +314,76 @@ static void get_dns(struct ctx *c) } /** - * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor - * @c: Execution context - * @msg: Array of messages with the same L3 protocol - * @count: Count of messages with the same L3 protocol - * @now: Current timestamp + * get_bound_ports_ns() - Get TCP and UDP ports bound in namespace + * @arg: Execution context * - * Return: count of packets consumed by handlers + * Return: 0 */ -static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count, - struct timespec *now) +static int get_bound_ports_ns(void *arg) { - char buf_s[INET_ADDRSTRLEN] __attribute((__unused__)); - char buf_d[INET_ADDRSTRLEN] __attribute((__unused__)); - struct 
ethhdr *eh = (struct ethhdr *)msg[0].start; - struct iphdr *iph, *prev_iph = NULL; - struct udphdr *uh, *prev_uh = NULL; - size_t len = msg[0].len; - unsigned int i; - char *l4h; + struct ctx *c = (struct ctx *)arg; - if (!c->v4) - return count; + ns_enter(c->pasta_pid); - if (len < sizeof(*eh) + sizeof(*iph)) - return 1; - - if (arp(c, eh, len) || dhcp(c, eh, len)) - return 1; - - for (i = 0; i < count; i++) { - len = msg[i].len; - if (len < sizeof(*eh) + sizeof(*iph)) - return 1; - - eh = (struct ethhdr *)msg[i].start; - iph = (struct iphdr *)(eh + 1); - l4h = (char *)iph + iph->ihl * 4; - - c->addr4_seen = iph->saddr; - - msg[i].l4h = l4h; - msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh); - - if (iph->protocol != IPPROTO_TCP && - iph->protocol != IPPROTO_UDP) - break; - - if (len < sizeof(*uh)) - break; - - uh = (struct udphdr *)l4h; - - if (!i) { - prev_iph = iph; - prev_uh = uh; - continue; - } - - if (iph->tos != prev_iph->tos || - iph->frag_off != prev_iph->frag_off || - iph->protocol != prev_iph->protocol || - iph->saddr != prev_iph->saddr || - iph->daddr != prev_iph->daddr || - uh->source != prev_uh->source || - uh->dest != prev_uh->dest) - break; - - prev_iph = iph; - prev_uh = uh; + if (c->v4) { + procfs_scan_listen("tcp", c->tcp.port_to_ns); + procfs_scan_listen("udp", c->udp.port_to_ns); } - eh = (struct ethhdr *)msg[0].start; - iph = (struct iphdr *)(eh + 1); - - if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP || - iph->protocol == IPPROTO_SCTP) { - uh = (struct udphdr *)msg[0].l4h; - - if (msg[0].len < sizeof(*uh)) - return 1; - - debug("%s (%i) from tap: %s:%i -> %s:%i (%i packet%s)", - IP_PROTO_STR(iph->protocol), iph->protocol, - inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), - ntohs(uh->source), - inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)), - ntohs(uh->dest), - i, i > 1 ? 
"s" : ""); - } else if (iph->protocol == IPPROTO_ICMP) { - debug("icmp from tap: %s -> %s", - inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), - inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d))); + if (c->v6) { + procfs_scan_listen("tcp6", c->tcp.port_to_ns); + procfs_scan_listen("udp6", c->udp.port_to_ns); } - if (iph->protocol == IPPROTO_TCP) - return tcp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now); - - if (iph->protocol == IPPROTO_UDP) - return udp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now); - - if (iph->protocol == IPPROTO_ICMP) - icmp_tap_handler(c, AF_INET, &iph->daddr, msg, 1, now); - - return 1; + return 0; } /** - * tap6_handler() - IPv6 packet handler for tap file descriptor + * get_bound_ports() - Get maps of ports that should have bound sockets * @c: Execution context - * @msg: Array of messages with the same L3 protocol - * @count: Count of messages with the same L3 protocol - * @now: Current timestamp */ -static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count, - struct timespec *now) +static void get_bound_ports(struct ctx *c) { - char buf_s[INET6_ADDRSTRLEN] __attribute((__unused__)); - char buf_d[INET6_ADDRSTRLEN] __attribute((__unused__)); - struct ethhdr *eh = (struct ethhdr *)msg[0].start; - struct udphdr *uh, *prev_uh = NULL; - uint8_t proto = 0, prev_proto = 0; - size_t len = msg[0].len; - struct ipv6hdr *ip6h; - unsigned int i; - char *l4h; + char ns_fn_stack[NS_FN_STACK_SIZE]; - if (!c->v6) - return count; - - if (len < sizeof(*eh) + sizeof(*ip6h)) - return 1; - - if (ndp(c, eh, len) || dhcpv6(c, eh, len)) - return 1; - - for (i = 0; i < count; i++) { - struct ipv6hdr *p_ip6h; - - len = msg[i].len; - if (len < sizeof(*eh) + sizeof(*ip6h)) - return 1; - - eh = (struct ethhdr *)msg[i].start; - ip6h = (struct ipv6hdr *)(eh + 1); - l4h = ipv6_l4hdr(ip6h, &proto); - - msg[i].l4h = l4h; - msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh); - - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) - c->addr6_ll_seen = 
ip6h->saddr; - else - c->addr6_seen = ip6h->saddr; - - ip6h->saddr = c->addr6; - - if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) - break; - - if (len < sizeof(*uh)) - break; - - uh = (struct udphdr *)l4h; - - if (!i) { - p_ip6h = ip6h; - prev_proto = proto; - prev_uh = uh; - continue; - } - - if (proto != prev_proto || - memcmp(&ip6h->saddr, &p_ip6h->saddr, sizeof(ip6h->saddr)) || - memcmp(&ip6h->daddr, &p_ip6h->daddr, sizeof(ip6h->daddr)) || - uh->source != prev_uh->source || - uh->dest != prev_uh->dest) - break; - - p_ip6h = ip6h; - prev_proto = proto; - prev_uh = uh; + if (c->mode == MODE_PASST) { + memset(c->tcp.port_to_tap, 0xff, PORT_EPHEMERAL_MIN / 8); + memset(c->udp.port_to_tap, 0xff, PORT_EPHEMERAL_MIN / 8); + return; } - if (prev_proto) - proto = prev_proto; + clone(get_bound_ports_ns, ns_fn_stack + sizeof(ns_fn_stack) / 2, + CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, (void *)c); - eh = (struct ethhdr *)msg[0].start; - ip6h = (struct ipv6hdr *)(eh + 1); - - if (proto == IPPROTO_ICMPV6) { - debug("icmpv6 from tap: %s ->\n\t%s", - inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)), - inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d))); - } else if (proto == IPPROTO_TCP || proto == IPPROTO_UDP || - proto == IPPROTO_SCTP) { - uh = (struct udphdr *)msg[0].l4h; - - if (msg[0].len < sizeof(*uh)) - return 1; - - debug("%s (%i) from tap: [%s]:%i\n\t-> [%s]:%i (%i packet%s)", - IP_PROTO_STR(proto), proto, - inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)), - ntohs(uh->source), - inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)), - ntohs(uh->dest), - i, i > 1 ? 
"s" : ""); + if (c->v4) { + procfs_scan_listen("tcp", c->tcp.port_to_init); + procfs_scan_listen("udp", c->udp.port_to_init); } - if (proto == IPPROTO_TCP) - return tcp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now); - - if (proto == IPPROTO_UDP) - return udp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now); - - if (proto == IPPROTO_ICMPV6) - icmp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, 1, now); - - return 1; -} - -/** - * tap_handler() - Packet handler for tap file descriptor - * @c: Execution context - * @now: Current timestamp - * - * Return: -ECONNRESET if tap connection was lost, 0 otherwise - */ -static int tap_handler(struct ctx *c, struct timespec *now) -{ - struct tap_msg msg[TAP_MSGS]; - int msg_count, same, i; - struct ethhdr *eh; - char *p = pkt_buf; - ssize_t n, rem; - - while ((n = recv(c->fd_unix, p, TAP_BUF_FILL, MSG_DONTWAIT)) > 0) { - msg_count = 0; - - while (n > (ssize_t)sizeof(uint32_t)) { - ssize_t len = ntohl(*(uint32_t *)p); - - p += sizeof(uint32_t); - n -= sizeof(uint32_t); - - if (len < (ssize_t)sizeof(*eh)) - return 0; - - /* At most one packet might not fit in a single read */ - if (len > n) { - rem = recv(c->fd_unix, p + n, len - n, - MSG_DONTWAIT); - if ((n += rem) != len) - return 0; - } - - pcap(p, len); - - msg[msg_count].start = p; - msg[msg_count++].len = len; - - n -= len; - p += len; - } - - i = 0; - while (i < msg_count) { - eh = (struct ethhdr *)msg[i].start; - - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - - switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - tap4_handler(c, msg + i, 1, now); - i++; - break; - case ETH_P_IP: - for (same = 1; i + same < msg_count && - same < UIO_MAXIOV; same++) { - struct tap_msg *next = &msg[i + same]; - - eh = (struct ethhdr *)next->start; - if (ntohs(eh->h_proto) != ETH_P_IP) - break; - } - - i += tap4_handler(c, msg + i, same, now); - break; - case ETH_P_IPV6: - for (same = 1; i + same < msg_count && - same < UIO_MAXIOV; same++) { - struct tap_msg *next = &msg[i + same]; - - 
eh = (struct ethhdr *)next->start; - if (ntohs(eh->h_proto) != ETH_P_IPV6) - break; - } - - i += tap6_handler(c, msg + i, same, now); - break; - default: - i++; - break; - } - } - - p = pkt_buf; + if (c->v6) { + procfs_scan_listen("tcp6", c->tcp.port_to_init); + procfs_scan_listen("udp6", c->udp.port_to_init); } - - if (n >= 0 || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) - return 0; - - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_unix, NULL); - close(c->fd_unix); - - return -ECONNRESET; } /** * sock_handler() - Event handler for L4 sockets * @c: Execution context - * @s: Socket associated to event + * @ref: epoll reference * @events: epoll events * @now: Current timestamp */ -static void sock_handler(struct ctx *c, int s, uint32_t events, +static void sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now) { - socklen_t sl; - int proto; + debug("%s packet from socket %i", IP_PROTO_STR(ref.proto), ref.s); - sl = sizeof(proto); - - if ( FD_PROTO(s, udp) && !FD_PROTO(s, icmp) && !FD_PROTO(s, tcp)) - proto = IPPROTO_UDP; - else if (FD_PROTO(s, tcp) && !FD_PROTO(s, icmp) && !FD_PROTO(s, udp)) - proto = IPPROTO_TCP; - else if (FD_PROTO(s, icmp) && !FD_PROTO(s, udp) && !FD_PROTO(s, tcp)) - proto = IPPROTO_ICMP; /* Fits ICMPv6 below, too */ - else if (getsockopt(s, SOL_SOCKET, SO_PROTOCOL, &proto, &sl)) - proto = -1; - - if (proto == -1) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); - close(s); - return; - } - - debug("%s (%i): packet from socket %i", IP_PROTO_STR(proto), proto, s); - - if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) - icmp_sock_handler(c, s, events, pkt_buf, now); - else if (proto == IPPROTO_TCP) - tcp_sock_handler( c, s, events, pkt_buf, now); - else if (proto == IPPROTO_UDP) - udp_sock_handler( c, s, events, pkt_buf, now); + if (ref.proto == IPPROTO_TCP) + tcp_sock_handler( c, ref, events, now); + else if (ref.proto == IPPROTO_UDP) + udp_sock_handler( c, ref, events, now); + else if (ref.proto 
== IPPROTO_ICMP || ref.proto == IPPROTO_ICMPV6) + icmp_sock_handler(c, ref, events, now); } /** @@ -739,39 +402,70 @@ static void timer_handler(struct ctx *c, struct timespec *now) udp_timer(c, now); c->udp.timer_run = *now; } + + if (timespec_diff_ms(now, &c->icmp.timer_run) >= ICMP_TIMER_INTERVAL) { + icmp_timer(c, now); + c->icmp.timer_run = *now; + } } /** - * usage() - Print usage and exit + * usage_passt() - Print usage for "passt" mode and exit * @name: Executable name */ -void usage(const char *name) +void usage_passt(const char *name) { fprintf(stderr, "Usage: %s\n", name); exit(EXIT_FAILURE); } +/** + * usage_pasta() - Print usage for "pasta" mode and exit + * @name: Executable name + */ +void usage_pasta(const char *name) +{ + fprintf(stderr, "Usage: %s TARGET_PID\n", name); + + exit(EXIT_FAILURE); +} + /** * main() - Entry point and main loop * @argc: Argument count - * @argv: Interface names + * @argv: Target PID for pasta mode * * Return: 0 once interrupted, non-zero on failure */ int main(int argc, char **argv) { + char buf6[INET6_ADDRSTRLEN], buf4[INET_ADDRSTRLEN], *log_name; struct epoll_event events[EPOLL_EVENTS]; - int nfds, i, fd_unix, sock_index; - char buf6[INET6_ADDRSTRLEN]; - char buf4[INET_ADDRSTRLEN]; - struct epoll_event ev = { 0 }; struct ctx c = { 0 }; struct rlimit limit; struct timespec now; + int nfds, i; - if (argc != 1) - usage(argv[0]); + if (strstr(argv[0], "pasta") || strstr(argv[0], "passt4netns")) { + if (argc != 2) + usage_pasta(argv[0]); + + errno = 0; + c.pasta_pid = strtol(argv[1], NULL, 0); + if (c.pasta_pid < 0 || errno) + usage_pasta(argv[0]); + + c.mode = MODE_PASTA; + log_name = "pasta"; + } else { + if (argc != 1) + usage_passt(argv[0]); + + c.mode = MODE_PASST; + log_name = "passt"; + memset(&c.mac_guest, 0xff, sizeof(c.mac_guest)); + } if (clock_gettime(CLOCK_MONOTONIC, &now)) { perror("clock_gettime"); @@ -795,27 +489,22 @@ int main(int argc, char **argv) } #if DEBUG - openlog("passt", 0, LOG_DAEMON); + 
openlog(log_name, 0, LOG_DAEMON); #else - openlog("passt", isatty(fileno(stdout)) ? 0 : LOG_PERROR, LOG_DAEMON); + openlog(log_name, isatty(fileno(stdout)) ? 0 : LOG_PERROR, LOG_DAEMON); #endif get_routes(&c); get_addrs(&c); get_dns(&c); + get_bound_ports(&c); - fd_unix = sock_unix(&sock_index); - - if (icmp_sock_init(&c) || udp_sock_init(&c) || tcp_sock_init(&c)) + if (udp_sock_init(&c) || tcp_sock_init(&c)) exit(EXIT_FAILURE); if (c.v6) dhcpv6_init(&c); - memset(&c.mac_guest, 0xff, sizeof(c.mac_guest)); - - pcap_init(sock_index); - if (c.v4) { info("ARP:"); info(" address: %02x:%02x:%02x:%02x:%02x:%02x from %s", @@ -859,15 +548,7 @@ int main(int argc, char **argv) } } -listen: - listen(fd_unix, 0); - info("You can now start qrap:"); - info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); - info("or directly qemu, patched with:"); - info(" qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch"); - info("as follows:"); - info(" kvm ... -net socket,connect=" UNIX_SOCK_PATH - " -net nic,model=virtio", sock_index); + tap_sock_init(&c); #ifndef DEBUG if (isatty(fileno(stdout)) && daemon(0, 0)) { @@ -876,12 +557,6 @@ listen: } #endif - c.fd_unix = accept(fd_unix, NULL, NULL); - - ev.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLHUP; - ev.data.fd = c.fd_unix; - epoll_ctl(c.epollfd, EPOLL_CTL_ADD, c.fd_unix, &ev); - loop: nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL); if (nfds == -1 && errno != EINTR) { @@ -892,18 +567,12 @@ loop: clock_gettime(CLOCK_MONOTONIC, &now); for (i = 0; i < nfds; i++) { - if (events[i].data.fd == c.fd_unix) { - if (events[i].events & EPOLLRDHUP || - events[i].events & EPOLLHUP || - events[i].events & EPOLLERR || - tap_handler(&c, &now)) { - close(c.fd_unix); - goto listen; - } - } else { - sock_handler(&c, events[i].data.fd, events[i].events, - &now); - } + union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64); + + if (events[i].data.fd == c.fd_tap) + tap_handler(&c, events[i].events, 
&now); + else + sock_handler(&c, ref, events[i].events, &now); } timer_handler(&c, &now); diff --git a/passt.h b/passt.h index 28840fc..1a708fd 100644 --- a/passt.h +++ b/passt.h @@ -15,27 +15,76 @@ struct tap_msg { size_t l4_len; }; -#define SOCK_BUF_BYTES (ETH_MAX_MTU * 4) +union epoll_ref; #include "icmp.h" #include "tcp.h" #include "udp.h" +/** + * union epoll_ref - Breakdown of reference for epoll socket bookkeeping + * @proto: IP protocol number + * @s: Socket number (implies 2^24 limit on number of descriptors) + * @tcp: TCP-specific reference part + * @udp: UDP-specific reference part + * @icmp: ICMP-specific reference part + * @data: Data handled by protocol handlers + * @u64: Opaque reference for epoll_ctl() and epoll_wait() + */ +union epoll_ref { + struct { + uint32_t proto:8, + s:24; + union { + union tcp_epoll_ref tcp; + union udp_epoll_ref udp; + union icmp_epoll_ref icmp; + uint32_t data; + }; + }; + uint64_t u64; +}; + +#define TAP_BUF_BYTES (ETH_MAX_MTU * 3) +#define TAP_BUF_FILL (TAP_BUF_BYTES - ETH_MAX_MTU - sizeof(uint32_t)) +#define TAP_MSGS (TAP_BUF_BYTES / sizeof(struct ethhdr) + 1) + +#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0) +extern char pkt_buf [PKT_BUF_BYTES]; + +#ifdef DEBUG +extern char *ip_proto_str[]; +#define IP_PROTO_STR(n) \ + (((n) <= IPPROTO_SCTP && ip_proto_str[(n)]) ? 
ip_proto_str[(n)] : "?") +#endif + #include /* For MAXNS below */ +/** + * struct fqdn - Representation of fully-qualified domain name + * @n: Domain name string + */ struct fqdn { char n[NS_MAXDNAME]; }; #include +enum passt_modes { + MODE_PASST, + MODE_PASTA, +}; + /** * struct ctx - Execution context - * @epollfd: file descriptor for epoll instance - * @fd_unix: AF_UNIX socket for tap file descriptor - * @v4: Enable IPv4 transport + * @mode: Operation mode, qemu/UNIX domain socket or namespace/tap + * @pasta_pid: Target PID of namespace for pasta mode + * @epollfd: File descriptor for epoll instance + * @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any + * @fd_tap: File descriptor for AF_UNIX socket or tuntap device * @mac: Host MAC address * @mac_guest: Guest MAC address + * @v4: Enable IPv4 transport * @addr4: IPv4 address for external, routable interface * @addr4_seen: Latest IPv4 address seen as source from tap * @mask4: IPv4 netmask, network order @@ -49,10 +98,17 @@ struct fqdn { * @gw6: Default IPv6 gateway * @dns4: IPv4 DNS addresses, zero-terminated * @ifn: Name of routable interface + * @tcp: Context for TCP protocol handler + * @udp: Context for UDP protocol handler + * @icmp: Context for ICMP protocol handler */ struct ctx { + enum passt_modes mode; + int pasta_pid; + int epollfd; - int fd_unix; + int fd_tap_listen; + int fd_tap; unsigned char mac[ETH_ALEN]; unsigned char mac_guest[ETH_ALEN]; @@ -74,7 +130,7 @@ struct ctx { char ifn[IF_NAMESIZE]; - struct icmp_ctx icmp; struct tcp_ctx tcp; struct udp_ctx udp; + struct icmp_ctx icmp; }; diff --git a/pcap.c b/pcap.c index 8dd647a..c728b8a 100644 --- a/pcap.c +++ b/pcap.c @@ -1,12 +1,15 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode * - * pcap.c - Packet capture for PASST + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * pcap.c - Packet capture for PASST/PASTA * * 
Copyright (c) 2021 Red Hat GmbH * Author: Stefano Brivio - * */ #include @@ -22,18 +25,19 @@ #include #include -#include "passt.h" #include "util.h" +#include "passt.h" #ifdef DEBUG #define PCAP_PREFIX "/tmp/passt_" +#define PCAP_PREFIX_PASTA "/tmp/pasta_" #define PCAP_ISO8601_FORMAT "%FT%H:%M:%SZ" #define PCAP_ISO8601_STR "YYYY-MM-ddTHH:mm:ssZ" #define PCAP_VERSION_MINOR 4 -static int pcap_fd = 1; +static int pcap_fd = -1; /* See pcap.h from libpcap, or pcap-savefile(5) */ static struct { @@ -64,6 +68,11 @@ struct pcap_pkthdr { uint32_t len; }; +/** + * pcap() - Capture a single frame to pcap file + * @pkt: Pointer to data buffer, including L2 headers + * @len: L2 packet length + */ void pcap(char *pkt, size_t len) { struct pcap_pkthdr h; @@ -81,12 +90,23 @@ void pcap(char *pkt, size_t len) write(pcap_fd, pkt, len); } -void pcap_init(int sock_index) +/** + * pcap_init() - Initialise pcap file + * @c: Execution context + * @index: pcap name index: passt instance number or pasta target pid + */ +void pcap_init(struct ctx *c, int index) { - char name[] = PCAP_PREFIX PCAP_ISO8601_STR STR(UNIX_SOCK_MAX) ".pcap"; + char name[] = PCAP_PREFIX PCAP_ISO8601_STR STR(UINT_MAX) ".pcap"; struct timeval tv; struct tm *tm; + if (pcap_fd != -1) + close(pcap_fd); + + if (c->mode == MODE_PASTA) + memcpy(name, PCAP_PREFIX_PASTA, sizeof(PCAP_PREFIX_PASTA)); + gettimeofday(&tv, NULL); tm = localtime(&tv.tv_sec); strftime(name + strlen(PCAP_PREFIX), sizeof(PCAP_ISO8601_STR) - 1, @@ -94,7 +114,7 @@ void pcap_init(int sock_index) snprintf(name + strlen(PCAP_PREFIX) + strlen(PCAP_ISO8601_STR), sizeof(name) - strlen(PCAP_PREFIX) - strlen(PCAP_ISO8601_STR), - "_%i.pcap", sock_index); + "_%i.pcap", index); pcap_fd = open(name, O_WRONLY | O_CREAT | O_APPEND | O_DSYNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); diff --git a/pcap.h b/pcap.h index abca097..c69c3b0 100644 --- a/pcap.h +++ b/pcap.h @@ -1,2 +1,2 @@ void pcap(char *pkt, size_t len); -void pcap_init(int sock_index); +void 
pcap_init(struct ctx *c, int sock_index); diff --git a/siphash.c b/siphash.c index 910c718..b18f640 100644 --- a/siphash.c +++ b/siphash.c @@ -1,6 +1,10 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * siphash.c - SipHash routines * diff --git a/tap.c b/tap.c index 70e4774..583344d 100644 --- a/tap.c +++ b/tap.c @@ -1,21 +1,39 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode * - * tap.c - Functions to communicate with guest-facing tap interface + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * tap.c - Functions to communicate with guest- or namespace-facing interface * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio - * */ +#define _GNU_SOURCE +#include #include +#include #include #include #include #include #include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -23,26 +41,46 @@ #include #include -#include "passt.h" #include "util.h" +#include "passt.h" +#include "arp.h" +#include "dhcp.h" +#include "ndp.h" +#include "dhcpv6.h" #include "pcap.h" /** - * tap_send() - Send frame and qemu socket header with indication of length - * @fd: tap file descriptor + * tap_send() - Send frame, with qemu socket header if needed + * @c: Execution context + * @data: Packet buffer * @len: Total L2 packet length - * @flags: Flags for send(), if any + * @vnet_pre: Buffer has four-byte headroom * - * Return: return code from send() + * Return: return code from send() or write() */ -int tap_send(int fd, void *data, size_t len, int flags) +int tap_send(struct ctx *c, void *data, size_t len, int vnet_pre) { - uint32_t vnet_len = htonl(len); - send(fd, &vnet_len, 4, 
MSG_DONTWAIT | MSG_NOSIGNAL); + if (vnet_pre) + pcap((char *)data + 4, len); + else + pcap(data, len); - pcap(data, len); + if (c->mode == MODE_PASST) { + int flags = MSG_NOSIGNAL | MSG_DONTWAIT; - return send(fd, data, len, flags | MSG_DONTWAIT | MSG_NOSIGNAL); + if (vnet_pre) { + *((uint32_t *)data) = htonl(len); + len += 4; + } else { + uint32_t vnet_len = htonl(len); + + send(c->fd_tap, &vnet_len, 4, flags); + } + + return send(c->fd_tap, data, len, flags); + } + + return write(c->fd_tap, (char *)data + (vnet_pre ? 4 : 0), len); } /** @@ -56,7 +94,8 @@ int tap_send(int fd, void *data, size_t len, int flags) void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, char *in, size_t len) { - char pkt[USHRT_MAX]; + char buf[USHRT_MAX]; + char *pkt = buf + 4; struct ethhdr *eh; eh = (struct ethhdr *)pkt; @@ -95,7 +134,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, uh->check = 0; } - tap_send(c->fd_unix, pkt, len + sizeof(*iph) + sizeof(*eh), 0); + tap_send(c, buf, len + sizeof(*iph) + sizeof(*eh), 1); } else { struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); char *data = (char *)(ip6h + 1); @@ -137,6 +176,527 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, ip6h->nexthdr = proto; ip6h->hop_limit = 255; - tap_send(c->fd_unix, pkt, len + sizeof(*ip6h) + sizeof(*eh), 0); + tap_send(c, buf, len + sizeof(*ip6h) + sizeof(*eh), 1); } } + +/** + * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor + * @c: Execution context + * @msg: Array of messages with the same L3 protocol + * @count: Count of messages with the same L3 protocol + * @now: Current timestamp + * + * Return: count of packets consumed by handlers + */ +static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count, + struct timespec *now) +{ + char buf_s[INET_ADDRSTRLEN] __attribute((__unused__)); + char buf_d[INET_ADDRSTRLEN] __attribute((__unused__)); + struct ethhdr *eh = (struct ethhdr *)msg[0].start; + struct 
iphdr *iph, *prev_iph = NULL; + struct udphdr *uh, *prev_uh = NULL; + size_t len = msg[0].len; + unsigned int i; + char *l4h; + + if (!c->v4) + return count; + + if (len < sizeof(*eh) + sizeof(*iph)) + return 1; + + if (arp(c, eh, len) || dhcp(c, eh, len)) + return 1; + + for (i = 0; i < count; i++) { + len = msg[i].len; + if (len < sizeof(*eh) + sizeof(*iph)) + return 1; + + eh = (struct ethhdr *)msg[i].start; + iph = (struct iphdr *)(eh + 1); + l4h = (char *)iph + iph->ihl * 4; + + c->addr4_seen = iph->saddr; + + msg[i].l4h = l4h; + msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh); + + if (iph->protocol != IPPROTO_TCP && + iph->protocol != IPPROTO_UDP) + break; + + if (len < sizeof(*uh)) + break; + + uh = (struct udphdr *)l4h; + + if (!i) { + prev_iph = iph; + prev_uh = uh; + continue; + } + + if (iph->tos != prev_iph->tos || + iph->frag_off != prev_iph->frag_off || + iph->protocol != prev_iph->protocol || + iph->saddr != prev_iph->saddr || + iph->daddr != prev_iph->daddr || + uh->source != prev_uh->source || + uh->dest != prev_uh->dest) + break; + + prev_iph = iph; + prev_uh = uh; + } + + eh = (struct ethhdr *)msg[0].start; + iph = (struct iphdr *)(eh + 1); + + if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_SCTP) { + uh = (struct udphdr *)msg[0].l4h; + + if (msg[0].len < sizeof(*uh)) + return 1; + + debug("%s (%i) from tap: %s:%i -> %s:%i (%i packet%s)", + IP_PROTO_STR(iph->protocol), iph->protocol, + inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), + ntohs(uh->source), + inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d)), + ntohs(uh->dest), + i, i > 1 ? 
"s" : ""); + } else if (iph->protocol == IPPROTO_ICMP) { + debug("icmp from tap: %s -> %s", + inet_ntop(AF_INET, &iph->saddr, buf_s, sizeof(buf_s)), + inet_ntop(AF_INET, &iph->daddr, buf_d, sizeof(buf_d))); + } + + if (iph->protocol == IPPROTO_TCP) + return tcp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now); + + if (iph->protocol == IPPROTO_UDP) + return udp_tap_handler(c, AF_INET, &iph->daddr, msg, i, now); + + if (iph->protocol == IPPROTO_ICMP) + icmp_tap_handler(c, AF_INET, &iph->daddr, msg, 1, now); + + return 1; +} + +/** + * tap6_handler() - IPv6 packet handler for tap file descriptor + * @c: Execution context + * @msg: Array of messages with the same L3 protocol + * @count: Count of messages with the same L3 protocol + * @now: Current timestamp + * + * Return: count of packets consumed by handlers + */ +static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count, + struct timespec *now) +{ + char buf_s[INET6_ADDRSTRLEN] __attribute((__unused__)); + char buf_d[INET6_ADDRSTRLEN] __attribute((__unused__)); + struct ethhdr *eh = (struct ethhdr *)msg[0].start; + struct udphdr *uh, *prev_uh = NULL; + uint8_t proto = 0, prev_proto = 0; + size_t len = msg[0].len; + struct ipv6hdr *ip6h; + unsigned int i; + char *l4h; + + if (!c->v6) + return count; + + if (len < sizeof(*eh) + sizeof(*ip6h)) + return 1; + + if (ndp(c, eh, len) || dhcpv6(c, eh, len)) + return 1; + + for (i = 0; i < count; i++) { + struct ipv6hdr *p_ip6h; + + len = msg[i].len; + if (len < sizeof(*eh) + sizeof(*ip6h)) + return 1; + + eh = (struct ethhdr *)msg[i].start; + ip6h = (struct ipv6hdr *)(eh + 1); + l4h = ipv6_l4hdr(ip6h, &proto); + + msg[i].l4h = l4h; + msg[i].l4_len = len - ((intptr_t)l4h - (intptr_t)eh); + + if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) + c->addr6_ll_seen = ip6h->saddr; + else + c->addr6_seen = ip6h->saddr; + + ip6h->saddr = c->addr6; + + if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) + break; + + if (len < sizeof(*uh)) + break; + + uh = (struct udphdr *)l4h; + 
+ if (!i) { + p_ip6h = ip6h; + prev_proto = proto; + prev_uh = uh; + continue; + } + + if (proto != prev_proto || + memcmp(&ip6h->saddr, &p_ip6h->saddr, sizeof(ip6h->saddr)) || + memcmp(&ip6h->daddr, &p_ip6h->daddr, sizeof(ip6h->daddr)) || + uh->source != prev_uh->source || + uh->dest != prev_uh->dest) + break; + + p_ip6h = ip6h; + prev_proto = proto; + prev_uh = uh; + } + + if (prev_proto) + proto = prev_proto; + + eh = (struct ethhdr *)msg[0].start; + ip6h = (struct ipv6hdr *)(eh + 1); + + if (proto == IPPROTO_ICMPV6) { + debug("icmpv6 from tap: %s ->\n\t%s", + inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)), + inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d))); + } else if (proto == IPPROTO_TCP || proto == IPPROTO_UDP || + proto == IPPROTO_SCTP) { + uh = (struct udphdr *)msg[0].l4h; + + if (msg[0].len < sizeof(*uh)) + return 1; + + debug("%s (%i) from tap: [%s]:%i\n\t-> [%s]:%i (%i packet%s)", + IP_PROTO_STR(proto), proto, + inet_ntop(AF_INET6, &ip6h->saddr, buf_s, sizeof(buf_s)), + ntohs(uh->source), + inet_ntop(AF_INET6, &ip6h->daddr, buf_d, sizeof(buf_d)), + ntohs(uh->dest), + i, i > 1 ? 
"s" : ""); + } + + if (proto == IPPROTO_TCP) + return tcp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now); + + if (proto == IPPROTO_UDP) + return udp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, i, now); + + if (proto == IPPROTO_ICMPV6) + icmp_tap_handler(c, AF_INET6, &ip6h->daddr, msg, 1, now); + + return 1; +} + +/** + * tap_handler_passt() - Packet handler for AF_UNIX file descriptor + * @c: Execution context + * @now: Current timestamp + * + * Return: -ECONNRESET on receive error, 0 otherwise + */ +static int tap_handler_passt(struct ctx *c, struct timespec *now) +{ + int msg_count = 0, same, i = 0; + struct tap_msg msg[TAP_MSGS]; + struct ethhdr *eh; + char *p = pkt_buf; + ssize_t n, rem; + + n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT); + if (n < 0) { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); + close(c->fd_tap); + + return -ECONNRESET; + } + + while (n > (ssize_t)sizeof(uint32_t)) { + ssize_t len = ntohl(*(uint32_t *)p); + + p += sizeof(uint32_t); + n -= sizeof(uint32_t); + + if (len < (ssize_t)sizeof(*eh)) + return 0; + + /* At most one packet might not fit in a single read */ + if (len > n) { + rem = recv(c->fd_tap, p + n, len - n, MSG_DONTWAIT); + if ((n += rem) != len) + return 0; + } + + pcap(p, len); + + msg[msg_count].start = p; + msg[msg_count++].len = len; + + n -= len; + p += len; + } + + while (i < msg_count) { + eh = (struct ethhdr *)msg[i].start; + + memcpy(c->mac_guest, eh->h_source, ETH_ALEN); + + switch (ntohs(eh->h_proto)) { + case ETH_P_ARP: + tap4_handler(c, msg + i, 1, now); + i++; + break; + case ETH_P_IP: + for (same = 1; i + same < msg_count && + same < UIO_MAXIOV; same++) { + struct tap_msg *next = &msg[i + same]; + + eh = (struct ethhdr *)next->start; + if (ntohs(eh->h_proto) != ETH_P_IP) + break; + } + + i += tap4_handler(c, msg + i, same, now); + break; + case ETH_P_IPV6: + for (same = 1; i + same < msg_count && + same < 
UIO_MAXIOV; same++) { + struct tap_msg *next = &msg[i + same]; + + eh = (struct ethhdr *)next->start; + if (ntohs(eh->h_proto) != ETH_P_IPV6) + break; + } + + i += tap6_handler(c, msg + i, same, now); + break; + default: + i++; + break; + } + } + + return 0; +} + +/** + * tap_handler_pasta() - Packet handler for tuntap file descriptor + * @c: Execution context + * @now: Current timestamp + * + * Return: -ECONNRESET on receive error, 0 otherwise + */ +static int tap_handler_pasta(struct ctx *c, struct timespec *now) +{ + struct tap_msg msg = { .start = pkt_buf }; + ssize_t n; + + while ((n = read(c->fd_tap, pkt_buf, TAP_BUF_BYTES)) > 0) { + struct ethhdr *eh = (struct ethhdr *)pkt_buf; + msg.len = n; + + pcap(msg.start, msg.len); + + memcpy(c->mac_guest, eh->h_source, ETH_ALEN); + + switch (ntohs(eh->h_proto)) { + case ETH_P_ARP: + tap4_handler(c, &msg, 1, now); + break; + case ETH_P_IP: + tap4_handler(c, &msg, 1, now); + break; + case ETH_P_IPV6: + tap6_handler(c, &msg, 1, now); + break; + } + } + + if (!n || errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); + close(c->fd_tap); + + return -ECONNRESET; +} + +/** + * tap_sock_init_unix() - Create and bind AF_UNIX socket, wait for connection + * @c: Execution context + */ +static void tap_sock_init_unix(struct ctx *c) +{ + int fd = socket(AF_UNIX, SOCK_STREAM, 0), ex; + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + int i, ret; + + if (c->fd_tap_listen) + close(c->fd_tap_listen); + + if (fd < 0) { + perror("UNIX socket"); + exit(EXIT_FAILURE); + } + c->fd_tap_listen = fd; + + for (i = 1; i < UNIX_SOCK_MAX; i++) { + snprintf(addr.sun_path, UNIX_PATH_MAX, UNIX_SOCK_PATH, i); + + ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); + ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr)); + if (!ret || (errno != ENOENT && errno != ECONNREFUSED)) { + close(ex); + continue; + } + close(ex); + + unlink(addr.sun_path); + if
(!bind(fd, (const struct sockaddr *)&addr, sizeof(addr))) + break; + } + + if (i == UNIX_SOCK_MAX) { + perror("UNIX socket bind"); + exit(EXIT_FAILURE); + } + + info("UNIX domain socket bound at %s\n", addr.sun_path); + chmod(addr.sun_path, + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); + + pcap_init(c, i); + + listen(fd, 0); + + info("You can now start qrap:"); + info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); + info("or directly qemu, patched with:"); + info(" qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch"); + info("as follows:"); + info(" kvm ... -net socket,connect=" UNIX_SOCK_PATH + " -net nic,model=virtio", i); + + c->fd_tap = accept(fd, NULL, NULL); +} + +static int tun_ns_fd = -1; + +/** + * tap_sock_init_tun_ns() - Create tuntap file descriptor in namespace + * @target_pid: Pointer to PID of target namespace + */ +static int tap_sock_init_tun_ns(void *target_pid) +{ + int fd; + + if (ns_enter(*(int *)target_pid)) + goto fail; + + if ((fd = open("/dev/net/tun", O_RDWR)) < 0) + goto fail; + + fcntl(fd, F_SETFL, O_NONBLOCK); + + tun_ns_fd = fd; + + return 0; + +fail: + tun_ns_fd = -1; + return 0; +} + +/** + * tap_sock_init_tun() - Set up tuntap file descriptor + * @c: Execution context + */ +static void tap_sock_init_tun(struct ctx *c) +{ + struct ifreq ifr = { .ifr_name = "pasta0", + .ifr_flags = IFF_TAP | IFF_NO_PI, + }; + char ns_fn_stack[NS_FN_STACK_SIZE]; + + clone(tap_sock_init_tun_ns, ns_fn_stack + sizeof(ns_fn_stack) / 2, + CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, + (void *)&c->pasta_pid); + + if (tun_ns_fd == -1) { + err("Failed to open tun socket in namespace"); + exit(EXIT_FAILURE); + } + + if (ioctl(tun_ns_fd, TUNSETIFF, &ifr)) { + perror("TUNSETIFF ioctl"); + exit(EXIT_FAILURE); + } + + pcap_init(c, c->pasta_pid); + + c->fd_tap = tun_ns_fd; +} + +/** + * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor + * @c: Execution context + */ +void tap_sock_init(struct ctx *c) +{ + struct
epoll_event ev = { 0 }; + + if (c->fd_tap) { + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); + close(c->fd_tap); + } + + if (c->mode == MODE_PASST) + tap_sock_init_unix(c); + else + tap_sock_init_tun(c); + + ev.events = EPOLLIN | EPOLLRDHUP; + ev.data.fd = c->fd_tap; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); +} + +/** + * tap_handler() - Packet handler for AF_UNIX or tuntap file descriptor + * @c: Execution context + * @events: epoll events + * @now: Current timestamp + */ +void tap_handler(struct ctx *c, uint32_t events, struct timespec *now) +{ + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) + goto fail; + + if ((c->mode == MODE_PASST && tap_handler_passt(c, now)) || + (c->mode == MODE_PASTA && tap_handler_pasta(c, now))) + goto fail; + + return; +fail: + tap_sock_init(c); +} diff --git a/tap.h b/tap.h index ecea936..385fab0 100644 --- a/tap.h +++ b/tap.h @@ -1,3 +1,5 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, char *in, size_t len); -int tap_send(int fd, void *data, size_t len, int flags); +int tap_send(struct ctx *c, void *data, size_t len, int vnet_pre); +void tap_handler(struct ctx *c, uint32_t events, struct timespec *now); +void tap_sock_init(struct ctx *c); diff --git a/tcp.c b/tcp.c index d650166..dec2df3 100644 --- a/tcp.c +++ b/tcp.c @@ -1,20 +1,23 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * tcp.c - TCP L2-L4 translation state machine * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio - * */ /** * DOC: Theory of Operation * * - * Overview - * -------- + * PASST mode + * ========== * * This implementation maps TCP traffic between a single L2 interface (tap) and * native TCP (L4) sockets, mimicking and reproducing as closely as possible the @@ -22,7 +25,7 @@ * interface. 
Four connection flows are supported: * - from the local host to the guest behind the tap interface: * - this is the main use case for proxies in service meshes - * - we bind to all unbound local ports, and relay traffic between L4 sockets + * - we bind to configured local ports, and relay traffic between L4 sockets * with local endpoints and the L2 interface * - from remote hosts to the guest behind the tap interface: * - this might be needed for services that need to be addressed directly, @@ -64,7 +67,7 @@ * ------ * * To avoid the need for dynamic memory allocation, a maximum, reasonable amount - * of connections is defined by TCP_MAX_CONNS below (currently 256k, close to + * of connections is defined by MAX_TAP_CONNS below (currently 128k, close to * the maximum amount of file descriptors typically available to a process on * Linux). * @@ -72,8 +75,8 @@ * segments and retransmissions needs to be, thus data needs to linger on * sockets as long as it's not acknowledged by the guest, and read using * MSG_PEEK into a single, preallocated static buffer sized to the maximum - * supported window, 64MiB. This imposes a practical limitation on window - * scaling, that is, the maximum factor is 1024. If a bigger window scaling + * supported window, 16MiB. This imposes a practical limitation on window + * scaling, that is, the maximum factor is 256. If a bigger window scaling * factor is observed during connection establishment, connection is reset and * reestablished by omitting the scaling factor in the SYN segment. This * limitation only applies to the window scaling advertised by the guest, but @@ -84,9 +87,10 @@ * ----- * * To avoid the need for ad-hoc configuration of port forwarding or allowed - * ports, listening sockets are opened and bound to all unbound ports on the + * ports, listening sockets can be opened and bound to all unbound ports on the * host, as far as process capabilities allow.
This service needs to be started - * after any application proxy that needs to bind to local ports. + * after any application proxy that needs to bind to local ports. Mapped ports + * can also be configured explicitly. * * No port translation is needed for connections initiated remotely or by the * local host: source port from socket is reused while establishing connections @@ -100,10 +104,14 @@ * Connection tracking and storage * ------------------------------- * - * Connection are tracked by the @tc array of struct tcp_conn, containing + * Connections are tracked by the @tt array of struct tcp_tap_conn, containing * addresses, ports, TCP states and parameters. This is statically allocated and - * indices are the file descriptor numbers associated to inbound or outbound - * sockets. + * indexed by an arbitrary connection number. The array is compacted whenever a + * connection is closed, by remapping the highest connection index in use to the + * one freed up. + * + * References used for the epoll interface report the connection index used for + * the @tt array. * * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for * separate data structures depending on the protocol version. @@ -120,8 +128,8 @@ * -------------- * * Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for - * IPv4 and IPv6) are opened and bound to wildcard addresses. Some will fail to - * bind (for low ports, or ports already bound, e.g. by a proxy). These are + * IPv4 and IPv6) can be opened and bound to wildcard addresses. Some will fail + * to bind (for low ports, or ports already bound, e.g. by a proxy). These are * added to the epoll list, with no separate storage. 
* * @@ -291,9 +299,31 @@ * set @tcpi_acked_last to tcpi_bytes_acked, set @seq_ack_to_tap * to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and * send ACK to tap + * + * + * PASTA mode + * ========== + * + * For traffic directed to TCP ports configured for mapping to the tuntap device + * in the namespace, and for non-local traffic coming from the tuntap device, + * the implementation is identical to the PASST mode described in the previous + * section. + * + * For local traffic directed to TCP ports configured for direct mapping between + * namespaces, the implementation is substantially simpler: packets are directly + * translated between L4 sockets using a pair of splice() syscalls. These + * connections are tracked in the @ts array of struct tcp_splice_conn, using + * just four states: + * + * - CLOSED: no connection + * - SPLICE_ACCEPTED: accept() on the listening socket succeeded + * - SPLICE_CONNECT: connect() issued in the destination namespace + * - SPLICE_ESTABLISHED: connect() succeeded, packets are transferred */ #define _GNU_SOURCE +#include +#include #include #include #include @@ -313,24 +343,27 @@ #include #include +#include "util.h" #include "passt.h" #include "tap.h" -#include "util.h" #include "siphash.h" -/* Approximately maximum number of open descriptors per process */ -#define MAX_CONNS (1024 * 1024) +#define MAX_TAP_CONNS (128 * 1024) +#define MAX_SPLICE_CONNS (128 * 1024) + +#define PIPE_SIZE (1024 * 1024) #define TCP_HASH_TABLE_LOAD 70 /* % */ -#define TCP_HASH_TABLE_SIZE (MAX_CONNS * 100 / TCP_HASH_TABLE_LOAD) +#define TCP_HASH_TABLE_SIZE (MAX_TAP_CONNS * 100 / \ + TCP_HASH_TABLE_LOAD) -#define MAX_WS 10 +#define MAX_WS 8 #define MAX_WINDOW (1 << (16 + (MAX_WS))) #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #define SYN_TIMEOUT 240000 /* ms */ -#define ACK_TIMEOUT 3000 +#define ACK_TIMEOUT 10000 #define ACK_INTERVAL 50 #define ACT_TIMEOUT 7200000 #define FIN_TIMEOUT 240000 @@ -353,19 +386,25 @@ enum tcp_state {
LAST_ACK, FIN_WAIT_1, FIN_WAIT_1_SOCK_FIN, + SPLICE_ACCEPTED, + SPLICE_CONNECT, + SPLICE_ESTABLISHED, }; -#define TCP_STATE_STR_SIZE (FIN_WAIT_1_SOCK_FIN + 1) +#define TCP_STATE_STR_SIZE (SPLICE_ESTABLISHED + 1) static char *tcp_state_str[TCP_STATE_STR_SIZE] __attribute((__unused__)) = { "CLOSED", "TAP_SYN_SENT", "SOCK_SYN_SENT", "TAP_SYN_RCVD", "ESTABLISHED", "ESTABLISHED_SOCK_FIN", "CLOSE_WAIT", "LAST_ACK", "FIN_WAIT_1", "FIN_WAIT_1_SOCK_FIN", + "SPLICE_ACCEPTED", "SPLICE_CONNECT", "SPLICE_ESTABLISHED", }; #define FIN (1 << 0) #define SYN (1 << 1) #define RST (1 << 2) #define ACK (1 << 4) +/* Flags for internal usage */ +#define ZERO_WINDOW (1 << 5) #define OPT_EOL 0 #define OPT_NOP 1 @@ -377,38 +416,39 @@ static char *tcp_state_str[TCP_STATE_STR_SIZE] __attribute((__unused__)) = { #define OPT_SACK 5 #define OPT_TS 8 -struct tcp_conn; +struct tcp_tap_conn; /** - * struct tcp_conn - Descriptor for a TCP connection + * struct tcp_tap_conn - Descriptor for a TCP connection via tap (not spliced) * @next: Pointer to next item in hash chain, if any * @sock: Socket descriptor number - * @hash_bucket: Bucket index in socket lookup hash table + * @hash_bucket: Bucket index in connection lookup hash table * @a.a6: IPv6 remote address, can be IPv4-mapped * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 * @a.a4.one: Ones prefix for IPv4-mapped * @a.a4.a: IPv4 address * @tap_port: Guest-facing tap port * @sock_port: Remote, socket-facing port - * @s: TCP connection state + * @state: TCP connection state * @seq_to_tap: Next sequence for packets to tap * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap * @seq_init_from_tap: Initial sequence number from tap * @tcpi_acked_last: Most recent value of tcpi_bytes_acked (TCP_INFO query) - * @dup_acks: Count of currently duplicated ACKs from tap * @ws_allowed: Window scaling allowed * @ws: Window 
scaling factor * @tap_window: Last window size received from tap, scaled + * @window_clamped: Window was clamped on socket at least once * @no_snd_wnd: Kernel won't report window (without commit 8f7baad7f035) + * @tcpi_snd_wnd: Most recent value of tcpi_snd_wnd (TCP_INFO query) * @ts_sock: Last activity timestamp from socket for timeout purposes * @ts_tap: Last activity timestamp from tap for timeout purposes * @ts_ack_tap: Last ACK segment timestamp from tap for timeout purposes * @mss_guest: Maximum segment size advertised by guest */ -struct tcp_conn { - struct tcp_conn *next; +struct tcp_tap_conn { + struct tcp_tap_conn *next; int sock; int hash_bucket; @@ -422,7 +462,7 @@ struct tcp_conn { } a; in_port_t tap_port; in_port_t sock_port; - enum tcp_state s; + enum tcp_state state; uint32_t seq_to_tap; uint32_t seq_ack_from_tap; @@ -430,12 +470,13 @@ struct tcp_conn { uint32_t seq_ack_to_tap; uint32_t seq_init_from_tap; uint64_t tcpi_acked_last; - int dup_acks; int ws_allowed; int ws; - int tap_window; + uint32_t tap_window; + int window_clamped; int no_snd_wnd; + uint32_t tcpi_snd_wnd; struct timespec ts_sock; struct timespec ts_tap; @@ -444,48 +485,58 @@ struct tcp_conn { int mss_guest; }; +/** + * struct tcp_splice_conn - Descriptor for a spliced TCP connection + * @from: File descriptor number of socket for accepted connection + * @pipe_from_to: Pipe ends for splice() from @from to @to + * @to: File descriptor number of peer connected socket + * @pipe_to_from: Pipe ends for splice() from @to to @from + * @state: TCP connection state +*/ +struct tcp_splice_conn { + int from; + int pipe_from_to[2]; + int to; + int pipe_to_from[2]; + enum tcp_state state; + int v6; +}; + /* Socket receive buffer */ static char sock_buf[MAX_WINDOW]; -/* Bitmap, activity monitoring needed for connection, indexed by socket */ -static uint8_t tcp_act[MAX_CONNS / 8] = { 0 }; +/* Bitmap, activity monitoring needed for connection via tap */ +static uint8_t tcp_act[MAX_TAP_CONNS / 8]
= { 0 }; -/* TCP connections, indexed by socket */ -static struct tcp_conn tc[MAX_CONNS]; +/* TCP connections */ +static struct tcp_tap_conn tt[MAX_TAP_CONNS]; +static struct tcp_splice_conn ts[MAX_SPLICE_CONNS]; -/* Hash table for socket lookup given remote address, local port, remote port */ -static int tc_hash[TCP_HASH_TABLE_SIZE]; - -static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len); +/* Table for lookup from remote address, local port, remote port */ +static struct tcp_tap_conn *tt_hash[TCP_HASH_TABLE_SIZE]; /** - * tcp_act_set() - Set socket in bitmap for timed events - * @s: Socket file descriptor number - */ -static void tcp_act_set(int s) -{ - tcp_act[s / 8] |= 1 << (s % 8); -} - -/** - * tcp_act_clear() - Clear socket from bitmap for timed events - * @s: Socket file descriptor number - */ -static void tcp_act_clear(int s) -{ - tcp_act[s / 8] &= ~(1 << (s % 8)); -} - -/** - * tcp_set_state() - Set given TCP state for socket, report change to stderr - * @s: Socket file descriptor number + * tcp_tap_state() - Set given TCP state for tap connection, report to stderr + * @conn: Connection pointer * @state: New TCP state to be set */ -static void tcp_set_state(int s, enum tcp_state state) +static void tcp_tap_state(struct tcp_tap_conn *conn, enum tcp_state state) { - debug("TCP: socket %i: %s -> %s", s, - tcp_state_str[tc[s].s], tcp_state_str[state]); - tc[s].s = state; + debug("TCP: socket %i: %s -> %s", + conn->sock, tcp_state_str[conn->state], tcp_state_str[state]); + conn->state = state; +} + +/** + * tcp_splice_state() - Set state for spliced connection, report to stderr + * @conn: Connection pointer + * @state: New TCP state to be set + */ +static void tcp_splice_state(struct tcp_splice_conn *conn, enum tcp_state state) +{ + debug("TCP: index %i: %s -> %s", + conn - ts, tcp_state_str[conn->state], tcp_state_str[state]); + conn->state = state; } /** @@ -547,7 +598,7 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, 
uint8_t __type, } /** - * tcp_sock_hash_match() - Check if a connection entry matches address and ports + * tcp_hash_match() - Check if a connection entry matches address and ports * @conn: Connection entry to match against * @af: Address family, AF_INET or AF_INET6 * @addr: Remote address, pointer to sin_addr or sin6_addr @@ -556,8 +607,8 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t __type, * * Return: 1 on match, 0 otherwise */ -static int tcp_sock_hash_match(struct tcp_conn *conn, int af, void *addr, - in_port_t tap_port, in_port_t sock_port) +static int tcp_hash_match(struct tcp_tap_conn *conn, int af, void *addr, + in_port_t tap_port, in_port_t sock_port) { if (af == AF_INET && IN6_IS_ADDR_V4MAPPED(&conn->a.a6) && !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)) && @@ -573,7 +624,7 @@ static int tcp_sock_hash_match(struct tcp_conn *conn, int af, void *addr, } /** - * tcp_sock_hash() - Calculate hash value for connection given address and ports + * tcp_hash() - Calculate hash value for connection given address and ports * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @addr: Remote address, pointer to sin_addr or sin6_addr @@ -582,8 +633,8 @@ static int tcp_sock_hash_match(struct tcp_conn *conn, int af, void *addr, * * Return: hash value, already modulo size of the hash table */ -static unsigned int tcp_sock_hash(struct ctx *c, int af, void *addr, - in_port_t tap_port, in_port_t sock_port) +static unsigned int tcp_hash(struct ctx *c, int af, void *addr, + in_port_t tap_port, in_port_t sock_port) { uint64_t b = 0; @@ -617,114 +668,172 @@ static unsigned int tcp_sock_hash(struct ctx *c, int af, void *addr, } /** - * tcp_sock_hash_insert() - Insert socket into hash table, chain link if needed + * tcp_hash_insert() - Insert connection into hash table, chain link * @c: Execution context - * @s: File descriptor number for socket + * @conn: Connection pointer * @af: Address family, AF_INET or AF_INET6 * @addr: Remote address, 
pointer to sin_addr or sin6_addr - * @tap_port: tap-facing port - * @sock_port: Socket-facing port */ -static void tcp_sock_hash_insert(struct ctx *c, int s, int af, void *addr, - in_port_t tap_port, in_port_t sock_port) +static void tcp_hash_insert(struct ctx *c, struct tcp_tap_conn *conn, + int af, void *addr) { int b; - b = tcp_sock_hash(c, af, addr, tap_port, sock_port); - tc[s].next = tc_hash[b] ? &tc[tc_hash[b]] : NULL; - tc_hash[b] = tc[s].sock = s; - tc[s].hash_bucket = b; + b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port); + conn->next = tt_hash[b]; + tt_hash[b] = conn; + conn->hash_bucket = b; + + debug("TCP: hash table insert: index %i, sock %i, bucket: %i, next: %p", + conn - tt, conn->sock, b, conn->next); } /** - * tcp_sock_hash_remove() - Drop socket from hash table, chain unlink if needed - * @b: Bucket index - * @s: File descriptor number for socket + * tcp_hash_remove() - Drop connection from hash table, chain unlink + * @conn: Connection pointer */ -static void tcp_sock_hash_remove(int b, int s) +static void tcp_hash_remove(struct tcp_tap_conn *conn) { - struct tcp_conn *conn, *prev = NULL; + struct tcp_tap_conn *entry, *prev = NULL; + int b = conn->hash_bucket; - for (conn = &tc[tc_hash[b]]; conn; prev = conn, conn = conn->next) { - if (conn->sock == s) { - conn->sock = 0; + for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) { + if (entry == conn) { if (prev) prev->next = conn->next; else - tc_hash[b] = conn->next ? conn->next->sock : 0; - return; + tt_hash[b] = conn->next; + break; } } + + debug("TCP: hash table remove: index %i, sock %i, bucket: %i, new: %p", + conn - tt, conn->sock, b, prev ? 
prev->next : tt_hash[b]); } /** - * tcp_sock_hash_lookup() - Look up socket given remote address and ports + * tcp_hash_update() - Update pointer for given connection + * @old: Old connection pointer + * @new: New connection pointer + */ +static void tcp_hash_update(struct tcp_tap_conn *old, struct tcp_tap_conn *new) +{ + struct tcp_tap_conn *entry, *prev = NULL; + int b = old->hash_bucket; + + for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) { + if (entry == old) { + if (prev) + prev->next = new; + else + tt_hash[b] = new; + break; + } + } + + debug("TCP: hash table update: old index %i, new index %i, sock %i, " + "bucket: %i, old: %p, new: %p", + old - tt, new - tt, new->sock, b, old, new); +} + +/** + * tcp_hash_lookup() - Look up connection given remote address and ports * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @addr: Remote address, pointer to sin_addr or sin6_addr * @tap_port: tap-facing port * @sock_port: Socket-facing port * - * Return: file descriptor number for socket, if found, -ENOENT otherwise + * Return: connection pointer, if found, -ENOENT otherwise */ -static int tcp_sock_hash_lookup(struct ctx *c, int af, void *addr, - in_port_t tap_port, in_port_t sock_port) +static struct tcp_tap_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr, + in_port_t tap_port, + in_port_t sock_port) { - struct tcp_conn *conn; - int b; + int b = tcp_hash(c, af, addr, tap_port, sock_port); + struct tcp_tap_conn *conn; - b = tcp_sock_hash(c, af, addr, tap_port, sock_port); - if (!tc_hash[b]) - return -ENOENT; - - for (conn = &tc[tc_hash[b]]; conn; conn = conn->next) { - if (tcp_sock_hash_match(conn, af, addr, tap_port, sock_port)) - return conn->sock; + for (conn = tt_hash[b]; conn; conn = conn->next) { + if (tcp_hash_match(conn, af, addr, tap_port, sock_port)) + return conn; } - return -ENOENT; + return NULL; } /** - * tcp_close_and_epoll_del() - Close, remove socket from hash table and epoll fd + * tcp_table_tap_compact 
- Compaction tap connection table * @c: Execution context - * @s: File descriptor number for socket + * @hole: Pointer to recently closed connection */ -static void tcp_close_and_epoll_del(struct ctx *c, int s) +static void tcp_table_tap_compact(struct ctx *c, struct tcp_tap_conn *hole) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); - tcp_set_state(s, CLOSED); - close(s); - tcp_sock_hash_remove(tc[s].hash_bucket, tc[s].sock); - tcp_act_clear(s); + union epoll_ref ref = { .proto = IPPROTO_TCP, .tcp.index = hole - tt }; + struct tcp_tap_conn *from, *to; + struct epoll_event ev; + + if ((hole - tt) == --c->tcp.tap_conn_count) { + bitmap_clear(tcp_act, hole - tt); + debug("TCP: hash table compaction: index %i (%p) was max index", + hole - tt, hole); + return; + } + + from = &tt[c->tcp.tap_conn_count]; + memcpy(hole, from, sizeof(*hole)); + from->state = CLOSED; + + to = hole; + tcp_hash_update(from, to); + + if (to->state == SOCK_SYN_SENT) + ev.events = EPOLLRDHUP; + else if (to->state == TAP_SYN_SENT) + ev.events = EPOLLOUT | EPOLLRDHUP; + else + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + + ref.tcp.v6 = !IN6_IS_ADDR_V4MAPPED(&to->a.a6); + ref.s = from->sock; + ev.data.u64 = ref.u64; + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, from->sock, &ev); + + debug("TCP: hash table compaction: old index %i, new index %i, " + "sock %i, from: %p, to: %p", + from - tt, to - tt, from->sock, from, to); } /** - * tcp_rst() - Reset a connection: send RST segment to tap, close socket + * tcp_tap_destroy() - Close tap connection, drop from hash table and epoll * @c: Execution context - * @s: File descriptor number for socket + * @conn: Connection pointer */ -static void tcp_rst(struct ctx *c, int s) +static void tcp_tap_destroy(struct ctx *c, struct tcp_tap_conn *conn) { - if (s < 0) + if (conn->state == CLOSED) return; - tcp_send_to_tap(c, s, RST, NULL, 0); - tcp_close_and_epoll_del(c, s); - tcp_set_state(s, CLOSED); + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL); + 
tcp_tap_state(conn, CLOSED); + close(conn->sock); + tcp_hash_remove(conn); + tcp_table_tap_compact(c, conn); } +static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn); + /** * tcp_send_to_tap() - Send segment to tap, with options and values from socket * @c: Execution context - * @s: File descriptor number for socket + * @conn: Connection pointer * @flags: TCP flags to set * @in: Payload buffer * @len: Payload length * * Return: negative error code on connection reset, 0 otherwise */ -static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) +static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, + int flags, char *in, int len) { char buf[USHRT_MAX] = { 0 }, *data; struct tcp_info info = { 0 }; @@ -732,10 +841,18 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) struct tcphdr *th; int ws = 0, err; - if ((err = getsockopt(s, SOL_TCP, TCP_INFO, &info, &sl)) && - !(flags & RST)) { - tcp_rst(c, s); - return err; + if (conn->seq_from_tap == conn->seq_ack_to_tap && !flags && len) { + err = 0; + info.tcpi_bytes_acked = conn->tcpi_acked_last; + info.tcpi_snd_wnd = conn->tcpi_snd_wnd; + } else { + err = getsockopt(conn->sock, SOL_TCP, TCP_INFO, &info, &sl); + if (err && !(flags & RST)) { + tcp_rst(c, conn); + return err; + } + + conn->tcpi_snd_wnd = info.tcpi_snd_wnd; } th = (struct tcphdr *)buf; @@ -753,10 +870,10 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) /* Check if kernel includes commit: * 8f7baad7f035 ("tcp: Add snd_wnd to TCP_INFO") */ - tc[s].no_snd_wnd = !info.tcpi_snd_wnd; + conn->no_snd_wnd = !info.tcpi_snd_wnd; - if (tc[s].ws_allowed && (ws = info.tcpi_snd_wscale) && - !tc[s].no_snd_wnd) { + if (conn->ws_allowed && (ws = info.tcpi_snd_wscale) && + !conn->no_snd_wnd) { *data++ = OPT_NOP; *data++ = OPT_WS; @@ -767,30 +884,27 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) } /* RFC 793, 3.1: "[...] 
and the first data octet is ISN+1." */ - th->seq = htonl(tc[s].seq_to_tap++); + th->seq = htonl(conn->seq_to_tap++); } else { - th->seq = htonl(tc[s].seq_to_tap); - tc[s].seq_to_tap += len; + th->seq = htonl(conn->seq_to_tap); + conn->seq_to_tap += len; } - if (!err && ((info.tcpi_bytes_acked > tc[s].tcpi_acked_last) || + if (!err && ((info.tcpi_bytes_acked > conn->tcpi_acked_last) || (flags & ACK) || len)) { - uint64_t ack_seq; - th->ack = 1; - ack_seq = info.tcpi_bytes_acked + tc[s].seq_init_from_tap; + conn->seq_ack_to_tap = info.tcpi_bytes_acked + + conn->seq_init_from_tap; - tc[s].seq_ack_to_tap = ack_seq & (uint32_t)~0U; - - if (tc[s].s == LAST_ACK) { - tc[s].seq_ack_to_tap = tc[s].seq_from_tap + 1; + if (conn->state == LAST_ACK) { + conn->seq_ack_to_tap = conn->seq_from_tap + 1; th->seq = htonl(ntohl(th->seq) + 1); } - th->ack_seq = htonl(tc[s].seq_ack_to_tap); + th->ack_seq = htonl(conn->seq_ack_to_tap); - tc[s].tcpi_acked_last = info.tcpi_bytes_acked; + conn->tcpi_acked_last = info.tcpi_bytes_acked; } else { if (!len && !flags) return 0; @@ -802,10 +916,12 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - th->source = tc[s].sock_port; - th->dest = tc[s].tap_port; + th->source = htons(conn->sock_port); + th->dest = htons(conn->tap_port); - if (!err && !tc[s].no_snd_wnd) { + if (flags & ZERO_WINDOW) { + th->window = 0; + } else if (!err && !conn->no_snd_wnd) { /* First value sent by receiver is not scaled */ th->window = htons(info.tcpi_snd_wnd >> (th->syn ? 
0 : info.tcpi_snd_wscale)); @@ -818,34 +934,58 @@ static int tcp_send_to_tap(struct ctx *c, int s, int flags, char *in, int len) memcpy(data, in, len); - tap_ip_send(c, &tc[s].a.a6, IPPROTO_TCP, buf, th->doff * 4 + len); + tap_ip_send(c, &conn->a.a6, IPPROTO_TCP, buf, th->doff * 4 + len); return 0; } +/** + * tcp_rst() - Reset a tap connection: send RST segment to tap, close socket + * @c: Execution context + * @conn: Connection pointer + */ +static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn) +{ + if (conn->state == CLOSED) + return; + + tcp_send_to_tap(c, conn, RST, NULL, 0); + tcp_tap_destroy(c, conn); +} + /** * tcp_clamp_window() - Set window and scaling from option, clamp on socket - * @s: File descriptor number for socket + * @conn: Connection pointer * @th: TCP header, from tap * @len: Buffer length, at L4 * @init: Set if this is the very first segment from tap */ -static void tcp_clamp_window(int s, struct tcphdr *th, int len, int init) +static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th, + int len, int init) { if (init) { - tc[s].ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); - tc[s].ws_allowed = tc[s].ws >= 0 && tc[s].ws <= MAX_WS; - tc[s].ws *= tc[s].ws_allowed; + conn->ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); + conn->ws_allowed = conn->ws >= 0 && conn->ws <= MAX_WS; + conn->ws *= conn->ws_allowed; /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp * yet, to avoid getting a zero scale just because we set a * small window now. 
*/ - tc[s].tap_window = ntohs(th->window); + conn->tap_window = ntohs(th->window); + conn->window_clamped = 0; } else { - tc[s].tap_window = ntohs(th->window) << tc[s].ws; - setsockopt(s, SOL_TCP, TCP_WINDOW_CLAMP, - &tc[s].tap_window, sizeof(tc[s].tap_window)); + unsigned int window = ntohs(th->window) << conn->ws; + + if (conn->tap_window == window && conn->window_clamped) + return; + + conn->tap_window = window; + if (window < 256) + window = 256; + setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, + &window, sizeof(window)); + conn->window_clamped = 1; } } @@ -925,283 +1065,277 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, .sin6_port = th->dest, .sin6_addr = *(struct in6_addr *)addr, }; - struct epoll_event ev = { 0 }; + struct epoll_event ev = { .events = EPOLLIN | EPOLLET | EPOLLRDHUP }; + union epoll_ref ref = { .proto = IPPROTO_TCP }; const struct sockaddr *sa; + struct tcp_tap_conn *conn; socklen_t sl; int s; - s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (c->tcp.tap_conn_count >= MAX_TAP_CONNS) + return; + + ref.s = s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); if (s < 0) return; - if (s >= MAX_CONNS) { - close(s); - return; - } + conn = &tt[c->tcp.tap_conn_count++]; + conn->sock = s; - tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); - if (tc[s].mss_guest < 0) - tc[s].mss_guest = MSS_DEFAULT; - sl = sizeof(tc[s].mss_guest); - setsockopt(s, SOL_TCP, TCP_MAXSEG, &tc[s].mss_guest, sl); + conn->mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); + if (conn->mss_guest < 0) + conn->mss_guest = MSS_DEFAULT; + sl = sizeof(conn->mss_guest); + setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl); - tcp_clamp_window(s, th, len, 1); + tcp_clamp_window(conn, th, len, 1); if (af == AF_INET) { sa = (struct sockaddr *)&addr4; sl = sizeof(addr4); - memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); - memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); - memcpy(&tc[s].a.a4.a, addr, 
sizeof(tc[s].a.a4.a)); + memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero)); + memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one)); + memcpy(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)); } else { sa = (struct sockaddr *)&addr6; sl = sizeof(addr6); - memcpy(&tc[s].a.a6, addr, sizeof(tc[s].a.a6)); + memcpy(&conn->a.a6, addr, sizeof(conn->a.a6)); } - tc[s].sock_port = th->dest; - tc[s].tap_port = th->source; + conn->sock_port = ntohs(th->dest); + conn->tap_port = ntohs(th->source); - tc[s].ts_sock = tc[s].ts_tap = tc[s].ts_ack_tap = *now; + conn->ts_sock = conn->ts_tap = conn->ts_ack_tap = *now; - tcp_act_set(s); + bitmap_set(tcp_act, conn - tt); - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLHUP; - ev.data.fd = s; + conn->seq_init_from_tap = ntohl(th->seq); + conn->seq_from_tap = conn->seq_init_from_tap + 1; + conn->seq_ack_to_tap = conn->seq_from_tap; - tc[s].seq_init_from_tap = ntohl(th->seq); - tc[s].seq_from_tap = tc[s].seq_init_from_tap + 1; - tc[s].seq_ack_to_tap = tc[s].seq_from_tap; + conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now); + conn->seq_ack_from_tap = conn->seq_to_tap + 1; - tc[s].seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now); - tc[s].seq_ack_from_tap = tc[s].seq_to_tap + 1; - - tcp_sock_hash_insert(c, s, af, addr, th->source, th->dest); + tcp_hash_insert(c, conn, af, addr); if (connect(s, sa, sl)) { + tcp_tap_state(conn, TAP_SYN_SENT); + if (errno != EINPROGRESS) { - tcp_rst(c, s); + tcp_rst(c, conn); return; } - ev.events |= EPOLLOUT; - tcp_set_state(s, TAP_SYN_SENT); + ev.events = EPOLLOUT | EPOLLRDHUP; } else { - if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0)) - return; + tcp_tap_state(conn, TAP_SYN_RCVD); - tcp_set_state(s, TAP_SYN_RCVD); + if (tcp_send_to_tap(c, conn, SYN | ACK, NULL, 0)) + return; } + ref.tcp.index = conn - tt; + ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); } /** - * tcp_conn_from_sock() - Handle new connection request from listening socket + * 
tcp_table_splice_compact - Compact spliced connection table * @c: Execution context - * @fd: File descriptor number for listening socket - * @now: Current timestamp + * @hole: Pointer to recently closed connection */ -static void tcp_conn_from_sock(struct ctx *c, int fd, struct timespec *now) +static void tcp_table_splice_compact(struct ctx *c, + struct tcp_splice_conn *hole) { - struct sockaddr_storage sa_r, sa_l; - socklen_t sa_len = sizeof(sa_l); - struct epoll_event ev = { 0 }; - int s; + union epoll_ref ref_from = { .proto = IPPROTO_TCP, + .tcp.index = hole - ts }; + union epoll_ref ref_to = { .proto = IPPROTO_TCP, + .tcp.index = hole - ts }; + struct tcp_splice_conn *move; + struct epoll_event ev_from; + struct epoll_event ev_to; - if (getsockname(fd, (struct sockaddr *)&sa_l, &sa_len)) + if ((hole - ts) == --c->tcp.splice_conn_count) return; - s = accept4(fd, (struct sockaddr *)&sa_r, &sa_len, SOCK_NONBLOCK); - if (s == -1) - return; + move = &ts[c->tcp.splice_conn_count]; + memcpy(hole, move, sizeof(*hole)); + move->state = CLOSED; + move = hole; - if (s >= MAX_CONNS) { - close(s); - return; + ref_from.s = move->from; + ref_from.tcp.v6 = move->v6; + ref_to.s = move->to; + ref_to.tcp.v6 = move->v6; + + if (move->state == SPLICE_ACCEPTED) { + ev_from.events = ev_to.events = 0; + } else if (move->state == SPLICE_CONNECT) { + ev_from.events = EPOLLET | EPOLLRDHUP; + ev_to.events = EPOLLET | EPOLLOUT | EPOLLRDHUP; + } else { + ev_from.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLRDHUP; + ev_to.events = EPOLLET | EPOLLIN | EPOLLOUT | EPOLLRDHUP; } - CHECK_SET_MIN_MAX(c->tcp.fd_, s); - CHECK_SET_MIN_MAX(c->tcp.fd_conn_, s); + ev_from.data.u64 = ref_from.u64; + ev_to.data.u64 = ref_to.u64; - if (sa_l.ss_family == AF_INET) { - struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa_r; + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->from, &ev_from); + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->to, &ev_to); +} - memset(&tc[s].a.a4.zero, 0, sizeof(tc[s].a.a4.zero)); - 
memset(&tc[s].a.a4.one, 0xff, sizeof(tc[s].a.a4.one)); - - if (ntohl(sa4->sin_addr.s_addr) == INADDR_LOOPBACK || - ntohl(sa4->sin_addr.s_addr) == INADDR_ANY) - sa4->sin_addr.s_addr = c->gw4; - - memcpy(&tc[s].a.a4.a, &sa4->sin_addr, sizeof(tc[s].a.a4.a)); - - tc[s].sock_port = sa4->sin_port; - tc[s].tap_port = ((struct sockaddr_in *)&sa_l)->sin_port; - - tc[s].seq_to_tap = tcp_seq_init(c, AF_INET, &sa4->sin_addr, - tc[s].sock_port, - tc[s].tap_port, - now); - - tcp_sock_hash_insert(c, s, AF_INET, &sa4->sin_addr, - tc[s].tap_port, tc[s].sock_port); - } else if (sa_l.ss_family == AF_INET6) { - struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)&sa_r; - - if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) - memcpy(&sa6->sin6_addr, &c->gw6, sizeof(c->gw6)); - - memcpy(&tc[s].a.a6, &sa6->sin6_addr, sizeof(tc[s].a.a6)); - - tc[s].sock_port = sa6->sin6_port; - tc[s].tap_port = ((struct sockaddr_in6 *)&sa_l)->sin6_port; - - tc[s].seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6->sin6_addr, - tc[s].sock_port, - tc[s].tap_port, - now); - - tcp_sock_hash_insert(c, s, AF_INET6, &sa6->sin6_addr, - tc[s].tap_port, tc[s].sock_port); +/** + * tcp_tap_destroy() - Close spliced connection and pipes, drop from epoll + * @c: Execution context + * @conn: Connection pointer + */ +static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) +{ + switch (conn->state) { + case SPLICE_ESTABLISHED: + if (conn->pipe_from_to[0] != -1) { + close(conn->pipe_from_to[0]); + close(conn->pipe_from_to[1]); + } + if (conn->pipe_to_from[0] != -1) { + close(conn->pipe_to_from[0]); + close(conn->pipe_to_from[1]); + } + /* Falls through */ + case SPLICE_CONNECT: + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->from, NULL); + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->to, NULL); + close(conn->to); + /* Falls through */ + case SPLICE_ACCEPTED: + close(conn->from); + tcp_splice_state(conn, CLOSED); + tcp_table_splice_compact(c, conn); + return; + default: + return; } - - tc[s].seq_ack_from_tap = 
tc[s].seq_to_tap + 1; - - tc[s].tap_window = WINDOW_DEFAULT; - tc[s].ws_allowed = 1; - - tc[s].ts_sock = tc[s].ts_tap = tc[s].ts_ack_tap = *now; - - tcp_act_set(s); - - ev.events = EPOLLRDHUP | EPOLLHUP; - ev.data.fd = s; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); - - tcp_set_state(s, SOCK_SYN_SENT); - tcp_send_to_tap(c, s, SYN, NULL, 0); } /** * tcp_send_to_sock() - Send buffer to socket, update timestamp and sequence * @c: Execution context - * @s: File descriptor number for socket + * @conn: Connection pointer * @data: Data buffer * @len: Length at L4 * @extra_flags: Additional flags for send(), if any * * Return: negative on socket error with connection reset, 0 otherwise */ -static int tcp_send_to_sock(struct ctx *c, int s, char *data, int len, - int extra_flags) +static int tcp_send_to_sock(struct ctx *c, struct tcp_tap_conn *conn, + char *data, int len, int extra_flags) { - int err = send(s, data, len, MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags); + int err = send(conn->sock, data, len, + MSG_DONTWAIT | MSG_NOSIGNAL | extra_flags); if (err < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK) { - /* If we can't queue right now, do nothing, sender has - * to retransmit. 
- */ - return 0; + tcp_send_to_tap(c, conn, ZERO_WINDOW, NULL, 0); + return err; } err = errno; - tcp_rst(c, s); + tcp_rst(c, conn); return -err; } - tc[s].seq_from_tap += len; - - return 0; -} - -/** - * tcp_is_dupack() - Check if given ACK number is duplicated, update counter - * @s: File descriptor number for socket - * @ack_seq: ACK sequence, host order - * - * Return: -EAGAIN on duplicated ACKs observed, with counter reset, 0 otherwise - */ -static int tcp_is_dupack(int s, uint32_t ack_seq) -{ - if (ack_seq == tc[s].seq_ack_from_tap && ++tc[s].dup_acks == 2) { - tc[s].dup_acks = 0; - return -EAGAIN; - } + conn->seq_from_tap += err; return 0; } /** * tcp_sock_consume() - Consume (discard) data from buffer, update ACK sequence - * @s: File descriptor number for socket + * @conn: Connection pointer * @ack_seq: ACK sequence, host order */ -static void tcp_sock_consume(int s, uint32_t ack_seq) +static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq) { - int to_ack; + uint32_t to_ack; /* Implicitly take care of wrap-arounds */ - to_ack = ack_seq - tc[s].seq_ack_from_tap; + to_ack = ack_seq - conn->seq_ack_from_tap; /* Simply ignore out-of-order ACKs: we already consumed the data we * needed from the buffer, and we won't rewind back to a lower ACK * sequence. 
*/ - if (to_ack < 0) + if (to_ack > MAX_WINDOW) return; - recv(s, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC); + if (to_ack) + recv(conn->sock, NULL, to_ack, MSG_DONTWAIT | MSG_TRUNC); - tc[s].seq_ack_from_tap = ack_seq; + conn->seq_ack_from_tap = ack_seq; } /** * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window * @c: Execution context - * @s: File descriptor number for socket + * @conn: Connection pointer * @now: Current timestamp * * Return: negative on connection reset, 1 on pending data, 0 otherwise */ -static int tcp_data_from_sock(struct ctx *c, int s, struct timespec *now) +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, + struct timespec *now) { - int len, err, offset, left, send; + uint32_t offset = conn->seq_to_tap - conn->seq_ack_from_tap; + int len, err, left, send, s = conn->sock; + + if (!conn->tap_window || offset >= conn->tap_window) + return 1; + + len = recv(s, sock_buf, + /* TODO: Drop 64KiB limit (needed for responsiveness) once + * tap-side coalescing and zero-copy are fully implemented. 
+ */ + MIN(64 * 1024, conn->tap_window), + /* Don't dequeue until acknowledged by guest */ + MSG_DONTWAIT | MSG_PEEK); - /* Don't dequeue until acknowledged by guest */ - len = recv(s, sock_buf, sizeof(sock_buf), MSG_DONTWAIT | MSG_PEEK); if (len < 0) { if (errno != EAGAIN && errno != EWOULDBLOCK) { - tcp_rst(c, s); + tcp_rst(c, conn); return -errno; } return 0; } if (len == 0) { - if (tc[s].s >= ESTABLISHED_SOCK_FIN) + if (conn->state >= ESTABLISHED_SOCK_FIN) return 0; - tcp_set_state(s, ESTABLISHED_SOCK_FIN); - if ((err = tcp_send_to_tap(c, s, FIN | ACK, NULL, 0))) + tcp_tap_state(conn, ESTABLISHED_SOCK_FIN); + if ((err = tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0))) return err; left = 0; goto out; } - offset = tc[s].seq_to_tap - tc[s].seq_ack_from_tap; left = len - offset; - while (left && offset + tc[s].mss_guest <= tc[s].tap_window) { - if (left < tc[s].mss_guest) + while (left && (offset + conn->mss_guest <= conn->tap_window)) { + if (left < conn->mss_guest) send = left; else - send = tc[s].mss_guest; + send = conn->mss_guest; - if ((err = tcp_send_to_tap(c, s, 0, sock_buf + offset, send))) + if (offset + send > MAX_WINDOW) { + tcp_rst(c, conn); + return -EIO; + } + + err = tcp_send_to_tap(c, conn, 0, sock_buf + offset, send); + if (err) return err; offset += send; @@ -1209,7 +1343,7 @@ static int tcp_data_from_sock(struct ctx *c, int s, struct timespec *now) } out: - tc[s].ts_sock = *now; + conn->ts_sock = *now; return !!left; } @@ -1218,6 +1352,7 @@ out: * tcp_tap_handler() - Handle packets from tap and state transitions * @c: Execution context * @af: Address family, AF_INET or AF_INET6 + * @addr: Destination address * @msg: Input messages * @count: Message count * @now: Current timestamp @@ -1227,15 +1362,19 @@ out: int tcp_tap_handler(struct ctx *c, int af, void *addr, struct tap_msg *msg, int count, struct timespec *now) { + union epoll_ref ref = { .proto = IPPROTO_TCP, + .tcp.v6 = ( af == AF_INET6 ) }; + /* TODO: Implement message batching for TCP */ 
struct tcphdr *th = (struct tcphdr *)msg[0].l4h; - struct epoll_event ev = { 0 }; size_t len = msg[0].l4_len; + struct tcp_tap_conn *conn; + struct epoll_event ev; size_t off, skip = 0; - int s, ws; + int ws, i; - (void)count; + uint32_t __seq_max; if (len < sizeof(*th)) return 1; @@ -1244,146 +1383,178 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, if (off < sizeof(*th) || off > len) return 1; - if ((s = tcp_sock_hash_lookup(c, af, addr, th->source, th->dest)) < 0) { + conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest)); + if (!conn) { if (th->syn) tcp_conn_from_tap(c, af, addr, th, len, now); return 1; } + /* TODO: Partial ACK coalescing, merge with message coalescing */ + for (i = 0; conn->state == ESTABLISHED && i < count; i++) { + struct tcphdr *__th = (struct tcphdr *)msg[i].l4h; + size_t __len = msg[i].l4_len; + uint32_t __this; + + if (__len < sizeof(*th)) + break; + + off = __th->doff * 4; + if (off < sizeof(*th) || off > __len) + break; + + __this = ntohl(th->ack_seq); + + if (!i || __this - __seq_max < MAX_WINDOW) + __seq_max = __this; + + if ((!th->ack || len != off) && i) { + tcp_sock_consume(conn, __seq_max); + conn->ts_tap = *now; + return i; + } + } + if (th->rst) { - tcp_close_and_epoll_del(c, s); + tcp_tap_destroy(c, conn); return 1; } - tcp_clamp_window(s, th, len, th->syn && th->ack); + tcp_clamp_window(conn, th, len, th->syn && th->ack); - tc[s].ts_tap = *now; + conn->ts_tap = *now; - if (ntohl(th->seq) < tc[s].seq_from_tap) - skip = tc[s].seq_from_tap - ntohl(th->seq); + if (ntohl(th->seq) < conn->seq_from_tap && + conn->seq_from_tap - ntohl(th->seq) < MAX_WINDOW) { + skip = conn->seq_from_tap - ntohl(th->seq); + } - switch (tc[s].s) { + switch (conn->state) { case SOCK_SYN_SENT: if (!th->syn || !th->ack) { - tcp_rst(c, s); + tcp_rst(c, conn); return 1; } - tc[s].mss_guest = tcp_opt_get(th, len, OPT_MSS, NULL, NULL); - if (tc[s].mss_guest < 0) - tc[s].mss_guest = MSS_DEFAULT; + conn->mss_guest = tcp_opt_get(th, 
len, OPT_MSS, NULL, NULL); + if (conn->mss_guest < 0) + conn->mss_guest = MSS_DEFAULT; ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); if (ws > MAX_WS) { - if (tcp_send_to_tap(c, s, RST, NULL, 0)) + if (tcp_send_to_tap(c, conn, RST, NULL, 0)) return 1; - tc[s].seq_to_tap = 0; - tc[s].ws_allowed = 0; - tcp_send_to_tap(c, s, SYN, NULL, 0); + conn->seq_to_tap = 0; + conn->ws_allowed = 0; + tcp_send_to_tap(c, conn, SYN, NULL, 0); return 1; } /* info.tcpi_bytes_acked already includes one byte for SYN, but * not for incoming connections. */ - tc[s].seq_init_from_tap = ntohl(th->seq) + 1; - tc[s].seq_from_tap = tc[s].seq_init_from_tap; - tc[s].seq_ack_to_tap = tc[s].seq_from_tap; + conn->seq_init_from_tap = ntohl(th->seq) + 1; + conn->seq_from_tap = conn->seq_init_from_tap; + conn->seq_ack_to_tap = conn->seq_from_tap; - tcp_set_state(s, ESTABLISHED); - tcp_send_to_tap(c, s, ACK, NULL, 0); + tcp_tap_state(conn, ESTABLISHED); + tcp_send_to_tap(c, conn, ACK, NULL, 0); /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. 
*/ - tcp_data_from_sock(c, s, now); - - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP | EPOLLHUP; - ev.data.fd = s; - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev); + tcp_data_from_sock(c, conn, now); + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + ref.s = conn->sock; + ref.tcp.index = conn - tt; + ev.data.u64 = ref.u64; + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->sock, &ev); break; case TAP_SYN_RCVD: if (th->fin) { - shutdown(s, SHUT_WR); - tcp_set_state(s, FIN_WAIT_1); + shutdown(conn->sock, SHUT_WR); + tcp_tap_state(conn, FIN_WAIT_1); break; } if (!th->ack) { - tcp_rst(c, s); + tcp_rst(c, conn); return 1; } - tcp_set_state(s, ESTABLISHED); + tcp_tap_state(conn, ESTABLISHED); break; case ESTABLISHED: case ESTABLISHED_SOCK_FIN: - tc[s].ts_ack_tap = *now; + conn->ts_ack_tap = *now; - if (ntohl(th->seq) > tc[s].seq_from_tap) { - tc[s].seq_from_tap = tc[s].seq_ack_to_tap; - tcp_send_to_tap(c, s, ACK, NULL, 0); - break; + if (ntohl(th->ack_seq) > conn->seq_to_tap && + (conn->seq_to_tap - ntohl(th->ack_seq)) > MAX_WINDOW) { + return count; } if (th->ack) { - int retrans = 0; + tcp_sock_consume(conn, ntohl(th->ack_seq)); - if (len == off) - retrans = tcp_is_dupack(s, ntohl(th->ack_seq)); - - tcp_sock_consume(s, ntohl(th->ack_seq)); - - if (retrans) - tc[s].seq_to_tap = tc[s].seq_ack_from_tap; - - if (tc[s].s == ESTABLISHED_SOCK_FIN) { - if (!tcp_data_from_sock(c, s, now)) - tcp_set_state(s, CLOSE_WAIT); + if (conn->state == ESTABLISHED_SOCK_FIN) { + if (!tcp_data_from_sock(c, conn, now)) + tcp_tap_state(conn, CLOSE_WAIT); + } else { + tcp_data_from_sock(c, conn, now); } } + if (ntohl(th->seq) > conn->seq_from_tap) { + tcp_send_to_tap(c, conn, ACK, NULL, 0); + tcp_send_to_tap(c, conn, ACK, NULL, 0); + return count; + } + if (skip < len - off && - tcp_send_to_sock(c, s, + tcp_send_to_sock(c, conn, msg[0].l4h + off + skip, len - off - skip, th->psh ? 
0 : MSG_MORE)) - break; + return 1; - tcp_data_from_sock(c, s, now); + if (count == 1) + tcp_send_to_tap(c, conn, ACK, NULL, 0); if (th->fin) { - shutdown(s, SHUT_WR); - if (tc[s].s == ESTABLISHED) - tcp_set_state(s, FIN_WAIT_1); + shutdown(conn->sock, SHUT_WR); + if (conn->state == ESTABLISHED) + tcp_tap_state(conn, FIN_WAIT_1); else - tcp_set_state(s, LAST_ACK); + tcp_tap_state(conn, LAST_ACK); } break; case CLOSE_WAIT: - tcp_sock_consume(s, ntohl(th->ack_seq)); + tcp_sock_consume(conn, ntohl(th->ack_seq)); - if (skip < len - off && - tcp_send_to_sock(c, s, + if (skip < (len - off) && + tcp_send_to_sock(c, conn, msg[0].l4h + off + skip, len - off - skip, th->psh ? 0 : MSG_MORE)) break; if (th->fin) { - shutdown(s, SHUT_WR); - tcp_set_state(s, LAST_ACK); + shutdown(conn->sock, SHUT_WR); + tcp_tap_state(conn, LAST_ACK); } break; case FIN_WAIT_1_SOCK_FIN: if (th->ack) - tcp_close_and_epoll_del(c, s); + tcp_tap_destroy(c, conn); break; case FIN_WAIT_1: case TAP_SYN_SENT: case LAST_ACK: + case SPLICE_ACCEPTED: + case SPLICE_CONNECT: + case SPLICE_ESTABLISHED: case CLOSED: /* ;) */ break; } @@ -1395,105 +1566,537 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, * tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event * @c: Execution context * @s: File descriptor number for socket + * @ref: epoll reference */ -static void tcp_connect_finish(struct ctx *c, int s) +static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn, + union epoll_ref ref) { - struct epoll_event ev = { 0 }; + struct epoll_event ev; socklen_t sl; int so; sl = sizeof(so); - if (getsockopt(s, SOL_SOCKET, SO_ERROR, &so, &sl) || so) { - tcp_rst(c, s); + if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR, &so, &sl) || so) { + tcp_rst(c, conn); return; } - if (tcp_send_to_tap(c, s, SYN | ACK, NULL, 0)) + if (tcp_send_to_tap(c, conn, SYN | ACK, NULL, 0)) return; /* Drop EPOLLOUT, only used to wait for connect() to complete */ - ev.events = EPOLLIN | EPOLLET | 
EPOLLRDHUP | EPOLLHUP; - ev.data.fd = s; - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, s, &ev); + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + ev.data.u64 = ref.u64; + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->sock, &ev); - tcp_set_state(s, TAP_SYN_RCVD); + tcp_tap_state(conn, TAP_SYN_RCVD); +} + +/** + * tcp_splice_connect_finish() - Completion of connect() or call on success + * @c: Execution context + * @conn: Connection pointer + * @v6: Set on IPv6 connection + */ +static void tcp_splice_connect_finish(struct ctx *c, + struct tcp_splice_conn *conn, int v6) +{ + union epoll_ref ref_from = { .proto = IPPROTO_TCP, .s = conn->from, + .tcp = { .splice = 1, .v6 = v6, + .index = conn - ts } }; + union epoll_ref ref_to = { .proto = IPPROTO_TCP, .s = conn->to, + .tcp = { .splice = 1, .v6 = v6, + .index = conn - ts } }; + struct epoll_event ev_from, ev_to; + + if (conn->state == SPLICE_CONNECT) { + socklen_t sl; + int so; + + sl = sizeof(so); + if (getsockopt(conn->to, SOL_SOCKET, SO_ERROR, &so, &sl) || + so) { + tcp_splice_destroy(c, conn); + return; + } + + tcp_splice_state(conn, SPLICE_ESTABLISHED); + + ev_from.events = ev_to.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + ev_from.data.u64 = ref_from.u64; + ev_to.data.u64 = ref_to.u64; + + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->from, &ev_from); + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->to, &ev_to); + } + + conn->pipe_from_to[0] = conn->pipe_to_from[0] = -1; + if (pipe2(conn->pipe_to_from, O_NONBLOCK) || + pipe2(conn->pipe_from_to, O_NONBLOCK)) { + tcp_splice_destroy(c, conn); + return; + } + + fcntl(conn->pipe_from_to[0], F_SETPIPE_SZ, PIPE_SIZE); + fcntl(conn->pipe_to_from[0], F_SETPIPE_SZ, PIPE_SIZE); +} + +/** + * tcp_splice_connect() - Create and connect socket for new spliced connection + * @c: Execution context + * @conn: Connection pointer + * @v6: Set on IPv6 connection + * @port: Destination port, host order + * + * Return: 0 for connect() succeeded or in progress, negative value on error + */ +static int 
tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn, + int v6, in_port_t port) +{ + int sock_conn = socket(v6 ? AF_INET6 : AF_INET, + SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + union epoll_ref ref_accept = { .proto = IPPROTO_TCP, .s = conn->from, + .tcp = { .splice = 1, .v6 = v6, + .index = conn - ts } }; + union epoll_ref ref_conn = { .proto = IPPROTO_TCP, .s = sock_conn, + .tcp = { .splice = 1, .v6 = v6, + .index = conn - ts } }; + struct epoll_event ev_accept = { .events = EPOLLRDHUP | EPOLLET, + .data.u64 = ref_accept.u64 }; + struct epoll_event ev_conn = { .events = EPOLLRDHUP | EPOLLET, + .data.u64 = ref_conn.u64 }; + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = htons(port), + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = htons(port), + .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) }, + }; + const struct sockaddr *sa; + int ret, one = 1; + socklen_t sl; + + if (sock_conn < 0) + return -errno; + + conn->to = sock_conn; + + setsockopt(conn->from, SOL_TCP, TCP_CORK, &one, sizeof(one)); + setsockopt(conn->from, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); + setsockopt(conn->to, SOL_TCP, TCP_CORK, &one, sizeof(one)); + setsockopt(conn->to, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); + + if (v6) { + sa = (struct sockaddr *)&addr6; + sl = sizeof(addr6); + } else { + sa = (struct sockaddr *)&addr4; + sl = sizeof(addr4); + } + + if (connect(conn->to, sa, sl)) { + if (errno != EINPROGRESS) { + ret = -errno; + close(sock_conn); + return ret; + } + + tcp_splice_state(conn, SPLICE_CONNECT); + ev_conn.events |= EPOLLOUT; + } else { + tcp_splice_state(conn, SPLICE_ESTABLISHED); + tcp_splice_connect_finish(c, conn, v6); + + ev_conn.events |= EPOLLIN; + ev_accept.events |= EPOLLIN; + } + + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->from, &ev_accept); + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->to, &ev_conn); + + return 0; +} + +/** + * struct tcp_splice_connect_ns_arg - 
Arguments for tcp_splice_connect_ns() + * @c: Execution context + * @conn: Accepted inbound connection + * @v6: Set for inbound IPv6 connection + * @port: Destination port, host order + * @ret: Return value of tcp_splice_connect_ns() + */ +struct tcp_splice_connect_ns_arg { + struct ctx *c; + struct tcp_splice_conn *conn; + int v6; + in_port_t port; + int ret; +}; + +/** + * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect() + * @arg: See struct tcp_splice_connect_ns_arg + * + * Return: 0 + */ +static int tcp_splice_connect_ns(void *arg) +{ + struct tcp_splice_connect_ns_arg *a; + + a = (struct tcp_splice_connect_ns_arg *)arg; + ns_enter(a->c->pasta_pid); + a->ret = tcp_splice_connect(a->c, a->conn, a->v6, a->port); + return 0; +} + +/** + * tcp_splice_new() - Handle new inbound, spliced connection + * @c: Execution context + * @conn: Connection pointer + * @v6: Set for IPv6 connection + * @port: Destination port, host order + * + * Return: return code from connect() + */ +static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn, + int v6, in_port_t port) +{ + struct tcp_splice_connect_ns_arg ns_arg = { c, conn, v6, port, 0 }; + char ns_fn_stack[NS_FN_STACK_SIZE]; + + if (bitmap_isset(c->tcp.port_to_ns, port)) { + clone(tcp_splice_connect_ns, + ns_fn_stack + sizeof(ns_fn_stack) / 2, + CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, + (void *)&ns_arg); + + return ns_arg.ret; + } + + return tcp_splice_connect(c, conn, v6, port); +} + +/** + * tcp_conn_from_sock() - Handle new connection request from listening socket + * @c: Execution context + * @ref: epoll reference of listening socket + * @now: Current timestamp + */ +static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, + struct timespec *now) +{ + union epoll_ref ref_conn = { .proto = IPPROTO_TCP, + .tcp.v6 = ref.tcp.v6 }; + struct sockaddr_storage sa; + struct tcp_tap_conn *conn; + struct epoll_event ev; + socklen_t sa_len; + int s; + + if (c->tcp.tap_conn_count >= 
MAX_TAP_CONNS) + return; + + sa_len = sizeof(sa); + s = accept4(ref.s, (struct sockaddr *)&sa, &sa_len, SOCK_NONBLOCK); + if (s < 0) + return; + + conn = &tt[c->tcp.tap_conn_count++]; + ref_conn.tcp.index = conn - tt; + ref_conn.s = conn->sock = s; + + if (ref.tcp.v6) { + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)&sa; + + if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) + memcpy(&sa6->sin6_addr, &c->gw6, sizeof(c->gw6)); + + memcpy(&conn->a.a6, &sa6->sin6_addr, sizeof(conn->a.a6)); + + conn->sock_port = ntohs(sa6->sin6_port); + conn->tap_port = ref.tcp.index; + + conn->seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6->sin6_addr, + conn->sock_port, + conn->tap_port, + now); + + tcp_hash_insert(c, conn, AF_INET6, &sa6->sin6_addr); + } else { + struct sockaddr_in *sa4 = (struct sockaddr_in *)&sa; + + memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero)); + memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one)); + + if (ntohl(sa4->sin_addr.s_addr) == INADDR_LOOPBACK || + ntohl(sa4->sin_addr.s_addr) == INADDR_ANY) + sa4->sin_addr.s_addr = c->gw4; + + memcpy(&conn->a.a4.a, &sa4->sin_addr, sizeof(conn->a.a4.a)); + + conn->sock_port = ntohs(sa4->sin_port); + conn->tap_port = ref.tcp.index; + + conn->seq_to_tap = tcp_seq_init(c, AF_INET, &sa4->sin_addr, + conn->sock_port, + conn->tap_port, + now); + + tcp_hash_insert(c, conn, AF_INET, &sa4->sin_addr); + } + + conn->seq_ack_from_tap = conn->seq_to_tap + 1; + + conn->tap_window = WINDOW_DEFAULT; + conn->ws_allowed = 1; + + conn->ts_sock = conn->ts_tap = conn->ts_ack_tap = *now; + + bitmap_set(tcp_act, conn - tt); + + ev.events = EPOLLRDHUP; + ev.data.u64 = ref_conn.u64; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->sock, &ev); + + tcp_tap_state(conn, SOCK_SYN_SENT); + tcp_send_to_tap(c, conn, SYN, NULL, 0); +} + +/** + * tcp_sock_handler_splice() - Handler for socket mapped to spliced connection + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap + */ +void tcp_sock_handler_splice(struct ctx *c, 
union epoll_ref ref, + uint32_t events) +{ + int move_from, move_to, *pipes; + struct tcp_splice_conn *conn; + + if (ref.tcp.listen) { + int s; + + if (c->tcp.splice_conn_count >= MAX_SPLICE_CONNS) + return; + + if ((s = accept4(ref.s, NULL, NULL, SOCK_NONBLOCK)) < 0) + return; + + conn = &ts[c->tcp.splice_conn_count++]; + conn->from = s; + tcp_splice_state(conn, SPLICE_ACCEPTED); + + if (tcp_splice_new(c, conn, ref.tcp.v6, ref.tcp.index)) + tcp_splice_destroy(c, conn); + + return; + } + + conn = &ts[ref.tcp.index]; + + if (events & EPOLLRDHUP || events & EPOLLHUP || events & EPOLLERR) { + tcp_splice_destroy(c, conn); + return; + } + + if (events & EPOLLOUT) { + struct epoll_event ev = { + .events = EPOLLIN | EPOLLET | EPOLLRDHUP, + .data.u64 = ref.u64, + }; + + if (conn->state == SPLICE_CONNECT) { + tcp_splice_connect_finish(c, conn, ref.tcp.v6); + return; + } + + epoll_ctl(c->epollfd, EPOLL_CTL_MOD, ref.s, &ev); + + move_to = ref.s; + if (ref.s == conn->to) { + move_from = conn->from; + pipes = conn->pipe_from_to; + } else { + move_from = conn->to; + pipes = conn->pipe_to_from; + } + } else { + move_from = ref.s; + if (ref.s == conn->from) { + move_to = conn->to; + pipes = conn->pipe_from_to; + } else { + move_to = conn->from; + pipes = conn->pipe_to_from; + } + } + +swap: + while (1) { + int retry_write = 1, no_read = 1; + ssize_t ret, nr = 0, nw; + +retry: + ret = splice(move_from, NULL, pipes[1], NULL, PIPE_SIZE, + SPLICE_F_MOVE); + if (ret < 0) { + if (errno == EAGAIN) { + nr = PIPE_SIZE; + } else { + tcp_splice_destroy(c, conn); + return; + } + } else if (!ret && no_read) { + break; + } else if (ret) { + no_read = 0; + nr += ret; + } + + nw = splice(pipes[0], NULL, move_to, NULL, nr, SPLICE_F_MOVE); + if (nw < 0) { + if (errno == EAGAIN) { + struct epoll_event ev = { + .events = EPOLLIN | EPOLLOUT | EPOLLET | + EPOLLRDHUP + }; + + if (no_read) + break; + + if (retry_write--) + goto retry; + + ref.s = move_to; + ev.data.u64 = ref.u64, + epoll_ctl(c->epollfd, 
EPOLL_CTL_MOD, move_to, + &ev); + break; + } + tcp_splice_destroy(c, conn); + return; + } + } + + if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) { + events = EPOLLIN; + + SWAP(move_from, move_to); + if (pipes == conn->pipe_from_to) + pipes = conn->pipe_to_from; + else + pipes = conn->pipe_from_to; + + goto swap; + } } /** * tcp_sock_handler() - Handle new data from socket * @c: Execution context - * @s: File descriptor number for socket + * @ref: epoll reference * @events: epoll events bitmap - * @pkt_buf: Buffer to receive packets, currently unused * @now: Current timestamp */ -void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, +void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now) { - int accept = -1; - socklen_t sl; + struct tcp_tap_conn *conn; - (void)pkt_buf; - - sl = sizeof(accept); - - if (tc[s].s == LAST_ACK) { - tcp_send_to_tap(c, s, ACK, NULL, 0); - tcp_close_and_epoll_del(c, s); + if (ref.tcp.splice) { + tcp_sock_handler_splice(c, ref, events); return; } - if (tc[s].s == SOCK_SYN_SENT) { + if (ref.tcp.listen) { + tcp_conn_from_sock(c, ref, now); + return; + } + + conn = &tt[ref.tcp.index]; + + if (conn->state == LAST_ACK) { + tcp_send_to_tap(c, conn, ACK, NULL, 0); + tcp_tap_destroy(c, conn); + return; + } + + if (conn->state == SOCK_SYN_SENT) { /* This can only be a socket error or a shutdown from remote */ - tcp_rst(c, s); - return; - } - if (IN_INTERVAL(c->tcp.fd_listen_min, c->tcp.fd_listen_max, s) && - !IN_INTERVAL(c->tcp.fd_conn_min, c->tcp.fd_conn_max, s)) - accept = 1; - else if (IN_INTERVAL(c->tcp.fd_conn_min, c->tcp.fd_conn_max, s) && - !IN_INTERVAL(c->tcp.fd_listen_min, c->tcp.fd_listen_max, s)) - accept = 0; - else if (getsockopt(s, SOL_SOCKET, SO_ACCEPTCONN, &accept, &sl)) - accept = -1; - - if ((events & EPOLLERR) || accept == -1) { - if (tc[s].s != CLOSED) - tcp_rst(c, s); + tcp_rst(c, conn); return; } - if (accept) { - tcp_conn_from_sock(c, s, now); + if 
(events & EPOLLERR) { + if (conn->state != CLOSED) + tcp_rst(c, conn); return; } if (events & EPOLLOUT) { /* Implies TAP_SYN_SENT */ - tcp_connect_finish(c, s); + tcp_connect_finish(c, conn, ref); return; } - if (tc[s].s == ESTABLISHED) - tcp_data_from_sock(c, s, now); + if (conn->state == ESTABLISHED) + tcp_data_from_sock(c, conn, now); - if (events & EPOLLRDHUP || events & EPOLLHUP) { - if (tc[s].s == ESTABLISHED) { - tcp_set_state(s, ESTABLISHED_SOCK_FIN); - shutdown(s, SHUT_RD); - tcp_data_from_sock(c, s, now); - tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); - } else if (tc[s].s == FIN_WAIT_1) { - tcp_set_state(s, FIN_WAIT_1_SOCK_FIN); - shutdown(s, SHUT_RD); - tcp_data_from_sock(c, s, now); - tcp_send_to_tap(c, s, FIN | ACK, NULL, 0); - tcp_sock_consume(s, tc[s].seq_ack_from_tap); + if (events & (EPOLLRDHUP | EPOLLHUP)) { + if (conn->state == ESTABLISHED) { + tcp_tap_state(conn, ESTABLISHED_SOCK_FIN); + shutdown(conn->sock, SHUT_RD); + tcp_data_from_sock(c, conn, now); + tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0); + } else if (conn->state == FIN_WAIT_1) { + tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN); + shutdown(conn->sock, SHUT_RD); + tcp_data_from_sock(c, conn, now); + tcp_send_to_tap(c, conn, FIN | ACK, NULL, 0); + tcp_sock_consume(conn, conn->seq_ack_from_tap); } else { - tcp_close_and_epoll_del(c, s); + tcp_tap_destroy(c, conn); } } } +/** + * tcp_sock_init_ns() - Bind sockets in namespace for inbound connections + * @arg: Execution context + * + * Return: 0 on success, -1 on failure + */ +static int tcp_sock_init_ns(void *arg) +{ + union tcp_epoll_ref tref = { .listen = 1, .splice = 1 }; + struct ctx *c = (struct ctx *)arg; + in_port_t port; + + ns_enter(c->pasta_pid); + + for (port = 0; !PORT_IS_EPHEMERAL(port); port++) { + if (!bitmap_isset(c->tcp.port_to_init, port)) + continue; + + tref.index = port; + + if (c->v4) { + tref.v6 = 0; + sock_l4(c, AF_INET, IPPROTO_TCP, port, 1, tref.u32); + } + + if (c->v6) { + tref.v6 = 1; + sock_l4(c, AF_INET6, 
IPPROTO_TCP, port, 1, tref.u32); + } + } + + return 0; +} + /** * tcp_sock_init() - Bind sockets for inbound connections, get key for sequence * @c: Execution context @@ -1502,28 +2105,40 @@ void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, */ int tcp_sock_init(struct ctx *c) { + union tcp_epoll_ref tref = { .listen = 1 }; + char ns_fn_stack[NS_FN_STACK_SIZE]; in_port_t port; - int s = 0; - c->tcp.fd_min = c->tcp.fd_listen_min = c->tcp.fd_conn_min = INT_MAX; - c->tcp.fd_max = c->tcp.fd_listen_max = c->tcp.fd_conn_max = 0; - CHECK_SET_MIN_MAX(c->tcp.fd_listen_, s); + getrandom(&c->tcp.hash_secret, sizeof(c->tcp.hash_secret), GRND_RANDOM); for (port = 0; !PORT_IS_EPHEMERAL(port); port++) { + if (bitmap_isset(c->tcp.port_to_ns, port)) + tref.splice = 1; + else if (bitmap_isset(c->tcp.port_to_tap, port)) + tref.splice = 0; + else + continue; + + tref.index = port; + if (c->v4) { - if ((s = sock_l4(c, AF_INET, IPPROTO_TCP, port)) < 0) - return -1; - CHECK_SET_MIN_MAX(c->tcp.fd_listen_, s); + tref.v6 = 0; + sock_l4(c, AF_INET, IPPROTO_TCP, port, tref.splice, + tref.u32); } if (c->v6) { - if ((s = sock_l4(c, AF_INET6, IPPROTO_TCP, port)) < 0) - return -1; - CHECK_SET_MIN_MAX(c->tcp.fd_listen_, s); + tref.v6 = 1; + sock_l4(c, AF_INET6, IPPROTO_TCP, port, tref.splice, + tref.u32); } } - getrandom(&c->tcp.hash_secret, sizeof(c->tcp.hash_secret), GRND_RANDOM); + if (c->mode == MODE_PASTA) { + clone(tcp_sock_init_ns, ns_fn_stack + sizeof(ns_fn_stack) / 2, + CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, + (void *)c); + } return 0; } @@ -1531,69 +2146,79 @@ int tcp_sock_init(struct ctx *c) /** * tcp_timer_one() - Handler for timed events on one socket * @c: Execution context - * @s: File descriptor number for socket + * @conn: Connection pointer * @ts: Timestamp from caller */ -static void tcp_timer_one(struct ctx *c, int s, struct timespec *ts) +static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn, + struct timespec *ts) { - int 
ack_tap_ms = timespec_diff_ms(ts, &tc[s].ts_ack_tap); - int sock_ms = timespec_diff_ms(ts, &tc[s].ts_tap); - int tap_ms = timespec_diff_ms(ts, &tc[s].ts_tap); + int ack_tap_ms = timespec_diff_ms(ts, &conn->ts_ack_tap); + int sock_ms = timespec_diff_ms(ts, &conn->ts_tap); + int tap_ms = timespec_diff_ms(ts, &conn->ts_tap); - switch (tc[s].s) { + switch (conn->state) { case SOCK_SYN_SENT: case TAP_SYN_RCVD: if (ack_tap_ms > SYN_TIMEOUT) - tcp_rst(c, s); + tcp_rst(c, conn); break; case ESTABLISHED_SOCK_FIN: if (ack_tap_ms > FIN_TIMEOUT) { - tcp_rst(c, s); + tcp_rst(c, conn); break; } /* Falls through */ case ESTABLISHED: - if (tap_ms > ACT_TIMEOUT && sock_ms > ACT_TIMEOUT) - tcp_rst(c, s); + if (tap_ms > ACT_TIMEOUT && sock_ms > ACT_TIMEOUT) { + tcp_rst(c, conn); + break; + } - if (tc[s].seq_to_tap == tc[s].seq_ack_from_tap && - tc[s].seq_from_tap == tc[s].seq_ack_to_tap) { - tc[s].ts_sock = *ts; + if (conn->seq_to_tap == conn->seq_ack_from_tap && + conn->seq_from_tap == conn->seq_ack_to_tap) { + conn->ts_sock = *ts; break; } if (sock_ms > ACK_INTERVAL) { - if (tc[s].seq_from_tap > tc[s].seq_ack_to_tap) - tcp_send_to_tap(c, s, 0, NULL, 0); + if (conn->seq_from_tap > conn->seq_ack_to_tap) + tcp_send_to_tap(c, conn, ACK, NULL, 0); } if (ack_tap_ms > ACK_TIMEOUT) { - if (tc[s].seq_ack_from_tap < tc[s].seq_to_tap) { - tc[s].seq_to_tap = tc[s].seq_ack_from_tap; - tc[s].ts_ack_tap = *ts; - tcp_data_from_sock(c, s, ts); + if (conn->seq_ack_from_tap < conn->seq_to_tap) { + if (ack_tap_ms > 10 * ACK_TIMEOUT) { + tcp_rst(c, conn); + break; + } + + conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_data_from_sock(c, conn, ts); } } - if (tc[s].seq_from_tap == tc[s].seq_ack_to_tap) - tc[s].ts_sock = *ts; + if (conn->seq_from_tap == conn->seq_ack_to_tap) + conn->ts_sock = *ts; break; case CLOSE_WAIT: case FIN_WAIT_1: if (sock_ms > FIN_TIMEOUT) - tcp_rst(c, s); + tcp_rst(c, conn); break; case FIN_WAIT_1_SOCK_FIN: if (ack_tap_ms > FIN_TIMEOUT) - tcp_rst(c, s); + tcp_rst(c, conn); 
break; case LAST_ACK: if (sock_ms > LAST_ACK_TIMEOUT) - tcp_rst(c, s); + tcp_rst(c, conn); break; case TAP_SYN_SENT: + case SPLICE_ACCEPTED: + case SPLICE_CONNECT: + case SPLICE_ESTABLISHED: case CLOSED: break; } @@ -1613,8 +2238,10 @@ void tcp_timer(struct ctx *c, struct timespec *ts) for (i = 0; i < sizeof(tcp_act) / sizeof(long); i++, word++) { tmp = *word; while ((n = ffsl(tmp))) { + int index = i * sizeof(long) * 8 + n - 1; + tmp &= ~(1UL << (n - 1)); - tcp_timer_one(c, i * sizeof(long) * 8 + n - 1, ts); + tcp_timer_one(c, &tt[index], ts); } } } diff --git a/tcp.h b/tcp.h index 7435c41..6a9aa4a 100644 --- a/tcp.h +++ b/tcp.h @@ -3,34 +3,53 @@ #define TCP_TIMER_INTERVAL 20 /* ms */ +#define TCP_MAX_CONNS (128 * 1024) +#define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2) + struct ctx; -void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, +void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now); int tcp_tap_handler(struct ctx *c, int af, void *addr, struct tap_msg *msg, int count, struct timespec *now); int tcp_sock_init(struct ctx *c); void tcp_timer(struct ctx *c, struct timespec *ts); +/** + * union tcp_epoll_ref - epoll reference portion for TCP connections + * @listen: Set if this file descriptor is a listening socket + * @splice: Set if descriptor is associated to a spliced connection + * @v6: Set for IPv6 sockets or connections + * @index: Index of connection in table, or port for bound sockets + * @u32: Opaque u32 value of reference + */ +union tcp_epoll_ref { + struct { + uint32_t listen:1, + splice:1, + v6:1, + index:20; + }; + uint32_t u32; +}; + /** * struct tcp_ctx - Execution context for TCP routines * @hash_secret: 128-bit secret for hash functions, ISN and hash table - * @fd_min: Lowest file descriptor number for TCP ever used - * @fd_max: Highest file descriptor number for TCP ever used - * @fd_listen_min: Lowest file descriptor number for listening sockets - * @fd_listen_max: 
Highest file descriptor number for listening sockets - * @fd_conn_min: Lowest file descriptor number for connected sockets - * @fd_conn_max: Highest file descriptor number for connected sockets + * @tap_conn_count: Count of tap connections in connection table + * @splice_conn_count: Count of spliced connections in connection table + * @port_to_tap: Ports bound host/init-side, packets to guest/tap + * @port_to_init: Ports bound namespace-side, spliced to init + * @port_to_ns: Ports bound init-side, spliced to namespace * @timer_run: Timestamp of most recent timer run */ struct tcp_ctx { uint64_t hash_secret[2]; - int fd_min; - int fd_max; - int fd_listen_min; - int fd_listen_max; - int fd_conn_min; - int fd_conn_max; + int tap_conn_count; + int splice_conn_count; + uint8_t port_to_tap [USHRT_MAX / 8]; + uint8_t port_to_init [USHRT_MAX / 8]; + uint8_t port_to_ns [USHRT_MAX / 8]; struct timespec timer_run; }; diff --git a/udp.c b/udp.c index 46a3302..d64d59b 100644 --- a/udp.c +++ b/udp.c @@ -1,12 +1,15 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * udp.c - UDP L2-L4 translation routines * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio - * */ /** @@ -17,23 +20,77 @@ * with two purposes: * - binding ephemeral ports when they're used as source port by the guest, so * that replies on those ports can be forwarded back to the guest, with a - * fixed 180s timeout for this binding + * fixed timeout for this binding * - packets received from the local host get their source changed to a local * address (gateway address) so that they can be forwarded to the guest, and * packets sent as replies by the guest need their destination address to * be changed back to the address of the local host. 
This is dynamic to allow * connections from the gateway as well, and uses the same fixed 180s timeout * - * Sockets for ephemeral and non-ephemeral ports are created and at - * initialisation time, one set for IPv4 and one for IPv6. Non-ephemeral ports - * are bound at initialisation time, ephemeral ports are bound dynamically. + * Sockets for bound ports are created at initialisation time, one set for IPv4 + * and one for IPv6. * * Packets are forwarded back and forth, by prepending and stripping UDP headers * in the obvious way, with no port translation. * + * In PASTA mode, the L2-L4 translation is skipped for connections to ports + * bound between namespaces using the loopback interface, messages are directly + * transferred between L4 sockets instead. These are called spliced connections + * for consistency with the TCP implementation, but the splice() syscall isn't + * actually used as it wouldn't make sense for datagram-based connections: a + * pair of recvmmsg() and sendmmsg() deals with this case. 
+ * + * The connection tracking for PASTA mode is slightly complicated by the absence + * of actual connections, see struct udp_splice_port, and these examples: + * + * - from init to namespace: + * + * - forward direction: 127.0.0.1:5000 -> 127.0.0.1:80 in init from bound + * socket s, with epoll reference: index = 80, splice = UDP_TO_NS + * - if udp_splice_map[V4][5000].ns_conn_sock: + * - send packet to udp_splice_map[V4][5000].ns_conn_sock + * - otherwise: + * - create new socket udp_splice_map[V4][5000].ns_conn_sock + * - connect in namespace to 127.0.0.1:80 + * - get source port of new connected socket (10000) with getsockname() + * - add to epoll with reference: index = 10000, splice = UDP_BACK_TO_INIT + * - set udp_splice_map[V4][10000].init_bound_sock to s + * - set udp_splice_map[V4][10000].init_dst_port to 5000 + * - update udp_splice_map[V4][5000].ns_conn_ts with current time + * + * - reverse direction: 127.0.0.1:80 -> 127.0.0.1:10000 in namespace from + * connected socket s, having epoll reference: index = 10000, + * splice = UDP_BACK_TO_INIT + * - if udp_splice_map[V4][10000].init_bound_sock: + * - send to udp_splice_map[V4][10000].init_bound_sock, with destination + * port udp_splice_map[V4][10000].init_dst_port (5000) + * - otherwise, discard + * + * - from namespace to init: + * + * - forward direction: 127.0.0.1:2000 -> 127.0.0.1:22 in namespace from bound + * socket s, with epoll reference: index = 22, splice = UDP_TO_INIT + * - if udp_splice_map[V4][2000].init_conn_sock: + * - send packet to udp_splice_map[V4][2000].init_conn_sock + * - otherwise: + * - create new socket udp_splice_map[V4][2000].init_conn_sock + * - connect in init to 127.0.0.1:22 + * - get source port of new connected socket (4000) with getsockname() + * - add to epoll with reference: index = 4000, splice = UDP_BACK_TO_NS + * - set udp_splice_map[V4][4000].ns_bound_sock to s + * - set udp_splice_map[V4][4000].ns_dst_port to 2000 + * - update udp_splice_map[V4][2000].init_conn_ts
with current time + * + * - reverse direction: 127.0.0.1:22 -> 127.0.0.1:4000 in init from connected + * socket s, having epoll reference: index = 4000, splice = UDP_BACK_TO_NS + * - if udp_splice_map[V4][4000].ns_bound_sock: + * - send to udp_splice_map[V4][4000].ns_bound_sock, with destination port + * udp_splice_map[V4][4000].ns_dst_port (2000) + * - otherwise, discard */ #define _GNU_SOURCE +#include #include #include #include @@ -53,252 +110,373 @@ #include #include +#include "util.h" #include "passt.h" #include "tap.h" -#include "util.h" #define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ +#define UDP_SPLICE_FRAMES 128 -struct udp_port { - int s; - time_t ts_ephemeral; +/** + * struct udp_tap_port - Port tracking based on tap-facing source port + * @sock: Socket bound to source port used as index + * @ts: Activity timestamp from tap, used for socket aging + * @ts_local: Timestamp of tap packet to gateway address, aging for local bind + */ +struct udp_tap_port { + int sock; + time_t ts; time_t ts_local; }; -static struct udp_port up4[USHRT_MAX]; -static struct udp_port up6[USHRT_MAX]; - -/* Bitmaps, activity monitoring needed for port */ -static uint8_t udp4_act[USHRT_MAX / 8]; -static uint8_t udp6_act[USHRT_MAX / 8]; - /** - * udp_act_set() - Set port in bitmap for timed events - * @af: Protocol family - * @s: Port number + * struct udp_splice_port - Source port tracking for traffic between namespaces + * @ns_conn_sock: Socket connected in namespace for init source port + * @init_conn_sock: Socket connected in init for namespace source port + * @ns_conn_ts: Timestamp of activity for socket connected in namespace + * @init_conn_ts: Timestamp of activity for socket connected in init + * @ns_dst_port: Destination port in namespace for init source port + * @init_dst_port: Destination port in init for namespace source port + * @ns_bound_sock: Bound socket in namespace for this source port in init + * @init_bound_sock: Bound socket in init for
this source port in namespace */ -static void udp_act_set(int af, int p) -{ - if (af == AF_INET) - udp4_act[p / 8] |= 1 << (p % 8); - else - udp6_act[p / 8] |= 1 << (p % 8); -} +struct udp_splice_port { + int ns_conn_sock; + int init_conn_sock; + + time_t ns_conn_ts; + time_t init_conn_ts; + + in_port_t ns_dst_port; + in_port_t init_dst_port; + + int ns_bound_sock; + int init_bound_sock; +}; + +/* Port tracking, arrays indexed by packet source port (host order) */ +static struct udp_tap_port udp_tap_map [IP_VERSIONS][USHRT_MAX]; +static struct udp_splice_port udp_splice_map [IP_VERSIONS][USHRT_MAX]; + +enum udp_act_type { + UDP_ACT_TAP, + UDP_ACT_NS_CONN, + UDP_ACT_INIT_CONN, + UDP_ACT_TYPE_MAX, +}; + +/* Activity-based aging for bindings */ +static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][USHRT_MAX / 8]; + +/* recvmmsg()/sendmmsg() data */ +static struct sockaddr_storage udp_splice_namebuf; +static uint8_t udp_splice_buf[UDP_SPLICE_FRAMES][USHRT_MAX]; + +static struct iovec udp_splice_iov_recv [UDP_SPLICE_FRAMES]; +static struct mmsghdr udp_splice_mmh_recv [UDP_SPLICE_FRAMES]; + +static struct iovec udp_splice_iov_send [UDP_SPLICE_FRAMES]; +static struct mmsghdr udp_splice_mmh_send [UDP_SPLICE_FRAMES]; + +static struct iovec udp_splice_iov_sendto [UDP_SPLICE_FRAMES]; +static struct mmsghdr udp_splice_mmh_sendto [UDP_SPLICE_FRAMES]; /** - * udp_act_clear() - Clear port from bitmap for timed events - * @af: Protocol family - * @s: Port number - */ -static void udp_act_clear(int af, int p) -{ - if (af == AF_INET) - udp4_act[p / 8] &= ~(1 << (p % 8)); - else - udp6_act[p / 8] &= ~(1 << (p % 8)); -} - -/** - * udp_sock_handler_local() - Replace address if local, update timestamp + * udp_splice_connect() - Create and connect socket for "spliced" binding * @c: Execution context - * @sa: Socket address as struct sockaddr_in or sockaddr_in6 + * @v6: Set for IPv6 connections + * @bound_sock: Originating bound socket + * @src: Source port of original connection, host 
order + * @dst: Destination port of original connection, host order + * @splice: UDP_BACK_TO_INIT from init, UDP_BACK_TO_NS from namespace + * + * Return: connected socket, negative error code on failure + */ +int udp_splice_connect(struct ctx *c, int v6, int bound_sock, + in_port_t src, in_port_t dst, int splice) +{ + struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP }; + union epoll_ref ref = { .proto = IPPROTO_UDP, + .udp = { .splice = splice, .v6 = v6 } + }; + struct sockaddr_storage sa; + struct udp_splice_port *sp; + socklen_t sl = sizeof(sa); + int s; + + s = socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, + IPPROTO_UDP); + if (s < 0) + return s; + ref.s = s; + + if (v6) { + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = htons(dst), + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; + if (connect(s, (struct sockaddr *)&addr6, sizeof(addr6))) + goto fail; + } else { + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = htons(dst), + .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) }, + }; + if (connect(s, (struct sockaddr *)&addr4, sizeof(addr4))) + goto fail; + } + + if (getsockname(s, (struct sockaddr *)&sa, &sl)) + goto fail; + + if (v6) + ref.udp.port = ntohs(((struct sockaddr_in6 *)&sa)->sin6_port); + else + ref.udp.port = ntohs(((struct sockaddr_in *)&sa)->sin_port); + + sp = &udp_splice_map[v6 ? V6 : V4][ref.udp.port]; + if (splice == UDP_BACK_TO_INIT) { + sp->init_bound_sock = bound_sock; + sp->init_dst_port = src; + udp_splice_map[v6 ? V6 : V4][src].ns_conn_sock = s; + bitmap_set(udp_act[v6 ? V6 : V4][UDP_ACT_NS_CONN], src); + } else if (splice == UDP_BACK_TO_NS) { + sp->ns_bound_sock = bound_sock; + sp->ns_dst_port = src; + udp_splice_map[v6 ? V6 : V4][src].init_conn_sock = s; + bitmap_set(udp_act[v6 ? 
V6 : V4][UDP_ACT_INIT_CONN], src); + } + + ev.data.u64 = ref.u64; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); + return s; + +fail: + close(s); + return -1; +} + +/** + * struct udp_splice_connect_ns_arg - Arguments for udp_splice_connect_ns() + * @c: Execution context + * @v6: Set for inbound IPv6 connection + * @bound_sock: Originating bound socket + * @src: Source port of original connection, host order + * @dst: Destination port of original connection, host order + * @s: Newly created socket or negative error code + */ +struct udp_splice_connect_ns_arg { + struct ctx *c; + int v6; + int bound_sock; + in_port_t src; + in_port_t dst; + int s; +}; + +/** + * udp_splice_connect_ns() - Enter namespace and call udp_splice_connect() + * @arg: See struct udp_splice_connect_ns_arg + * + * Return: 0 + */ +static int udp_splice_connect_ns(void *arg) +{ + struct udp_splice_connect_ns_arg *a; + + a = (struct udp_splice_connect_ns_arg *)arg; + + ns_enter(a->c->pasta_pid); + a->s = udp_splice_connect(a->c, a->v6, a->bound_sock, a->src, a->dst, + UDP_BACK_TO_INIT); + + return 0; +} + +/** + * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap * @now: Current timestamp */ -static void udp_sock_handler_local(struct ctx *c, int af, void *sa, - struct timespec *now) +static void udp_sock_handler_splice(struct ctx *c, union epoll_ref ref, + uint32_t events, struct timespec *now) { - if (af == AF_INET) { - struct sockaddr_in *s_in = (struct sockaddr_in *)sa; + struct msghdr *mh = &udp_splice_mmh_recv[0].msg_hdr; + struct sockaddr_storage *sa_s = mh->msg_name; + in_port_t src, dst = ref.udp.port, send_dst; + char ns_fn_stack[NS_FN_STACK_SIZE]; + int s, v6 = ref.udp.v6, n, i; - s_in->sin_addr.s_addr = c->gw4; + if (!(events & EPOLLIN)) + return; - up4[ntohs(s_in->sin_port)].ts_local = now->tv_sec; - udp_act_set(AF_INET, ntohs(s_in->sin_port)); + n = recvmmsg(ref.s, 
udp_splice_mmh_recv, UDP_SPLICE_FRAMES, 0, NULL); + + if (n <= 0) + return; + + if (v6) { + struct sockaddr_in6 *sa = (struct sockaddr_in6 *)sa_s; + + src = htons(sa->sin6_port); } else { - struct sockaddr_in6 *s_in6 = (struct sockaddr_in6 *)sa; + struct sockaddr_in *sa = (struct sockaddr_in *)sa_s; - memcpy(&s_in6->sin6_addr, &c->gw6, sizeof(c->gw6)); - - up6[ntohs(s_in6->sin6_port)].ts_local = now->tv_sec; - udp_act_set(AF_INET6, ntohs(s_in6->sin6_port)); + src = ntohs(sa->sin_port); } -} -/** - * udp_sock_name() - Get address family and port for bound UDP socket - * @c: Execution context - * @s: File descriptor number for socket - * @port: Local port, set on return, network order - * - * Return: address family, AF_INET or AF_INET6, negative error code on failure - */ -static int udp_sock_name(struct ctx *c, int s, in_port_t *port) -{ - if (!c->udp.fd_in_seq) { - struct sockaddr_storage sa; - socklen_t sl; + switch (ref.udp.splice) { + case UDP_TO_NS: + if (!(s = udp_splice_map[v6][src].ns_conn_sock)) { + struct udp_splice_connect_ns_arg arg = { + c, v6, ref.s, src, dst, -1, + }; - sl = sizeof(sa); - if (getsockname(s, (struct sockaddr *)&sa, &sl)) - return -errno; + clone(udp_splice_connect_ns, + ns_fn_stack + sizeof(ns_fn_stack) / 2, + CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, + (void *)&arg); - if (sa.ss_family == AF_INET) { - *port = ((struct sockaddr_in *)&sa)->sin_port; - return AF_INET; + if ((s = arg.s) < 0) + return; + } + udp_splice_map[v6][src].ns_conn_ts = now->tv_sec; + break; + case UDP_BACK_TO_INIT: + if (!(s = udp_splice_map[v6][dst].init_bound_sock)) + return; + + send_dst = udp_splice_map[v6][dst].init_dst_port; + break; + case UDP_TO_INIT: + if (!(s = udp_splice_map[v6][src].init_conn_sock)) { + s = udp_splice_connect(c, v6, ref.s, src, dst, + UDP_BACK_TO_NS); + if (s < 0) + return; + } + udp_splice_map[v6][src].init_conn_ts = now->tv_sec; + break; + case UDP_BACK_TO_NS: + if (!(s = udp_splice_map[v6][dst].ns_bound_sock)) + return; + + 
send_dst = udp_splice_map[v6][dst].ns_dst_port; + break; + default: + return; + } + + if (ref.udp.splice == UDP_TO_NS || ref.udp.splice == UDP_TO_INIT) { + for (i = 0; i < n; i++) { + struct msghdr *mh = &udp_splice_mmh_send[i].msg_hdr; + + mh->msg_iov->iov_len = udp_splice_mmh_recv[i].msg_len; } - if (sa.ss_family == AF_INET6) { - *port = ((struct sockaddr_in6 *)&sa)->sin6_port; - return AF_INET6; - } - - return -ENOTSUP; + sendmmsg(s, udp_splice_mmh_send, n, MSG_NOSIGNAL); + return; } - if (c->v4 && c->v6) { - *port = htons((s - c->udp.fd_min) / 2); - return ((s - c->udp.fd_min) % 2) ? AF_INET6 : AF_INET; + for (i = 0; i < n; i++) { + struct msghdr *mh = &udp_splice_mmh_sendto[i].msg_hdr; + + mh->msg_iov->iov_len = udp_splice_mmh_recv[i].msg_len; } - *port = htons(s - c->udp.fd_min); - return c->v4 ? AF_INET : AF_INET6; + if (v6) { + *((struct sockaddr_in6 *)&udp_splice_namebuf) = + ((struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + .sin6_port = htons(send_dst), + }); + } else { + *((struct sockaddr_in *)&udp_splice_namebuf) = + ((struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) }, + .sin_port = htons(send_dst), + }); + } + + sendmmsg(s, udp_splice_mmh_sendto, n, MSG_NOSIGNAL); } /** * udp_sock_handler() - Handle new data from socket * @c: Execution context - * @s: File descriptor number for socket + * @ref: epoll reference * @events: epoll events bitmap - * @pkt_buf: Buffer to receive packets, currently unused * @now: Current timestamp */ -void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, +void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now) { - struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0xff, 0xff, - 0, 0, 0, 0 } }; struct sockaddr_storage sr; - socklen_t slen = sizeof(sr); + socklen_t sl = sizeof(sr); char buf[USHRT_MAX]; struct udphdr *uh; ssize_t n; - int af; - - 
(void)pkt_buf; if (events == EPOLLERR) return; - n = recvfrom(s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh), - MSG_DONTWAIT, (struct sockaddr *)&sr, &slen); + if (ref.udp.splice) { + udp_sock_handler_splice(c, ref, events, now); + return; + } + + uh = (struct udphdr *)buf; + + n = recvfrom(ref.s, buf + sizeof(*uh), sizeof(buf) - sizeof(*uh), 0, + (struct sockaddr *)&sr, &sl); if (n < 0) return; - uh = (struct udphdr *)buf; - af = udp_sock_name(c, s, &uh->dest); + uh->dest = htons(ref.udp.port); + uh->len = htons(n + sizeof(*uh)); - if (af == AF_INET) { - struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr; - - if (ntohl(sr4->sin_addr.s_addr) == INADDR_LOOPBACK || - ntohl(sr4->sin_addr.s_addr) == INADDR_ANY) - udp_sock_handler_local(c, AF_INET, sr4, now); - - memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr)); - uh->source = sr4->sin_port; - uh->len = htons(n + sizeof(*uh)); - - tap_ip_send(c, &a6, IPPROTO_UDP, buf, n + sizeof(*uh)); - } else if (af == AF_INET6) { + if (ref.udp.v6) { struct sockaddr_in6 *sr6 = (struct sockaddr_in6 *)&sr; - if (IN6_IS_ADDR_LOOPBACK(&sr6->sin6_addr)) - udp_sock_handler_local(c, AF_INET6, sr6, now); + if (IN6_IS_ADDR_LOOPBACK(&sr6->sin6_addr)) { + in_port_t src = htons(sr6->sin6_port); + + memcpy(&sr6->sin6_addr, &c->gw6, sizeof(c->gw6)); + udp_tap_map[V6][src].ts_local = now->tv_sec; + bitmap_set(udp_act[V6][UDP_ACT_TAP], src); + } uh->source = sr6->sin6_port; - uh->len = htons(n + sizeof(*uh)); tap_ip_send(c, &sr6->sin6_addr, IPPROTO_UDP, buf, n + sizeof(*uh)); - } -} - -/** - * udp_tap_handler_ephemeral() - Bind ephemeral source port, update timestamp - * @af: Address family, AF_INET or AF_INET6 - * @src: Source port, host order - * @now: Current timestamp - */ -static void udp_tap_handler_ephemeral(int af, in_port_t src, - struct timespec *now) -{ - struct sockaddr *addr = NULL; - struct sockaddr_in6 s_in6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(src), - .sin6_addr = IN6ADDR_ANY_INIT, - }; - struct 
sockaddr_in s_in = { - .sin_family = AF_INET, - .sin_port = htons(src), - .sin_addr = { .s_addr = INADDR_ANY }, - }; - socklen_t sl; - int s; - - if (af == AF_INET) { - if (!up4[src].ts_ephemeral) { - s = up4[src].s; - addr = (struct sockaddr *)&s_in; - sl = sizeof(s_in); - } } else { - if (!up6[src].ts_ephemeral) { - s = up6[src].s; - addr = (struct sockaddr *)&s_in6; - sl = sizeof(s_in6); + struct in6_addr a6 = { .s6_addr = { 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0xff, 0xff, + 0, 0, 0, 0 } }; + struct sockaddr_in *sr4 = (struct sockaddr_in *)&sr; + + if (ntohl(sr4->sin_addr.s_addr) == INADDR_LOOPBACK || + ntohl(sr4->sin_addr.s_addr) == INADDR_ANY) { + in_port_t src = htons(sr4->sin_port); + + sr4->sin_addr.s_addr = c->gw4; + udp_tap_map[V4][src].ts_local = now->tv_sec; + bitmap_set(udp_act[V4][UDP_ACT_TAP], src); } - } - if (addr) { - if (bind(s, addr, sl)) - return; + memcpy(&a6.s6_addr[12], &sr4->sin_addr, sizeof(sr4->sin_addr)); - udp_act_set(af, src); - } + uh->source = sr4->sin_port; - if (af == AF_INET) - up4[src].ts_ephemeral = now->tv_sec; - else - up6[src].ts_ephemeral = now->tv_sec; -} - -/** - * udp_tap_handler_local() - Set address to local if needed, update timestamp - * @af: Address family, AF_INET or AF_INET6 - * @dst: Destination port, host order - * @sa: Socket address as struct sockaddr_in or sockaddr_in6 to modify - * @now: Current timestamp - */ -static void udp_tap_handler_local(int af, in_port_t dst, void *sa, - struct timespec *now) -{ - if (af == AF_INET) { - if (up4[dst].ts_local) { - struct sockaddr_in *s_in = (struct sockaddr_in *)sa; - - s_in->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - up4[dst].ts_local = now->tv_sec; - } - } else { - if (up6[dst].ts_local) { - struct sockaddr_in6 *s_in6 = (struct sockaddr_in6 *)sa; - - s_in6->sin6_addr = in6addr_loopback; - up6[dst].ts_local = now->tv_sec; - } + tap_ip_send(c, &a6, IPPROTO_UDP, buf, n + sizeof(*uh)); } } @@ -306,6 +484,7 @@ static void udp_tap_handler_local(int af, in_port_t dst, void 
*sa, * udp_tap_handler() - Handle packets from tap * @c: Execution context * @af: Address family, AF_INET or AF_INET6 + * @addr: Destination address * @msg: Input messages * @count: Message count * @now: Current timestamp @@ -345,7 +524,24 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, sa = (struct sockaddr *)&s_in; sl = sizeof(s_in); - } else if (af == AF_INET6) { + + if (!(s = udp_tap_map[V4][src].sock)) { + union udp_epoll_ref uref = { .bound = 1, .port = src }; + + s = sock_l4(c, AF_INET, IPPROTO_UDP, src, 0, uref.u32); + if (s <= 0) + return count; + + udp_tap_map[V4][src].sock = s; + bitmap_set(udp_act[V4][UDP_ACT_TAP], src); + } + + udp_tap_map[V4][src].ts = now->tv_sec; + + if (s_in.sin_addr.s_addr == c->gw4 && + udp_tap_map[V4][dst].ts_local) + s_in.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + } else { s_in6 = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_port = uh->dest, @@ -354,8 +550,25 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, sa = (struct sockaddr *)&s_in6; sl = sizeof(s_in6); - } else { - return count; + + if (!(s = udp_tap_map[V6][src].sock)) { + union udp_epoll_ref uref = { .bound = 1, .v6 = 1, + .port = src + }; + + s = sock_l4(c, AF_INET6, IPPROTO_UDP, src, 0, uref.u32); + if (s <= 0) + return count; + + udp_tap_map[V6][src].sock = s; + bitmap_set(udp_act[V6][UDP_ACT_TAP], src); + } + + udp_tap_map[V6][src].ts = now->tv_sec; + + if (!memcmp(addr, &c->gw6, sizeof(c->gw6)) && + udp_tap_map[V6][dst].ts_local) + s_in6.sin6_addr = in6addr_loopback; } for (i = 0; i < count; i++) { @@ -369,30 +582,100 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, mm[i].msg_hdr.msg_iovlen = 1; } - if (af == AF_INET) { - if (!(s = up4[src].s)) - return count; - - if (s_in.sin_addr.s_addr == c->gw4) - udp_tap_handler_local(AF_INET, dst, &s_in, now); - } else { - if (!(s = up6[src].s)) - return count; - - if (!memcmp(addr, &c->gw6, sizeof(c->gw6))) - udp_tap_handler_local(AF_INET6, dst, &s_in6, now); - } - - if 
(PORT_IS_EPHEMERAL(src)) - udp_tap_handler_ephemeral(af, src, now); - - count = sendmmsg(s, mm, count, MSG_DONTWAIT | MSG_NOSIGNAL); + count = sendmmsg(s, mm, count, MSG_NOSIGNAL); if (count < 0) return 1; return count; } +/** + * udp_sock_init_ns() - Bind sockets in namespace for inbound connections + * @arg: Execution context + * + * Return: 0 + */ +int udp_sock_init_ns(void *arg) +{ + union udp_epoll_ref uref = { .bound = 1, .splice = UDP_TO_INIT }; + struct ctx *c = (struct ctx *)arg; + in_port_t port; + + ns_enter(c->pasta_pid); + + for (port = 0; port < USHRT_MAX; port++) { + if (!bitmap_isset(c->udp.port_to_init, port)) + continue; + + uref.port = port; + + if (c->v4) { + uref.v6 = 0; + sock_l4(c, AF_INET, IPPROTO_UDP, port, 1, uref.u32); + } + + if (c->v6) { + uref.v6 = 1; + sock_l4(c, AF_INET6, IPPROTO_UDP, port, 1, uref.u32); + } + } + + return 0; +} + +/** + * udp_splice_iov_init() - Set up buffers and descriptors for recvmmsg/sendmmsg + */ +static void udp_splice_iov_init(void) +{ + struct mmsghdr *h; + struct iovec *iov; + int i; + + for (i = 0, h = udp_splice_mmh_recv; i < UDP_SPLICE_FRAMES; i++, h++) { + struct msghdr *mh = &h->msg_hdr; + + if (!i) { + mh->msg_name = &udp_splice_namebuf; + mh->msg_namelen = sizeof(udp_splice_namebuf); + } + + mh->msg_iov = &udp_splice_iov_recv[i]; + mh->msg_iovlen = 1; + } + for (i = 0, iov = udp_splice_iov_recv; i < UDP_SPLICE_FRAMES; + i++, iov++) { + iov->iov_base = udp_splice_buf[i]; + iov->iov_len = sizeof(udp_splice_buf[i]); + } + + for (i = 0, h = udp_splice_mmh_send; i < UDP_SPLICE_FRAMES; i++, h++) { + struct msghdr *mh = &h->msg_hdr; + + mh->msg_iov = &udp_splice_iov_send[i]; + mh->msg_iovlen = 1; + } + for (i = 0, iov = udp_splice_iov_send; i < UDP_SPLICE_FRAMES; + i++, iov++) { + iov->iov_base = udp_splice_buf[i]; + } + + for (i = 0, h = udp_splice_mmh_sendto; i < UDP_SPLICE_FRAMES; + i++, h++) { + struct msghdr *mh = &h->msg_hdr; + + mh->msg_name = &udp_splice_namebuf; + mh->msg_namelen = 
sizeof(udp_splice_namebuf); + + mh->msg_iov = &udp_splice_iov_sendto[i]; + mh->msg_iovlen = 1; + } + for (i = 0, iov = udp_splice_iov_sendto; i < UDP_SPLICE_FRAMES; + i++, iov++) { + iov->iov_base = udp_splice_buf[i]; + } +} + /** * udp_sock_init() - Create and bind listening sockets for inbound packets * @c: Execution context @@ -401,111 +684,128 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, */ int udp_sock_init(struct ctx *c) { - int s, prev = -1; + union udp_epoll_ref uref = { .bound = 1 }; + char ns_fn_stack[NS_FN_STACK_SIZE]; in_port_t port; - - c->udp.fd_min = INT_MAX; - c->udp.fd_max = 0; - c->udp.fd_in_seq = 1; + int s; for (port = 0; port < USHRT_MAX; port++) { + if (bitmap_isset(c->udp.port_to_ns, port)) + uref.splice = UDP_TO_NS; + else if (bitmap_isset(c->udp.port_to_tap, port)) + uref.splice = 0; + else + continue; + + uref.port = port; + if (c->v4) { - if ((s = sock_l4(c, AF_INET, IPPROTO_UDP, port)) < 0) - return -1; + uref.v6 = 0; + s = sock_l4(c, AF_INET, IPPROTO_UDP, port, + uref.splice == UDP_TO_NS, uref.u32); - if (c->udp.fd_in_seq && prev != -1 && s != prev + 1) - c->udp.fd_in_seq = 0; - else - prev = s; - - up4[port].s = s; + if (!uref.splice && s > 0) + udp_tap_map[V4][port].sock = s; } if (c->v6) { - if ((s = sock_l4(c, AF_INET6, IPPROTO_UDP, port)) < 0) - return -1; + uref.v6 = 1; + s = sock_l4(c, AF_INET6, IPPROTO_UDP, port, + uref.splice == UDP_TO_NS, uref.u32); - if (c->udp.fd_in_seq && prev != -1 && s != prev + 1) - c->udp.fd_in_seq = 0; - else - prev = s; - - up6[port].s = s; + if (!uref.splice && s > 0) + udp_tap_map[V6][port].sock = s; } } + if (c->mode == MODE_PASTA) { + udp_splice_iov_init(); + clone(udp_sock_init_ns, ns_fn_stack + sizeof(ns_fn_stack) / 2, + CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, + (void *)c); + } + return 0; } /** * udp_timer_one() - Handler for timed events on one port - * @af: Address family, AF_INET or AF_INET6 - * @p: Port number, host order + * @c: Execution context + * @v6: Set for IPv6 
connections + * @type: Socket type + * @port: Port number, host order * @ts: Timestamp from caller */ -static void udp_timer_one(struct ctx *c, int af, in_port_t p, - struct timespec *ts) +static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, + in_port_t port, struct timespec *ts) { + struct udp_splice_port *sp; + struct udp_tap_port *tp; int s = -1; - if (af == AF_INET) { - if (ts->tv_sec - up4[p].ts_ephemeral > UDP_CONN_TIMEOUT) - up4[p].ts_ephemeral = 0; - if (ts->tv_sec - up4[p].ts_local > UDP_CONN_TIMEOUT) - up4[p].ts_local = 0; + switch (type) { + case UDP_ACT_TAP: + tp = &udp_tap_map[v6 ? V6 : V4][port]; - if (!up4[p].ts_ephemeral && !up4[p].ts_local) { - udp_act_clear(AF_INET, p); - s = up4[p].s; - } - } else { - if (ts->tv_sec - up6[p].ts_ephemeral > UDP_CONN_TIMEOUT) - up6[p].ts_ephemeral = 0; - if (ts->tv_sec - up6[p].ts_local > UDP_CONN_TIMEOUT) - up6[p].ts_local = 0; + if (ts->tv_sec - tp->ts > UDP_CONN_TIMEOUT) + s = tp->sock; - if (!up6[p].ts_ephemeral && !up6[p].ts_local) { - udp_act_clear(AF_INET6, p); - s = up6[p].s; - } + if (ts->tv_sec - tp->ts_local > UDP_CONN_TIMEOUT) + tp->ts_local = 0; + + break; + case UDP_ACT_INIT_CONN: + sp = &udp_splice_map[v6 ? V6 : V4][port]; + + if (ts->tv_sec - sp->init_conn_ts > UDP_CONN_TIMEOUT) + s = sp->init_conn_sock; + + break; + case UDP_ACT_NS_CONN: + sp = &udp_splice_map[v6 ? V6 : V4][port]; + + if (ts->tv_sec - sp->ns_conn_ts > UDP_CONN_TIMEOUT) + s = sp->ns_conn_sock; + + break; + default: + return; } if (s != -1) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); close(s); - if (sock_l4(c, af, IPPROTO_UDP, p) != s) - c->udp.fd_in_seq = 0; + bitmap_clear(udp_act[v6 ? 
V6 : V4][type], port); } } /** - * udp_timer() - Scan activity bitmap for ports with associated timed events + * udp_timer() - Scan activity bitmaps for ports with associated timed events * @c: Execution context * @ts: Timestamp from caller */ void udp_timer(struct ctx *c, struct timespec *ts) { - long *word, tmp; + int n, t, v6 = 0; unsigned int i; - int n; + long *word, tmp; - word = (long *)udp4_act; - for (i = 0; i < sizeof(udp4_act) / sizeof(long); i++, word++) { - tmp = *word; - while ((n = ffsl(tmp))) { - tmp &= ~(1UL << (n - 1)); - udp_timer_one(c, AF_INET, - i * sizeof(long) * 8 + n - 1, ts); +v6: + for (t = 0; t < UDP_ACT_TYPE_MAX; t++) { + word = (long *)udp_act[v6 ? V6 : V4][t]; + for (i = 0; i < sizeof(udp_act[0][0]) / sizeof(long); + i++, word++) { + tmp = *word; + while ((n = ffsl(tmp))) { + tmp &= ~(1UL << (n - 1)); + udp_timer_one(c, v6, t, + i * sizeof(long) * 8 + n - 1, ts); + } } } - word = (long *)udp6_act; - for (i = 0; i < sizeof(udp6_act) / sizeof(long); i++, word++) { - tmp = *word; - while ((n = ffsl(tmp))) { - tmp &= ~(1UL << (n - 1)); - udp_timer_one(c, AF_INET6, - i * sizeof(long) * 8 + n - 1, ts); - } + if (!v6) { + v6 = 1; + goto v6; } } diff --git a/udp.h b/udp.h index a126488..e3afa74 100644 --- a/udp.h +++ b/udp.h @@ -3,24 +3,48 @@ #define UDP_TIMER_INTERVAL 1000 /* ms */ -void udp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf, +void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now); int udp_tap_handler(struct ctx *c, int af, void *addr, struct tap_msg *msg, int count, struct timespec *now); int udp_sock_init(struct ctx *c); void udp_timer(struct ctx *c, struct timespec *ts); +/** + * union udp_epoll_ref - epoll reference portion for UDP connections + * @bound: Set if this file descriptor is a bound socket + * @splice: Set if descriptor is associated to "spliced" connection + * @v6: Set for IPv6 sockets or connections + * @port: Source port for connected sockets, bound 
port otherwise + * @u32: Opaque u32 value of reference + */ +union udp_epoll_ref { + struct { + uint32_t bound:1, + splice:3, +#define UDP_TO_NS 1 +#define UDP_TO_INIT 2 +#define UDP_BACK_TO_NS 3 +#define UDP_BACK_TO_INIT 4 + + v6:1, + port:16; + }; + uint32_t u32; +}; + + /** * struct udp_ctx - Execution context for UDP - * @fd_min: Lowest file descriptor number for UDP ever used - * @fd_max: Highest file descriptor number for UDP ever used - * @fd_in_seq: 1 if all socket numbers are in sequence, 0 otherwise + * @port_to_tap: Ports bound host/init-side, packets to guest/tap + * @port_to_init: Ports bound namespace-side, spliced to init + * @port_to_ns: Ports bound init-side, spliced to namespace * @timer_run: Timestamp of most recent timer run */ struct udp_ctx { - int fd_min; - int fd_max; - int fd_in_seq; + uint8_t port_to_tap [USHRT_MAX / 8]; + uint8_t port_to_init [USHRT_MAX / 8]; + uint8_t port_to_ns [USHRT_MAX / 8]; struct timespec timer_run; }; diff --git a/util.c b/util.c index 59a0cb2..1372eec 100644 --- a/util.c +++ b/util.c @@ -1,14 +1,19 @@ // SPDX-License-Identifier: AGPL-3.0-or-later /* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode * * util.c - Convenience helpers * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio - * */ +#define _GNU_SOURCE +#include #include #include #include @@ -20,13 +25,16 @@ #include #include #include +#include +#include +#include #include #include #include #include -#include "passt.h" #include "util.h" +#include "passt.h" #ifdef DEBUG #define logfn(name, level) \ @@ -183,73 +191,72 @@ char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto) * sock_l4() - Create and bind socket for given L4, add to epoll list * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @proto: Protocol number, host order + * @proto: Protocol number * @port: Port, host order + * @lo: Bind to loopback 
address only, if set + * @data: epoll reference portion for protocol handlers * * Return: newly created socket, -1 on error */ -int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port) +int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo, + uint32_t data) { + union epoll_ref ref = { .proto = proto, .data = data }; struct sockaddr_in addr4 = { .sin_family = AF_INET, .sin_port = htons(port), - .sin_addr = { .s_addr = INADDR_ANY }, }; struct sockaddr_in6 addr6 = { .sin6_family = AF_INET6, .sin6_port = htons(port), - .sin6_addr = IN6ADDR_ANY_INIT, }; - struct epoll_event ev = { 0 }; const struct sockaddr *sa; + struct epoll_event ev; int fd, sl, one = 1; if (proto != IPPROTO_TCP && proto != IPPROTO_UDP && proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6) return -1; /* Not implemented. */ - fd = socket(af, proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM, proto); + if (proto == IPPROTO_TCP) + fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto); + else + fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto); if (fd < 0) { perror("L4 socket"); return -1; } + ref.s = fd; if (af == AF_INET) { + if (lo) + addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + else + addr4.sin_addr.s_addr = htonl(INADDR_ANY); + sa = (const struct sockaddr *)&addr4; sl = sizeof(addr4); } else { + if (lo) + addr6.sin6_addr = in6addr_loopback; + else + addr6.sin6_addr = in6addr_any; + sa = (const struct sockaddr *)&addr6; sl = sizeof(addr6); setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)); } - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMP, icmp, fd); - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_ICMPV6, icmp, fd); - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_TCP, tcp, fd); - CHECK_SET_MIN_MAX_PROTO_FD(proto, IPPROTO_UDP, udp, fd); - - if (proto == IPPROTO_UDP && PORT_IS_EPHEMERAL(port)) - goto epoll_add; - - if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) - goto epoll_add; + setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); if (bind(fd, sa, sl) < 
0) { /* We'll fail to bind to low ports if we don't have enough * capabilities, and we'll fail to bind on already bound ports, - * this is fine. If this isn't the socket with the lowest number - * for a given protocol, leave it open, to avoid unnecessary - * holes in the numbering. + * this is fine. */ - if ((proto == IPPROTO_TCP && fd == c->tcp.fd_min) || - (proto == IPPROTO_UDP && fd == c->udp.fd_min) || - ((proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) && - fd == c->icmp.fd_min)) { - close(fd); - return 0; - } - return fd; + close(fd); + return 0; } if (proto == IPPROTO_TCP && listen(fd, 128) < 0) { @@ -258,9 +265,8 @@ int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port) return -1; } -epoll_add: ev.events = EPOLLIN; - ev.data.fd = fd; + ev.data.u64 = ref.u64; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) { perror("L4 epoll_ctl"); return -1; @@ -286,3 +292,97 @@ int timespec_diff_ms(struct timespec *a, struct timespec *b) return (a->tv_nsec - b->tv_nsec) / 1000000 + (a->tv_sec - b->tv_sec) * 1000; } + +/** + * bitmap_set() - Set single bit in bitmap + * @map: Pointer to bitmap + * @bit: Bit number to set + */ +void bitmap_set(uint8_t *map, int bit) +{ + map[bit / 8] |= 1 << (bit % 8); +} + +/** + * bitmap_clear() - Clear single bit in bitmap + * @map: Pointer to bitmap + * @bit: Bit number to clear + */ +void bitmap_clear(uint8_t *map, int bit) +{ + map[bit / 8] &= ~(1 << (bit % 8)); +} + +/** + * bitmap_isset() - Check for set bit in bitmap + * @map: Pointer to bitmap + * @bit: Bit number to check + * + * Return: non-zero if given bit is set, zero if it's not + */ +int bitmap_isset(uint8_t *map, int bit) +{ + return map[bit / 8] & (1 << bit % 8); +} + +/** + * procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs + * @name: Corresponding name of file under /proc/net/ + * @map: Bitmap where numbers of ports in listening state will be set + */ +void procfs_scan_listen(char *name, uint8_t *map) +{ + char line[200], 
path[PATH_MAX]; + unsigned long port; + unsigned int state; + FILE *fp; + + snprintf(path, PATH_MAX, "/proc/net/%s", name); + if (!(fp = fopen(path, "r"))) + return; + + fgets(line, sizeof(line), fp); + while (fgets(line, sizeof(line), fp)) { + if (sscanf(line, "%*u: %*x:%lx %*x:%*x %x", &port, &state) != 2) + continue; + + /* See enum in kernel's include/net/tcp_states.h */ + if ((strstr(name, "tcp") && state != 0x0a) || + (strstr(name, "udp") && state != 0x07)) + continue; + + bitmap_set(map, port); + } + + fclose(fp); +} + +/** + * ns_enter() - Enter user and network namespaces of process with given PID + * @target_pid: Process PID + * + * Return: 0 on success, -1 on failure + */ +int ns_enter(int target_pid) +{ + char ns[PATH_MAX]; + int fd; + + snprintf(ns, PATH_MAX, "/proc/%i/ns/user", target_pid); + if ((fd = open(ns, O_RDONLY)) < 0 || setns(fd, 0)) + goto fail; + close(fd); + + snprintf(ns, PATH_MAX, "/proc/%i/ns/net", target_pid); + if ((fd = open(ns, O_RDONLY)) < 0 || setns(fd, 0)) + goto fail; + close(fd); + + return 0; + +fail: + if (fd != -1) + close(fd); + + return -1; +} diff --git a/util.h b/util.h index 3e24c9a..c4d947a 100644 --- a/util.h +++ b/util.h @@ -29,24 +29,45 @@ void debug(const char *format, ...); #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) #endif +#define SWAP(a, b) \ + do { \ + typeof(a) __x = (a); (a) = (b); (b) = __x; \ + } while (0) \ + #define STRINGIFY(x) #x #define STR(x) STRINGIFY(x) +#define V4 0 +#define V6 1 +#define IP_VERSIONS 2 + #define ARRAY_SIZE(a) ((int)(sizeof(a) / sizeof((a)[0]))) #define IN_INTERVAL(a, b, x) ((x) >= (a) && (x) <= (b)) #define FD_PROTO(x, proto) \ (IN_INTERVAL(c->proto.fd_min, c->proto.fd_max, (x))) -#define PORT_IS_EPHEMERAL(port) ((port) >= (1 << 15) + (1 << 14)) /* RFC 6335 */ +#define PORT_EPHEMERAL_MIN ((1 << 15) + (1 << 14)) /* RFC 6335 */ +#define PORT_IS_EPHEMERAL(port) ((port) >= PORT_EPHEMERAL_MIN) + +#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 4) #include #include #include +#include + +struct ctx; uint16_t csum_fold(uint32_t sum); uint16_t csum_ip4(void *buf, size_t len); void csum_tcp4(struct iphdr *iph); char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto); -int sock_l4(struct ctx *c, int af, uint16_t proto, uint16_t port); +int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo, + uint32_t data); int timespec_diff_ms(struct timespec *a, struct timespec *b); +void bitmap_set(uint8_t *map, int bit); +void bitmap_clear(uint8_t *map, int bit); +int bitmap_isset(uint8_t *map, int bit); +void procfs_scan_listen(char *name, uint8_t *map); +int ns_enter(int target_pid);