1
0
mirror of https://passt.top/passt synced 2024-12-22 21:55:22 +00:00

Compare commits

..

No commits in common. "69303cafbef86ef070d67582169d455eb8da288c" and "0af928eaa020c1062fdc91598dfdc533966e2afe" have entirely different histories.

14 changed files with 189 additions and 156 deletions

12
icmp.c
View File

@ -154,21 +154,17 @@ void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref)
* icmp_tap_handler() - Handle packets from tap * icmp_tap_handler() - Handle packets from tap
* @c: Execution context * @c: Execution context
* @af: Address family, AF_INET or AF_INET6 * @af: Address family, AF_INET or AF_INET6
* @saddr: Source address * @addr: Destination address
* @daddr: Destination address
* @p: Packet pool, single packet with ICMP/ICMPv6 header * @p: Packet pool, single packet with ICMP/ICMPv6 header
* @now: Current timestamp * @now: Current timestamp
* *
* Return: count of consumed packets (always 1, even if malformed) * Return: count of consumed packets (always 1, even if malformed)
*/ */
int icmp_tap_handler(const struct ctx *c, int af, int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
const void *saddr, const void *daddr,
const struct pool *p, const struct timespec *now) const struct pool *p, const struct timespec *now)
{ {
size_t plen; size_t plen;
(void)saddr;
if (af == AF_INET) { if (af == AF_INET) {
struct sockaddr_in sa = { struct sockaddr_in sa = {
.sin_family = AF_INET, .sin_family = AF_INET,
@ -214,7 +210,7 @@ int icmp_tap_handler(const struct ctx *c, int af,
icmp_id_map[V4][id].ts = now->tv_sec; icmp_id_map[V4][id].ts = now->tv_sec;
bitmap_set(icmp_act[V4], id); bitmap_set(icmp_act[V4], id);
sa.sin_addr = *(struct in_addr *)daddr; sa.sin_addr = *(struct in_addr *)addr;
if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa)) < 0) { (struct sockaddr *)&sa, sizeof(sa)) < 0) {
debug("ICMP: failed to relay request to socket"); debug("ICMP: failed to relay request to socket");
@ -268,7 +264,7 @@ int icmp_tap_handler(const struct ctx *c, int af,
icmp_id_map[V6][id].ts = now->tv_sec; icmp_id_map[V6][id].ts = now->tv_sec;
bitmap_set(icmp_act[V6], id); bitmap_set(icmp_act[V6], id);
sa.sin6_addr = *(struct in6_addr *)daddr; sa.sin6_addr = *(struct in6_addr *)addr;
if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa)) < 1) { (struct sockaddr *)&sa, sizeof(sa)) < 1) {
debug("ICMPv6: failed to relay request to socket"); debug("ICMPv6: failed to relay request to socket");

3
icmp.h
View File

@ -12,8 +12,7 @@ struct ctx;
void icmp_sock_handler(const struct ctx *c, union epoll_ref ref); void icmp_sock_handler(const struct ctx *c, union epoll_ref ref);
void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref); void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref);
int icmp_tap_handler(const struct ctx *c, int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
int af, const void *saddr, const void *daddr,
const struct pool *p, const struct timespec *now); const struct pool *p, const struct timespec *now);
void icmp_timer(const struct ctx *c, const struct timespec *ts); void icmp_timer(const struct ctx *c, const struct timespec *ts);
void icmp_init(void); void icmp_init(void);

View File

@ -6,9 +6,6 @@
* IPv6 or IPv4 (encoded as IPv4-mapped IPv6 addresses) * IPv6 or IPv4 (encoded as IPv4-mapped IPv6 addresses)
*/ */
#ifndef INANY_H
#define INANY_H
/** union inany_addr - Represents either an IPv4 or IPv6 address /** union inany_addr - Represents either an IPv4 or IPv6 address
* @a6: Address as an IPv6 address, may be IPv4-mapped * @a6: Address as an IPv6 address, may be IPv4-mapped
* @v4mapped.zero: All zero-bits for an IPv4 address * @v4mapped.zero: All zero-bits for an IPv4 address
@ -93,5 +90,3 @@ static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
ASSERT(0); ASSERT(0);
} }
} }
#endif /* INANY_H */

10
passt.c
View File

@ -117,11 +117,13 @@ static void timer_init(struct ctx *c, const struct timespec *now)
* proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers * proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers
* @eth_d: Ethernet destination address, NULL if unchanged * @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged
* @ip_da: Pointer to IPv4 destination address, NULL if unchanged
*/ */
void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da)
{ {
tcp_update_l2_buf(eth_d, eth_s); tcp_update_l2_buf(eth_d, eth_s, ip_da);
udp_update_l2_buf(eth_d, eth_s); udp_update_l2_buf(eth_d, eth_s, ip_da);
} }
/** /**
@ -245,7 +247,7 @@ int main(int argc, char **argv)
if (!c.no_icmp) if (!c.no_icmp)
icmp_init(); icmp_init();
proto_update_l2_buf(c.mac_guest, c.mac); proto_update_l2_buf(c.mac_guest, c.mac, &c.ip4.addr);
if (c.ifi4 && !c.no_dhcp) if (c.ifi4 && !c.no_dhcp)
dhcp_init(); dhcp_init();

View File

@ -303,7 +303,7 @@ struct ctx {
int low_rmem; int low_rmem;
}; };
void proto_update_l2_buf(const unsigned char *eth_d, void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const unsigned char *eth_s); const struct in_addr *ip_da);
#endif /* PASST_H */ #endif /* PASST_H */

View File

@ -353,7 +353,7 @@ void pasta_ns_conf(struct ctx *c)
} }
} }
proto_update_l2_buf(c->mac_guest, NULL); proto_update_l2_buf(c->mac_guest, NULL, NULL);
} }
/** /**

29
tap.c
View File

@ -625,8 +625,10 @@ resume:
l4_len = l3_len - hlen; l4_len = l3_len - hlen;
if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr) if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr) {
c->ip4.addr_seen.s_addr = iph->saddr; c->ip4.addr_seen.s_addr = iph->saddr;
proto_update_l2_buf(NULL, NULL, &c->ip4.addr_seen);
}
l4h = packet_get(in, i, sizeof(*eh) + hlen, l4_len, NULL); l4h = packet_get(in, i, sizeof(*eh) + hlen, l4_len, NULL);
if (!l4h) if (!l4h)
@ -641,8 +643,7 @@ resume:
tap_packet_debug(iph, NULL, NULL, 0, NULL, 1); tap_packet_debug(iph, NULL, NULL, 0, NULL, 1);
packet_add(pkt, l4_len, l4h); packet_add(pkt, l4_len, l4h);
icmp_tap_handler(c, AF_INET, &iph->saddr, &iph->daddr, icmp_tap_handler(c, AF_INET, &iph->daddr, pkt, now);
pkt, now);
continue; continue;
} }
@ -707,6 +708,7 @@ append:
for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) { for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) {
struct pool *p = (struct pool *)&seq->p; struct pool *p = (struct pool *)&seq->p;
struct in_addr *da = &seq->daddr;
size_t n = p->count; size_t n = p->count;
tap_packet_debug(NULL, NULL, seq, 0, NULL, n); tap_packet_debug(NULL, NULL, seq, 0, NULL, n);
@ -714,13 +716,11 @@ append:
if (seq->protocol == IPPROTO_TCP) { if (seq->protocol == IPPROTO_TCP) {
if (c->no_tcp) if (c->no_tcp)
continue; continue;
while ((n -= tcp_tap_handler(c, AF_INET, &seq->saddr, while ((n -= tcp_tap_handler(c, AF_INET, da, p, now)));
&seq->daddr, p, now)));
} else if (seq->protocol == IPPROTO_UDP) { } else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp) if (c->no_udp)
continue; continue;
while ((n -= udp_tap_handler(c, AF_INET, &seq->saddr, while ((n -= udp_tap_handler(c, AF_INET, da, p, now)));
&seq->daddr, p, now)));
} }
} }
@ -801,7 +801,7 @@ resume:
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
packet_add(pkt, l4_len, l4h); packet_add(pkt, l4_len, l4h);
icmp_tap_handler(c, AF_INET6, saddr, daddr, pkt, now); icmp_tap_handler(c, AF_INET6, daddr, pkt, now);
continue; continue;
} }
@ -818,6 +818,8 @@ resume:
continue; continue;
} }
*saddr = c->ip6.addr;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
continue; continue;
@ -868,6 +870,7 @@ append:
for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) { for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) {
struct pool *p = (struct pool *)&seq->p; struct pool *p = (struct pool *)&seq->p;
struct in6_addr *da = &seq->daddr;
size_t n = p->count; size_t n = p->count;
tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n); tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n);
@ -875,13 +878,11 @@ append:
if (seq->protocol == IPPROTO_TCP) { if (seq->protocol == IPPROTO_TCP) {
if (c->no_tcp) if (c->no_tcp)
continue; continue;
while ((n -= tcp_tap_handler(c, AF_INET6, &seq->saddr, while ((n -= tcp_tap_handler(c, AF_INET6, da, p, now)));
&seq->daddr, p, now)));
} else if (seq->protocol == IPPROTO_UDP) { } else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp) if (c->no_udp)
continue; continue;
while ((n -= udp_tap_handler(c, AF_INET6, &seq->saddr, while ((n -= udp_tap_handler(c, AF_INET6, da, p, now)));
&seq->daddr, p, now)));
} }
} }
@ -967,7 +968,7 @@ redo:
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN); memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
proto_update_l2_buf(c->mac_guest, NULL); proto_update_l2_buf(c->mac_guest, NULL, NULL);
} }
switch (ntohs(eh->h_proto)) { switch (ntohs(eh->h_proto)) {
@ -1028,7 +1029,7 @@ restart:
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN); memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
proto_update_l2_buf(c->mac_guest, NULL); proto_update_l2_buf(c->mac_guest, NULL, NULL);
} }
switch (ntohs(eh->h_proto)) { switch (ntohs(eh->h_proto)) {

203
tcp.c
View File

@ -309,6 +309,9 @@
#define TCP_FRAMES \ #define TCP_FRAMES \
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
#define TCP_FILE_PRESSURE 30 /* % of c->nofile */
#define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */
#define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \ #define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \
TCP_HASH_TABLE_LOAD) TCP_HASH_TABLE_LOAD)
@ -320,8 +323,10 @@
#define MSS_DEFAULT 536 #define MSS_DEFAULT 536
struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
uint32_t psum;
uint32_t tsum;
#ifdef __AVX2__ #ifdef __AVX2__
uint8_t pad[26]; uint8_t pad[18];
#else #else
uint8_t pad[2]; uint8_t pad[2];
#endif #endif
@ -396,7 +401,7 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#define OPT_SACK 5 #define OPT_SACK 5
#define OPT_TS 8 #define OPT_TS 8
#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) #define CONN_V4(conn) (!!inany_v4(&(conn)->addr))
#define CONN_V6(conn) (!CONN_V4(conn)) #define CONN_V6(conn) (!CONN_V4(conn))
#define CONN_IS_CLOSING(conn) \ #define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \ ((conn->events & ESTABLISHED) && \
@ -429,15 +434,15 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
/* Table of guest side forwarding addresses with very low RTT (assumed /* Table of destinations with very low RTT (assumed to be local), LRU */
* to be local to the host), LRU
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
/* Static buffers */ /* Static buffers */
/** /**
* tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
* @psum: Partial IP header checksum (excluding tot_len and saddr)
* @tsum: Partial TCP header checksum (excluding length and saddr)
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
* @taph: Tap-level headers (partially pre-filled) * @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr) * @iph: Pre-filled IP header (except for tot_len and saddr)
@ -445,15 +450,17 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
* @data: Storage for TCP payload * @data: Storage for TCP payload
*/ */
static struct tcp4_l2_buf_t { static struct tcp4_l2_buf_t {
uint32_t psum; /* 0 */
uint32_t tsum; /* 4 */
#ifdef __AVX2__ #ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */ uint8_t pad[18]; /* 8, align th to 32 bytes */
#else #else
uint8_t pad[2]; /* align iph to 4 bytes 0 */ uint8_t pad[2]; /* align iph to 4 bytes 8 */
#endif #endif
struct tap_hdr taph; /* 26 2 */ struct tap_hdr taph; /* 26 10 */
struct iphdr iph; /* 44 20 */ struct iphdr iph; /* 44 28 */
struct tcphdr th; /* 64 40 */ struct tcphdr th; /* 64 48 */
uint8_t data[MSS4]; /* 84 60 */ uint8_t data[MSS4]; /* 84 68 */
/* 65536 65532 */ /* 65536 65532 */
#ifdef __AVX2__ #ifdef __AVX2__
} __attribute__ ((packed, aligned(32))) } __attribute__ ((packed, aligned(32)))
@ -508,6 +515,8 @@ static struct iovec tcp_iov [UIO_MAXIOV];
/** /**
* tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
* @psum: Partial IP header checksum (excluding tot_len and saddr)
* @tsum: Partial TCP header checksum (excluding length and saddr)
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
* @taph: Tap-level headers (partially pre-filled) * @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr) * @iph: Pre-filled IP header (except for tot_len and saddr)
@ -515,14 +524,16 @@ static struct iovec tcp_iov [UIO_MAXIOV];
* @opts: Headroom for TCP options * @opts: Headroom for TCP options
*/ */
static struct tcp4_l2_flags_buf_t { static struct tcp4_l2_flags_buf_t {
uint32_t psum; /* 0 */
uint32_t tsum; /* 4 */
#ifdef __AVX2__ #ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */ uint8_t pad[18]; /* 8, align th to 32 bytes */
#else #else
uint8_t pad[2]; /* align iph to 4 bytes 0 */ uint8_t pad[2]; /* align iph to 4 bytes 8 */
#endif #endif
struct tap_hdr taph; /* 26 2 */ struct tap_hdr taph; /* 26 10 */
struct iphdr iph; /* 44 20 */ struct iphdr iph; /* 44 28 */
struct tcphdr th; /* 64 40 */ struct tcphdr th; /* 64 48 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__ #ifdef __AVX2__
} __attribute__ ((packed, aligned(32))) } __attribute__ ((packed, aligned(32)))
@ -631,13 +642,13 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
*/ */
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{ {
int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
.tcp.index = CONN_IDX(conn) }; .tcp.index = CONN_IDX(conn) };
struct epoll_event ev = { .data.u64 = ref.u64 }; struct epoll_event ev = { .data.u64 = ref.u64 };
if (conn->events == CLOSED) { if (conn->events == CLOSED) {
if (conn->in_epoll) if (conn->c.in_epoll)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
if (conn->timer != -1) if (conn->timer != -1)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
@ -649,7 +660,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
return -errno; return -errno;
conn->in_epoll = true; conn->c.in_epoll = true;
if (conn->timer != -1) { if (conn->timer != -1) {
union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
@ -847,7 +858,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
int i; int i;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
if (inany_equals(&conn->faddr, low_rtt_dst + i)) if (inany_equals(&conn->addr, low_rtt_dst + i))
return 1; return 1;
return 0; return 0;
@ -869,7 +880,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
return; return;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
if (inany_equals(&conn->faddr, low_rtt_dst + i)) if (inany_equals(&conn->addr, low_rtt_dst + i))
return; return;
if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
hole = i; hole = i;
@ -881,7 +892,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
if (hole == -1) if (hole == -1)
return; return;
low_rtt_dst[hole++] = conn->faddr; low_rtt_dst[hole++] = conn->addr;
if (hole == LOW_RTT_TABLE_SIZE) if (hole == LOW_RTT_TABLE_SIZE)
hole = 0; hole = 0;
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
@ -940,13 +951,11 @@ void tcp_sock_set_bufsize(const struct ctx *c, int s)
*/ */
static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf) static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf)
{ {
uint32_t sum = L2_BUF_IP4_PSUM(IPPROTO_TCP); uint32_t sum = buf->psum;
sum += buf->iph.tot_len; sum += buf->iph.tot_len;
sum += (buf->iph.saddr >> 16) & 0xffff; sum += (buf->iph.saddr >> 16) & 0xffff;
sum += buf->iph.saddr & 0xffff; sum += buf->iph.saddr & 0xffff;
sum += (buf->iph.daddr >> 16) & 0xffff;
sum += buf->iph.daddr & 0xffff;
buf->iph.check = (uint16_t)~csum_fold(sum); buf->iph.check = (uint16_t)~csum_fold(sum);
} }
@ -958,12 +967,10 @@ static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf)
static void tcp_update_check_tcp4(struct tcp4_l2_buf_t *buf) static void tcp_update_check_tcp4(struct tcp4_l2_buf_t *buf)
{ {
uint16_t tlen = ntohs(buf->iph.tot_len) - 20; uint16_t tlen = ntohs(buf->iph.tot_len) - 20;
uint32_t sum = htons(IPPROTO_TCP); uint32_t sum = buf->tsum;
sum += (buf->iph.saddr >> 16) & 0xffff; sum += (buf->iph.saddr >> 16) & 0xffff;
sum += buf->iph.saddr & 0xffff; sum += buf->iph.saddr & 0xffff;
sum += (buf->iph.daddr >> 16) & 0xffff;
sum += buf->iph.daddr & 0xffff;
sum += htons(ntohs(buf->iph.tot_len) - 20); sum += htons(ntohs(buf->iph.tot_len) - 20);
buf->th.check = 0; buf->th.check = 0;
@ -994,8 +1001,10 @@ static void tcp_update_check_tcp6(struct tcp6_l2_buf_t *buf)
* tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
* @eth_d: Ethernet destination address, NULL if unchanged * @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged
* @ip_da: Pointer to IPv4 destination address, NULL if unchanged
*/ */
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da)
{ {
int i; int i;
@ -1009,6 +1018,24 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
tap_update_mac(&b6->taph, eth_d, eth_s); tap_update_mac(&b6->taph, eth_d, eth_s);
tap_update_mac(&b4f->taph, eth_d, eth_s); tap_update_mac(&b4f->taph, eth_d, eth_s);
tap_update_mac(&b6f->taph, eth_d, eth_s); tap_update_mac(&b6f->taph, eth_d, eth_s);
if (ip_da) {
b4f->iph.daddr = b4->iph.daddr = ip_da->s_addr;
if (!i) {
b4f->iph.saddr = b4->iph.saddr = 0;
b4f->iph.tot_len = b4->iph.tot_len = 0;
b4f->iph.check = b4->iph.check = 0;
b4f->psum = b4->psum = sum_16b(&b4->iph, 20);
b4->tsum = ((ip_da->s_addr >> 16) & 0xffff) +
(ip_da->s_addr & 0xffff) +
htons(IPPROTO_TCP);
b4f->tsum = b4->tsum;
} else {
b4f->psum = b4->psum = tcp4_l2_buf[0].psum;
b4f->tsum = b4->tsum = tcp4_l2_buf[0].tsum;
}
}
} }
} }
@ -1016,16 +1043,15 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
* @c: Execution context * @c: Execution context
*/ */
static void tcp_sock4_iov_init(struct ctx *c) static void tcp_sock4_iov_init(const struct ctx *c)
{ {
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
struct iovec *iov; struct iovec *iov;
int i; int i;
for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IP), .taph = TAP_HDR_INIT(ETH_P_IP),
.iph = iph, .iph = L2_BUF_IP4_INIT(IPPROTO_TCP),
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
}; };
} }
@ -1136,18 +1162,18 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
/** /**
* tcp_hash_match() - Check if a connection entry matches address and ports * tcp_hash_match() - Check if a connection entry matches address and ports
* @conn: Connection entry to match against * @conn: Connection entry to match against
* @faddr: Guest side forwarding address * @addr: Remote address
* @eport: Guest side endpoint port * @tap_port: tap-facing port
* @fport: Guest side forwarding port * @sock_port: Socket-facing port
* *
* Return: 1 on match, 0 otherwise * Return: 1 on match, 0 otherwise
*/ */
static int tcp_hash_match(const struct tcp_tap_conn *conn, static int tcp_hash_match(const struct tcp_tap_conn *conn,
const union inany_addr *faddr, const union inany_addr *addr,
in_port_t eport, in_port_t fport) in_port_t tap_port, in_port_t sock_port)
{ {
if (inany_equals(&conn->faddr, faddr) && if (inany_equals(&conn->addr, addr) &&
conn->eport == eport && conn->fport == fport) conn->tap_port == tap_port && conn->sock_port == sock_port)
return 1; return 1;
return 0; return 0;
@ -1156,21 +1182,21 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn,
/** /**
* tcp_hash() - Calculate hash value for connection given address and ports * tcp_hash() - Calculate hash value for connection given address and ports
* @c: Execution context * @c: Execution context
* @faddr: Guest side forwarding address * @addr: Remote address
* @eport: Guest side endpoint port * @tap_port: tap-facing port
* @fport: Guest side forwarding port * @sock_port: Socket-facing port
* *
* Return: hash value, already modulo size of the hash table * Return: hash value, already modulo size of the hash table
*/ */
static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr, static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *addr,
in_port_t eport, in_port_t fport) in_port_t tap_port, in_port_t sock_port)
{ {
struct { struct {
union inany_addr faddr; union inany_addr addr;
in_port_t eport; in_port_t tap_port;
in_port_t fport; in_port_t sock_port;
} __attribute__((__packed__)) in = { } __attribute__((__packed__)) in = {
*faddr, eport, fport *addr, tap_port, sock_port
}; };
uint64_t b = 0; uint64_t b = 0;
@ -1189,7 +1215,7 @@ static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr,
static unsigned int tcp_conn_hash(const struct ctx *c, static unsigned int tcp_conn_hash(const struct ctx *c,
const struct tcp_tap_conn *conn) const struct tcp_tap_conn *conn)
{ {
return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); return tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port);
} }
/** /**
@ -1201,7 +1227,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn)
{ {
int b; int b;
b = tcp_hash(c, &conn->faddr, conn->eport, conn->fport); b = tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port);
conn->next_index = tc_hash[b] ? CONN_IDX(tc_hash[b]) : -1; conn->next_index = tc_hash[b] ? CONN_IDX(tc_hash[b]) : -1;
tc_hash[b] = conn; tc_hash[b] = conn;
@ -1270,24 +1296,25 @@ static void tcp_tap_conn_update(struct ctx *c, struct tcp_tap_conn *old,
* tcp_hash_lookup() - Look up connection given remote address and ports * tcp_hash_lookup() - Look up connection given remote address and ports
* @c: Execution context * @c: Execution context
* @af: Address family, AF_INET or AF_INET6 * @af: Address family, AF_INET or AF_INET6
* @faddr: Guest side forwarding address (guest remote address) * @addr: Remote address, pointer to in_addr or in6_addr
* @eport: Guest side endpoint port (guest local port) * @tap_port: tap-facing port
* @fport: Guest side forwarding port (guest remote port) * @sock_port: Socket-facing port
* *
* Return: connection pointer, if found, -ENOENT otherwise * Return: connection pointer, if found, -ENOENT otherwise
*/ */
static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c,
int af, const void *faddr, int af, const void *addr,
in_port_t eport, in_port_t fport) in_port_t tap_port,
in_port_t sock_port)
{ {
union inany_addr aany; union inany_addr aany;
struct tcp_tap_conn *conn; struct tcp_tap_conn *conn;
int b; int b;
inany_from_af(&aany, af, faddr); inany_from_af(&aany, af, addr);
b = tcp_hash(c, &aany, eport, fport); b = tcp_hash(c, &aany, tap_port, sock_port);
for (conn = tc_hash[b]; conn; conn = conn_at_idx(conn->next_index)) { for (conn = tc_hash[b]; conn; conn = conn_at_idx(conn->next_index)) {
if (tcp_hash_match(conn, &aany, eport, fport)) if (tcp_hash_match(conn, &aany, tap_port, sock_port))
return conn; return conn;
} }
@ -1382,11 +1409,17 @@ static void tcp_l2_data_buf_flush(struct ctx *c)
*/ */
void tcp_defer_handler(struct ctx *c) void tcp_defer_handler(struct ctx *c)
{ {
int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE;
int max_files = c->nofile / 100 * TCP_FILE_PRESSURE;
union tcp_conn *conn; union tcp_conn *conn;
tcp_l2_flags_buf_flush(c); tcp_l2_flags_buf_flush(c);
tcp_l2_data_buf_flush(c); tcp_l2_data_buf_flush(c);
if ((c->tcp.conn_count < MIN(max_files, max_conns)) &&
(c->tcp.splice_conn_count < MIN(max_files / 6, max_conns)))
return;
for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) { for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) {
if (conn->c.spliced) { if (conn->c.spliced) {
if (conn->splice.flags & CLOSING) if (conn->splice.flags & CLOSING)
@ -1414,13 +1447,13 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
void *p, size_t plen, void *p, size_t plen,
const uint16_t *check, uint32_t seq) const uint16_t *check, uint32_t seq)
{ {
const struct in_addr *a4 = inany_v4(&conn->faddr); const struct in_addr *a4 = inany_v4(&conn->addr);
size_t ip_len, tlen; size_t ip_len, tlen;
#define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \ #define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \
do { \ do { \
b->th.source = htons(conn->fport); \ b->th.source = htons(conn->sock_port); \
b->th.dest = htons(conn->eport); \ b->th.dest = htons(conn->tap_port); \
b->th.seq = htonl(seq); \ b->th.seq = htonl(seq); \
b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
if (conn->events & ESTABLISHED) { \ if (conn->events & ESTABLISHED) { \
@ -1456,7 +1489,7 @@ do { \
ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr));
b->ip6h.saddr = conn->faddr.a6; b->ip6h.saddr = conn->addr.a6;
if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr)) if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr))
b->ip6h.daddr = c->ip6.addr_ll_seen; b->ip6h.daddr = c->ip6.addr_ll_seen;
else else
@ -1809,7 +1842,7 @@ static void tcp_clamp_window(const struct ctx *c, struct tcp_tap_conn *conn,
/** /**
* tcp_seq_init() - Calculate initial sequence number according to RFC 6528 * tcp_seq_init() - Calculate initial sequence number according to RFC 6528
* @c: Execution context * @c: Execution context
* @conn: TCP connection, with faddr, fport and eport populated * @conn: TCP connection, with addr, sock_port and tap_port populated
* @now: Current timestamp * @now: Current timestamp
*/ */
static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
@ -1822,9 +1855,9 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
union inany_addr dst; union inany_addr dst;
in_port_t dstport; in_port_t dstport;
} __attribute__((__packed__)) in = { } __attribute__((__packed__)) in = {
.src = conn->faddr, .src = conn->addr,
.srcport = conn->fport, .srcport = conn->tap_port,
.dstport = conn->eport, .dstport = conn->sock_port,
}; };
uint32_t ns, seq = 0; uint32_t ns, seq = 0;
@ -1972,15 +2005,13 @@ static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af)
* tcp_conn_from_tap() - Handle connection request (SYN segment) from tap * tcp_conn_from_tap() - Handle connection request (SYN segment) from tap
* @c: Execution context * @c: Execution context
* @af: Address family, AF_INET or AF_INET6 * @af: Address family, AF_INET or AF_INET6
* @saddr: Source address, pointer to in_addr or in6_addr * @addr: Remote address, pointer to in_addr or in6_addr
* @daddr: Destination address, pointer to in_addr or in6_addr
* @th: TCP header from tap: caller MUST ensure it's there * @th: TCP header from tap: caller MUST ensure it's there
* @opts: Pointer to start of options * @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length * @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp * @now: Current timestamp
*/ */
static void tcp_conn_from_tap(struct ctx *c, static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
int af, const void *saddr, const void *daddr,
const struct tcphdr *th, const char *opts, const struct tcphdr *th, const char *opts,
size_t optlen, const struct timespec *now) size_t optlen, const struct timespec *now)
{ {
@ -1988,20 +2019,18 @@ static void tcp_conn_from_tap(struct ctx *c,
struct sockaddr_in addr4 = { struct sockaddr_in addr4 = {
.sin_family = AF_INET, .sin_family = AF_INET,
.sin_port = th->dest, .sin_port = th->dest,
.sin_addr = *(struct in_addr *)daddr, .sin_addr = *(struct in_addr *)addr,
}; };
struct sockaddr_in6 addr6 = { struct sockaddr_in6 addr6 = {
.sin6_family = AF_INET6, .sin6_family = AF_INET6,
.sin6_port = th->dest, .sin6_port = th->dest,
.sin6_addr = *(struct in6_addr *)daddr, .sin6_addr = *(struct in6_addr *)addr,
}; };
const struct sockaddr *sa; const struct sockaddr *sa;
struct tcp_tap_conn *conn; struct tcp_tap_conn *conn;
socklen_t sl; socklen_t sl;
int s, mss; int s, mss;
(void)saddr;
if (c->tcp.conn_count >= TCP_MAX_CONNS) if (c->tcp.conn_count >= TCP_MAX_CONNS)
return; return;
@ -2010,9 +2039,9 @@ static void tcp_conn_from_tap(struct ctx *c,
return; return;
if (!c->no_map_gw) { if (!c->no_map_gw) {
if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw)) if (af == AF_INET && IN4_ARE_ADDR_EQUAL(addr, &c->ip4.gw))
addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw)) if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(addr, &c->ip6.gw))
addr6.sin6_addr = in6addr_loopback; addr6.sin6_addr = in6addr_loopback;
} }
@ -2049,7 +2078,7 @@ static void tcp_conn_from_tap(struct ctx *c,
if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
conn->wnd_from_tap = 1; conn->wnd_from_tap = 1;
inany_from_af(&conn->faddr, af, daddr); inany_from_af(&conn->addr, af, addr);
if (af == AF_INET) { if (af == AF_INET) {
sa = (struct sockaddr *)&addr4; sa = (struct sockaddr *)&addr4;
@ -2059,8 +2088,8 @@ static void tcp_conn_from_tap(struct ctx *c,
sl = sizeof(addr6); sl = sizeof(addr6);
} }
conn->fport = ntohs(th->dest); conn->sock_port = ntohs(th->dest);
conn->eport = ntohs(th->source); conn->tap_port = ntohs(th->source);
conn->seq_init_from_tap = ntohl(th->seq); conn->seq_init_from_tap = ntohl(th->seq);
conn->seq_from_tap = conn->seq_init_from_tap + 1; conn->seq_from_tap = conn->seq_init_from_tap + 1;
@ -2527,14 +2556,13 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
* tcp_tap_handler() - Handle packets from tap and state transitions * tcp_tap_handler() - Handle packets from tap and state transitions
* @c: Execution context * @c: Execution context
* @af: Address family, AF_INET or AF_INET6 * @af: Address family, AF_INET or AF_INET6
* @saddr: Source address * @addr: Destination address
* @daddr: Destination address
* @p: Pool of TCP packets, with TCP headers * @p: Pool of TCP packets, with TCP headers
* @now: Current timestamp * @now: Current timestamp
* *
* Return: count of consumed packets * Return: count of consumed packets
*/ */
int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, int tcp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now) const struct pool *p, const struct timespec *now)
{ {
struct tcp_tap_conn *conn; struct tcp_tap_conn *conn;
@ -2555,13 +2583,12 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL);
opts = packet_get(p, 0, sizeof(*th), optlen, NULL); opts = packet_get(p, 0, sizeof(*th), optlen, NULL);
conn = tcp_hash_lookup(c, af, daddr, htons(th->source), htons(th->dest)); conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
/* New connection from tap */ /* New connection from tap */
if (!conn) { if (!conn) {
if (opts && th->syn && !th->ack) if (opts && th->syn && !th->ack)
tcp_conn_from_tap(c, af, saddr, daddr, th, tcp_conn_from_tap(c, af, addr, th, opts, optlen, now);
opts, optlen, now);
return 1; return 1;
} }
@ -2720,10 +2747,10 @@ static void tcp_tap_conn_from_sock(struct ctx *c,
conn->ws_to_tap = conn->ws_from_tap = 0; conn->ws_to_tap = conn->ws_from_tap = 0;
conn_event(c, conn, SOCK_ACCEPTED); conn_event(c, conn, SOCK_ACCEPTED);
inany_from_sockaddr(&conn->faddr, &conn->fport, sa); inany_from_sockaddr(&conn->addr, &conn->sock_port, sa);
conn->eport = ref.port; conn->tap_port = ref.port;
tcp_snat_inbound(c, &conn->faddr); tcp_snat_inbound(c, &conn->addr);
tcp_seq_init(c, conn, now); tcp_seq_init(c, conn, now);
tcp_hash_insert(c, conn); tcp_hash_insert(c, conn);

7
tcp.h
View File

@ -17,7 +17,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
void tcp_listen_handler(struct ctx *c, union epoll_ref ref, void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
const struct timespec *now); const struct timespec *now);
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, int tcp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now); const struct pool *p, const struct timespec *now);
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
const char *ifname, in_port_t port); const char *ifname, in_port_t port);
@ -26,7 +26,8 @@ void tcp_timer(struct ctx *c, const struct timespec *ts);
void tcp_defer_handler(struct ctx *c); void tcp_defer_handler(struct ctx *c);
void tcp_sock_set_bufsize(const struct ctx *c, int s); void tcp_sock_set_bufsize(const struct ctx *c, int s);
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da);
/** /**
* union tcp_epoll_ref - epoll reference portion for TCP connections * union tcp_epoll_ref - epoll reference portion for TCP connections
@ -56,6 +57,7 @@ union tcp_listen_epoll_ref {
* struct tcp_ctx - Execution context for TCP routines * struct tcp_ctx - Execution context for TCP routines
* @hash_secret: 128-bit secret for hash functions, ISN and hash table * @hash_secret: 128-bit secret for hash functions, ISN and hash table
* @conn_count: Count of total connections in connection table * @conn_count: Count of total connections in connection table
* @splice_conn_count: Count of spliced connections in connection table
* @port_to_tap: Ports bound host-side, packets to tap or spliced * @port_to_tap: Ports bound host-side, packets to tap or spliced
* @fwd_in: Port forwarding configuration for inbound packets * @fwd_in: Port forwarding configuration for inbound packets
* @fwd_out: Port forwarding configuration for outbound packets * @fwd_out: Port forwarding configuration for outbound packets
@ -66,6 +68,7 @@ union tcp_listen_epoll_ref {
struct tcp_ctx { struct tcp_ctx {
uint64_t hash_secret[2]; uint64_t hash_secret[2];
int conn_count; int conn_count;
int splice_conn_count;
struct port_fwd fwd_in; struct port_fwd fwd_in;
struct port_fwd fwd_out; struct port_fwd fwd_out;
struct timespec timer_run; struct timespec timer_run;

View File

@ -12,9 +12,11 @@
/** /**
* struct tcp_conn_common - Common fields for spliced and non-spliced * struct tcp_conn_common - Common fields for spliced and non-spliced
* @spliced: Is this a spliced connection? * @spliced: Is this a spliced connection?
* @in_epoll: Is the connection in the epoll set?
*/ */
struct tcp_conn_common { struct tcp_conn_common {
bool spliced :1; bool spliced :1;
bool in_epoll :1;
}; };
extern const char *tcp_common_flag_str[]; extern const char *tcp_common_flag_str[];
@ -22,7 +24,6 @@ extern const char *tcp_common_flag_str[];
/** /**
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
* @c: Fields common with tcp_splice_conn * @c: Fields common with tcp_splice_conn
* @in_epoll: Is the connection in the epoll set?
* @next_index: Connection index of next item in hash chain, -1 for none * @next_index: Connection index of next item in hash chain, -1 for none
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
* @sock: Socket descriptor number * @sock: Socket descriptor number
@ -34,9 +35,9 @@ extern const char *tcp_common_flag_str[];
* @ws_to_tap: Window scaling factor advertised to tap/guest * @ws_to_tap: Window scaling factor advertised to tap/guest
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap * @seq_dup_ack_approx: Last duplicate ACK number sent to tap
* @faddr: Guest side forwarding address (guest's remote address) * @addr: Remote address (IPv4 or IPv6)
* @eport: Guest side endpoint port (guest's local port) * @tap_port: Guest-facing tap port
* @fport: Guest side forwarding port (guest's remote port) * @sock_port: Remote, socket-facing port
* @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_from_tap: Last window size from tap, unscaled (as received)
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
* @seq_to_tap: Next sequence for packets to tap * @seq_to_tap: Next sequence for packets to tap
@ -49,7 +50,6 @@ struct tcp_tap_conn {
/* Must be first element to match tcp_splice_conn */ /* Must be first element to match tcp_splice_conn */
struct tcp_conn_common c; struct tcp_conn_common c;
bool in_epoll :1;
int next_index :TCP_CONN_INDEX_BITS + 2; int next_index :TCP_CONN_INDEX_BITS + 2;
#define TCP_RETRANS_BITS 3 #define TCP_RETRANS_BITS 3
@ -105,9 +105,9 @@ struct tcp_tap_conn {
uint8_t seq_dup_ack_approx; uint8_t seq_dup_ack_approx;
union inany_addr faddr; union inany_addr addr;
in_port_t eport; in_port_t tap_port;
in_port_t fport; in_port_t sock_port;
uint16_t wnd_from_tap; uint16_t wnd_from_tap;
uint16_t wnd_to_tap; uint16_t wnd_to_tap;
@ -122,7 +122,6 @@ struct tcp_tap_conn {
/** /**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection * struct tcp_splice_conn - Descriptor for a spliced TCP connection
* @c: Fields common with tcp_tap_conn * @c: Fields common with tcp_tap_conn
* @in_epoll: Is the connection in the epoll set?
* @a: File descriptor number of socket for accepted connection * @a: File descriptor number of socket for accepted connection
* @pipe_a_b: Pipe ends for splice() from @a to @b * @pipe_a_b: Pipe ends for splice() from @a to @b
* @b: File descriptor number of peer connected socket * @b: File descriptor number of peer connected socket
@ -138,7 +137,6 @@ struct tcp_splice_conn {
/* Must be first element to match tcp_tap_conn */ /* Must be first element to match tcp_tap_conn */
struct tcp_conn_common c; struct tcp_conn_common c;
bool in_epoll :1;
int a; int a;
int pipe_a_b[2]; int pipe_a_b[2];
int b; int b;

View File

@ -172,7 +172,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
static int tcp_splice_epoll_ctl(const struct ctx *c, static int tcp_splice_epoll_ctl(const struct ctx *c,
struct tcp_splice_conn *conn) struct tcp_splice_conn *conn)
{ {
int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref_a = { .type = EPOLL_TYPE_TCP, .fd = conn->a, union epoll_ref ref_a = { .type = EPOLL_TYPE_TCP, .fd = conn->a,
.tcp.index = CONN_IDX(conn) }; .tcp.index = CONN_IDX(conn) };
union epoll_ref ref_b = { .type = EPOLL_TYPE_TCP, .fd = conn->b, union epoll_ref ref_b = { .type = EPOLL_TYPE_TCP, .fd = conn->b,
@ -192,7 +192,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
epoll_ctl(c->epollfd, m, conn->b, &ev_b)) epoll_ctl(c->epollfd, m, conn->b, &ev_b))
goto delete; goto delete;
conn->in_epoll = true; conn->c.in_epoll = true;
return 0; return 0;
@ -295,6 +295,7 @@ void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union)
conn->flags = 0; conn->flags = 0;
debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn)); debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));
c->tcp.splice_conn_count--;
tcp_table_compact(c, conn_union); tcp_table_compact(c, conn_union);
} }
@ -512,6 +513,7 @@ bool tcp_splice_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref,
trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s); trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s);
conn->c.spliced = true; conn->c.spliced = true;
c->tcp.splice_conn_count++;
conn->a = s; conn->a = s;
if (tcp_splice_new(c, conn, ref.port, ref.ns)) if (tcp_splice_new(c, conn, ref.port, ref.ns))

37
udp.c
View File

@ -168,6 +168,7 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8)
/** /**
* udp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections * udp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
* @s_in: Source socket address, filled in by recvmmsg() * @s_in: Source socket address, filled in by recvmmsg()
* @psum: Partial IP header checksum (excluding tot_len and saddr)
* @taph: Tap-level headers (partially pre-filled) * @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr) * @iph: Pre-filled IP header (except for tot_len and saddr)
* @uh: Headroom for UDP header * @uh: Headroom for UDP header
@ -175,6 +176,7 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8)
*/ */
static struct udp4_l2_buf_t { static struct udp4_l2_buf_t {
struct sockaddr_in s_in; struct sockaddr_in s_in;
uint32_t psum;
struct tap_hdr taph; struct tap_hdr taph;
struct iphdr iph; struct iphdr iph;
@ -261,13 +263,11 @@ static void udp_invert_portmap(struct udp_port_fwd *fwd)
*/ */
static void udp_update_check4(struct udp4_l2_buf_t *buf) static void udp_update_check4(struct udp4_l2_buf_t *buf)
{ {
uint32_t sum = L2_BUF_IP4_PSUM(IPPROTO_UDP); uint32_t sum = buf->psum;
sum += buf->iph.tot_len; sum += buf->iph.tot_len;
sum += (buf->iph.saddr >> 16) & 0xffff; sum += (buf->iph.saddr >> 16) & 0xffff;
sum += buf->iph.saddr & 0xffff; sum += buf->iph.saddr & 0xffff;
sum += (buf->iph.daddr >> 16) & 0xffff;
sum += buf->iph.daddr & 0xffff;
buf->iph.check = (uint16_t)~csum_fold(sum); buf->iph.check = (uint16_t)~csum_fold(sum);
} }
@ -276,8 +276,10 @@ static void udp_update_check4(struct udp4_l2_buf_t *buf)
* udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses * udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
* @eth_d: Ethernet destination address, NULL if unchanged * @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged
* @ip_da: Pointer to IPv4 destination address, NULL if unchanged
*/ */
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da)
{ {
int i; int i;
@ -287,6 +289,18 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
tap_update_mac(&b4->taph, eth_d, eth_s); tap_update_mac(&b4->taph, eth_d, eth_s);
tap_update_mac(&b6->taph, eth_d, eth_s); tap_update_mac(&b6->taph, eth_d, eth_s);
if (ip_da) {
b4->iph.daddr = ip_da->s_addr;
if (!i) {
b4->iph.saddr = 0;
b4->iph.tot_len = 0;
b4->iph.check = 0;
b4->psum = sum_16b(&b4->iph, 20);
} else {
b4->psum = udp4_l2_buf[0].psum;
}
}
} }
} }
@ -572,7 +586,6 @@ static size_t udp_update_hdr4(const struct ctx *c, int n, in_port_t dstport,
ip_len = udp4_l2_mh_sock[n].msg_len + sizeof(b->iph) + sizeof(b->uh); ip_len = udp4_l2_mh_sock[n].msg_len + sizeof(b->iph) + sizeof(b->uh);
b->iph.tot_len = htons(ip_len); b->iph.tot_len = htons(ip_len);
b->iph.daddr = c->ip4.addr_seen.s_addr;
src_port = ntohs(b->s_in.sin_port); src_port = ntohs(b->s_in.sin_port);
@ -786,8 +799,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* udp_tap_handler() - Handle packets from tap * udp_tap_handler() - Handle packets from tap
* @c: Execution context * @c: Execution context
* @af: Address family, AF_INET or AF_INET6 * @af: Address family, AF_INET or AF_INET6
* @saddr: Source address * @addr: Destination address
* @daddr: Destination address
* @p: Pool of UDP packets, with UDP headers * @p: Pool of UDP packets, with UDP headers
* @now: Current timestamp * @now: Current timestamp
* *
@ -795,7 +807,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* *
* #syscalls sendmmsg * #syscalls sendmmsg
*/ */
int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, int udp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now) const struct pool *p, const struct timespec *now)
{ {
struct mmsghdr mm[UIO_MAXIOV]; struct mmsghdr mm[UIO_MAXIOV];
@ -809,7 +821,6 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
socklen_t sl; socklen_t sl;
(void)c; (void)c;
(void)saddr;
uh = packet_get(p, 0, 0, sizeof(*uh), NULL); uh = packet_get(p, 0, 0, sizeof(*uh), NULL);
if (!uh) if (!uh)
@ -825,7 +836,7 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
s_in = (struct sockaddr_in) { s_in = (struct sockaddr_in) {
.sin_family = AF_INET, .sin_family = AF_INET,
.sin_port = uh->dest, .sin_port = uh->dest,
.sin_addr = *(struct in_addr *)daddr, .sin_addr = *(struct in_addr *)addr,
}; };
sa = (struct sockaddr *)&s_in; sa = (struct sockaddr *)&s_in;
@ -870,17 +881,17 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
s_in6 = (struct sockaddr_in6) { s_in6 = (struct sockaddr_in6) {
.sin6_family = AF_INET6, .sin6_family = AF_INET6,
.sin6_port = uh->dest, .sin6_port = uh->dest,
.sin6_addr = *(struct in6_addr *)daddr, .sin6_addr = *(struct in6_addr *)addr,
}; };
const struct in6_addr *bind_addr = &in6addr_any; const struct in6_addr *bind_addr = &in6addr_any;
sa = (struct sockaddr *)&s_in6; sa = (struct sockaddr *)&s_in6;
sl = sizeof(s_in6); sl = sizeof(s_in6);
if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.dns_match) && if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.dns_match) &&
ntohs(s_in6.sin6_port) == 53) { ntohs(s_in6.sin6_port) == 53) {
s_in6.sin6_addr = c->ip6.dns_host; s_in6.sin6_addr = c->ip6.dns_host;
} else if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw) && } else if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.gw) &&
!c->no_map_gw) { !c->no_map_gw) {
if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) || if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) ||
(udp_tap_map[V6][dst].flags & PORT_LOOPBACK)) (udp_tap_map[V6][dst].flags & PORT_LOOPBACK))

5
udp.h
View File

@ -10,13 +10,14 @@
void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now); const struct timespec *now);
int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, int udp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now); const struct pool *p, const struct timespec *now);
int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const void *addr, const char *ifname, in_port_t port); const void *addr, const char *ifname, in_port_t port);
int udp_init(struct ctx *c); int udp_init(struct ctx *c);
void udp_timer(struct ctx *c, const struct timespec *ts); void udp_timer(struct ctx *c, const struct timespec *ts);
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da);
/** /**
* union udp_epoll_ref - epoll reference portion for UDP sockets * union udp_epoll_ref - epoll reference portion for UDP sockets

4
util.h
View File

@ -141,13 +141,11 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
.tot_len = 0, \ .tot_len = 0, \
.id = 0, \ .id = 0, \
.frag_off = 0, \ .frag_off = 0, \
.ttl = 0xff, \ .ttl = 255, \
.protocol = (proto), \ .protocol = (proto), \
.saddr = 0, \ .saddr = 0, \
.daddr = 0, \ .daddr = 0, \
} }
#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \
(uint32_t)htons_constant(0xff00 | (proto)))
#define L2_BUF_IP6_INIT(proto) \ #define L2_BUF_IP6_INIT(proto) \
{ \ { \