1
0
mirror of https://passt.top/passt synced 2024-12-22 13:45:32 +00:00

Compare commits

..

No commits in common. "69303cafbef86ef070d67582169d455eb8da288c" and "0af928eaa020c1062fdc91598dfdc533966e2afe" have entirely different histories.

14 changed files with 189 additions and 156 deletions

12
icmp.c
View File

@ -154,21 +154,17 @@ void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref)
* icmp_tap_handler() - Handle packets from tap
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
* @addr: Destination address
* @p: Packet pool, single packet with ICMP/ICMPv6 header
* @now: Current timestamp
*
* Return: count of consumed packets (always 1, even if malformed)
*/
int icmp_tap_handler(const struct ctx *c, int af,
const void *saddr, const void *daddr,
int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now)
{
size_t plen;
(void)saddr;
if (af == AF_INET) {
struct sockaddr_in sa = {
.sin_family = AF_INET,
@ -214,7 +210,7 @@ int icmp_tap_handler(const struct ctx *c, int af,
icmp_id_map[V4][id].ts = now->tv_sec;
bitmap_set(icmp_act[V4], id);
sa.sin_addr = *(struct in_addr *)daddr;
sa.sin_addr = *(struct in_addr *)addr;
if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa)) < 0) {
debug("ICMP: failed to relay request to socket");
@ -268,7 +264,7 @@ int icmp_tap_handler(const struct ctx *c, int af,
icmp_id_map[V6][id].ts = now->tv_sec;
bitmap_set(icmp_act[V6], id);
sa.sin6_addr = *(struct in6_addr *)daddr;
sa.sin6_addr = *(struct in6_addr *)addr;
if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL,
(struct sockaddr *)&sa, sizeof(sa)) < 1) {
debug("ICMPv6: failed to relay request to socket");

3
icmp.h
View File

@ -12,8 +12,7 @@ struct ctx;
void icmp_sock_handler(const struct ctx *c, union epoll_ref ref);
void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref);
int icmp_tap_handler(const struct ctx *c,
int af, const void *saddr, const void *daddr,
int icmp_tap_handler(const struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now);
void icmp_timer(const struct ctx *c, const struct timespec *ts);
void icmp_init(void);

View File

@ -6,9 +6,6 @@
* IPv6 or IPv4 (encoded as IPv4-mapped IPv6 addresses)
*/
#ifndef INANY_H
#define INANY_H
/** union inany_addr - Represents either an IPv4 or IPv6 address
* @a6: Address as an IPv6 address, may be IPv4-mapped
* @v4mapped.zero: All zero-bits for an IPv4 address
@ -93,5 +90,3 @@ static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port,
ASSERT(0);
}
}
#endif /* INANY_H */

10
passt.c
View File

@ -117,11 +117,13 @@ static void timer_init(struct ctx *c, const struct timespec *now)
* proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers
* @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged
* @ip_da: Pointer to IPv4 destination address, NULL if unchanged
*/
void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da)
{
tcp_update_l2_buf(eth_d, eth_s);
udp_update_l2_buf(eth_d, eth_s);
tcp_update_l2_buf(eth_d, eth_s, ip_da);
udp_update_l2_buf(eth_d, eth_s, ip_da);
}
/**
@ -245,7 +247,7 @@ int main(int argc, char **argv)
if (!c.no_icmp)
icmp_init();
proto_update_l2_buf(c.mac_guest, c.mac);
proto_update_l2_buf(c.mac_guest, c.mac, &c.ip4.addr);
if (c.ifi4 && !c.no_dhcp)
dhcp_init();

View File

@ -303,7 +303,7 @@ struct ctx {
int low_rmem;
};
void proto_update_l2_buf(const unsigned char *eth_d,
const unsigned char *eth_s);
void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da);
#endif /* PASST_H */

View File

@ -353,7 +353,7 @@ void pasta_ns_conf(struct ctx *c)
}
}
proto_update_l2_buf(c->mac_guest, NULL);
proto_update_l2_buf(c->mac_guest, NULL, NULL);
}
/**

29
tap.c
View File

@ -625,8 +625,10 @@ resume:
l4_len = l3_len - hlen;
if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr)
if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr) {
c->ip4.addr_seen.s_addr = iph->saddr;
proto_update_l2_buf(NULL, NULL, &c->ip4.addr_seen);
}
l4h = packet_get(in, i, sizeof(*eh) + hlen, l4_len, NULL);
if (!l4h)
@ -641,8 +643,7 @@ resume:
tap_packet_debug(iph, NULL, NULL, 0, NULL, 1);
packet_add(pkt, l4_len, l4h);
icmp_tap_handler(c, AF_INET, &iph->saddr, &iph->daddr,
pkt, now);
icmp_tap_handler(c, AF_INET, &iph->daddr, pkt, now);
continue;
}
@ -707,6 +708,7 @@ append:
for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) {
struct pool *p = (struct pool *)&seq->p;
struct in_addr *da = &seq->daddr;
size_t n = p->count;
tap_packet_debug(NULL, NULL, seq, 0, NULL, n);
@ -714,13 +716,11 @@ append:
if (seq->protocol == IPPROTO_TCP) {
if (c->no_tcp)
continue;
while ((n -= tcp_tap_handler(c, AF_INET, &seq->saddr,
&seq->daddr, p, now)));
while ((n -= tcp_tap_handler(c, AF_INET, da, p, now)));
} else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp)
continue;
while ((n -= udp_tap_handler(c, AF_INET, &seq->saddr,
&seq->daddr, p, now)));
while ((n -= udp_tap_handler(c, AF_INET, da, p, now)));
}
}
@ -801,7 +801,7 @@ resume:
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
packet_add(pkt, l4_len, l4h);
icmp_tap_handler(c, AF_INET6, saddr, daddr, pkt, now);
icmp_tap_handler(c, AF_INET6, daddr, pkt, now);
continue;
}
@ -818,6 +818,8 @@ resume:
continue;
}
*saddr = c->ip6.addr;
if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1);
continue;
@ -868,6 +870,7 @@ append:
for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) {
struct pool *p = (struct pool *)&seq->p;
struct in6_addr *da = &seq->daddr;
size_t n = p->count;
tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n);
@ -875,13 +878,11 @@ append:
if (seq->protocol == IPPROTO_TCP) {
if (c->no_tcp)
continue;
while ((n -= tcp_tap_handler(c, AF_INET6, &seq->saddr,
&seq->daddr, p, now)));
while ((n -= tcp_tap_handler(c, AF_INET6, da, p, now)));
} else if (seq->protocol == IPPROTO_UDP) {
if (c->no_udp)
continue;
while ((n -= udp_tap_handler(c, AF_INET6, &seq->saddr,
&seq->daddr, p, now)));
while ((n -= udp_tap_handler(c, AF_INET6, da, p, now)));
}
}
@ -967,7 +968,7 @@ redo:
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
proto_update_l2_buf(c->mac_guest, NULL);
proto_update_l2_buf(c->mac_guest, NULL, NULL);
}
switch (ntohs(eh->h_proto)) {
@ -1028,7 +1029,7 @@ restart:
if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
proto_update_l2_buf(c->mac_guest, NULL);
proto_update_l2_buf(c->mac_guest, NULL, NULL);
}
switch (ntohs(eh->h_proto)) {

203
tcp.c
View File

@ -309,6 +309,9 @@
#define TCP_FRAMES \
(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
#define TCP_FILE_PRESSURE 30 /* % of c->nofile */
#define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \
TCP_HASH_TABLE_LOAD)
@ -320,8 +323,10 @@
#define MSS_DEFAULT 536
struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
uint32_t psum;
uint32_t tsum;
#ifdef __AVX2__
uint8_t pad[26];
uint8_t pad[18];
#else
uint8_t pad[2];
#endif
@ -396,7 +401,7 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#define OPT_SACK 5
#define OPT_TS 8
#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
#define CONN_V4(conn) (!!inany_v4(&(conn)->addr))
#define CONN_V6(conn) (!CONN_V4(conn))
#define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \
@ -429,15 +434,15 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = {
static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
/* Table of guest side forwarding addresses with very low RTT (assumed
* to be local to the host), LRU
*/
/* Table of destinations with very low RTT (assumed to be local), LRU */
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
/* Static buffers */
/**
* tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
* @psum: Partial IP header checksum (excluding tot_len and saddr)
* @tsum: Partial TCP header checksum (excluding length and saddr)
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
* @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr)
@ -445,15 +450,17 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
* @data: Storage for TCP payload
*/
static struct tcp4_l2_buf_t {
uint32_t psum; /* 0 */
uint32_t tsum; /* 4 */
#ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */
uint8_t pad[18]; /* 8, align th to 32 bytes */
#else
uint8_t pad[2]; /* align iph to 4 bytes 0 */
uint8_t pad[2]; /* align iph to 4 bytes 8 */
#endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */
struct tcphdr th; /* 64 40 */
uint8_t data[MSS4]; /* 84 60 */
struct tap_hdr taph; /* 26 10 */
struct iphdr iph; /* 44 28 */
struct tcphdr th; /* 64 48 */
uint8_t data[MSS4]; /* 84 68 */
/* 65536 65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
@ -508,6 +515,8 @@ static struct iovec tcp_iov [UIO_MAXIOV];
/**
* tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
* @psum: Partial IP header checksum (excluding tot_len and saddr)
* @tsum: Partial TCP header checksum (excluding length and saddr)
* @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
* @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr)
@ -515,14 +524,16 @@ static struct iovec tcp_iov [UIO_MAXIOV];
* @opts: Headroom for TCP options
*/
static struct tcp4_l2_flags_buf_t {
uint32_t psum; /* 0 */
uint32_t tsum; /* 4 */
#ifdef __AVX2__
uint8_t pad[26]; /* 0, align th to 32 bytes */
uint8_t pad[18]; /* 8, align th to 32 bytes */
#else
uint8_t pad[2]; /* align iph to 4 bytes 0 */
uint8_t pad[2]; /* align iph to 4 bytes 8 */
#endif
struct tap_hdr taph; /* 26 2 */
struct iphdr iph; /* 44 20 */
struct tcphdr th; /* 64 40 */
struct tap_hdr taph; /* 26 10 */
struct iphdr iph; /* 44 28 */
struct tcphdr th; /* 64 48 */
char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
@ -631,13 +642,13 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
*/
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
.tcp.index = CONN_IDX(conn) };
struct epoll_event ev = { .data.u64 = ref.u64 };
if (conn->events == CLOSED) {
if (conn->in_epoll)
if (conn->c.in_epoll)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
if (conn->timer != -1)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
@ -649,7 +660,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
return -errno;
conn->in_epoll = true;
conn->c.in_epoll = true;
if (conn->timer != -1) {
union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
@ -847,7 +858,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
int i;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
if (inany_equals(&conn->faddr, low_rtt_dst + i))
if (inany_equals(&conn->addr, low_rtt_dst + i))
return 1;
return 0;
@ -869,7 +880,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
return;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
if (inany_equals(&conn->faddr, low_rtt_dst + i))
if (inany_equals(&conn->addr, low_rtt_dst + i))
return;
if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
hole = i;
@ -881,7 +892,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
if (hole == -1)
return;
low_rtt_dst[hole++] = conn->faddr;
low_rtt_dst[hole++] = conn->addr;
if (hole == LOW_RTT_TABLE_SIZE)
hole = 0;
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
@ -940,13 +951,11 @@ void tcp_sock_set_bufsize(const struct ctx *c, int s)
*/
static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf)
{
uint32_t sum = L2_BUF_IP4_PSUM(IPPROTO_TCP);
uint32_t sum = buf->psum;
sum += buf->iph.tot_len;
sum += (buf->iph.saddr >> 16) & 0xffff;
sum += buf->iph.saddr & 0xffff;
sum += (buf->iph.daddr >> 16) & 0xffff;
sum += buf->iph.daddr & 0xffff;
buf->iph.check = (uint16_t)~csum_fold(sum);
}
@ -958,12 +967,10 @@ static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf)
static void tcp_update_check_tcp4(struct tcp4_l2_buf_t *buf)
{
uint16_t tlen = ntohs(buf->iph.tot_len) - 20;
uint32_t sum = htons(IPPROTO_TCP);
uint32_t sum = buf->tsum;
sum += (buf->iph.saddr >> 16) & 0xffff;
sum += buf->iph.saddr & 0xffff;
sum += (buf->iph.daddr >> 16) & 0xffff;
sum += buf->iph.daddr & 0xffff;
sum += htons(ntohs(buf->iph.tot_len) - 20);
buf->th.check = 0;
@ -994,8 +1001,10 @@ static void tcp_update_check_tcp6(struct tcp6_l2_buf_t *buf)
* tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
* @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged
* @ip_da: Pointer to IPv4 destination address, NULL if unchanged
*/
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da)
{
int i;
@ -1009,6 +1018,24 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
tap_update_mac(&b6->taph, eth_d, eth_s);
tap_update_mac(&b4f->taph, eth_d, eth_s);
tap_update_mac(&b6f->taph, eth_d, eth_s);
if (ip_da) {
b4f->iph.daddr = b4->iph.daddr = ip_da->s_addr;
if (!i) {
b4f->iph.saddr = b4->iph.saddr = 0;
b4f->iph.tot_len = b4->iph.tot_len = 0;
b4f->iph.check = b4->iph.check = 0;
b4f->psum = b4->psum = sum_16b(&b4->iph, 20);
b4->tsum = ((ip_da->s_addr >> 16) & 0xffff) +
(ip_da->s_addr & 0xffff) +
htons(IPPROTO_TCP);
b4f->tsum = b4->tsum;
} else {
b4f->psum = b4->psum = tcp4_l2_buf[0].psum;
b4f->tsum = b4->tsum = tcp4_l2_buf[0].tsum;
}
}
}
}
@ -1016,16 +1043,15 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
* tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
* @c: Execution context
*/
static void tcp_sock4_iov_init(struct ctx *c)
static void tcp_sock4_iov_init(const struct ctx *c)
{
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
struct iovec *iov;
int i;
for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
.taph = TAP_HDR_INIT(ETH_P_IP),
.iph = iph,
.iph = L2_BUF_IP4_INIT(IPPROTO_TCP),
.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
};
}
@ -1136,18 +1162,18 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
/**
* tcp_hash_match() - Check if a connection entry matches address and ports
* @conn: Connection entry to match against
* @faddr: Guest side forwarding address
* @eport: Guest side endpoint port
* @fport: Guest side forwarding port
* @addr: Remote address
* @tap_port: tap-facing port
* @sock_port: Socket-facing port
*
* Return: 1 on match, 0 otherwise
*/
static int tcp_hash_match(const struct tcp_tap_conn *conn,
const union inany_addr *faddr,
in_port_t eport, in_port_t fport)
const union inany_addr *addr,
in_port_t tap_port, in_port_t sock_port)
{
if (inany_equals(&conn->faddr, faddr) &&
conn->eport == eport && conn->fport == fport)
if (inany_equals(&conn->addr, addr) &&
conn->tap_port == tap_port && conn->sock_port == sock_port)
return 1;
return 0;
@ -1156,21 +1182,21 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn,
/**
* tcp_hash() - Calculate hash value for connection given address and ports
* @c: Execution context
* @faddr: Guest side forwarding address
* @eport: Guest side endpoint port
* @fport: Guest side forwarding port
* @addr: Remote address
* @tap_port: tap-facing port
* @sock_port: Socket-facing port
*
* Return: hash value, already modulo size of the hash table
*/
static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr,
in_port_t eport, in_port_t fport)
static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *addr,
in_port_t tap_port, in_port_t sock_port)
{
struct {
union inany_addr faddr;
in_port_t eport;
in_port_t fport;
union inany_addr addr;
in_port_t tap_port;
in_port_t sock_port;
} __attribute__((__packed__)) in = {
*faddr, eport, fport
*addr, tap_port, sock_port
};
uint64_t b = 0;
@ -1189,7 +1215,7 @@ static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr,
static unsigned int tcp_conn_hash(const struct ctx *c,
const struct tcp_tap_conn *conn)
{
return tcp_hash(c, &conn->faddr, conn->eport, conn->fport);
return tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port);
}
/**
@ -1201,7 +1227,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn)
{
int b;
b = tcp_hash(c, &conn->faddr, conn->eport, conn->fport);
b = tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port);
conn->next_index = tc_hash[b] ? CONN_IDX(tc_hash[b]) : -1;
tc_hash[b] = conn;
@ -1270,24 +1296,25 @@ static void tcp_tap_conn_update(struct ctx *c, struct tcp_tap_conn *old,
* tcp_hash_lookup() - Look up connection given remote address and ports
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @faddr: Guest side forwarding address (guest remote address)
* @eport: Guest side endpoint port (guest local port)
* @fport: Guest side forwarding port (guest remote port)
* @addr: Remote address, pointer to in_addr or in6_addr
* @tap_port: tap-facing port
* @sock_port: Socket-facing port
*
* Return: connection pointer if found, NULL otherwise
*/
static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c,
int af, const void *faddr,
in_port_t eport, in_port_t fport)
int af, const void *addr,
in_port_t tap_port,
in_port_t sock_port)
{
union inany_addr aany;
struct tcp_tap_conn *conn;
int b;
inany_from_af(&aany, af, faddr);
b = tcp_hash(c, &aany, eport, fport);
inany_from_af(&aany, af, addr);
b = tcp_hash(c, &aany, tap_port, sock_port);
for (conn = tc_hash[b]; conn; conn = conn_at_idx(conn->next_index)) {
if (tcp_hash_match(conn, &aany, eport, fport))
if (tcp_hash_match(conn, &aany, tap_port, sock_port))
return conn;
}
@ -1382,11 +1409,17 @@ static void tcp_l2_data_buf_flush(struct ctx *c)
*/
void tcp_defer_handler(struct ctx *c)
{
int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE;
int max_files = c->nofile / 100 * TCP_FILE_PRESSURE;
union tcp_conn *conn;
tcp_l2_flags_buf_flush(c);
tcp_l2_data_buf_flush(c);
if ((c->tcp.conn_count < MIN(max_files, max_conns)) &&
(c->tcp.splice_conn_count < MIN(max_files / 6, max_conns)))
return;
for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) {
if (conn->c.spliced) {
if (conn->splice.flags & CLOSING)
@ -1414,13 +1447,13 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
void *p, size_t plen,
const uint16_t *check, uint32_t seq)
{
const struct in_addr *a4 = inany_v4(&conn->faddr);
const struct in_addr *a4 = inany_v4(&conn->addr);
size_t ip_len, tlen;
#define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \
do { \
b->th.source = htons(conn->fport); \
b->th.dest = htons(conn->eport); \
b->th.source = htons(conn->sock_port); \
b->th.dest = htons(conn->tap_port); \
b->th.seq = htonl(seq); \
b->th.ack_seq = htonl(conn->seq_ack_to_tap); \
if (conn->events & ESTABLISHED) { \
@ -1456,7 +1489,7 @@ do { \
ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr));
b->ip6h.saddr = conn->faddr.a6;
b->ip6h.saddr = conn->addr.a6;
if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr))
b->ip6h.daddr = c->ip6.addr_ll_seen;
else
@ -1809,7 +1842,7 @@ static void tcp_clamp_window(const struct ctx *c, struct tcp_tap_conn *conn,
/**
* tcp_seq_init() - Calculate initial sequence number according to RFC 6528
* @c: Execution context
* @conn: TCP connection, with faddr, fport and eport populated
* @conn: TCP connection, with addr, sock_port and tap_port populated
* @now: Current timestamp
*/
static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
@ -1822,9 +1855,9 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
union inany_addr dst;
in_port_t dstport;
} __attribute__((__packed__)) in = {
.src = conn->faddr,
.srcport = conn->fport,
.dstport = conn->eport,
.src = conn->addr,
.srcport = conn->tap_port,
.dstport = conn->sock_port,
};
uint32_t ns, seq = 0;
@ -1972,15 +2005,13 @@ static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af)
* tcp_conn_from_tap() - Handle connection request (SYN segment) from tap
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address, pointer to in_addr or in6_addr
* @daddr: Destination address, pointer to in_addr or in6_addr
* @addr: Remote address, pointer to in_addr or in6_addr
* @th: TCP header from tap: caller MUST ensure it's there
* @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
*/
static void tcp_conn_from_tap(struct ctx *c,
int af, const void *saddr, const void *daddr,
static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
const struct tcphdr *th, const char *opts,
size_t optlen, const struct timespec *now)
{
@ -1988,20 +2019,18 @@ static void tcp_conn_from_tap(struct ctx *c,
struct sockaddr_in addr4 = {
.sin_family = AF_INET,
.sin_port = th->dest,
.sin_addr = *(struct in_addr *)daddr,
.sin_addr = *(struct in_addr *)addr,
};
struct sockaddr_in6 addr6 = {
.sin6_family = AF_INET6,
.sin6_port = th->dest,
.sin6_addr = *(struct in6_addr *)daddr,
.sin6_addr = *(struct in6_addr *)addr,
};
const struct sockaddr *sa;
struct tcp_tap_conn *conn;
socklen_t sl;
int s, mss;
(void)saddr;
if (c->tcp.conn_count >= TCP_MAX_CONNS)
return;
@ -2010,9 +2039,9 @@ static void tcp_conn_from_tap(struct ctx *c,
return;
if (!c->no_map_gw) {
if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw))
if (af == AF_INET && IN4_ARE_ADDR_EQUAL(addr, &c->ip4.gw))
addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw))
if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(addr, &c->ip6.gw))
addr6.sin6_addr = in6addr_loopback;
}
@ -2049,7 +2078,7 @@ static void tcp_conn_from_tap(struct ctx *c,
if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
conn->wnd_from_tap = 1;
inany_from_af(&conn->faddr, af, daddr);
inany_from_af(&conn->addr, af, addr);
if (af == AF_INET) {
sa = (struct sockaddr *)&addr4;
@ -2059,8 +2088,8 @@ static void tcp_conn_from_tap(struct ctx *c,
sl = sizeof(addr6);
}
conn->fport = ntohs(th->dest);
conn->eport = ntohs(th->source);
conn->sock_port = ntohs(th->dest);
conn->tap_port = ntohs(th->source);
conn->seq_init_from_tap = ntohl(th->seq);
conn->seq_from_tap = conn->seq_init_from_tap + 1;
@ -2527,14 +2556,13 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
* tcp_tap_handler() - Handle packets from tap and state transitions
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
* @addr: Destination address
* @p: Pool of TCP packets, with TCP headers
* @now: Current timestamp
*
* Return: count of consumed packets
*/
int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
int tcp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now)
{
struct tcp_tap_conn *conn;
@ -2555,13 +2583,12 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL);
opts = packet_get(p, 0, sizeof(*th), optlen, NULL);
conn = tcp_hash_lookup(c, af, daddr, htons(th->source), htons(th->dest));
conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest));
/* New connection from tap */
if (!conn) {
if (opts && th->syn && !th->ack)
tcp_conn_from_tap(c, af, saddr, daddr, th,
opts, optlen, now);
tcp_conn_from_tap(c, af, addr, th, opts, optlen, now);
return 1;
}
@ -2720,10 +2747,10 @@ static void tcp_tap_conn_from_sock(struct ctx *c,
conn->ws_to_tap = conn->ws_from_tap = 0;
conn_event(c, conn, SOCK_ACCEPTED);
inany_from_sockaddr(&conn->faddr, &conn->fport, sa);
conn->eport = ref.port;
inany_from_sockaddr(&conn->addr, &conn->sock_port, sa);
conn->tap_port = ref.port;
tcp_snat_inbound(c, &conn->faddr);
tcp_snat_inbound(c, &conn->addr);
tcp_seq_init(c, conn, now);
tcp_hash_insert(c, conn);

7
tcp.h
View File

@ -17,7 +17,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
const struct timespec *now);
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
int tcp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now);
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
const char *ifname, in_port_t port);
@ -26,7 +26,8 @@ void tcp_timer(struct ctx *c, const struct timespec *ts);
void tcp_defer_handler(struct ctx *c);
void tcp_sock_set_bufsize(const struct ctx *c, int s);
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da);
/**
* union tcp_epoll_ref - epoll reference portion for TCP connections
@ -56,6 +57,7 @@ union tcp_listen_epoll_ref {
* struct tcp_ctx - Execution context for TCP routines
* @hash_secret: 128-bit secret for hash functions, ISN and hash table
* @conn_count: Count of total connections in connection table
* @splice_conn_count: Count of spliced connections in connection table
* @port_to_tap: Ports bound host-side, packets to tap or spliced
* @fwd_in: Port forwarding configuration for inbound packets
* @fwd_out: Port forwarding configuration for outbound packets
@ -66,6 +68,7 @@ union tcp_listen_epoll_ref {
struct tcp_ctx {
uint64_t hash_secret[2];
int conn_count;
int splice_conn_count;
struct port_fwd fwd_in;
struct port_fwd fwd_out;
struct timespec timer_run;

View File

@ -12,9 +12,11 @@
/**
* struct tcp_conn_common - Common fields for spliced and non-spliced connections
* @spliced: Is this a spliced connection?
* @in_epoll: Is the connection in the epoll set?
*/
struct tcp_conn_common {
bool spliced :1;
bool in_epoll :1;
};
extern const char *tcp_common_flag_str[];
@ -22,7 +24,6 @@ extern const char *tcp_common_flag_str[];
/**
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
* @c: Fields common with tcp_splice_conn
* @in_epoll: Is the connection in the epoll set?
* @next_index: Connection index of next item in hash chain, -1 for none
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
* @sock: Socket descriptor number
@ -34,9 +35,9 @@ extern const char *tcp_common_flag_str[];
* @ws_to_tap: Window scaling factor advertised to tap/guest
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
* @faddr: Guest side forwarding address (guest's remote address)
* @eport: Guest side endpoint port (guest's local port)
* @fport: Guest side forwarding port (guest's remote port)
* @addr: Remote address (IPv4 or IPv6)
* @tap_port: Guest-facing tap port
* @sock_port: Remote, socket-facing port
* @wnd_from_tap: Last window size from tap, unscaled (as received)
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
* @seq_to_tap: Next sequence for packets to tap
@ -49,7 +50,6 @@ struct tcp_tap_conn {
/* Must be first element to match tcp_splice_conn */
struct tcp_conn_common c;
bool in_epoll :1;
int next_index :TCP_CONN_INDEX_BITS + 2;
#define TCP_RETRANS_BITS 3
@ -105,9 +105,9 @@ struct tcp_tap_conn {
uint8_t seq_dup_ack_approx;
union inany_addr faddr;
in_port_t eport;
in_port_t fport;
union inany_addr addr;
in_port_t tap_port;
in_port_t sock_port;
uint16_t wnd_from_tap;
uint16_t wnd_to_tap;
@ -122,7 +122,6 @@ struct tcp_tap_conn {
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
* @c: Fields common with tcp_tap_conn
* @in_epoll: Is the connection in the epoll set?
* @a: File descriptor number of socket for accepted connection
* @pipe_a_b: Pipe ends for splice() from @a to @b
* @b: File descriptor number of peer connected socket
@ -138,7 +137,6 @@ struct tcp_splice_conn {
/* Must be first element to match tcp_tap_conn */
struct tcp_conn_common c;
bool in_epoll :1;
int a;
int pipe_a_b[2];
int b;

View File

@ -172,7 +172,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
static int tcp_splice_epoll_ctl(const struct ctx *c,
struct tcp_splice_conn *conn)
{
int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref_a = { .type = EPOLL_TYPE_TCP, .fd = conn->a,
.tcp.index = CONN_IDX(conn) };
union epoll_ref ref_b = { .type = EPOLL_TYPE_TCP, .fd = conn->b,
@ -192,7 +192,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
epoll_ctl(c->epollfd, m, conn->b, &ev_b))
goto delete;
conn->in_epoll = true;
conn->c.in_epoll = true;
return 0;
@ -295,6 +295,7 @@ void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union)
conn->flags = 0;
debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));
c->tcp.splice_conn_count--;
tcp_table_compact(c, conn_union);
}
@ -512,6 +513,7 @@ bool tcp_splice_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref,
trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s);
conn->c.spliced = true;
c->tcp.splice_conn_count++;
conn->a = s;
if (tcp_splice_new(c, conn, ref.port, ref.ns))

37
udp.c
View File

@ -168,6 +168,7 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8)
/**
* udp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
* @s_in: Source socket address, filled in by recvmmsg()
* @psum: Partial IP header checksum (excluding tot_len and saddr)
* @taph: Tap-level headers (partially pre-filled)
* @iph: Pre-filled IP header (except for tot_len and saddr)
* @uh: Headroom for UDP header
@ -175,6 +176,7 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8)
*/
static struct udp4_l2_buf_t {
struct sockaddr_in s_in;
uint32_t psum;
struct tap_hdr taph;
struct iphdr iph;
@ -261,13 +263,11 @@ static void udp_invert_portmap(struct udp_port_fwd *fwd)
*/
static void udp_update_check4(struct udp4_l2_buf_t *buf)
{
uint32_t sum = L2_BUF_IP4_PSUM(IPPROTO_UDP);
uint32_t sum = buf->psum;
sum += buf->iph.tot_len;
sum += (buf->iph.saddr >> 16) & 0xffff;
sum += buf->iph.saddr & 0xffff;
sum += (buf->iph.daddr >> 16) & 0xffff;
sum += buf->iph.daddr & 0xffff;
buf->iph.check = (uint16_t)~csum_fold(sum);
}
@ -276,8 +276,10 @@ static void udp_update_check4(struct udp4_l2_buf_t *buf)
* udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
* @eth_d: Ethernet destination address, NULL if unchanged
* @eth_s: Ethernet source address, NULL if unchanged
* @ip_da: Pointer to IPv4 destination address, NULL if unchanged
*/
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da)
{
int i;
@ -287,6 +289,18 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
tap_update_mac(&b4->taph, eth_d, eth_s);
tap_update_mac(&b6->taph, eth_d, eth_s);
if (ip_da) {
b4->iph.daddr = ip_da->s_addr;
if (!i) {
b4->iph.saddr = 0;
b4->iph.tot_len = 0;
b4->iph.check = 0;
b4->psum = sum_16b(&b4->iph, 20);
} else {
b4->psum = udp4_l2_buf[0].psum;
}
}
}
}
@ -572,7 +586,6 @@ static size_t udp_update_hdr4(const struct ctx *c, int n, in_port_t dstport,
ip_len = udp4_l2_mh_sock[n].msg_len + sizeof(b->iph) + sizeof(b->uh);
b->iph.tot_len = htons(ip_len);
b->iph.daddr = c->ip4.addr_seen.s_addr;
src_port = ntohs(b->s_in.sin_port);
@ -786,8 +799,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
* udp_tap_handler() - Handle packets from tap
* @c: Execution context
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
* @addr: Destination address
* @p: Pool of UDP packets, with UDP headers
* @now: Current timestamp
*
@ -795,7 +807,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
*
* #syscalls sendmmsg
*/
int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
int udp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now)
{
struct mmsghdr mm[UIO_MAXIOV];
@ -809,7 +821,6 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
socklen_t sl;
(void)c;
(void)saddr;
uh = packet_get(p, 0, 0, sizeof(*uh), NULL);
if (!uh)
@ -825,7 +836,7 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
s_in = (struct sockaddr_in) {
.sin_family = AF_INET,
.sin_port = uh->dest,
.sin_addr = *(struct in_addr *)daddr,
.sin_addr = *(struct in_addr *)addr,
};
sa = (struct sockaddr *)&s_in;
@ -870,17 +881,17 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
s_in6 = (struct sockaddr_in6) {
.sin6_family = AF_INET6,
.sin6_port = uh->dest,
.sin6_addr = *(struct in6_addr *)daddr,
.sin6_addr = *(struct in6_addr *)addr,
};
const struct in6_addr *bind_addr = &in6addr_any;
sa = (struct sockaddr *)&s_in6;
sl = sizeof(s_in6);
if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.dns_match) &&
if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.dns_match) &&
ntohs(s_in6.sin6_port) == 53) {
s_in6.sin6_addr = c->ip6.dns_host;
} else if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw) &&
} else if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.gw) &&
!c->no_map_gw) {
if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) ||
(udp_tap_map[V6][dst].flags & PORT_LOOPBACK))

5
udp.h
View File

@ -10,13 +10,14 @@
void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now);
int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr,
int udp_tap_handler(struct ctx *c, int af, const void *addr,
const struct pool *p, const struct timespec *now);
int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const void *addr, const char *ifname, in_port_t port);
int udp_init(struct ctx *c);
void udp_timer(struct ctx *c, const struct timespec *ts);
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
const struct in_addr *ip_da);
/**
* union udp_epoll_ref - epoll reference portion for UDP connections

4
util.h
View File

@ -141,13 +141,11 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
.tot_len = 0, \
.id = 0, \
.frag_off = 0, \
.ttl = 0xff, \
.ttl = 255, \
.protocol = (proto), \
.saddr = 0, \
.daddr = 0, \
}
#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \
(uint32_t)htons_constant(0xff00 | (proto)))
#define L2_BUF_IP6_INIT(proto) \
{ \