diff --git a/Makefile b/Makefile index 09fc461..92cbd5a 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h + udp.h udp_flow.h util.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/flow.c b/flow.c index 27340df..4e337d4 100644 --- a/flow.c +++ b/flow.c @@ -37,6 +37,7 @@ const char *flow_type_str[] = { [FLOW_TCP_SPLICE] = "TCP connection (spliced)", [FLOW_PING4] = "ICMP ping sequence", [FLOW_PING6] = "ICMPv6 ping sequence", + [FLOW_UDP] = "UDP flow", }; static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, "flow_type_str[] doesn't match enum flow_type"); @@ -46,6 +47,7 @@ const uint8_t flow_proto[] = { [FLOW_TCP_SPLICE] = IPPROTO_TCP, [FLOW_PING4] = IPPROTO_ICMP, [FLOW_PING6] = IPPROTO_ICMPV6, + [FLOW_UDP] = IPPROTO_UDP, }; static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); @@ -701,6 +703,32 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, return flowside_lookup(c, proto, pif, &side); } +/** + * flow_lookup_sa() - Look up a flow given an endpoint socket address + * @c: Execution context + * @proto: Protocol of the flow (IP L4 protocol number) + * @pif: Interface of the flow + * @esa: Socket address of the endpoint + * @fport: Forwarding port number + * + * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found + */ +flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, + const void *esa, in_port_t fport) +{ + struct flowside side = { + .fport = fport, + }; + + inany_from_sockaddr(&side.eaddr, &side.eport, esa); + if (inany_v4(&side.eaddr)) + side.faddr = inany_any4; + else + side.faddr = inany_any6; + + return flowside_lookup(c, proto, pif, &side); +} + /** * flow_defer_handler() - Handler for per-flow deferred and timed tasks * @c: Execution context @@ -780,6 +808,10 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) if (timer) closed = icmp_ping_timer(c, &flow->ping, now); break; + case FLOW_UDP: + if (timer) + closed = udp_flow_timer(c, &flow->udp, now); + break; default: /* Assume other flow types don't need any handling */ ; diff --git a/flow.h b/flow.h index bf6b845..7866477 100644 --- a/flow.h +++ b/flow.h @@ -115,6 +115,8 @@ enum flow_type { FLOW_PING4, /* ICMPv6 echo requests from guest to host and matching replies back */ FLOW_PING6, + /* UDP pseudo-connection */ + FLOW_UDP, FLOW_NUM_TYPES, }; @@ -238,6 +240,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, uint8_t proto, uint8_t pif, sa_family_t af, const void *eaddr, const void *faddr, in_port_t eport, in_port_t fport); +flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, + const void *esa, in_port_t fport); union flow; diff --git a/flow_table.h b/flow_table.h index 9d912c8..df253be 100644 --- a/flow_table.h +++ b/flow_table.h @@ -9,6 +9,7 @@ #include "tcp_conn.h" #include "icmp_flow.h" +#include "udp_flow.h" /** * struct flow_free_cluster - Information about a cluster of free entries @@ -35,6 +36,7 @@ union flow { struct tcp_tap_conn tcp; struct tcp_splice_conn tcp_splice; struct icmp_ping_flow ping; + struct udp_flow udp; }; /* Global Flow Table */ @@ -98,6 +100,19 @@ static inline uint8_t pif_at_sidx(flow_sidx_t sidx) return flow->f.pif[sidx.sidei]; } +/** flow_sidx_opposite() - Get the other side of the same flow + * @sidx: Flow & side index + * + * Return: sidx for the other side of the same flow as @sidx + */ +static inline flow_sidx_t flow_sidx_opposite(flow_sidx_t sidx) +{ + if (!flow_sidx_valid(sidx)) + return FLOW_SIDX_NONE; + + return (flow_sidx_t){.flowi = sidx.flowi, .sidei = !sidx.sidei}; +} + /** flow_sidx() - Index of one side of a flow from common structure * @f: Common flow fields pointer * @sidei: Which side to refer to (0 or 1) diff --git a/udp.c b/udp.c index 150f970..fdbe396 100644 --- a/udp.c +++ b/udp.c @@ -15,6 +15,30 @@ /** * DOC: Theory of Operation * + * UDP Flows + * ========= + * + * UDP doesn't have true connections, but many protocols use a connection-like + * format. The flow is initiated by a client sending a datagram from a port of + * its choosing (usually ephemeral) to a specific port (usually well known) on a + * server. Both client and server address must be unicast. The server sends + * replies using the same addresses & ports with src/dest swapped. + * + * We track pseudo-connections of this type as flow table entries of type + * FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts, + * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds. + * + * NOTE: This won't handle multicast protocols, or some protocols with different + * port usage. We'll need specific logic if we want to handle those. + * + * "Listening" sockets + * =================== + * + * UDP doesn't use listen(), but we consider long term sockets which are allowed + * to create new flows "listening" by analogy with TCP. + * + * Port tracking + * ============= * * For UDP, a reduced version of port-based connection tracking is implemented * with two purposes: @@ -122,6 +146,7 @@ #include "tap.h" #include "pcap.h" #include "log.h" +#include "flow_table.h" #define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ @@ -200,6 +225,7 @@ static struct ethhdr udp6_eth_hdr; * @taph: Tap backend specific header * @s_in: Source socket address, filled in by recvmmsg() * @splicesrc: Source port for splicing, or -1 if not spliceable + * @tosidx: sidx for the destination side of this datagram's flow */ static struct udp_meta_t { struct ipv6hdr ip6h; @@ -208,6 +234,7 @@ static struct udp_meta_t { union sockaddr_inany s_in; int splicesrc; + flow_sidx_t tosidx; } #ifdef __AVX2__ __attribute__ ((aligned(32))) @@ -491,6 +518,115 @@ static int udp_mmh_splice_port(union epoll_ref ref, const struct mmsghdr *mmh) return -1; } +/** + * udp_at_sidx() - Get UDP specific flow at given sidx + * @sidx: Flow and side to retrieve + * + * Return: UDP specific flow at @sidx, or NULL of @sidx is invalid. Asserts if + * the flow at @sidx is not FLOW_UDP. + */ +struct udp_flow *udp_at_sidx(flow_sidx_t sidx) +{ + union flow *flow = flow_at_sidx(sidx); + + if (!flow) + return NULL; + + ASSERT(flow->f.type == FLOW_UDP); + return &flow->udp; +} + +/* + * udp_flow_close() - Close and clean up UDP flow + * @c: Execution context + * @uflow: UDP flow + */ +static void udp_flow_close(const struct ctx *c, const struct udp_flow *uflow) +{ + flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); +} + +/** + * udp_flow_new() - Common setup for a new UDP flow + * @c: Execution context + * @flow: Initiated flow + * @now: Timestamp + * + * Return: UDP specific flow, if successful, NULL on failure + */ +static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, + const struct timespec *now) +{ + const struct flowside *ini = &flow->f.side[INISIDE]; + struct udp_flow *uflow = NULL; + + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { + flow_trace(flow, "Invalid endpoint to initiate UDP flow"); + goto cancel; + } + + if (!flow_target(c, flow, IPPROTO_UDP)) + goto cancel; + + uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); + uflow->ts = now->tv_sec; + + flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE)); + FLOW_ACTIVATE(uflow); + + return FLOW_SIDX(uflow, TGTSIDE); + +cancel: + if (uflow) + udp_flow_close(c, uflow); + flow_alloc_cancel(flow); + return FLOW_SIDX_NONE; + +} + +/** + * udp_flow_from_sock() - Find or create UDP flow for "listening" socket + * @c: Execution context + * @ref: epoll reference of the receiving socket + * @meta: Metadata buffer for the datagram + * @now: Timestamp + * + * Return: sidx for the destination side of the flow for this packet, or + * FLOW_SIDX_NONE if we couldn't find or create a flow. + */ +static flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, + struct udp_meta_t *meta, + const struct timespec *now) +{ + struct udp_flow *uflow; + union flow *flow; + flow_sidx_t sidx; + + ASSERT(ref.type == EPOLL_TYPE_UDP); + + /* FIXME: Match reply packets to their flow as well */ + if (!ref.udp.orig) + return FLOW_SIDX_NONE; + + sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, &meta->s_in, ref.udp.port); + if ((uflow = udp_at_sidx(sidx))) { + uflow->ts = now->tv_sec; + return flow_sidx_opposite(sidx); + } + + if (!(flow = flow_alloc())) { + char sastr[SOCKADDR_STRLEN]; + + debug("Couldn't allocate flow for UDP datagram from %s %s", + pif_name(ref.udp.pif), + sockaddr_ntop(&meta->s_in, sastr, sizeof(sastr))); + return FLOW_SIDX_NONE; + } + + flow_initiate_sa(flow, ref.udp.pif, &meta->s_in, ref.udp.port); + return udp_flow_new(c, flow, now); +} + /** * udp_splice_prepare() - Prepare one datagram for splicing * @mmh: Receiving mmsghdr array @@ -848,12 +984,15 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve dstport += c->udp.fwd_in.f.delta[dstport]; /* We divide datagrams into batches based on how we need to send them, - * determined by udp_meta[i].splicesrc. To avoid either two passes - * through the array, or recalculating splicesrc for a single entry, we - * have to populate it one entry *ahead* of the loop counter. + * determined by udp_meta[i].splicesrc and udp_meta[i].tosidx. To avoid + * either two passes through the array, or recalculating splicesrc and + * tosidxfor a single entry, we have to populate it one entry *ahead* of + * the loop counter. */ udp_meta[0].splicesrc = udp_mmh_splice_port(ref, mmh_recv); + udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0], now); for (i = 0; i < n; ) { + flow_sidx_t batchsidx = udp_meta[i].tosidx; int batchsrc = udp_meta[i].splicesrc; int batchstart = i; @@ -870,7 +1009,11 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); - } while (udp_meta[i].splicesrc == batchsrc); + udp_meta[i].tosidx = udp_flow_from_sock(c, ref, + &udp_meta[i], + now); + } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx) && + udp_meta[i].splicesrc == batchsrc); if (batchsrc >= 0) { udp_splice_send(c, batchstart, i - batchstart, @@ -1268,6 +1411,24 @@ static int udp_port_rebind_outbound(void *arg) return 0; } +/** + * udp_flow_timer() - Handler for timed events related to a given flow + * @c: Execution context + * @uflow: UDP flow + * @now: Current timestamp + * + * Return: true if the flow is ready to free, false otherwise + */ +bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow, + const struct timespec *now) +{ + if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT) + return false; + + udp_flow_close(c, uflow); + return true; +} + /** * udp_timer() - Scan activity bitmaps for ports with associated timed events * @c: Execution context diff --git a/udp_flow.h b/udp_flow.h new file mode 100644 index 0000000..18af9ac --- /dev/null +++ b/udp_flow.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson + * + * UDP flow tracking data structures + */ +#ifndef UDP_FLOW_H +#define UDP_FLOW_H + +/** + * struct udp - Descriptor for a flow of UDP packets + * @f: Generic flow information + * @ts: Activity timestamp + */ +struct udp_flow { + /* Must be first element */ + struct flow_common f; + + time_t ts; +}; + +bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow, + const struct timespec *now); + +#endif /* UDP_FLOW_H */