2023-11-30 13:02:08 +11:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
* Copyright Red Hat
|
|
|
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
|
|
|
*
|
|
|
|
* Tracking for logical "flows" of packets.
|
|
|
|
*/
|
|
|
|
|
2024-07-18 15:26:41 +10:00
|
|
|
#include <errno.h>
|
2023-11-30 13:02:08 +11:00
|
|
|
#include <stdint.h>
|
2023-12-24 17:56:51 +01:00
|
|
|
#include <stdio.h>
|
2023-11-30 13:02:09 +11:00
|
|
|
#include <unistd.h>
|
2024-07-18 15:26:41 +10:00
|
|
|
#include <sched.h>
|
2023-11-30 13:02:09 +11:00
|
|
|
#include <string.h>
|
2023-11-30 13:02:08 +11:00
|
|
|
|
2023-11-30 13:02:09 +11:00
|
|
|
#include "util.h"
|
2024-03-06 16:58:33 +11:00
|
|
|
#include "ip.h"
|
2023-11-30 13:02:09 +11:00
|
|
|
#include "passt.h"
|
|
|
|
#include "siphash.h"
|
|
|
|
#include "inany.h"
|
2023-11-30 13:02:08 +11:00
|
|
|
#include "flow.h"
|
2023-11-30 13:02:09 +11:00
|
|
|
#include "flow_table.h"
|
migrate: Migrate TCP flows
This implements flow preparation on the source, transfer of data with
a format roughly inspired by struct tcp_tap_conn, plus a specific
structure for parameters that don't fit in the flow table, and flow
insertion on the target, with all the appropriate window options,
window scaling, MSS, etc.
Contents of pending queues are transferred as well.
The target side is rather convoluted because we first need to create
sockets and switch them to repair mode, before we can apply options
that are *not* stored in the flow table. This also means that, if
we're testing this on the same machine, in the same namespace, we need
to close the listening socket on the source before we can start moving
data.
Further, we need to connect() the socket on the target before we can
restore data queues, but we can't do that (again, on the same machine)
as long as the matching source socket is open, which implies an
arbitrary limit on queue sizes we can transfer, because we can only
dump pending queues on the source as long as the socket is open, of
course.
Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2025-02-13 23:14:13 +11:00
|
|
|
#include "repair.h"
|
2023-11-30 13:02:08 +11:00
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
const char *flow_state_str[] = {
|
|
|
|
[FLOW_STATE_FREE] = "FREE",
|
|
|
|
[FLOW_STATE_NEW] = "NEW",
|
2024-05-21 15:57:07 +10:00
|
|
|
[FLOW_STATE_INI] = "INI",
|
|
|
|
[FLOW_STATE_TGT] = "TGT",
|
2024-05-21 15:57:05 +10:00
|
|
|
[FLOW_STATE_TYPED] = "TYPED",
|
|
|
|
[FLOW_STATE_ACTIVE] = "ACTIVE",
|
|
|
|
};
|
|
|
|
static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES,
|
|
|
|
"flow_state_str[] doesn't match enum flow_state");
|
|
|
|
|
2023-11-30 13:02:08 +11:00
|
|
|
const char *flow_type_str[] = {
|
|
|
|
[FLOW_TYPE_NONE] = "<none>",
|
|
|
|
[FLOW_TCP] = "TCP connection",
|
|
|
|
[FLOW_TCP_SPLICE] = "TCP connection (spliced)",
|
2024-02-29 15:15:32 +11:00
|
|
|
[FLOW_PING4] = "ICMP ping sequence",
|
|
|
|
[FLOW_PING6] = "ICMPv6 ping sequence",
|
2024-07-18 15:26:46 +10:00
|
|
|
[FLOW_UDP] = "UDP flow",
|
2023-11-30 13:02:08 +11:00
|
|
|
};
|
|
|
|
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
|
|
|
|
"flow_type_str[] doesn't match enum flow_type");
|
2023-11-30 13:02:09 +11:00
|
|
|
|
2024-02-28 22:25:07 +11:00
|
|
|
const uint8_t flow_proto[] = {
|
|
|
|
[FLOW_TCP] = IPPROTO_TCP,
|
|
|
|
[FLOW_TCP_SPLICE] = IPPROTO_TCP,
|
2024-02-29 15:15:32 +11:00
|
|
|
[FLOW_PING4] = IPPROTO_ICMP,
|
|
|
|
[FLOW_PING6] = IPPROTO_ICMPV6,
|
2024-07-18 15:26:46 +10:00
|
|
|
[FLOW_UDP] = IPPROTO_UDP,
|
2024-02-28 22:25:07 +11:00
|
|
|
};
|
|
|
|
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
|
|
|
"flow_proto[] doesn't match enum flow_type");
|
|
|
|
|
migrate: Migrate TCP flows
This implements flow preparation on the source, transfer of data with
a format roughly inspired by struct tcp_tap_conn, plus a specific
structure for parameters that don't fit in the flow table, and flow
insertion on the target, with all the appropriate window options,
window scaling, MSS, etc.
Contents of pending queues are transferred as well.
The target side is rather convoluted because we first need to create
sockets and switch them to repair mode, before we can apply options
that are *not* stored in the flow table. This also means that, if
we're testing this on the same machine, in the same namespace, we need
to close the listening socket on the source before we can start moving
data.
Further, we need to connect() the socket on the target before we can
restore data queues, but we can't do that (again, on the same machine)
as long as the matching source socket is open, which implies an
arbitrary limit on queue sizes we can transfer, because we can only
dump pending queues on the source as long as the socket is open, of
course.
Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2025-02-13 23:14:13 +11:00
|
|
|
/**
 * foreach_flow() - Step through the non-free entries of flowtab[]
 * @i:		Index variable, assigned by the macro
 * @flow:	Pointer variable, set to &flowtab[@i] on every step
 * @bound:	Exclusive upper bound for @i
 *
 * The statement following the macro invocation becomes the 'else' branch, so
 * it only runs for entries whose state is not FLOW_STATE_FREE. On a FREE entry
 * @i is advanced by free.n - 1, which together with the loop increment skips
 * the whole free cluster.
 */
#define foreach_flow(i, flow, bound)					\
	for ((i) = 0, (flow) = &flowtab[(i)];				\
	     (i) < (bound);						\
	     (i)++, (flow) = &flowtab[(i)])				\
		if ((flow)->f.state == FLOW_STATE_FREE)			\
			(i) += (flow)->free.n - 1;			\
		else
|
|
|
|
|
|
|
|
/**
 * foreach_active_flow() - Step through flows in FLOW_STATE_ACTIVE
 * @i:		Index variable, assigned by the macro
 * @flow:	Pointer variable, set to &flowtab[@i] on every step
 * @bound:	Exclusive upper bound for @i
 *
 * Built on foreach_flow(): entries in any other state are skipped with
 * 'continue', the statement following the macro runs for the rest.
 */
#define foreach_active_flow(i, flow, bound)				\
	foreach_flow((i), (flow), (bound))				\
		if ((flow)->f.state != FLOW_STATE_ACTIVE)		\
			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
			continue;					\
		else
|
|
|
|
|
|
|
|
/**
 * foreach_tcp_flow() - Step through active flows of type FLOW_TCP
 * @i:		Index variable, assigned by the macro
 * @flow:	Pointer variable, set to &flowtab[@i] on every step
 * @bound:	Exclusive upper bound for @i
 *
 * Built on foreach_active_flow(): flows of any other type (including
 * FLOW_TCP_SPLICE) are skipped with 'continue'.
 */
#define foreach_tcp_flow(i, flow, bound)				\
	foreach_active_flow((i), (flow), (bound))			\
		if ((flow)->f.type != FLOW_TCP)				\
			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
			continue;					\
		else
|
|
|
|
|
|
|
|
/**
 * foreach_established_tcp_flow() - Step through established FLOW_TCP flows
 * @i:		Index variable, assigned by the macro
 * @flow:	Pointer variable, set to &flowtab[@i] on every step
 * @bound:	Exclusive upper bound for @i
 *
 * Built on foreach_tcp_flow(): flows for which tcp_flow_is_established()
 * returns false are skipped with 'continue'.
 */
#define foreach_established_tcp_flow(i, flow, bound)			\
	foreach_tcp_flow((i), (flow), (bound))				\
		if (!tcp_flow_is_established(&(flow)->tcp))		\
			/* NOLINTNEXTLINE(bugprone-branch-clone) */	\
			continue;					\
		else
|
|
|
|
|
2023-11-30 13:02:09 +11:00
|
|
|
/* Global Flow Table */
|
2024-01-16 11:50:43 +11:00
|
|
|
|
|
|
|
/**
|
|
|
|
* DOC: Theory of Operation - allocating and freeing flow entries
|
|
|
|
*
|
|
|
|
* Flows are entries in flowtab[]. We need to routinely scan the whole table to
|
|
|
|
* perform deferred bookkeeping tasks on active entries, and sparse empty slots
|
|
|
|
* waste time and worsen data locality. But, keeping the table fully compact by
|
|
|
|
* moving entries on deletion is fiddly: it requires updating hash tables, and
|
|
|
|
* the epoll references to flows. Instead, we implement the compromise described
|
|
|
|
* below.
|
|
|
|
*
|
|
|
|
* Free clusters
|
|
|
|
* A "free cluster" is a contiguous set of unused (FLOW_TYPE_NONE) entries in
|
|
|
|
* flowtab[]. The first entry in each cluster contains metadata ('free'
|
|
|
|
* field in union flow), specifically the number of entries in the cluster
|
|
|
|
* (free.n), and the index of the next free cluster (free.next). The entries
|
|
|
|
* in the cluster other than the first should have n == next == 0.
|
|
|
|
*
|
|
|
|
* Free cluster list
|
|
|
|
* flow_first_free gives the index of the first (lowest index) free cluster.
|
|
|
|
* Each free cluster has the index of the next free cluster, or FLOW_MAX if
|
|
|
|
* it is the last free cluster. Together these form a linked list of free
|
|
|
|
* clusters, in strictly increasing order of index.
|
|
|
|
*
|
|
|
|
* Allocating
|
|
|
|
* We always allocate a new flow into the lowest available index, i.e. the
|
|
|
|
* first entry of the first free cluster, that is, at index flow_first_free.
|
|
|
|
* We update flow_first_free and the free cluster to maintain the invariants
|
|
|
|
* above (so the free cluster list is still in strictly increasing order).
|
|
|
|
*
|
|
|
|
* Freeing
|
|
|
|
* It's not possible to maintain the invariants above if we allow freeing of
|
|
|
|
* any entry at any time. So we only allow freeing in two cases.
|
|
|
|
*
|
|
|
|
* 1) flow_alloc_cancel() will free the most recent allocation. We can
|
|
|
|
* maintain the invariants because we know that allocation was made in the
|
|
|
|
* lowest available slot, and so will become the lowest index free slot again
|
|
|
|
* after cancellation.
|
|
|
|
*
|
|
|
|
* 2) Flows can be freed by returning true from the flow type specific
|
|
|
|
* deferred or timer function. These are called from flow_defer_handler()
|
|
|
|
* which is already scanning the whole table in index order. We can use that
|
|
|
|
* to rebuild the free cluster list correctly, either merging them into
|
|
|
|
* existing free clusters or creating new free clusters in the list for them.
|
|
|
|
*
|
|
|
|
* Scanning the table
|
|
|
|
* Theoretically, scanning the table requires FLOW_MAX iterations. However,
|
|
|
|
* when we encounter the start of a free cluster, we can immediately skip
|
|
|
|
* past it, meaning that in practice we only need (number of active
|
|
|
|
* connections) + (number of free clusters) iterations.
|
|
|
|
*/
|
|
|
|
|
|
|
|
unsigned flow_first_free;
|
2023-11-30 13:02:09 +11:00
|
|
|
union flow flowtab[FLOW_MAX];
|
2024-05-21 15:57:05 +10:00
|
|
|
static const union flow *flow_new_entry; /* = NULL */
|
2023-11-30 13:02:12 +11:00
|
|
|
|
2024-07-18 15:26:35 +10:00
|
|
|
/* Hash table to index it */
|
|
|
|
#define FLOW_HASH_LOAD 70 /* % */
|
|
|
|
#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD))
|
|
|
|
|
|
|
|
/* Table for lookup from flowside information */
|
|
|
|
static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];
|
|
|
|
|
|
|
|
static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX,
|
|
|
|
"Safe linear probing requires hash table with more entries than the number of sides in the flow table");
|
|
|
|
|
2024-01-16 11:50:36 +11:00
|
|
|
/* Timestamp of the most recent run of the flow timers */
static struct timespec flow_timer_run;
|
|
|
|
|
2024-07-18 15:26:27 +10:00
|
|
|
/** flowside_from_af() - Initialise flowside from addresses
|
|
|
|
* @side: flowside to initialise
|
|
|
|
* @af: Address family (AF_INET or AF_INET6)
|
|
|
|
* @eaddr: Endpoint address (pointer to in_addr or in6_addr)
|
|
|
|
* @eport: Endpoint port
|
2024-08-21 14:19:57 +10:00
|
|
|
* @oaddr: Our address (pointer to in_addr or in6_addr)
|
|
|
|
* @oport: Our port
|
2024-07-18 15:26:27 +10:00
|
|
|
*/
|
2024-07-18 15:26:35 +10:00
|
|
|
static void flowside_from_af(struct flowside *side, sa_family_t af,
|
|
|
|
const void *eaddr, in_port_t eport,
|
2024-08-21 14:19:57 +10:00
|
|
|
const void *oaddr, in_port_t oport)
|
2024-07-18 15:26:27 +10:00
|
|
|
{
|
2024-08-21 14:19:57 +10:00
|
|
|
if (oaddr)
|
|
|
|
inany_from_af(&side->oaddr, af, oaddr);
|
2024-07-18 15:26:27 +10:00
|
|
|
else
|
2024-08-21 14:19:57 +10:00
|
|
|
side->oaddr = inany_any6;
|
|
|
|
side->oport = oport;
|
2024-07-18 15:26:27 +10:00
|
|
|
|
|
|
|
if (eaddr)
|
|
|
|
inany_from_af(&side->eaddr, af, eaddr);
|
|
|
|
else
|
|
|
|
side->eaddr = inany_any6;
|
|
|
|
side->eport = eport;
|
|
|
|
}
|
|
|
|
|
2024-07-18 15:26:41 +10:00
|
|
|
/**
|
|
|
|
* struct flowside_sock_args - Parameters for flowside_sock_splice()
|
|
|
|
* @c: Execution context
|
|
|
|
* @fd: Filled in with new socket fd
|
|
|
|
* @err: Filled in with errno if something failed
|
|
|
|
* @type: Socket epoll type
|
|
|
|
* @sa: Socket address
|
|
|
|
* @sl: Length of @sa
|
|
|
|
* @data: epoll reference data
|
|
|
|
*/
|
|
|
|
struct flowside_sock_args {
|
|
|
|
const struct ctx *c;
|
|
|
|
int fd;
|
|
|
|
int err;
|
|
|
|
enum epoll_type type;
|
|
|
|
const struct sockaddr *sa;
|
|
|
|
socklen_t sl;
|
|
|
|
const char *path;
|
|
|
|
uint32_t data;
|
|
|
|
};
|
|
|
|
|
|
|
|
/** flowside_sock_splice() - Create and bind socket for PIF_SPLICE based on flowside
|
|
|
|
* @arg: Argument as a struct flowside_sock_args
|
|
|
|
*
|
|
|
|
* Return: 0
|
|
|
|
*/
|
|
|
|
static int flowside_sock_splice(void *arg)
|
|
|
|
{
|
|
|
|
struct flowside_sock_args *a = arg;
|
|
|
|
|
|
|
|
ns_enter(a->c);
|
|
|
|
|
|
|
|
a->fd = sock_l4_sa(a->c, a->type, a->sa, a->sl, NULL,
|
|
|
|
a->sa->sa_family == AF_INET6, a->data);
|
|
|
|
a->err = errno;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** flowside_sock_l4() - Create and bind socket based on flowside
|
|
|
|
* @c: Execution context
|
|
|
|
* @type: Socket epoll type
|
|
|
|
* @pif: Interface for this socket
|
|
|
|
* @tgt: Target flowside
|
|
|
|
* @data: epoll reference portion for protocol handlers
|
|
|
|
*
|
2024-08-21 14:19:57 +10:00
|
|
|
* Return: socket fd of protocol @proto bound to our address and port from @tgt
|
|
|
|
* (if specified).
|
2024-07-18 15:26:41 +10:00
|
|
|
*/
|
|
|
|
int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
|
|
|
|
const struct flowside *tgt, uint32_t data)
|
|
|
|
{
|
|
|
|
const char *ifname = NULL;
|
|
|
|
union sockaddr_inany sa;
|
|
|
|
socklen_t sl;
|
|
|
|
|
|
|
|
ASSERT(pif_is_socket(pif));
|
|
|
|
|
2024-08-21 14:19:57 +10:00
|
|
|
pif_sockaddr(c, &sa, &sl, pif, &tgt->oaddr, tgt->oport);
|
2024-07-18 15:26:41 +10:00
|
|
|
|
|
|
|
switch (pif) {
|
|
|
|
case PIF_HOST:
|
2024-08-21 14:19:57 +10:00
|
|
|
if (inany_is_loopback(&tgt->oaddr))
|
2024-07-18 15:26:41 +10:00
|
|
|
ifname = NULL;
|
|
|
|
else if (sa.sa_family == AF_INET)
|
|
|
|
ifname = c->ip4.ifname_out;
|
|
|
|
else if (sa.sa_family == AF_INET6)
|
|
|
|
ifname = c->ip6.ifname_out;
|
|
|
|
|
|
|
|
return sock_l4_sa(c, type, &sa, sl, ifname,
|
|
|
|
sa.sa_family == AF_INET6, data);
|
|
|
|
|
|
|
|
case PIF_SPLICE: {
|
|
|
|
struct flowside_sock_args args = {
|
|
|
|
.c = c, .type = type,
|
|
|
|
.sa = &sa.sa, .sl = sl, .data = data,
|
|
|
|
};
|
|
|
|
NS_CALL(flowside_sock_splice, &args);
|
|
|
|
errno = args.err;
|
|
|
|
return args.fd;
|
|
|
|
}
|
|
|
|
|
|
|
|
default:
|
|
|
|
/* If we add new socket pifs, they'll need to be implemented
|
|
|
|
* here
|
|
|
|
*/
|
|
|
|
ASSERT(0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-07-18 15:26:47 +10:00
|
|
|
/** flowside_connect() - Connect a socket based on flowside
|
|
|
|
* @c: Execution context
|
|
|
|
* @s: Socket to connect
|
|
|
|
* @pif: Target pif
|
|
|
|
* @tgt: Target flowside
|
|
|
|
*
|
|
|
|
* Connect @s to the endpoint address and port from @tgt.
|
|
|
|
*
|
|
|
|
* Return: 0 on success, negative on error
|
|
|
|
*/
|
|
|
|
int flowside_connect(const struct ctx *c, int s,
|
|
|
|
uint8_t pif, const struct flowside *tgt)
|
|
|
|
{
|
|
|
|
union sockaddr_inany sa;
|
|
|
|
socklen_t sl;
|
|
|
|
|
|
|
|
pif_sockaddr(c, &sa, &sl, pif, &tgt->eaddr, tgt->eport);
|
|
|
|
return connect(s, &sa.sa, sl);
|
|
|
|
}
|
|
|
|
|
2024-01-16 11:50:39 +11:00
|
|
|
/** flow_log_ - Log flow-related message
|
|
|
|
* @f: flow the message is related to
|
|
|
|
* @pri: Log priority
|
|
|
|
* @fmt: Format string
|
|
|
|
* @...: printf-arguments
|
|
|
|
*/
|
|
|
|
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
|
|
|
|
{
|
2024-05-21 15:57:05 +10:00
|
|
|
const char *type_or_state;
|
2024-01-16 11:50:39 +11:00
|
|
|
char msg[BUFSIZ];
|
|
|
|
va_list args;
|
|
|
|
|
|
|
|
va_start(args, fmt);
|
|
|
|
(void)vsnprintf(msg, sizeof(msg), fmt, args);
|
|
|
|
va_end(args);
|
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
/* Show type if it's set, otherwise the state */
|
|
|
|
if (f->state < FLOW_STATE_TYPED)
|
|
|
|
type_or_state = FLOW_STATE(f);
|
|
|
|
else
|
|
|
|
type_or_state = FLOW_TYPE(f);
|
|
|
|
|
2024-08-12 10:20:34 +02:00
|
|
|
logmsg(true, false, pri,
|
|
|
|
"Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
|
2024-05-21 15:57:05 +10:00
|
|
|
}
|
|
|
|
|
2024-09-06 15:17:07 +10:00
|
|
|
/** flow_log_details_() - Log the details of a flow
|
|
|
|
* @f: flow to log
|
|
|
|
* @pri: Log priority
|
|
|
|
* @state: State to log details according to
|
|
|
|
*
|
|
|
|
* Logs the details of the flow: endpoints, interfaces, type etc.
|
2024-05-21 15:57:05 +10:00
|
|
|
*/
|
2024-09-06 15:17:07 +10:00
|
|
|
void flow_log_details_(const struct flow_common *f, int pri,
|
|
|
|
enum flow_state state)
|
2024-05-21 15:57:05 +10:00
|
|
|
{
|
2024-07-18 15:26:28 +10:00
|
|
|
char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
|
|
|
|
char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
|
2024-07-18 15:26:27 +10:00
|
|
|
const struct flowside *ini = &f->side[INISIDE];
|
2024-07-18 15:26:28 +10:00
|
|
|
const struct flowside *tgt = &f->side[TGTSIDE];
|
2024-05-21 15:57:05 +10:00
|
|
|
|
2024-09-06 15:17:07 +10:00
|
|
|
if (state >= FLOW_STATE_TGT)
|
|
|
|
flow_log_(f, pri,
|
2024-07-18 15:26:28 +10:00
|
|
|
"%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
|
2024-07-18 15:26:27 +10:00
|
|
|
pif_name(f->pif[INISIDE]),
|
2024-07-18 15:26:28 +10:00
|
|
|
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
2024-07-18 15:26:27 +10:00
|
|
|
ini->eport,
|
2024-08-21 14:19:57 +10:00
|
|
|
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
|
|
|
|
ini->oport,
|
2024-07-18 15:26:28 +10:00
|
|
|
pif_name(f->pif[TGTSIDE]),
|
2024-08-21 14:19:57 +10:00
|
|
|
inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)),
|
|
|
|
tgt->oport,
|
2024-07-18 15:26:28 +10:00
|
|
|
inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
|
|
|
|
tgt->eport);
|
2024-09-06 15:17:07 +10:00
|
|
|
else if (state >= FLOW_STATE_INI)
|
|
|
|
flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
|
2024-07-18 15:26:27 +10:00
|
|
|
pif_name(f->pif[INISIDE]),
|
2024-07-18 15:26:28 +10:00
|
|
|
inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
|
2024-07-18 15:26:27 +10:00
|
|
|
ini->eport,
|
2024-08-21 14:19:57 +10:00
|
|
|
inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
|
|
|
|
ini->oport);
|
2024-05-21 15:57:07 +10:00
|
|
|
}
|
|
|
|
|
2024-09-06 15:17:07 +10:00
|
|
|
/**
|
|
|
|
* flow_set_state() - Change flow's state
|
|
|
|
* @f: Flow changing state
|
|
|
|
* @state: New state
|
|
|
|
*/
|
|
|
|
static void flow_set_state(struct flow_common *f, enum flow_state state)
|
|
|
|
{
|
|
|
|
uint8_t oldstate = f->state;
|
|
|
|
|
|
|
|
ASSERT(state < FLOW_NUM_STATES);
|
|
|
|
ASSERT(oldstate < FLOW_NUM_STATES);
|
|
|
|
|
|
|
|
f->state = state;
|
|
|
|
flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
|
|
|
|
FLOW_STATE(f));
|
|
|
|
|
|
|
|
flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
|
|
|
|
}
|
|
|
|
|
2024-05-21 15:57:07 +10:00
|
|
|
/**
|
2024-07-18 15:26:27 +10:00
|
|
|
* flow_initiate_() - Move flow to INI, setting pif[INISIDE]
|
2024-05-21 15:57:07 +10:00
|
|
|
* @flow: Flow to change state
|
|
|
|
* @pif: pif of the initiating side
|
|
|
|
*/
|
2024-07-18 15:26:27 +10:00
|
|
|
static void flow_initiate_(union flow *flow, uint8_t pif)
|
2024-05-21 15:57:07 +10:00
|
|
|
{
|
|
|
|
struct flow_common *f = &flow->f;
|
|
|
|
|
|
|
|
ASSERT(pif != PIF_NONE);
|
|
|
|
ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW);
|
|
|
|
ASSERT(f->type == FLOW_TYPE_NONE);
|
|
|
|
ASSERT(f->pif[INISIDE] == PIF_NONE && f->pif[TGTSIDE] == PIF_NONE);
|
|
|
|
|
|
|
|
f->pif[INISIDE] = pif;
|
|
|
|
flow_set_state(f, FLOW_STATE_INI);
|
|
|
|
}
|
|
|
|
|
2024-07-18 15:26:27 +10:00
|
|
|
/**
|
|
|
|
* flow_initiate_af() - Move flow to INI, setting INISIDE details
|
|
|
|
* @flow: Flow to change state
|
|
|
|
* @pif: pif of the initiating side
|
2024-08-21 14:19:57 +10:00
|
|
|
* @af: Address family of @saddr and @daddr
|
2024-07-18 15:26:27 +10:00
|
|
|
* @saddr: Source address (pointer to in_addr or in6_addr)
|
|
|
|
* @sport: Endpoint port
|
|
|
|
* @daddr: Destination address (pointer to in_addr or in6_addr)
|
|
|
|
* @dport: Destination port
|
|
|
|
*
|
|
|
|
* Return: pointer to the initiating flowside information
|
|
|
|
*/
|
|
|
|
const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
|
|
|
|
sa_family_t af,
|
|
|
|
const void *saddr, in_port_t sport,
|
|
|
|
const void *daddr, in_port_t dport)
|
|
|
|
{
|
|
|
|
struct flowside *ini = &flow->f.side[INISIDE];
|
|
|
|
|
|
|
|
flowside_from_af(ini, af, saddr, sport, daddr, dport);
|
|
|
|
flow_initiate_(flow, pif);
|
|
|
|
return ini;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* flow_initiate_sa() - Move flow to INI, setting INISIDE details
|
|
|
|
* @flow: Flow to change state
|
|
|
|
* @pif: pif of the initiating side
|
|
|
|
* @ssa: Source socket address
|
|
|
|
* @dport: Destination port
|
|
|
|
*
|
|
|
|
* Return: pointer to the initiating flowside information
|
|
|
|
*/
|
2025-02-12 18:07:17 +11:00
|
|
|
struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
|
|
|
|
const union sockaddr_inany *ssa,
|
|
|
|
in_port_t dport)
|
2024-07-18 15:26:27 +10:00
|
|
|
{
|
|
|
|
struct flowside *ini = &flow->f.side[INISIDE];
|
|
|
|
|
|
|
|
inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
|
|
|
|
if (inany_v4(&ini->eaddr))
|
2024-08-21 14:19:57 +10:00
|
|
|
ini->oaddr = inany_any4;
|
2024-07-18 15:26:27 +10:00
|
|
|
else
|
2024-08-21 14:19:57 +10:00
|
|
|
ini->oaddr = inany_any6;
|
|
|
|
ini->oport = dport;
|
2024-07-18 15:26:27 +10:00
|
|
|
flow_initiate_(flow, pif);
|
|
|
|
return ini;
|
|
|
|
}
|
|
|
|
|
2024-07-18 15:26:43 +10:00
|
|
|
/**
|
|
|
|
* flow_target() - Determine where flow should forward to, and move to TGT
|
|
|
|
* @c: Execution context
|
|
|
|
* @flow: Flow to forward
|
|
|
|
* @proto: Protocol
|
|
|
|
*
|
|
|
|
* Return: pointer to the target flowside information
|
|
|
|
*/
|
2025-01-31 18:27:07 +01:00
|
|
|
struct flowside *flow_target(const struct ctx *c, union flow *flow,
|
|
|
|
uint8_t proto)
|
2024-07-18 15:26:43 +10:00
|
|
|
{
|
|
|
|
char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
|
|
|
|
struct flow_common *f = &flow->f;
|
|
|
|
const struct flowside *ini = &f->side[INISIDE];
|
|
|
|
struct flowside *tgt = &f->side[TGTSIDE];
|
|
|
|
uint8_t tgtpif = PIF_NONE;
|
|
|
|
|
|
|
|
ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_INI);
|
|
|
|
ASSERT(f->type == FLOW_TYPE_NONE);
|
|
|
|
ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] == PIF_NONE);
|
|
|
|
ASSERT(flow->f.state == FLOW_STATE_INI);
|
|
|
|
|
|
|
|
switch (f->pif[INISIDE]) {
|
|
|
|
case PIF_TAP:
|
|
|
|
tgtpif = fwd_nat_from_tap(c, proto, ini, tgt);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case PIF_SPLICE:
|
|
|
|
tgtpif = fwd_nat_from_splice(c, proto, ini, tgt);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case PIF_HOST:
|
|
|
|
tgtpif = fwd_nat_from_host(c, proto, ini, tgt);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
flow_err(flow, "No rules to forward %s [%s]:%hu -> [%s]:%hu",
|
|
|
|
pif_name(f->pif[INISIDE]),
|
|
|
|
inany_ntop(&ini->eaddr, estr, sizeof(estr)),
|
|
|
|
ini->eport,
|
2024-08-21 14:19:57 +10:00
|
|
|
inany_ntop(&ini->oaddr, fstr, sizeof(fstr)),
|
|
|
|
ini->oport);
|
2024-07-18 15:26:43 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
if (tgtpif == PIF_NONE)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
f->pif[TGTSIDE] = tgtpif;
|
|
|
|
flow_set_state(f, FLOW_STATE_TGT);
|
|
|
|
return tgt;
|
|
|
|
}
|
|
|
|
|
2024-02-28 22:25:10 +11:00
|
|
|
/**
|
2024-05-21 15:57:05 +10:00
|
|
|
* flow_set_type() - Set type and move to TYPED
|
|
|
|
* @flow: Flow to change state
|
2024-05-21 15:57:07 +10:00
|
|
|
* @pif: pif of the initiating side
|
2024-02-28 22:25:10 +11:00
|
|
|
*/
|
2024-05-21 15:57:06 +10:00
|
|
|
union flow *flow_set_type(union flow *flow, enum flow_type type)
|
2024-02-28 22:25:10 +11:00
|
|
|
{
|
2024-05-21 15:57:05 +10:00
|
|
|
struct flow_common *f = &flow->f;
|
|
|
|
|
|
|
|
ASSERT(type != FLOW_TYPE_NONE);
|
2024-05-21 15:57:07 +10:00
|
|
|
ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_TGT);
|
2024-05-21 15:57:05 +10:00
|
|
|
ASSERT(f->type == FLOW_TYPE_NONE);
|
2024-05-21 15:57:07 +10:00
|
|
|
ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE);
|
2024-05-21 15:57:05 +10:00
|
|
|
|
|
|
|
f->type = type;
|
|
|
|
flow_set_state(f, FLOW_STATE_TYPED);
|
2024-02-28 22:25:10 +11:00
|
|
|
return flow;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2024-05-21 15:57:05 +10:00
|
|
|
* flow_activate() - Move flow to ACTIVE
|
|
|
|
* @f: Flow to change state
|
2024-02-28 22:25:10 +11:00
|
|
|
*/
|
2024-05-21 15:57:05 +10:00
|
|
|
void flow_activate(struct flow_common *f)
|
2024-02-28 22:25:10 +11:00
|
|
|
{
|
2024-05-21 15:57:05 +10:00
|
|
|
ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED);
|
2024-05-21 15:57:07 +10:00
|
|
|
ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE);
|
2024-02-28 22:25:10 +11:00
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
flow_set_state(f, FLOW_STATE_ACTIVE);
|
|
|
|
flow_new_entry = NULL;
|
2024-02-28 22:25:10 +11:00
|
|
|
}
|
|
|
|
|
2024-01-16 11:50:41 +11:00
|
|
|
/**
|
|
|
|
* flow_alloc() - Allocate a new flow
|
|
|
|
*
|
|
|
|
* Return: pointer to an unused flow entry, or NULL if the table is full
|
|
|
|
*/
|
|
|
|
union flow *flow_alloc(void)
|
|
|
|
{
|
2024-01-16 11:50:43 +11:00
|
|
|
union flow *flow = &flowtab[flow_first_free];
|
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
ASSERT(!flow_new_entry);
|
|
|
|
|
2024-01-16 11:50:43 +11:00
|
|
|
if (flow_first_free >= FLOW_MAX)
|
2024-01-16 11:50:41 +11:00
|
|
|
return NULL;
|
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
ASSERT(flow->f.state == FLOW_STATE_FREE);
|
2024-01-16 11:50:43 +11:00
|
|
|
ASSERT(flow->f.type == FLOW_TYPE_NONE);
|
|
|
|
ASSERT(flow->free.n >= 1);
|
|
|
|
ASSERT(flow_first_free + flow->free.n <= FLOW_MAX);
|
|
|
|
|
|
|
|
if (flow->free.n > 1) {
|
|
|
|
union flow *next;
|
|
|
|
|
|
|
|
/* Use one entry from the cluster */
|
|
|
|
ASSERT(flow_first_free <= FLOW_MAX - 2);
|
|
|
|
next = &flowtab[++flow_first_free];
|
|
|
|
|
|
|
|
ASSERT(FLOW_IDX(next) < FLOW_MAX);
|
|
|
|
ASSERT(next->f.type == FLOW_TYPE_NONE);
|
|
|
|
ASSERT(next->free.n == 0);
|
|
|
|
|
|
|
|
next->free.n = flow->free.n - 1;
|
|
|
|
next->free.next = flow->free.next;
|
|
|
|
} else {
|
|
|
|
/* Use the entire cluster */
|
|
|
|
flow_first_free = flow->free.next;
|
|
|
|
}
|
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
flow_new_entry = flow;
|
2024-01-16 11:50:43 +11:00
|
|
|
memset(flow, 0, sizeof(*flow));
|
2024-05-21 15:57:05 +10:00
|
|
|
flow_set_state(&flow->f, FLOW_STATE_NEW);
|
|
|
|
|
2024-01-16 11:50:43 +11:00
|
|
|
return flow;
|
2024-01-16 11:50:41 +11:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* flow_alloc_cancel() - Free a newly allocated flow
|
|
|
|
* @flow: Flow to deallocate
|
|
|
|
*
|
|
|
|
* @flow must be the last flow allocated by flow_alloc()
|
|
|
|
*/
|
|
|
|
void flow_alloc_cancel(union flow *flow)
|
|
|
|
{
|
2024-05-21 15:57:05 +10:00
|
|
|
ASSERT(flow_new_entry == flow);
|
|
|
|
ASSERT(flow->f.state == FLOW_STATE_NEW ||
|
2024-05-21 15:57:07 +10:00
|
|
|
flow->f.state == FLOW_STATE_INI ||
|
|
|
|
flow->f.state == FLOW_STATE_TGT ||
|
2024-05-21 15:57:05 +10:00
|
|
|
flow->f.state == FLOW_STATE_TYPED);
|
2024-01-16 11:50:43 +11:00
|
|
|
ASSERT(flow_first_free > FLOW_IDX(flow));
|
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
|
|
|
memset(flow, 0, sizeof(*flow));
|
|
|
|
|
2024-01-16 11:50:43 +11:00
|
|
|
/* Put it back in a length 1 free cluster, don't attempt to fully
|
|
|
|
* reverse flow_alloc()s steps. This will get folded together the next
|
|
|
|
* time flow_defer_handler runs anyway() */
|
|
|
|
flow->free.n = 1;
|
|
|
|
flow->free.next = flow_first_free;
|
|
|
|
flow_first_free = FLOW_IDX(flow);
|
2024-05-21 15:57:05 +10:00
|
|
|
flow_new_entry = NULL;
|
2023-11-30 13:02:12 +11:00
|
|
|
}
|
2023-11-30 13:02:13 +11:00
|
|
|
|
2024-07-18 15:26:34 +10:00
|
|
|
/**
|
|
|
|
* flow_hash() - Calculate hash value for one side of a flow
|
|
|
|
* @c: Execution context
|
|
|
|
* @proto: Protocol of this flow (IP L4 protocol number)
|
|
|
|
* @pif: pif of the side to hash
|
|
|
|
* @side: Flowside (must not have unspecified parts)
|
|
|
|
*
|
|
|
|
* Return: hash value
|
|
|
|
*/
|
2024-07-18 15:26:35 +10:00
|
|
|
static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
|
|
|
|
const struct flowside *side)
|
2024-07-18 15:26:34 +10:00
|
|
|
{
|
|
|
|
struct siphash_state state = SIPHASH_INIT(c->hash_secret);
|
|
|
|
|
2024-08-21 14:19:57 +10:00
|
|
|
inany_siphash_feed(&state, &side->oaddr);
|
2024-07-18 15:26:34 +10:00
|
|
|
inany_siphash_feed(&state, &side->eaddr);
|
|
|
|
|
|
|
|
return siphash_final(&state, 38, (uint64_t)proto << 40 |
|
|
|
|
(uint64_t)pif << 32 |
|
2024-08-21 14:19:57 +10:00
|
|
|
(uint64_t)side->oport << 16 |
|
2024-07-18 15:26:34 +10:00
|
|
|
(uint64_t)side->eport);
|
|
|
|
}
|
|
|
|
|
2024-07-18 15:26:35 +10:00
|
|
|
/**
|
|
|
|
* flow_sidx_hash() - Calculate hash value for given side of a given flow
|
|
|
|
* @c: Execution context
|
|
|
|
* @sidx: Flow & side index to get hash for
|
|
|
|
*
|
|
|
|
* Return: hash value, of the flow & side represented by @sidx
|
|
|
|
*/
|
|
|
|
static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
|
|
|
|
{
|
|
|
|
const struct flow_common *f = &flow_at_sidx(sidx)->f;
|
2024-08-14 20:03:33 +10:00
|
|
|
const struct flowside *side = &f->side[sidx.sidei];
|
|
|
|
uint8_t pif = f->pif[sidx.sidei];
|
|
|
|
|
2024-12-05 15:26:02 +11:00
|
|
|
ASSERT(pif != PIF_NONE);
|
2024-08-14 20:03:33 +10:00
|
|
|
return flow_hash(c, FLOW_PROTO(f), pif, side);
|
2024-07-18 15:26:35 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2024-07-18 15:26:36 +10:00
|
|
|
* flow_hash_probe_() - Find hash bucket for a flow, given hash
|
|
|
|
* @hash: Raw hash value for flow & side
|
2024-07-18 15:26:35 +10:00
|
|
|
* @sidx: Flow and side to find bucket for
|
|
|
|
*
|
|
|
|
* Return: If @sidx is in the hash table, its current bucket, otherwise a
|
|
|
|
* suitable free bucket for it.
|
|
|
|
*/
|
2024-07-18 15:26:36 +10:00
|
|
|
static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
|
2024-07-18 15:26:35 +10:00
|
|
|
{
|
2024-07-18 15:26:36 +10:00
|
|
|
unsigned b = hash % FLOW_HASH_SIZE;
|
2024-07-18 15:26:35 +10:00
|
|
|
|
|
|
|
/* Linear probing */
|
|
|
|
while (flow_sidx_valid(flow_hashtab[b]) &&
|
|
|
|
!flow_sidx_eq(flow_hashtab[b], sidx))
|
|
|
|
b = mod_sub(b, 1, FLOW_HASH_SIZE);
|
|
|
|
|
|
|
|
return b;
|
|
|
|
}
|
|
|
|
|
2024-07-18 15:26:36 +10:00
|
|
|
/**
|
|
|
|
* flow_hash_probe() - Find hash bucket for a flow
|
|
|
|
* @c: Execution context
|
|
|
|
* @sidx: Flow and side to find bucket for
|
|
|
|
*
|
|
|
|
* Return: If @sidx is in the hash table, its current bucket, otherwise a
|
|
|
|
* suitable free bucket for it.
|
|
|
|
*/
|
|
|
|
static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
|
|
|
|
{
|
|
|
|
return flow_hash_probe_(flow_sidx_hash(c, sidx), sidx);
|
|
|
|
}
|
|
|
|
|
2024-07-18 15:26:35 +10:00
|
|
|
/**
|
|
|
|
* flow_hash_insert() - Insert side of a flow into into hash table
|
|
|
|
* @c: Execution context
|
|
|
|
* @sidx: Flow & side index
|
2024-07-18 15:26:36 +10:00
|
|
|
*
|
|
|
|
* Return: raw (un-modded) hash value of side of flow
|
2024-07-18 15:26:35 +10:00
|
|
|
*/
|
2024-07-18 15:26:36 +10:00
|
|
|
uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx)
|
2024-07-18 15:26:35 +10:00
|
|
|
{
|
2024-07-18 15:26:36 +10:00
|
|
|
uint64_t hash = flow_sidx_hash(c, sidx);
|
|
|
|
unsigned b = flow_hash_probe_(hash, sidx);
|
2024-07-18 15:26:35 +10:00
|
|
|
|
|
|
|
flow_hashtab[b] = sidx;
|
|
|
|
flow_dbg(flow_at_sidx(sidx), "Side %u hash table insert: bucket: %u",
|
|
|
|
sidx.sidei, b);
|
2024-07-18 15:26:36 +10:00
|
|
|
|
|
|
|
return hash;
|
2024-07-18 15:26:35 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* flow_hash_remove() - Drop side of a flow from the hash table
|
|
|
|
* @c: Execution context
|
|
|
|
* @sidx: Side of flow to remove
|
|
|
|
*/
|
|
|
|
void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx)
|
|
|
|
{
|
|
|
|
unsigned b = flow_hash_probe(c, sidx), s;
|
|
|
|
|
|
|
|
if (!flow_sidx_valid(flow_hashtab[b]))
|
|
|
|
return; /* Redundant remove */
|
|
|
|
|
|
|
|
flow_dbg(flow_at_sidx(sidx), "Side %u hash table remove: bucket: %u",
|
|
|
|
sidx.sidei, b);
|
|
|
|
|
|
|
|
/* Scan the remainder of the cluster */
|
|
|
|
for (s = mod_sub(b, 1, FLOW_HASH_SIZE);
|
|
|
|
flow_sidx_valid(flow_hashtab[s]);
|
|
|
|
s = mod_sub(s, 1, FLOW_HASH_SIZE)) {
|
|
|
|
unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE;
|
|
|
|
|
|
|
|
if (!mod_between(h, s, b, FLOW_HASH_SIZE)) {
|
|
|
|
/* flow_hashtab[s] can live in flow_hashtab[b]'s slot */
|
|
|
|
debug("hash table remove: shuffle %u -> %u", s, b);
|
|
|
|
flow_hashtab[b] = flow_hashtab[s];
|
|
|
|
b = s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
flow_hashtab[b] = FLOW_SIDX_NONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* flowside_lookup() - Look for a matching flowside in the flow table
|
|
|
|
* @c: Execution context
|
|
|
|
* @proto: Protocol of the flow (IP L4 protocol number)
|
|
|
|
* @pif: pif to look for in the table
|
|
|
|
* @side: Flowside to look for in the table
|
|
|
|
*
|
|
|
|
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
|
|
|
*/
|
|
|
|
static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
|
|
|
|
uint8_t pif, const struct flowside *side)
|
|
|
|
{
|
|
|
|
flow_sidx_t sidx;
|
|
|
|
union flow *flow;
|
|
|
|
unsigned b;
|
|
|
|
|
|
|
|
b = flow_hash(c, proto, pif, side) % FLOW_HASH_SIZE;
|
|
|
|
while ((sidx = flow_hashtab[b], flow = flow_at_sidx(sidx)) &&
|
|
|
|
!(FLOW_PROTO(&flow->f) == proto &&
|
|
|
|
flow->f.pif[sidx.sidei] == pif &&
|
|
|
|
flowside_eq(&flow->f.side[sidx.sidei], side)))
|
2024-09-06 15:17:05 +10:00
|
|
|
b = mod_sub(b, 1, FLOW_HASH_SIZE);
|
2024-07-18 15:26:35 +10:00
|
|
|
|
|
|
|
return flow_hashtab[b];
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* flow_lookup_af() - Look up a flow given addressing information
|
|
|
|
* @c: Execution context
|
|
|
|
* @proto: Protocol of the flow (IP L4 protocol number)
|
|
|
|
* @pif: Interface of the flow
|
|
|
|
* @af: Address family, AF_INET or AF_INET6
|
|
|
|
* @eaddr: Guest side endpoint address (guest local address)
|
2024-08-21 14:19:57 +10:00
|
|
|
* @oaddr: Our guest side address (guest remote address)
|
2024-07-18 15:26:35 +10:00
|
|
|
* @eport: Guest side endpoint port (guest local port)
|
2024-08-21 14:19:57 +10:00
|
|
|
* @oport: Our guest side port (guest remote port)
|
2024-07-18 15:26:35 +10:00
|
|
|
*
|
|
|
|
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
|
|
|
*/
|
|
|
|
flow_sidx_t flow_lookup_af(const struct ctx *c,
|
|
|
|
uint8_t proto, uint8_t pif, sa_family_t af,
|
2024-08-21 14:19:57 +10:00
|
|
|
const void *eaddr, const void *oaddr,
|
|
|
|
in_port_t eport, in_port_t oport)
|
2024-07-18 15:26:35 +10:00
|
|
|
{
|
|
|
|
struct flowside side;
|
|
|
|
|
2024-08-21 14:19:57 +10:00
|
|
|
flowside_from_af(&side, af, eaddr, eport, oaddr, oport);
|
2024-07-18 15:26:35 +10:00
|
|
|
return flowside_lookup(c, proto, pif, &side);
|
|
|
|
}
|
|
|
|
|
2024-07-18 15:26:46 +10:00
|
|
|
/**
|
|
|
|
* flow_lookup_sa() - Look up a flow given an endpoint socket address
|
|
|
|
* @c: Execution context
|
|
|
|
* @proto: Protocol of the flow (IP L4 protocol number)
|
|
|
|
* @pif: Interface of the flow
|
|
|
|
* @esa: Socket address of the endpoint
|
2024-08-21 14:19:57 +10:00
|
|
|
* @oport: Our port number
|
2024-07-18 15:26:46 +10:00
|
|
|
*
|
|
|
|
* Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
|
|
|
|
*/
|
|
|
|
flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
|
2024-08-21 14:19:57 +10:00
|
|
|
const void *esa, in_port_t oport)
|
2024-07-18 15:26:46 +10:00
|
|
|
{
|
|
|
|
struct flowside side = {
|
2024-08-21 14:19:57 +10:00
|
|
|
.oport = oport,
|
2024-07-18 15:26:46 +10:00
|
|
|
};
|
|
|
|
|
|
|
|
inany_from_sockaddr(&side.eaddr, &side.eport, esa);
|
|
|
|
if (inany_v4(&side.eaddr))
|
2024-08-21 14:19:57 +10:00
|
|
|
side.oaddr = inany_any4;
|
2024-07-18 15:26:46 +10:00
|
|
|
else
|
2024-08-21 14:19:57 +10:00
|
|
|
side.oaddr = inany_any6;
|
2024-07-18 15:26:46 +10:00
|
|
|
|
|
|
|
return flowside_lookup(c, proto, pif, &side);
|
|
|
|
}
|
|
|
|
|
2024-01-16 11:50:35 +11:00
|
|
|
/**
|
2024-01-16 11:50:36 +11:00
|
|
|
* flow_defer_handler() - Handler for per-flow deferred and timed tasks
|
2024-01-16 11:50:35 +11:00
|
|
|
* @c: Execution context
|
2024-01-16 11:50:36 +11:00
|
|
|
* @now: Current timestamp
|
2024-01-16 11:50:35 +11:00
|
|
|
*/
|
2024-01-16 11:50:40 +11:00
|
|
|
void flow_defer_handler(const struct ctx *c, const struct timespec *now)
|
2024-01-16 11:50:35 +11:00
|
|
|
{
|
2024-01-16 11:50:43 +11:00
|
|
|
struct flow_free_cluster *free_head = NULL;
|
|
|
|
unsigned *last_next = &flow_first_free;
|
2024-01-16 11:50:36 +11:00
|
|
|
bool timer = false;
|
2024-01-16 11:50:43 +11:00
|
|
|
unsigned idx;
|
2024-01-16 11:50:35 +11:00
|
|
|
|
2024-01-16 11:50:36 +11:00
|
|
|
if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
|
|
|
|
timer = true;
|
|
|
|
flow_timer_run = *now;
|
|
|
|
}
|
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
|
|
|
|
|
2024-01-16 11:50:43 +11:00
|
|
|
for (idx = 0; idx < FLOW_MAX; idx++) {
|
|
|
|
union flow *flow = &flowtab[idx];
|
2024-01-16 11:50:42 +11:00
|
|
|
bool closed = false;
|
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
switch (flow->f.state) {
|
|
|
|
case FLOW_STATE_FREE: {
|
2024-01-16 11:50:43 +11:00
|
|
|
unsigned skip = flow->free.n;
|
|
|
|
|
|
|
|
/* First entry of a free cluster must have n >= 1 */
|
|
|
|
ASSERT(skip);
|
|
|
|
|
|
|
|
if (free_head) {
|
|
|
|
/* Merge into preceding free cluster */
|
|
|
|
free_head->n += flow->free.n;
|
|
|
|
flow->free.n = flow->free.next = 0;
|
|
|
|
} else {
|
|
|
|
/* New free cluster, add to chain */
|
|
|
|
free_head = &flow->free;
|
|
|
|
*last_next = idx;
|
|
|
|
last_next = &free_head->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Skip remaining empty entries */
|
|
|
|
idx += skip - 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2024-05-21 15:57:05 +10:00
|
|
|
case FLOW_STATE_NEW:
|
2024-05-21 15:57:07 +10:00
|
|
|
case FLOW_STATE_INI:
|
|
|
|
case FLOW_STATE_TGT:
|
2024-05-21 15:57:05 +10:00
|
|
|
case FLOW_STATE_TYPED:
|
|
|
|
/* Incomplete flow at end of cycle */
|
|
|
|
ASSERT(false);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case FLOW_STATE_ACTIVE:
|
|
|
|
/* Nothing to do */
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
ASSERT(false);
|
|
|
|
}
|
|
|
|
|
2024-01-16 11:50:35 +11:00
|
|
|
switch (flow->f.type) {
|
2024-01-16 11:50:43 +11:00
|
|
|
case FLOW_TYPE_NONE:
|
|
|
|
ASSERT(false);
|
|
|
|
break;
|
2024-01-16 11:50:35 +11:00
|
|
|
case FLOW_TCP:
|
2024-05-21 15:57:03 +10:00
|
|
|
closed = tcp_flow_defer(&flow->tcp);
|
2024-01-16 11:50:35 +11:00
|
|
|
break;
|
|
|
|
case FLOW_TCP_SPLICE:
|
2024-05-21 15:57:03 +10:00
|
|
|
closed = tcp_splice_flow_defer(&flow->tcp_splice);
|
2024-01-16 11:50:42 +11:00
|
|
|
if (!closed && timer)
|
2024-05-21 15:57:03 +10:00
|
|
|
tcp_splice_timer(c, &flow->tcp_splice);
|
2024-01-16 11:50:35 +11:00
|
|
|
break;
|
2024-02-29 15:15:32 +11:00
|
|
|
case FLOW_PING4:
|
|
|
|
case FLOW_PING6:
|
|
|
|
if (timer)
|
2024-05-21 15:57:03 +10:00
|
|
|
closed = icmp_ping_timer(c, &flow->ping, now);
|
2024-02-29 15:15:32 +11:00
|
|
|
break;
|
2024-07-18 15:26:46 +10:00
|
|
|
case FLOW_UDP:
|
2024-09-06 15:17:06 +10:00
|
|
|
closed = udp_flow_defer(&flow->udp);
|
|
|
|
if (!closed && timer)
|
2024-07-18 15:26:46 +10:00
|
|
|
closed = udp_flow_timer(c, &flow->udp, now);
|
|
|
|
break;
|
2024-01-16 11:50:35 +11:00
|
|
|
default:
|
|
|
|
/* Assume other flow types don't need any handling */
|
|
|
|
;
|
|
|
|
}
|
2024-01-16 11:50:42 +11:00
|
|
|
|
2024-01-16 11:50:43 +11:00
|
|
|
if (closed) {
|
2024-05-21 15:57:05 +10:00
|
|
|
flow_set_state(&flow->f, FLOW_STATE_FREE);
|
|
|
|
memset(flow, 0, sizeof(*flow));
|
2024-01-16 11:50:43 +11:00
|
|
|
|
|
|
|
if (free_head) {
|
|
|
|
/* Add slot to current free cluster */
|
|
|
|
ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
|
|
|
|
free_head->n++;
|
|
|
|
flow->free.n = flow->free.next = 0;
|
|
|
|
} else {
|
|
|
|
/* Create new free cluster */
|
|
|
|
free_head = &flow->free;
|
|
|
|
free_head->n = 1;
|
|
|
|
*last_next = idx;
|
|
|
|
last_next = &free_head->next;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
free_head = NULL;
|
|
|
|
}
|
2024-01-16 11:50:35 +11:00
|
|
|
}
|
2024-01-16 11:50:43 +11:00
|
|
|
|
|
|
|
*last_next = FLOW_MAX;
|
|
|
|
}
|
|
|
|
|
migrate: Migrate TCP flows
This implements flow preparation on the source, transfer of data with
a format roughly inspired by struct tcp_tap_conn, plus a specific
structure for parameters that don't fit in the flow table, and flow
insertion on the target, with all the appropriate window options,
window scaling, MSS, etc.
Contents of pending queues are transferred as well.
The target side is rather convoluted because we first need to create
sockets and switch them to repair mode, before we can apply options
that are *not* stored in the flow table. This also means that, if
we're testing this on the same machine, in the same namespace, we need
to close the listening socket on the source before we can start moving
data.
Further, we need to connect() the socket on the target before we can
restore data queues, but we can't do that (again, on the same machine)
as long as the matching source socket is open, which implies an
arbitrary limit on queue sizes we can transfer, because we can only
dump pending queues on the source as long as the socket is open, of
course.
Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2025-02-13 23:14:13 +11:00
|
|
|
/**
|
|
|
|
* flow_migrate_source_rollback() - Disable repair mode, return failure
|
|
|
|
* @c: Execution context
|
|
|
|
* @max_flow: Maximum index of affected flows
|
|
|
|
* @ret: Negative error code
|
|
|
|
*
|
|
|
|
* Return: @ret
|
|
|
|
*/
|
|
|
|
static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
|
|
|
|
int ret)
|
|
|
|
{
|
|
|
|
union flow *flow;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
debug("...roll back migration");
|
|
|
|
|
|
|
|
foreach_established_tcp_flow(i, flow, max_flow)
|
|
|
|
if (tcp_flow_repair_off(c, &flow->tcp))
|
|
|
|
die("Failed to roll back TCP_REPAIR mode");
|
|
|
|
|
|
|
|
if (repair_flush(c))
|
|
|
|
die("Failed to roll back TCP_REPAIR mode");
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* flow_migrate_repair_all() - Turn repair mode on or off for all flows
|
|
|
|
* @c: Execution context
|
|
|
|
* @enable: Switch repair mode on if set, off otherwise
|
|
|
|
*
|
|
|
|
* Return: 0 on success, negative error code on failure
|
|
|
|
*/
|
|
|
|
static int flow_migrate_repair_all(struct ctx *c, bool enable)
|
|
|
|
{
|
|
|
|
union flow *flow;
|
|
|
|
unsigned i;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
foreach_established_tcp_flow(i, flow, FLOW_MAX) {
|
|
|
|
if (enable)
|
|
|
|
rc = tcp_flow_repair_on(c, &flow->tcp);
|
|
|
|
else
|
|
|
|
rc = tcp_flow_repair_off(c, &flow->tcp);
|
|
|
|
|
|
|
|
if (rc) {
|
|
|
|
debug("Can't %s repair mode: %s",
|
|
|
|
enable ? "enable" : "disable", strerror_(-rc));
|
|
|
|
return flow_migrate_source_rollback(c, i, rc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((rc = repair_flush(c))) {
|
|
|
|
debug("Can't %s repair mode: %s",
|
|
|
|
enable ? "enable" : "disable", strerror_(-rc));
|
|
|
|
return flow_migrate_source_rollback(c, i, rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * flow_migrate_source_pre() - Prepare flows for migration: enable repair mode
 * @c:		Execution context
 * @stage:	Migration stage information (unused)
 * @fd:		Migration file descriptor (unused)
 *
 * Return: 0 on success, positive error code on failure
 */
int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
			    int fd)
{
	int rc;

	(void)stage;
	(void)fd;

	rc = flow_migrate_repair_all(c, true);
	if (rc == 0)
		return 0;

	/* Helper reports negative error codes, callers expect positive ones */
	return -rc;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* flow_migrate_source() - Dump all the remaining information and send data
|
|
|
|
* @c: Execution context (unused)
|
|
|
|
* @stage: Migration stage information (unused)
|
|
|
|
* @fd: Migration file descriptor
|
|
|
|
*
|
|
|
|
* Return: 0 on success, positive error code on failure
|
|
|
|
*/
|
|
|
|
int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
|
|
|
int fd)
|
|
|
|
{
|
|
|
|
uint32_t count = 0;
|
|
|
|
bool first = true;
|
|
|
|
union flow *flow;
|
|
|
|
unsigned i;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
(void)c;
|
|
|
|
(void)stage;
|
|
|
|
|
|
|
|
foreach_established_tcp_flow(i, flow, FLOW_MAX)
|
|
|
|
count++;
|
|
|
|
|
|
|
|
count = htonl(count);
|
|
|
|
if (write_all_buf(fd, &count, sizeof(count))) {
|
|
|
|
rc = errno;
|
|
|
|
err_perror("Can't send flow count (%u)", ntohl(count));
|
|
|
|
return flow_migrate_source_rollback(c, FLOW_MAX, rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
debug("Sending %u flows", ntohl(count));
|
|
|
|
|
|
|
|
/* Dump and send information that can be stored in the flow table.
|
|
|
|
*
|
|
|
|
* Limited rollback options here: if we fail to transfer any data (that
|
|
|
|
* is, on the first flow), undo everything and resume. Otherwise, the
|
|
|
|
* stream might now be inconsistent, and we might have closed listening
|
|
|
|
* TCP sockets, so just terminate.
|
|
|
|
*/
|
|
|
|
foreach_established_tcp_flow(i, flow, FLOW_MAX) {
|
|
|
|
rc = tcp_flow_migrate_source(fd, &flow->tcp);
|
|
|
|
if (rc) {
|
|
|
|
err("Can't send data, flow %u: %s", i, strerror_(-rc));
|
|
|
|
if (!first)
|
|
|
|
die("Inconsistent migration state, exiting");
|
|
|
|
|
|
|
|
return flow_migrate_source_rollback(c, FLOW_MAX, -rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
first = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* And then "extended" data (including window data we saved previously):
|
|
|
|
* the target needs to set repair mode on sockets before it can set
|
|
|
|
* this stuff, but it needs sockets (and flows) for that.
|
|
|
|
*
|
|
|
|
* This also closes sockets so that the target can start connecting
|
|
|
|
* theirs: you can't sendmsg() to queues (using the socket) if the
|
|
|
|
* socket is not connected (EPIPE), not even in repair mode. And the
|
|
|
|
* target needs to restore queues now because we're sending the data.
|
|
|
|
*
|
|
|
|
* So, no rollback here, just try as hard as we can. Tolerate per-flow
|
|
|
|
* failures but not if the stream might be inconsistent (reported here
|
|
|
|
* as EIO).
|
|
|
|
*/
|
|
|
|
foreach_established_tcp_flow(i, flow, FLOW_MAX) {
|
2025-02-18 19:59:23 +11:00
|
|
|
rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
|
migrate: Migrate TCP flows
This implements flow preparation on the source, transfer of data with
a format roughly inspired by struct tcp_tap_conn, plus a specific
structure for parameters that don't fit in the flow table, and flow
insertion on the target, with all the appropriate window options,
window scaling, MSS, etc.
Contents of pending queues are transferred as well.
The target side is rather convoluted because we first need to create
sockets and switch them to repair mode, before we can apply options
that are *not* stored in the flow table. This also means that, if
we're testing this on the same machine, in the same namespace, we need
to close the listening socket on the source before we can start moving
data.
Further, we need to connect() the socket on the target before we can
restore data queues, but we can't do that (again, on the same machine)
as long as the matching source socket is open, which implies an
arbitrary limit on queue sizes we can transfer, because we can only
dump pending queues on the source as long as the socket is open, of
course.
Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2025-02-13 23:14:13 +11:00
|
|
|
if (rc) {
|
|
|
|
err("Extended data for flow %u: %s", i, strerror_(-rc));
|
|
|
|
|
|
|
|
if (rc == -EIO)
|
|
|
|
die("Inconsistent migration state, exiting");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* flow_migrate_target() - Receive flows and insert in flow table
|
|
|
|
* @c: Execution context
|
|
|
|
* @stage: Migration stage information (unused)
|
|
|
|
* @fd: Migration file descriptor
|
|
|
|
*
|
|
|
|
* Return: 0 on success, positive error code on failure
|
|
|
|
*/
|
|
|
|
int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
|
|
|
|
int fd)
|
|
|
|
{
|
|
|
|
uint32_t count;
|
|
|
|
unsigned i;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
(void)stage;
|
|
|
|
|
|
|
|
if (read_all_buf(fd, &count, sizeof(count)))
|
|
|
|
return errno;
|
|
|
|
|
|
|
|
count = ntohl(count);
|
|
|
|
debug("Receiving %u flows", count);
|
|
|
|
|
|
|
|
if ((rc = flow_migrate_repair_all(c, true)))
|
|
|
|
return -rc;
|
|
|
|
|
|
|
|
repair_flush(c);
|
|
|
|
|
|
|
|
/* TODO: flow header with type, instead? */
|
|
|
|
for (i = 0; i < count; i++) {
|
|
|
|
rc = tcp_flow_migrate_target(c, fd);
|
|
|
|
if (rc) {
|
|
|
|
debug("Migration data failure at flow %u: %s, abort",
|
|
|
|
i, strerror_(-rc));
|
|
|
|
return -rc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
repair_flush(c);
|
|
|
|
|
|
|
|
for (i = 0; i < count; i++) {
|
2025-02-18 19:59:21 +11:00
|
|
|
rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd);
|
migrate: Migrate TCP flows
This implements flow preparation on the source, transfer of data with
a format roughly inspired by struct tcp_tap_conn, plus a specific
structure for parameters that don't fit in the flow table, and flow
insertion on the target, with all the appropriate window options,
window scaling, MSS, etc.
Contents of pending queues are transferred as well.
The target side is rather convoluted because we first need to create
sockets and switch them to repair mode, before we can apply options
that are *not* stored in the flow table. This also means that, if
we're testing this on the same machine, in the same namespace, we need
to close the listening socket on the source before we can start moving
data.
Further, we need to connect() the socket on the target before we can
restore data queues, but we can't do that (again, on the same machine)
as long as the matching source socket is open, which implies an
arbitrary limit on queue sizes we can transfer, because we can only
dump pending queues on the source as long as the socket is open, of
course.
Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2025-02-13 23:14:13 +11:00
|
|
|
if (rc) {
|
|
|
|
debug("Migration data failure at flow %u: %s, abort",
|
|
|
|
i, strerror_(-rc));
|
|
|
|
return -rc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-01-16 11:50:43 +11:00
|
|
|
/**
|
|
|
|
* flow_init() - Initialise flow related data structures
|
|
|
|
*/
|
|
|
|
void flow_init(void)
|
|
|
|
{
|
2024-07-18 15:26:35 +10:00
|
|
|
unsigned b;
|
|
|
|
|
2024-01-16 11:50:43 +11:00
|
|
|
/* Initial state is a single free cluster containing the whole table */
|
|
|
|
flowtab[0].free.n = FLOW_MAX;
|
|
|
|
flowtab[0].free.next = FLOW_MAX;
|
2024-07-18 15:26:35 +10:00
|
|
|
|
|
|
|
for (b = 0; b < FLOW_HASH_SIZE; b++)
|
|
|
|
flow_hashtab[b] = FLOW_SIDX_NONE;
|
2024-01-16 11:50:35 +11:00
|
|
|
}
|