1
0
mirror of https://passt.top/passt synced 2025-02-22 19:02:20 +00:00

Add interfaces and configuration bits for passt-repair

In vhost-user mode, by default, create a second UNIX domain socket
accepting connections from passt-repair, with the usual listener
socket.

When we need to set or clear TCP_REPAIR on sockets, we'll send them
via SCM_RIGHTS to passt-repair, who sets the socket option values we
ask for.

To that end, introduce batched functions to request TCP_REPAIR
settings on sockets, so that we don't have to send a single message
for each socket, on migration. When needed, repair_flush() will
send the message and check for the reply.

Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2025-02-12 18:07:15 +11:00
parent 155cd0c41e
commit b899141ad5
12 changed files with 381 additions and 73 deletions

View File

@ -38,9 +38,9 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \
tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
vhost_user.c virtio.c vu_common.c
ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \
repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \
udp_vu.c util.c vhost_user.c virtio.c vu_common.c
QRAP_SRCS = qrap.c
PASST_REPAIR_SRCS = passt-repair.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
@ -50,9 +50,9 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \
tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \
vhost_user.h virtio.h vu_common.h
pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \
tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \
udp_vu.h util.h vhost_user.h virtio.h vu_common.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}

43
conf.c
View File

@ -820,6 +820,9 @@ static void usage(const char *name, FILE *f, int status)
" UNIX domain socket is provided by -s option\n"
" --print-capabilities print back-end capabilities in JSON format,\n"
" only meaningful for vhost-user mode\n");
FPRINTF(f,
" --repair-path PATH path for passt-repair(1)\n"
" default: append '.repair' to UNIX domain path\n");
}
FPRINTF(f,
@ -1245,8 +1248,25 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
*/
static void conf_open_files(struct ctx *c)
{
if (c->mode != MODE_PASTA && c->fd_tap == -1)
c->fd_tap_listen = tap_sock_unix_open(c->sock_path);
if (c->mode != MODE_PASTA && c->fd_tap == -1) {
c->fd_tap_listen = sock_unix(c->sock_path);
if (c->mode == MODE_VU && strcmp(c->repair_path, "none")) {
if (!*c->repair_path &&
snprintf_check(c->repair_path,
sizeof(c->repair_path), "%s.repair",
c->sock_path)) {
warn("passt-repair path %s not usable",
c->repair_path);
c->fd_repair_listen = -1;
} else {
c->fd_repair_listen = sock_unix(c->repair_path);
}
} else {
c->fd_repair_listen = -1;
}
c->fd_repair = -1;
}
if (*c->pidfile) {
c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
@ -1360,10 +1380,12 @@ void conf(struct ctx *c, int argc, char **argv)
{"host-lo-to-ns-lo", no_argument, NULL, 23 },
{"dns-host", required_argument, NULL, 24 },
{"vhost-user", no_argument, NULL, 25 },
/* vhost-user backend program convention */
{"print-capabilities", no_argument, NULL, 26 },
{"socket-path", required_argument, NULL, 's' },
{"fqdn", required_argument, NULL, 27 },
{"repair-path", required_argument, NULL, 28 },
{ 0 },
};
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@ -1570,6 +1592,9 @@ void conf(struct ctx *c, int argc, char **argv)
"%s", optarg))
die("Invalid FQDN: %s", optarg);
break;
case 28:
/* Handle this once we checked --vhost-user */
break;
case 'd':
c->debug = 1;
c->quiet = 0;
@ -1841,8 +1866,8 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
c->no_dhcp = 1;
/* Inbound port options & DNS can be parsed now (after IPv4/IPv6
* settings)
/* Inbound port options, DNS, and --repair-path can be parsed now, after
* IPv4/IPv6 settings and --vhost-user.
*/
fwd_probe_ephemeral();
udp_portmap_clear();
@ -1888,6 +1913,16 @@ void conf(struct ctx *c, int argc, char **argv)
}
die("Cannot use DNS address %s", optarg);
} else if (name == 28) {
if (c->mode != MODE_VU && strcmp(optarg, "none"))
die("--repair-path is for vhost-user mode only");
if (snprintf_check(c->repair_path,
sizeof(c->repair_path), "%s",
optarg))
die("Invalid passt-repair path: %s", optarg);
break;
}
} while (name != -1);

View File

@ -40,6 +40,10 @@ enum epoll_type {
EPOLL_TYPE_VHOST_CMD,
/* vhost-user kick event socket */
EPOLL_TYPE_VHOST_KICK,
/* TCP_REPAIR helper listening socket */
EPOLL_TYPE_REPAIR_LISTEN,
/* TCP_REPAIR helper socket */
EPOLL_TYPE_REPAIR,
EPOLL_NUM_TYPES,
};

View File

@ -23,6 +23,7 @@
#include "flow_table.h"
#include "migrate.h"
#include "repair.h"
/* Magic identifier for migration data */
#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0
@ -232,7 +233,7 @@ void migrate_init(struct ctx *c)
}
/**
* migrate_close() - Close migration channel
* migrate_close() - Close migration channel and connection to passt-repair
* @c: Execution context
*/
void migrate_close(struct ctx *c)
@ -243,6 +244,8 @@ void migrate_close(struct ctx *c)
c->device_state_fd = -1;
c->device_state_result = -1;
}
repair_close(c);
}
/**

11
passt.1
View File

@ -428,6 +428,17 @@ Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
.BR \-\-print-capabilities
Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
.TP
.BR \-\-repair-path " " \fIpath
Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect
to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during
migration. \fB--repair-path none\fR disables this interface (if you need to
specify a socket path called "none" you can prefix the path by \fI./\fR).
Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
chosen for the hypervisor UNIX domain socket. No socket is created if not in
\-\-vhost-user mode.
.TP
.BR \-F ", " \-\-fd " " \fIFD
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened

View File

@ -52,6 +52,7 @@
#include "ndp.h"
#include "vu_common.h"
#include "migrate.h"
#include "repair.h"
#define EPOLL_EVENTS 8
@ -76,6 +77,8 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
[EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
[EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@ -358,6 +361,12 @@ loop:
case EPOLL_TYPE_VHOST_KICK:
vu_kick_cb(c.vdev, ref, &now);
break;
case EPOLL_TYPE_REPAIR_LISTEN:
repair_listen_handler(&c, eventmask);
break;
case EPOLL_TYPE_REPAIR:
repair_handler(&c, eventmask);
break;
default:
/* Can't happen */
ASSERT(0);

View File

@ -20,6 +20,7 @@ union epoll_ref;
#include "siphash.h"
#include "ip.h"
#include "inany.h"
#include "migrate.h"
#include "flow.h"
#include "icmp.h"
#include "fwd.h"
@ -193,6 +194,7 @@ struct ip6_ctx {
* @foreground: Run in foreground, don't log to stderr by default
* @nofile: Maximum number of open files (ulimit -n)
* @sock_path: Path for UNIX domain socket
* @repair_path: TCP_REPAIR helper path, can be "none", empty for default
* @pcap: Path for packet capture file
* @pidfile: Path to PID file, empty string if not configured
* @pidfile_fd: File descriptor for PID file, -1 if none
@ -203,6 +205,8 @@ struct ip6_ctx {
* @epollfd: File descriptor for epoll instance
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
* @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any
* @fd_repair: Connected AF_UNIX socket for TCP_REPAIR helper
* @our_tap_mac: Pasta/passt's MAC on the tap link
* @guest_mac: MAC address of guest or namespace, seen or configured
* @hash_secret: 128-bit secret for siphash functions
@ -249,6 +253,7 @@ struct ctx {
int foreground;
int nofile;
char sock_path[UNIX_PATH_MAX];
char repair_path[UNIX_PATH_MAX];
char pcap[PATH_MAX];
char pidfile[PATH_MAX];
@ -265,6 +270,8 @@ struct ctx {
int epollfd;
int fd_tap_listen;
int fd_tap;
int fd_repair_listen;
int fd_repair;
unsigned char our_tap_mac[ETH_ALEN];
unsigned char guest_mac[ETH_ALEN];
uint64_t hash_secret[2];

219
repair.c Normal file
View File

@ -0,0 +1,219 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR
*
* Copyright (c) 2025 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <errno.h>
#include <sys/uio.h>
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
#include "repair.h"
#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
/* Pending file descriptors for next repair_flush() call, or command change */
static int repair_fds[SCM_MAX_FD];
/* Pending command: flush pending file descriptors if it changes */
static int8_t repair_cmd;
/* Number of pending file descriptors set in @repair_fds */
static int repair_nfds;
/**
* repair_sock_init() - Start listening for connections on helper socket
* @c: Execution context
*/
void repair_sock_init(const struct ctx *c)
{
union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN };
struct epoll_event ev = { 0 };
if (c->fd_repair_listen == -1)
return;
if (listen(c->fd_repair_listen, 0)) {
err_perror("listen() on repair helper socket, won't migrate");
return;
}
ref.fd = c->fd_repair_listen;
ev.events = EPOLLIN | EPOLLHUP | EPOLLET;
ev.data.u64 = ref.u64;
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev))
err_perror("repair helper socket epoll_ctl(), won't migrate");
}
/**
* repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
* @c: Execution context
* @events: epoll events
*/
void repair_listen_handler(struct ctx *c, uint32_t events)
{
union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
struct epoll_event ev = { 0 };
struct ucred ucred;
socklen_t len;
if (events != EPOLLIN) {
debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
events);
return;
}
len = sizeof(ucred);
/* Another client is already connected: accept and close right away. */
if (c->fd_repair != -1) {
int discard = accept4(c->fd_repair_listen, NULL, NULL,
SOCK_NONBLOCK);
if (discard == -1)
return;
if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
close(discard);
return;
}
if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) {
debug_perror("accept4() on TCP_REPAIR helper listening socket");
return;
}
if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
ref.fd = c->fd_repair;
ev.events = EPOLLHUP | EPOLLET;
ev.data.u64 = ref.u64;
if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) {
debug_perror("epoll_ctl() on TCP_REPAIR helper socket");
close(c->fd_repair);
c->fd_repair = -1;
}
}
/**
* repair_close() - Close connection to TCP_REPAIR helper
* @c: Execution context
*/
void repair_close(struct ctx *c)
{
debug("Closing TCP_REPAIR helper socket");
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_repair, NULL);
close(c->fd_repair);
c->fd_repair = -1;
}
/**
* repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket
* @c: Execution context
* @events: epoll events
*/
void repair_handler(struct ctx *c, uint32_t events)
{
(void)events;
repair_close(c);
}
/**
* repair_flush() - Flush current set of sockets to helper, with current command
* @c: Execution context
*
* Return: 0 on success, negative error code on failure
*/
int repair_flush(struct ctx *c)
{
struct iovec iov = { &repair_cmd, sizeof(repair_cmd) };
char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
__attribute__ ((aligned(__alignof__(struct cmsghdr))));
struct cmsghdr *cmsg;
struct msghdr msg;
int8_t reply;
if (!repair_nfds)
return 0;
msg = (struct msghdr){ NULL, 0, &iov, 1,
buf, CMSG_SPACE(sizeof(int) * repair_nfds), 0 };
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds);
memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds);
repair_nfds = 0;
if (sendmsg(c->fd_repair, &msg, 0) < 0) {
int ret = -errno;
err_perror("Failed to send sockets to TCP_REPAIR helper");
repair_close(c);
return ret;
}
if (recv(c->fd_repair, &reply, sizeof(reply), 0) < 0) {
int ret = -errno;
err_perror("Failed to receive reply from TCP_REPAIR helper");
repair_close(c);
return ret;
}
if (reply != repair_cmd) {
err("Unexpected reply from TCP_REPAIR helper: %d", reply);
repair_close(c);
return -ENXIO;
}
return 0;
}
/**
* repair_set() - Add socket to TCP_REPAIR set with given command
* @c: Execution context
* @s: Socket to add
* @cmd: TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP
*
* Return: 0 on success, negative error code on failure
*/
/* cppcheck-suppress unusedFunction */
int repair_set(struct ctx *c, int s, int cmd)
{
int rc;
if (repair_nfds && repair_cmd != cmd) {
if ((rc = repair_flush(c)))
return rc;
}
repair_cmd = cmd;
repair_fds[repair_nfds++] = s;
if (repair_nfds >= SCM_MAX_FD) {
if ((rc = repair_flush(c)))
return rc;
}
return 0;
}

16
repair.h Normal file
View File

@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright (c) 2025 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#ifndef REPAIR_H
#define REPAIR_H
void repair_sock_init(const struct ctx *c);
void repair_listen_handler(struct ctx *c, uint32_t events);
void repair_handler(struct ctx *c, uint32_t events);
void repair_close(struct ctx *c);
int repair_flush(struct ctx *c);
int repair_set(struct ctx *c, int s, int cmd);
#endif /* REPAIR_H */

65
tap.c
View File

@ -56,6 +56,7 @@
#include "netlink.h"
#include "pasta.h"
#include "packet.h"
#include "repair.h"
#include "tap.h"
#include "log.h"
#include "vhost_user.h"
@ -1151,68 +1152,6 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
tap_pasta_input(c, now);
}
/**
* tap_sock_unix_open() - Create and bind AF_UNIX socket
* @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
*
* Return: socket descriptor on success, won't return on failure
*/
int tap_sock_unix_open(char *sock_path)
{
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
struct sockaddr_un addr = {
.sun_family = AF_UNIX,
};
int i;
if (fd < 0)
die_perror("Failed to open UNIX domain socket");
for (i = 1; i < UNIX_SOCK_MAX; i++) {
char *path = addr.sun_path;
int ex, ret;
if (*sock_path)
memcpy(path, sock_path, UNIX_PATH_MAX);
else if (snprintf_check(path, UNIX_PATH_MAX - 1,
UNIX_SOCK_PATH, i))
die_perror("Can't build UNIX domain socket path");
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
0);
if (ex < 0)
die_perror("Failed to check for UNIX domain conflicts");
ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
errno != EACCES)) {
if (*sock_path)
die("Socket path %s already in use", path);
close(ex);
continue;
}
close(ex);
unlink(path);
ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
if (*sock_path && ret)
die_perror("Failed to bind UNIX domain socket");
if (!ret)
break;
}
if (i == UNIX_SOCK_MAX)
die_perror("Failed to bind UNIX domain socket");
info("UNIX domain socket bound at %s", addr.sun_path);
if (!*sock_path)
memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
return fd;
}
/**
* tap_backend_show_hints() - Give help information to start QEMU
* @c: Execution context
@ -1423,6 +1362,8 @@ void tap_backend_init(struct ctx *c)
tap_sock_tun_init(c);
break;
case MODE_VU:
repair_sock_init(c);
/* fall through */
case MODE_PASST:
tap_sock_unix_init(c);

62
util.c
View File

@ -178,6 +178,68 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return fd;
}
/**
* sock_unix() - Create and bind AF_UNIX socket
* @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
*
* Return: socket descriptor on success, won't return on failure
*/
int sock_unix(char *sock_path)
{
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
struct sockaddr_un addr = {
.sun_family = AF_UNIX,
};
int i;
if (fd < 0)
die_perror("Failed to open UNIX domain socket");
for (i = 1; i < UNIX_SOCK_MAX; i++) {
char *path = addr.sun_path;
int ex, ret;
if (*sock_path)
memcpy(path, sock_path, UNIX_PATH_MAX);
else if (snprintf_check(path, UNIX_PATH_MAX - 1,
UNIX_SOCK_PATH, i))
die_perror("Can't build UNIX domain socket path");
ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
0);
if (ex < 0)
die_perror("Failed to check for UNIX domain conflicts");
ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
errno != EACCES)) {
if (*sock_path)
die("Socket path %s already in use", path);
close(ex);
continue;
}
close(ex);
unlink(path);
ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
if (*sock_path && ret)
die_perror("Failed to bind UNIX domain socket");
if (!ret)
break;
}
if (i == UNIX_SOCK_MAX)
die_perror("Failed to bind UNIX domain socket");
info("UNIX domain socket bound at %s", addr.sun_path);
if (!*sock_path)
memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
return fd;
}
/**
* sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
* @c: Execution context

1
util.h
View File

@ -217,6 +217,7 @@ struct ctx;
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
const char *ifname, bool v6only, uint32_t data);
int sock_unix(char *sock_path);
void sock_probe_mem(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);