1
0
mirror of https://passt.top/passt synced 2025-01-18 10:25:17 +00:00

vhost-user: add vhost-user

Add virtio and vhost-user functions to connect with QEMU.

  $ ./passt --vhost-user

and

  # qemu-system-x86_64 ... -m 4G \
        -object memory-backend-memfd,id=memfd0,share=on,size=4G \
        -numa node,memdev=memfd0 \
        -chardev socket,id=chr0,path=/tmp/passt_1.socket \
        -netdev vhost-user,id=netdev0,chardev=chr0 \
        -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \
        ...

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
This commit is contained in:
Laurent Vivier 2024-11-13 09:04:06 +01:00 committed by Stefano Brivio
parent 007af94bb9
commit 92fe7e967a
24 changed files with 1412 additions and 54 deletions

View File

@ -37,7 +37,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c vhost_user.c virtio.c
tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
vhost_user.c virtio.c vu_common.c
QRAP_SRCS = qrap.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
@ -47,7 +48,8 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
udp.h udp_flow.h util.h vhost_user.h virtio.h
tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \
virtio.h vu_common.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}

21
conf.c
View File

@ -45,6 +45,7 @@
#include "lineread.h"
#include "isolation.h"
#include "log.h"
#include "vhost_user.h"
#define NETNS_RUN_DIR "/run/netns"
@ -769,9 +770,14 @@ static void usage(const char *name, FILE *f, int status)
" default: same interface name as external one\n");
} else {
FPRINTF(f,
" -s, --socket PATH UNIX domain socket path\n"
" -s, --socket, --socket-path PATH UNIX domain socket path\n"
" default: probe free path starting from "
UNIX_SOCK_PATH "\n", 1);
FPRINTF(f,
" --vhost-user Enable vhost-user mode\n"
" UNIX domain socket is provided by -s option\n"
" --print-capabilities print back-end capabilities in JSON format,\n"
" only meaningful for vhost-user mode\n");
}
FPRINTF(f,
@ -1305,6 +1311,10 @@ void conf(struct ctx *c, int argc, char **argv)
{"map-guest-addr", required_argument, NULL, 22 },
{"host-lo-to-ns-lo", no_argument, NULL, 23 },
{"dns-host", required_argument, NULL, 24 },
{"vhost-user", no_argument, NULL, 25 },
/* vhost-user backend program convention */
{"print-capabilities", no_argument, NULL, 26 },
{"socket-path", required_argument, NULL, 's' },
{ 0 },
};
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@ -1498,6 +1508,15 @@ void conf(struct ctx *c, int argc, char **argv)
break;
die("Invalid host nameserver address: %s", optarg);
case 25:
if (c->mode == MODE_PASTA) {
err("--vhost-user is for passt mode only");
usage(argv[0], stdout, EXIT_SUCCESS);
}
c->mode = MODE_VU;
break;
case 26:
vu_print_capabilities();
break;
case 'd':
c->debug = 1;

View File

@ -36,6 +36,10 @@ enum epoll_type {
EPOLL_TYPE_TAP_PASST,
/* socket listening for qemu socket connections */
EPOLL_TYPE_TAP_LISTEN,
/* vhost-user command socket */
EPOLL_TYPE_VHOST_CMD,
/* vhost-user kick event socket */
EPOLL_TYPE_VHOST_KICK,
EPOLL_NUM_TYPES,
};

1
iov.c
View File

@ -68,7 +68,6 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n,
*
* Returns: The number of bytes successfully copied.
*/
/* cppcheck-suppress unusedFunction */
size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt,
size_t offset, const void *buf, size_t bytes)
{

View File

@ -379,12 +379,21 @@ void isolate_postfork(const struct ctx *c)
prctl(PR_SET_DUMPABLE, 0);
if (c->mode == MODE_PASTA) {
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
prog.filter = filter_pasta;
} else {
switch (c->mode) {
case MODE_PASST:
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
prog.filter = filter_passt;
break;
case MODE_PASTA:
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
prog.filter = filter_pasta;
break;
case MODE_VU:
prog.len = (unsigned short)ARRAY_SIZE(filter_vu);
prog.filter = filter_vu;
break;
default:
ASSERT(0);
}
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||

View File

@ -36,6 +36,17 @@
static int packet_check_range(const struct pool *p, size_t offset, size_t len,
const char *start, const char *func, int line)
{
if (p->buf_size == 0) {
int ret;
ret = vu_packet_check_range((void *)p->buf, offset, len, start);
if (ret == -1)
trace("cannot find region, %s:%i", func, line);
return ret;
}
if (start < p->buf) {
trace("packet start %p before buffer start %p, "
"%s:%i", (void *)start, (void *)p->buf, func, line);

View File

@ -8,8 +8,10 @@
/**
* struct pool - Generic pool of packets stored in a buffer
* @buf: Buffer storing packet descriptors
* @buf_size: Total size of buffer
* @buf: Buffer storing packet descriptors,
* a struct vu_dev_region array for passt vhost-user mode
* @buf_size: Total size of buffer,
* 0 for passt vhost-user mode
* @size: Number of usable descriptors for the pool
* @count: Number of used descriptors for the pool
* @pkt: Descriptors: see macros below
@ -22,6 +24,8 @@ struct pool {
struct iovec pkt[1];
};
int vu_packet_check_range(void *buf, size_t offset, size_t len,
const char *start);
void packet_add_do(struct pool *p, size_t len, const char *start,
const char *func, int line);
void *packet_get_do(const struct pool *p, const size_t idx,

10
passt.1
View File

@ -397,12 +397,20 @@ interface address are configured on a given host interface.
.SS \fBpasst\fR-only options
.TP
.BR \-s ", " \-\-socket " " \fIpath
.BR \-s ", " \-\-socket-path ", " \-\-socket " " \fIpath
Path for UNIX domain socket used by \fBqemu\fR(1) or \fBqrap\fR(1) to connect to
\fBpasst\fR.
Default is to probe a free socket, not accepting connections, starting from
\fI/tmp/passt_1.socket\fR to \fI/tmp/passt_64.socket\fR.
.TP
.BR \-\-vhost-user
Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
.TP
.BR \-\-print-capabilities
Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
.TP
.BR \-F ", " \-\-fd " " \fIFD
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened

View File

@ -50,6 +50,7 @@
#include "log.h"
#include "tcp_splice.h"
#include "ndp.h"
#include "vu_common.h"
#define EPOLL_EVENTS 8
@ -72,6 +73,8 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device",
[EPOLL_TYPE_TAP_PASST] = "connected qemu socket",
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@ -346,6 +349,12 @@ loop:
case EPOLL_TYPE_PING:
icmp_sock_handler(&c, ref);
break;
case EPOLL_TYPE_VHOST_CMD:
vu_control_handler(c.vdev, c.fd_tap, eventmask);
break;
case EPOLL_TYPE_VHOST_KICK:
vu_kick_cb(c.vdev, ref, &now);
break;
default:
/* Can't happen */
ASSERT(0);

View File

@ -25,6 +25,7 @@ union epoll_ref;
#include "fwd.h"
#include "tcp.h"
#include "udp.h"
#include "vhost_user.h"
/* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0
* (unicast) and bit 1 of byte 1 must be 1 (locally administered). Otherwise
@ -43,6 +44,7 @@ union epoll_ref;
* @icmp: ICMP-specific reference part
* @data: Data handled by protocol handlers
* @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone
* @queue: vhost-user queue index for this fd
* @u64: Opaque reference for epoll_ctl() and epoll_wait()
*/
union epoll_ref {
@ -58,6 +60,7 @@ union epoll_ref {
union udp_listen_epoll_ref udp;
uint32_t data;
int nsdir_fd;
int queue;
};
};
uint64_t u64;
@ -94,6 +97,7 @@ struct fqdn {
enum passt_modes {
MODE_PASST,
MODE_PASTA,
MODE_VU,
};
/**
@ -229,6 +233,7 @@ struct ip6_ctx {
* @freebind: Allow binding of non-local addresses for forwarding
* @low_wmem: Low probed net.core.wmem_max
* @low_rmem: Low probed net.core.rmem_max
* @vdev: vhost-user device
*/
struct ctx {
enum passt_modes mode;
@ -291,6 +296,8 @@ struct ctx {
int low_wmem;
int low_rmem;
struct vu_dev *vdev;
};
void proto_update_l2_buf(const unsigned char *eth_d,

1
pcap.c
View File

@ -143,7 +143,6 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
* @iovcnt: Number of buffers (@iov entries)
* @offset: Offset of the L2 frame within the full data length
*/
/* cppcheck-suppress unusedFunction */
void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
{
struct timespec now = { 0 };

67
tap.c
View File

@ -58,6 +58,8 @@
#include "packet.h"
#include "tap.h"
#include "log.h"
#include "vhost_user.h"
#include "vu_common.h"
/* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */
static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf);
@ -78,16 +80,22 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
struct iovec iov[2];
size_t iovcnt = 0;
if (c->mode == MODE_PASST) {
switch (c->mode) {
case MODE_PASST:
iov[iovcnt] = IOV_OF_LVALUE(vnet_len);
iovcnt++;
}
/* fall through */
case MODE_PASTA:
iov[iovcnt].iov_base = (void *)data;
iov[iovcnt].iov_len = l2len;
iovcnt++;
tap_send_frames(c, iov, iovcnt, 1);
break;
case MODE_VU:
vu_send_single(c, data, l2len);
break;
}
}
/**
@ -414,10 +422,18 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
if (!nframes)
return 0;
if (c->mode == MODE_PASTA)
switch (c->mode) {
case MODE_PASTA:
m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes);
else
break;
case MODE_PASST:
m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
break;
case MODE_VU:
/* fall through */
default:
ASSERT(0);
}
if (m < nframes)
debug("tap: failed to send %zu frames of %zu",
@ -976,7 +992,7 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p)
* tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket
* @c: Execution context
*/
static void tap_sock_reset(struct ctx *c)
void tap_sock_reset(struct ctx *c)
{
info("Client connection closed%s", c->one_off ? ", exiting" : "");
@ -987,6 +1003,8 @@ static void tap_sock_reset(struct ctx *c)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
close(c->fd_tap);
c->fd_tap = -1;
if (c->mode == MODE_VU)
vu_cleanup(c->vdev);
}
/**
@ -1207,6 +1225,11 @@ static void tap_backend_show_hints(struct ctx *c)
info("or qrap, for earlier qemu versions:");
info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio");
break;
case MODE_VU:
info("You can start qemu with:");
info(" kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n",
c->sock_path);
break;
}
}
@ -1234,8 +1257,8 @@ static void tap_sock_unix_init(const struct ctx *c)
*/
void tap_listen_handler(struct ctx *c, uint32_t events)
{
union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST };
struct epoll_event ev = { 0 };
union epoll_ref ref = { 0 };
int v = INT_MAX / 2;
struct ucred ucred;
socklen_t len;
@ -1275,6 +1298,10 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
trace("tap: failed to set SO_SNDBUF to %i", v);
ref.fd = c->fd_tap;
if (c->mode == MODE_VU)
ref.type = EPOLL_TYPE_VHOST_CMD;
else
ref.type = EPOLL_TYPE_TAP_PASST;
ev.events = EPOLLIN | EPOLLRDHUP;
ev.data.u64 = ref.u64;
epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev);
@ -1341,7 +1368,7 @@ static void tap_sock_tun_init(struct ctx *c)
* @base: Buffer base
* @size Buffer size
*/
static void tap_sock_update_pool(void *base, size_t size)
void tap_sock_update_pool(void *base, size_t size)
{
int i;
@ -1361,6 +1388,9 @@ static void tap_sock_update_pool(void *base, size_t size)
*/
void tap_backend_init(struct ctx *c)
{
if (c->mode == MODE_VU)
tap_sock_update_pool(NULL, 0);
else
tap_sock_update_pool(pkt_buf, sizeof(pkt_buf));
if (c->fd_tap != -1) { /* Passed as --fd */
@ -1369,10 +1399,17 @@ void tap_backend_init(struct ctx *c)
ASSERT(c->one_off);
ref.fd = c->fd_tap;
if (c->mode == MODE_PASST)
switch (c->mode) {
case MODE_PASST:
ref.type = EPOLL_TYPE_TAP_PASST;
else
break;
case MODE_PASTA:
ref.type = EPOLL_TYPE_TAP_PASTA;
break;
case MODE_VU:
ref.type = EPOLL_TYPE_VHOST_CMD;
break;
}
ev.events = EPOLLIN | EPOLLRDHUP;
ev.data.u64 = ref.u64;
@ -1380,9 +1417,14 @@ void tap_backend_init(struct ctx *c)
return;
}
if (c->mode == MODE_PASTA) {
switch (c->mode) {
case MODE_PASTA:
tap_sock_tun_init(c);
} else {
break;
case MODE_VU:
vu_init(c);
/* fall through */
case MODE_PASST:
tap_sock_unix_init(c);
/* In passt mode, we don't know the guest's MAC address until it
@ -1390,6 +1432,7 @@ void tap_backend_init(struct ctx *c)
* first packets will reach it.
*/
memset(&c->guest_mac, 0xff, sizeof(c->guest_mac));
break;
}
tap_backend_show_hints(c);

3
tap.h
View File

@ -40,6 +40,7 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c,
*/
static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
{
if (thdr)
thdr->vnet_len = htonl(l2len);
}
@ -68,6 +69,8 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now);
int tap_sock_unix_open(char *sock_path);
void tap_sock_reset(struct ctx *c);
void tap_sock_update_pool(void *base, size_t size);
void tap_backend_init(struct ctx *c);
void tap_flush_pools(void);
void tap_handler(struct ctx *c, const struct timespec *now);

7
tcp.c
View File

@ -304,6 +304,7 @@
#include "flow_table.h"
#include "tcp_internal.h"
#include "tcp_buf.h"
#include "tcp_vu.h"
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
@ -1312,6 +1313,9 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
int flags)
{
if (c->mode == MODE_VU)
return tcp_vu_send_flag(c, conn, flags);
return tcp_buf_send_flag(c, conn, flags);
}
@ -1705,6 +1709,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
*/
static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
if (c->mode == MODE_VU)
return tcp_vu_data_from_sock(c, conn);
return tcp_buf_data_from_sock(c, conn);
}

513
tcp_vu.c Normal file
View File

@ -0,0 +1,513 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* tcp_vu.c - TCP L2 vhost-user management functions
*
* Copyright Red Hat
* Author: Laurent Vivier <lvivier@redhat.com>
*/
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <linux/virtio_net.h>
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "siphash.h"
#include "inany.h"
#include "vhost_user.h"
#include "tcp.h"
#include "pcap.h"
#include "flow.h"
#include "tcp_conn.h"
#include "flow_table.h"
#include "tcp_vu.h"
#include "tap.h"
#include "tcp_internal.h"
#include "checksum.h"
#include "vu_common.h"
/* Scatter-gather entries backing the virtqueue elements below.
* Entry 0 is kept free: tcp_vu_sock_recv() points it at the discard buffer
* when the peek offset capability is not available, and attaches the
* virtqueue elements starting from entry 1.
*/
static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
/* Virtqueue elements collected from the RX queue for the current transfer */
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
/**
 * tcp_vu_hdrlen() - Size of all headers preceding the TCP payload in a frame
 * @v6:		Set for IPv6 packet
 *
 * The total covers the virtio-net header (mergeable RX buffers layout), the
 * Ethernet header, the IPv4 or IPv6 header and the TCP header.
 *
 * Return: size of the headers, in bytes
 */
static size_t tcp_vu_hdrlen(bool v6)
{
	const size_t l3len = v6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr);

	return sizeof(struct virtio_net_hdr_mrg_rxbuf) +
	       sizeof(struct ethhdr) + sizeof(struct tcphdr) + l3len;
}
/**
 * tcp_vu_update_check() - Calculate TCP checksum
 * @tapside:	Address information for one side of the flow
 * @iov:	Pointer to the array of IO vectors; the first entry starts with
 *		the frame headers
 * @iov_used:	Length of the array
 *
 * The IP version is selected from @tapside's oaddr; the offset passed to the
 * checksum helper is the distance from the start of the first buffer to the
 * L4 header.
 */
static void tcp_vu_update_check(const struct flowside *tapside,
				struct iovec *iov, int iov_used)
{
	char *frame = iov[0].iov_base;

	if (!inany_v4(&tapside->oaddr)) {
		const struct ipv6hdr *ip6h = vu_ip(frame);

		tcp_update_check_tcp6(ip6h, iov, iov_used,
				      (char *)vu_payloadv6(frame) - frame);
		return;
	}

	const struct iphdr *iph = vu_ip(frame);

	tcp_update_check_tcp4(iph, iov, iov_used,
			      (char *)vu_payloadv4(frame) - frame);
}
/**
 * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flags:	TCP flags: if not set, send segment only if ACK is due
 *
 * One buffer is taken from the RX virtqueue and filled with the headers. If
 * DUP_ACK is set in @flags, a second buffer is taken and filled with a copy
 * of the same frame; if no suitable second buffer is available, only the
 * first frame is sent.
 *
 * Return: negative error code on connection reset, 0 otherwise
 */
int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	const struct flowside *tapside = TAPFLOW(conn);
	size_t l2len, l4len, optlen, hdrlen;
	struct vu_virtq_element flags_elem[2];
	struct iovec flags_iov[2];
	struct ethhdr *eh;
	int elem_cnt;
	int nb_ack;
	int ret;

	hdrlen = tcp_vu_hdrlen(CONN_V6(conn));

	vu_set_element(&flags_elem[0], NULL, &flags_iov[0]);

	elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1,
			      hdrlen + sizeof(struct tcp_syn_opts), NULL);
	if (elem_cnt != 1)
		return -1;

	/* Headers and options are written in place below: the buffer must be
	 * able to hold them all (same requirement as in tcp_vu_prepare())
	 */
	ASSERT(flags_elem[0].in_sg[0].iov_len >=
	       hdrlen + sizeof(struct tcp_syn_opts));

	vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1);

	eh = vu_eth(flags_elem[0].in_sg[0].iov_base);

	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));

	if (CONN_V4(conn)) {
		struct tcp_payload_t *payload;
		struct iphdr *iph;
		uint32_t seq;

		eh->h_proto = htons(ETH_P_IP);

		iph = vu_ip(flags_elem[0].in_sg[0].iov_base);
		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);

		payload = vu_payloadv4(flags_elem[0].in_sg[0].iov_base);
		memset(&payload->th, 0, sizeof(payload->th));
		payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
		payload->th.ack = 1;

		/* Sample the sequence before tcp_prepare_flags() is called */
		seq = conn->seq_to_tap;
		ret = tcp_prepare_flags(c, conn, flags, &payload->th,
					(struct tcp_syn_opts *)payload->data,
					&optlen);
		if (ret <= 0) {
			/* Nothing to send: give the buffer back */
			vu_queue_rewind(vq, 1);
			return ret;
		}

		l4len = tcp_fill_headers4(conn, NULL, iph, payload, optlen,
					  NULL, seq, true);
		l2len = sizeof(*iph);
	} else {
		struct tcp_payload_t *payload;
		struct ipv6hdr *ip6h;
		uint32_t seq;

		eh->h_proto = htons(ETH_P_IPV6);

		ip6h = vu_ip(flags_elem[0].in_sg[0].iov_base);
		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);

		payload = vu_payloadv6(flags_elem[0].in_sg[0].iov_base);
		memset(&payload->th, 0, sizeof(payload->th));
		payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
		payload->th.ack = 1;

		/* Sample the sequence before tcp_prepare_flags() is called */
		seq = conn->seq_to_tap;
		ret = tcp_prepare_flags(c, conn, flags, &payload->th,
					(struct tcp_syn_opts *)payload->data,
					&optlen);
		if (ret <= 0) {
			/* Nothing to send: give the buffer back */
			vu_queue_rewind(vq, 1);
			return ret;
		}

		l4len = tcp_fill_headers6(conn, NULL, ip6h, payload, optlen,
					  seq, true);
		l2len = sizeof(*ip6h);
	}
	l2len += l4len + sizeof(struct ethhdr);

	/* The buffer length is the L2 frame plus the leading virtio-net header */
	flags_elem[0].in_sg[0].iov_len = l2len +
					 sizeof(struct virtio_net_hdr_mrg_rxbuf);

	if (*c->pcap) {
		tcp_vu_update_check(tapside, &flags_elem[0].in_sg[0], 1);
		pcap_iov(&flags_elem[0].in_sg[0], 1,
			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
	}
	nb_ack = 1;

	if (flags & DUP_ACK) {
		const struct iovec *src = &flags_elem[0].in_sg[0];
		struct iovec *dup_iov;

		vu_set_element(&flags_elem[1], NULL, &flags_iov[1]);

		elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1,
				      src->iov_len, NULL);
		dup_iov = &flags_elem[1].in_sg[0];

		if (elem_cnt == 1 && dup_iov->iov_len >= src->iov_len) {
			memcpy(dup_iov->iov_base, src->iov_base, src->iov_len);
			/* The duplicate describes exactly the copied bytes */
			dup_iov->iov_len = src->iov_len;
			nb_ack++;

			/* Same frame layout as the first one: skip the
			 * virtio-net header in the capture as well
			 */
			if (*c->pcap)
				pcap_iov(dup_iov, 1,
					 sizeof(struct virtio_net_hdr_mrg_rxbuf));
		} else if (elem_cnt == 1) {
			/* Buffer too small for the copy: release it instead
			 * of overflowing it, and send a single segment
			 */
			vu_queue_rewind(vq, 1);
		}
	}

	vu_flush(vdev, vq, flags_elem, nb_ack);

	return 0;
}
/**
* tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers
* @c: Execution context
* @conn: Connection pointer
* @v6: Set for IPv6 connections
* @already_sent: Number of bytes already sent
* @fillsize: Number of bytes we can receive
* @iov_cnt: Number of virtqueue elements collected (output)
*
* The data is peeked (MSG_PEEK), not consumed, from the socket. On return,
* the first iov entry of each collected frame points past the room reserved
* for the headers: the caller has to move it back before filling them in.
*
* Return: value returned by recvmsg(): number of bytes peeked (including the
* @already_sent bytes going to the discard buffer if the peek offset
* capability is not available), negative on failure with errno set
*/
static ssize_t tcp_vu_sock_recv(const struct ctx *c,
const struct tcp_tap_conn *conn, bool v6,
uint32_t already_sent, size_t fillsize,
int *iov_cnt)
{
struct vu_dev *vdev = c->vdev;
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
struct msghdr mh_sock = { 0 };
uint16_t mss = MSS_GET(conn);
int s = conn->sock;
size_t hdrlen;
int elem_cnt;
ssize_t ret;
*iov_cnt = 0;
hdrlen = tcp_vu_hdrlen(v6);
/* iov_vu[0] is left out: it is used for the discard buffer below */
vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
elem_cnt = 0;
while (fillsize > 0 && elem_cnt < VIRTQUEUE_MAX_SIZE) {
struct iovec *iov;
size_t frame_size;
int cnt;
/* don't ask for more than what is left to fill */
if (mss > fillsize)
mss = fillsize;
cnt = vu_collect(vdev, vq, &elem[elem_cnt],
VIRTQUEUE_MAX_SIZE - elem_cnt,
mss + hdrlen, &frame_size);
if (cnt == 0)
break;
/* reserve room for the headers at the start of the frame */
frame_size -= hdrlen;
iov = &elem[elem_cnt].in_sg[0];
iov->iov_base = (char *)iov->iov_base + hdrlen;
iov->iov_len -= hdrlen;
fillsize -= frame_size;
elem_cnt += cnt;
/* All the frames must have the same size (except the last one),
* otherwise we will not be able to scan the iov array
* to find iov entries with headers
* (headers are spread every frame_size in the array)
*/
if (frame_size < mss)
break;
}
if (peek_offset_cap) {
mh_sock.msg_iov = iov_vu + 1;
mh_sock.msg_iovlen = elem_cnt;
} else {
/* no peek offset: the bytes already sent are read again,
* into the discard buffer
*/
iov_vu[0].iov_base = tcp_buf_discard;
iov_vu[0].iov_len = already_sent;
mh_sock.msg_iov = iov_vu;
mh_sock.msg_iovlen = elem_cnt + 1;
}
/* retry if interrupted by a signal */
do
ret = recvmsg(s, &mh_sock, MSG_PEEK);
while (ret < 0 && errno == EINTR);
*iov_cnt = elem_cnt;
return ret;
}
/**
* tcp_vu_prepare() - Prepare the frame header
* @c: Execution context
* @conn: Connection pointer
* @first: Pointer to the array of IO vectors; the first entry starts with
* the virtio-net header and must be large enough for all the headers
* @dlen: Packet data length
* @check: For IPv4, the pointed value is passed on to tcp_fill_headers4()
* (NULL if no checksum is known yet) and is then updated to
* point to the IPv4 header checksum of this frame; left
* untouched for IPv6
*/
static void tcp_vu_prepare(const struct ctx *c,
struct tcp_tap_conn *conn, struct iovec *first,
size_t dlen, const uint16_t **check)
{
const struct flowside *toside = TAPFLOW(conn);
char *base = first->iov_base;
struct ethhdr *eh;
/* we guess the first iovec provided by the guest can embed
* all the headers needed by L2 frame
*/
eh = vu_eth(base);
memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
/* initialize header */
if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
struct tcp_payload_t *payload;
struct iphdr *iph;
/* the guess above is enforced here */
ASSERT(first[0].iov_len >= tcp_vu_hdrlen(false));
eh->h_proto = htons(ETH_P_IP);
iph = vu_ip(base);
*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
payload = vu_payloadv4(base);
memset(&payload->th, 0, sizeof(payload->th));
payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
payload->th.ack = 1;
tcp_fill_headers4(conn, NULL, iph, payload, dlen,
*check, conn->seq_to_tap, true);
/* hand the checksum of this frame over to the next call */
*check = &iph->check;
} else {
struct tcp_payload_t *payload;
struct ipv6hdr *ip6h;
/* the guess above is enforced here */
ASSERT(first[0].iov_len >= tcp_vu_hdrlen(true));
eh->h_proto = htons(ETH_P_IPV6);
ip6h = vu_ip(base);
*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
payload = vu_payloadv6(base);
memset(&payload->th, 0, sizeof(payload->th));
payload->th.doff = offsetof(struct tcp_payload_t, data) / 4;
payload->th.ack = 1;
tcp_fill_headers6(conn, NULL, ip6h, payload, dlen,
conn->seq_to_tap, true);
}
}
/**
 * tcp_vu_data_from_sock() - Handle new data from socket, queue to vhost-user,
 *			     in window
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Buffers are collected from the RX virtqueue, filled with data peeked from
 * the socket, split into frames of at most one MSS with headers written in
 * the first buffer of each frame, then flushed to the guest. Buffers that
 * end up unused are given back to the queue.
 *
 * Return: Negative on connection reset, 0 otherwise
 */
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	const struct flowside *tapside = TAPFLOW(conn);
	uint16_t mss = MSS_GET(conn);
	size_t hdrlen, fillsize;
	int i, iov_cnt, iov_used;
	int v6 = CONN_V6(conn);
	uint32_t already_sent = 0;
	const uint16_t *check;
	struct iovec *first = NULL;
	int frame_size;
	int num_buffers;
	ssize_t len;

	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
		flow_err(conn,
			 "Got packet, but RX virtqueue not usable yet");
		return 0;
	}

	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;

	if (SEQ_LT(already_sent, 0)) {
		/* RFC 761, section 2.1. */
		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
			   conn->seq_ack_from_tap, conn->seq_to_tap);
		conn->seq_to_tap = conn->seq_ack_from_tap;
		already_sent = 0;
		if (tcp_set_peek_offset(conn->sock, 0)) {
			tcp_rst(c, conn);
			return -1;
		}
	}

	if (!wnd_scaled || already_sent >= wnd_scaled) {
		conn_flag(c, conn, STALLED);
		conn_flag(c, conn, ACK_FROM_TAP_DUE);
		return 0;
	}

	/* Set up buffer descriptors we'll fill completely and partially. */

	fillsize = wnd_scaled - already_sent;

	/* collect the buffers from vhost-user and fill them with the
	 * data from the socket
	 */
	len = tcp_vu_sock_recv(c, conn, v6, already_sent, fillsize, &iov_cnt);
	if (len < 0) {
		/* Save errno right away: the calls below might change it, and
		 * a zero return value would hide the reset from the caller
		 */
		int recv_errno = errno;

		vu_queue_rewind(vq, iov_cnt);
		if (recv_errno != EAGAIN && recv_errno != EWOULDBLOCK) {
			tcp_rst(c, conn);
			return -recv_errno;
		}
		return 0;
	}

	if (!len) {
		vu_queue_rewind(vq, iov_cnt);
		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
			int ret = tcp_vu_send_flag(c, conn, FIN | ACK);
			if (ret) {
				tcp_rst(c, conn);
				return ret;
			}

			conn_event(c, conn, TAP_FIN_SENT);
		}

		return 0;
	}

	/* Without peek offset, the bytes already sent were read again */
	if (!peek_offset_cap)
		len -= already_sent;

	if (len <= 0) {
		vu_queue_rewind(vq, iov_cnt);
		conn_flag(c, conn, STALLED);
		return 0;
	}

	conn_flag(c, conn, ~STALLED);

	/* Likely, some new data was acked too. */
	tcp_update_seqack_wnd(c, conn, false, NULL);

	/* initialize headers */
	hdrlen = tcp_vu_hdrlen(v6);
	iov_used = 0;
	num_buffers = 0;
	check = NULL;
	frame_size = 0;

	/* iov_vu is an array of buffers and the buffer size can be
	 * smaller than the frame size we want to use but with
	 * num_buffer we can merge several virtio iov buffers in one packet
	 * we need only to set the packet headers in the first iov and
	 * num_buffer to the number of iov entries
	 */
	for (i = 0; i < iov_cnt && len; i++) {
		/* A new frame starts here: remember its first buffer */
		if (frame_size == 0)
			first = &iov_vu[i + 1];

		/* Trim the buffer to the data actually received */
		if (iov_vu[i + 1].iov_len > (size_t)len)
			iov_vu[i + 1].iov_len = len;

		len -= iov_vu[i + 1].iov_len;
		iov_used++;

		frame_size += iov_vu[i + 1].iov_len;
		num_buffers++;

		if (frame_size >= mss || len == 0 ||
		    i + 1 == iov_cnt || !vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
			if (i + 1 == iov_cnt)
				check = NULL;

			/* restore first iovec base: point to vnet header */
			first->iov_base = (char *)first->iov_base - hdrlen;
			first->iov_len += hdrlen;
			vu_set_vnethdr(vdev, first->iov_base, num_buffers);

			tcp_vu_prepare(c, conn, first, frame_size, &check);
			if (*c->pcap) {
				tcp_vu_update_check(tapside, first, num_buffers);
				pcap_iov(first, num_buffers,
					 sizeof(struct virtio_net_hdr_mrg_rxbuf));
			}

			conn->seq_to_tap += frame_size;

			frame_size = 0;
			num_buffers = 0;
		}
	}

	/* release unused buffers */
	vu_queue_rewind(vq, iov_cnt - iov_used);

	/* send packets */
	vu_flush(vdev, vq, elem, iov_used);

	conn_flag(c, conn, ACK_FROM_TAP_DUE);

	return 0;
}

12
tcp_vu.h Normal file
View File

@ -0,0 +1,12 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright Red Hat
* Author: Laurent Vivier <lvivier@redhat.com>
*/
#ifndef TCP_VU_H
#define TCP_VU_H
/* Send a segment carrying only flags (no payload) to the vhost-user guest */
int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
/* Queue new data from the connection socket to the vhost-user guest */
int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
#endif /* TCP_VU_H */

11
udp.c
View File

@ -110,6 +110,7 @@
#include "log.h"
#include "flow_table.h"
#include "udp_internal.h"
#include "udp_vu.h"
/* "Spliced" sockets indexed by bound port (host order) */
static int udp_splice_ns [IP_VERSIONS][NUM_PORTS];
@ -628,6 +629,11 @@ void udp_listen_sock_handler(const struct ctx *c,
union epoll_ref ref, uint32_t events,
const struct timespec *now)
{
if (c->mode == MODE_VU) {
udp_vu_listen_sock_handler(c, ref, events, now);
return;
}
udp_buf_listen_sock_handler(c, ref, events, now);
}
@ -698,6 +704,11 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now)
{
if (c->mode == MODE_VU) {
udp_vu_reply_sock_handler(c, ref, events, now);
return;
}
udp_buf_reply_sock_handler(c, ref, events, now);
}

336
udp_vu.c Normal file
View File

@ -0,0 +1,336 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* udp_vu.c - UDP L2 vhost-user management functions
*
* Copyright Red Hat
* Author: Laurent Vivier <lvivier@redhat.com>
*/
#include <unistd.h>
#include <assert.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
#include <stdint.h>
#include <stddef.h>
#include <sys/uio.h>
#include <linux/virtio_net.h>
#include "checksum.h"
#include "util.h"
#include "ip.h"
#include "siphash.h"
#include "inany.h"
#include "passt.h"
#include "pcap.h"
#include "log.h"
#include "vhost_user.h"
#include "udp_internal.h"
#include "flow.h"
#include "flow_table.h"
#include "udp_flow.h"
#include "udp_vu.h"
#include "vu_common.h"
/* Scatter-gather entries backing the virtqueue elements below: entry 0 is the
* start of the frame being built, headers included
*/
static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE];
/* Virtqueue elements collected from the RX queue for the current datagram */
static struct vu_virtq_element elem [VIRTQUEUE_MAX_SIZE];
/**
 * udp_vu_hdrlen() - Size of all headers preceding the UDP payload in a frame
 * @v6:		Set for IPv6 packet
 *
 * The total covers the virtio-net header (mergeable RX buffers layout), the
 * Ethernet header, the IPv4 or IPv6 header and the UDP header.
 *
 * Return: size of the headers, in bytes
 */
static size_t udp_vu_hdrlen(bool v6)
{
	const size_t l3len = v6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr);

	return sizeof(struct virtio_net_hdr_mrg_rxbuf) +
	       sizeof(struct ethhdr) + sizeof(struct udphdr) + l3len;
}
/**
 * udp_vu_sock_init() - Peek at the source address of the next datagram
 * @s:		Socket to peek from
 * @s_in:	Filled with the source address reported by recvmsg() (output)
 *
 * No data buffer is supplied and the datagram is not dequeued (MSG_PEEK); the
 * call doesn't block (MSG_DONTWAIT).
 *
 * Return: value returned by recvmsg(), negative on failure
 */
static int udp_vu_sock_init(int s, union sockaddr_inany *s_in)
{
	struct msghdr peek = { 0 };

	peek.msg_name = s_in;
	peek.msg_namelen = sizeof(union sockaddr_inany);

	return recvmsg(s, &peek, MSG_PEEK | MSG_DONTWAIT);
}
/**
 * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers
 * @c:		Execution context
 * @s:		Socket to receive from
 * @events:	epoll events bitmap
 * @v6:		Set for IPv6 connections
 * @dlen:	Size of received data (output), only meaningful if the return
 *		value is not zero
 *
 * Return: Number of iov entries used to store the datagram, 0 if there is
 *	   nothing to read, no buffer is available, or recvmsg() failed
 */
static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events,
			    bool v6, ssize_t *dlen)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	int iov_cnt, idx, iov_used;
	struct msghdr msg = { 0 };
	size_t off, hdrlen;

	ASSERT(!c->no_udp);

	if (!(events & EPOLLIN))
		return 0;

	/* compute L2 header length */
	hdrlen = udp_vu_hdrlen(v6);

	vu_init_elem(elem, iov_vu, VIRTQUEUE_MAX_SIZE);

	iov_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE,
			     IP_MAX_MTU - sizeof(struct udphdr) + hdrlen,
			     NULL);
	if (iov_cnt == 0)
		return 0;

	/* reserve space for the headers: the first buffer must be able to
	 * hold all of them, otherwise the length below would wrap around
	 */
	ASSERT(iov_vu[0].iov_len >= hdrlen);
	iov_vu[0].iov_base = (char *)iov_vu[0].iov_base + hdrlen;
	iov_vu[0].iov_len -= hdrlen;

	/* read data from the socket */
	msg.msg_iov = iov_vu;
	msg.msg_iovlen = iov_cnt;

	*dlen = recvmsg(s, &msg, 0);
	if (*dlen < 0) {
		vu_queue_rewind(vq, iov_cnt);
		return 0;
	}

	/* restore the pointer to the headers address */
	iov_vu[0].iov_base = (char *)iov_vu[0].iov_base - hdrlen;
	iov_vu[0].iov_len += hdrlen;

	/* count the numbers of buffer filled by recvmsg() */
	idx = iov_skip_bytes(iov_vu, iov_cnt, *dlen + hdrlen, &off);

	/* adjust last iov length */
	if (idx < iov_cnt)
		iov_vu[idx].iov_len = off;
	/* a partially filled buffer counts as used */
	iov_used = idx + !!off;

	vu_set_vnethdr(vdev, iov_vu[0].iov_base, iov_used);

	/* release unused buffers */
	vu_queue_rewind(vq, iov_cnt - iov_used);

	return iov_used;
}
/**
 * udp_vu_prepare() - Prepare the packet header
 * @c:		Execution context
 * @toside:	Address information for one side of the flow
 * @dlen:	Packet data length
 *
 * Ethernet, IP and UDP headers are written at the start of the first buffer
 * of the frame being built. IPv4 is used only if both addresses of @toside
 * are IPv4 addresses, IPv6 otherwise.
 *
 * Return: Layer-4 length
 */
static size_t udp_vu_prepare(const struct ctx *c,
			     const struct flowside *toside, ssize_t dlen)
{
	char *frame = iov_vu[0].iov_base;
	struct ethhdr *l2 = vu_eth(frame);
	struct udp_payload_t *l4;

	/* ethernet header */
	memcpy(l2->h_dest, c->guest_mac, sizeof(l2->h_dest));
	memcpy(l2->h_source, c->our_tap_mac, sizeof(l2->h_source));

	/* IPv6 unless both addresses are IPv4 */
	if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
		struct ipv6hdr *ip6h = vu_ip(frame);

		l4 = vu_payloadv6(frame);
		l2->h_proto = htons(ETH_P_IPV6);
		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP);

		return udp_update_hdr6(ip6h, l4, toside, dlen, true);
	}

	struct iphdr *iph = vu_ip(frame);

	l4 = vu_payloadv4(frame);
	l2->h_proto = htons(ETH_P_IP);
	*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP);

	return udp_update_hdr4(iph, l4, toside, dlen, true);
}
/**
 * udp_vu_csum() - Calculate and set checksum for a UDP packet
 * @toside:	Address information for one side of the flow
 * @iov_used:	Number of entries of iov_vu holding the frame
 */
static void udp_vu_csum(const struct flowside *toside, int iov_used)
{
	const struct in_addr *src4 = inany_v4(&toside->oaddr);
	const struct in_addr *dst4 = inany_v4(&toside->eaddr);
	char *frame = iov_vu[0].iov_base;
	struct udp_payload_t *bp;

	/* IPv6 unless both addresses are IPv4 */
	if (!src4 || !dst4) {
		bp = vu_payloadv6(frame);
		/* Last argument: offset of the UDP data within the frame */
		csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6,
			  iov_vu, iov_used, (char *)&bp->data - frame);
		return;
	}

	bp = vu_payloadv4(frame);
	csum_udp4(&bp->uh, *src4, *dst4, iov_vu, iov_used,
		  (char *)&bp->data - frame);
}
/**
 * udp_vu_listen_sock_handler() - Handle new data from socket
 * @c:		Execution context
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 * @now:	Current timestamp
 *
 * Processes at most UDP_MAX_FRAMES datagrams per call, forwarding each one
 * to the vhost-user RX virtqueue.
 */
void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
				uint32_t events, const struct timespec *now)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	int frame;

	if (udp_sock_errs(c, ref.fd, events) < 0) {
		err("UDP: Unrecoverable error on listening socket:"
		    " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
		return;
	}

	for (frame = 0; frame < UDP_MAX_FRAMES; frame++) {
		const struct flowside *toside;
		union sockaddr_inany s_in;
		flow_sidx_t sidx;
		ssize_t dlen;
		int iov_used;
		uint8_t pif;
		bool v6;

		if (udp_vu_sock_init(ref.fd, &s_in) < 0)
			break;

		sidx = udp_flow_from_sock(c, ref, &s_in, now);
		pif = pif_at_sidx(sidx);

		/* NOTE(review): neither discard path below reads from the
		 * socket; whether the datagram has already been consumed
		 * depends on udp_vu_sock_init() -- confirm.
		 */
		if (pif != PIF_TAP && !flow_sidx_valid(sidx)) {
			debug("Discarding 1 datagram without flow");
			continue;
		}

		if (pif != PIF_TAP) {
			flow_sidx_t fromsidx = flow_sidx_opposite(sidx);
			struct udp_flow *uflow = udp_at_sidx(sidx);

			flow_err(uflow,
				 "No support for forwarding UDP from %s to %s",
				 pif_name(pif_at_sidx(fromsidx)),
				 pif_name(pif));
			continue;
		}

		toside = flowside_at_sidx(sidx);

		/* IPv6 unless both flow addresses are IPv4 */
		v6 = !inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr);

		iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &dlen);
		if (iov_used <= 0)
			break;

		udp_vu_prepare(c, toside, dlen);

		/* The checksum is only computed when capturing packets */
		if (*c->pcap) {
			udp_vu_csum(toside, iov_used);
			pcap_iov(iov_vu, iov_used,
				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
		}

		vu_flush(vdev, vq, elem, iov_used);
	}
}
/**
 * udp_vu_reply_sock_handler() - Handle new data from flow specific socket
 * @c:		Execution context
 * @ref:	epoll reference
 * @events:	epoll events bitmap
 * @now:	Current timestamp
 *
 * Processes at most UDP_MAX_FRAMES datagrams per call, forwarding each one
 * to the vhost-user RX virtqueue.
 */
void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
			       uint32_t events, const struct timespec *now)
{
	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
	const struct flowside *toside = flowside_at_sidx(tosidx);
	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	int from_s;
	int i;

	ASSERT(!c->no_udp);

	/* Validate the flow before it is dereferenced for the first time */
	ASSERT(uflow);
	from_s = uflow->s[ref.flowside.sidei];

	if (udp_sock_errs(c, from_s, events) < 0) {
		flow_err(uflow, "Unrecoverable error on reply socket");
		flow_err_details(uflow);
		udp_flow_close(c, uflow);
		return;
	}

	for (i = 0; i < UDP_MAX_FRAMES; i++) {
		uint8_t topif = pif_at_sidx(tosidx);
		ssize_t dlen;
		int iov_used;
		bool v6;

		/* NOTE(review): this path does not read from the socket, so
		 * the error is reported once per iteration -- confirm this
		 * is intended.
		 */
		if (topif != PIF_TAP) {
			uint8_t frompif = pif_at_sidx(ref.flowside);

			flow_err(uflow,
				 "No support for forwarding UDP from %s to %s",
				 pif_name(frompif), pif_name(topif));
			continue;
		}

		/* IPv6 unless both flow addresses are IPv4 */
		v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr));

		iov_used = udp_vu_sock_recv(c, from_s, events, v6, &dlen);
		if (iov_used <= 0)
			break;
		flow_trace(uflow, "Received 1 datagram on reply socket");
		uflow->ts = now->tv_sec;

		udp_vu_prepare(c, toside, dlen);

		/* The checksum is only computed when capturing packets */
		if (*c->pcap) {
			udp_vu_csum(toside, iov_used);
			pcap_iov(iov_vu, iov_used,
				 sizeof(struct virtio_net_hdr_mrg_rxbuf));
		}

		vu_flush(vdev, vq, elem, iov_used);
	}
}

13
udp_vu.h Normal file
View File

@ -0,0 +1,13 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright Red Hat
* Author: Laurent Vivier <lvivier@redhat.com>
*/
#ifndef UDP_VU_H
#define UDP_VU_H
/* Handle new data from a listening UDP socket, vhost-user mode */
void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
/* Handle new data from a flow specific UDP socket, vhost-user mode */
void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
#endif /* UDP_VU_H */

View File

@ -48,12 +48,13 @@
/* vhost-user version we are compatible with */
#define VHOST_USER_VERSION 1
static struct vu_dev vdev_storage;
/**
* vu_print_capabilities() - print vhost-user capabilities
* this is part of the vhost-user backend
* convention.
*/
/* cppcheck-suppress unusedFunction */
void vu_print_capabilities(void)
{
info("{");
@ -163,9 +164,7 @@ static void vmsg_close_fds(const struct vhost_user_msg *vmsg)
*/
static void vu_remove_watch(const struct vu_dev *vdev, int fd)
{
/* Placeholder to add passt related code */
/* NOTE(review): the placeholder comment above and the two (void) casts
 * below look stale now that both parameters are used -- confirm and drop.
 */
(void)vdev;
(void)fd;
/* Remove @fd from the epoll set of the owning context; the result of
 * epoll_ctl() is not checked.
 */
epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL);
}
/**
@ -487,6 +486,14 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
}
}
/* As vu_packet_check_range() has no access to the number of
* memory regions, mark the end of the array with mmap_addr = 0
*/
ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1);
vdev->regions[vdev->nregions].mmap_addr = 0;
tap_sock_update_pool(vdev->regions, 0);
return false;
}
@ -615,9 +622,16 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev,
*/
static void vu_set_watch(const struct vu_dev *vdev, int idx)
{
/* Placeholder to add passt related code */
/* NOTE(review): the placeholder comment above and the two (void) casts
 * below look stale now that both parameters are used -- confirm and drop.
 */
(void)vdev;
(void)idx;
/* epoll reference carrying the kick fd of virtqueue @idx and its index */
union epoll_ref ref = {
.type = EPOLL_TYPE_VHOST_KICK,
.fd = vdev->vq[idx].kick_fd,
.queue = idx
};
struct epoll_event ev = { 0 };
ev.data.u64 = ref.u64;
ev.events = EPOLLIN;
/* Watch the kick fd for input; the result of epoll_ctl() is not checked */
epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
}
/**
@ -674,7 +688,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
vdev->vq[idx].started = true;
if (vdev->vq[idx].kick_fd != -1 && VHOST_USER_IS_QUEUE_TX(idx)) {
vu_set_watch(vdev, vdev->vq[idx].kick_fd);
vu_set_watch(vdev, idx);
debug("Waiting for kicks on fd: %d for vq: %d",
vdev->vq[idx].kick_fd, idx);
}
@ -829,14 +843,14 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
* @c: execution context
* @vdev: vhost-user device
*/
/* cppcheck-suppress unusedFunction */
void vu_init(struct ctx *c, struct vu_dev *vdev)
void vu_init(struct ctx *c)
{
int i;
vdev->context = c;
c->vdev = &vdev_storage;
c->vdev->context = c;
for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) {
vdev->vq[i] = (struct vu_virtq){
c->vdev->vq[i] = (struct vu_virtq){
.call_fd = -1,
.kick_fd = -1,
.err_fd = -1,
@ -849,7 +863,6 @@ void vu_init(struct ctx *c, struct vu_dev *vdev)
* vu_cleanup() - Reset vhost-user device
* @vdev: vhost-user device
*/
/* cppcheck-suppress unusedFunction */
void vu_cleanup(struct vu_dev *vdev)
{
unsigned int i;
@ -896,8 +909,7 @@ void vu_cleanup(struct vu_dev *vdev)
*/
static void vu_sock_reset(struct vu_dev *vdev)
{
/* Placeholder to add passt related code */
/* NOTE(review): the placeholder comment above and the (void) cast below
 * look stale now that @vdev is used -- confirm and drop.
 */
(void)vdev;
/* Hand over to the tap layer, passing the execution context of the device */
tap_sock_reset(vdev->context);
}
static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
@ -925,7 +937,6 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
* @fd: vhost-user message socket
* @events: epoll events
*/
/* cppcheck-suppress unusedFunction */
void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
{
struct vhost_user_msg msg = { 0 };

View File

@ -183,7 +183,6 @@ struct vhost_user_msg {
*
* Return: true if the virqueue is enabled, false otherwise
*/
/* cppcheck-suppress unusedFunction */
static inline bool vu_queue_enabled(const struct vu_virtq *vq)
{
return vq->enable;
@ -195,14 +194,13 @@ static inline bool vu_queue_enabled(const struct vu_virtq *vq)
*
* Return: true if the virqueue is started, false otherwise
*/
/* cppcheck-suppress unusedFunction */
static inline bool vu_queue_started(const struct vu_virtq *vq)
{
return vq->started;
}
void vu_print_capabilities(void);
void vu_init(struct ctx *c, struct vu_dev *vdev);
void vu_init(struct ctx *c);
void vu_cleanup(struct vu_dev *vdev);
void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
#endif /* VHOST_USER_H */

View File

@ -328,7 +328,6 @@ static bool vring_can_notify(const struct vu_dev *dev, struct vu_virtq *vq)
* @dev: Vhost-user device
* @vq: Virtqueue
*/
/* cppcheck-suppress unusedFunction */
void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq)
{
if (!vq->vring.avail)
@ -504,7 +503,6 @@ static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned i
*
* Return: -1 if there is an error, 0 otherwise
*/
/* cppcheck-suppress unusedFunction */
int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem)
{
unsigned int head;
@ -565,7 +563,6 @@ void vu_queue_unpop(struct vu_virtq *vq)
* @vq: Virtqueue
* @num: Number of element to unpop
*/
/* cppcheck-suppress unusedFunction */
bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
{
if (num > vq->inuse)
@ -621,7 +618,6 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
* @len: Size of the element
* @idx: Used ring entry index
*/
/* cppcheck-suppress unusedFunction */
void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem,
unsigned int len, unsigned int idx)
{
@ -645,7 +641,6 @@ static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val)
* @vq: Virtqueue
* @count: Number of entry to flush
*/
/* cppcheck-suppress unusedFunction */
void vu_queue_flush(struct vu_virtq *vq, unsigned int count)
{
uint16_t old, new;

285
vu_common.c Normal file
View File

@ -0,0 +1,285 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright Red Hat
* Author: Laurent Vivier <lvivier@redhat.com>
*
* vu_common.c - vhost-user common UDP and TCP functions
*/
#include <unistd.h>
#include <sys/uio.h>
#include <sys/eventfd.h>
#include <linux/virtio_net.h>
#include "util.h"
#include "passt.h"
#include "tap.h"
#include "vhost_user.h"
#include "pcap.h"
#include "vu_common.h"
/**
 * vu_packet_check_range() - Check if a given memory zone is contained in
 *			     a mapped guest memory region
 * @buf:	Array of the available memory regions, ended by an entry
 *		whose mmap_addr is 0
 * @offset:	Offset of data range in packet descriptor
 * @len:	Length of desired data range
 * @start:	Start of the packet descriptor
 *
 * Return: 0 if the zone is in a mapped memory region, -1 otherwise
 */
int vu_packet_check_range(void *buf, size_t offset, size_t len,
			  const char *start)
{
	struct vu_dev_region *region;

	for (region = buf; region->mmap_addr; region++) {
		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
		char *base = (char *)region->mmap_addr;

		/* The descriptor starts below this mapping: try the next */
		if (start < base)
			continue;

		if (start + offset + len <=
		    base + region->mmap_offset + region->size)
			return 0;
	}

	return -1;
}
/**
* vu_init_elem() - initialize an array of virtqueue element with 1 iov in each
* @elem: Array of virtqueue element to initialize
* @iov: Array of iovec to assign to virtqueue element
* @elem_cnt: Number of virtqueue element
*/
void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt)
{
int i;
for (i = 0; i < elem_cnt; i++)
vu_set_element(&elem[i], NULL, &iov[i]);
}
/**
 * vu_collect() - collect virtio buffers from a given virtqueue
 * @vdev:	vhost-user device
 * @vq:		virtqueue to collect from
 * @elem:	Array of virtqueue element
 *		each element must be initialized with one iovec entry
 *		in the in_sg array.
 * @max_elem:	Number of virtqueue element in the array
 * @size:	Maximum size of the data in the frame
 * @frame_size:	If not NULL, set to the total size of the collected buffers
 *
 * Return: number of elements used to contain the frame
 */
int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
	       struct vu_virtq_element *elem, int max_elem,
	       size_t size, size_t *frame_size)
{
	size_t collected = 0;
	int n = 0;

	for (;;) {
		struct vu_virtq_element *cur;
		size_t room;

		if (collected >= size || n >= max_elem)
			break;

		cur = &elem[n];
		if (vu_queue_pop(vdev, vq, cur) < 0)
			break;

		if (cur->in_num < 1) {
			warn("virtio-net receive queue contains no in buffers");
			vu_queue_detach_element(vq);
			break;
		}

		/* Trim the buffer so that the total never exceeds @size */
		room = size - collected;
		if (cur->in_sg[0].iov_len > room)
			cur->in_sg[0].iov_len = room;

		collected += cur->in_sg[0].iov_len;
		n++;

		/* Without mergeable RX buffers, stop after one element */
		if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
			break;
	}

	if (frame_size)
		*frame_size = collected;

	return n;
}
/**
 * vu_set_vnethdr() - set virtio-net headers
 * @vdev:		vhost-user device
 * @vnethdr:		Address of the header to set
 * @num_buffers:	Number of guest buffers of the frame
 */
void vu_set_vnethdr(const struct vu_dev *vdev,
		    struct virtio_net_hdr_mrg_rxbuf *vnethdr,
		    int num_buffers)
{
	vnethdr->hdr = VU_HEADER;

	/* num_buffers is only written if mergeable RX buffers are in use */
	if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		return;

	vnethdr->num_buffers = htole16(num_buffers);
}
/**
* vu_flush() - flush all the collected buffers to the vhost-user interface
* @vdev: vhost-user device
* @vq: vhost-user virtqueue
* @elem: virtqueue element array to send back to the virqueue
* @iov_used: Length of the array
*/
void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
struct vu_virtq_element *elem, int elem_cnt)
{
int i;
for (i = 0; i < elem_cnt; i++)
vu_queue_fill(vq, &elem[i], elem[i].in_sg[0].iov_len, i);
vu_queue_flush(vq, elem_cnt);
vu_queue_notify(vdev, vq);
}
/**
 * vu_handle_tx() - Receive data from the TX virtqueue
 * @vdev:	vhost-user device
 * @index:	index of the virtqueue
 * @now:	Current timestamp
 *
 * Pops up to VIRTQUEUE_MAX_SIZE elements from the virtqueue, queues the
 * frame carried by each of them with tap_add_packet(), runs tap_handler()
 * and then hands the elements back through the used ring.
 */
static void vu_handle_tx(struct vu_dev *vdev, int index,
			 const struct timespec *now)
{
	struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
	struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
	struct vu_virtq *vq = &vdev->vq[index];
	const size_t hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	int out_sg_count;
	int count;

	if (!VHOST_USER_IS_QUEUE_TX(index)) {
		debug("vhost-user: index %d is not a TX queue", index);
		return;
	}

	tap_flush_pools();

	count = 0;
	out_sg_count = 0;
	while (count < VIRTQUEUE_MAX_SIZE) {
		const struct iovec *frame;
		int ret;

		vu_set_element(&elem[count], &out_sg[out_sg_count], NULL);
		ret = vu_queue_pop(vdev, vq, &elem[count]);
		if (ret < 0)
			break;
		out_sg_count += elem[count].out_num;

		if (elem[count].out_num < 1) {
			warn("virtio-net transmit queue contains no out buffers");
			/* Abandon the popped element the same way
			 * vu_collect() does for an unusable element
			 */
			vu_queue_detach_element(vq);
			break;
		}
		ASSERT(elem[count].out_num == 1);

		frame = &elem[count].out_sg[0];
		if (frame->iov_len < hdrlen) {
			/* The buffer length comes from the guest: without
			 * this check, the subtraction below would wrap
			 * around. The element is still counted, so it is
			 * handed back to the guest.
			 */
			warn("virtio-net transmit buffer shorter than header");
		} else {
			tap_add_packet(vdev->context,
				       frame->iov_len - hdrlen,
				       (char *)frame->iov_base + hdrlen);
		}
		count++;
	}
	tap_handler(vdev->context, now);

	if (count) {
		int i;

		/* Nothing was written to the guest buffers: length 0 */
		for (i = 0; i < count; i++)
			vu_queue_fill(vq, &elem[i], 0, i);
		vu_queue_flush(vq, count);
		vu_queue_notify(vdev, vq);
	}
}
/**
 * vu_kick_cb() - Called on a kick event to start to receive data
 * @vdev:	vhost-user device
 * @ref:	epoll reference information
 * @now:	Current timestamp
 *
 * Reads the kick eventfd referenced by @ref and, if the queue it refers to
 * is a TX queue, processes that queue. A failure to read the eventfd is
 * fatal.
 */
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
		const struct timespec *now)
{
	eventfd_t kick_data;
	ssize_t rc;

	rc = eventfd_read(ref.fd, &kick_data);
	if (rc == -1)
		die_perror("vhost-user kick eventfd_read()");

	debug("vhost-user: got kick_data: %016"PRIx64" idx:%d",
	      kick_data, ref.queue);
	if (VHOST_USER_IS_QUEUE_TX(ref.queue))
		vu_handle_tx(vdev, ref.queue, now);
}
/**
 * vu_send_single() - Send a buffer to the front-end using the RX virtqueue
 * @c:		execution context
 * @buf:	address of the buffer
 * @size:	size of the buffer
 *
 * Return: number of bytes sent (0 if the RX virtqueue is not usable yet),
 *	   -1 if there is not enough space in the virtqueue
 */
int vu_send_single(const struct ctx *c, const void *buf, size_t size)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
	struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
	size_t total;
	int elem_cnt;
	int i;

	debug("vu_send_single size %zu", size);

	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
		err("Got packet, but RX virtqueue not usable yet");
		return 0;
	}

	vu_init_elem(elem, in_sg, VIRTQUEUE_MAX_SIZE);

	/* The frame is preceded by the virtio-net header in guest buffers */
	size += sizeof(struct virtio_net_hdr_mrg_rxbuf);
	elem_cnt = vu_collect(vdev, vq, elem, VIRTQUEUE_MAX_SIZE, size, &total);
	if (total < size) {
		debug("vu_send_single: no space to send the data "
		      "elem_cnt %d size %zu", elem_cnt, total);
		goto err;
	}

	vu_set_vnethdr(vdev, in_sg[0].iov_base, elem_cnt);

	total -= sizeof(struct virtio_net_hdr_mrg_rxbuf);

	/* copy data from the buffer to the iovec */
	iov_from_buf(in_sg, elem_cnt, sizeof(struct virtio_net_hdr_mrg_rxbuf),
		     buf, total);

	if (*c->pcap) {
		pcap_iov(in_sg, elem_cnt,
			 sizeof(struct virtio_net_hdr_mrg_rxbuf));
	}

	vu_flush(vdev, vq, elem, elem_cnt);

	debug("vhost-user sent %zu", total);

	return total;
err:
	/* Abandon every element collected so far */
	for (i = 0; i < elem_cnt; i++)
		vu_queue_detach_element(vq);

	return -1;
}

60
vu_common.h Normal file
View File

@ -0,0 +1,60 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright Red Hat
* Author: Laurent Vivier <lvivier@redhat.com>
*
* vhost-user common UDP and TCP functions
*/
#ifndef VU_COMMON_H
#define VU_COMMON_H
#include <linux/virtio_net.h>
/**
 * vu_eth() - Locate the Ethernet header in a vhost-user frame buffer
 * @base:	Start of the buffer, where the virtio-net header is
 *
 * Return: address right after the virtio-net header
 */
static inline void *vu_eth(void *base)
{
	char *p = base;

	p += sizeof(struct virtio_net_hdr_mrg_rxbuf);

	return p;
}
/**
 * vu_ip() - Locate the IP header in a vhost-user frame buffer
 * @base:	Start of the buffer, where the virtio-net header is
 *
 * Return: address right after the Ethernet header
 */
static inline void *vu_ip(void *base)
{
	struct ethhdr *eh = vu_eth(base);

	return eh + 1;
}
/**
 * vu_payloadv4() - Locate what follows the IPv4 header in a vhost-user frame
 * @base:	Start of the buffer, where the virtio-net header is
 *
 * Return: address right after a struct iphdr placed at vu_ip(@base)
 */
static inline void *vu_payloadv4(void *base)
{
	struct iphdr *iph = vu_ip(base);

	return iph + 1;
}
/**
 * vu_payloadv6() - Locate what follows the IPv6 header in a vhost-user frame
 * @base:	Start of the buffer, where the virtio-net header is
 *
 * Return: address right after a struct ipv6hdr placed at vu_ip(@base)
 */
static inline void *vu_payloadv6(void *base)
{
	struct ipv6hdr *ip6h = vu_ip(base);

	return ip6h + 1;
}
/**
 * vu_set_element() - Initialize a vu_virtq_element
 * @elem:	Element to initialize
 * @out_sg:	One out iovec entry to set in elem, NULL for none
 * @in_sg:	One in iovec entry to set in elem, NULL for none
 */
static inline void vu_set_element(struct vu_virtq_element *elem,
				  struct iovec *out_sg, struct iovec *in_sg)
{
	/* Each count is 1 if the matching entry is given, 0 otherwise */
	elem->out_num = out_sg ? 1 : 0;
	elem->out_sg = out_sg;
	elem->in_num = in_sg ? 1 : 0;
	elem->in_sg = in_sg;
}
/* Initialize an array of virtqueue elements with one iovec in each */
void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov,
int elem_cnt);
/* Collect virtio buffers from a virtqueue, returns the number of elements */
int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
struct vu_virtq_element *elem, int max_elem, size_t size,
size_t *frame_size);
/* Set the virtio-net header of a frame */
void vu_set_vnethdr(const struct vu_dev *vdev,
struct virtio_net_hdr_mrg_rxbuf *vnethdr,
int num_buffers);
/* Flush the collected buffers to the vhost-user interface */
void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
struct vu_virtq_element *elem, int elem_cnt);
/* Called on a kick event to start to receive data */
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
const struct timespec *now);
/* Send a buffer to the front-end using the RX virtqueue */
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
#endif /* VU_COMMON_H */