1
0
mirror of https://passt.top/passt synced 2024-07-02 07:52:41 +00:00

tcp: Proper error handling for sendmmsg() to UNIX domain socket

As data from socket is forwarded to the guest, sendmmsg() might send
fewer bytes than requested in three different ways:

- failing altogether with a negative error code -- ignore that,
  we'll get an error on the UNIX domain socket later if there's
  really an issue with it and reset the connection to the guest

- sending less than 'vlen' messages -- instead of assuming success
  in that case and waiting for the guest to send a duplicate ACK
  indicating missing data, update the sequence number according to
  what was actually sent and spare some retransmissions

- somewhat unexpectedly to me, sending 'vlen' or less than 'vlen'
  messages, returning up to 'vlen', with the last message being
  partially sent, and no further indication of errors other than
  the returned msg_len for the last partially sent message being
  less than iov_len.

  In this case, we would assume success and proceed as nothing
  happened. However, qemu would fail to parse any further message,
  having received a partial descriptor, and eventually close the
  connection, logging:

	serious error: oversized packet received,connection terminated.

  as the length descriptor for the next message would be sourced
  from the middle of the next successfully sent message, not from
  its header.

  Handle this by checking the msg_len returned for the last (even
  partially) sent message, and force re-sending the missing bytes,
  if any, with a blocking sendmsg() -- qemu must not receive
  anything else than that anyway.

While at it, allow to send up to 64KiB for each message, the
previous 32KiB limit isn't actually required, and just switch to a
new message at each iteration on sending buffers, they are already
MSS-sized anyway, so the check in the loop isn't really needed.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
Stefano Brivio 2021-08-26 14:37:48 +02:00
parent cc2ebfd5f2
commit d2272f74f7

103
tcp.c
View File

@ -601,7 +601,12 @@ static struct iovec tcp6_l2_iov_tap [TCP_TAP_FRAMES];
static struct msghdr tcp4_l2_mh_sock;
static struct msghdr tcp6_l2_mh_sock;
static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES];
__extension__
static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES] = {
[ 0 ... TCP_TAP_FRAMES - 1 ] = {
.msg_hdr.msg_iovlen = 1,
},
};
/* sendmsg() to socket */
static struct iovec tcp_tap_iov [TAP_MSGS];
@ -1358,8 +1363,15 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr,
if (conn->mss_guest < 0)
conn->mss_guest = MSS_DEFAULT;
if (c->mode == MODE_PASST && c->v6 && conn->mss_guest > SHRT_MAX)
conn->mss_guest = SHRT_MAX;
if (c->mode == MODE_PASST) {
/* Don't upset qemu */
conn->mss_guest = MIN(USHRT_MAX -
sizeof(uint32_t) -
sizeof(struct ethhdr) -
sizeof(struct ipv6hdr) -
sizeof(struct tcphdr),
conn->mss_guest);
}
sl = sizeof(conn->mss_guest);
setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl);
@ -1534,15 +1546,16 @@ static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
struct timespec *now)
{
int mss_tap, fill_bufs, send_bufs = 0, last_len, msg_len, iov_rem = 0;
int *buf_mss, *buf_mss_nr_set, *buf_mss_tap, *buf_mss_tap_nr_set;
int mss_tap, fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
int send, len, plen, v4 = IN6_IS_ADDR_V4MAPPED(&conn->a.a6);
uint32_t seq_to_tap = conn->seq_to_tap;
socklen_t sl = sizeof(struct tcp_info);
struct mmsghdr *mh = tcp_l2_mh_tap;
int s = conn->sock, i, ret = 0;
struct iovec *iov, *iov_tap;
uint32_t already_sent;
struct tcp_info info;
struct mmsghdr *mh;
already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
@ -1644,15 +1657,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
info.tcpi_snd_wnd = conn->tcpi_snd_wnd;
}
if (v4)
mh->msg_hdr.msg_iov = tcp4_l2_iov_tap;
else
mh->msg_hdr.msg_iov = tcp6_l2_iov_tap;
mh->msg_hdr.msg_iovlen = 0;
plen = conn->mss_guest;
msg_len = 0;
for (i = 0; i < send_bufs; i++) {
int iov_len, ip_len;
for (i = 0, mh = tcp_l2_mh_tap; i < send_bufs; i++, mh++) {
int ip_len;
if (i == send_bufs - 1)
plen = last_len;
@ -1673,7 +1680,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
b->th.source = htons(conn->sock_port);
b->th.dest = htons(conn->tap_port);
b->th.seq = htonl(conn->seq_to_tap);
b->th.seq = htonl(seq_to_tap);
b->th.ack_seq = htonl(conn->seq_ack_to_tap);
if (conn->no_snd_wnd) {
@ -1696,6 +1703,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
}
b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);
mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i];
} else {
struct tcp6_l2_buf_t *b = &tcp6_l2_buf[i];
uint32_t flow = conn->seq_init_to_tap;
@ -1713,7 +1722,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
b->th.source = htons(conn->sock_port);
b->th.dest = htons(conn->tap_port);
b->th.seq = htonl(conn->seq_to_tap);
b->th.seq = htonl(seq_to_tap);
b->th.ack_seq = htonl(conn->seq_ack_to_tap);
if (conn->no_snd_wnd) {
@ -1741,32 +1750,45 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn,
}
b->vnet_len = htonl(sizeof(struct ethhdr) + ip_len);
mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i];
}
iov_len = sizeof(uint32_t) + sizeof(struct ethhdr) + ip_len;
/* Switch to a new message if this one is too long for qemu. */
if (msg_len && msg_len + iov_len > SHRT_MAX) {
mh++;
mh->msg_hdr.msg_iovlen = 0;
msg_len = 0;
if (v4)
mh->msg_hdr.msg_iov = &tcp4_l2_iov_tap[i];
else
mh->msg_hdr.msg_iov = &tcp6_l2_iov_tap[i];
}
mh->msg_hdr.msg_iovlen++;
msg_len += iov_len;
conn->seq_to_tap += plen;
seq_to_tap += plen;
}
if (c->mode == MODE_PASTA)
goto out;
sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1,
MSG_NOSIGNAL | MSG_DONTWAIT);
pcapmm(tcp_l2_mh_tap, mh - tcp_l2_mh_tap + 1);
ret = sendmmsg(c->fd_tap, tcp_l2_mh_tap, mh - tcp_l2_mh_tap,
MSG_NOSIGNAL | MSG_DONTWAIT);
if (ret <= 0)
goto out;
conn->seq_to_tap += conn->mss_guest * (ret - 1) + last_len;
/* sendmmsg() indicates how many messages were sent at least partially.
* Kernel commit 3023898b7d4a ("sock: fix sendmmsg for partial sendmsg")
* gives us the guarantee that at most one message, namely the last sent
* one, might have been sent partially. Check how many bytes of that
* message were sent, and re-send any missing bytes with a blocking
* sendmsg(), otherwise qemu will fail to parse any subsequent message.
*/
mh = &tcp_l2_mh_tap[ret - 1];
if (mh->msg_len < mh->msg_hdr.msg_iov->iov_len) {
uint8_t **iov_base = (uint8_t **)&mh->msg_hdr.msg_iov->iov_base;
int part_sent = mh->msg_len;
mh->msg_hdr.msg_iov->iov_len -= part_sent;
*iov_base += part_sent;
sendmsg(c->fd_tap, &mh->msg_hdr, MSG_NOSIGNAL);
mh->msg_hdr.msg_iov->iov_len += part_sent;
*iov_base -= part_sent;
}
pcapmm(tcp_l2_mh_tap, ret);
goto out;
@ -2027,10 +2049,15 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr,
if (conn->mss_guest < 0)
conn->mss_guest = MSS_DEFAULT;
/* Don't upset qemu */
if (c->mode == MODE_PASST && c->v6 &&
conn->mss_guest > SHRT_MAX)
conn->mss_guest = SHRT_MAX;
if (c->mode == MODE_PASST) {
/* Don't upset qemu */
conn->mss_guest = MIN(USHRT_MAX -
sizeof(uint32_t) -
sizeof(struct ethhdr) -
sizeof(struct ipv6hdr) -
sizeof(struct tcphdr),
conn->mss_guest);
}
ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL);
if (ws > MAX_WS) {