mirror of
https://passt.top/passt
synced 2025-01-21 19:55:17 +00:00
tcp: Mask EPOLLIN altogether if we're blocked waiting on an ACK from the guest
There are pretty much two cases of the (misnomer) STALLED: in one case, we could send more data to the guest if it becomes available, and in another case, we can't, because we filled the window. If, in this second case, we keep EPOLLIN enabled, but never read from the socket, we get short but CPU-annoying storms of EPOLLIN events, upon which we reschedule the ACK timeout handler, never read from the socket, go back to epoll_wait(), and so on: timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0 epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1 timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0 epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1 timerfd_settime(76, 0, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=2, tv_nsec=0}}, NULL) = 0 epoll_wait(3, [{events=EPOLLIN, data={u32=10497, u64=38654716161}}], 8, 1000) = 1 also known as: 29.1517: Flow 2 (TCP connection): timer expires in 2.000s 29.1517: Flow 2 (TCP connection): timer expires in 2.000s 29.1517: Flow 2 (TCP connection): timer expires in 2.000s which, for some reason, becomes very visible with muvm and aria2c downloading from a server nearby in parallel chunks. That's because EPOLLIN isn't cleared if we don't read from the socket, and even with EPOLLET, epoll_wait() will repeatedly wake us up until we actually read something. In this case, we don't want to subscribe to EPOLLIN at all: all we're waiting for is an ACK segment from the guest. Differentiate this case with a new connection flag, ACK_FROM_TAP_BLOCKS, which doesn't just indicate that we're waiting for an ACK from the guest (ACK_FROM_TAP_DUE), but also that we're blocked waiting for it. If this flag is set before we set STALLED, EPOLLIN will be masked while we set EPOLLET because of STALLED. Whenever we clear STALLED, we also clear this flag. This is definitely not elegant, but it's a minimal fix. We can probably simplify this at a later point by having a category of connection flags directly corresponding to epoll flags, and dropping STALLED altogether, or, perhaps, always using EPOLLET (but we need a mechanism to re-check sockets for pending data if we can't temporarily write to the guest). I suspect that this might also be implied in https://github.com/containers/podman/issues/23686, hence the Link: tag. It doesn't necessarily mean I'm fixing it (I can't reproduce that). Link: https://github.com/containers/podman/issues/23686 Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
parent
b8f573cdc2
commit
a8f4fc481c
8
tcp.c
8
tcp.c
@ -345,7 +345,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
|
|||||||
|
|
||||||
static const char *tcp_flag_str[] __attribute((__unused__)) = {
|
static const char *tcp_flag_str[] __attribute((__unused__)) = {
|
||||||
"STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE",
|
"STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE",
|
||||||
"ACK_FROM_TAP_DUE",
|
"ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS",
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Listening sockets, used for automatic port forwarding in pasta mode only */
|
/* Listening sockets, used for automatic port forwarding in pasta mode only */
|
||||||
@ -436,8 +436,12 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
|
|||||||
if (events & TAP_FIN_SENT)
|
if (events & TAP_FIN_SENT)
|
||||||
return EPOLLET;
|
return EPOLLET;
|
||||||
|
|
||||||
if (conn_flags & STALLED)
|
if (conn_flags & STALLED) {
|
||||||
|
if (conn_flags & ACK_FROM_TAP_BLOCKS)
|
||||||
|
return EPOLLRDHUP | EPOLLET;
|
||||||
|
|
||||||
return EPOLLIN | EPOLLRDHUP | EPOLLET;
|
return EPOLLIN | EPOLLRDHUP | EPOLLET;
|
||||||
|
}
|
||||||
|
|
||||||
return EPOLLIN | EPOLLRDHUP;
|
return EPOLLIN | EPOLLRDHUP;
|
||||||
}
|
}
|
||||||
|
@ -309,6 +309,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
||||||
|
conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
|
||||||
conn_flag(c, conn, STALLED);
|
conn_flag(c, conn, STALLED);
|
||||||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||||
return 0;
|
return 0;
|
||||||
@ -387,6 +388,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
|
||||||
conn_flag(c, conn, ~STALLED);
|
conn_flag(c, conn, ~STALLED);
|
||||||
|
|
||||||
send_bufs = DIV_ROUND_UP(len, mss);
|
send_bufs = DIV_ROUND_UP(len, mss);
|
||||||
|
@ -77,6 +77,7 @@ struct tcp_tap_conn {
|
|||||||
#define ACTIVE_CLOSE BIT(2)
|
#define ACTIVE_CLOSE BIT(2)
|
||||||
#define ACK_TO_TAP_DUE BIT(3)
|
#define ACK_TO_TAP_DUE BIT(3)
|
||||||
#define ACK_FROM_TAP_DUE BIT(4)
|
#define ACK_FROM_TAP_DUE BIT(4)
|
||||||
|
#define ACK_FROM_TAP_BLOCKS BIT(5)
|
||||||
|
|
||||||
#define SNDBUF_BITS 24
|
#define SNDBUF_BITS 24
|
||||||
unsigned int sndbuf :SNDBUF_BITS;
|
unsigned int sndbuf :SNDBUF_BITS;
|
||||||
|
2
tcp_vu.c
2
tcp_vu.c
@ -381,6 +381,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
if (!wnd_scaled || already_sent >= wnd_scaled) {
|
||||||
|
conn_flag(c, conn, ACK_FROM_TAP_BLOCKS);
|
||||||
conn_flag(c, conn, STALLED);
|
conn_flag(c, conn, STALLED);
|
||||||
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
conn_flag(c, conn, ACK_FROM_TAP_DUE);
|
||||||
return 0;
|
return 0;
|
||||||
@ -423,6 +424,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS);
|
||||||
conn_flag(c, conn, ~STALLED);
|
conn_flag(c, conn, ~STALLED);
|
||||||
|
|
||||||
/* Likely, some new data was acked too. */
|
/* Likely, some new data was acked too. */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user