diff --git a/tcp.c b/tcp.c index 75b959a..78c4add 100644 --- a/tcp.c +++ b/tcp.c @@ -373,6 +373,9 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; char tcp_buf_discard [MAX_WINDOW]; +/* Does the kernel support TCP_PEEK_OFF? */ +bool peek_offset_cap; + /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -388,6 +391,25 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +/** + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported + * @s: Socket to update + * @offset: Offset in bytes + * + * Return: -1 when it fails, 0 otherwise. + */ +int tcp_set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return 0; + + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) { + err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s); + return -1; + } + return 0; +} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1947,6 +1969,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); conn->seq_to_tap = max_ack_seq; + if (tcp_set_peek_offset(conn->sock, 0)) { + tcp_rst(c, conn); + return -1; + } tcp_data_from_sock(c, conn); } @@ -2039,6 +2065,10 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); + if (tcp_set_peek_offset(conn->sock, 0)) { + tcp_rst(c, conn); + return; + } /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. @@ -2119,6 +2149,8 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); + if (tcp_set_peek_offset(conn->sock, 0)) + goto reset; if (th->fin) { conn->seq_from_tap++; @@ -2367,8 +2399,12 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn); - tcp_timer_ctl(c, conn); + if (tcp_set_peek_offset(conn->sock, 0)) { + tcp_rst(c, conn); + } else { + tcp_data_from_sock(c, conn); + tcp_timer_ctl(c, conn); + } } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -2659,7 +2695,8 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned b; + unsigned int b, optv = 0; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -2683,6 +2720,17 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn_perror("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + peek_offset_cap = true; + close(s); + } + info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); + return 0; } diff --git a/tcp.h b/tcp.h index a9b8bf8..e9ff019 100644 --- a/tcp.h +++ b/tcp.h @@ -24,6 +24,9 @@ void tcp_timer(struct ctx *c, const struct timespec *now); void tcp_defer_handler(struct ctx *c); void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); +int tcp_set_peek_offset(int s, int offset); + +extern bool peek_offset_cap; /** * union tcp_epoll_ref - epoll reference portion for TCP connections diff --git a/tcp_buf.c b/tcp_buf.c index ff9f797..11dce02 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -408,6 +408,7 @@ int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) uint32_t already_sent, seq; struct iovec *iov; + /* How much have we read/sent since last received ack ? */ already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; if (SEQ_LT(already_sent, 0)) { @@ -416,6 +417,10 @@ int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; + if (tcp_set_peek_offset(s, 0)) { + tcp_rst(c, conn); + return -1; + } } if (!wnd_scaled || already_sent >= wnd_scaled) { @@ -433,11 +438,16 @@ int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) iov_rem = (wnd_scaled - already_sent) % mss; } - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; + /* Prepare iov according to kernel capability */ + if (!peek_offset_cap) { + mh_sock.msg_iov = iov_sock; + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + mh_sock.msg_iovlen = fill_bufs + 1; + } else { + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen = fill_bufs; + } if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { @@ -478,7 +488,10 @@ int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; + if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0;