diff --git a/Makefile b/Makefile index 2f48c35..5a6692e 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,23 @@ CFLAGS += -DRLIMIT_STACK_VAL=$(shell ulimit -s) all: passt pasta passt4netns qrap -passt: passt.c passt.h arp.c arp.h dhcp.c dhcp.h dhcpv6.c dhcpv6.h pcap.c pcap.h ndp.c ndp.h siphash.c siphash.h tap.c tap.h icmp.c icmp.h tcp.c tcp.h udp.c udp.h util.c util.h - $(CC) $(CFLAGS) passt.c arp.c dhcp.c dhcpv6.c pcap.c ndp.c siphash.c tap.c icmp.c tcp.c udp.c util.c -o passt +avx2: CFLAGS += -Ofast -mavx2 -ftree-vectorize -funroll-loops +avx2: clean all + +avx2_debug: CFLAGS += -Ofast -mavx2 -ftree-vectorize -funroll-loops -DDEBUG -g +avx2_debug: clean all + +static: CFLAGS += -static +static: clean all + +debug: CFLAGS += -static -DDEBUG -g +debug: clean all + +passt: passt.c passt.h arp.c arp.h checksum.c checksum.h dhcp.c dhcp.h \ + dhcpv6.c dhcpv6.h pcap.c pcap.h ndp.c ndp.h siphash.c siphash.h \ + tap.c tap.h icmp.c icmp.h tcp.c tcp.h udp.c udp.h util.c util.h + $(CC) $(CFLAGS) passt.c arp.c checksum.c dhcp.c dhcpv6.c pcap.c ndp.c \ + siphash.c tap.c icmp.c tcp.c udp.c util.c -o passt pasta: passt ln -s passt pasta diff --git a/checksum.c b/checksum.c new file mode 100644 index 0000000..9c8a458 --- /dev/null +++ b/checksum.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later +// SPDX-License-Identifier: BSD-3-Clause + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * checksum.c - TCP/IP checksum routines + * + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio + * + * This file also contains code originally licensed under the following terms: + * + * Copyright (c) 2014-2016, The Regents of the University of California. + * Copyright (c) 2016-2017, Nefeli Networks, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of their + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * See the comment to csum_avx2() for further details. + */ + +#include +#include +#include +#include +#include +#include + +/** + * sum_16b() - Calculate sum of 16-bit words + * @buf: Input buffer + * @len: Buffer length + * + * Return: 32-bit sum of 16-bit words +*/ +uint32_t sum_16b(const void *buf, size_t len) +{ + const uint16_t *p = buf; + uint32_t sum = 0; + + while (len > 1) { + sum += *p++; + len -= 2; + } + + if (len > 0) + sum += *p & htons(0xff00); + + return sum; +} + +/** + * csum_fold() - Fold long sum for IP and TCP checksum + * @sum: Original long sum + * + * Return: 16-bit folded sum + */ +uint16_t csum_fold(uint32_t sum) +{ + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + + return sum; +} + +/** + * csum_unaligned() - Compute TCP/IP-style checksum for not 32-byte aligned data + * @buf: Input data + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 16-bit IPv4-style checksum + */ +uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init) +{ + return (uint16_t)~csum_fold(sum_16b(buf, len) + init); +} + +/** + * csum_tcp4() - Calculate TCP checksum for IPv4 and set in place + * @iph: Packet buffer, IP header + */ +void csum_tcp4(struct iphdr *iph) +{ + struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4); + uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th; + uint32_t sum = 0; + + sum += (iph->saddr >> 16) & 0xffff; + sum += iph->saddr & 0xffff; + sum += (iph->daddr >> 16) & 0xffff; + sum += iph->daddr & 0xffff; + + sum += htons(IPPROTO_TCP); + sum += htons(tlen); + + th->check = 0; + while (tlen > 1) { + sum += *p++; + tlen -= 2; + } + + if (tlen > 0) { + sum += *p & htons(0xff00); + } + + th->check = (uint16_t)~csum_fold(sum); +} + +#ifdef __AVX2__ +#include + +/** + * csum_avx2() - Compute 32-bit checksum using AVX2 SIMD instructions + * @buf: Input buffer, must be aligned to 32-byte boundary + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 32-bit checksum, not complemented, not folded + * + * This implementation is mostly sourced from BESS ("Berkeley Extensible + * Software Switch"), core/utils/checksum.h, distributed under the terms of the + * 3-Clause BSD license. Notable changes: + * - input buffer data is loaded (streamed) with a non-temporal aligned hint + * (VMOVNTDQA, _mm256_stream_load_si256() intrinsic) instead of the original + * unaligned load with temporal hint (VMOVDQU, _mm256_loadu_si256() intrinsic) + * given that the input buffer layout guarantees 32-byte alignment of TCP and + * UDP headers, and that the data is not used immediately afterwards, reducing + * cache pollution significantly and latency (e.g. on Intel Skylake: 0 instead + * of 7) + * - replace the ADCQ implementation for the portion remaining after the + * checksum computation for 128-byte blocks by a load/unpack/add loop on a + * single stream, and do the rest with a for loop, auto-vectorisation seems to + * outperforms the original hand-coded loop there + * - sum_a/sum_b unpacking is interleaved and not sequential to reduce stalls + * - coding style adaptation + */ +static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init) +{ + __m256i zero, a, b, sum256, sum_a_hi, sum_a_lo, sum_b_hi, sum_b_lo; + const uint64_t *buf64 = (const uint64_t *)buf; + const __m256i *buf256 = (__m256i *)buf64; + const uint16_t *buf16; + uint64_t sum64 = init; + int odd = len & 1; + __m128i sum128; + + zero = _mm256_setzero_si256(); + buf256 = (const __m256i *)buf64; + + if (len < sizeof(__m256i) * 4) + goto less_than_128_bytes; + + + /* We parallelize two ymm streams to minimize register dependency: + * + * a: buf256, buf256 + 2, ... + * b: buf256 + 1, buf256 + 3, ... + */ + a = _mm256_stream_load_si256(buf256); + b = _mm256_stream_load_si256(buf256 + 1); + + /* For each stream, accumulate unpackhi and unpacklo in parallel (as + * 4x64bit vectors, so that each upper 0000 can hold carries): + * + * 32B data: aaaaAAAA bbbbBBBB ccccCCCC ddddDDDD (1 letter: 1 byte) + * unpackhi: bbbb0000 BBBB0000 dddd0000 DDDD0000 + * unpacklo: aaaa0000 AAAA0000 cccc0000 CCCC0000 + */ + sum_a_hi = _mm256_unpackhi_epi32(a, zero); + sum_b_hi = _mm256_unpackhi_epi32(b, zero); + sum_a_lo = _mm256_unpacklo_epi32(a, zero); + sum_b_lo = _mm256_unpacklo_epi32(b, zero); + + len -= sizeof(__m256i) * 2; + buf256 += 2; + + for (; len >= sizeof(a) * 2; len -= sizeof(a) * 2, buf256 += 2) { + a = _mm256_stream_load_si256(buf256); + b = _mm256_stream_load_si256(buf256 + 1); + + sum_a_hi = _mm256_add_epi64(sum_a_hi, + _mm256_unpackhi_epi32(a, zero)); + sum_b_hi = _mm256_add_epi64(sum_b_hi, + _mm256_unpackhi_epi32(b, zero)); + sum_a_lo = _mm256_add_epi64(sum_a_lo, + _mm256_unpacklo_epi32(a, zero)); + sum_b_lo = _mm256_add_epi64(sum_b_lo, + _mm256_unpacklo_epi32(b, zero)); + } + + /* Fold four 256bit sums into one 128-bit sum. TODO */ + sum256 = _mm256_add_epi64(_mm256_add_epi64(sum_a_hi, sum_a_lo), + _mm256_add_epi64(sum_b_hi, sum_b_lo)); + sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0), + _mm256_extracti128_si256(sum256, 1)); + + /* Fold 128-bit sum into 64 bits. */ + sum64 += _mm_extract_epi64(sum128, 0) + _mm_extract_epi64(sum128, 1); + buf64 = (const uint64_t *)buf256; + +less_than_128_bytes: + for (; len >= sizeof(a); len -= sizeof(a), buf256++) { + a = _mm256_stream_load_si256(buf256); + + sum_a_hi = _mm256_unpackhi_epi32(a, zero); + sum_a_lo = _mm256_unpacklo_epi32(a, zero); + + sum256 = _mm256_add_epi64(sum_a_hi, sum_a_lo); + sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0), + _mm256_extracti128_si256(sum256, 1)); + + sum64 += _mm_extract_epi64(sum128, 0); + sum64 += _mm_extract_epi64(sum128, 1); + } + buf64 = (const uint64_t *)buf256; + + /* Repeat 16-bit one's complement sum (at sum64). */ + buf16 = (const uint16_t *)buf64; + while (len >= sizeof(uint16_t)) { + sum64 += *buf16++; + len -= sizeof(uint16_t); + } + + /* Add remaining 8 bits to the one's complement sum. */ + if (odd) + sum64 += *(const uint8_t *)buf16; + + /* Reduce 64-bit unsigned int to 32-bit unsigned int. */ + sum64 = (sum64 >> 32) + (sum64 & 0xffffffff); + sum64 += sum64 >> 32; + + return (uint32_t)sum64; +} + +/** + * csum() - Compute TCP/IP-style checksum + * @buf: Input buffer, must be aligned to 32-byte boundary + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 16-bit folded, complemented checksum sum + */ +uint16_t csum(const void *buf, size_t len, uint32_t init) +{ + return (uint16_t)~csum_fold(csum_avx2(buf, len, init)); +} + +#else /* __AVX2__ */ + +/** + * csum() - Compute TCP/IP-style checksum + * @buf: Input buffer + * @len: Input length + * @sum: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 16-bit folded, complemented checksum + */ +uint16_t csum(const void *buf, size_t len, uint32_t init) +{ + return csum_unaligned(buf, len, init); +} + +#endif /* !__AVX2__ */ diff --git a/dhcp.c b/dhcp.c index 4a83627..ed5df27 100644 --- a/dhcp.c +++ b/dhcp.c @@ -24,6 +24,7 @@ #include #include +#include "checksum.h" #include "util.h" #include "passt.h" #include "dhcp.h" @@ -320,7 +321,7 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len) iph->daddr = c->addr4; iph->saddr = c->gw4; iph->check = 0; - iph->check = csum_ip4(iph, iph->ihl * 4); + iph->check = csum_unaligned(iph, iph->ihl * 4, 0); len += sizeof(*eh); memcpy(eh->h_dest, eh->h_source, ETH_ALEN); diff --git a/ndp.c b/ndp.c index acc0473..b676825 100644 --- a/ndp.c +++ b/ndp.c @@ -27,6 +27,7 @@ #include #include +#include "checksum.h" #include "util.h" #include "passt.h" #include "tap.h" @@ -172,8 +173,8 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len) ip6hr->payload_len = htons(sizeof(*ihr) + len); ip6hr->hop_limit = IPPROTO_ICMPV6; ihr->icmp6_cksum = 0; - ihr->icmp6_cksum = csum_ip4(ip6hr, sizeof(*ip6hr) + - sizeof(*ihr) + len); + ihr->icmp6_cksum = csum_unaligned(ip6hr, sizeof(*ip6hr) + + sizeof(*ihr) + len, 0); ip6hr->version = 6; ip6hr->nexthdr = IPPROTO_ICMPV6; diff --git a/tap.c b/tap.c index 2af1c7a..08b85ff 100644 --- a/tap.c +++ b/tap.c @@ -41,6 +41,7 @@ #include #include +#include "checksum.h" #include "util.h" #include "passt.h" #include "arp.h" @@ -122,7 +123,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, memcpy(&iph->saddr, &src->s6_addr[12], 4); iph->check = 0; - iph->check = csum_ip4(iph, iph->ihl * 4); + iph->check = csum_unaligned(iph, iph->ihl * 4, 0); memcpy(data, in, len); diff --git a/udp.c b/udp.c index 30659e0..00a34a9 100644 --- a/udp.c +++ b/udp.c @@ -110,6 +110,7 @@ #include #include +#include "checksum.h" #include "util.h" #include "passt.h" #include "tap.h" @@ -210,6 +211,11 @@ udp4_l2_buf[UDP_TAP_FRAMES] = { */ __extension__ struct udp6_l2_buf_t { struct sockaddr_in6 s_in6; +#ifdef __AVX2__ + /* Align ip6h to 32-byte boundary. */ + uint8_t pad[64 - (sizeof(struct sockaddr_in6) + sizeof(struct ethhdr) + + sizeof(uint32_t))]; +#endif uint32_t vnet_len; struct ethhdr eh; @@ -217,10 +223,18 @@ __extension__ struct udp6_l2_buf_t { struct udphdr uh; uint8_t data[USHRT_MAX - (sizeof(struct ipv6hdr) + sizeof(struct udphdr))]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))) +#else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +#endif udp6_l2_buf[UDP_TAP_FRAMES] = { [ 0 ... UDP_TAP_FRAMES - 1 ] = { - { 0 }, 0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_UDP), + { 0 }, +#ifdef __AVX2__ + { 0 }, +#endif + 0, L2_BUF_ETH_IP6_INIT, L2_BUF_IP6_INIT(IPPROTO_UDP), { 0 }, { 0 }, }, }; @@ -656,7 +670,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, b->ip6h.version = 0; b->ip6h.nexthdr = 0; b->uh.check = 0; - b->uh.check = csum_ip4(&b->ip6h, ip_len); + b->uh.check = csum(&b->ip6h, ip_len, 0); b->ip6h.version = 6; b->ip6h.nexthdr = IPPROTO_UDP; b->ip6h.hop_limit = 255; diff --git a/util.c b/util.c index 4d4661a..33894f7 100644 --- a/util.c +++ b/util.c @@ -76,86 +76,6 @@ logfn(info, LOG_INFO) logfn(debug, LOG_DEBUG) #endif -/** - * sum_16b() - Calculate sum of 16-bit words - * @buf: Input buffer - * @len: Buffer length - * - * Return: 32-bit sum of 16-bit words -*/ -uint32_t sum_16b(void *buf, size_t len) -{ - uint32_t sum = 0; - uint16_t *p = buf; - size_t len1 = len / 2; - size_t off; - - for (off = 0; off < len1; off++, p++) - sum += *p; - - if (len % 2) - sum += *p & 0xff; - - return sum; -} - -/** - * csum_fold() - Fold long sum for IP and TCP checksum - * @sum: Original long sum - * - * Return: 16-bit folded sum - */ -uint16_t csum_fold(uint32_t sum) -{ - while (sum >> 16) - sum = (sum & 0xffff) + (sum >> 16); - - return sum; -} - -/** - * csum_ip4() - Calculate IPv4 checksum - * @buf: Packet buffer, L3 headers - * @len: Total L3 packet length - * - * Return: 16-bit IPv4-style checksum - */ -uint16_t csum_ip4(void *buf, size_t len) -{ - return ~csum_fold(sum_16b(buf, len)); -} - -/** - * csum_tcp4() - Calculate TCP checksum for IPv4 and set in place - * @iph: Packet buffer, IP header - */ -void csum_tcp4(struct iphdr *iph) -{ - struct tcphdr *th = (struct tcphdr *)((char *)iph + iph->ihl * 4); - uint16_t tlen = ntohs(iph->tot_len) - iph->ihl * 4, *p = (uint16_t *)th; - uint32_t sum = 0; - - sum += (iph->saddr >> 16) & 0xffff; - sum += iph->saddr & 0xffff; - sum += (iph->daddr >> 16) & 0xffff; - sum += iph->daddr & 0xffff; - - sum += htons(IPPROTO_TCP); - sum += htons(tlen); - - th->check = 0; - while (tlen > 1) { - sum += *p++; - tlen -= 2; - } - - if (tlen > 0) { - sum += *p & htons(0xff00); - } - - th->check = (uint16_t)~csum_fold(sum); -} - /** * ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol * @ip6h: IPv6 header diff --git a/util.h b/util.h index 7fbce1f..1c11474 100644 --- a/util.h +++ b/util.h @@ -117,10 +117,6 @@ void debug(const char *format, ...); struct ctx; -uint32_t sum_16b(void *buf, size_t len); -uint16_t csum_fold(uint32_t sum); -uint16_t csum_ip4(void *buf, size_t len); -void csum_tcp4(struct iphdr *iph); char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto); int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, int lo, uint32_t data);