Message ID | 20221124093000.3869344-1-emma.finn@intel.com |
---|---|
State | Superseded |
Headers | show |
Series | [ovs-dev,v4] odp-execute: Add ISA implementation of set_masked IPv6 action | expand |
Context | Check | Description |
---|---|---|
ovsrobot/apply-robot | success | apply and check: success |
ovsrobot/github-robot-_Build_and_Test | success | github build: passed |
ovsrobot/intel-ovs-compilation | success | test: success |
On 24 Nov 2022, at 10:30, Emma Finn wrote: > This commit adds support for the AVX512 implementation of the > ipv6_set_addrs action as well as an AVX512 implementation of > updating the L4 checksums. > > Signed-off-by: Emma Finn <emma.finn@intel.com> Thanks Emma for the v4, I have one question and a couple of style issues. To speed things up I just provide the diff for the style issues. I was not able to do any actual testing, as my system did not have the avx512vbmi extension :( Cheers, Eelco > --- Style issues diff: diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 82ff7e647..f798d6708 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -20,9 +20,9 @@ #include <config.h> #include <errno.h> -#include <sys/types.h> #include <netinet/in.h> #include <netinet/ip6.h> +#include <sys/types.h> #include "csum.h" #include "dp-packet.h" @@ -547,8 +547,8 @@ avx512_ipv6_sum_header(__m512i ip6_header) * horizontal add. */ __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF); - v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); v_delta = _mm256_hadd_epi32(v_delta, v_zeros); v_delta = _mm256_hadd_epi16(v_delta, v_zeros); @@ -562,7 +562,7 @@ avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header) { uint16_t old_delta = avx512_ipv6_sum_header(old_header); uint16_t new_delta = avx512_ipv6_sum_header(new_header); - uint32_t csum_delta = (uint16_t)~old_delta + new_delta; + uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta; return ~csum_finish(csum_delta); } @@ -606,15 +606,15 @@ action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch, __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask); - /* Set the v_zero register to all zero's. */ const __m128i v_zeros = _mm_setzero_si128(); + /* Set the v_all_ones register to all one's. */ const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros); - /* Load ip6 src and dst respectively into 128-bit wide registers. */ + /* Load ip6 src and dst masks respectively into 128-bit wide registers. */ __m128i v_src = _mm_loadu_si128((void *) mask); - __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask); + __m128i v_dst = _mm_maskz_loadu_epi64(0xC, (void *) mask); /* Perform a bitwise OR between src and dst registers. */ __m128i v_or = _mm_or_si128(v_src, v_dst); > v4: > - Reworked and moved check for checksum outside loop. > - Code cleanup based on review from Eelco. > v3: > - Added a runtime check for AVX512 vbmi. > v2: > - Added check for availbility of s6_addr32 field of struct in6_addr. > - Fixed network headers for freebsd builds. > <SNIP> > + /* Load ip6 src and dst respectively into 128-bit wide registers. */ > + __m128i v_src = _mm_loadu_si128((void *) mask); > + __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask); Guess it might be me, but I do not understand how _mm_maskz_loadu_epi64() will load the dst from the mask. Looking at the intrinsics guide it will only read the first two 64-bit values, but mask points to src? Should we not just do the following here? + __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src); + __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst); > + > + /* Perform a bitwise OR between src and dst registers. */ > + __m128i v_or = _mm_or_si128(v_src, v_dst); > + > + /* Will return true if any bit has been set in v_or, else it will return > + * false. */ > + bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones); > + > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); > + > + /* Load the 40 bytes of the IPv6 header. */ > + __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh); > + > + /* AND the v_pkt_mask to the packet data (v_packet). */ > + __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet); > + > + /* OR the new addresses (v_key_shuf) with the masked packet addresses > + * (v_pkt_masked). */ > + __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked); > + > + /* If ip6_src or ip6_dst has been modified, L4 checksum needs to > + * be updated. */ > + if (do_checksum) { > + uint8_t proto = nh->ip6_nxt; > + uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, > + v_new_hdr); > + > + if (proto == IPPROTO_UDP) { > + struct udp_header *uh = dp_packet_l4(packet); > + > + if (uh->udp_csum) { > + uint16_t old_udp_checksum = ~uh->udp_csum; > + uint32_t udp_checksum = old_udp_checksum + delta_checksum; > + > + udp_checksum = csum_finish(udp_checksum); > + > + if (!udp_checksum) { > + udp_checksum = htons(0xffff); > + } > + > + uh->udp_csum = udp_checksum; > + } > + } else if (proto == IPPROTO_TCP) { > + struct tcp_header *th = dp_packet_l4(packet); > + uint16_t old_tcp_checksum = ~th->tcp_csum; > + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; > + > + tcp_checksum = csum_finish(tcp_checksum); > + th->tcp_csum = tcp_checksum; > + } else if (proto == IPPROTO_ICMPV6) { > + struct icmp6_header *icmp = dp_packet_l4(packet); > + uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum; > + uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum; > + > + icmp6_checksum = csum_finish(icmp6_checksum); > + icmp->icmp6_cksum = icmp6_checksum; > + } > + } > + /* Write back the modified IPv6 addresses. */ > + _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr); > + } > +} > +#endif /* HAVE_AVX512VBMI */ > + > static void > action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a) > { > @@ -514,6 +711,13 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED) > impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs; > impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs; > > +#if HAVE_AVX512VBMI > + if (action_avx512vbmi_isa_probe()) { > + impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = > + action_avx512_ipv6_set_addrs; > + } > +#endif > + > return 0; > } > > diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c > index f80ae5a23..8b86b1e4f 100644 > --- a/lib/odp-execute-private.c > +++ b/lib/odp-execute-private.c > @@ -60,6 +60,23 @@ action_avx512_isa_probe(void) > > #endif > > +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI > +bool > +action_avx512vbmi_isa_probe(void) > +{ > + if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) { > + return true; > + } > + return false; > +} > +#else > +bool > +action_avx512vbmi_isa_probe(void) > +{ > + return false; > +} > +#endif > + > static struct odp_execute_action_impl action_impls[] = { > [ACTION_IMPL_AUTOVALIDATOR] = { > .available = false, > diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h > index 940180c99..643f41c2a 100644 > --- a/lib/odp-execute-private.h > +++ b/lib/odp-execute-private.h > @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1); > #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1) > > bool action_avx512_isa_probe(void); > +bool action_avx512vbmi_isa_probe(void); > > /* Odp execute init handles setting up the state of the actions functions at > * initialization time. It cannot return errors, as it must always succeed in > -- > 2.25.1
> -----Original Message----- > From: Eelco Chaudron <echaudro@redhat.com> > Sent: Friday 25 November 2022 12:26 > To: Finn, Emma <emma.finn@intel.com> > Cc: dev@openvswitch.org; Van Haaren, Harry > <harry.van.haaren@intel.com> > Subject: Re: [v4] odp-execute: Add ISA implementation of set_masked IPv6 > action > > > > On 24 Nov 2022, at 10:30, Emma Finn wrote: > > > This commit adds support for the AVX512 implementation of the > > ipv6_set_addrs action as well as an AVX512 implementation of updating > > the L4 checksums. > > > > Signed-off-by: Emma Finn <emma.finn@intel.com> > > Thanks Emma for the v4, I have one question and a couple of style issues. To > speed things up I just provide the diff for the style issues. > > I was not able to do any actual testing, as my system did not have the > avx512vbmi extension :( > > Cheers, > > Eelco > > > --- > > Style issues diff: > > diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index > 82ff7e647..f798d6708 100644 > --- a/lib/odp-execute-avx512.c > +++ b/lib/odp-execute-avx512.c > @@ -20,9 +20,9 @@ > > #include <config.h> > #include <errno.h> > -#include <sys/types.h> > #include <netinet/in.h> > #include <netinet/ip6.h> > +#include <sys/types.h> > For FreeBSD, network headers must be included in a certain order. See this commit for details: b2befd5bb2db ("sparse: Add guards to prevent FreeBSD-incompatible #include order.") So these need to stay in this order. > #include "csum.h" > #include "dp-packet.h" > @@ -547,8 +547,8 @@ avx512_ipv6_sum_header(__m512i ip6_header) > * horizontal add. */ > __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, > 0xF, 0xF, 0xF, 0xF); > - v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); > > + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); > v_delta = _mm256_hadd_epi32(v_delta, v_zeros); > v_delta = _mm256_hadd_epi16(v_delta, v_zeros); > > @@ -562,7 +562,7 @@ avx512_ipv6_addr_csum_delta(__m512i old_header, > __m512i new_header) { > uint16_t old_delta = avx512_ipv6_sum_header(old_header); > uint16_t new_delta = avx512_ipv6_sum_header(new_header); > - uint32_t csum_delta = (uint16_t)~old_delta + new_delta; > + uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta; > > return ~csum_finish(csum_delta); > } > @@ -606,15 +606,15 @@ action_avx512_ipv6_set_addrs(struct > dp_packet_batch *batch, > __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); > __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask); > > - > /* Set the v_zero register to all zero's. */ > const __m128i v_zeros = _mm_setzero_si128(); > + > /* Set the v_all_ones register to all one's. */ > const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros); > > - /* Load ip6 src and dst respectively into 128-bit wide registers. */ > + /* Load ip6 src and dst masks respectively into 128-bit wide > + registers. */ > __m128i v_src = _mm_loadu_si128((void *) mask); > - __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask); > + __m128i v_dst = _mm_maskz_loadu_epi64(0xC, (void *) mask); > > /* Perform a bitwise OR between src and dst registers. */ > __m128i v_or = _mm_or_si128(v_src, v_dst); > > > v4: > > - Reworked and moved check for checksum outside loop. > > - Code cleanup based on review from Eelco. > > v3: > > - Added a runtime check for AVX512 vbmi. > > v2: > > - Added check for availbility of s6_addr32 field of struct in6_addr. > > - Fixed network headers for freebsd builds. > > > > <SNIP> > > > + /* Load ip6 src and dst respectively into 128-bit wide registers. */ > > + __m128i v_src = _mm_loadu_si128((void *) mask); > > + __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask); > > Guess it might be me, but I do not understand how > _mm_maskz_loadu_epi64() will load the dst from the mask. > Looking at the intrinsics guide it will only read the first two 64-bit values, but > mask points to src? > > Should we not just do the following here? > > + __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src); > + __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst); > Yes, good catch. The _maskz_load_ isn't correctly pointing to the dst values from the mask. I will make all the changes above (except the include header comment) and send v5 shortly. <SNIP>
On 25 Nov 2022, at 16:20, Finn, Emma wrote: >> -----Original Message----- >> From: Eelco Chaudron <echaudro@redhat.com> >> Sent: Friday 25 November 2022 12:26 >> To: Finn, Emma <emma.finn@intel.com> >> Cc: dev@openvswitch.org; Van Haaren, Harry >> <harry.van.haaren@intel.com> >> Subject: Re: [v4] odp-execute: Add ISA implementation of set_masked IPv6 >> action >> >> >> >> On 24 Nov 2022, at 10:30, Emma Finn wrote: >> >>> This commit adds support for the AVX512 implementation of the >>> ipv6_set_addrs action as well as an AVX512 implementation of updating >>> the L4 checksums. >>> >>> Signed-off-by: Emma Finn <emma.finn@intel.com> >> >> Thanks Emma for the v4, I have one question and a couple of style issues. To >> speed things up I just provide the diff for the style issues. >> >> I was not able to do any actual testing, as my system did not have the >> avx512vbmi extension :( >> >> Cheers, >> >> Eelco >> >>> --- >> >> Style issues diff: >> >> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index >> 82ff7e647..f798d6708 100644 >> --- a/lib/odp-execute-avx512.c >> +++ b/lib/odp-execute-avx512.c >> @@ -20,9 +20,9 @@ >> >> #include <config.h> >> #include <errno.h> >> -#include <sys/types.h> >> #include <netinet/in.h> >> #include <netinet/ip6.h> >> +#include <sys/types.h> >> > For FreeBSD, network headers must be included in a certain order. > See this commit for details: > b2befd5bb2db ("sparse: Add guards to prevent FreeBSD-incompatible #include order.") > So these need to stay in this order. Oops, yes I did not test with FreeBSD (or windows), I need to figure out how to do that in the future ;) >> #include "csum.h" >> #include "dp-packet.h" >> @@ -547,8 +547,8 @@ avx512_ipv6_sum_header(__m512i ip6_header) >> * horizontal add. */ >> __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, >> 0xF, 0xF, 0xF, 0xF); >> - v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); >> >> + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); >> v_delta = _mm256_hadd_epi32(v_delta, v_zeros); >> v_delta = _mm256_hadd_epi16(v_delta, v_zeros); >> >> @@ -562,7 +562,7 @@ avx512_ipv6_addr_csum_delta(__m512i old_header, >> __m512i new_header) { >> uint16_t old_delta = avx512_ipv6_sum_header(old_header); >> uint16_t new_delta = avx512_ipv6_sum_header(new_header); >> - uint32_t csum_delta = (uint16_t)~old_delta + new_delta; >> + uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta; >> >> return ~csum_finish(csum_delta); >> } >> @@ -606,15 +606,15 @@ action_avx512_ipv6_set_addrs(struct >> dp_packet_batch *batch, >> __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); >> __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask); >> >> - >> /* Set the v_zero register to all zero's. */ >> const __m128i v_zeros = _mm_setzero_si128(); >> + >> /* Set the v_all_ones register to all one's. */ >> const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros); >> >> - /* Load ip6 src and dst respectively into 128-bit wide registers. */ >> + /* Load ip6 src and dst masks respectively into 128-bit wide >> + registers. */ >> __m128i v_src = _mm_loadu_si128((void *) mask); >> - __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask); >> + __m128i v_dst = _mm_maskz_loadu_epi64(0xC, (void *) mask); >> >> /* Perform a bitwise OR between src and dst registers. */ >> __m128i v_or = _mm_or_si128(v_src, v_dst); >> >>> v4: >>> - Reworked and moved check for checksum outside loop. >>> - Code cleanup based on review from Eelco. >>> v3: >>> - Added a runtime check for AVX512 vbmi. >>> v2: >>> - Added check for availbility of s6_addr32 field of struct in6_addr. >>> - Fixed network headers for freebsd builds. >>> >> >> <SNIP> >> >>> + /* Load ip6 src and dst respectively into 128-bit wide registers. */ >>> + __m128i v_src = _mm_loadu_si128((void *) mask); >>> + __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask); >> >> Guess it might be me, but I do not understand how >> _mm_maskz_loadu_epi64() will load the dst from the mask. >> Looking at the intrinsics guide it will only read the first two 64-bit values, but >> mask points to src? >> >> Should we not just do the following here? >> >> + __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src); >> + __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst); >> > > Yes, good catch. The _maskz_load_ isn't correctly pointing to the dst values from the mask. > > I will make all the changes above (except the include header comment) and send v5 shortly. Thanks, take your time and enjoy the weekend! > <SNIP>
diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 6c7713251..82ff7e647 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -20,6 +20,9 @@ #include <config.h> #include <errno.h> +#include <sys/types.h> +#include <netinet/in.h> +#include <netinet/ip6.h> #include "csum.h" #include "dp-packet.h" @@ -75,6 +78,26 @@ BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) + MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) == offsetof(struct ovs_key_ipv4, ipv4_ttl)); +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_src) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_src) == + offsetof(struct ovs_key_ipv6, ipv6_dst)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_dst) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_dst) == + offsetof(struct ovs_key_ipv6, ipv6_label)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_label) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_label) == + offsetof(struct ovs_key_ipv6, ipv6_proto)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_proto) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_proto) == + offsetof(struct ovs_key_ipv6, ipv6_tclass)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_tclass) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_tclass) == + offsetof(struct ovs_key_ipv6, ipv6_hlimit)); + /* Array of callback functions, one for each masked operation. */ odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX]; @@ -483,6 +506,180 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, } } +#if HAVE_AVX512VBMI +static inline uint16_t ALWAYS_INLINE +__attribute__((__target__("avx512vbmi"))) +avx512_ipv6_sum_header(__m512i ip6_header) +{ + __m256i v_zeros = _mm256_setzero_si256(); + __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04, + 0xFF, 0xFF, 0xFF, 0xFF); + + /* Shuffle ip6 src and dst to beginning of register. */ + __m512i v_ip6_hdr_shuf = _mm512_permutexvar_epi64(v_shuf_src_dst, + ip6_header); + + /* Extract ip6 src and dst into smaller 256-bit wide register. */ + __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_ip6_hdr_shuf, 0); + + /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the + * src and dst fields and add padding after each 16-bit value for the + * following carry over addition. */ + __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF, + 0x0504, 0xFFFF, 0x0706, 0xFFFF, + 0x0100, 0xFFFF, 0x0302, 0xFFFF, + 0x0504, 0xFFFF, 0x0706, 0xFFFF); + __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF, + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF, + 0x0908, 0xFFFF, 0x0B0A, 0xFFFF, + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF); + __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a); + __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b); + + /* Add each part of the old and new headers together. */ + __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2); + + /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */ + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + + /* Shuffle 32-bit value from 3rd lane into first lane for final + * horizontal add. */ + __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, + 0xF, 0xF, 0xF, 0xF); + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); + + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_hadd_epi16(v_delta, v_zeros); + + /* Extract delta value. */ + return _mm256_extract_epi16(v_delta, 0); +} + +static inline uint16_t ALWAYS_INLINE +__attribute__((__target__("avx512vbmi"))) +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header) +{ + uint16_t old_delta = avx512_ipv6_sum_header(old_header); + uint16_t new_delta = avx512_ipv6_sum_header(new_header); + uint32_t csum_delta = (uint16_t)~old_delta + new_delta; + + return ~csum_finish(csum_delta); +} + +/* This function performs the same operation on each packet in the batch as + * the scalar odp_set_ipv6() function. */ +static void +__attribute__((__target__("avx512vbmi"))) +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch, + const struct nlattr *a) +{ + const struct ovs_key_ipv6 *key, *mask; + struct dp_packet *packet; + + a = nl_attr_get(a); + key = nl_attr_get(a); + mask = odp_get_key_mask(a, struct ovs_key_ipv6); + + /* Read the content of the key and mask in the respective registers. We + * only load the size of the actual structure, which is only 40 bytes. */ + __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key); + __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask); + + /* This shuffle mask v_shuffle, is to shuffle key and mask to match the + * ip6_hdr structure layout. */ + static const uint8_t ip_shuffle_mask[64] = { + 0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF + }; + + __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask); + + /* This shuffle is required for key and mask to match the layout of the + * ip6_hdr struct. */ + __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); + __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask); + + + /* Set the v_zero register to all zero's. */ + const __m128i v_zeros = _mm_setzero_si128(); + /* Set the v_all_ones register to all one's. */ + const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros); + + /* Load ip6 src and dst respectively into 128-bit wide registers. */ + __m128i v_src = _mm_loadu_si128((void *) mask); + __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask); + + /* Perform a bitwise OR between src and dst registers. */ + __m128i v_or = _mm_or_si128(v_src, v_dst); + + /* Will return true if any bit has been set in v_or, else it will return + * false. */ + bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones); + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); + + /* Load the 40 bytes of the IPv6 header. */ + __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh); + + /* AND the v_pkt_mask to the packet data (v_packet). */ + __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet); + + /* OR the new addresses (v_key_shuf) with the masked packet addresses + * (v_pkt_masked). */ + __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked); + + /* If ip6_src or ip6_dst has been modified, L4 checksum needs to + * be updated. */ + if (do_checksum) { + uint8_t proto = nh->ip6_nxt; + uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, + v_new_hdr); + + if (proto == IPPROTO_UDP) { + struct udp_header *uh = dp_packet_l4(packet); + + if (uh->udp_csum) { + uint16_t old_udp_checksum = ~uh->udp_csum; + uint32_t udp_checksum = old_udp_checksum + delta_checksum; + + udp_checksum = csum_finish(udp_checksum); + + if (!udp_checksum) { + udp_checksum = htons(0xffff); + } + + uh->udp_csum = udp_checksum; + } + } else if (proto == IPPROTO_TCP) { + struct tcp_header *th = dp_packet_l4(packet); + uint16_t old_tcp_checksum = ~th->tcp_csum; + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; + + tcp_checksum = csum_finish(tcp_checksum); + th->tcp_csum = tcp_checksum; + } else if (proto == IPPROTO_ICMPV6) { + struct icmp6_header *icmp = dp_packet_l4(packet); + uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum; + uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum; + + icmp6_checksum = csum_finish(icmp6_checksum); + icmp->icmp6_cksum = icmp6_checksum; + } + } + /* Write back the modified IPv6 addresses. */ + _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr); + } +} +#endif /* HAVE_AVX512VBMI */ + static void action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a) { @@ -514,6 +711,13 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED) impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs; impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs; +#if HAVE_AVX512VBMI + if (action_avx512vbmi_isa_probe()) { + impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = + action_avx512_ipv6_set_addrs; + } +#endif + return 0; } diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c index f80ae5a23..8b86b1e4f 100644 --- a/lib/odp-execute-private.c +++ b/lib/odp-execute-private.c @@ -60,6 +60,23 @@ action_avx512_isa_probe(void) #endif +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI +bool +action_avx512vbmi_isa_probe(void) +{ + if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) { + return true; + } + return false; +} +#else +bool +action_avx512vbmi_isa_probe(void) +{ + return false; +} +#endif + static struct odp_execute_action_impl action_impls[] = { [ACTION_IMPL_AUTOVALIDATOR] = { .available = false, diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h index 940180c99..643f41c2a 100644 --- a/lib/odp-execute-private.h +++ b/lib/odp-execute-private.h @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1); #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1) bool action_avx512_isa_probe(void); +bool action_avx512vbmi_isa_probe(void); /* Odp execute init handles setting up the state of the actions functions at * initialization time. It cannot return errors, as it must always succeed in
This commit adds support for the AVX512 implementation of the ipv6_set_addrs action as well as an AVX512 implementation of updating the L4 checksums. Signed-off-by: Emma Finn <emma.finn@intel.com> --- v4: - Reworked and moved check for checksum outside loop. - Code cleanup based on review from Eelco. v3: - Added a runtime check for AVX512 vbmi. v2: - Added check for availbility of s6_addr32 field of struct in6_addr. - Fixed network headers for freebsd builds. --- --- lib/odp-execute-avx512.c | 204 ++++++++++++++++++++++++++++++++++++++ lib/odp-execute-private.c | 17 ++++ lib/odp-execute-private.h | 1 + 3 files changed, 222 insertions(+)