diff mbox series

[ovs-dev,v5] odp-execute: Add ISA implementation of set_masked IPv6 action

Message ID 20221125162318.489909-1-emma.finn@intel.com
State Superseded
Headers show
Series [ovs-dev,v5] odp-execute: Add ISA implementation of set_masked IPv6 action | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation fail test: fail

Commit Message

Finn, Emma Nov. 25, 2022, 4:23 p.m. UTC
This commit adds support for the AVX512 implementation of the
ipv6_set_addrs action as well as an AVX512 implementation of
updating the L4 checksums.

Signed-off-by: Emma Finn <emma.finn@intel.com>

---
v5:
  - Fixed load for ip6 src and dst mask for checksum check.
v4:
  - Reworked and moved check for checksum outside loop.
  - Code cleanup based on review from Eelco.
v3:
  - Added a runtime check for AVX512 vbmi.
v2:
  - Added check for availbility of s6_addr32 field of struct in6_addr.
  - Fixed network headers for freebsd builds.
---
---
 lib/odp-execute-avx512.c  | 204 ++++++++++++++++++++++++++++++++++++++
 lib/odp-execute-private.c |  17 ++++
 lib/odp-execute-private.h |   1 +
 3 files changed, 222 insertions(+)

Comments

Ilya Maximets Nov. 25, 2022, 5:22 p.m. UTC | #1
On 11/25/22 17:23, Emma Finn wrote:
> This commit adds support for the AVX512 implementation of the
> ipv6_set_addrs action as well as an AVX512 implementation of
> updating the L4 checksums.
> 
> Signed-off-by: Emma Finn <emma.finn@intel.com>
> 
> ---
> v5:
>   - Fixed load for ip6 src and dst mask for checksum check.
> v4:
>   - Reworked and moved check for checksum outside loop.
>   - Code cleanup based on review from Eelco.
> v3:
>   - Added a runtime check for AVX512 vbmi.
> v2:
>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>   - Fixed network headers for freebsd builds.
> ---
> ---
>  lib/odp-execute-avx512.c  | 204 ++++++++++++++++++++++++++++++++++++++
>  lib/odp-execute-private.c |  17 ++++
>  lib/odp-execute-private.h |   1 +
>  3 files changed, 222 insertions(+)

Hi, Emma.  Thanks for the patch!
I didn't review the actual AVX512 code, but I have a couple of
questions and nits inline.

Best regards, Ilya Maximets.

> 
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index 6c7713251..df0b31ffd 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -20,6 +20,9 @@
>  
>  #include <config.h>
>  #include <errno.h>
> +#include <sys/types.h>
> +#include <netinet/in.h>
> +#include <netinet/ip6.h>
>  
>  #include "csum.h"
>  #include "dp-packet.h"
> @@ -75,6 +78,26 @@ BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
>                    MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
>                    offsetof(struct ovs_key_ipv4, ipv4_ttl));
>  
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_src) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_src) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_dst));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_dst) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_dst) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_label));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_label) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_label) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_proto));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_proto) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_proto) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_tclass));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_tclass) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_tclass) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_hlimit));
> +
>  /* Array of callback functions, one for each masked operation. */
>  odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX];
>  
> @@ -483,6 +506,180 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
>      }
>  }
>  
> +#if HAVE_AVX512VBMI
> +static inline uint16_t ALWAYS_INLINE
> +__attribute__((__target__("avx512vbmi")))
> +avx512_ipv6_sum_header(__m512i ip6_header)
> +{
> +    __m256i v_zeros = _mm256_setzero_si256();
> +    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
> +                                               0xFF, 0xFF, 0xFF, 0xFF);
> +
> +    /* Shuffle ip6 src and dst to beginning of register. */
> +    __m512i v_ip6_hdr_shuf = _mm512_permutexvar_epi64(v_shuf_src_dst,
> +                                                      ip6_header);
> +
> +    /* Extract ip6 src and dst into smaller 256-bit wide register. */
> +    __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_ip6_hdr_shuf, 0);
> +
> +    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
> +     * src and dst fields and add padding after each 16-bit value for the
> +     * following carry over addition. */
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
> +                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
> +                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
> +    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a);
> +    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b);
> +
> +    /* Add each part of the old and new headers together. */
> +    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final
> +     * horizontal add. */
> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);
> +
> +    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
> +
> +    /* Extract delta value. */
> +    return _mm256_extract_epi16(v_delta, 0);
> +}
> +
> +static inline uint16_t ALWAYS_INLINE
> +__attribute__((__target__("avx512vbmi")))
> +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
> +{
> +    uint16_t old_delta = avx512_ipv6_sum_header(old_header);
> +    uint16_t new_delta = avx512_ipv6_sum_header(new_header);
> +    uint32_t csum_delta = ((uint16_t)~old_delta) + new_delta;
> +
> +    return  ~csum_finish(csum_delta);
> +}
> +
> +/* This function performs the same operation on each packet in the batch as
> + * the scalar odp_set_ipv6() function. */

I'm not sure if that statement is correct.  If you'll look
at the odp_set_ipv6() implementation and precisely at the
packet_set_ipv6() implementation, there is a check for the
routing extension header combined with the check for the
fragmentation header (packet_rh_present) to prevent writing
into L4 fields that do not exist or, in case of routing
header being present, checksum should not be updated for the
destination address.

Could you point me to the AVX512 code that is responsible
for that check?

> +static void
> +__attribute__((__target__("avx512vbmi")))
> +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
> +                             const struct nlattr *a)

Name of a function is a bit confusing.  Doesn't it also
set tclass, proto, etc. ?

> +{
> +    const struct ovs_key_ipv6 *key, *mask;
> +    struct dp_packet *packet;
> +
> +    a = nl_attr_get(a);
> +    key = nl_attr_get(a);
> +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
> +
> +    /* Read the content of the key and mask in the respective registers. We
> +     * only load the size of the actual structure, which is only 40 bytes. */
> +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
> +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
> +
> +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
> +     * ip6_hdr structure layout. */
> +    static const uint8_t ip_shuffle_mask[64] = {
> +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
> +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
> +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
> +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
> +    };
> +
> +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
> +
> +    /* This shuffle is required for key and mask to match the layout of the
> +     * ip6_hdr struct. */
> +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
> +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
> +
> +    /* Set the v_zero register to all zero's. */
> +    const __m128i v_zeros = _mm_setzero_si128();
> +
> +    /* Set the v_all_ones register to all one's. */
> +    const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
> +
> +    /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
> +    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
> +    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
> +
> +    /* Perform a bitwise OR between src and dst registers. */
> +    __m128i v_or = _mm_or_si128(v_src, v_dst);
> +
> +    /* Will return true if any bit has been set in v_or, else it will return
> +     * false. */
> +    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
> +
> +        /* Load the 40 bytes of the IPv6 header. */
> +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
> +
> +        /* AND the v_pkt_mask to the packet data (v_packet). */
> +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
> +
> +        /* OR the new addresses (v_key_shuf) with the masked packet addresses
> +         * (v_pkt_masked). */
> +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
> +
> +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
> +         * be updated. */
> +        if (do_checksum) {
> +            uint8_t proto = nh->ip6_nxt;
> +            uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
> +                                                                  v_new_hdr);
> +
> +            if (proto == IPPROTO_UDP) {
> +                struct udp_header *uh = dp_packet_l4(packet);
> +
> +                if (uh->udp_csum) {
> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> +                    uint32_t udp_checksum = old_udp_checksum + delta_checksum;
> +
> +                    udp_checksum = csum_finish(udp_checksum);
> +
> +                    if (!udp_checksum) {
> +                        udp_checksum = htons(0xffff);
> +                    }
> +
> +                    uh->udp_csum = udp_checksum;
> +                }
> +            } else if (proto == IPPROTO_TCP) {
> +                struct tcp_header *th = dp_packet_l4(packet);
> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> +                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
> +
> +                tcp_checksum = csum_finish(tcp_checksum);
> +                th->tcp_csum = tcp_checksum;
> +            } else if (proto == IPPROTO_ICMPV6) {
> +                struct icmp6_header *icmp = dp_packet_l4(packet);
> +                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
> +                uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum;
> +
> +                icmp6_checksum = csum_finish(icmp6_checksum);
> +                icmp->icmp6_cksum = icmp6_checksum;
> +            }
> +        }
> +        /* Write back the modified IPv6 addresses. */
> +         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);

Overindented.

> +    }
> +}
> +#endif /* HAVE_AVX512VBMI */
> +
>  static void
>  action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
>  {
> @@ -514,6 +711,13 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
>      impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs;
>      impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
>  
> +#if HAVE_AVX512VBMI
> +    if (action_avx512vbmi_isa_probe()) {
> +        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
> +                              action_avx512_ipv6_set_addrs;
> +    }
> +#endif
> +
>      return 0;
>  }
>  
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index f80ae5a23..8b86b1e4f 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -60,6 +60,23 @@ action_avx512_isa_probe(void)
>  
>  #endif
>  
> +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
> +        return true;
> +    }
> +    return false;

Hmm,  should this be just:
    return cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI);
?

> +}
> +#else
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    return false;
> +}
> +#endif
> +
>  static struct odp_execute_action_impl action_impls[] = {
>      [ACTION_IMPL_AUTOVALIDATOR] = {
>          .available = false,
> diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
> index 940180c99..643f41c2a 100644
> --- a/lib/odp-execute-private.h
> +++ b/lib/odp-execute-private.h
> @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
>  #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
>  
>  bool action_avx512_isa_probe(void);
> +bool action_avx512vbmi_isa_probe(void);
>  
>  /* Odp execute init handles setting up the state of the actions functions at
>   * initialization time. It cannot return errors, as it must always succeed in
Finn, Emma Nov. 29, 2022, 2:09 p.m. UTC | #2
> -----Original Message-----
> From: Ilya Maximets <i.maximets@ovn.org>
> Sent: Friday 25 November 2022 17:22
> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org
> Cc: i.maximets@ovn.org; Eelco Chaudron <echaudro@redhat.com>
> Subject: Re: [ovs-dev] [v5] odp-execute: Add ISA implementation of
> set_masked IPv6 action
> 
> On 11/25/22 17:23, Emma Finn wrote:
> > This commit adds support for the AVX512 implementation of the
> > ipv6_set_addrs action as well as an AVX512 implementation of updating
> > the L4 checksums.
> >
> > Signed-off-by: Emma Finn <emma.finn@intel.com>
> >
> > ---
> > v5:
> >   - Fixed load for ip6 src and dst mask for checksum check.
> > v4:
> >   - Reworked and moved check for checksum outside loop.
> >   - Code cleanup based on review from Eelco.
> > v3:
> >   - Added a runtime check for AVX512 vbmi.
> > v2:
> >   - Added check for availbility of s6_addr32 field of struct in6_addr.
> >   - Fixed network headers for freebsd builds.
> > ---
> > ---
> >  lib/odp-execute-avx512.c  | 204
> > ++++++++++++++++++++++++++++++++++++++
> >  lib/odp-execute-private.c |  17 ++++
> >  lib/odp-execute-private.h |   1 +
> >  3 files changed, 222 insertions(+)
> 
> Hi, Emma.  Thanks for the patch!
> I didn't review the actual AVX512 code, but I have a couple of questions and
> nits inline.
> 

Thanks Ilya.
My replies are inline below. 

<SNIP>
> > +
> > +/* This function performs the same operation on each packet in the
> > +batch as
> > + * the scalar odp_set_ipv6() function. */
> 
> I'm not sure if that statement is correct.  If you'll look at the odp_set_ipv6()
> implementation and precisely at the
> packet_set_ipv6() implementation, there is a check for the routing extension
> header combined with the check for the fragmentation header
> (packet_rh_present) to prevent writing into L4 fields that do not exist or, in
> case of routing header being present, checksum should not be updated for
> the destination address.
> 
> Could you point me to the AVX512 code that is responsible for that check?
> 
I think the AVX code is handling this case the same as scalar and also I cannot reproduce a failure with the autovalidator.
If I am following the scalar code correctly, you're right. If there is a routing extension header present, for the dst address no checksum will happen. 
But similarly for src address, a checksum won't happen.
As packet_update_csum128() will only do a checksum if ip6_nxt is UPD,TCP or ICMPv6. Which won't be the case if any extension header is present. 
Similarly in the AVX code, l4 checksum will only happen if ip6_nxt is UPD,TCP or ICMPv6, i.e no extension header is present. 
So I think this case is covered if I'm not missing any corner cases?
Have you been able to see a failure with autovalidator ?

> > +static void
> > +__attribute__((__target__("avx512vbmi")))
> > +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
> > +                             const struct nlattr *a)
> 
> Name of a function is a bit confusing.  Doesn't it also set tclass, proto, etc. ?
> 
It does. Would something like action_avx512_set_ipv6() be better?
As the scalar function is called packet_set_ipv6().

> > +{
> > +    const struct ovs_key_ipv6 *key, *mask;
> > +    struct dp_packet *packet;
> > +
> > +    a = nl_attr_get(a);
> > +    key = nl_attr_get(a);
> > +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
> > +
> > +    /* Read the content of the key and mask in the respective registers. We
> > +     * only load the size of the actual structure, which is only 40 bytes. */
> > +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
> > +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
> > +
> > +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
> > +     * ip6_hdr structure layout. */
> > +    static const uint8_t ip_shuffle_mask[64] = {
> > +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
> > +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> > +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
> > +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
> > +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
> > +    };
> > +
> > +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
> > +
> > +    /* This shuffle is required for key and mask to match the layout of the
> > +     * ip6_hdr struct. */
> > +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
> > +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle,
> v_mask);
> > +
> > +    /* Set the v_zero register to all zero's. */
> > +    const __m128i v_zeros = _mm_setzero_si128();
> > +
> > +    /* Set the v_all_ones register to all one's. */
> > +    const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
> > +
> > +    /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
> > +    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
> > +    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
> > +
> > +    /* Perform a bitwise OR between src and dst registers. */
> > +    __m128i v_or = _mm_or_si128(v_src, v_dst);
> > +
> > +    /* Will return true if any bit has been set in v_or, else it will return
> > +     * false. */
> > +    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
> > +
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
> > +
> > +        /* Load the 40 bytes of the IPv6 header. */
> > +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *)
> > + nh);
> > +
> > +        /* AND the v_pkt_mask to the packet data (v_packet). */
> > +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf,
> > + v_packet);
> > +
> > +        /* OR the new addresses (v_key_shuf) with the masked packet
> addresses
> > +         * (v_pkt_masked). */
> > +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf,
> > + v_pkt_masked);
> > +
> > +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
> > +         * be updated. */
> > +        if (do_checksum) {
> > +            uint8_t proto = nh->ip6_nxt;
> > +            uint16_t delta_checksum =
> avx512_ipv6_addr_csum_delta(v_packet,
> > +
> > + v_new_hdr);
> > +
> > +            if (proto == IPPROTO_UDP) {
> > +                struct udp_header *uh = dp_packet_l4(packet);
> > +
> > +                if (uh->udp_csum) {
> > +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> > +                    uint32_t udp_checksum = old_udp_checksum +
> > + delta_checksum;
> > +
> > +                    udp_checksum = csum_finish(udp_checksum);
> > +
> > +                    if (!udp_checksum) {
> > +                        udp_checksum = htons(0xffff);
> > +                    }
> > +
> > +                    uh->udp_csum = udp_checksum;
> > +                }
> > +            } else if (proto == IPPROTO_TCP) {
> > +                struct tcp_header *th = dp_packet_l4(packet);
> > +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> > +                uint32_t tcp_checksum = old_tcp_checksum +
> > + delta_checksum;
> > +
> > +                tcp_checksum = csum_finish(tcp_checksum);
> > +                th->tcp_csum = tcp_checksum;
> > +            } else if (proto == IPPROTO_ICMPV6) {
> > +                struct icmp6_header *icmp = dp_packet_l4(packet);
> > +                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
> > +                uint32_t icmp6_checksum = old_icmp6_checksum +
> > + delta_checksum;
> > +
> > +                icmp6_checksum = csum_finish(icmp6_checksum);
> > +                icmp->icmp6_cksum = icmp6_checksum;
> > +            }
> > +        }
> > +        /* Write back the modified IPv6 addresses. */
> > +         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
> 
> Overindented.
> 
Sure, will fix this.

> > +    }
> > +}
> > +#endif /* HAVE_AVX512VBMI */
> > +
> >  static void
> >  action_avx512_set_masked(struct dp_packet_batch *batch, const struct
> > nlattr *a)  { @@ -514,6 +711,13 @@ action_avx512_init(struct
> > odp_execute_action_impl *self OVS_UNUSED)
> >      impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
> action_avx512_eth_set_addrs;
> >      impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] =
> > action_avx512_ipv4_set_addrs;
> >
> > +#if HAVE_AVX512VBMI
> > +    if (action_avx512vbmi_isa_probe()) {
> > +        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
> > +                              action_avx512_ipv6_set_addrs;
> > +    }
> > +#endif
> > +
> >      return 0;
> >  }
> >
> > diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> > index f80ae5a23..8b86b1e4f 100644
> > --- a/lib/odp-execute-private.c
> > +++ b/lib/odp-execute-private.c
> > @@ -60,6 +60,23 @@ action_avx512_isa_probe(void)
> >
> >  #endif
> >
> > +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI bool
> > +action_avx512vbmi_isa_probe(void)
> > +{
> > +    if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
> > +        return true;
> > +    }
> > +    return false;
> 
> Hmm,  should this be just:
>     return cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI);
> ?

Eelco, if you're okay with this change (as you asked for this function to be changed in v4)?
I can update to the above.

<SNIP>
Eelco Chaudron Nov. 29, 2022, 2:18 p.m. UTC | #3
On 29 Nov 2022, at 15:09, Finn, Emma wrote:

>> -----Original Message-----
>> From: Ilya Maximets <i.maximets@ovn.org>
>> Sent: Friday 25 November 2022 17:22
>> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org
>> Cc: i.maximets@ovn.org; Eelco Chaudron <echaudro@redhat.com>
>> Subject: Re: [ovs-dev] [v5] odp-execute: Add ISA implementation of
>> set_masked IPv6 action
>>
>> On 11/25/22 17:23, Emma Finn wrote:
>>> This commit adds support for the AVX512 implementation of the
>>> ipv6_set_addrs action as well as an AVX512 implementation of updating
>>> the L4 checksums.
>>>
>>> Signed-off-by: Emma Finn <emma.finn@intel.com>
>>>
>>> ---
>>> v5:
>>>   - Fixed load for ip6 src and dst mask for checksum check.
>>> v4:
>>>   - Reworked and moved check for checksum outside loop.
>>>   - Code cleanup based on review from Eelco.
>>> v3:
>>>   - Added a runtime check for AVX512 vbmi.
>>> v2:
>>>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>>>   - Fixed network headers for freebsd builds.
>>> ---
>>> ---
>>>  lib/odp-execute-avx512.c  | 204
>>> ++++++++++++++++++++++++++++++++++++++
>>>  lib/odp-execute-private.c |  17 ++++
>>>  lib/odp-execute-private.h |   1 +
>>>  3 files changed, 222 insertions(+)
>>
>> Hi, Emma.  Thanks for the patch!
>> I didn't review the actual AVX512 code, but I have a couple of questions and
>> nits inline.
>>
>
> Thanks Ilya.
> My replies are inline below.
>
> <SNIP>
>>> +
>>> +/* This function performs the same operation on each packet in the
>>> +batch as
>>> + * the scalar odp_set_ipv6() function. */
>>
>> I'm not sure if that statement is correct.  If you'll look at the odp_set_ipv6()
>> implementation and precisely at the
>> packet_set_ipv6() implementation, there is a check for the routing extension
>> header combined with the check for the fragmentation header
>> (packet_rh_present) to prevent writing into L4 fields that do not exist or, in
>> case of routing header being present, checksum should not be updated for
>> the destination address.
>>
>> Could you point me to the AVX512 code that is responsible for that check?
>>
> I think the AVX code is handling this case the same as scalar and also I cannot reproduce a failure with the autovalidator.
> If I am following the scalar code correctly, you're right. If there is a routing extension header present, for the dst address no checksum will happen.
> But similarly for src address, a checksum won't happen.
> As packet_update_csum128() will only do a checksum if ip6_nxt is UPD,TCP or ICMPv6. Which won't be the case if any extension header is present.
> Similarly in the AVX code, l4 checksum will only happen if ip6_nxt is UPD,TCP or ICMPv6, i.e no extension header is present.
> So I think this case is covered if I'm not missing any corner cases?
> Have you been able to see a failure with autovalidator ?
>
>>> +static void
>>> +__attribute__((__target__("avx512vbmi")))
>>> +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
>>> +                             const struct nlattr *a)
>>
>> Name of a function is a bit confusing.  Doesn't it also set tclass, proto, etc. ?
>>
> It does. Would something like action_avx512_set_ipv6() be better?
> As the scalar function is called packet_set_ipv6().
>
>>> +{
>>> +    const struct ovs_key_ipv6 *key, *mask;
>>> +    struct dp_packet *packet;
>>> +
>>> +    a = nl_attr_get(a);
>>> +    key = nl_attr_get(a);
>>> +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
>>> +
>>> +    /* Read the content of the key and mask in the respective registers. We
>>> +     * only load the size of the actual structure, which is only 40 bytes. */
>>> +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
>>> +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
>>> +
>>> +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
>>> +     * ip6_hdr structure layout. */
>>> +    static const uint8_t ip_shuffle_mask[64] = {
>>> +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
>>> +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
>>> +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
>>> +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
>>> +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
>>> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
>>> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
>>> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
>>> +    };
>>> +
>>> +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
>>> +
>>> +    /* This shuffle is required for key and mask to match the layout of the
>>> +     * ip6_hdr struct. */
>>> +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
>>> +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle,
>> v_mask);
>>> +
>>> +    /* Set the v_zero register to all zero's. */
>>> +    const __m128i v_zeros = _mm_setzero_si128();
>>> +
>>> +    /* Set the v_all_ones register to all one's. */
>>> +    const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
>>> +
>>> +    /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
>>> +    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
>>> +    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
>>> +
>>> +    /* Perform a bitwise OR between src and dst registers. */
>>> +    __m128i v_or = _mm_or_si128(v_src, v_dst);
>>> +
>>> +    /* Will return true if any bit has been set in v_or, else it will return
>>> +     * false. */
>>> +    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
>>> +
>>> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
>>> +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
>>> +
>>> +        /* Load the 40 bytes of the IPv6 header. */
>>> +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *)
>>> + nh);
>>> +
>>> +        /* AND the v_pkt_mask to the packet data (v_packet). */
>>> +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf,
>>> + v_packet);
>>> +
>>> +        /* OR the new addresses (v_key_shuf) with the masked packet
>> addresses
>>> +         * (v_pkt_masked). */
>>> +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf,
>>> + v_pkt_masked);
>>> +
>>> +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
>>> +         * be updated. */
>>> +        if (do_checksum) {
>>> +            uint8_t proto = nh->ip6_nxt;
>>> +            uint16_t delta_checksum =
>> avx512_ipv6_addr_csum_delta(v_packet,
>>> +
>>> + v_new_hdr);
>>> +
>>> +            if (proto == IPPROTO_UDP) {
>>> +                struct udp_header *uh = dp_packet_l4(packet);
>>> +
>>> +                if (uh->udp_csum) {
>>> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
>>> +                    uint32_t udp_checksum = old_udp_checksum +
>>> + delta_checksum;
>>> +
>>> +                    udp_checksum = csum_finish(udp_checksum);
>>> +
>>> +                    if (!udp_checksum) {
>>> +                        udp_checksum = htons(0xffff);
>>> +                    }
>>> +
>>> +                    uh->udp_csum = udp_checksum;
>>> +                }
>>> +            } else if (proto == IPPROTO_TCP) {
>>> +                struct tcp_header *th = dp_packet_l4(packet);
>>> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
>>> +                uint32_t tcp_checksum = old_tcp_checksum +
>>> + delta_checksum;
>>> +
>>> +                tcp_checksum = csum_finish(tcp_checksum);
>>> +                th->tcp_csum = tcp_checksum;
>>> +            } else if (proto == IPPROTO_ICMPV6) {
>>> +                struct icmp6_header *icmp = dp_packet_l4(packet);
>>> +                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
>>> +                uint32_t icmp6_checksum = old_icmp6_checksum +
>>> + delta_checksum;
>>> +
>>> +                icmp6_checksum = csum_finish(icmp6_checksum);
>>> +                icmp->icmp6_cksum = icmp6_checksum;
>>> +            }
>>> +        }
>>> +        /* Write back the modified IPv6 addresses. */
>>> +         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
>>
>> Overindented.
>>
> Sure, will fix this.
>
>>> +    }
>>> +}
>>> +#endif /* HAVE_AVX512VBMI */
>>> +
>>>  static void
>>>  action_avx512_set_masked(struct dp_packet_batch *batch, const struct
>>> nlattr *a)  { @@ -514,6 +711,13 @@ action_avx512_init(struct
>>> odp_execute_action_impl *self OVS_UNUSED)
>>>      impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
>> action_avx512_eth_set_addrs;
>>>      impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] =
>>> action_avx512_ipv4_set_addrs;
>>>
>>> +#if HAVE_AVX512VBMI
>>> +    if (action_avx512vbmi_isa_probe()) {
>>> +        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
>>> +                              action_avx512_ipv6_set_addrs;
>>> +    }
>>> +#endif
>>> +
>>>      return 0;
>>>  }
>>>
>>> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
>>> index f80ae5a23..8b86b1e4f 100644
>>> --- a/lib/odp-execute-private.c
>>> +++ b/lib/odp-execute-private.c
>>> @@ -60,6 +60,23 @@ action_avx512_isa_probe(void)
>>>
>>>  #endif
>>>
>>> +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI bool
>>> +action_avx512vbmi_isa_probe(void)
>>> +{
>>> +    if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
>>> +        return true;
>>> +    }
>>> +    return false;
>>
>> Hmm,  should this be just:
>>     return cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI);
>> ?
>
> Eelco, if you're okay with this change (as you asked for this function to be changed in v4)?
> I can update to the above.

Yes sound good to me, guess I just didn’t see this nicer solution ;)

> <SNIP>
Ilya Maximets Nov. 29, 2022, 3:07 p.m. UTC | #4
On 11/29/22 15:09, Finn, Emma wrote:
> 
> 
>> -----Original Message-----
>> From: Ilya Maximets <i.maximets@ovn.org>
>> Sent: Friday 25 November 2022 17:22
>> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org
>> Cc: i.maximets@ovn.org; Eelco Chaudron <echaudro@redhat.com>
>> Subject: Re: [ovs-dev] [v5] odp-execute: Add ISA implementation of
>> set_masked IPv6 action
>>
>> On 11/25/22 17:23, Emma Finn wrote:
>>> This commit adds support for the AVX512 implementation of the
>>> ipv6_set_addrs action as well as an AVX512 implementation of updating
>>> the L4 checksums.
>>>
>>> Signed-off-by: Emma Finn <emma.finn@intel.com>
>>>
>>> ---
>>> v5:
>>>   - Fixed load for ip6 src and dst mask for checksum check.
>>> v4:
>>>   - Reworked and moved check for checksum outside loop.
>>>   - Code cleanup based on review from Eelco.
>>> v3:
>>>   - Added a runtime check for AVX512 vbmi.
>>> v2:
>>>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>>>   - Fixed network headers for freebsd builds.
>>> ---
>>> ---
>>>  lib/odp-execute-avx512.c  | 204
>>> ++++++++++++++++++++++++++++++++++++++
>>>  lib/odp-execute-private.c |  17 ++++
>>>  lib/odp-execute-private.h |   1 +
>>>  3 files changed, 222 insertions(+)
>>
>> Hi, Emma.  Thanks for the patch!
>> I didn't review the actual AVX512 code, but I have a couple of questions and
>> nits inline.
>>
> 
> Thanks Ilya.
> My replies are inline below. 
> 
> <SNIP>
>>> +
>>> +/* This function performs the same operation on each packet in the
>>> +batch as
>>> + * the scalar odp_set_ipv6() function. */
>>
>> I'm not sure if that statement is correct.  If you'll look at the odp_set_ipv6()
>> implementation and precisely at the
>> packet_set_ipv6() implementation, there is a check for the routing extension
>> header combined with the check for the fragmentation header
>> (packet_rh_present) to prevent writing into L4 fields that do not exist or, in
>> case of routing header being present, checksum should not be updated for
>> the destination address.
>>
>> Could you point me to the AVX512 code that is responsible for that check?
>>
> I think the AVX code is handling this case the same as scalar and also I cannot reproduce a failure with the autovalidator.
> If I am following the scalar code correctly, you're right. If there is a routing extension header present, for the dst address no checksum will happen. 
> But similarly for src address, a checksum won't happen.
> As packet_update_csum128() will only do a checksum if ip6_nxt is UPD,TCP or ICMPv6. Which won't be the case if any extension header is present.

Not really, the 'proto' argument in this function is one
of the results of packet_rh_present() that iterates over
all the extension headers and takes the protocol number
from the last one.  So, extension headers are jumped over
this way.

> Similarly in the AVX code, l4 checksum will only happen if ip6_nxt is UPD,TCP or ICMPv6, i.e no extension header is present. 
> So I think this case is covered if I'm not missing any corner cases?

I didn't read the AVX code carefully enough to confirm that,
but it is not really a correct behavior as extension headers
should generally be just ignored except for fragmentation
header and the routing header.  So, the logic is:

- If the fragmentation header is present and it is a 'later'
  fragment - skip the checksum as there is no L4 header in
  the packet.  For the 'first' fragment the checksum should
  be re-calculated.

- If the routing header with non-zero segments_left is present
  then update of the destination address should not be reflected
  in the checksum.  Update of the source address should still
  trigger the checksum update.  This is because the original
  packet checksum is calculated with the destination address
  taken from the last segment of the routing header.

- In all other cases, extension headers should be just ignored
  and the checksum should be updated.

I'm not sure if that logic is covering all the cases, but that
is what scalar code is doing.

> Have you been able to see a failure with autovalidator ?

Yes, there is a failure on a system test:

9. system-traffic.at:229: testing datapath - ping6 between two ports with header modify ...

2022-11-28T17:27:13.067Z|00107|dpif_lookup_avx512_gather|INFO|Using non-specialized AVX512 lookup for subtable (4,5) and possibly oth
ers.
2022-11-28T17:27:13.389Z|00108|odp_execute_impl|ERR|Autovalidation of avx512 failed. Details:
Packet: 0
Action : set(ipv6(dst=fc00::2))
Good hex:
00000000  e4 11 22 33 44 54 e4 11-22 33 44 55 86 dd 60 06
00000010  01 5d 05 b0 2c 40 fc 00-00 00 00 00 00 00 00 00
00000020  00 00 00 00 00 01 fc 00-00 00 00 00 00 00 00 00
00000030  00 00 00 00 00 02 3a 00-00 01 81 05 2f 0d 80 00
00000040  0b bc 39 39 00 01 71 ef-84 63 00 00 00 00 13 ed
00000050  05 00 00 00 00 00 10 11-12 13 14 15 16 17 18 19
<...>
Test hex:
00000000  e4 11 22 33 44 54 e4 11-22 33 44 55 86 dd 60 06
00000010  01 5d 05 b0 2c 40 fc 00-00 00 00 00 00 00 00 00
00000020  00 00 00 00 00 01 fc 00-00 00 00 00 00 00 00 00
00000030  00 00 00 00 00 02 3a 00-00 01 81 05 2f 0d 80 00
00000040  0b bb 39 39 00 01 71 ef-84 63 00 00 00 00 13 ed
00000050  05 00 00 00 00 00 10 11-12 13 14 15 16 17 18 19
<...>


This is a fragmented ICMPv6 packet.  The first fragment.

I'm also wondering why CI didn't catch that...

There might be 2 reasons:

1. Actions autovalidator is not enabled in CI, or
2. CI system doesn't have avx512vbmi.

Michael, could you check that?


> 
>>> +static void
>>> +__attribute__((__target__("avx512vbmi")))
>>> +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
>>> +                             const struct nlattr *a)
>>
>> Name of a function is a bit confusing.  Doesn't it also set tclass, proto, etc. ?
>>
> It does. Would something like action_avx512_set_ipv6() be better?
> As the scalar function is called packet_set_ipv6().

Yes, that looks better.  Thanks!

Best regards, Ilya Maximets.
Phelan, Michael Nov. 29, 2022, 4:35 p.m. UTC | #5
> -----Original Message-----
> From: Ilya Maximets <i.maximets@ovn.org>
> Sent: Tuesday 29 November 2022 15:07
> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org; Phelan,
> Michael <michael.phelan@intel.com>
> Cc: i.maximets@ovn.org; Eelco Chaudron <echaudro@redhat.com>; Van
> Haaren, Harry <harry.van.haaren@intel.com>; Stokes, Ian
> <ian.stokes@intel.com>
> Subject: Re: [ovs-dev] [v5] odp-execute: Add ISA implementation of
> set_masked IPv6 action
> 
> On 11/29/22 15:09, Finn, Emma wrote:
> >
> >
> >> -----Original Message-----
> >> From: Ilya Maximets <i.maximets@ovn.org>
> >> Sent: Friday 25 November 2022 17:22
> >> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org
> >> Cc: i.maximets@ovn.org; Eelco Chaudron <echaudro@redhat.com>
> >> Subject: Re: [ovs-dev] [v5] odp-execute: Add ISA implementation of
> >> set_masked IPv6 action
> >>
> >> On 11/25/22 17:23, Emma Finn wrote:
> >>> This commit adds support for the AVX512 implementation of the
> >>> ipv6_set_addrs action as well as an AVX512 implementation of
> >>> updating the L4 checksums.
> >>>
> >>> Signed-off-by: Emma Finn <emma.finn@intel.com>
> >>>
> >>> ---
> >>> v5:
> >>>   - Fixed load for ip6 src and dst mask for checksum check.
> >>> v4:
> >>>   - Reworked and moved check for checksum outside loop.
> >>>   - Code cleanup based on review from Eelco.
> >>> v3:
> >>>   - Added a runtime check for AVX512 vbmi.
> >>> v2:
> >>>   - Added check for availbility of s6_addr32 field of struct in6_addr.
> >>>   - Fixed network headers for freebsd builds.
> >>> ---
> >>> ---
> >>>  lib/odp-execute-avx512.c  | 204
> >>> ++++++++++++++++++++++++++++++++++++++
> >>>  lib/odp-execute-private.c |  17 ++++
> >>>  lib/odp-execute-private.h |   1 +
> >>>  3 files changed, 222 insertions(+)
> >>
> >> Hi, Emma.  Thanks for the patch!
> >> I didn't review the actual AVX512 code, but I have a couple of
> >> questions and nits inline.
> >>
> >
> > Thanks Ilya.
> > My replies are inline below.
> >
> > <SNIP>
> >>> +
> >>> +/* This function performs the same operation on each packet in the
> >>> +batch as
> >>> + * the scalar odp_set_ipv6() function. */
> >>
> >> I'm not sure if that statement is correct.  If you'll look at the
> >> odp_set_ipv6() implementation and precisely at the
> >> packet_set_ipv6() implementation, there is a check for the routing
> >> extension header combined with the check for the fragmentation header
> >> (packet_rh_present) to prevent writing into L4 fields that do not
> >> exist or, in case of routing header being present, checksum should
> >> not be updated for the destination address.
> >>
> >> Could you point me to the AVX512 code that is responsible for that check?
> >>
> > I think the AVX code is handling this case the same as scalar and also I
> cannot reproduce a failure with the autovalidator.
> > If I am following the scalar code correctly, you're right. If there is a routing
> extension header present, for the dst address no checksum will happen.
> > But similarly for src address, a checksum won't happen.
> > As packet_update_csum128() will only do a checksum if ip6_nxt is UPD,TCP
> or ICMPv6. Which won't be the case if any extension header is present.
> 
> Not really, the 'proto' argument in this function is one of the results of
> packet_rh_present() that iterates over all the extension headers and takes
> the protocol number from the last one.  So, extension headers are jumped
> over this way.
> 
> > Similarly in the AVX code, l4 checksum will only happen if ip6_nxt is
> UPD,TCP or ICMPv6, i.e no extension header is present.
> > So I think this case is covered if I'm not missing any corner cases?
> 
> I didn't read the AVX code carefully enough to confirm that, but it is not really
> a correct behavior as extension headers should generally be just ignored
> except for fragmentation header and the routing header.  So, the logic is:
> 
> - If the fragmentation header is present and it is a 'later'
>   fragment - skip the checksum as there is no L4 header in
>   the packet.  For the 'first' fragment the checksum should
>   be re-calculated.
> 
> - If the routing header with non-zero segments_left is present
>   then update of the destination address should not be reflected
>   in the checksum.  Update of the source address should still
>   trigger the checksum update.  This is because the original
>   packet checksum is calculated with the destination address
>   taken from the last segment of the routing header.
> 
> - In all other cases, extension headers should be just ignored
>   and the checksum should be updated.
> 
> I'm not sure if that logic is covering all the cases, but that is what scalar code is
> doing.
> 
> > Have you been able to see a failure with autovalidator ?
> 
> Yes, there is a failure on a system test:
> 
> 9. system-traffic.at:229: testing datapath - ping6 between two ports with
> header modify ...
> 
> 2022-11-28T17:27:13.067Z|00107|dpif_lookup_avx512_gather|INFO|Using
> non-specialized AVX512 lookup for subtable (4,5) and possibly oth ers.
> 2022-11-28T17:27:13.389Z|00108|odp_execute_impl|ERR|Autovalidation of
> avx512 failed. Details:
> Packet: 0
> Action : set(ipv6(dst=fc00::2))
> Good hex:
> 00000000  e4 11 22 33 44 54 e4 11-22 33 44 55 86 dd 60 06
> 00000010  01 5d 05 b0 2c 40 fc 00-00 00 00 00 00 00 00 00
> 00000020  00 00 00 00 00 01 fc 00-00 00 00 00 00 00 00 00
> 00000030  00 00 00 00 00 02 3a 00-00 01 81 05 2f 0d 80 00
> 00000040  0b bc 39 39 00 01 71 ef-84 63 00 00 00 00 13 ed
> 00000050  05 00 00 00 00 00 10 11-12 13 14 15 16 17 18 19 <...> Test hex:
> 00000000  e4 11 22 33 44 54 e4 11-22 33 44 55 86 dd 60 06
> 00000010  01 5d 05 b0 2c 40 fc 00-00 00 00 00 00 00 00 00
> 00000020  00 00 00 00 00 01 fc 00-00 00 00 00 00 00 00 00
> 00000030  00 00 00 00 00 02 3a 00-00 01 81 05 2f 0d 80 00
> 00000040  0b bb 39 39 00 01 71 ef-84 63 00 00 00 00 13 ed
> 00000050  05 00 00 00 00 00 10 11-12 13 14 15 16 17 18 19 <...>
> 
> 
> This is a fragmented ICMPv6 packet.  The first fragment.
> 
> I'm also wondering why CI didn't catch that...
> 
> There might be 2 reasons:
> 
> 1. Actions autovalidator is not enabled in CI, or 2. CI system doesn't have
> avx512vbmi.
> 
> Michael, could you check that?

Hi Ilya,
The CI system does have avx512vbmi, however, the actions autovalidator is never enabled for any of the tests.

I could add a test to configure with the actions autovalidator if you think this would be a good value add for the CI?
> 
> 
> >
> >>> +static void
> >>> +__attribute__((__target__("avx512vbmi")))
> >>> +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
> >>> +                             const struct nlattr *a)
> >>
> >> Name of a function is a bit confusing.  Doesn't it also set tclass, proto, etc.
> ?
> >>
> > It does. Would something like action_avx512_set_ipv6() be better?
> > As the scalar function is called packet_set_ipv6().
> 
> Yes, that looks better.  Thanks!
> 
> Best regards, Ilya Maximets.
Thanks,
Michael.
Ilya Maximets Nov. 29, 2022, 6 p.m. UTC | #6
On 11/29/22 17:35, Phelan, Michael wrote:
> 
>> -----Original Message-----
>>
>> I'm also wondering why CI didn't catch that...
>>
>> There might be 2 reasons:
>>
>> 1. Actions autovalidator is not enabled in CI, or 2. CI system doesn't have
>> avx512vbmi.
>>
>> Michael, could you check that?
> 
> Hi Ilya,
> The CI system does have avx512vbmi, however, the actions autovalidator is never enabled for any of the tests.

OK.  Thanks for checking!

> 
> I could add a test to configure with the actions autovalidator if you think this would be a good value add for the CI?

I think it's useful.  At least, it would have caught the
issue with the current patch much earlier.

Best regards, Ilya Maximets.
Eelco Chaudron Nov. 30, 2022, 10:23 a.m. UTC | #7
On 29 Nov 2022, at 17:35, Phelan, Michael wrote:

>> -----Original Message-----
>> From: Ilya Maximets <i.maximets@ovn.org>
>> Sent: Tuesday 29 November 2022 15:07
>> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org; Phelan,
>> Michael <michael.phelan@intel.com>
>> Cc: i.maximets@ovn.org; Eelco Chaudron <echaudro@redhat.com>; Van
>> Haaren, Harry <harry.van.haaren@intel.com>; Stokes, Ian
>> <ian.stokes@intel.com>
>> Subject: Re: [ovs-dev] [v5] odp-execute: Add ISA implementation of
>> set_masked IPv6 action
>>
>> On 11/29/22 15:09, Finn, Emma wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Ilya Maximets <i.maximets@ovn.org>
>>>> Sent: Friday 25 November 2022 17:22
>>>> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org
>>>> Cc: i.maximets@ovn.org; Eelco Chaudron <echaudro@redhat.com>
>>>> Subject: Re: [ovs-dev] [v5] odp-execute: Add ISA implementation of
>>>> set_masked IPv6 action
>>>>
>>>> On 11/25/22 17:23, Emma Finn wrote:
>>>>> This commit adds support for the AVX512 implementation of the
>>>>> ipv6_set_addrs action as well as an AVX512 implementation of
>>>>> updating the L4 checksums.
>>>>>
>>>>> Signed-off-by: Emma Finn <emma.finn@intel.com>
>>>>>
>>>>> ---
>>>>> v5:
>>>>>   - Fixed load for ip6 src and dst mask for checksum check.
>>>>> v4:
>>>>>   - Reworked and moved check for checksum outside loop.
>>>>>   - Code cleanup based on review from Eelco.
>>>>> v3:
>>>>>   - Added a runtime check for AVX512 vbmi.
>>>>> v2:
>>>>>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>>>>>   - Fixed network headers for freebsd builds.
>>>>> ---
>>>>> ---
>>>>>  lib/odp-execute-avx512.c  | 204
>>>>> ++++++++++++++++++++++++++++++++++++++
>>>>>  lib/odp-execute-private.c |  17 ++++
>>>>>  lib/odp-execute-private.h |   1 +
>>>>>  3 files changed, 222 insertions(+)
>>>>
>>>> Hi, Emma.  Thanks for the patch!
>>>> I didn't review the actual AVX512 code, but I have a couple of
>>>> questions and nits inline.
>>>>
>>>
>>> Thanks Ilya.
>>> My replies are inline below.
>>>
>>> <SNIP>
>>>>> +
>>>>> +/* This function performs the same operation on each packet in the
>>>>> +batch as
>>>>> + * the scalar odp_set_ipv6() function. */
>>>>
>>>> I'm not sure if that statement is correct.  If you'll look at the
>>>> odp_set_ipv6() implementation and precisely at the
>>>> packet_set_ipv6() implementation, there is a check for the routing
>>>> extension header combined with the check for the fragmentation header
>>>> (packet_rh_present) to prevent writing into L4 fields that do not
>>>> exist or, in case of routing header being present, checksum should
>>>> not be updated for the destination address.
>>>>
>>>> Could you point me to the AVX512 code that is responsible for that check?
>>>>
>>> I think the AVX code is handling this case the same as scalar and also I
>> cannot reproduce a failure with the autovalidator.
>>> If I am following the scalar code correctly, you're right. If there is a routing
>> extension header present, for the dst address no checksum will happen.
>>> But similarly for src address, a checksum won't happen.
>>> As packet_update_csum128() will only do a checksum if ip6_nxt is UPD,TCP
>> or ICMPv6. Which won't be the case if any extension header is present.
>>
>> Not really, the 'proto' argument in this function is one of the results of
>> packet_rh_present() that iterates over all the extension headers and takes
>> the protocol number from the last one.  So, extension headers are jumped
>> over this way.
>>
>>> Similarly in the AVX code, l4 checksum will only happen if ip6_nxt is
>> UPD,TCP or ICMPv6, i.e no extension header is present.
>>> So I think this case is covered if I'm not missing any corner cases?
>>
>> I didn't read the AVX code carefully enough to confirm that, but it is not really
>> a correct behavior as extension headers should generally be just ignored
>> except for fragmentation header and the routing header.  So, the logic is:
>>
>> - If the fragmentation header is present and it is a 'later'
>>   fragment - skip the checksum as there is no L4 header in
>>   the packet.  For the 'first' fragment the checksum should
>>   be re-calculated.
>>
>> - If the routing header with non-zero segments_left is present
>>   then update of the destination address should not be reflected
>>   in the checksum.  Update of the source address should still
>>   trigger the checksum update.  This is because the original
>>   packet checksum is calculated with the destination address
>>   taken from the last segment of the routing header.
>>
>> - In all other cases, extension headers should be just ignored
>>   and the checksum should be updated.
>>
>> I'm not sure if that logic is covering all the cases, but that is what scalar code is
>> doing.
>>
>>> Have you been able to see a failure with autovalidator ?
>>
>> Yes, there is a failure on a system test:
>>
>> 9. system-traffic.at:229: testing datapath - ping6 between two ports with
>> header modify ...
>>
>> 2022-11-28T17:27:13.067Z|00107|dpif_lookup_avx512_gather|INFO|Using
>> non-specialized AVX512 lookup for subtable (4,5) and possibly oth ers.
>> 2022-11-28T17:27:13.389Z|00108|odp_execute_impl|ERR|Autovalidation of
>> avx512 failed. Details:
>> Packet: 0
>> Action : set(ipv6(dst=fc00::2))
>> Good hex:
>> 00000000  e4 11 22 33 44 54 e4 11-22 33 44 55 86 dd 60 06
>> 00000010  01 5d 05 b0 2c 40 fc 00-00 00 00 00 00 00 00 00
>> 00000020  00 00 00 00 00 01 fc 00-00 00 00 00 00 00 00 00
>> 00000030  00 00 00 00 00 02 3a 00-00 01 81 05 2f 0d 80 00
>> 00000040  0b bc 39 39 00 01 71 ef-84 63 00 00 00 00 13 ed
>> 00000050  05 00 00 00 00 00 10 11-12 13 14 15 16 17 18 19 <...> Test hex:
>> 00000000  e4 11 22 33 44 54 e4 11-22 33 44 55 86 dd 60 06
>> 00000010  01 5d 05 b0 2c 40 fc 00-00 00 00 00 00 00 00 00
>> 00000020  00 00 00 00 00 01 fc 00-00 00 00 00 00 00 00 00
>> 00000030  00 00 00 00 00 02 3a 00-00 01 81 05 2f 0d 80 00
>> 00000040  0b bb 39 39 00 01 71 ef-84 63 00 00 00 00 13 ed
>> 00000050  05 00 00 00 00 00 10 11-12 13 14 15 16 17 18 19 <...>
>>
>>
>> This is a fragmented ICMPv6 packet.  The first fragment.
>>
>> I'm also wondering why CI didn't catch that...
>>
>> There might be 2 reasons:
>>
>> 1. Actions autovalidator is not enabled in CI, or 2. CI system doesn't have
>> avx512vbmi.
>>
>> Michael, could you check that?
>
> Hi Ilya,
> The CI system does have avx512vbmi, however, the actions autovalidator is never enabled for any of the tests.
>
> I could add a test to configure with the actions autovalidator if you think this would be a good value add for the CI?

I would suggest doing a run with and without all the avx512 auto validators enabled at compile time.

>>
>>
>>>
>>>>> +static void
>>>>> +__attribute__((__target__("avx512vbmi")))
>>>>> +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
>>>>> +                             const struct nlattr *a)
>>>>
>>>> Name of a function is a bit confusing.  Doesn't it also set tclass, proto, etc.
>> ?
>>>>
>>> It does. Would something like action_avx512_set_ipv6() be better?
>>> As the scalar function is called packet_set_ipv6().
>>
>> Yes, that looks better.  Thanks!
>>
>> Best regards, Ilya Maximets.
> Thanks,
> Michael.
Finn, Emma Nov. 30, 2022, 2:14 p.m. UTC | #8
<snip>
> >>>>
> >>>> On 11/25/22 17:23, Emma Finn wrote:
> >>>>> This commit adds support for the AVX512 implementation of the
> >>>>> ipv6_set_addrs action as well as an AVX512 implementation of
> >>>>> updating the L4 checksums.
> >>>>>
> >>>>> Signed-off-by: Emma Finn <emma.finn@intel.com>
> >>>>>
> >>>>> ---
> >>>>> v5:
> >>>>>   - Fixed load for ip6 src and dst mask for checksum check.
> >>>>> v4:
> >>>>>   - Reworked and moved check for checksum outside loop.
> >>>>>   - Code cleanup based on review from Eelco.
> >>>>> v3:
> >>>>>   - Added a runtime check for AVX512 vbmi.
> >>>>> v2:
> >>>>>   - Added check for availbility of s6_addr32 field of struct in6_addr.
> >>>>>   - Fixed network headers for freebsd builds.
> >>>>> ---
> >>>>> ---
> >>>>>  lib/odp-execute-avx512.c  | 204
> >>>>> ++++++++++++++++++++++++++++++++++++++
> >>>>>  lib/odp-execute-private.c |  17 ++++
> >>>>>  lib/odp-execute-private.h |   1 +
> >>>>>  3 files changed, 222 insertions(+)
> >>>>
> >>>> Hi, Emma.  Thanks for the patch!
> >>>> I didn't review the actual AVX512 code, but I have a couple of
> >>>> questions and nits inline.
> >>>>
> >>>
> >>> Thanks Ilya.
> >>> My replies are inline below.
> >>>
> >>> <SNIP>
> >>>>> +
> >>>>> +/* This function performs the same operation on each packet in
> >>>>> +the batch as
> >>>>> + * the scalar odp_set_ipv6() function. */
> >>>>
> >>>> I'm not sure if that statement is correct.  If you'll look at the
> >>>> odp_set_ipv6() implementation and precisely at the
> >>>> packet_set_ipv6() implementation, there is a check for the routing
> >>>> extension header combined with the check for the fragmentation
> >>>> header
> >>>> (packet_rh_present) to prevent writing into L4 fields that do not
> >>>> exist or, in case of routing header being present, checksum should
> >>>> not be updated for the destination address.
> >>>>
> >>>> Could you point me to the AVX512 code that is responsible for that
> check?
> >>>>
> >>> I think the AVX code is handling this case the same as scalar and
> >>> also I
> >> cannot reproduce a failure with the autovalidator.
> >>> If I am following the scalar code correctly, you're right. If there
> >>> is a routing
> >> extension header present, for the dst address no checksum will happen.
> >>> But similarly for src address, a checksum won't happen.
> >>> As packet_update_csum128() will only do a checksum if ip6_nxt is
> >>> UPD,TCP
> >> or ICMPv6. Which won't be the case if any extension header is present.
> >>
> >> Not really, the 'proto' argument in this function is one of the
> >> results of
> >> packet_rh_present() that iterates over all the extension headers and
> >> takes the protocol number from the last one.  So, extension headers
> >> are jumped over this way.
> >>
> >>> Similarly in the AVX code, l4 checksum will only happen if ip6_nxt
> >>> is
> >> UPD,TCP or ICMPv6, i.e no extension header is present.
> >>> So I think this case is covered if I'm not missing any corner cases?
> >>
> >> I didn't read the AVX code carefully enough to confirm that, but it
> >> is not really a correct behavior as extension headers should
> >> generally be just ignored except for fragmentation header and the
> routing header.  So, the logic is:
> >>
> >> - If the fragmentation header is present and it is a 'later'
> >>   fragment - skip the checksum as there is no L4 header in
> >>   the packet.  For the 'first' fragment the checksum should
> >>   be re-calculated.
> >>
> >> - If the routing header with non-zero segments_left is present
> >>   then update of the destination address should not be reflected
> >>   in the checksum.  Update of the source address should still
> >>   trigger the checksum update.  This is because the original
> >>   packet checksum is calculated with the destination address
> >>   taken from the last segment of the routing header.
> >>
> >> - In all other cases, extension headers should be just ignored
> >>   and the checksum should be updated.
> >>
> >> I'm not sure if that logic is covering all the cases, but that is
> >> what scalar code is doing.
> >>
> >>> Have you been able to see a failure with autovalidator ?
> >>
> >> Yes, there is a failure on a system test:
> >>
> >> 9. system-traffic.at:229: testing datapath - ping6 between two ports
> >> with header modify ...
> >>
> >> 2022-11-
> 28T17:27:13.067Z|00107|dpif_lookup_avx512_gather|INFO|Using
> >> non-specialized AVX512 lookup for subtable (4,5) and possibly oth ers.
> >> 2022-11-28T17:27:13.389Z|00108|odp_execute_impl|ERR|Autovalidation
> of
> >> avx512 failed. Details:
> >> Packet: 0
> >> Action : set(ipv6(dst=fc00::2))
> >> Good hex:
> >> 00000000  e4 11 22 33 44 54 e4 11-22 33 44 55 86 dd 60 06
> >> 00000010  01 5d 05 b0 2c 40 fc 00-00 00 00 00 00 00 00 00
> >> 00000020  00 00 00 00 00 01 fc 00-00 00 00 00 00 00 00 00
> >> 00000030  00 00 00 00 00 02 3a 00-00 01 81 05 2f 0d 80 00
> >> 00000040  0b bc 39 39 00 01 71 ef-84 63 00 00 00 00 13 ed
> >> 00000050  05 00 00 00 00 00 10 11-12 13 14 15 16 17 18 19 <...> Test hex:
> >> 00000000  e4 11 22 33 44 54 e4 11-22 33 44 55 86 dd 60 06
> >> 00000010  01 5d 05 b0 2c 40 fc 00-00 00 00 00 00 00 00 00
> >> 00000020  00 00 00 00 00 01 fc 00-00 00 00 00 00 00 00 00
> >> 00000030  00 00 00 00 00 02 3a 00-00 01 81 05 2f 0d 80 00
> >> 00000040  0b bb 39 39 00 01 71 ef-84 63 00 00 00 00 13 ed
> >> 00000050  05 00 00 00 00 00 10 11-12 13 14 15 16 17 18 19 <...>
> >>
> >>
> >> This is a fragmented ICMPv6 packet.  The first fragment.
> >>

Thanks for the explanation. I have added a check for extension headers to mimic the scalar
behaviour in the next version. 
 
> >> I'm also wondering why CI didn't catch that...
> >>
> >> There might be 2 reasons:
> >>
> >> 1. Actions autovalidator is not enabled in CI, or 2. CI system
> >> doesn't have avx512vbmi.
> >>
> >> Michael, could you check that?
> >
> > Hi Ilya,
> > The CI system does have avx512vbmi, however, the actions autovalidator is
> never enabled for any of the tests.
> >
> > I could add a test to configure with the actions autovalidator if you think
> this would be a good value add for the CI?
> 
> I would suggest doing a run with and without all the avx512 auto validators
> enabled at compile time.
>
<snip>
Phelan, Michael Nov. 30, 2022, 3:27 p.m. UTC | #9
> -----Original Message-----
> From: Finn, Emma <emma.finn@intel.com>
> Sent: Wednesday 30 November 2022 14:15
> To: Eelco Chaudron <echaudro@redhat.com>; Phelan, Michael
> <michael.phelan@intel.com>
> Cc: Ilya Maximets <i.maximets@ovn.org>; dev@openvswitch.org; Van
> Haaren, Harry <harry.van.haaren@intel.com>; Stokes, Ian
> <ian.stokes@intel.com>
> Subject: RE: [ovs-dev] [v5] odp-execute: Add ISA implementation of
> set_masked IPv6 action
<snip> 
> > >> I'm also wondering why CI didn't catch that...
> > >>
> > >> There might be 2 reasons:
> > >>
> > >> 1. Actions autovalidator is not enabled in CI, or 2. CI system
> > >> doesn't have avx512vbmi.
> > >>
> > >> Michael, could you check that?
> > >
> > > Hi Ilya,
> > > The CI system does have avx512vbmi, however, the actions
> > > autovalidator is
> > never enabled for any of the tests.
> > >
> > > I could add a test to configure with the actions autovalidator if
> > > you think
> > this would be a good value add for the CI?
> >
> > I would suggest doing a run with and without all the avx512 auto
> > validators enabled at compile time.
> >
Hi Eelco,
I believe make check-local is run through the GitHub Build and Test job, Aaron you might correct me if I'm wrong on that. 
If this is the case then is there a need to do a check without AVX512 enabled on the Intel CI?

Kind Regards,
Michael.
> <snip>
Aaron Conole Nov. 30, 2022, 7:50 p.m. UTC | #10
"Phelan, Michael" <michael.phelan@intel.com> writes:

>> -----Original Message-----
>> From: Finn, Emma <emma.finn@intel.com>
>> Sent: Wednesday 30 November 2022 14:15
>> To: Eelco Chaudron <echaudro@redhat.com>; Phelan, Michael
>> <michael.phelan@intel.com>
>> Cc: Ilya Maximets <i.maximets@ovn.org>; dev@openvswitch.org; Van
>> Haaren, Harry <harry.van.haaren@intel.com>; Stokes, Ian
>> <ian.stokes@intel.com>
>> Subject: RE: [ovs-dev] [v5] odp-execute: Add ISA implementation of
>> set_masked IPv6 action
> <snip> 
>> > >> I'm also wondering why CI didn't catch that...
>> > >>
>> > >> There might be 2 reasons:
>> > >>
>> > >> 1. Actions autovalidator is not enabled in CI, or 2. CI system
>> > >> doesn't have avx512vbmi.
>> > >>
>> > >> Michael, could you check that?
>> > >
>> > > Hi Ilya,
>> > > The CI system does have avx512vbmi, however, the actions
>> > > autovalidator is
>> > never enabled for any of the tests.
>> > >
>> > > I could add a test to configure with the actions autovalidator if
>> > > you think
>> > this would be a good value add for the CI?
>> >
>> > I would suggest doing a run with and without all the avx512 auto
>> > validators enabled at compile time.
>> >
> Hi Eelco,
> I believe make check-local is run through the GitHub Build and Test
> job, Aaron you might correct me if I'm wrong on that.

That job does run 'make check' and I think it is the same thing.

> If this is the case then is there a need to do a check without AVX512 enabled on the Intel CI?

I am not sure what the case is that isn't covered.  Maybe Eelco has a
thought?

> Kind Regards,
> Michael.
>> <snip>
Eelco Chaudron Dec. 1, 2022, 8:23 a.m. UTC | #11
On 30 Nov 2022, at 20:50, Aaron Conole wrote:

> "Phelan, Michael" <michael.phelan@intel.com> writes:
>
>>> -----Original Message-----
>>> From: Finn, Emma <emma.finn@intel.com>
>>> Sent: Wednesday 30 November 2022 14:15
>>> To: Eelco Chaudron <echaudro@redhat.com>; Phelan, Michael
>>> <michael.phelan@intel.com>
>>> Cc: Ilya Maximets <i.maximets@ovn.org>; dev@openvswitch.org; Van
>>> Haaren, Harry <harry.van.haaren@intel.com>; Stokes, Ian
>>> <ian.stokes@intel.com>
>>> Subject: RE: [ovs-dev] [v5] odp-execute: Add ISA implementation of
>>> set_masked IPv6 action
>> <snip>
>>>>>> I'm also wondering why CI didn't catch that...
>>>>>>
>>>>>> There might be 2 reasons:
>>>>>>
>>>>>> 1. Actions autovalidator is not enabled in CI, or 2. CI system
>>>>>> doesn't have avx512vbmi.
>>>>>>
>>>>>> Michael, could you check that?
>>>>>
>>>>> Hi Ilya,
>>>>> The CI system does have avx512vbmi, however, the actions
>>>>> autovalidator is
>>>> never enabled for any of the tests.
>>>>>
>>>>> I could add a test to configure with the actions autovalidator if
>>>>> you think
>>>> this would be a good value add for the CI?
>>>>
>>>> I would suggest doing a run with and without all the avx512 auto
>>>> validators enabled at compile time.
>>>>
>> Hi Eelco,
>> I believe make check-local is run through the GitHub Build and Test
>> job, Aaron you might correct me if I'm wrong on that.
>
> That job does run 'make check' and I think it is the same thing.
>
>> If this is the case then is there a need to do a check without AVX512 enabled on the Intel CI?
>
> I am not sure what the case is that isn't covered.  Maybe Eelco has a
> thought?

I was referring to that you should build with the following configuration options:

  --enable-actions-default-autovalidator
  --enable-autovalidator
  --enable-mfex-default-autovalidator

And then on top of this run the following checks, which include datapaths, so all AVX stuff gets tested:

  make check
  make check-kernel
  make check-system-userspace
  make check-afxdp

Also include re-runs of failed tests to avoid false positives.

Cheers,

Eelco
Phelan, Michael Dec. 2, 2022, 10:58 a.m. UTC | #12
> -----Original Message-----
> From: Eelco Chaudron <echaudro@redhat.com>
> Sent: Thursday 1 December 2022 08:23
> To: Aaron Conole <aconole@redhat.com>
> Cc: Phelan, Michael <michael.phelan@intel.com>; Finn, Emma
> <emma.finn@intel.com>; Ilya Maximets <i.maximets@ovn.org>;
> dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>;
> Stokes, Ian <ian.stokes@intel.com>
> Subject: Re: [ovs-dev] [v5] odp-execute: Add ISA implementation of set_masked
> IPv6 action
> 
> 
> 
> On 30 Nov 2022, at 20:50, Aaron Conole wrote:
> 
> > "Phelan, Michael" <michael.phelan@intel.com> writes:
> >
> >>> -----Original Message-----
> >>> From: Finn, Emma <emma.finn@intel.com>
> >>> Sent: Wednesday 30 November 2022 14:15
> >>> To: Eelco Chaudron <echaudro@redhat.com>; Phelan, Michael
> >>> <michael.phelan@intel.com>
> >>> Cc: Ilya Maximets <i.maximets@ovn.org>; dev@openvswitch.org; Van
> >>> Haaren, Harry <harry.van.haaren@intel.com>; Stokes, Ian
> >>> <ian.stokes@intel.com>
> >>> Subject: RE: [ovs-dev] [v5] odp-execute: Add ISA implementation of
> >>> set_masked IPv6 action
> >> <snip>
> >>>>>> I'm also wondering why CI didn't catch that...
> >>>>>>
> >>>>>> There might be 2 reasons:
> >>>>>>
> >>>>>> 1. Actions autovalidator is not enabled in CI, or 2. CI system
> >>>>>> doesn't have avx512vbmi.
> >>>>>>
> >>>>>> Michael, could you check that?
> >>>>>
> >>>>> Hi Ilya,
> >>>>> The CI system does have avx512vbmi, however, the actions
> >>>>> autovalidator is
> >>>> never enabled for any of the tests.
> >>>>>
> >>>>> I could add a test to configure with the actions autovalidator if
> >>>>> you think
> >>>> this would be a good value add for the CI?
> >>>>
> >>>> I would suggest doing a run with and without all the avx512 auto
> >>>> validators enabled at compile time.
> >>>>
> >> Hi Eelco,
> >> I believe make check-local is run through the GitHub Build and Test
> >> job, Aaron you might correct me if I'm wrong on that.
> >
> > That job does run 'make check' and I think it is the same thing.
> >
> >> If this is the case then is there a need to do a check without AVX512 enabled
> on the Intel CI?
> >
> > I am not sure what the case is that isn't covered.  Maybe Eelco has a
> > thought?
> 
> I was referring to that you should build with the following configuration options:
> 
>   --enable-actions-default-autovalidator
>   --enable-autovalidator
>   --enable-mfex-default-autovalidator
> 
> And then on top of this run the following checks, which include datapaths, so all
> AVX stuff gets tested:
> 
>   make check
>   make check-kernel
>   make check-system-userspace
>   make check-afxdp
> 
> Also include re-runs of failed tests to avoid false positives.
Sure, I can add make check to the list of tests run on the CI. Make check-system-userspace is already tested on all patches. I'll also add the recheck flag so that we avoid any false positives.

I don't think make check-kernel or make check-afxdp are affected by AVX512 implementations but I may be wrong so feel free to correct me on that. If that is the case then I think they should be tested somewhere else.

I have also added a new job to test the actions autovalidator in the same way as DPCLS, DPIF and MFEX.

Thanks,
Michael.
> 
> Cheers,
> 
> Eelco
>
Eelco Chaudron Dec. 2, 2022, 11:08 a.m. UTC | #13
On 2 Dec 2022, at 11:58, Phelan, Michael wrote:

>> -----Original Message-----
>> From: Eelco Chaudron <echaudro@redhat.com>
>> Sent: Thursday 1 December 2022 08:23
>> To: Aaron Conole <aconole@redhat.com>
>> Cc: Phelan, Michael <michael.phelan@intel.com>; Finn, Emma
>> <emma.finn@intel.com>; Ilya Maximets <i.maximets@ovn.org>;
>> dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>;
>> Stokes, Ian <ian.stokes@intel.com>
>> Subject: Re: [ovs-dev] [v5] odp-execute: Add ISA implementation of set_masked
>> IPv6 action
>>
>>
>>
>> On 30 Nov 2022, at 20:50, Aaron Conole wrote:
>>
>>> "Phelan, Michael" <michael.phelan@intel.com> writes:
>>>
>>>>> -----Original Message-----
>>>>> From: Finn, Emma <emma.finn@intel.com>
>>>>> Sent: Wednesday 30 November 2022 14:15
>>>>> To: Eelco Chaudron <echaudro@redhat.com>; Phelan, Michael
>>>>> <michael.phelan@intel.com>
>>>>> Cc: Ilya Maximets <i.maximets@ovn.org>; dev@openvswitch.org; Van
>>>>> Haaren, Harry <harry.van.haaren@intel.com>; Stokes, Ian
>>>>> <ian.stokes@intel.com>
>>>>> Subject: RE: [ovs-dev] [v5] odp-execute: Add ISA implementation of
>>>>> set_masked IPv6 action
>>>> <snip>
>>>>>>>> I'm also wondering why CI didn't catch that...
>>>>>>>>
>>>>>>>> There might be 2 reasons:
>>>>>>>>
>>>>>>>> 1. Actions autovalidator is not enabled in CI, or 2. CI system
>>>>>>>> doesn't have avx512vbmi.
>>>>>>>>
>>>>>>>> Michael, could you check that?
>>>>>>>
>>>>>>> Hi Ilya,
>>>>>>> The CI system does have avx512vbmi, however, the actions
>>>>>>> autovalidator is
>>>>>> never enabled for any of the tests.
>>>>>>>
>>>>>>> I could add a test to configure with the actions autovalidator if
>>>>>>> you think
>>>>>> this would be a good value add for the CI?
>>>>>>
>>>>>> I would suggest doing a run with and without all the avx512 auto
>>>>>> validators enabled at compile time.
>>>>>>
>>>> Hi Eelco,
>>>> I believe make check-local is run through the GitHub Build and Test
>>>> job, Aaron you might correct me if I'm wrong on that.
>>>
>>> That job does run 'make check' and I think it is the same thing.
>>>
>>>> If this is the case then is there a need to do a check without AVX512 enabled
>> on the Intel CI?
>>>
>>> I am not sure what the case is that isn't covered.  Maybe Eelco has a
>>> thought?
>>
>> I was referring to that you should build with the following configuration options:
>>
>>   --enable-actions-default-autovalidator
>>   --enable-autovalidator
>>   --enable-mfex-default-autovalidator
>>
>> And then on top of this run the following checks, which include datapaths, so all
>> AVX stuff gets tested:
>>
>>   make check
>>   make check-kernel
>>   make check-system-userspace
>>   make check-afxdp
>>
>> Also include re-runs of failed tests to avoid false positives.
> Sure, I can add make check to the list of tests run on the CI. Make check-system-userspace is already tested on all patches. I'll also add the recheck flag so that we avoid any false positives.
>
> I don't think make check-kernel or make check-afxdp are affected by AVX512 implementations but I may be wrong so feel free to correct me on that. If that is the case then I think they should be tested somewhere else.

So “make check-kernel” should not be affected, as there are no changes to the datapath.  I thought it would just be nice for some external entity to run all the datapaths ;)

The check-afxdp is affected, as it uses the same userspace AVX API, so I think it should add it.

> I have also added a new job to test the actions autovalidator in the same way as DPCLS, DPIF and MFEX.
>
> Thanks,
> Michael.
>>
>> Cheers,
>>
>> Eelco
>>
diff mbox series

Patch

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 6c7713251..df0b31ffd 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -20,6 +20,9 @@ 
 
 #include <config.h>
 #include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
 
 #include "csum.h"
 #include "dp-packet.h"
@@ -75,6 +78,26 @@  BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
                   MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
                   offsetof(struct ovs_key_ipv4, ipv4_ttl));
 
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_src) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_src) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_dst) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_dst) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_label));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_label) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_label) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_proto) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_proto) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_tclass));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_tclass) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_tclass) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_hlimit));
+
 /* Array of callback functions, one for each masked operation. */
 odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX];
 
@@ -483,6 +506,180 @@  action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
     }
 }
 
+#if HAVE_AVX512VBMI
+static inline uint16_t ALWAYS_INLINE
+__attribute__((__target__("avx512vbmi")))
+avx512_ipv6_sum_header(__m512i ip6_header)
+{
+    __m256i v_zeros = _mm256_setzero_si256();
+    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
+                                               0xFF, 0xFF, 0xFF, 0xFF);
+
+    /* Shuffle ip6 src and dst to beginning of register. */
+    __m512i v_ip6_hdr_shuf = _mm512_permutexvar_epi64(v_shuf_src_dst,
+                                                      ip6_header);
+
+    /* Extract ip6 src and dst into smaller 256-bit wide register. */
+    __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_ip6_hdr_shuf, 0);
+
+    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
+     * src and dst fields and add padding after each 16-bit value for the
+     * following carry over addition. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
+                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
+                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
+                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
+                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
+    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a);
+    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b);
+
+    /* Add each part of the old and new headers together. */
+    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final
+     * horizontal add. */
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
+
+    /* Extract delta value. */
+    return _mm256_extract_epi16(v_delta, 0);
+}
+
+static inline uint16_t ALWAYS_INLINE
+__attribute__((__target__("avx512vbmi")))
+avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
+{
+    uint16_t old_delta = avx512_ipv6_sum_header(old_header);
+    uint16_t new_delta = avx512_ipv6_sum_header(new_header);
+    uint32_t csum_delta = ((uint16_t)~old_delta) + new_delta;
+
+    return  ~csum_finish(csum_delta);
+}
+
+/* This function performs the same operation on each packet in the batch as
+ * the scalar odp_set_ipv6() function. */
+static void
+__attribute__((__target__("avx512vbmi")))
+action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
+                             const struct nlattr *a)
+{
+    const struct ovs_key_ipv6 *key, *mask;
+    struct dp_packet *packet;
+
+    a = nl_attr_get(a);
+    key = nl_attr_get(a);
+    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
+
+    /* Read the content of the key and mask in the respective registers. We
+     * only load the size of the actual structure, which is only 40 bytes. */
+    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
+    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
+
+    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
+     * ip6_hdr structure layout. */
+    static const uint8_t ip_shuffle_mask[64] = {
+            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
+    };
+
+    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
+
+    /* This shuffle is required for key and mask to match the layout of the
+     * ip6_hdr struct. */
+    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
+    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
+
+    /* Set the v_zero register to all zero's. */
+    const __m128i v_zeros = _mm_setzero_si128();
+
+    /* Set the v_all_ones register to all one's. */
+    const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
+
+    /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
+    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
+    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
+
+    /* Perform a bitwise OR between src and dst registers. */
+    __m128i v_or = _mm_or_si128(v_src, v_dst);
+
+    /* Will return true if any bit has been set in v_or, else it will return
+     * false. */
+    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
+
+        /* Load the 40 bytes of the IPv6 header. */
+        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
+
+        /* AND the v_pkt_mask to the packet data (v_packet). */
+        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
+
+        /* OR the new addresses (v_key_shuf) with the masked packet addresses
+         * (v_pkt_masked). */
+        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
+
+        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
+         * be updated. */
+        if (do_checksum) {
+            uint8_t proto = nh->ip6_nxt;
+            uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
+                                                                  v_new_hdr);
+
+            if (proto == IPPROTO_UDP) {
+                struct udp_header *uh = dp_packet_l4(packet);
+
+                if (uh->udp_csum) {
+                    uint16_t old_udp_checksum = ~uh->udp_csum;
+                    uint32_t udp_checksum = old_udp_checksum + delta_checksum;
+
+                    udp_checksum = csum_finish(udp_checksum);
+
+                    if (!udp_checksum) {
+                        udp_checksum = htons(0xffff);
+                    }
+
+                    uh->udp_csum = udp_checksum;
+                }
+            } else if (proto == IPPROTO_TCP) {
+                struct tcp_header *th = dp_packet_l4(packet);
+                uint16_t old_tcp_checksum = ~th->tcp_csum;
+                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
+
+                tcp_checksum = csum_finish(tcp_checksum);
+                th->tcp_csum = tcp_checksum;
+            } else if (proto == IPPROTO_ICMPV6) {
+                struct icmp6_header *icmp = dp_packet_l4(packet);
+                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
+                uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum;
+
+                icmp6_checksum = csum_finish(icmp6_checksum);
+                icmp->icmp6_cksum = icmp6_checksum;
+            }
+        }
+        /* Write back the modified IPv6 addresses. */
+         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
+    }
+}
+#endif /* HAVE_AVX512VBMI */
+
 static void
 action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
 {
@@ -514,6 +711,13 @@  action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
     impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs;
     impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
 
+#if HAVE_AVX512VBMI
+    if (action_avx512vbmi_isa_probe()) {
+        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
+                              action_avx512_ipv6_set_addrs;
+    }
+#endif
+
     return 0;
 }
 
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index f80ae5a23..8b86b1e4f 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -60,6 +60,23 @@  action_avx512_isa_probe(void)
 
 #endif
 
+#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
+bool
+action_avx512vbmi_isa_probe(void)
+{
+    if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
+        return true;
+    }
+    return false;
+}
+#else
+bool
+action_avx512vbmi_isa_probe(void)
+{
+    return false;
+}
+#endif
+
 static struct odp_execute_action_impl action_impls[] = {
     [ACTION_IMPL_AUTOVALIDATOR] = {
         .available = false,
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 940180c99..643f41c2a 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -78,6 +78,7 @@  BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
 #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
 
 bool action_avx512_isa_probe(void);
+bool action_avx512vbmi_isa_probe(void);
 
 /* Odp execute init handles setting up the state of the actions functions at
  * initialization time. It cannot return errors, as it must always succeed in