diff mbox series

[ovs-dev,v6,08/11] odp-execute: Add ISA implementation of pop_vlan action.

Message ID 20220510142202.1087967-9-emma.finn@intel.com
State Changes Requested
Headers show
Series Actions Infrastructure + Optimizations | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Finn, Emma May 10, 2022, 2:21 p.m. UTC
This commit adds the AVX512 implementation of the
pop_vlan action.

Signed-off-by: Emma Finn <emma.finn@intel.com>
---
 lib/odp-execute-avx512.c  | 73 ++++++++++++++++++++++++++++++++++++++-
 lib/odp-execute-private.c |  2 +-
 lib/odp-execute-private.h |  2 +-
 3 files changed, 74 insertions(+), 3 deletions(-)

Comments

Eelco Chaudron June 2, 2022, 2:35 p.m. UTC | #1
On 10 May 2022, at 16:21, Emma Finn wrote:

> This commit adds the AVX512 implementation of the
> pop_vlan action.
>
> Signed-off-by: Emma Finn <emma.finn@intel.com>
> ---
>  lib/odp-execute-avx512.c  | 73 ++++++++++++++++++++++++++++++++++++++-
>  lib/odp-execute-private.c |  2 +-
>  lib/odp-execute-private.h |  2 +-
>  3 files changed, 74 insertions(+), 3 deletions(-)
>
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index 84f68d378..637956236 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -14,6 +14,11 @@
>   * limitations under the License.
>   */
>
> +#ifdef __x86_64__
> +/* Sparse cannot handle the AVX512 instructions. */
> +#if !defined(__CHECKER__)
> +
> +
>  #include <config.h>
>  #include <errno.h>
>
> @@ -24,6 +29,67 @@
>  #include "odp-netlink.h"
>  #include "openvswitch/vlog.h"
>
> +VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);

> +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
> +                  MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
> +                  offsetof(struct dp_packet, l3_ofs));
> +
> +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
> +                           MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
> +                           offsetof(struct dp_packet, l4_ofs));

Can you add some detail on why the above to asserts are needed?
This will help people trying to understand why their changes will break AVX512 (and maybe how to solve it).

Also should we not verify all offsets starting from l2_pad_size,

> +
> +static inline void ALWAYS_INLINE
> +avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
> +{
> +    /* Update packet size/data pointers */
> +    dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
> +    dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);

Here you only handle the pull case, what about push? This is the original code
for the normal dp:

486      if (increment >= 0) {
487          dp_packet_push_uninit(b, increment);
488      } else {
489          dp_packet_pull(b, -increment);
490      }

I think you should be complete here because some one might be using this
function in the future for pull.

I do notice you include something in a later patch, however, I would prefer
to keep the code the same as above (it's all inlines anyway), and do it in
this patch.

> +
> +    /* Increment u16 packet offset values */

This comment makes no sense to me, we are initialising variables to all 0s and 1s?

> +    const __m128i v_zeros = _mm_setzero_si128();
> +    const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
> +
> +    /* Only these lanes can be incremented/decremented for L2. */


Can we be more clear, i.e. we want to update l2_5_ofs, l3_ofs, and l4_ofs?

I think these comments need to be re-written in such a way that a none AVX person can understand what happens without having to reach out to Intel's Intrinsics Guide.

> +    const uint8_t k_lanes = 0b1110;
> +    __m128i v_offset = _mm_set1_epi16(VLAN_HEADER_LEN);

Why is there this static VLAN_HEADER_LEN? Should this not be resize_by_bytes?

> +
> +    /* Load packet and compare with UINT16_MAX */

This only loads the packet data, so the comment should be something like:

/* Load 128 bits from the dp_packet structure starting at the l2_pad_size
 * offset. */

> +    void *adjust_ptr = &b->l2_pad_size;
> +    __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);
> +
> +    /* Generate K mask to use for updating offset values of
> +    * the packet buffer. */
> +    __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
> +                                                    v_u16_max);
> +
> +    /* Update VLAN_HEADER_LEN using compare mask, store results. */
> +    __m128i v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp,
> +                                              v_adjust_src, v_offset);

Here we always assume resize_by_bytes is negative.

> +    _mm_storeu_si128(adjust_ptr, v_adjust_wip);
> +}
> +
> +static void
> +action_avx512_pop_vlan(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
> +                       const struct nlattr *a OVS_UNUSED,
> +                       bool should_steal OVS_UNUSED)
> +{
> +    struct dp_packet *packet;
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct vlan_eth_header *veh = dp_packet_eth(packet);
> +
> +        if (veh && dp_packet_size(packet) >= sizeof *veh &&
> +            eth_type_vlan(veh->veth_type)) {
> +
> +            __m128i v_ether = _mm_loadu_si128((void *) veh);
> +            __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(),
> +                                                16 - VLAN_HEADER_LEN);

Where does 16 come from? Maybe just add a comment for non-AVX folks on how this works (I had to look it up, and if we can avoid this it might help people forced to look at this due to asserts).

> +            _mm_storeu_si128((void *) veh, v_realign);
> +            avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
> +        }
> +    }
> +}
> +
>  /* Probe functions to check ISA requirements. */
>  static int32_t
>  avx512_isa_probe(uint32_t needs_vbmi)
> @@ -60,8 +126,13 @@ action_avx512_probe(void)
>

All the below changes (except for the action_avx512_pop_vlan assignment), should have been part of the previous patch.

>  int32_t
> -action_avx512_init(void)
> +action_avx512_init(struct odp_execute_action_impl *self)
>  {
>      avx512_isa_probe(0);
> +    self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
> +
>      return 0;
>  }
> +
> +#endif
> +#endif
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index 2bfa84152..8257bba80 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -52,7 +52,7 @@ static struct odp_execute_action_impl action_impls[] = {
>          .available = 1,
>          .name = "avx512",
>          .probe = action_avx512_probe,
> -        .init_func = NULL,
> +        .init_func = action_avx512_init,
>      },
>      #endif
>  };
> diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
> index 13fc74e52..231d72492 100644
> --- a/lib/odp-execute-private.h
> +++ b/lib/odp-execute-private.h
> @@ -100,7 +100,7 @@ int32_t odp_execute_action_set(const char *name,
>  int32_t odp_action_scalar_init(struct odp_execute_action_impl *self);
>
>  /* Init function for the optimized with AVX512 actions. */
> -int32_t action_avx512_init(void);
> +int32_t action_avx512_init(struct odp_execute_action_impl *self);
>
>  /* Probe function to check ISA requirements. */
>  int32_t action_avx512_probe(void);
> -- 
> 2.25.1
diff mbox series

Patch

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 84f68d378..637956236 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -14,6 +14,11 @@ 
  * limitations under the License.
  */
 
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions. */
+#if !defined(__CHECKER__)
+
+
 #include <config.h>
 #include <errno.h>
 
@@ -24,6 +29,67 @@ 
 #include "odp-netlink.h"
 #include "openvswitch/vlog.h"
 
+VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
+                  MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
+                  offsetof(struct dp_packet, l3_ofs));
+
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
+                           MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
+                           offsetof(struct dp_packet, l4_ofs));
+
+static inline void ALWAYS_INLINE
+avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
+{
+    /* Update packet size/data pointers */
+    dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
+    dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
+
+    /* Increment u16 packet offset values */
+    const __m128i v_zeros = _mm_setzero_si128();
+    const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
+
+    /* Only these lanes can be incremented/decremented for L2. */
+    const uint8_t k_lanes = 0b1110;
+    __m128i v_offset = _mm_set1_epi16(VLAN_HEADER_LEN);
+
+    /* Load packet and compare with UINT16_MAX */
+    void *adjust_ptr = &b->l2_pad_size;
+    __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);
+
+    /* Generate K mask to use for updating offset values of
+    * the packet buffer. */
+    __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
+                                                    v_u16_max);
+
+    /* Update VLAN_HEADER_LEN using compare mask, store results. */
+    __m128i v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp,
+                                              v_adjust_src, v_offset);
+    _mm_storeu_si128(adjust_ptr, v_adjust_wip);
+}
+
+static void
+action_avx512_pop_vlan(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
+                       const struct nlattr *a OVS_UNUSED,
+                       bool should_steal OVS_UNUSED)
+{
+    struct dp_packet *packet;
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct vlan_eth_header *veh = dp_packet_eth(packet);
+
+        if (veh && dp_packet_size(packet) >= sizeof *veh &&
+            eth_type_vlan(veh->veth_type)) {
+
+            __m128i v_ether = _mm_loadu_si128((void *) veh);
+            __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(),
+                                                16 - VLAN_HEADER_LEN);
+            _mm_storeu_si128((void *) veh, v_realign);
+            avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
+        }
+    }
+}
+
 /* Probe functions to check ISA requirements. */
 static int32_t
 avx512_isa_probe(uint32_t needs_vbmi)
@@ -60,8 +126,13 @@  action_avx512_probe(void)
 
 
 int32_t
-action_avx512_init(void)
+action_avx512_init(struct odp_execute_action_impl *self)
 {
     avx512_isa_probe(0);
+    self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
+
     return 0;
 }
+
+#endif
+#endif
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index 2bfa84152..8257bba80 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -52,7 +52,7 @@  static struct odp_execute_action_impl action_impls[] = {
         .available = 1,
         .name = "avx512",
         .probe = action_avx512_probe,
-        .init_func = NULL,
+        .init_func = action_avx512_init,
     },
     #endif
 };
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 13fc74e52..231d72492 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -100,7 +100,7 @@  int32_t odp_execute_action_set(const char *name,
 int32_t odp_action_scalar_init(struct odp_execute_action_impl *self);
 
 /* Init function for the optimized with AVX512 actions. */
-int32_t action_avx512_init(void);
+int32_t action_avx512_init(struct odp_execute_action_impl *self);
 
 /* Probe function to check ISA requirements. */
 int32_t action_avx512_probe(void);