diff mbox series

[ovs-dev,v4,9/9] odp-execute: Add ISA implementation of push_vlan action.

Message ID 20220105165349.3447695-10-emma.finn@intel.com
State Superseded
Headers show
Series Actions Infrastructure + Optimizations | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed

Commit Message

Finn, Emma Jan. 5, 2022, 4:53 p.m. UTC
This commit adds the AVX512 implementation of the push_vlan action.
The implementation here is auto-validated by the miniflow
extract autovalidator, hence its correctness can be easily
tested and verified.

Signed-off-by: Emma Finn <emma.finn@intel.com>
---
 NEWS                      |  1 +
 lib/odp-execute-avx512.c  | 74 ++++++++++++++++++++++++++++++++++-----
 lib/odp-execute-private.c |  1 +
 lib/odp-execute.c         | 24 ++++++++-----
 4 files changed, 83 insertions(+), 17 deletions(-)

Comments

Van Haaren, Harry Jan. 6, 2022, 1:11 p.m. UTC | #1
> -----Original Message-----
> From: Finn, Emma <emma.finn@intel.com>
> Sent: Wednesday, January 5, 2022 4:54 PM
> To: dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>;
> Amber, Kumar <kumar.amber@intel.com>
> Cc: Finn, Emma <emma.finn@intel.com>
> Subject: [PATCH v4 9/9] odp-execute: Add ISA implementation of push_vlan
> action.
> 
> This commit adds the AVX512 implementation of the push_vlan action.
> The implementation here is auto-validated by the miniflow
> extract autovalidator, hence its correctness can be easily
> tested and verified.
> 
> Signed-off-by: Emma Finn <emma.finn@intel.com>

Some comments below;

> +enum OPERATION {
> +    OP_ADD,
> +    OP_SUB,
> +};
> +
>  static inline void ALWAYS_INLINE
> -avx512_dp_packet_resize_l2(struct dp_packet *b, int increment)
> +avx512_dp_packet_resize_l2(struct dp_packet *b, int increment,
> +                           enum OPERATION op)

I like the more generic nature of the function here (with OP introduction)
as per internal reviews, however there might be more improvements possible.
A few suggestions & explanations below.

>  {
>      /* update packet size/data pointers */
> +    if (increment >= 0) {
> +        dp_packet_prealloc_headroom(b, increment);
> +    } else {
> +        ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >= -increment);
> +    }
> +
>      dp_packet_set_data(b, (char *) dp_packet_data(b) - increment);
>      dp_packet_set_size(b, dp_packet_size(b) + increment);
> 
> @@ -50,9 +62,9 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int
> increment)
>      const __m128i v_zeros = _mm_setzero_si128();
>      const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
> 
> -    /* Only these lanes can be incremented for push-VLAN action. */
> +    /* Only these lanes can be incremented/decremented for L2. */
>      const uint8_t k_lanes = 0b1110;
> -    __m128i v_offset = _mm_set1_epi16(VLAN_HEADER_LEN);
> +    __m128i v_offset = _mm_set1_epi16(abs(increment));
> 
>      /* Load packet and compare with UINT16_MAX */
>      void *adjust_ptr = &b->l2_pad_size;
> @@ -60,9 +72,19 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int
> increment)
>      __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes,
> v_adjust_src,
>                                                      v_u16_max);
> 
> -    /* Add VLAN_HEADER_LEN using compare mask, store results. */
> -    __m128i v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp,
> -                                              v_adjust_src, v_offset);
> +    /* Update VLAN_HEADER_LEN using compare mask, store results. */
> +    __m128i v_adjust_wip;
> +    switch (op) {
> +    case OP_ADD:
> +        v_adjust_wip = _mm_mask_add_epi16(v_adjust_src, k_cmp,
> +                                            v_adjust_src, v_offset);
> +        break;
> +    case OP_SUB:
> +        v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp,
> +                                            v_adjust_src, v_offset);
> +        break;
> +    }
> +
>      _mm_storeu_si128(adjust_ptr, v_adjust_wip);
> 
>  }
> @@ -79,8 +101,7 @@ avx512_eth_pop_vlan(struct dp_packet *packet)
>          __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(),
>                                              16 - VLAN_HEADER_LEN);
>          _mm_storeu_si128((void *) veh, v_realign);
> -        avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
> -
> +        avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN, OP_SUB);
>      }
>  }
> 
> @@ -96,6 +117,42 @@ action_avx512_pop_vlan(void *dp OVS_UNUSED, struct
> dp_packet_batch *batch,
>      }
>  }
> 
> +static inline void ALWAYS_INLINE
> +avx512_eth_push_vlan(struct dp_packet *packet, ovs_be16 tpid, ovs_be16 tci)
> +{
> +    avx512_dp_packet_resize_l2(packet, VLAN_HEADER_LEN, OP_ADD);
> +
> +    /* Build up the VLAN TCI/TPID, and merge with the moving of Ether. */
> +    char *new_pkt_data = (char *) dp_packet_data(packet);

Improve variable name to just "pkt_data". 

> +
> +    const uint16_t tci_proc = tci & htons(~VLAN_CFI);
> +    const uint32_t tpid_tci = (tci_proc << 16) | tpid;
> +    __m128i v_ether = _mm_loadu_si128((void *) new_pkt_data);

Move the load of the v_ether down to below where the rest of the SIMD code is,
to keep it all close together. Prefer to have static initialization of const data above
the SIMD code to keep it "out of the way" when reviewing the SIMD operations.

> +    static const uint8_t vlan_push_shuffle_mask[16] = {
> +    4, 5, 6, 7, 8, 9, 10, 11,
> +    12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF
> +    };

For "SIMD interested folks", this shuffle moves the data in the register "across"
by 4 bytes (as the indexes for 0 = 4, 1=5, etc to 11=15). The last 4 bytes in the
register are explicitly zeroed (MSB set, but 0xFF is the normal way of indicating that).

> +
> +    __m128i v_index = _mm_loadu_si128((void *) vlan_push_shuffle_mask);
> +    __m128i v_shift = _mm_shuffle_epi8(v_ether, v_index);
> +    __m128i v_vlan_hdr = _mm_insert_epi32(v_shift, tpid_tci, 3);

The "VLAN insertion" happens here where the v_shift gets the GPR tpid_tci inserted.

> +     _mm_storeu_si128((void *) new_pkt_data, v_vlan_hdr);

Then the updated packet data is stored back.
diff mbox series

Patch

diff --git a/NEWS b/NEWS
index f5032bdd0..af1759685 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,7 @@  Post-v2.16.0
        actions implementation at build time.
      * Add AVX512 implementation of actions.
      * Add support for an AVX512 optimized version of pop_vlan action.
+     * Add support for an AVX512 optimized version of push_vlan action.
    - Python:
      * For SSL support, the use of the pyOpenSSL library has been replaced
        with the native 'ssl' module.
diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 8bbfd5203..3d850026a 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -39,10 +39,22 @@  BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
                            MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
                            offsetof(struct dp_packet, l4_ofs));
 
+enum OPERATION {
+    OP_ADD,
+    OP_SUB,
+};
+
 static inline void ALWAYS_INLINE
-avx512_dp_packet_resize_l2(struct dp_packet *b, int increment)
+avx512_dp_packet_resize_l2(struct dp_packet *b, int increment,
+                           enum OPERATION op)
 {
     /* update packet size/data pointers */
+    if (increment >= 0) {
+        dp_packet_prealloc_headroom(b, increment);
+    } else {
+        ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >= -increment);
+    }
+
     dp_packet_set_data(b, (char *) dp_packet_data(b) - increment);
     dp_packet_set_size(b, dp_packet_size(b) + increment);
 
@@ -50,9 +62,9 @@  avx512_dp_packet_resize_l2(struct dp_packet *b, int increment)
     const __m128i v_zeros = _mm_setzero_si128();
     const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
 
-    /* Only these lanes can be incremented for push-VLAN action. */
+    /* Only these lanes can be incremented/decremented for L2. */
     const uint8_t k_lanes = 0b1110;
-    __m128i v_offset = _mm_set1_epi16(VLAN_HEADER_LEN);
+    __m128i v_offset = _mm_set1_epi16(abs(increment));
 
     /* Load packet and compare with UINT16_MAX */
     void *adjust_ptr = &b->l2_pad_size;
@@ -60,9 +72,19 @@  avx512_dp_packet_resize_l2(struct dp_packet *b, int increment)
     __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
                                                     v_u16_max);
 
-    /* Add VLAN_HEADER_LEN using compare mask, store results. */
-    __m128i v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp,
-                                              v_adjust_src, v_offset);
+    /* Update VLAN_HEADER_LEN using compare mask, store results. */
+    __m128i v_adjust_wip;
+    switch (op) {
+    case OP_ADD:
+        v_adjust_wip = _mm_mask_add_epi16(v_adjust_src, k_cmp,
+                                            v_adjust_src, v_offset);
+        break;
+    case OP_SUB:
+        v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp,
+                                            v_adjust_src, v_offset);
+        break;
+    }
+
     _mm_storeu_si128(adjust_ptr, v_adjust_wip);
 
 }
@@ -79,8 +101,7 @@  avx512_eth_pop_vlan(struct dp_packet *packet)
         __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(),
                                             16 - VLAN_HEADER_LEN);
         _mm_storeu_si128((void *) veh, v_realign);
-        avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
-
+        avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN, OP_SUB);
     }
 }
 
@@ -96,6 +117,42 @@  action_avx512_pop_vlan(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
     }
 }
 
+static inline void ALWAYS_INLINE
+avx512_eth_push_vlan(struct dp_packet *packet, ovs_be16 tpid, ovs_be16 tci)
+{
+    avx512_dp_packet_resize_l2(packet, VLAN_HEADER_LEN, OP_ADD);
+
+    /* Build up the VLAN TCI/TPID, and merge with the moving of Ether. */
+    char *new_pkt_data = (char *) dp_packet_data(packet);
+
+    const uint16_t tci_proc = tci & htons(~VLAN_CFI);
+    const uint32_t tpid_tci = (tci_proc << 16) | tpid;
+    __m128i v_ether = _mm_loadu_si128((void *) new_pkt_data);
+
+    static const uint8_t vlan_push_shuffle_mask[16] = {
+    4, 5, 6, 7, 8, 9, 10, 11,
+    12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF
+    };
+
+    __m128i v_index = _mm_loadu_si128((void *) vlan_push_shuffle_mask);
+    __m128i v_shift = _mm_shuffle_epi8(v_ether, v_index);
+    __m128i v_vlan_hdr = _mm_insert_epi32(v_shift, tpid_tci, 3);
+     _mm_storeu_si128((void *) new_pkt_data, v_vlan_hdr);
+}
+
+static void
+action_avx512_push_vlan(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
+                       const struct nlattr *a,
+                       bool should_steal OVS_UNUSED)
+{
+    struct dp_packet *packet;
+    const struct ovs_action_push_vlan *vlan = nl_attr_get(a);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        avx512_eth_push_vlan(packet, vlan->vlan_tpid, vlan->vlan_tci);
+    }
+}
+
 /* Probe functions to check ISA requirements. */
 static int32_t
 avx512_isa_probe(uint32_t needs_vbmi)
@@ -136,6 +193,7 @@  action_avx512_init(struct odp_execute_action_impl *self)
 {
     avx512_isa_probe(0);
     self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
+    self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_avx512_push_vlan;
 
     return 0;
 }
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index 7c58d90d2..0bff66b50 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -218,6 +218,7 @@  int32_t
 action_autoval_init(struct odp_execute_action_impl *self)
 {
     self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_autoval_generic;
+    self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_autoval_generic;
 
     return 0;
 }
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index 1bc9fae09..40f71fa96 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -842,6 +842,19 @@  action_pop_vlan(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
     }
 }
 
+static void
+action_push_vlan(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
+                const struct nlattr *a OVS_UNUSED,
+                bool should_steal OVS_UNUSED)
+{
+    struct dp_packet *packet;
+    const struct ovs_action_push_vlan *vlan = nl_attr_get(a);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        eth_push_vlan(packet, vlan->vlan_tpid, vlan->vlan_tci);
+    }
+}
+
 /* Implementation of the scalar actions impl init function. Build up the
  * array of func ptrs here.
  */
@@ -849,6 +862,7 @@  int32_t
 odp_action_scalar_init(struct odp_execute_action_impl *self)
 {
     self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_pop_vlan;
+    self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan;
 
     return 0;
 }
@@ -991,15 +1005,6 @@  odp_execute_actions(void *dp, struct dp_packet_batch *batch, bool steal,
             break;
         }
 
-        case OVS_ACTION_ATTR_PUSH_VLAN: {
-            const struct ovs_action_push_vlan *vlan = nl_attr_get(a);
-
-            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
-                eth_push_vlan(packet, vlan->vlan_tpid, vlan->vlan_tci);
-            }
-            break;
-        }
-
         case OVS_ACTION_ATTR_PUSH_MPLS: {
             const struct ovs_action_push_mpls *mpls = nl_attr_get(a);
 
@@ -1133,6 +1138,7 @@  odp_execute_actions(void *dp, struct dp_packet_batch *batch, bool steal,
         case OVS_ACTION_ATTR_OUTPUT:
         case OVS_ACTION_ATTR_LB_OUTPUT:
         case OVS_ACTION_ATTR_POP_VLAN:
+        case OVS_ACTION_ATTR_PUSH_VLAN:
         case OVS_ACTION_ATTR_TUNNEL_PUSH:
         case OVS_ACTION_ATTR_TUNNEL_POP:
         case OVS_ACTION_ATTR_USERSPACE: