diff mbox series

[ovs-dev,v7,11/11] odp-execute: Add ISA implementation of set_masked IPv4 action

Message ID 20220614115409.1143269-9-emma.finn@intel.com
State Rejected
Headers show
Series None | expand

Commit Message

Finn, Emma June 14, 2022, 11:54 a.m. UTC
This commit adds support for the AVX512 implementation of the
ipv4_set_addrs action as well as an AVX512 implementation of
updating the checksums.

Signed-off-by: Emma Finn <emma.finn@intel.com>
---
 lib/odp-execute-avx512.c  | 210 ++++++++++++++++++++++++++++++++++++++
 lib/odp-execute-private.c |   1 +
 lib/odp-execute.c         |  19 +++-
 3 files changed, 225 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index ffe25b41d..5cba14b92 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -22,6 +22,7 @@ 
 #include <config.h>
 #include <errno.h>
 
+#include "csum.h"
 #include "cpu.h"
 #include "dp-packet.h"
 #include "immintrin.h"
@@ -193,6 +194,213 @@  action_avx512_eth_set_addrs(struct dp_packet_batch *batch,
     }
 }
 
+/* Calculate delta checksum by summing only ip_src and ip_dst fields of
+ * ip_header. Resulting checksum will be used for updating L4 checksum */
+static inline uint16_t ALWAYS_INLINE
+avx512_l4_update_csum(struct ip_header *old_header, __m256i res)
+{
+    uint16_t tmp_checksum;
+    __m256i v_zeros = _mm256_setzero_si256();
+
+    /* Each field needs to be shuffle into 16- bit granularity and across
+     * lanes. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
+                                          0x0504, 0xffff, 0x0706, 0xffff,
+                                          0x0100, 0xffff, 0x0302, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
+                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    __m256i oh = _mm256_loadu_si256((void *) old_header);
+    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
+    __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
+    __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);
+
+    /* Add field values. */
+    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
+    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
+
+    /* Extract checksum value. */
+    tmp_checksum = _mm256_extract_epi16(v_sum, 0);
+
+    return ~tmp_checksum;
+}
+
+/* Calculate checksum by summing entire contents of ip_header leaving out
+ * current checksum field. */
+static inline uint16_t ALWAYS_INLINE
+avx512_ipv4_recalc_csum(__m256i res)
+{
+    uint32_t new_checksum;
+    __m256i v_zeros = _mm256_setzero_si256();
+
+    /* Each field needs to be shuffle into 16-bit granularity and across
+     * lanes. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
+                                          0x0504, 0xffff, 0x0706, 0xffff,
+                                          0x0100, 0xffff, 0x0302, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
+                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a);
+    __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b);
+
+    /* Add field values. */
+    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
+    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
+
+    /* Extract new checksum value. */
+    new_checksum = _mm256_extract_epi16(v_sum, 0);
+
+    return ~new_checksum;
+}
+
+/* The shuffles used in action_avx512_ipv4_set_addrs() require the ovs_key_ipv4
+ * struct to be in this layout. If struct changes, shuffle mask also needs to
+ * be updated. */
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_tos));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_ttl));
+
+static void
+action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
+                             const struct nlattr *a)
+{
+    a = nl_attr_get(a);
+    const struct ovs_key_ipv4 *key = nl_attr_get(a);
+    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
+    struct dp_packet *packet;
+    ovs_be16 old_csum;
+
+    __m256i v_key = _mm256_loadu_si256((void *) key);
+    __m256i v_mask = _mm256_loadu_si256((void *) mask);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct ip_header *nh = dp_packet_l3(packet);
+        old_csum = nh->ip_csum;
+
+        __m256i v_packet = _mm256_loadu_si256((void *) nh);
+
+        /* Shuffle key and mask to match ip_header struct layout. */
+        static const uint8_t ip_shuffle_mask[32] = {
+            0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+            6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
+            0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+        __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
+                                             0x1, 0xF, 0xF, 0xF);
+
+        __m256i v_shuffle = _mm256_loadu_si256((void *) ip_shuffle_mask);
+
+        /* Two shuffles are required for key and mask to match the layout of
+         * the ip_header struct. The _shuffle_epi8 only works within 128-bit
+         * lanes, so a permute is required to move src and dst into the correct
+         * lanes. And then a shuffle is used to move the fields into the right
+         * order.
+         */
+        __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32, v_key);
+        v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle);
+
+        __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32, v_mask);
+        v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);
+
+        __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf, v_packet);
+        __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);
+
+        /* Recalculate the ip_csum based on updated values. */
+        uint16_t checksum = avx512_ipv4_recalc_csum(v_res);
+
+        /* Insert new checksum. */
+        v_res = _mm256_insert_epi16(v_res, checksum, 5);
+
+       /* If ip_src or ip_dst has been modified, L4 checksum needs to
+        * be updated too. */
+        int update_mask = _mm256_movemask_epi8(v_mask);
+        if (update_mask & 0xFF) {
+
+            uint16_t tmp_checksum = avx512_l4_update_csum(nh, v_res);
+            tmp_checksum = ~tmp_checksum;
+            uint16_t csum;
+
+            if (nh->ip_proto == IPPROTO_UDP) {
+                /* New UDP checksum. */
+                struct udp_header *uh = dp_packet_l4(packet);
+                if (!uh->udp_csum) {
+                    uh->udp_csum = htons(0xffff);
+                } else {
+                    uint16_t old_udp_checksum = ~uh->udp_csum;
+
+                    uint32_t udp_checksum = old_csum + tmp_checksum;
+                    udp_checksum = csum_finish(udp_checksum);
+                    uint16_t udp_csum = ~udp_checksum;
+
+                    uint32_t nw_udp_checksum = udp_csum + old_udp_checksum;
+
+                    csum =  csum_finish(nw_udp_checksum);
+
+                    /* Insert new udp checksum. */
+                    v_res = _mm256_insert_epi16(v_res, csum, 13);
+                }
+            } else if (nh->ip_proto == IPPROTO_TCP) {
+                /* New TCP checksum. */
+                struct tcp_header *th = dp_packet_l4(packet);
+                uint16_t old_tcp_checksum = ~th->tcp_csum;
+
+                uint32_t tcp_checksum = old_csum + tmp_checksum;
+                tcp_checksum = csum_finish(tcp_checksum);
+                uint16_t tcp_csum = ~tcp_checksum;
+
+                uint32_t nw_tcp_checksum = tcp_csum + old_tcp_checksum;
+
+                csum =  csum_finish(nw_tcp_checksum);
+
+                th->tcp_csum = csum;
+            }
+        }
+        /* Store new IP header. */
+        _mm256_storeu_si256((void *) nh, v_res);
+    }
+}
+
 static void
 action_avx512_set_masked(struct dp_packet_batch *batch OVS_UNUSED,
                          const struct nlattr *a)
@@ -244,6 +452,8 @@  action_avx512_init(struct odp_execute_action_impl *self)
      * are identified by OVS_KEY_ATTR_*. */
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
                             action_avx512_eth_set_addrs;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
+                            action_avx512_ipv4_set_addrs;
     avx512_impl = *self;
 
     return 0;
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index e2d650779..763c8afc7 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -259,6 +259,7 @@  action_autoval_init(struct odp_execute_action_impl *self)
      * requires further processing for action type. Note that 2nd level items
      * are identified by OVS_KEY_ATTR_*. */
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_autoval_generic;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_autoval_generic;
     autoval_impl = *self;
 
     return 0;
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index db6e1ec03..8da009ce9 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -585,11 +585,6 @@  odp_execute_masked_set_action(struct dp_packet *packet,
         break;
     }
 
-    case OVS_KEY_ATTR_IPV4:
-        odp_set_ipv4(packet, nl_attr_get(a),
-                     get_mask(a, struct ovs_key_ipv4));
-        break;
-
     case OVS_KEY_ATTR_IPV6:
         odp_set_ipv6(packet, nl_attr_get(a),
                      get_mask(a, struct ovs_key_ipv6));
@@ -664,6 +659,7 @@  odp_execute_masked_set_action(struct dp_packet *packet,
     case __OVS_KEY_ATTR_MAX:
     /* The following action types are handled by the scalar implementation. */
     case OVS_KEY_ATTR_ETHERNET:
+    case OVS_KEY_ATTR_IPV4:
     default:
         OVS_NOT_REACHED();
     }
@@ -887,6 +883,18 @@  action_mod_eth(struct dp_packet_batch *batch, const struct nlattr *a)
     }
 }
 
+static void
+action_mod_ipv4(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+    a = nl_attr_get(a);
+    struct dp_packet *packet;
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        odp_set_ipv4(packet, nl_attr_get(a),
+                     get_mask(a, struct ovs_key_ipv4));
+    }
+}
+
 /* Implementation of the scalar actions impl init function. Build up the
  * array of func ptrs here.
  */
@@ -903,6 +911,7 @@  odp_action_scalar_init(struct odp_execute_action_impl *self)
      * requires further processing for action type. Note that 2nd level items
      * are identified by OVS_KEY_ATTR_*. */
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_mod_eth;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4;
     actions_active_impl = *self;
 
     return 0;