
[ovs-dev,v2,5/5] dpif-lookup: add avx512 gather implementation

Message ID 20200506130609.84792-6-harry.van.haaren@intel.com
State Superseded
Series DPCLS Subtable ISA Optimization

Commit Message

Van Haaren, Harry May 6, 2020, 1:06 p.m. UTC
This commit adds an AVX-512 dpcls lookup implementation.
It uses the AVX-512 SIMD ISA to perform multiple miniflow
operations in parallel.

To run this implementation, the "avx512f" and "bmi2" ISAs are
required. These ISA checks are performed at runtime while
probing the subtable implementation. If a CPU does not provide
both "avx512f" and "bmi2", then this code does not execute.

The avx512 code is built as a separate static library, with added
CFLAGS to enable the required ISA features. By building only this
static library with avx512 enabled, it is ensured that the main OVS
core library is *not* using avx512, and that OVS continues to run
as before on CPUs that do not support avx512.

The approach taken in this implementation is to use the
gather instruction to access the packet miniflow, allowing
any miniflow blocks to be loaded into an AVX-512 register.
This maximises the usefulness of the register, and hence this
implementation handles any subtable with up to 8 miniflow bits.
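
As a minimal sketch of the gather idea (illustrative names, not this patch's
exact code), a k-mask selects how many blocks are valid and a per-lane index
vector selects which u64 blocks of the packet miniflow are pulled into the
register:

#include <immintrin.h>
#include <stdint.h>

/* Sketch only: gather up to 8 u64 miniflow blocks into one AVX-512 register.
 * Lanes not enabled in n_valid are taken from the zero register. */
static __m512i
gather_mf_blocks(const uint64_t *pkt_blocks, __m512i v_indexes,
                 __mmask8 n_valid)
{
    const __m512i v_zeros = _mm512_setzero_si512();
    /* Scale of 8: indexes are in units of 8-byte blocks. */
    return _mm512_mask_i64gather_epi64(v_zeros, n_valid, v_indexes,
                                       pkt_blocks, 8);
}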

Note that specialization of these avx512 lookup routines
still provides performance value, as the hashing of the
resulting data is performed in scalar code, and compile-time
loop unrolling occurs when specialized to a fixed number of miniflow bits.
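
As an illustrative example of that specialization (the function name here is
hypothetical), the generic routine can be wrapped with compile-time constant
bit counts so the compiler can unroll the loops that depend on them:

/* Hypothetical specialized wrapper: the constant (5, 1) bit counts let the
 * compiler unroll per-block loops. Actual names in this series may differ. */
static uint32_t
dpcls_avx512_gather_mf_5_1(struct dpcls_subtable *subtable, uint32_t keys_map,
                           const struct netdev_flow_key *keys[],
                           struct dpcls_rule **rules)
{
    return avx512_lookup_impl(subtable, keys_map, keys, rules, 5, 1);
}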

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
---
 lib/automake.mk                        |  16 ++
 lib/dpif-netdev-lookup-avx512-gather.c | 255 +++++++++++++++++++++++++
 lib/dpif-netdev-lookup.c               |   7 +
 lib/dpif-netdev-lookup.h               |   7 +
 lib/dpif-netdev.c                      |   4 +
 5 files changed, 289 insertions(+)
 create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c

Comments

William Tu May 18, 2020, 2:57 p.m. UTC | #1
On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
> This commit adds an AVX-512 dpcls lookup implementation.
> It uses the AVX-512 SIMD ISA to perform multiple miniflow
> operations in parallel.
> 
> To run this implementation, the "avx512f" and "bmi2" ISAs are
> required. These ISA checks are performed at runtime while
> probing the subtable implementation. If a CPU does not provide
> both "avx512f" and "bmi2", then this code does not execute.
> 
> The avx512 code is built as a seperate static library, with added
> CFLAGS to enable the required ISA features. By building only this
> static library with avx512 enabled, it is ensured that the main OVS
> core library is *not* using avx512, and that OVS continues to run
> as before on CPUs that do not support avx512.
> 
> The approach taken in this implementation is to use the
> gather instruction to access the packet miniflow, allowing
> any miniflow blocks to be loaded into an AVX-512 register.
> This maximises the usefulness of the register, and hence this
> implementation handles any subtable with up to miniflow 8 bits.
> 
> Note that specialization of these avx512 lookup routines
> still provides performance value, as the hashing of the
> resulting data is performed in scalar code, and compile-time
> loop unrolling occurs when specialized to miniflow bits.
> 

Hi Harry,

I haven't tried running the code because my machine only
supports avx2. There are some minor issues such as indentation,
but I read through it with the example below and I think it's correct.

Given that you have to do a lot of preparation (ex: popcount, creating
bit_masks, broadcast, etc.) before using avx instructions, do you
have some performance numbers? I didn't see any from ovsconf 18 or 19.
Is using avx512 much better than avx2?

> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> ---
>  lib/automake.mk                        |  16 ++
>  lib/dpif-netdev-lookup-avx512-gather.c | 255 +++++++++++++++++++++++++
>  lib/dpif-netdev-lookup.c               |   7 +
>  lib/dpif-netdev-lookup.h               |   7 +
>  lib/dpif-netdev.c                      |   4 +
>  5 files changed, 289 insertions(+)
>  create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c
> 
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 19e454c4b..d8a05b384 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -8,13 +8,16 @@
>  # libopenvswitch.la is the library to link against for binaries like vswitchd.
>  # The code itself is built as two separate static libraries;
>  # - core: Core files, always compiled with distro provided CFLAGS
> +# - lookupavx512: ISA optimized routines that require CPUID checks at runtime
>  lib_LTLIBRARIES += lib/libopenvswitch.la
>  lib_LTLIBRARIES += lib/libopenvswitchcore.la
> +lib_LTLIBRARIES += lib/libopenvswitchlookupavx512.la
>  
>  # Dummy library to link against doesn't have any sources, but does
>  # depend on libopenvswitchcore static library
>  lib_libopenvswitch_la_SOURCES =
>  lib_libopenvswitch_la_LIBADD = lib/libopenvswitchcore.la
> +lib_libopenvswitch_la_LIBADD += lib/libopenvswitchlookupavx512.la
>  
>  # Dummy library continues to depend on external libraries as before
>  lib_libopenvswitch_la_LIBADD += $(SSL_LIBS)
> @@ -31,6 +34,19 @@ lib_libopenvswitch_la_LDFLAGS = \
>          $(lib_libopenvswitchcore_la_LIBS) \
>          $(AM_LDFLAGS)
>  
> +
> +# Build lookupavx512 library with extra CFLAGS enabled. This allows the
> +# compiler to use the ISA features required for the ISA optimized code-paths.
> +lib_libopenvswitchlookupavx512_la_CFLAGS = \
> +	-mavx512f \
> +	-mavx512bw \
> +	-mavx512dq \
> +	-mbmi2 \
> +	$(AM_CFLAGS)
> +lib_libopenvswitchlookupavx512_la_SOURCES = \
> +	lib/dpif-netdev-lookup-avx512-gather.c
> +
> +
>  # Build core vswitch libraries as before
>  lib_libopenvswitchcore_la_SOURCES = \
>  	lib/aes128.c \
> diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
> new file mode 100644
> index 000000000..52348041b
> --- /dev/null
> +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> @@ -0,0 +1,255 @@
> +/*
> + * Copyright (c) 2020, Intel Corporation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifdef __x86_64__
> +
> +#include <config.h>
> +
> +#include "dpif-netdev.h"
> +#include "dpif-netdev-lookup.h"
> +#include "dpif-netdev-private.h"
> +#include "cmap.h"
> +#include "flow.h"
> +#include "pvector.h"
> +#include "openvswitch/vlog.h"
> +
> +#include <immintrin.h>
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> +
> +static inline __m512i
> +_mm512_popcnt_epi64_manual(__m512i v_in)
> +{
> +    static const uint8_t pop_lut[64] = {
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +    };
> +    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
> +
> +    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
> +    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
> +    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
> +    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
> +
> +    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
> +    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
> +    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
> +
> +    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
> +}
> +
> +static inline uint64_t
> +netdev_rule_matches_key(const struct dpcls_rule *rule,
> +                        const uint32_t mf_bits_total,
> +                        const uint64_t * block_cache)
> +{
> +    ovs_assert(mf_bits_total <= 8);
> +    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
> +    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
> +    const uint32_t lane_mask = (1 << mf_bits_total) - 1;
> +
> +    /* Always load a full cache line from blocks_cache. Other loads must be
> +     * trimmed to the amount of data required for mf_bits_total blocks.
> +     */
> +    __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
> +    __m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
> +    __m512i v_key    = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
> +
> +    __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
> +    uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
> +
> +    /* Return 1 if the SIMD compare matched in all of the required lanes. */
> +    return res_mask == lane_mask;
> +}
> +

I think the below function is the most difficult one.
I wonder if there is a better way to make it easier to understand?
ex: break it into subfunctions or utility functions.
I ended up using an example from slide 2 of your slides here:
https://www.openvswitch.org/support/ovscon2019/day1/1108-next_steps_sw_datapath_hvh.pdf
and the API document here
https://software.intel.com/sites/landingpage/IntrinsicsGuide/

> +static inline uint32_t ALWAYS_INLINE
> +avx512_lookup_impl(struct dpcls_subtable *subtable,
> +                   uint32_t keys_map,
> +                   const struct netdev_flow_key *keys[],
> +                   struct dpcls_rule **rules,
> +                   const uint32_t bit_count_u0,
> +                   const uint32_t bit_count_u1)
> +{
> +    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
> +    int i;
> +    uint32_t hashes[NETDEV_MAX_BURST];
> +    const uint32_t n_pkts = __builtin_popcountll(keys_map);
> +    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
> +
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[NETDEV_MAX_BURST * 8];
> +
> +    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> +    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> +    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
> +    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
> +
> +    /* Load subtable blocks for masking later */
> +    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
> +    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> +
> +    /* Load pre-created subtable masks for each block in subtable */
> +    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
> +    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
> +                                                        subtable->mf_masks);
> +
> +    ULLONG_FOR_EACH_1 (i, keys_map) {
> +        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
> +        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> +
> +        /* Pre-create register with *PER PACKET* u0 offset */
> +        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
> +        const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
> +                                                                pkt_mf_u0_pop);
> +
> +        /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
> +        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
> +        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
> +                                         keys[i]->mf.map.bits[1]);
> +
> +        /* Bitmask by pre-created masks */
> +        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
> +
> +        /* Manual AVX512 popcount for u64 lanes */
> +        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> +
> +        /* Offset popcounts for u1 with pre-created offset register */
> +        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
> +
> +        /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
> +        const __m512i v_zeros = _mm512_setzero_si512();
> +        const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
> +        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
> +                                 bit_count_total_mask, v_indexes, pkt_data, 8);
indent

> +
> +        /* Zero out bits that pkt doesn't have:
> +         * - 2x pext() to extract bits from packet miniflow as needed by TBL
> +         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
> +         */
> +         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
> +         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
> +         uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
indentation: remove one space
> +
> +        /* Mask blocks using AND with subtable blocks, use k-mask to zero
> +         * lanes as required for this packet.
> +         */
> +        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
> +                                                v_all_blocks, v_tbl_blocks);
> +
> +        /* Store to blocks cache, full cache line aligned */
> +        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
> +    }
> +
> +    /* Hash the now linearized blocks of packet metadata. */
> +    ULLONG_FOR_EACH_1 (i, keys_map) {
> +        uint64_t *block_ptr = &block_cache[i * 8];
> +        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
> +        hashes[i] = hash_finish(hash, bit_count_total * 8);
> +    }
> +
> +    /* Lookup: this returns a bitmask of packets where the hash table had
> +     * an entry for the given hash key. Presence of a hash key does not
> +     * guarantee matching the key, as there can be hash collisions.
> +     */
> +    uint32_t found_map;
> +    const struct cmap_node *nodes[NETDEV_MAX_BURST];
> +    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
> +
> +    /* Verify that packet actually matched rule. If not found, a hash
> +     * collision has taken place, so continue searching with the next node.
> +     */
> +    ULLONG_FOR_EACH_1 (i, found_map) {
> +        struct dpcls_rule *rule;
> +
> +        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> +            const uint32_t cidx = i * 8;
> +            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
> +                                                     &block_cache[cidx]);
> +            if (OVS_LIKELY(match)) {
> +                rules[i] = rule;
> +                subtable->hit_cnt++;
> +                goto next;
> +            }
> +        }
> +
> +        /* None of the found rules was a match.  Clear the i-th bit to
> +         * search for this key in the next subtable. */
> +        ULLONG_SET0(found_map, i);
> +    next:
> +        ;                     /* Keep Sparse happy. */
> +    }
> +
> +    return found_map;
> +}

If someone is interested, the example below, together with the slides,
helps in understanding the above function.

diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
index 52348041bd00..f84a95423cf8 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -93,56 +93,77 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 
     OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[NETDEV_MAX_BURST * 8];
 
-    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
-    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
-    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
-    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
+    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0]; //1000,0000
+    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1]; //0100,0000
+    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0); //1
+    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1); //1
+    // bit_count_total = 2
 
     /* Load subtable blocks for masking later */
-    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
-    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
+    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);//point to ipv4 mask
+    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);    //point to ipv4 mask
 
     /* Load pre-created subtable masks for each block in subtable */
-    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
-    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
+    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1; // (1 << 2) - 1 = 0x3
+    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask /* 0x3 */,
                                                         subtable->mf_masks);
+    // subtable->mf_masks[0] = 0b01111111
+    // subtable->mf_masks[1] = 0b00111111
+    // v_mf_masks = [0,0,0,0,0,0, 0b00111111, 0b01111111]
 
-    ULLONG_FOR_EACH_1 (i, keys_map) {
-        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
-        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
+    ULLONG_FOR_EACH_1 (i, keys_map) {// for each packet in the batch
+        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0]; //0b1000,0100
+        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits); //2
 
         /* Pre-create register with *PER PACKET* u0 offset */
-        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
+        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0); //(0xff << 1) = 0xfe
         const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
                                                                 pkt_mf_u0_pop);
+        //v_idx_u0_offset = [2,2,2,2,2,2,2,0]

         /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
-        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
-        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
-                                         keys[i]->mf.map.bits[1]);
+        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);// [0b10000100,0b10000100,0b10000100, ...]
+
+        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask /*0xfe*/,
+                                         keys[i]->mf.map.bits[1] /* 0b01100000 */);
+        //0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000,0b10000100
 
-        /* Bitmask by pre-created masks */
+
+        /* Bitmask by pre-created masks. */
         __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
+        // v_masks = [0,0,0,0,0,0, 0b00100000,0b00000100]
 
         /* Manual AVX512 popcount for u64 lanes */
         __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+        // v_popcnts = [0,0,0,0,0,0,1,1]
 
         /* Offset popcounts for u1 with pre-created offset register */
         __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
+        // v_indexes = [0,0,0,0,0,0,3,1]
 
         /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
         const __m512i v_zeros = _mm512_setzero_si512();
         const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
+        // pkt_data = ipv4_src, ipv4_dst, mac_src, vlan_tci
+
         __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
-                                 bit_count_total_mask, v_indexes, pkt_data, 8);
+                                 bit_count_total_mask /* 0x3 */,
+                                 v_indexes, pkt_data, 8);
+        //v_all_blocks: use v_index[0]=1*8 , v_index[1]=3*8 to gather data
+        //v_all_blocks = [0,0,0,0,0,0, ipv4_dst, vlan_tci]
 
         /* Zero out bits that pkt doesn't have:
          * - 2x pext() to extract bits from packet miniflow as needed by TBL
          * - Shift u1 over by bit_count of u0, OR to create zero bitmask
          */
-         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
-         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
+         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0] /* 0b1000,0100*/,
+                                         tbl_u0 /* 0b1000,0000 */);
+         // u0_to_zero = 0b00000001
+         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1] /* 0b0110, 0000*/,
+                                         tbl_u1 /* 0b0100,0000 */);
+         // u1_to_zero = 0b00000001
          uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
+        // 0b00000011
 
         /* Mask blocks using AND with subtable blocks, use k-mask to zero
          * where lanes as required for this packet.

---
Pretty cool piece of code. Thanks!

William
Van Haaren, Harry May 18, 2020, 4:12 p.m. UTC | #2
> -----Original Message-----
> From: William Tu <u9012063@gmail.com>
> Sent: Monday, May 18, 2020 3:58 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> implementation
> 
> On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
> > This commit adds an AVX-512 dpcls lookup implementation.
> > It uses the AVX-512 SIMD ISA to perform multiple miniflow
> > operations in parallel.
> >
> > To run this implementation, the "avx512f" and "bmi2" ISAs are
> > required. These ISA checks are performed at runtime while
> > probing the subtable implementation. If a CPU does not provide
> > both "avx512f" and "bmi2", then this code does not execute.
> >
> > The avx512 code is built as a seperate static library, with added
> > CFLAGS to enable the required ISA features. By building only this
> > static library with avx512 enabled, it is ensured that the main OVS
> > core library is *not* using avx512, and that OVS continues to run
> > as before on CPUs that do not support avx512.
> >
> > The approach taken in this implementation is to use the
> > gather instruction to access the packet miniflow, allowing
> > any miniflow blocks to be loaded into an AVX-512 register.
> > This maximises the usefulness of the register, and hence this
> > implementation handles any subtable with up to miniflow 8 bits.
> >
> > Note that specialization of these avx512 lookup routines
> > still provides performance value, as the hashing of the
> > resulting data is performed in scalar code, and compile-time
> > loop unrolling occurs when specialized to miniflow bits.
> >
> 
> Hi Harry,
> 
> I haven't tried running the code due to my machine only
> support avx2. There are some minor issues such as indentation.
> But I read through it with example below and I think it's correct.

Thanks for the review! I'll post replies inline for context.

Note, the Software Development Emulator (SDE) tool enables emulation of the AVX512 ISA.
Full details are provided at the link below; using this would enable running the AVX512 DPCLS
implementation itself, should you want to test it locally:
https://software.intel.com/content/www/us/en/develop/articles/intel-software-development-emulator.html


> Given that you have to do a lot of preparation (ex: popcount, creating
> bit_masks, broadcast, ... etc) before using avx instructions, do you
> have some performance number? I didn't see any from ovsconf 18 or 19.
> Is using avx512 much better than avx2?

Correct, there is some "pre-work" to do before the miniflow manipulation itself.
Note that much of the more complex work (e.g. miniflow bitmask generation for the subtable)
is done at subtable instantiation time, instead of on the critical path. Also the popcount
lookup table is "static const", which will turn into a single AVX512 load at runtime.
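
As a rough sketch of that instantiation-time work (a hedged illustration, the
actual helper in the series may differ), the per-block masks can be generated
by walking the set bits of the subtable miniflow map and storing, for each
block, the mask of all bits below that block's bit:

#include <stdint.h>

/* Sketch only: one "bits below" mask per subtable miniflow block, so that
 * popcount(pkt_bits & mask) later yields that block's index into the packet
 * miniflow data. Helper name is illustrative. */
static void
generate_mf_masks(uint64_t map_bits, uint64_t *masks_out, uint32_t n_bits)
{
    for (uint32_t i = 0; i < n_bits; i++) {
        uint64_t lowest = map_bits & -map_bits;  /* isolate lowest set bit */
        masks_out[i] = lowest - 1;               /* all bits below it */
        map_bits &= map_bits - 1;                /* clear that bit */
    }
}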

AVX512 provides some very useful features, which are used throughout the code
below. In particular, the AVX512 "k-mask" feature allows the developer to switch-off
a lane in the SIMD register (this is sometimes referred to as a predication mask).
Using these "k-masks" avoids requiring more instructions later to "merge" results
back together (as SSE or AVX2 code would have to do).
Example : "mask_set1_epi64" allows setting a specific value into the "lanes" as
given by the k-mask, and results in an AVX512 register with those contents.
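
A standalone example of that (not code from the patch): with a k-mask of 0xFC
the lower two lanes stay zero and the remaining six lanes receive the
broadcast value, which is the same pattern used for the per-packet u1 index
offset in the patch:

#include <immintrin.h>
#include <stdint.h>

/* Standalone illustration: k-mask 0xFC zeroes lanes 0-1 and broadcasts
 * 'offset' into lanes 2-7, i.e. [off,off,off,off,off,off,0,0] in the
 * high-to-low notation used elsewhere in this thread. */
static __m512i
example_u1_offset(uint64_t offset)
{
    return _mm512_maskz_set1_epi64(0xFC, offset);
}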

There are also new instructions in AVX512 which provide even more powerful functionality; for example,
the "AVX512VPOPCNTDQ" CPUID feature provides a vectorized popcount which can be used instead of
the "_mm512_popcnt_epi64_manual()" helper function. Enabling of the AVX512 VPOPCNT instruction
is planned in future patches to OVS. Details of the instruction are available on the intrinsics guide:
https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_popcnt_epi64&expand=4368
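
A hedged sketch of what that could look like (the compile-time guard here is
illustrative; the eventual OVS patch may select between implementations
differently):

#include <immintrin.h>

/* Sketch only: on CPUs (and builds) with AVX512VPOPCNTDQ, the manual helper
 * can be replaced by the single-instruction intrinsic. */
static inline __m512i
popcnt_epi64(__m512i v)
{
#ifdef __AVX512VPOPCNTDQ__
    return _mm512_popcnt_epi64(v);          /* vpopcntq */
#else
    return _mm512_popcnt_epi64_manual(v);   /* shuffle-based helper above */
#endif
}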

Finally, although the code can seem a bit verbose, most _mm512_xxx_yyy() intrinsics result in a single
instruction. This means that although the code looks "big", the resulting instruction stream is often
extremely densely packed. Combine that with the fact that the implementation is focused on using instructions
to deliver the maximum amount of required compute without any waste, and it can result in very high performance :)

Regarding performance numbers, unfortunately I don't have official numbers to state here.
For an approximation (caveats such as "depends on exact usage" etc. apply), for about the same packet
rate, the CPU cycles spent in DPCLS are roughly halved in the AVX512 version compared to the scalar version.

<snip lots of patch contents>

> I think the below function is the most difficult one.
> I wonder if there is a better way to make it easier to understand?
> ex: break it into subfunctions or utility functions

My experience has been that breaking it up into smaller snippets causes me to
lose sight of the big picture. Code like the below is typically not written in one pass, but
through more of an iterative process. Seeing the desired register contents is valuable,
and knowing the context and state of nearby registers can often suggest
new optimizations or strength reduction of existing code.

Clearly commenting the reason for the compute, and sometimes how it is computed,
is the best-known method for writing maintainable SIMD code. This method is also used
in DPDK for its PMDs, for example the i40e driver SIMD rx codepath:
http://git.dpdk.org/dpdk/tree/drivers/net/i40e/i40e_rxtx_vec_avx2.c#n221 


> I end up using an example from your slides 2 here:
> https://www.openvswitch.org/support/ovscon2019/day1/1108-
> next_steps_sw_datapath_hvh.pdf
> and the API document here
> https://software.intel.com/sites/landingpage/IntrinsicsGuide/

Aha, you've found the colorful instruction set architecture guide :)
There is another which presents the data movement more graphically.
I'll mention it, but advise using the IntrinsicsGuide linked above, as it
is the official resource and is maintained and kept up to date. The graphical
webpage is here: https://www.officedaytime.com/simd512e/simd.html



> > +static inline uint32_t ALWAYS_INLINE
> > +avx512_lookup_impl(struct dpcls_subtable *subtable,
> > +                   uint32_t keys_map,
> > +                   const struct netdev_flow_key *keys[],
> > +                   struct dpcls_rule **rules,
> > +                   const uint32_t bit_count_u0,
> > +                   const uint32_t bit_count_u1)
> > +{
> > +    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
> > +    int i;
> > +    uint32_t hashes[NETDEV_MAX_BURST];
> > +    const uint32_t n_pkts = __builtin_popcountll(keys_map);
> > +    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
> > +
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t
> block_cache[NETDEV_MAX_BURST * 8];
> > +
> > +    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> > +    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> > +    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
> > +    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
> > +
> > +    /* Load subtable blocks for masking later */
> > +    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
> > +    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> > +
> > +    /* Load pre-created subtable masks for each block in subtable */
> > +    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
> > +    const __m512i v_mf_masks =
> _mm512_maskz_loadu_epi64(bit_count_total_mask,
> > +                                                        subtable->mf_masks);
> > +
> > +    ULLONG_FOR_EACH_1 (i, keys_map) {
> > +        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
> > +        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> > +
> > +        /* Pre-create register with *PER PACKET* u0 offset */
> > +        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
> > +        const __m512i v_idx_u0_offset =
> _mm512_maskz_set1_epi64(u1_bcast_mask,
> > +                                                                pkt_mf_u0_pop);
> > +
> > +        /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
> > +        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
> > +        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
> > +                                         keys[i]->mf.map.bits[1]);
> > +
> > +        /* Bitmask by pre-created masks */
> > +        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
> > +
> > +        /* Manual AVX512 popcount for u64 lanes */
> > +        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> > +
> > +        /* Offset popcounts for u1 with pre-created offset register */
> > +        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
> > +
> > +        /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
> > +        const __m512i v_zeros = _mm512_setzero_si512();
> > +        const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
> > +        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
> > +                                 bit_count_total_mask, v_indexes, pkt_data, 8);
> indent

Thanks!

> > +        /* Zero out bits that pkt doesn't have:
> > +         * - 2x pext() to extract bits from packet miniflow as needed by TBL
> > +         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
> > +         */
> > +         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
> > +         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
> > +         uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
> indentation: remove one space

Will fix.


> > +        /* Mask blocks using AND with subtable blocks, use k-mask to zero
> > +         * where lanes as required for this packet.
> > +         */
> > +        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
> > +                                                v_all_blocks, v_tbl_blocks);
> > +
> > +        /* Store to blocks cache, full cache line aligned */
> > +        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
> > +    }
> > +
> > +    /* Hash the now linearized blocks of packet metadata. */
> > +    ULLONG_FOR_EACH_1 (i, keys_map) {
> > +        uint64_t *block_ptr = &block_cache[i * 8];
> > +        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
> > +        hashes[i] = hash_finish(hash, bit_count_total * 8);
> > +    }
> > +
> > +    /* Lookup: this returns a bitmask of packets where the hash table had
> > +     * an entry for the given hash key. Presence of a hash key does not
> > +     * guarantee matching the key, as there can be hash collisions.
> > +     */
> > +    uint32_t found_map;
> > +    const struct cmap_node *nodes[NETDEV_MAX_BURST];
> > +    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes,
> nodes);
> > +
> > +    /* Verify that packet actually matched rule. If not found, a hash
> > +     * collision has taken place, so continue searching with the next node.
> > +     */
> > +    ULLONG_FOR_EACH_1 (i, found_map) {
> > +        struct dpcls_rule *rule;
> > +
> > +        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> > +            const uint32_t cidx = i * 8;
> > +            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
> > +                                                     &block_cache[cidx]);
> > +            if (OVS_LIKELY(match)) {
> > +                rules[i] = rule;
> > +                subtable->hit_cnt++;
> > +                goto next;
> > +            }
> > +        }
> > +
> > +        /* None of the found rules was a match.  Clear the i-th bit to
> > +         * search for this key in the next subtable. */
> > +        ULLONG_SET0(found_map, i);
> > +    next:
> > +        ;                     /* Keep Sparse happy. */
> > +    }
> > +
> > +    return found_map;
> > +}
> 
> If someone is interested, the example below with the slides
> help understand the above function.

Wow - nice work! Impressive to see the code taken apart and reduced
to its logical behavior like this, interesting to see.


> diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-
> avx512-gather.c
> index 52348041bd00..f84a95423cf8 100644
> --- a/lib/dpif-netdev-lookup-avx512-gather.c
> +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> @@ -93,56 +93,77 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
> 
>      OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t
> block_cache[NETDEV_MAX_BURST * 8];
> 
> -    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> -    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> -    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
> -    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
> +    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0]; //1000,0000
> +    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1]; //0100,0000
> +    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0); //1
> +    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1); //1
> +    // bit_count_total = 2
> 
>      /* Load subtable blocks for masking later */
> -    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
> -    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> +    const uint64_t *tbl_blocks = miniflow_get_values(&subtable-
> >mask.mf);//point to ipv4 mask
> +    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> //porint to ipv4 mask
> 
>      /* Load pre-created subtable masks for each block in subtable */
> -    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
> -    const __m512i v_mf_masks =
> _mm512_maskz_loadu_epi64(bit_count_total_mask,
> +    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1; // (1 <<
> 2) - 1 = 0x3
> +    const __m512i v_mf_masks =
> _mm512_maskz_loadu_epi64(bit_count_total_mask /* 0x3 */,
>                                                          subtable->mf_masks);
> +    // subtable->mf_masks[0] = 0b01111111
> +    // subtable->mf_masks[1] = 0b00111111
> +    // v_mf_masks = [0,0,0,0,0,0, 0b00111111, 0b01111111]
> 
> -    ULLONG_FOR_EACH_1 (i, keys_map) {
> -        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
> -        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> +    ULLONG_FOR_EACH_1 (i, keys_map) {// for each packets in batch
> +        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0]; //0b1000,0100
> +        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> //2
> 
>          /* Pre-create register with *PER PACKET* u0 offset */
> -        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
> +        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0); //(0xff
> << 1) = 0xfe
>          const __m512i v_idx_u0_offset =
> _mm512_maskz_set1_epi64(u1_bcast_mask,
>                                                                  pkt_mf_u0_pop);
> +        //v_idx_u0_offset = [2,2,2,2,2,2,2,0]
> 
>          /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
> -        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
> -        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
> -                                         keys[i]->mf.map.bits[1]);
> +        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);//
> [0b10000100,0b10000100,0b10000100, ...]
> +
> +        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask
> /*0xfe*/,
> +                                         keys[i]->mf.map.bits[1] /* 0b01100000 */);
> +        //0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000,
> 0b01100000, 0b01100000,0b10000100
> 
> -        /* Bitmask by pre-created masks */
> +
> +        /* Bitmask by pre-created masks. */
>          __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
> +        // v_masks = [0,0,0,0,0,0, 0b00100000,0b00000100]
> 
>          /* Manual AVX512 popcount for u64 lanes */
>          __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> +        // v_popcnts = [0,0,0,0,0,0,1,1]
> 
>          /* Offset popcounts for u1 with pre-created offset register */
>          __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
> +        // v_indexes = [0,0,0,0,0,0,3,1]
> 
>          /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
>          const __m512i v_zeros = _mm512_setzero_si512();
>          const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
> +        // pkt_data = ipv4_src, ipv4_dst, mac_src, vlan_tci
> +
>          __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
> -                                 bit_count_total_mask, v_indexes, pkt_data, 8);
> +                                 bit_count_total_mask /* 0x3 */,
> +                                 v_indexes, pkt_data, 8);
> +        //v_all_blocks: use v_index[0]=1*8 , v_index[1]=3*8 to gather data
> +        //v_all_blocks = [0,0,0,0,0,0, ipv4_dst, vlan_tci]
> 
>          /* Zero out bits that pkt doesn't have:
>           * - 2x pext() to extract bits from packet miniflow as needed by TBL
>           * - Shift u1 over by bit_count of u0, OR to create zero bitmask
>           */
> -         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
> -         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
> +         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0] /*
> 0b1000,0100*/,
> +                                         tbl_u0 /* 0b1000,0000 */);
> +         // u0_to_zero = 0b00000001
> +         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1] /* 0b0110,
> 0000*/,
> +                                         tbl_u1 /* 0b0100,0000 */);
> +         // u1_to_zero = 0b00000001
>           uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
> +        // 0b00000011
> 
>          /* Mask blocks using AND with subtable blocks, use k-mask to zero
>           * where lanes as required for this packet.
> 
> ---
> Pretty cool piece of code. Thanks!
> 
> William

Pretty cool review. Thanks!

Harry
William Tu May 20, 2020, 12:11 a.m. UTC | #3
On Mon, May 18, 2020 at 9:12 AM Van Haaren, Harry
<harry.van.haaren@intel.com> wrote:
>
> > -----Original Message-----
> > From: William Tu <u9012063@gmail.com>
> > Sent: Monday, May 18, 2020 3:58 PM
> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> > implementation
> >
> > On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
> > > This commit adds an AVX-512 dpcls lookup implementation.
> > > It uses the AVX-512 SIMD ISA to perform multiple miniflow
> > > operations in parallel.
> > >
> > > To run this implementation, the "avx512f" and "bmi2" ISAs are
> > > required. These ISA checks are performed at runtime while
> > > probing the subtable implementation. If a CPU does not provide
> > > both "avx512f" and "bmi2", then this code does not execute.
> > >
> > > The avx512 code is built as a seperate static library, with added
> > > CFLAGS to enable the required ISA features. By building only this
> > > static library with avx512 enabled, it is ensured that the main OVS
> > > core library is *not* using avx512, and that OVS continues to run
> > > as before on CPUs that do not support avx512.
> > >
> > > The approach taken in this implementation is to use the
> > > gather instruction to access the packet miniflow, allowing
> > > any miniflow blocks to be loaded into an AVX-512 register.
> > > This maximises the usefulness of the register, and hence this
> > > implementation handles any subtable with up to miniflow 8 bits.
> > >
> > > Note that specialization of these avx512 lookup routines
> > > still provides performance value, as the hashing of the
> > > resulting data is performed in scalar code, and compile-time
> > > loop unrolling occurs when specialized to miniflow bits.
> > >
> >
> > Hi Harry,
> >
> > I haven't tried running the code due to my machine only
> > support avx2. There are some minor issues such as indentation.
> > But I read through it with example below and I think it's correct.
>
> Thanks for the review! I'll post replies inline for context.
>
> Note, the Software Development Emulator (SDE) tool enables emulation of AVX512 ISA.
> Full details provided at the link below, using this would enable running AVX512 DPCLS
> implementation itself, should you want to test it locally:
> https://software.intel.com/content/www/us/en/develop/articles/intel-software-development-emulator.html
>
>
> > Given that you have to do a lot of preparation (ex: popcount, creating
> > bit_masks, broadcast, ... etc) before using avx instructions, do you
> > have some performance number? I didn't see any from ovsconf 18 or 19.
> > Is using avx512 much better than avx2?
>
> Correct there is some "pre-work" to do before the miniflow manipulation itself.
> Note that much of the more complex work (e.g. miniflow bitmask generation for the subtable)
> is done at subtable instantiation time, instead of on the critical path. Also the popcount
> lookup table is "static const", which will turn into a single AVX512 load at runtime.
>
> AVX512 provides some very useful features, which are used throughout the code
> below. In particular, the AVX512 "k-mask" feature allows the developer to switch-off
> a lane in the SIMD register (this is sometimes referred to as a predication mask).
> Using these "k-masks" solves requiring more instructions later to "merge" results
> back together (as SSE or AVX2 code would have to do).
> Example : "mask_set1_epi64" allows setting a specific value into the "lanes" as
> given by the k-mask, and results in an AVX512 register with those contents.
>
> There are also new instructions in AVX512 which provide even more powerful ISA, for example
> the "AVX512VPOPCNTDQ" CPUID provides a vectorized popcount which can be used instead of
> the "_mm512_popcnt_epi64_manual()" helper function. Enabling of the AVX512 VPOPCNT instruction
> is planned in future patches to OVS. Details of the instruction are available on the intrinsics guide:
> https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_popcnt_epi64&expand=4368
>
> Finally, although the code can seem a bit verbose, most _mm512_xxx_yyy() intrinsics result in a single
> instruction. This means that although the code looks "big", however the resulting instruction stream often
> extremely densely packed. Combine that with the fact that the implementation is focused on using instructions
> to deliver the maximum amount of required compute without any waste, it can result in very high performance :)
>
> Regarding performance numbers, unfortunately I don't have official numbers to state here.
> For an approximation (caveats such as "depends on exact usage" etc apply), for about the same packet
> rate, the CPU cycles spent in DPCLS is about halved in the AVX512 version, compared to the scalar version.
>
> <snip lots of patch contents>
>
> > I think the below function is the most difficult one.
> > I wonder if there is a better way to make it easier to understand?
> > ex: break it into subfunctions or utility functions
>
> My experience has been that breaking it up into smaller snippets causes me to
> lose sight of the big picture. Code like below is typically not written in one pass but
> more of an iterative process. Seeing the desired register-contents is valuable,
> and knowing the context and state of registers in near proximity to it can often provide
> new optimizations or strength reduction of existing code.
>
> Clearly commenting the reason for the compute, and sometimes how it is computed
> is the best-known-method for writing maintainable SIMD code. This method is also used
> in DPDK for its PMDs, for example the i40e driver SIMD rx codepath:
> http://git.dpdk.org/dpdk/tree/drivers/net/i40e/i40e_rxtx_vec_avx2.c#n221
>
>
> > I end up using an example from your slides 2 here:
> > https://www.openvswitch.org/support/ovscon2019/day1/1108-
> > next_steps_sw_datapath_hvh.pdf
> > and the API document here
> > https://software.intel.com/sites/landingpage/IntrinsicsGuide/
>
> Aha, you've found the colorful instruction set architecture guide :)
> There is another which presents the data-movement more graphically,
> I'll mention it but advise using the IntrinsicsGuide as linked above as it
> is the official resource, and maintained and up-to-datedate. The graphical
> webpage is here: https://www.officedaytime.com/simd512e/simd.html
>
>
>
> > > +static inline uint32_t ALWAYS_INLINE
> > > +avx512_lookup_impl(struct dpcls_subtable *subtable,
> > > +                   uint32_t keys_map,
> > > +                   const struct netdev_flow_key *keys[],
> > > +                   struct dpcls_rule **rules,
> > > +                   const uint32_t bit_count_u0,
> > > +                   const uint32_t bit_count_u1)
> > > +{
> > > +    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
> > > +    int i;
> > > +    uint32_t hashes[NETDEV_MAX_BURST];
> > > +    const uint32_t n_pkts = __builtin_popcountll(keys_map);
> > > +    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
> > > +
> > > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t
> > block_cache[NETDEV_MAX_BURST * 8];
> > > +
> > > +    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> > > +    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> > > +    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
> > > +    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
> > > +
> > > +    /* Load subtable blocks for masking later */
> > > +    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
> > > +    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> > > +
> > > +    /* Load pre-created subtable masks for each block in subtable */
> > > +    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
> > > +    const __m512i v_mf_masks =
> > _mm512_maskz_loadu_epi64(bit_count_total_mask,
> > > +                                                        subtable->mf_masks);
> > > +
> > > +    ULLONG_FOR_EACH_1 (i, keys_map) {
> > > +        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
> > > +        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> > > +
> > > +        /* Pre-create register with *PER PACKET* u0 offset */
> > > +        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
> > > +        const __m512i v_idx_u0_offset =
> > _mm512_maskz_set1_epi64(u1_bcast_mask,
> > > +                                                                pkt_mf_u0_pop);
> > > +
> > > +        /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
> > > +        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
> > > +        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
> > > +                                         keys[i]->mf.map.bits[1]);
> > > +
> > > +        /* Bitmask by pre-created masks */
> > > +        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
> > > +
> > > +        /* Manual AVX512 popcount for u64 lanes */
> > > +        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> > > +
> > > +        /* Offset popcounts for u1 with pre-created offset register */
> > > +        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
> > > +
> > > +        /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
> > > +        const __m512i v_zeros = _mm512_setzero_si512();
> > > +        const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
> > > +        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
> > > +                                 bit_count_total_mask, v_indexes, pkt_data, 8);
> > indent
>
> Thanks!
>
> > > +        /* Zero out bits that pkt doesn't have:
> > > +         * - 2x pext() to extract bits from packet miniflow as needed by TBL
> > > +         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
> > > +         */
> > > +         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
> > > +         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
> > > +         uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
> > indentation: remove one space
>
> Will fix.
>
>
> > > +        /* Mask blocks using AND with subtable blocks, use k-mask to zero
> > > +         * where lanes as required for this packet.
> > > +         */
> > > +        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
> > > +                                                v_all_blocks, v_tbl_blocks);
> > > +
> > > +        /* Store to blocks cache, full cache line aligned */
> > > +        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
> > > +    }
> > > +
> > > +    /* Hash the now linearized blocks of packet metadata. */
> > > +    ULLONG_FOR_EACH_1 (i, keys_map) {
> > > +        uint64_t *block_ptr = &block_cache[i * 8];
> > > +        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
> > > +        hashes[i] = hash_finish(hash, bit_count_total * 8);
> > > +    }
> > > +
> > > +    /* Lookup: this returns a bitmask of packets where the hash table had
> > > +     * an entry for the given hash key. Presence of a hash key does not
> > > +     * guarantee matching the key, as there can be hash collisions.
> > > +     */
> > > +    uint32_t found_map;
> > > +    const struct cmap_node *nodes[NETDEV_MAX_BURST];
> > > +    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes,
> > nodes);
> > > +
> > > +    /* Verify that packet actually matched rule. If not found, a hash
> > > +     * collision has taken place, so continue searching with the next node.
> > > +     */
> > > +    ULLONG_FOR_EACH_1 (i, found_map) {
> > > +        struct dpcls_rule *rule;
> > > +
> > > +        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> > > +            const uint32_t cidx = i * 8;
> > > +            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
> > > +                                                     &block_cache[cidx]);
> > > +            if (OVS_LIKELY(match)) {
> > > +                rules[i] = rule;
> > > +                subtable->hit_cnt++;
> > > +                goto next;
> > > +            }
> > > +        }
> > > +
> > > +        /* None of the found rules was a match.  Clear the i-th bit to
> > > +         * search for this key in the next subtable. */
> > > +        ULLONG_SET0(found_map, i);
> > > +    next:
> > > +        ;                     /* Keep Sparse happy. */
> > > +    }
> > > +
> > > +    return found_map;
> > > +}
> >
> > If someone is interested, the example below with the slides
> > help understand the above function.
>
> Wow - nice work! Impressive to see the code taken apart and reduced
> to its logical behavior like this, interesting to see.
>
>
> > diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-
> > avx512-gather.c
> > index 52348041bd00..f84a95423cf8 100644
> > --- a/lib/dpif-netdev-lookup-avx512-gather.c
> > +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> > @@ -93,56 +93,77 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
> >
> >      OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t
> > block_cache[NETDEV_MAX_BURST * 8];
> >
> > -    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> > -    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> > -    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
> > -    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
> > +    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0]; //1000,0000
> > +    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1]; //0100,0000
> > +    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0); //1
> > +    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1); //1
> > +    // bit_count_total = 2
> >
> >      /* Load subtable blocks for masking later */
> > -    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
> > -    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> > +    const uint64_t *tbl_blocks = miniflow_get_values(&subtable-
> > >mask.mf);//point to ipv4 mask
> > +    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> > //porint to ipv4 mask
> >
> >      /* Load pre-created subtable masks for each block in subtable */
> > -    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
> > -    const __m512i v_mf_masks =
> > _mm512_maskz_loadu_epi64(bit_count_total_mask,
> > +    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1; // (1 <<
> > 2) - 1 = 0x3
> > +    const __m512i v_mf_masks =
> > _mm512_maskz_loadu_epi64(bit_count_total_mask /* 0x3 */,
> >                                                          subtable->mf_masks);
> > +    // subtable->mf_masks[0] = 0b01111111
> > +    // subtable->mf_masks[1] = 0b00111111
> > +    // v_mf_masks = [0,0,0,0,0,0, 0b00111111, 0b01111111]
> >
> > -    ULLONG_FOR_EACH_1 (i, keys_map) {
> > -        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
> > -        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> > +    ULLONG_FOR_EACH_1 (i, keys_map) {// for each packets in batch
> > +        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0]; //0b1000,0100
> > +        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> > //2
> >
> >          /* Pre-create register with *PER PACKET* u0 offset */
> > -        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
> > +        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0); //(0xff
> > << 1) = 0xfe
> >          const __m512i v_idx_u0_offset =
> > _mm512_maskz_set1_epi64(u1_bcast_mask,
> >                                                                  pkt_mf_u0_pop);
> > +        //v_idx_u0_offset = [2,2,2,2,2,2,2,0]
> >
> >          /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
> > -        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
> > -        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
> > -                                         keys[i]->mf.map.bits[1]);
> > +        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);//
> > [0b10000100,0b10000100,0b10000100, ...]
> > +
> > +        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask
> > /*0xfe*/,
> > +                                         keys[i]->mf.map.bits[1] /* 0b01100000 */);
> > +        //0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000,
> > 0b01100000, 0b01100000,0b10000100
> >
> > -        /* Bitmask by pre-created masks */
> > +
> > +        /* Bitmask by pre-created masks. */
> >          __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
> > +        // v_masks = [0,0,0,0,0,0, 0b00100000,0b00000100]
> >
> >          /* Manual AVX512 popcount for u64 lanes */
> >          __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> > +        // v_popcnts = [0,0,0,0,0,0,1,1]
> >
> >          /* Offset popcounts for u1 with pre-created offset register */
> >          __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
> > +        // v_indexes = [0,0,0,0,0,0,3,1]
> >
> >          /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
> >          const __m512i v_zeros = _mm512_setzero_si512();
> >          const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
> > +        // pkt_data = ipv4_src, ipv4_dst, mac_src, vlan_tci
> > +
> >          __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
> > -                                 bit_count_total_mask, v_indexes, pkt_data, 8);
> > +                                 bit_count_total_mask /* 0x3 */,
> > +                                 v_indexes, pkt_data, 8);
> > +        //v_all_blocks: use v_index[0]=1*8 , v_index[1]=3*8 to gather data
> > +        //v_all_blocks = [0,0,0,0,0,0, ipv4_dst, vlan_tci]
> >
> >          /* Zero out bits that pkt doesn't have:
> >           * - 2x pext() to extract bits from packet miniflow as needed by TBL
> >           * - Shift u1 over by bit_count of u0, OR to create zero bitmask
> >           */
> > -         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
> > -         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
> > +         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0] /* 0b1000,0100 */,
> > +                                         tbl_u0 /* 0b1000,0000 */);
> > +         // u0_to_zero = 0b00000001
> > +         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1] /* 0b0110,0000 */,
> > +                                         tbl_u1 /* 0b0100,0000 */);
> > +         // u1_to_zero = 0b00000001
> >           uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
> > +        // 0b00000011
> >
> >          /* Mask blocks using AND with subtable blocks, use k-mask to zero
> >           * where lanes as required for this packet.
> >
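To summarize the gather logic above in scalar terms (my understanding, with illustrative
names and a single 64-bit map unit - not the exact OVS code): the index of a block in the
packet's packed miniflow values is the popcount of the packet map bits below that block's
bit, and blocks the subtable expects but the packet doesn't carry end up zeroed.

/* Scalar sketch of the per-packet block gather; illustrative only. */
#include <stdint.h>

static void
gather_blocks_scalar(uint64_t tbl_bits,          /* subtable mask map         */
                     uint64_t pkt_bits,          /* packet miniflow map       */
                     const uint64_t *pkt_data,   /* packed packet blocks      */
                     uint64_t *blocks_out)       /* one slot per subtable bit */
{
    int out = 0;
    for (int bit = 0; bit < 64; bit++) {
        uint64_t m = UINT64_C(1) << bit;
        if (!(tbl_bits & m)) {
            continue;               /* subtable doesn't use this block */
        }
        if (pkt_bits & m) {
            /* Index = popcount of packet map bits below this position. */
            int idx = __builtin_popcountll(pkt_bits & (m - 1));
            blocks_out[out++] = pkt_data[idx];
        } else {
            blocks_out[out++] = 0;  /* packet doesn't have this block */
        }
    }
}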
> > ---
Hi Harry,

I managed to find a machine with avx512 in google cloud and did some
performance testing. I saw lower performance when enabling avx512,
I believe I did something wrong. Do you mind having a look:

1) first a compile error
diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
index b22a26b8c8a2..5c71096c10c5 100644
--- a/lib/dpif-netdev-lookup.c
+++ b/lib/dpif-netdev-lookup.c
@@ -1,5 +1,6 @@

 #include <config.h>
+#include <errno.h>
 #include "dpif-netdev-lookup.h"

 #include "openvswitch/vlog.h"
---

2) cpuinfo
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm
constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq
pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch
invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle
avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap
clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1
xsaves arat md_clear arch_capabilities

3) start ovs and set avx and traffic gen
 ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
 ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk
options:dpdk-devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1

4) dp flows with miniflow info
root@instance-3:~/ovs# ovs-appctl dpctl/dump-flows -m
flow-dump from pmd on cpu core: 0
ufid:caf11111-2e15-418c-a7d4-b4ec377593ca,
skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:0a:b6:00:02,dst=42:01:0a:b6:00:01),eth_type(0x0800),ipv4(src=10.182.0.2/0.0.0.0,dst=76.21.95.192/0.0.0.0,proto=6/0,tos=0x10/0,ttl=64/0,frag=no),tcp(src=22/0,dst=62190/0),tcp_flags(0/0),
packets:0, bytes:0, used:never, dp:ovs, actions:drop,
dp-extra-info:miniflow_bits(5,1)
ufid:78cc1751-3a81-4dba-900c-b3507d965bdc,
skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:0a:b6:00:02,dst=42:01:0a:b6:00:01),eth_type(0x0800),ipv4(src=10.182.0.2/0.0.0.0,dst=169.254.169.254/0.0.0.0,proto=6/0,tos=0/0,ttl=64/0,frag=no),tcp(src=51650/0,dst=80/0),tcp_flags(0/0),
packets:0, bytes:0, used:never, dp:ovs, actions:drop,
dp-extra-info:miniflow_bits(5,1)

5) pmd-stat-show
root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
pmd thread numa_id 0 core_id 0:
  packets received: 19838528
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 0
  smc hits: 0
  megaflow hits: 0
  avg. subtable lookups per megaflow hit: 0.00  (---> this doesn't
look right ....)
  miss with success upcall: 78
  miss with failed upcall: 19838418
  avg. packets per output batch: 2.00
  idle cycles: 0 (0.00%)
  processing cycles: 103431787838 (100.00%)
  avg cycles per packet: 5213.68 (103431787838/19838528)
  avg processing cycles per packet: 5213.68 (103431787838/19838528)

6) gdb also looks not right..., I didn't see any avx512 instructions
(gdb) b avx512_lookup_impl
Breakpoint 2 at 0x55e92342a8df: avx512_lookup_impl. (4 locations)
Dump of assembler code for function dpcls_avx512_gather_skx_mf_5_1:
96     const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
   0x000055e92342a8df <+31>: mov    0x30(%rdi),%r8
97     const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
   0x000055e92342a8e3 <+35>: mov    0x38(%rdi),%r9
98     ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
   0x000055e92342a8f6 <+54>: xor    %eax,%eax
   0x000055e92342a8f8 <+56>: popcnt %r8,%rax
   0x000055e92342a8fd <+61>: cmp    $0x5,%eax
   0x000055e92342a900 <+64>: jne    0x55e92342abc3
<dpcls_avx512_gather_skx_mf_5_1+771>
   0x000055e92342abc3 <+771>: lea    0x277b0e(%rip),%rdx        # 0x55e9236a26d8
   0x000055e92342abca <+778>: lea    0x277ccf(%rip),%rsi        #
0x55e9236a28a0 <__func__.43755>
   0x000055e92342abd1 <+785>: lea    0x277b30(%rip),%rdi        # 0x55e9236a2708
   0x000055e92342abd8 <+792>: callq  0x55e9233a71e0 <ovs_assert_failure>
99     ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
   0x000055e92342a906 <+70>: xor    %eax,%eax
   0x000055e92342a908 <+72>: popcnt %r9,%rax
   0x000055e92342a90d <+77>: cmp    $0x1,%eax
   0x000055e92342a910 <+80>: jne    0x55e92342abdd
<dpcls_avx512_gather_skx_mf_5_1+797>
   0x000055e92342a916 <+86>: mov    %rcx,%r12
   0x000055e92342abdd <+797>: lea    0x277b54(%rip),%rdx        # 0x55e9236a2738
   0x000055e92342abe4 <+804>: lea    0x277cb5(%rip),%rsi        #
0x55e9236a28a0 <__func__.43755>
   0x000055e92342abeb <+811>: lea    0x277b76(%rip),%rdi        # 0x55e9236a2768
   0x000055e92342abf2 <+818>: callq  0x55e9233a71e0 <ovs_assert_failure>
100
101     /* Load subtable blocks for masking later */
102     const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
103     const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
104
105     /* Load pre-created subtable masks for each block in subtable */
106     const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
107     const __m512i v_mf_masks =
_mm512_maskz_loadu_epi64(bit_count_total_mask,
108                                                         subtable->mf_masks);
109
110     ULLONG_FOR_EACH_1 (i, keys_map) {
111         const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
   0x000055e92342a98a <+202>: movslq %ecx,%rax
   0x000055e92342a990 <+208>: mov    (%rdx,%rax,8),%r11
   0x000055e92342a999 <+217>: mov    0x8(%r11),%r10
112         const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
113
114         /* Pre-create register with *PER PACKET* u0 offset */
115         const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
116         const __m512i v_idx_u0_offset =
_mm512_maskz_set1_epi64(u1_bcast_mask,
   0x000055e92342a994 <+212>: xor    %eax,%eax
   0x000055e92342a99d <+221>: popcnt %r10,%rax

Thanks!
William
Van Haaren, Harry May 20, 2020, 10:19 a.m. UTC | #4
> -----Original Message-----
> From: William Tu <u9012063@gmail.com>
> Sent: Wednesday, May 20, 2020 1:12 AM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> implementation
> 
> On Mon, May 18, 2020 at 9:12 AM Van Haaren, Harry
> <harry.van.haaren@intel.com> wrote:
> >
> > > -----Original Message-----
> > > From: William Tu <u9012063@gmail.com>
> > > Sent: Monday, May 18, 2020 3:58 PM
> > > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> > > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> > > implementation
> > >
> > > On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
> > > > This commit adds an AVX-512 dpcls lookup implementation.
> > > > It uses the AVX-512 SIMD ISA to perform multiple miniflow
> > > > operations in parallel.

<snip lots of code/patch contents for readability>

> Hi Harry,
> 
> I managed to find a machine with avx512 in google cloud and did some
> performance testing. I saw lower performance when enabling avx512,
> I believe I did something wrong. Do you mind having a look:
> 
> 1) first a compile error
> diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
> index b22a26b8c8a2..5c71096c10c5 100644
> --- a/lib/dpif-netdev-lookup.c
> +++ b/lib/dpif-netdev-lookup.c
> @@ -1,5 +1,6 @@
> 
>  #include <config.h>
> +#include <errno.h>
>  #include "dpif-netdev-lookup.h"
> 
>  #include "openvswitch/vlog.h"

Existing code compiles fine here - but I've added this in the v3, thanks for flagging.


> 2) cpuinfo
> flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
> pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm
> constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq
> pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
> aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch
> invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle
> avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap
> clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1
> xsaves arat md_clear arch_capabilities

The avx512f and dq/cd/bw/vl extensions indicate AVX512 is available on this machine; all looks good so far.


> 
> 3) start ovs and set avx and traffic gen
>  ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
>  ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk
> options:dpdk-devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1

The output of the first command (enabling the AVX512 lookup) posts some output to the INFO log - please ensure it's there?

2020-05-20T09:39:09Z|00262|dpif_netdev_lookup|INFO|Subtable function 'avx512_gather' set priority to 4
2020-05-20T09:39:09Z|00006|dpif_netdev(pmd-c15/id:99)|INFO|reprobing sub func, 5 1


> 4) dp flows with miniflow info
> root@instance-3:~/ovs# ovs-appctl dpctl/dump-flows -m
> flow-dump from pmd on cpu core: 0
> ufid:caf11111-2e15-418c-a7d4-b4ec377593ca,
> skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> 0a:b6:00:02,dst=42:01:0a:b6:00:01),eth_type(0x0800),ipv4(src=10.182.0.2/0.0.0.0,
> dst=76.21.95.192/0.0.0.0,proto=6/0,tos=0x10/0,ttl=64/0,frag=no),tcp(src=22/0,ds
> t=62190/0),tcp_flags(0/0),
> packets:0, bytes:0, used:never, dp:ovs, actions:drop,
> dp-extra-info:miniflow_bits(5,1)
> ufid:78cc1751-3a81-4dba-900c-b3507d965bdc,
> skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> 0a:b6:00:02,dst=42:01:0a:b6:00:01),eth_type(0x0800),ipv4(src=10.182.0.2/0.0.0.0,
> dst=169.254.169.254/0.0.0.0,proto=6/0,tos=0/0,ttl=64/0,frag=no),tcp(src=51650/
> 0,dst=80/0),tcp_flags(0/0),
> packets:0, bytes:0, used:never, dp:ovs, actions:drop,
> dp-extra-info:miniflow_bits(5,1)

It seems the "packets:0, bytes:0,used:never" tags indicate that there is no traffic hitting these rules at all?

Output here (with traffic running for a while) shows:
packets:621588543, bytes:37295312580, used:0.000s, dp:ovs, actions:dpdk1, dp-extra-info:miniflow_bits(4,1)


> 5) pmd-stat-show
> root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
> pmd thread numa_id 0 core_id 0:
>   packets received: 19838528
>   packet recirculations: 0
>   avg. datapath passes per packet: 1.00
>   emc hits: 0
>   smc hits: 0
>   megaflow hits: 0
>   avg. subtable lookups per megaflow hit: 0.00  (---> this doesn't
> look right ....)
>   miss with success upcall: 78
>   miss with failed upcall: 19838418
>   avg. packets per output batch: 2.00
>   idle cycles: 0 (0.00%)
>   processing cycles: 103431787838 (100.00%)
>   avg cycles per packet: 5213.68 (103431787838/19838528)
>   avg processing cycles per packet: 5213.68 (103431787838/19838528)

Would you try the pmd-stats-show command before setting the AVX512 lookup?
If the issue is still present it would indicate it's not related to the exact lookup
implementation.

Running the same test on master, on the patchset using scalar, and on the patchset using AVX512
for lookup all provide a valid megaflow hit count, exactly equal to (packets_rx - 1),
as the first packet goes to the upcall to install the rule (and hence doesn't hit the rule :)

$ ./utilities/ovs-appctl dpif-netdev/pmd-stats-show

### Master
pmd thread numa_id 0 core_id 15:
  packets received: 503095
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 0
  smc hits: 0
  megaflow hits: 503094
  avg. subtable lookups per megaflow hit: 1.00

### Scalar Lookup
$ ./utilities/ovs-appctl dpif-netdev/pmd-stats-show
pmd thread numa_id 0 core_id 15:
  packets received: 508759
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 0
  smc hits: 0
  megaflow hits: 508758
  avg. subtable lookups per megaflow hit: 1.00

### AVX512 Lookup
./utilities/ovs-appctl dpif-netdev/pmd-stats-show
pmd thread numa_id 0 core_id 15:
  packets received: 540311
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 0
  smc hits: 0
  megaflow hits: 540310
  avg. subtable lookups per megaflow hit: 1.00


> 6) gdb also looks not right..., I didn't see any avx512 instructions
> (gdb) b avx512_lookup_impl
> Breakpoint 2 at 0x55e92342a8df: avx512_lookup_impl. (4 locations)
> Dump of assembler code for function dpcls_avx512_gather_skx_mf_5_1:
> 96     const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
>    0x000055e92342a8df <+31>: mov    0x30(%rdi),%r8
> 97     const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
>    0x000055e92342a8e3 <+35>: mov    0x38(%rdi),%r9
> 98     ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
<snip some ASM>

(gdb) disas dpcls_avx512_gather_skx_mf_5_1
<snip preamble>
  0x0000555556103f34 <+724>:   vmovdqu64 0x28(%rdi),%zmm2{%k1}{z}
   0x0000555556103f3e <+734>:   vmovdqu64 0x18(%rcx),%zmm0{%k1}{z}
   0x0000555556103f48 <+744>:   vpandd %zmm0,%zmm1,%zmm0
   0x0000555556103f4e <+750>:   vpcmpeqq %zmm2,%zmm0,%k7{%k1}

Disassembly shows AVX512 register usage here, as expected.

Note the "avx512_lookup_impl" is a static function in a .c file, so it is not visible
outside the compilation unit. Further, it is also marked "ALWAYS_INLINE", so even
inside the compilation unit, there isn't a symbol with that name. I'm surprised GDB
let me set a breakpoint on it. Disassembling it doesn't work:
(gdb) b avx512_lookup_impl
Breakpoint 2 at 0x5555561035af: avx512_lookup_impl. (4 locations)
(gdb) disas avx512_lookup_impl
No symbol "avx512_lookup_impl" in current context.

The functions it is inlined into are available for disassembly, as their symbols
do exist in the binary; a rough sketch of the pattern follows the list below.
(Sidenote: going to add dpcls_ to the _any function for consistency in naming with the others.)
dpcls_avx512_gather_skx_mf_4_0 
dpcls_avx512_gather_skx_mf_4_1
dpcls_avx512_gather_skx_mf_5_1
avx512_gather_any
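
For reference, the specialization pattern is roughly the following (a simplified sketch;
the real macro body and function signatures in the patch differ):

#include <stdint.h>

/* One always-inline implementation, stamped out by the macro into real,
 * externally visible symbols with the bit counts as compile-time
 * constants (which is what lets the compiler unroll the loops). */
static inline __attribute__((__always_inline__)) uint32_t
lookup_impl(const void *subtable, uint32_t keys_map, const void *keys,
            void **rules, const uint32_t u0_bits, const uint32_t u1_bits)
{
    /* ... gather/hash/compare body, specialized for u0_bits + u1_bits ... */
    (void) subtable; (void) keys; (void) rules;
    (void) u0_bits; (void) u1_bits;
    return keys_map;
}

#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                        \
    uint32_t                                                             \
    dpcls_avx512_gather_skx_mf_##U0##_##U1(const void *subtable,         \
                                           uint32_t keys_map,            \
                                           const void *keys,             \
                                           void **rules)                 \
    {                                                                    \
        return lookup_impl(subtable, keys_map, keys, rules, U0, U1);     \
    }

DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)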

Disassembling the _any version of the avx512 lookup function here
shows the AVX512 instructions, using ZMM registers and {k} masks.
(gdb) disas avx512_gather_mf_any
Dump of assembler code for function avx512_gather_mf_any:
   0x0000555556103fb0 <+0>:     lea    0x8(%rsp),%r10
   0x0000555556103fb5 <+5>:     and    $0xffffffffffffffc0,%rsp
   0x0000555556103fb9 <+9>:     pushq  -0x8(%r10)
<skipping preamble/pushes etc, to the fun AVX512 part>
   0x00005555561040dd <+301>:   vpandd %zmm0,%zmm5,%zmm0
   0x00005555561040e3 <+307>:   or     %rdi,%rax
   0x00005555561040e6 <+310>:   test   %r8,%r8
   0x00005555561040e9 <+313>:   kmovb  %eax,%k4
   0x00005555561040ed <+317>:   vpsrlq $0x4,%zmm0,%zmm2
   0x00005555561040f4 <+324>:   vpandd %zmm3,%zmm0,%zmm0
   0x00005555561040fa <+330>:   vpandd %zmm2,%zmm3,%zmm2
   0x0000555556104100 <+336>:   vpshufb %zmm0,%zmm4,%zmm0
   0x0000555556104106 <+342>:   vpshufb %zmm2,%zmm4,%zmm2
   0x000055555610410c <+348>:   vpaddb %zmm2,%zmm0,%zmm0
   0x0000555556104112 <+354>:   vpsadbw %zmm7,%zmm0,%zmm0
   0x0000555556104118 <+360>:   vpaddq %zmm1,%zmm0,%zmm0
   0x000055555610411e <+366>:   vmovdqa64 %zmm8,%zmm1
   0x0000555556104124 <+372>:   vpgatherqq 0x18(%r9,%zmm0,8),%zmm1{%k3}
   0x000055555610412c <+380>:   vpandq %zmm6,%zmm1,%zmm0{%k4}{z}

Would you try some of the above and see if it can be reproduced?

Regards, -Harry
Federico Iezzi May 20, 2020, 10:34 a.m. UTC | #5
On Wed, 20 May 2020 at 12:20, Van Haaren, Harry <harry.van.haaren@intel.com>
wrote:

> > -----Original Message-----
> > From: William Tu <u9012063@gmail.com>
> > Sent: Wednesday, May 20, 2020 1:12 AM
> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> > implementation
> >
> > On Mon, May 18, 2020 at 9:12 AM Van Haaren, Harry
> > <harry.van.haaren@intel.com> wrote:
> > >
> > > > -----Original Message-----
> > > > From: William Tu <u9012063@gmail.com>
> > > > Sent: Monday, May 18, 2020 3:58 PM
> > > > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > > > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> > > > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> > > > implementation
> > > >
> > > > On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
> > > > > This commit adds an AVX-512 dpcls lookup implementation.
> > > > > It uses the AVX-512 SIMD ISA to perform multiple miniflow
> > > > > operations in parallel.
>
> <snip lots of code/patch contents for readability>
>
> > Hi Harry,
> >
> > I managed to find a machine with avx512 in google cloud and did some
> > performance testing. I saw lower performance when enabling avx512,
>

AVX512 instruction path lowers the clock speed well below the base
frequency [1].
Aren't you killing the PMD performance while improving the lookup ones?

[1]
https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/2nd-gen-xeon-scalable-spec-update.pdf
(see
page 20)


> > I believe I did something wrong. Do you mind having a look:
> >
> > 1) first a compile error
> > diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
> > index b22a26b8c8a2..5c71096c10c5 100644
> > --- a/lib/dpif-netdev-lookup.c
> > +++ b/lib/dpif-netdev-lookup.c
> > @@ -1,5 +1,6 @@
> >
> >  #include <config.h>
> > +#include <errno.h>
> >  #include "dpif-netdev-lookup.h"
> >
> >  #include "openvswitch/vlog.h"
>
> Existing code compiles fine here - but I've added this in the v3, thanks
> for flagging.
>
>
> > 2) cpuinfo
> > flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
> > pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm
> > constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq
> > pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
> > aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch
> > invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle
> > avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap
> > clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1
> > xsaves arat md_clear arch_capabilities
>
> The avx512f and dq/cd/bw/vl extensions indicate AVX512 is available on
> this machine, all looks good so far.
>
>
> >
> > 3) start ovs and set avx and traffic gen
> >  ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
> >  ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk
> >
> options:dpdk-devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1
>
> The output of the first command (enabling the AVX512 lookup) posts some
> output to Log INFO, please ensure its there?
>
> 2020-05-20T09:39:09Z|00262|dpif_netdev_lookup|INFO|Subtable function
> 'avx512_gather' set priority to 4
> 2020-05-20T09:39:09Z|00006|dpif_netdev(pmd-c15/id:99)|INFO|reprobing sub
> func, 5 1
>
>
> > 4) dp flows with miniflow info
> > root@instance-3:~/ovs# ovs-appctl dpctl/dump-flows -m
> > flow-dump from pmd on cpu core: 0
> > ufid:caf11111-2e15-418c-a7d4-b4ec377593ca,
> >
> skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> >
> 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> > 0a:b6:00:02,dst=42:01:0a:b6:00:01),eth_type(0x0800),ipv4(src=
> 10.182.0.2/0.0.0.0,
> > dst=76.21.95.192/0.0.0.0,proto=6/0,tos=0x10/0,ttl=64/0,frag=no
> ),tcp(src=22/0,ds
> > t=62190/0),tcp_flags(0/0),
> > packets:0, bytes:0, used:never, dp:ovs, actions:drop,
> > dp-extra-info:miniflow_bits(5,1)
> > ufid:78cc1751-3a81-4dba-900c-b3507d965bdc,
> >
> skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> >
> 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> > 0a:b6:00:02,dst=42:01:0a:b6:00:01),eth_type(0x0800),ipv4(src=
> 10.182.0.2/0.0.0.0,
> > dst=169.254.169.254/0.0.0.0,proto=6/0,tos=0/0,ttl=64/0,frag=no
> ),tcp(src=51650/
> > 0,dst=80/0),tcp_flags(0/0),
> > packets:0, bytes:0, used:never, dp:ovs, actions:drop,
> > dp-extra-info:miniflow_bits(5,1)
>
> It seems the "packets:0, bytes:0,used:never" tags indicate that there is
> no traffic hitting these rules at all?
>
> Output here (with traffic running for a while) shows:
> packets:621588543, bytes:37295312580, used:0.000s, dp:ovs, actions:dpdk1,
> dp-extra-info:miniflow_bits(4,1)
>
>
> > 5) pmd-stat-show
> > root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
> > pmd thread numa_id 0 core_id 0:
> >   packets received: 19838528
> >   packet recirculations: 0
> >   avg. datapath passes per packet: 1.00
> >   emc hits: 0
> >   smc hits: 0
> >   megaflow hits: 0
> >   avg. subtable lookups per megaflow hit: 0.00  (---> this doesn't
> > look right ....)
> >   miss with success upcall: 78
> >   miss with failed upcall: 19838418
> >   avg. packets per output batch: 2.00
> >   idle cycles: 0 (0.00%)
> >   processing cycles: 103431787838 (100.00%)
> >   avg cycles per packet: 5213.68 (103431787838/19838528)
> >   avg processing cycles per packet: 5213.68 (103431787838/19838528)
>
> Would you try the pmd-stats-show command before setting the AVX512 lookup?
> If the issue is still present it would indicate its not related to the
> exact lookup
> implementation.
>
> Running the same test on master, patchset using scalar, and patchset using
> AVX512
> for lookup all provides a valid megaflow hits count, exactly equal to
> (packets_rx - 1)
> as the first one goes to the upcall to install the rule (and hence doesn't
> hit the rule :)
>
> $ ./utilities/ovs-appctl dpif-netdev/pmd-stats-show
>
> ### Master
> pmd thread numa_id 0 core_id 15:
>   packets received: 503095
>   packet recirculations: 0
>   avg. datapath passes per packet: 1.00
>   emc hits: 0
>   smc hits: 0
>   megaflow hits: 503094
>   avg. subtable lookups per megaflow hit: 1.00
>
> ### Scalar Lookup
> $ ./utilities/ovs-appctl dpif-netdev/pmd-stats-show
> pmd thread numa_id 0 core_id 15:
>   packets received: 508759
>   packet recirculations: 0
>   avg. datapath passes per packet: 1.00
>   emc hits: 0
>   smc hits: 0
>   megaflow hits: 508758
>   avg. subtable lookups per megaflow hit: 1.00
>
> ### AVX512 Lookup
> ./utilities/ovs-appctl dpif-netdev/pmd-stats-show
> pmd thread numa_id 0 core_id 15:
>   packets received: 540311
>   packet recirculations: 0
>   avg. datapath passes per packet: 1.00
>   emc hits: 0
>   smc hits: 0
>   megaflow hits: 540310
>   avg. subtable lookups per megaflow hit: 1.00
>
>
> > 6) gdb also looks not right..., I didn't see any avx512 instructions
> > (gdb) b avx512_lookup_impl
> > Breakpoint 2 at 0x55e92342a8df: avx512_lookup_impl. (4 locations)
> > Dump of assembler code for function dpcls_avx512_gather_skx_mf_5_1:
> > 96     const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> >    0x000055e92342a8df <+31>: mov    0x30(%rdi),%r8
> > 97     const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> >    0x000055e92342a8e3 <+35>: mov    0x38(%rdi),%r9
> > 98     ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
> <snip some ASM>
>
> (gdb) disas dpcls_avx512_gather_skx_mf_5_1
> <snip preamble>
>   0x0000555556103f34 <+724>:   vmovdqu64 0x28(%rdi),%zmm2{%k1}{z}
>    0x0000555556103f3e <+734>:   vmovdqu64 0x18(%rcx),%zmm0{%k1}{z}
>    0x0000555556103f48 <+744>:   vpandd %zmm0,%zmm1,%zmm0
>    0x0000555556103f4e <+750>:   vpcmpeqq %zmm2,%zmm0,%k7{%k1}
>
> Disassembly here shows AVX512 register usage here, as expected.
>
> Note the "avx512_lookup_impl" is a static function in a .c file, so it is
> not visible
> outside the compilation unit. Further, it is also marked "ALWAYS_INLINE",
> so even
> inside the compilation unit, there isn't a symbol with that name. I'm
> surprised GDB
> let me set a breakpoint on it. Disassembling it doesn't work:
> (gdb) b avx512_lookup_impl
> Breakpoint 2 at 0x5555561035af: avx512_lookup_impl. (4 locations)
> (gdb) disas avx512_lookup_impl
> No symbol "avx512_lookup_impl" in current context.
>
> The functions it is inlined into are available for disassembly, as their
> symbols
> do exist in the binary. (Sidenote: Going to add dpcls_ to the _any
> function for
> consistency in naming with the others);
> dpcls_avx512_gather_skx_mf_4_0
> dpcls_avx512_gather_skx_mf_4_1
> dpcls_avx512_gather_skx_mf_5_1
> avx512_gather_any
>
> Disassembling the _any version of the avx512 lookup function here
> shows the AVX512 instructions, using ZMM registers and {k} masks.
> (gdb) disas avx512_gather_mf_any
> Dump of assembler code for function avx512_gather_mf_any:
>    0x0000555556103fb0 <+0>:     lea    0x8(%rsp),%r10
>    0x0000555556103fb5 <+5>:     and    $0xffffffffffffffc0,%rsp
>    0x0000555556103fb9 <+9>:     pushq  -0x8(%r10)
> <skipping preamble/pushes etc, to the fun AVX512 part>
>    0x00005555561040dd <+301>:   vpandd %zmm0,%zmm5,%zmm0
>    0x00005555561040e3 <+307>:   or     %rdi,%rax
>    0x00005555561040e6 <+310>:   test   %r8,%r8
>    0x00005555561040e9 <+313>:   kmovb  %eax,%k4
>    0x00005555561040ed <+317>:   vpsrlq $0x4,%zmm0,%zmm2
>    0x00005555561040f4 <+324>:   vpandd %zmm3,%zmm0,%zmm0
>    0x00005555561040fa <+330>:   vpandd %zmm2,%zmm3,%zmm2
>    0x0000555556104100 <+336>:   vpshufb %zmm0,%zmm4,%zmm0
>    0x0000555556104106 <+342>:   vpshufb %zmm2,%zmm4,%zmm2
>    0x000055555610410c <+348>:   vpaddb %zmm2,%zmm0,%zmm0
>    0x0000555556104112 <+354>:   vpsadbw %zmm7,%zmm0,%zmm0
>    0x0000555556104118 <+360>:   vpaddq %zmm1,%zmm0,%zmm0
>    0x000055555610411e <+366>:   vmovdqa64 %zmm8,%zmm1
>    0x0000555556104124 <+372>:   vpgatherqq 0x18(%r9,%zmm0,8),%zmm1{%k3}
>    0x000055555610412c <+380>:   vpandq %zmm6,%zmm1,%zmm0{%k4}{z}
>
> Would you try some of the above and see can it be reproduced?
>
> Regards, -Harry
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
William Tu May 20, 2020, 1:32 p.m. UTC | #6
On Wed, May 20, 2020 at 3:35 AM Federico Iezzi <fiezzi@redhat.com> wrote:
>
>
>
>
>
> On Wed, 20 May 2020 at 12:20, Van Haaren, Harry <harry.van.haaren@intel.com> wrote:
>>
>> > -----Original Message-----
>> > From: William Tu <u9012063@gmail.com>
>> > Sent: Wednesday, May 20, 2020 1:12 AM
>> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
>> > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
>> > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
>> > implementation
>> >
>> > On Mon, May 18, 2020 at 9:12 AM Van Haaren, Harry
>> > <harry.van.haaren@intel.com> wrote:
>> > >
>> > > > -----Original Message-----
>> > > > From: William Tu <u9012063@gmail.com>
>> > > > Sent: Monday, May 18, 2020 3:58 PM
>> > > > To: Van Haaren, Harry <harry.van.haaren@intel.com>
>> > > > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
>> > > > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
>> > > > implementation
>> > > >
>> > > > On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
>> > > > > This commit adds an AVX-512 dpcls lookup implementation.
>> > > > > It uses the AVX-512 SIMD ISA to perform multiple miniflow
>> > > > > operations in parallel.
>>
>> <snip lots of code/patch contents for readability>
>>
>> > Hi Harry,
>> >
>> > I managed to find a machine with avx512 in google cloud and did some
>> > performance testing. I saw lower performance when enabling avx512,
>
>
> AVX512 instruction path lowers the clock speed well below the base frequency [1].
> Aren't you killing the PMD performance while improving the lookup ones?
>
> [1] https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/2nd-gen-xeon-scalable-spec-update.pdf (see page 20)
>

Hi Federico,

Thanks for sharing the link.
Does that mean that if an OVS PMD uses avx512 on one core, then all the other cores'
frequency will be lower?

There is some discussion here:
https://lemire.me/blog/2018/09/07/avx-512-when-and-how-to-use-these-new-instructions/
My take is that overall downclocking will happen, but the application
will still get better performance.
William
William Tu May 20, 2020, 2:20 p.m. UTC | #7
Hi Harry,

Thanks for your feedback!

> > > > On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
> > > > > This commit adds an AVX-512 dpcls lookup implementation.
> > > > > It uses the AVX-512 SIMD ISA to perform multiple miniflow
> > > > > operations in parallel.
>
> <snip lots of code/patch contents for readability>
>
> > Hi Harry,
> >
> > I managed to find a machine with avx512 in google cloud and did some
> > performance testing. I saw lower performance when enabling avx512,
> > I believe I did something wrong. Do you mind having a look:
> >
<snip>
> >
> > 3) start ovs and set avx and traffic gen
> >  ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
> >  ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk
> > options:dpdk-devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1
>
> The output of the first command (enabling the AVX512 lookup) posts some output to Log INFO, please ensure its there?
>
> 2020-05-20T09:39:09Z|00262|dpif_netdev_lookup|INFO|Subtable function 'avx512_gather' set priority to 4
> 2020-05-20T09:39:09Z|00006|dpif_netdev(pmd-c15/id:99)|INFO|reprobing sub func, 5 1
>
Yes, got these info logs.
ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev
ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3
ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true
ovs-ofctl add-flow br0 'actions=drop'
ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk \
  options:dpdk-devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1

LOG:
2020-05-20T13:49:26.542Z|00047|dpdk|INFO|DPDK Enabled - initialized
2020-05-20T13:49:26.544Z|00048|connmgr|INFO|br0<->unix#2: 1 flow_mods
in the last 0 s (1 adds)
2020-05-20T13:49:26.547Z|00049|dpif_netdev_lookup|INFO|Subtable
function 'avx512_gather' set priority to 5
2020-05-20T13:49:26.553Z|00050|netdev_dpdk|INFO|Device
'vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1' attached to
DPDK
2020-05-20T13:49:26.553Z|00051|dpif_netdev|INFO|PMD thread on numa_id:
0, core id:  0 created.
2020-05-20T13:49:26.554Z|00052|dpif_netdev|INFO|PMD thread on numa_id:
0, core id:  1 created.
2020-05-20T13:49:26.554Z|00053|dpif_netdev|INFO|There are 2 pmd
threads on numa node 0
2020-05-20T13:49:26.554Z|00054|dpdk|INFO|Device with port_id=0 already stopped
2020-05-20T13:49:26.648Z|00055|netdev_dpdk|WARN|Rx checksum offload is
not supported on port 0
2020-05-20T13:49:26.648Z|00056|netdev_dpdk|WARN|Interface tg0 does not
support MTU configuration, max packet size supported is 1500.
2020-05-20T13:49:26.648Z|00057|netdev_dpdk|INFO|Port 0: 02:70:63:61:70:00
2020-05-20T13:49:26.648Z|00058|dpif_netdev|INFO|Core 0 on numa node 0
assigned port 'tg0' rx queue 0 (measured processing cycles 0).
2020-05-20T13:49:26.648Z|00059|bridge|INFO|bridge br0: added interface
tg0 on port 1
2020-05-20T13:49:26.648Z|00001|ofproto_dpif_upcall(pmd-c00/id:9)|WARN|upcall_cb
failure: ukey installation fails
2020-05-20T13:49:27.562Z|00002|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
sub func, 4 1

>
> > 4) dp flows with miniflow info
<snip>
> It seems the "packets:0, bytes:0,used:never" tags indicate that there is no traffic hitting these rules at all?
> Output here (with traffic running for a while) shows:
> packets:621588543, bytes:37295312580, used:0.000s, dp:ovs, actions:dpdk1, dp-extra-info:miniflow_bits(4,1)
>
Thanks, these are the hit rules:
root@instance-3:~/ovs# ovs-appctl dpctl/dump-flows -m | grep -v never
flow-dump from pmd on cpu core: 0
ufid:f06998a0-9ff8-4ee5-b12f-5d7e2fcc7f0f,
skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:0a:b6:00:01/00:00:00:00:00:00,dst=42:01:0a:b6:00:02/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=169.254.169.254/0.0.0.0,dst=10.182.0.2/0.0.0.0,proto=6/0,tos=0/0,ttl=64/0,frag=no),tcp(src=80/0,dst=51642/0),tcp_flags(0/0),
packets:3942096, bytes:2511115152, used:0.001s, flags:P., dp:ovs,
actions:drop, dp-extra-info:miniflow_bits(4,1)
ufid:cb3a6eac-3a7d-4e0d-a145-414dd482b4b9,
skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:0a:b6:00:01/00:00:00:00:00:00,dst=42:01:0a:b6:00:02/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=169.254.169.254/0.0.0.0,dst=10.182.0.2/0.0.0.0,proto=6/0,tos=0/0,ttl=64/0,frag=no),tcp(src=80/0,dst=51650/0),tcp_flags(0/0),
packets:2779552, bytes:172332224, used:0.000s, flags:S., dp:ovs,
actions:drop, dp-extra-info:miniflow_bits(4,1)
ufid:781f3f48-ffd7-424f-ae99-62158ba05cbd,
skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:0a:b6:00:02/00:00:00:00:00:00,dst=42:01:0a:b6:00:01/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=10.182.0.2/0.0.0.0,dst=169.254.169.254/0.0.0.0,proto=6/0,tos=0/0,ttl=64/0,frag=no),tcp(src=51650/0,dst=80/0),tcp_flags(0/0),
packets:637373, bytes:216706820, used:0.000s, flags:P., dp:ovs,
actions:drop, dp-extra-info:miniflow_bits(4,1)

>
> > 5) pmd-stat-show
> > root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
> > pmd thread numa_id 0 core_id 0:
> >   packets received: 19838528
> >   packet recirculations: 0
> >   avg. datapath passes per packet: 1.00
> >   emc hits: 0
> >   smc hits: 0
> >   megaflow hits: 0
> >   avg. subtable lookups per megaflow hit: 0.00  (---> this doesn't
> > look right ....)
> >   miss with success upcall: 78
> >   miss with failed upcall: 19838418
> >   avg. packets per output batch: 2.00
> >   idle cycles: 0 (0.00%)
> >   processing cycles: 103431787838 (100.00%)
> >   avg cycles per packet: 5213.68 (103431787838/19838528)
> >   avg processing cycles per packet: 5213.68 (103431787838/19838528)
>
> Would you try the pmd-stats-show command before setting the AVX512 lookup?
> If the issue is still present it would indicate its not related to the exact lookup
> implementation.

Before setting AVX512
### Scalar Lookup
pmd thread numa_id 0 core_id 0:
  packets received: 77470176
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 70423947
  smc hits: 0
  megaflow hits: 7045897
  avg. subtable lookups per megaflow hit: 1.00
  miss with success upcall: 1
  miss with failed upcall: 331
  avg. packets per output batch: 0.00
  idle cycles: 0 (0.00%)
  processing cycles: 19596627706 (100.00%)
  avg cycles per packet: 252.96 (19596627706/77470176)
  avg processing cycles per packet: 252.96 (19596627706/77470176)

### AVX512 Lookup (restart ovs-vswitchd with additional command
"dpif-netdev/subtable-lookup-set avx512_gather 5"
pmd thread numa_id 0 core_id 0:
  packets received: 1178784
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 0
  smc hits: 0
  megaflow hits: 0
  avg. subtable lookups per megaflow hit: 0.00
  miss with success upcall: 13
  miss with failed upcall: 1178739     ---> this doesn't look right
  avg. packets per output batch: 0.00
  idle cycles: 0 (0.00%)
  processing cycles: 5408870500 (100.00%)
  avg cycles per packet: 4588.52 (5408870500/1178784)
  avg processing cycles per packet: 4588.52 (5408870500/1178784)

>
>
> > 6) gdb also looks not right..., I didn't see any avx512 instructions
> > (gdb) b avx512_lookup_impl
> > Breakpoint 2 at 0x55e92342a8df: avx512_lookup_impl. (4 locations)
> > Dump of assembler code for function dpcls_avx512_gather_skx_mf_5_1:
> > 96     const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> >    0x000055e92342a8df <+31>: mov    0x30(%rdi),%r8
> > 97     const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> >    0x000055e92342a8e3 <+35>: mov    0x38(%rdi),%r9
> > 98     ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
> <snip some ASM>
>
> (gdb) disas dpcls_avx512_gather_skx_mf_5_1
> <snip preamble>
>   0x0000555556103f34 <+724>:   vmovdqu64 0x28(%rdi),%zmm2{%k1}{z}
>    0x0000555556103f3e <+734>:   vmovdqu64 0x18(%rcx),%zmm0{%k1}{z}
>    0x0000555556103f48 <+744>:   vpandd %zmm0,%zmm1,%zmm0
>    0x0000555556103f4e <+750>:   vpcmpeqq %zmm2,%zmm0,%k7{%k1}
>
> Disassembly here shows AVX512 register usage here, as expected.

OK, I tried
(gdb) disas dpcls_avx512_gather_skx_mf_5_1
and it works for me. I can see avx512 instructions.

Setting a breakpoint at run time also works:
(gdb) b dpcls_avx512_gather_skx_mf_4_1
Thread 13 "pmd-c00/id:9" hit Breakpoint 1,
dpcls_avx512_gather_skx_mf_4_1 (subtable=0x7f732c008210,
    keys_map=1, keys=0x7f733af2a798, rules=0x7f733af2a7a0) at
lib/dpif-netdev-lookup-avx512-gather.c:212
212 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)

Using perf record also shows:
   3.11%  pmd-c00/id:9  ovs-vswitchd        [.] dpcls_avx512_gather_skx_mf_4_1

dpcls_avx512_gather_skx_mf_4_1  /usr/local/sbin/ovs-vswitchd [Percent:
local period]
  0.48 │       lea          -0x1(%rdi),%rax
       │     _mm512_srli_epi64():
       │     return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
       │       vpsrlq       $0x4,%zmm0,%zmm1
       │     _mm512_shuffle_epi8():
  0.80 │       vpandd       %zmm3,%zmm0,%zmm0
       │     avx512_lookup_impl():
       │     ULLONG_FOR_EACH_1 (i, keys_map) {
  0.32 │       and          %rax,%rdi
       │     _mm512_shuffle_epi8():
  0.16 │       vpandd       %zmm1,%zmm3,%zmm1
  0.48 │       vpshufb      %zmm0,%zmm4,%zmm0
  0.80 │       vpshufb      %zmm1,%zmm4,%zmm1


>
> Note the "avx512_lookup_impl" is a static function in a .c file, so it is not visible
> outside the compilation unit. Further, it is also marked "ALWAYS_INLINE", so even
> inside the compilation unit, there isn't a symbol with that name. I'm surprised GDB
> let me set a breakpoint on it. Disassembling it doesn't work:
> (gdb) b avx512_lookup_impl
> Breakpoint 2 at 0x5555561035af: avx512_lookup_impl. (4 locations)
> (gdb) disas avx512_lookup_impl
> No symbol "avx512_lookup_impl" in current context.
>
> The functions it is inlined into are available for disassembly, as their symbols
> do exist in the binary. (Sidenote: Going to add dpcls_ to the _any function for
> consistency in naming with the others);
> dpcls_avx512_gather_skx_mf_4_0
> dpcls_avx512_gather_skx_mf_4_1
> dpcls_avx512_gather_skx_mf_5_1
> avx512_gather_any
>
> Disassembling the _any version of the avx512 lookup function here
> shows the AVX512 instructions, using ZMM registers and {k} masks.
> (gdb) disas avx512_gather_mf_any
> Dump of assembler code for function avx512_gather_mf_any:
>    0x0000555556103fb0 <+0>:     lea    0x8(%rsp),%r10
>    0x0000555556103fb5 <+5>:     and    $0xffffffffffffffc0,%rsp
>    0x0000555556103fb9 <+9>:     pushq  -0x8(%r10)
> <skipping preamble/pushes etc, to the fun AVX512 part>
>    0x00005555561040dd <+301>:   vpandd %zmm0,%zmm5,%zmm0
>    0x00005555561040e3 <+307>:   or     %rdi,%rax
>    0x00005555561040e6 <+310>:   test   %r8,%r8
>    0x00005555561040e9 <+313>:   kmovb  %eax,%k4
>    0x00005555561040ed <+317>:   vpsrlq $0x4,%zmm0,%zmm2
>    0x00005555561040f4 <+324>:   vpandd %zmm3,%zmm0,%zmm0
>    0x00005555561040fa <+330>:   vpandd %zmm2,%zmm3,%zmm2
>    0x0000555556104100 <+336>:   vpshufb %zmm0,%zmm4,%zmm0
>    0x0000555556104106 <+342>:   vpshufb %zmm2,%zmm4,%zmm2
>    0x000055555610410c <+348>:   vpaddb %zmm2,%zmm0,%zmm0
>    0x0000555556104112 <+354>:   vpsadbw %zmm7,%zmm0,%zmm0
>    0x0000555556104118 <+360>:   vpaddq %zmm1,%zmm0,%zmm0
>    0x000055555610411e <+366>:   vmovdqa64 %zmm8,%zmm1
>    0x0000555556104124 <+372>:   vpgatherqq 0x18(%r9,%zmm0,8),%zmm1{%k3}
>    0x000055555610412c <+380>:   vpandq %zmm6,%zmm1,%zmm0{%k4}{z}
>
> Would you try some of the above and see can it be reproduced?

btw, I saw that OVS is reprobing every second:
2020-05-20T14:15:15.113Z|00373|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
sub func, 4 1
2020-05-20T14:15:16.129Z|00374|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
sub func, 4 1
2020-05-20T14:15:17.138Z|00375|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
sub func, 4 1
2020-05-20T14:15:18.150Z|00376|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
sub func, 4 1
2020-05-20T14:15:19.170Z|00377|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
sub func, 4 1
2020-05-20T14:15:20.184Z|00378|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
sub func, 4 1
2020-05-20T14:15:21.219Z|00379|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
sub func, 4 1

Regards,
William
William Tu May 20, 2020, 3:14 p.m. UTC | #8
<snip>

> >    0x00005555561040e9 <+313>:   kmovb  %eax,%k4
> >    0x00005555561040ed <+317>:   vpsrlq $0x4,%zmm0,%zmm2
> >    0x00005555561040f4 <+324>:   vpandd %zmm3,%zmm0,%zmm0
> >    0x00005555561040fa <+330>:   vpandd %zmm2,%zmm3,%zmm2
> >    0x0000555556104100 <+336>:   vpshufb %zmm0,%zmm4,%zmm0
> >    0x0000555556104106 <+342>:   vpshufb %zmm2,%zmm4,%zmm2
> >    0x000055555610410c <+348>:   vpaddb %zmm2,%zmm0,%zmm0
> >    0x0000555556104112 <+354>:   vpsadbw %zmm7,%zmm0,%zmm0
> >    0x0000555556104118 <+360>:   vpaddq %zmm1,%zmm0,%zmm0
> >    0x000055555610411e <+366>:   vmovdqa64 %zmm8,%zmm1
> >    0x0000555556104124 <+372>:   vpgatherqq 0x18(%r9,%zmm0,8),%zmm1{%k3}
> >    0x000055555610412c <+380>:   vpandq %zmm6,%zmm1,%zmm0{%k4}{z}
> >
> > Would you try some of the above and see can it be reproduced?
>
> btw, I saw every second ovs is doing reprobing
> 2020-05-20T14:15:15.113Z|00373|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1
> 2020-05-20T14:15:16.129Z|00374|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1
> 2020-05-20T14:15:17.138Z|00375|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1
> 2020-05-20T14:15:18.150Z|00376|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1
> 2020-05-20T14:15:19.170Z|00377|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1
> 2020-05-20T14:15:20.184Z|00378|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1
> 2020-05-20T14:15:21.219Z|00379|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1
>
btw, looking at ovs-appctl coverage/show, this counter is very high when enabling avx512:
  handler_duplicate_upcall 459645.4/sec 434475.500/sec 17300.5372/sec   total: 64120526

other counters look OK.
William
Federico Iezzi May 20, 2020, 4:13 p.m. UTC | #9
On Wed, 20 May 2020 at 15:32, William Tu <u9012063@gmail.com> wrote:

> On Wed, May 20, 2020 at 3:35 AM Federico Iezzi <fiezzi@redhat.com> wrote:
> >
> >
> >
> >
> >
> > On Wed, 20 May 2020 at 12:20, Van Haaren, Harry <
> harry.van.haaren@intel.com> wrote:
> >>
> >> > -----Original Message-----
> >> > From: William Tu <u9012063@gmail.com>
> >> > Sent: Wednesday, May 20, 2020 1:12 AM
> >> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> >> > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> >> > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> >> > implementation
> >> >
> >> > On Mon, May 18, 2020 at 9:12 AM Van Haaren, Harry
> >> > <harry.van.haaren@intel.com> wrote:
> >> > >
> >> > > > -----Original Message-----
> >> > > > From: William Tu <u9012063@gmail.com>
> >> > > > Sent: Monday, May 18, 2020 3:58 PM
> >> > > > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> >> > > > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> >> > > > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512
> gather
> >> > > > implementation
> >> > > >
> >> > > > On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
> >> > > > > This commit adds an AVX-512 dpcls lookup implementation.
> >> > > > > It uses the AVX-512 SIMD ISA to perform multiple miniflow
> >> > > > > operations in parallel.
> >>
> >> <snip lots of code/patch contents for readability>
> >>
> >> > Hi Harry,
> >> >
> >> > I managed to find a machine with avx512 in google cloud and did some
> >> > performance testing. I saw lower performance when enabling avx512,
> >
> >
> > AVX512 instruction path lowers the clock speed well below the base
> frequency [1].
> > Aren't you killing the PMD performance while improving the lookup ones?
> >
> > [1]
> https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/2nd-gen-xeon-scalable-spec-update.pdf
> (see page 20)
> >
>
> Hi Federico,
>
> Thanks for sharing the link.
> Does that mean if OVS PMD uses avx512 on one core, then all the other
> cores's
> frequency will be lower?
>

The clock is reduced only where avx512 instructions are executed, to cope
with the thermals.
I'm not sure if there is a situation where avx512 code is executed only on
specific PMDs; if that happens it is bad, as some PMDs may be faster/slower (see
below).
Kinda like when dynamic turbo boost is enabled and some PMDs go faster
because of the higher clock.


>
> There are some discussion here:
>
> https://lemire.me/blog/2018/09/07/avx-512-when-and-how-to-use-these-new-instructions/


Wow, quite interesting. Thanks!


>
> My take is that overall down clocking will happen, but application
> will get better performance.
>

Indeed the part of the code written for avx512 goes much faster; the rest
stays on the normal path and will go slower due to the reduced clock.
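As a purely illustrative back-of-envelope (made-up numbers; the real ones depend on the
SKU and on how much of the loop actually executes 512-bit instructions): if DPCLS lookup
is ~20% of the PMD's per-packet cycles and the AVX-512 version halves it, that is a ~10%
cycle saving; a 15% AVX-512 downclock on that core would wipe the gain out, while a light
~3% downclock would still leave a net win.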
Those are different use-cases and programs, but see the Cannon Lake AnandTech
review regarding what AVX512 can deliver:

###
When we crank on the AVX2 and AVX512, there is no stopping the Cannon Lake
chip here. At a score of 4519, it beats a full 18-core Core i9-7980XE
processor running in non-AVX.
https://www.anandtech.com/show/13405/intel-10nm-cannon-lake-and-core-i3-8121u-deep-dive-review/9
###

Indeed you have to expect much-improved performance from it; the question
is how much the non-avx512 code will slow down.
See also this one ->
https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html

HTH,
Federico


> William
>
>
Van Haaren, Harry May 21, 2020, 1:03 p.m. UTC | #10
> -----Original Message-----
> From: William Tu <u9012063@gmail.com>
> Sent: Wednesday, May 20, 2020 4:15 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> implementation

<snip for required context only>

> > 2020-05-20T14:15:20.184Z|00378|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> > sub func, 4 1
> > 2020-05-20T14:15:21.219Z|00379|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> > sub func, 4 1
> >
> btw, looking at
> ovs-appctl coverage/show, this counter is very high when enabling the avx512
>   handler_duplicate_upcall 459645.4/sec 434475.500/sec
> 17300.5372/sec   total: 64120526

This counter seems to post some garbage to me if I run it before any traffic?
Tested using OVS Master @ 48b1c7642 (not including any AVX512 patches):

./utilities/ovs-appctl coverage/show | grep duplicate_upcall
21:handler_duplicate_upcall   0.0/sec     0.000/sec        0.0000/sec   total: 10272710751479363764

# re-runs show different numbers - indicates a garbage-initialized counter perhaps?
21:handler_duplicate_upcall   0.0/sec     0.000/sec        0.0000/sec   total: 1049338714623956653
21:handler_duplicate_upcall   0.0/sec     0.000/sec        0.0000/sec   total: 18343161283719775679

Would you test the master branch and see if you can reproduce that garbage number before traffic?


Your setup shows 400k upcalls/sec, while here there are zero. Let's resolve that
discussion in the other email reply; I think there's a root cause visible in the log.
Van Haaren, Harry May 21, 2020, 1:12 p.m. UTC | #11
> -----Original Message-----
> From: William Tu <u9012063@gmail.com>
> Sent: Wednesday, May 20, 2020 3:20 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> implementation

<snip snip>

> > >
> > > 3) start ovs and set avx and traffic gen
> > >  ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
> > >  ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk
> > > options:dpdk-
> devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1
> >
> > The output of the first command (enabling the AVX512 lookup) posts some
> output to Log INFO, please ensure its there?
> >
> > 2020-05-20T09:39:09Z|00262|dpif_netdev_lookup|INFO|Subtable function
> 'avx512_gather' set priority to 4
> > 2020-05-20T09:39:09Z|00006|dpif_netdev(pmd-c15/id:99)|INFO|reprobing sub
> func, 5 1
> >
> Yes, got these info log.

OK - verified the AVX512 lookup is plugging in correctly, moving on.

> ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev
> ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3
> ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true
> ovs-ofctl add-flow br0 'actions=drop'
> ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
> ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk \
>   options:dpdk-
> devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1

I use Ether/VLAN/IPv4 to achieve a subtable with (4,1); is that the same as yours?
Just trying to remove variables between our setups.

> LOG:
> 2020-05-20T13:49:26.542Z|00047|dpdk|INFO|DPDK Enabled - initialized
> 2020-05-20T13:49:26.544Z|00048|connmgr|INFO|br0<->unix#2: 1 flow_mods
> in the last 0 s (1 adds)
> 2020-05-20T13:49:26.547Z|00049|dpif_netdev_lookup|INFO|Subtable
> function 'avx512_gather' set priority to 5
> 2020-05-20T13:49:26.553Z|00050|netdev_dpdk|INFO|Device
> 'vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1' attached to
> DPDK
> 2020-05-20T13:49:26.553Z|00051|dpif_netdev|INFO|PMD thread on numa_id:
> 0, core id:  0 created.
> 2020-05-20T13:49:26.554Z|00052|dpif_netdev|INFO|PMD thread on numa_id:
> 0, core id:  1 created.
> 2020-05-20T13:49:26.554Z|00053|dpif_netdev|INFO|There are 2 pmd
> threads on numa node 0
> 2020-05-20T13:49:26.554Z|00054|dpdk|INFO|Device with port_id=0 already
> stopped
> 2020-05-20T13:49:26.648Z|00055|netdev_dpdk|WARN|Rx checksum offload is
> not supported on port 0
> 2020-05-20T13:49:26.648Z|00056|netdev_dpdk|WARN|Interface tg0 does not
> support MTU configuration, max packet size supported is 1500.
> 2020-05-20T13:49:26.648Z|00057|netdev_dpdk|INFO|Port 0: 02:70:63:61:70:00
> 2020-05-20T13:49:26.648Z|00058|dpif_netdev|INFO|Core 0 on numa node 0
> assigned port 'tg0' rx queue 0 (measured processing cycles 0).
> 2020-05-20T13:49:26.648Z|00059|bridge|INFO|bridge br0: added interface
> tg0 on port 1
> 2020-05-20T13:49:26.648Z|00001|ofproto_dpif_upcall(pmd-
> c00/id:9)|WARN|upcall_cb
> failure: ukey installation fails
> 2020-05-20T13:49:27.562Z|00002|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1

Aha! This shows something's going wrong - there should not be any ukey-install failures!

This also explains why your logs (as per the follow-up email in the thread) have a high upcall count/sec:
the installed flow isn't being hit when it should match. I'm not sure what the root cause of these
ukey-installation failures is - but this is what we need to investigate :)

Understanding the traffic, and attempting to reproduce here, would be a good step forward.

Would you describe the traffic contained in the pcap?
Is it a single packet, or something that should hit a single DPCLS wildcarded flow?


> > > 4) dp flows with miniflow info
> <snip>
> > It seems the "packets:0, bytes:0,used:never" tags indicate that there is no
> traffic hitting these rules at all?
> > Output here (with traffic running for a while) shows:
> > packets:621588543, bytes:37295312580, used:0.000s, dp:ovs, actions:dpdk1,
> dp-extra-info:miniflow_bits(4,1)
> >
> Thanks, this is the hit rules:
> root@instance-3:~/ovs# ovs-appctl dpctl/dump-flows -m | grep -v never
> flow-dump from pmd on cpu core: 0
> ufid:f06998a0-9ff8-4ee5-b12f-5d7e2fcc7f0f,
> skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> 0a:b6:00:01/00:00:00:00:00:00,dst=42:01:0a:b6:00:02/00:00:00:00:00:00),eth_type
> (0x0800),ipv4(src=169.254.169.254/0.0.0.0,dst=10.182.0.2/0.0.0.0,proto=6/0,tos=
> 0/0,ttl=64/0,frag=no),tcp(src=80/0,dst=51642/0),tcp_flags(0/0),
> packets:3942096, bytes:2511115152, used:0.001s, flags:P., dp:ovs,
> actions:drop, dp-extra-info:miniflow_bits(4,1)
> ufid:cb3a6eac-3a7d-4e0d-a145-414dd482b4b9,
> skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> 0a:b6:00:01/00:00:00:00:00:00,dst=42:01:0a:b6:00:02/00:00:00:00:00:00),eth_type
> (0x0800),ipv4(src=169.254.169.254/0.0.0.0,dst=10.182.0.2/0.0.0.0,proto=6/0,tos=
> 0/0,ttl=64/0,frag=no),tcp(src=80/0,dst=51650/0),tcp_flags(0/0),
> packets:2779552, bytes:172332224, used:0.000s, flags:S., dp:ovs,
> actions:drop, dp-extra-info:miniflow_bits(4,1)
> ufid:781f3f48-ffd7-424f-ae99-62158ba05cbd,
> skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> 0a:b6:00:02/00:00:00:00:00:00,dst=42:01:0a:b6:00:01/00:00:00:00:00:00),eth_type
> (0x0800),ipv4(src=10.182.0.2/0.0.0.0,dst=169.254.169.254/0.0.0.0,proto=6/0,tos=
> 0/0,ttl=64/0,frag=no),tcp(src=51650/0,dst=80/0),tcp_flags(0/0),
> packets:637373, bytes:216706820, used:0.000s, flags:P., dp:ovs,
> actions:drop, dp-extra-info:miniflow_bits(4,1)

If a single DPCLS rule was expected, the quoted dumps showing 3 active rules are explained too.


> > > 5) pmd-stat-show
> > > root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
> > > pmd thread numa_id 0 core_id 0:
> > >   packets received: 19838528
> > >   packet recirculations: 0
> > >   avg. datapath passes per packet: 1.00
> > >   emc hits: 0
> > >   smc hits: 0
> > >   megaflow hits: 0
> > >   avg. subtable lookups per megaflow hit: 0.00  (---> this doesn't
> > > look right ....)
> > >   miss with success upcall: 78
> > >   miss with failed upcall: 19838418
> > >   avg. packets per output batch: 2.00
> > >   idle cycles: 0 (0.00%)
> > >   processing cycles: 103431787838 (100.00%)
> > >   avg cycles per packet: 5213.68 (103431787838/19838528)
> > >   avg processing cycles per packet: 5213.68 (103431787838/19838528)
> >
> > Would you try the pmd-stats-show command before setting the AVX512
> lookup?
> > If the issue is still present it would indicate its not related to the exact lookup
> > implementation.
> 
> Before setting AVX512
> ### Scalar Lookup
> pmd thread numa_id 0 core_id 0:
>   packets received: 77470176
>   packet recirculations: 0
>   avg. datapath passes per packet: 1.00
>   emc hits: 70423947
>   smc hits: 0
>   megaflow hits: 7045897
>   avg. subtable lookups per megaflow hit: 1.00
>   miss with success upcall: 1
>   miss with failed upcall: 331
>   avg. packets per output batch: 0.00
>   idle cycles: 0 (0.00%)
>   processing cycles: 19596627706 (100.00%)
>   avg cycles per packet: 252.96 (19596627706/77470176)
>   avg processing cycles per packet: 252.96 (19596627706/77470176)
> 
> ### AVX512 Lookup (restart ovs-vswitchd with additional command
> "dpif-netdev/subtable-lookup-set avx512_gather 5"
> pmd thread numa_id 0 core_id 0:
>   packets received: 1178784
>   packet recirculations: 0
>   avg. datapath passes per packet: 1.00
>   emc hits: 0
>   smc hits: 0
>   megaflow hits: 0
>   avg. subtable lookups per megaflow hit: 0.00
>   miss with success upcall: 13
>   miss with failed upcall: 1178739     ---> this looks not right
>   avg. packets per output batch: 0.00
>   idle cycles: 0 (0.00%)
>   processing cycles: 5408870500 (100.00%)
>   avg cycles per packet: 4588.52 (5408870500/1178784)
>   avg processing cycles per packet: 4588.52 (5408870500/1178784)

The statistics seem accurate (but indeed the upcall count is unexpected and too high).
This aligns with a ukey-install fail as noted in the logs above.

This seems to indicate that with the AVX512 lookup the ukey install fails.
I'd like to reproduce this to investigate - the above questions about traffic/rules
should hopefully be enough to identify the cause.

There is an alternative - set the "autovalidator" DPCLS implementation to
the highest priority, and it should ovs_assert() if the scalar/AVX512 implementations
mismatch. Then a dump of the OVS miniflow will give what's needed to verify root cause.
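
For example, re-using the subtable-lookup-set command with a priority higher than any other
implementation:

    ovs-appctl dpif-netdev/subtable-lookup-set autovalidator 100

The autovalidator then runs each lookup through all implementations, compares the match results
of each against a known-good result, and flags any mismatch.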


> > > 6) gdb also looks not right..., I didn't see any avx512 instructions
<snip>
> > Disassembly here shows AVX512 register usage here, as expected.
> 
> OK, tried
> (gdb) disas dpcls_avx512_gather_skx_mf_5_1
> and works for me. I can see avx512 instructions.

OK Great, good progress again.


> > Would you try some of the above and see can it be reproduced?
> 
> btw, I saw every second ovs is doing reprobing
> 2020-05-20T14:15:15.113Z|00373|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1
> 2020-05-20T14:15:16.129Z|00374|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> sub func, 4 1

Yes, this is expected in the v2 of the patchset, and something that will be revised/updated/refactored
away in a future version of the patchset - so that the reprobe only occurs when priorities are changed.

Regards, -Harry
Van Haaren, Harry May 21, 2020, 5:09 p.m. UTC | #12
Hey All,

[OT: Apologies for a missing indent, some HTML mixup occurred somewhere, now plain-text email again.]

>From: Federico Iezzi <fiezzi@redhat.com>
>Sent: Wednesday, May 20, 2020 5:13 PM
>To: William Tu <u9012063@gmail.com>
>Cc: Van Haaren, Harry <harry.van.haaren@intel.com>; ovs-dev@openvswitch.org; i.maximets@ovn.org
>Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather implementation
>
>On Wed, 20 May 2020 at 15:32, William Tu <u9012063@gmail.com> wrote:
>On Wed, May 20, 2020 at 3:35 AM Federico Iezzi <fiezzi@redhat.com> wrote:
>> On Wed, 20 May 2020 at 12:20, Van Haaren, Harry <harry.van.haaren@intel.com> wrote:
>>>
>>> > -----Original Message-----
>>> > From: William Tu <u9012063@gmail.com>
>>> > Sent: Wednesday, May 20, 2020 1:12 AM
>>> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
>>> > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
>>> > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
>>> > implementation
>>> >
>>> > On Mon, May 18, 2020 at 9:12 AM Van Haaren, Harry
>>> > <harry.van.haaren@intel.com> wrote:
>>> > >
>>> > > > -----Original Message-----
>>> > > > From: William Tu <u9012063@gmail.com>
>>> > > > Sent: Monday, May 18, 2020 3:58 PM
>>> > > > To: Van Haaren, Harry <harry.van.haaren@intel.com>
>>> > > > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
>>> > > > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
>>> > > > implementation
>>> > > >
>>> > > > On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
>>> > > > > This commit adds an AVX-512 dpcls lookup implementation.
>>> > > > > It uses the AVX-512 SIMD ISA to perform multiple miniflow
>>> > > > > operations in parallel.
>>>
>>> <snip lots of code/patch contents for readability>
>>>
>>> > Hi Harry,
>>> >
>>> > I managed to find a machine with avx512 in google cloud and did some
>>> > performance testing. I saw lower performance when enabling avx512,
>>
>>
>> AVX512 instruction path lowers the clock speed well below the base frequency [1].
>> Aren't you killing the PMD performance while improving the lookup ones?
>>
>> [1] https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/2nd-gen-xeon-scalable-spec-update.pdf (see page 20)

Thanks for raising your question - likely there are others with similar questions. It will be good to
discuss it here and to present the logic and design taken in these OVS patches for enabling AVX512.

From a frequency perspective, there is a misconception that AVX512 will always cause the worst-case degradation.
In practice, the frequency impact differs based on which instructions are executing. This does make it more
complicated, however there are rules here - and those rules provide us SW developers with best practices. I've added
my colleague Edwin on CC, who is much more familiar with the AVX512 frequency topic, and can provide more detail.


From an OVS software developer perspective, these were the design decisions that made the AVX512 enabling work:
AVX512 provides a very powerful compute ISA, so to optimize with it we must map the work onto it efficiently. This patchset
"flattens" the packet miniflow data-structure based on the miniflow of the subtable being matched on. In short,
it implements the tuple-space-search required for DPCLS wildcarded lookup in SIMD. The instruction count reduction
is large - and that's what ultimately leads to the performance improvements.
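
To make that concrete: the per-rule scalar match being replaced is roughly the following (a sketch only,
not the exact OVS code; pkt_blocks stands for the packet miniflow blocks already gathered into a flat array):

    /* Sketch only; assumes <stdint.h> and <stdbool.h>. */
    static bool
    rule_matches_scalar_sketch(const uint64_t *pkt_blocks,
                               const uint64_t *rule_key,
                               const uint64_t *rule_mask,
                               uint32_t n_blocks)
    {
        for (uint32_t i = 0; i < n_blocks; i++) {
            /* Mask the packet block and compare against the rule's key. */
            if ((pkt_blocks[i] & rule_mask[i]) != rule_key[i]) {
                return false;
            }
        }
        return true;
    }

The AVX512 path (netdev_rule_matches_key in the patch below) loads up to 8 such blocks into a single
zmm register and does the AND + compare across all of them at once, with the gather instruction doing
the "flattening" of the packet miniflow into that flat block array.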

Given a DPCLS implementation with AVX512, we must consider the other work done on that thread - you correctly
point out that other work (e.g. DPDK PMDs) also executes on that core. My experience has been that performance goes
up - including DPDK PMD rx and tx - the overall rate of work done increases. Given OVS can spend significant amounts of
time in DPCLS itself, even a potential frequency-related slowdown of the rest of the PMD code very likely still nets out as an overall performance improvement.

Finally, the design itself here is very flexible - it allows each deployment of OVS to test whether, and by how much, the AVX512
code-path improves real-world performance, and to enable it based on that.


>Thanks for sharing the link.
>Does that mean if OVS PMD uses avx512 on one core, then all the other cores's
>frequency will be lower?
>
>Only where avx512 instructions are executed the clock is reduced to cope with the thermals
>I'm not sure if there is a situation where avx512 code is executed only on specific PMDs, if that happens is bad as some may PMD be faster/slower (see below)
>Kinda like when dynamic turbo boost is enabled and some pmd go faster because of the higher clock
>
>
>There are some discussion here:
>https://lemire.me/blog/2018/09/07/avx-512-when-and-how-to-use-these-new-instructions/
>
>Wow, quite interesting. Thanks!
>
>
>My take is that overall down clocking will happen, but application
>will get better performance.
>
>Indeed the part of the code wrote for avx512 goes much faster, the rest, stay on the normal path and will go slow due to the reduced clock.
>Those are different use-cases and programs but see Cannon Lake Anandtech review regarding what AVX512 can deliver
>
>###
>When we crank on the AVX2 and AVX512, there is no stopping the Cannon Lake chip here. At a score of 4519, it beats a full 18-core Core i9-7980XE processor running in non-AVX.
>https://www.anandtech.com/show/13405/intel-10nm-cannon-lake-and-core-i3-8121u-deep-dive-review/9
>###
>
>Indeed you have to expect much-improved performance from it, the question is how much non-avx512 code will slow down
>See also this one -> https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html

There's a lot of (and some very detailed) information out there, and it's useful to read the available material.
Ultimately it is very unlikely somebody has tested your exact configuration or deployment, particularly since this
OVS patchset is only weeks old on the mailing-list. I welcome "$ perf top" output like William's email,
showing CPU %'s spent in DPCLS - the more real-world data, the better for showing the value of AVX512 in DPCLS.

Regards, -Harry
William Tu May 21, 2020, 10:30 p.m. UTC | #13
> > ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev
> > ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3
> > ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true
> > ovs-ofctl add-flow br0 'actions=drop'
> > ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
> > ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk \
> >   options:dpdk-
> > devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1
>
> I use Ether/VLAN/IPv4 to achieve a subtable with (4,1), is that the same as you?
> Just trying to remove variables between our setups.
>
btw I have only one OpenFlow rule, 'actions=drop'
The pcap file used as input is a random one I just captured on my machine's interface:
root@instance-3:~/ovs# tcpdump -n -r p0.pcap  | wc -l
reading from file p0.pcap, link-type EN10MB (Ethernet)
22
root@instance-3:~/ovs# tcpdump -n -r p0.pcap
reading from file p0.pcap, link-type EN10MB (Ethernet)
22:30:10.471943 IP 10.182.0.2.22 > 76.21.95.192.62190: Flags [P.], seq
3532581039:3532581163, ack 2971134033, win 501, options [nop,nop,TS
val 521819346 ecr 304440082], length 124
22:30:10.499759 IP 76.21.95.192.62190 > 10.182.0.2.22: Flags [.], ack
124, win 4092, options [nop,nop,TS val 304440141 ecr 521819346],
length 0
22:30:13.242821 IP 76.21.95.192.62190 > 10.182.0.2.22: Flags [P.], seq
1:37, ack 124, win 4096, options [nop,nop,TS val 304442869 ecr
521819346], length 36
22:30:13.243113 IP 10.182.0.2.22 > 76.21.95.192.62190: Flags [P.], seq
124:160, ack 37, win 501, options [nop,nop,TS val 521822117 ecr
304442869], length 36
22:30:13.271718 IP 76.21.95.192.62190 > 10.182.0.2.22: Flags [.], ack
160, win 4094, options [nop,nop,TS val 304442900 ecr 521822117],
length 0
22:30:13.415212 IP 76.21.95.192.62190 > 10.182.0.2.22: Flags [P.], seq
37:73, ack 160, win 4096, options [nop,nop,TS val 304443043 ecr
521822117], length 36
22:30:13.415479 IP 10.182.0.2.22 > 76.21.95.192.62190: Flags [P.], seq
160:196, ack 73, win 501, options [nop,nop,TS val 521822289 ecr
304443043], length 36
22:30:13.442371 IP 76.21.95.192.62190 > 10.182.0.2.22: Flags [.], ack
196, win 4094, options [nop,nop,TS val 304443069 ecr 521822289],
length 0
22:30:13.577866 IP 76.21.95.192.62190 > 10.182.0.2.22: Flags [P.], seq
73:109, ack 196, win 4096, options [nop,nop,TS val 304443208 ecr
521822289], length 36
22:30:13.578123 IP 10.182.0.2.22 > 76.21.95.192.62190: Flags [P.], seq
196:232, ack 109, win 501, options [nop,nop,TS val 521822452 ecr
304443208], length 36
22:30:13.608249 IP 76.21.95.192.62190 > 10.182.0.2.22: Flags [.], ack
232, win 4094, options [nop,nop,TS val 304443230 ecr 521822452],
length 0
22:30:16.932478 IP 169.254.169.254.80 > 10.182.0.2.51642: Flags [P.],
seq 1150154089:1150154672, ack 1477571123, win 65535, length 583:
HTTP: HTTP/1.1 200 OK
22:30:16.932540 IP 10.182.0.2.51642 > 169.254.169.254.80: Flags [.],
ack 583, win 64737, length 0
22:30:16.932547 IP 169.254.169.254.80 > 10.182.0.2.51642: Flags [F.],
seq 583, ack 1, win 65535, length 0
22:30:16.933193 IP 10.182.0.2.51642 > 169.254.169.254.80: Flags [F.],
seq 1, ack 584, win 64736, length 0
22:30:16.933280 IP 169.254.169.254.80 > 10.182.0.2.51642: Flags [.],
ack 2, win 65535, length 0
22:30:16.936976 IP 10.182.0.2.51650 > 169.254.169.254.80: Flags [S],
seq 1944213115, win 65320, options [mss 1420,sackOK,TS val 2204263930
ecr 0,nop,wscale 7], length 0
22:30:16.937201 IP 169.254.169.254.80 > 10.182.0.2.51650: Flags [S.],
seq 4118061879, ack 1944213116, win 65535, options [mss 1420,eol],
length 0
22:30:16.937234 IP 10.182.0.2.51650 > 169.254.169.254.80: Flags [.],
ack 1, win 65320, length 0
22:30:16.937297 IP 10.182.0.2.51650 > 169.254.169.254.80: Flags [P.],
seq 1:287, ack 1, win 65320, length 286: HTTP: GET
/computeMetadata/v1/instance/network-interfaces/?alt=json&last_etag=7c556bc02e6331f4&recursive=True&timeout_sec=72&wait_for_change=True
HTTP/1.1
22:30:16.937374 IP 169.254.169.254.80 > 10.182.0.2.51650: Flags [.],
ack 287, win 65249, length 0
22:30:16.937428 IP 169.254.169.254.80 > 10.182.0.2.51650: Flags [.],
ack 287, win 65535, length 0

I also attached the pcap file.
https://drive.google.com/file/d/1CR5pMebrtjzShF9bpXJcr9GAQY_6Og44/view?usp=sharing

> > LOG:
> > 2020-05-20T13:49:26.542Z|00047|dpdk|INFO|DPDK Enabled - initialized
> > 2020-05-20T13:49:26.544Z|00048|connmgr|INFO|br0<->unix#2: 1 flow_mods
> > in the last 0 s (1 adds)
> > 2020-05-20T13:49:26.547Z|00049|dpif_netdev_lookup|INFO|Subtable
> > function 'avx512_gather' set priority to 5
> > 2020-05-20T13:49:26.553Z|00050|netdev_dpdk|INFO|Device
> > 'vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1' attached to
> > DPDK
> > 2020-05-20T13:49:26.553Z|00051|dpif_netdev|INFO|PMD thread on numa_id:
> > 0, core id:  0 created.
> > 2020-05-20T13:49:26.554Z|00052|dpif_netdev|INFO|PMD thread on numa_id:
> > 0, core id:  1 created.
> > 2020-05-20T13:49:26.554Z|00053|dpif_netdev|INFO|There are 2 pmd
> > threads on numa node 0
> > 2020-05-20T13:49:26.554Z|00054|dpdk|INFO|Device with port_id=0 already
> > stopped
> > 2020-05-20T13:49:26.648Z|00055|netdev_dpdk|WARN|Rx checksum offload is
> > not supported on port 0
> > 2020-05-20T13:49:26.648Z|00056|netdev_dpdk|WARN|Interface tg0 does not
> > support MTU configuration, max packet size supported is 1500.
> > 2020-05-20T13:49:26.648Z|00057|netdev_dpdk|INFO|Port 0: 02:70:63:61:70:00
> > 2020-05-20T13:49:26.648Z|00058|dpif_netdev|INFO|Core 0 on numa node 0
> > assigned port 'tg0' rx queue 0 (measured processing cycles 0).
> > 2020-05-20T13:49:26.648Z|00059|bridge|INFO|bridge br0: added interface
> > tg0 on port 1
> > 2020-05-20T13:49:26.648Z|00001|ofproto_dpif_upcall(pmd-
> > c00/id:9)|WARN|upcall_cb
> > failure: ukey installation fails
> > 2020-05-20T13:49:27.562Z|00002|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> > sub func, 4 1
>
> Aha! This shows somethings going wrong - there should not be any ukey-install fails!
>
> This also explains why your logs (as per follow-up email in thread) have a high upcall count/sec,
> the installed flow isn't being hit when matched. I'm not sure what the root cause of these
> ukey-installation fails are - but this is what we need to investigate :)
>
> Understanding the traffic, and attempting to reproduce here would a good step forward.
>
> Would you describe the traffic contained in the pcap?
> Is it a single packet, or something that should hit a single DPCLS wildcarded flow?
>
described in the comment above.

>
> > > > 4) dp flows with miniflow info
> > <snip>
> > > It seems the "packets:0, bytes:0,used:never" tags indicate that there is no
> > traffic hitting these rules at all?
> > > Output here (with traffic running for a while) shows:
> > > packets:621588543, bytes:37295312580, used:0.000s, dp:ovs, actions:dpdk1,
> > dp-extra-info:miniflow_bits(4,1)
> > >
> > Thanks, this is the hit rules:
> > root@instance-3:~/ovs# ovs-appctl dpctl/dump-flows -m | grep -v never
> > flow-dump from pmd on cpu core: 0
> > ufid:f06998a0-9ff8-4ee5-b12f-5d7e2fcc7f0f,
> > skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> > 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> > 0a:b6:00:01/00:00:00:00:00:00,dst=42:01:0a:b6:00:02/00:00:00:00:00:00),eth_type
> > (0x0800),ipv4(src=169.254.169.254/0.0.0.0,dst=10.182.0.2/0.0.0.0,proto=6/0,tos=
> > 0/0,ttl=64/0,frag=no),tcp(src=80/0,dst=51642/0),tcp_flags(0/0),
> > packets:3942096, bytes:2511115152, used:0.001s, flags:P., dp:ovs,
> > actions:drop, dp-extra-info:miniflow_bits(4,1)
> > ufid:cb3a6eac-3a7d-4e0d-a145-414dd482b4b9,
> > skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> > 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> > 0a:b6:00:01/00:00:00:00:00:00,dst=42:01:0a:b6:00:02/00:00:00:00:00:00),eth_type
> > (0x0800),ipv4(src=169.254.169.254/0.0.0.0,dst=10.182.0.2/0.0.0.0,proto=6/0,tos=
> > 0/0,ttl=64/0,frag=no),tcp(src=80/0,dst=51650/0),tcp_flags(0/0),
> > packets:2779552, bytes:172332224, used:0.000s, flags:S., dp:ovs,
> > actions:drop, dp-extra-info:miniflow_bits(4,1)
> > ufid:781f3f48-ffd7-424f-ae99-62158ba05cbd,
> > skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(
> > 0/0),recirc_id(0),dp_hash(0/0),in_port(tg0),packet_type(ns=0,id=0),eth(src=42:01:
> > 0a:b6:00:02/00:00:00:00:00:00,dst=42:01:0a:b6:00:01/00:00:00:00:00:00),eth_type
> > (0x0800),ipv4(src=10.182.0.2/0.0.0.0,dst=169.254.169.254/0.0.0.0,proto=6/0,tos=
> > 0/0,ttl=64/0,frag=no),tcp(src=51650/0,dst=80/0),tcp_flags(0/0),
> > packets:637373, bytes:216706820, used:0.000s, flags:P., dp:ovs,
> > actions:drop, dp-extra-info:miniflow_bits(4,1)
>
> If single DPCLS rule expected, the below dumps of 3 rules active is explained too.
>
>
> > > > 5) pmd-stat-show
> > > > root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
> > > > pmd thread numa_id 0 core_id 0:
> > > >   packets received: 19838528
> > > >   packet recirculations: 0
> > > >   avg. datapath passes per packet: 1.00
> > > >   emc hits: 0
> > > >   smc hits: 0
> > > >   megaflow hits: 0
> > > >   avg. subtable lookups per megaflow hit: 0.00  (---> this doesn't
> > > > look right ....)
> > > >   miss with success upcall: 78
> > > >   miss with failed upcall: 19838418
> > > >   avg. packets per output batch: 2.00
> > > >   idle cycles: 0 (0.00%)
> > > >   processing cycles: 103431787838 (100.00%)
> > > >   avg cycles per packet: 5213.68 (103431787838/19838528)
> > > >   avg processing cycles per packet: 5213.68 (103431787838/19838528)
> > >
> > > Would you try the pmd-stats-show command before setting the AVX512
> > lookup?
Yes.
before setting avx512:
root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
pmd thread numa_id 0 core_id 0:
  packets received: 70630720
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 64206054
  smc hits: 0
  megaflow hits: 6424309
  avg. subtable lookups per megaflow hit: 1.00
  miss with success upcall: 1
  miss with failed upcall: 324
  avg. packets per output batch: 0.00
  idle cycles: 1668002 (0.01%)
  processing cycles: 17710219822 (99.99%)
  avg cycles per packet: 250.77 (17711887824/70630720)
  avg processing cycles per packet: 250.74 (17710219822/70630720)


> > > If the issue is still present it would indicate its not related to the exact lookup
> > > implementation.
> >
> > Before setting AVX512
> > ### Scalar Lookup
> > pmd thread numa_id 0 core_id 0:
> >   packets received: 77470176
> >   packet recirculations: 0
> >   avg. datapath passes per packet: 1.00
> >   emc hits: 70423947
> >   smc hits: 0
> >   megaflow hits: 7045897
> >   avg. subtable lookups per megaflow hit: 1.00
> >   miss with success upcall: 1
> >   miss with failed upcall: 331
> >   avg. packets per output batch: 0.00
> >   idle cycles: 0 (0.00%)
> >   processing cycles: 19596627706 (100.00%)
> >   avg cycles per packet: 252.96 (19596627706/77470176)
> >   avg processing cycles per packet: 252.96 (19596627706/77470176)
> >
> > ### AVX512 Lookup (restart ovs-vswitchd with additional command
> > "dpif-netdev/subtable-lookup-set avx512_gather 5"
> > pmd thread numa_id 0 core_id 0:
> >   packets received: 1178784
> >   packet recirculations: 0
> >   avg. datapath passes per packet: 1.00
> >   emc hits: 0
> >   smc hits: 0
> >   megaflow hits: 0
> >   avg. subtable lookups per megaflow hit: 0.00
> >   miss with success upcall: 13
> >   miss with failed upcall: 1178739     ---> this looks not right
> >   avg. packets per output batch: 0.00
> >   idle cycles: 0 (0.00%)
> >   processing cycles: 5408870500 (100.00%)
> >   avg cycles per packet: 4588.52 (5408870500/1178784)
> >   avg processing cycles per packet: 4588.52 (5408870500/1178784)
>
> The statistics seem accurate (but indeed the upcall count is unexpected and too high).
> This aligns with a ukey-install fail as noted in the logs above.
>
> This seems to indicate that with the AVX512 lookup the ukey install fails.
> I'd like to reproduce to investigate - above questions about traffic/rules
> is hopefully enough to identify.

Why is the ukey related here? Does your avx512 patch make any change to the ukey?

>
> There is an alternative - set the "autovalidator" DPCLS implementation to
> the highest priority, and it should ovs_assert() if the scalar/AVX512 implementations
> mismatch. Then a dump of the OVS miniflow will give what's needed to verify root cause.
>
that's a cool feature.
When setting
ovs-appctl dpif-netdev/subtable-lookup-set autovalidator 100
log shows
2020-05-21T22:28:55.964Z|77007|dpif_lookup_autovalidator(pmd-c00/id:9)|ERR|matches_good
7 != matches_test 0 for func avx512_gather
2020-05-21T22:28:55.964Z|77008|dpif_lookup_autovalidator(pmd-c00/id:9)|ERR|matches_good
7 != matches_test 0 for func avx512_gather
2020-05-21T22:28:55.965Z|77009|dpif_lookup_autovalidator(pmd-c00/id:9)|ERR|matches_good
3 != matches_test 0 for func avx512_gather
2020-05-21T22:28:55.965Z|77010|dpif_lookup_autovalidator(pmd-c00/id:9)|ERR|matches_good
15 != matches_test 0 for func avx512_gather

Thanks
William
William Tu May 21, 2020, 10:36 p.m. UTC | #14
On Thu, May 21, 2020 at 6:04 AM Van Haaren, Harry
<harry.van.haaren@intel.com> wrote:
>
> > -----Original Message-----
> > From: William Tu <u9012063@gmail.com>
> > Sent: Wednesday, May 20, 2020 4:15 PM
> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> > implementation
>
> <snip for required context only>
>
> > > 2020-05-20T14:15:20.184Z|00378|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> > > sub func, 4 1
> > > 2020-05-20T14:15:21.219Z|00379|dpif_netdev(pmd-c00/id:9)|INFO|reprobing
> > > sub func, 4 1
> > >
> > btw, looking at
> > ovs-appctl coverage/show, this counter is very high when enabling the avx512
> >   handler_duplicate_upcall 459645.4/sec 434475.500/sec
> > 17300.5372/sec   total: 64120526
>
> This counter seems to post some garbage to me if I run it before any traffic?
> Tested using OVS Master @ 48b1c7642 (not including any AVX512 patches):
>
> ./utilities/ovs-appctl coverage/show | grep duplicate_upcall
> 21:handler_duplicate_upcall   0.0/sec     0.000/sec        0.0000/sec   total: 10272710751479363764
>
> # re-runs show different numbers - indicates a garbage-initialized counter perhaps?
> 21:handler_duplicate_upcall   0.0/sec     0.000/sec        0.0000/sec   total: 1049338714623956653
> 21:handler_duplicate_upcall   0.0/sec     0.000/sec        0.0000/sec   total: 18343161283719775679
>

using the same pcap traffic (p0.pcap) on current master, I did not see
the above issue:
datapath_drop_upcall_error  57.4/sec     4.783/sec        0.0797/sec
total: 287
drop_action_of_pipeline  5909696.2/sec 492474.683/sec
8207.9114/sec   total: 52399553

William
Van Haaren, Harry May 26, 2020, 2:52 p.m. UTC | #15
> -----Original Message-----
> From: William Tu <u9012063@gmail.com>
> Sent: Thursday, May 21, 2020 11:30 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> implementation
> 
> > > ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev
> > > ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3
> > > ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true
> > > ovs-ofctl add-flow br0 'actions=drop'
> > > ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
> > > ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk \
> > >   options:dpdk-
> > > devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1
> >
> > I use Ether/VLAN/IPv4 to achieve a subtable with (4,1), is that the same as
> you?
> > Just trying to remove variables between our setups.
> >
> btw I have only one OpenFlow rule, 'actions=drop'
> The pcap file as input is a random one I just capture in my machine's interface
> root@instance-3:~/ovs# tcpdump -n -r p0.pcap  | wc -l
> reading from file p0.pcap, link-type EN10MB (Ethernet)
> 22
> root@instance-3:~/ovs# tcpdump -n -r p0.pcap

Hi William,

Thanks for the info - I was away on a longer weekend, back now!

<snip lots of pcap & details>

> > > 2020-05-20T13:49:26.648Z|00057|netdev_dpdk|INFO|Port 0:
> 02:70:63:61:70:00
> > > 2020-05-20T13:49:26.648Z|00058|dpif_netdev|INFO|Core 0 on numa node 0
> > > assigned port 'tg0' rx queue 0 (measured processing cycles 0).
> > > 2020-05-20T13:49:26.648Z|00059|bridge|INFO|bridge br0: added interface
> > > tg0 on port 1
> > > 2020-05-20T13:49:26.648Z|00001|ofproto_dpif_upcall(pmd-
> > > c00/id:9)|WARN|upcall_cb
> > > failure: ukey installation fails
> > > 2020-05-20T13:49:27.562Z|00002|dpif_netdev(pmd-
> c00/id:9)|INFO|reprobing
> > > sub func, 4 1
> >
> > Aha! This shows somethings going wrong - there should not be any ukey-
> install fails!
> >
> > This also explains why your logs (as per follow-up email in thread) have a high
> upcall count/sec,
> > the installed flow isn't being hit when matched. I'm not sure what the root
> cause of these
> > ukey-installation fails are - but this is what we need to investigate :)
> >
> > Understanding the traffic, and attempting to reproduce here would a good
> step forward.
> >
> > Would you describe the traffic contained in the pcap?
> > Is it a single packet, or something that should hit a single DPCLS wildcarded
> flow?
> >
> describe in comment above.

Thanks - the details of the pcap should be enough for me to debug from here.

<snip>

> > > > > 5) pmd-stat-show
> > > > > root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
> > > > > pmd thread numa_id 0 core_id 0:
> > > > >   packets received: 19838528
> > > > >   packet recirculations: 0
> > > > >   avg. datapath passes per packet: 1.00
> > > > >   emc hits: 0
> > > > >   smc hits: 0
> > > > >   megaflow hits: 0
> > > > >   avg. subtable lookups per megaflow hit: 0.00  (---> this doesn't
> > > > > look right ....)
> > > > >   miss with success upcall: 78
> > > > >   miss with failed upcall: 19838418
> > > > >   avg. packets per output batch: 2.00
> > > > >   idle cycles: 0 (0.00%)
> > > > >   processing cycles: 103431787838 (100.00%)
> > > > >   avg cycles per packet: 5213.68 (103431787838/19838528)
> > > > >   avg processing cycles per packet: 5213.68 (103431787838/19838528)
> > > >
> > > > Would you try the pmd-stats-show command before setting the AVX512
> > > lookup?
> Yes.
> before setting avx512:
> root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
> pmd thread numa_id 0 core_id 0:
>   packets received: 70630720
>   packet recirculations: 0
>   avg. datapath passes per packet: 1.00
>   emc hits: 64206054
>   smc hits: 0
>   megaflow hits: 6424309
>   avg. subtable lookups per megaflow hit: 1.00
>   miss with success upcall: 1
>   miss with failed upcall: 324
>   avg. packets per output batch: 0.00
>   idle cycles: 1668002 (0.01%)
>   processing cycles: 17710219822 (99.99%)
>   avg cycles per packet: 250.77 (17711887824/70630720)
>   avg processing cycles per packet: 250.74 (17710219822/70630720)

Thanks - this looks good - as expected.


> > > > If the issue is still present it would indicate its not related to the exact
> lookup
> > > > implementation.
> > >
> > > Before setting AVX512
> > > ### Scalar Lookup
> > > pmd thread numa_id 0 core_id 0:
> > >   packets received: 77470176
> > >   packet recirculations: 0
> > >   avg. datapath passes per packet: 1.00
> > >   emc hits: 70423947
> > >   smc hits: 0
> > >   megaflow hits: 7045897
> > >   avg. subtable lookups per megaflow hit: 1.00
> > >   miss with success upcall: 1
> > >   miss with failed upcall: 331
> > >   avg. packets per output batch: 0.00
> > >   idle cycles: 0 (0.00%)
> > >   processing cycles: 19596627706 (100.00%)
> > >   avg cycles per packet: 252.96 (19596627706/77470176)
> > >   avg processing cycles per packet: 252.96 (19596627706/77470176)
> > >
> > > ### AVX512 Lookup (restart ovs-vswitchd with additional command
> > > "dpif-netdev/subtable-lookup-set avx512_gather 5"
> > > pmd thread numa_id 0 core_id 0:
> > >   packets received: 1178784
> > >   packet recirculations: 0
> > >   avg. datapath passes per packet: 1.00
> > >   emc hits: 0
> > >   smc hits: 0
> > >   megaflow hits: 0
> > >   avg. subtable lookups per megaflow hit: 0.00
> > >   miss with success upcall: 13
> > >   miss with failed upcall: 1178739     ---> this looks not right
> > >   avg. packets per output batch: 0.00
> > >   idle cycles: 0 (0.00%)
> > >   processing cycles: 5408870500 (100.00%)
> > >   avg cycles per packet: 4588.52 (5408870500/1178784)
> > >   avg processing cycles per packet: 4588.52 (5408870500/1178784)
> >
> > The statistics seem accurate (but indeed the upcall count is unexpected and
> too high).
> > This aligns with a ukey-install fail as noted in the logs above.
> >
> > This seems to indicate that with the AVX512 lookup the ukey install fails.
> > I'd like to reproduce to investigate - above questions about traffic/rules
> > is hopefully enough to identify.
> 
> Why ukey is related here? Does you avx512 patch make any change to ukey?

No, AVX512 doesn't make any ukey changes - but issues in the hashing of the
miniflow data blocks cause ukeys to be installed in different locations than where
they are looked up - hence "ukey install fail" == "issue in miniflow iteration" in this
context.


> > There is an alternative - set the "autovalidator" DPCLS implementation to
> > the highest priority, and it should ovs_assert() if the scalar/AVX512
> implementations
> > mismatch. Then a dump of the OVS miniflow will give what's needed to verify
> root cause.
> >
> that's a cool feature.
> When setting
> ovs-appctl dpif-netdev/subtable-lookup-set autovalidator 100
> log shows
> 2020-05-21T22:28:55.964Z|77007|dpif_lookup_autovalidator(pmd-
> c00/id:9)|ERR|matches_good
> 7 != matches_test 0 for func avx512_gather

Brilliant - this is exactly why the autovalidator is there. It has correctly flagged
an issue here - I've reproduced it using a pcap and your commands above. I will
investigate a fix and include it in the v3.

Thanks for the details - will keep you all posted on progress. -Harry
Van Haaren, Harry May 27, 2020, 12:21 p.m. UTC | #16
> -----Original Message-----
> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Van Haaren, Harry
> Sent: Tuesday, May 26, 2020 3:52 PM
> To: William Tu <u9012063@gmail.com>
> Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> implementation

<snip>

> > Why ukey is related here? Does you avx512 patch make any change to ukey?
> 
> No AVX512 doesn't make any ukey changes - but issues in the hashing of the
> miniflow data blocks cause ukeys to be installed in different locations than
> where they are looked up - hence "ukey install fail" == "issue in miniflow iteration" in
> this context.

The ukey install fails are due to a mismatch in compile flags (with/without SSE 4.2),
combined with the fact that the hashing in OVS changes its implementation depending
on the availability of the SSE 4.2  ISA (and other defines for other architectures).

The mismatch comes from upcall code being compiled without SSE4.2 (so using mhash hash code)
while the AVX512 lookup hash routines have SSE4.2 enabled (so using CRC32 hash code).
As a result, hashing identical data in different .c files produces different hash values.
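
To illustrate the failure mode (a simplified sketch of the pattern, not the exact lib/hash.h code): the
hash routine is selected per translation unit at compile time, so two .c files built with different -m
flags silently disagree on the hash of the same bytes:

    /* Sketch only; assumes <stdint.h>. */
    #if __SSE4_2__
    #include <nmmintrin.h>
    static inline uint32_t
    hash_block_sketch(uint32_t basis, uint64_t block)
    {
        /* CRC32c-based hash, taken when this file is built with -msse4.2. */
        return (uint32_t) _mm_crc32_u64(basis, block);
    }
    #else
    static inline uint32_t
    hash_block_sketch(uint32_t basis, uint64_t block)
    {
        /* Portable fallback - different math, so it returns a different value
         * than the CRC32 path above for the same input. */
        uint32_t h = basis ^ (uint32_t) (block >> 32) ^ (uint32_t) block;
        h *= 0x85ebca6b;
        return (h << 13) | (h >> 19);
    }
    #endif

The upcall/ukey code built without -msse4.2 takes the fallback path, the AVX512 lookup object built
with -msse4.2 takes the CRC32 path, so flow install and flow lookup end up hashing the same miniflow
to two different values.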

From OVS docs (http://docs.openvswitch.org/en/latest/intro/install/general/) the following
enables native ISA for your build, or else just enable SSE4.2 and popcount:
./configure CFLAGS="-g -O2 -march=native"
./configure CFLAGS="-g -O2 -march=nehalem"

To continue your testing William, I suggest using the above workaround - compile OVS and explicitly
enable SSE4.2, aligning all hashing code in OVS to use the more performant CRC32 hashing.

I will work on a proper solution to avoid this issue in the v3 patchset.

Thanks for reporting, -Harry

> > > There is an alternative - set the "autovalidator" DPCLS implementation to
> > > the highest priority, and it should ovs_assert() if the scalar/AVX512
> > implementations
> > > mismatch. Then a dump of the OVS miniflow will give what's needed to verify
> > root cause.
> > >
> > that's a cool feature.
> > When setting
> > ovs-appctl dpif-netdev/subtable-lookup-set autovalidator 100
> > log shows
> > 2020-05-21T22:28:55.964Z|77007|dpif_lookup_autovalidator(pmd-
> > c00/id:9)|ERR|matches_good
> > 7 != matches_test 0 for func avx512_gather
> 
> Brilliant - this is exactly why the autovalidator is there. It has correctly flagged
> an issue here - I've reproduced using a pcap and your commands above. I will
> investigate a fix and include in the v3.
> 
> Thanks for the details - will keep you all posted on progress. -Harry
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
William Tu May 29, 2020, 1:19 a.m. UTC | #17
On Wed, May 27, 2020 at 12:21:43PM +0000, Van Haaren, Harry wrote:
> > -----Original Message-----
> > From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Van Haaren, Harry
> > Sent: Tuesday, May 26, 2020 3:52 PM
> > To: William Tu <u9012063@gmail.com>
> > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> > implementation
> 
> <snip>
> 
> > > Why ukey is related here? Does you avx512 patch make any change to ukey?
> > 
> > No AVX512 doesn't make any ukey changes - but issues in the hashing of the
> > miniflow data blocks cause ukeys to be installed in different locations than
> > where they are looked up - hence "ukey install fail" == "issue in miniflow iteration" in
> > this context.
> 
> The ukey install fails are due to a mismatch in compile flags (with/without SSE 4.2),
> combined with the fact that the hashing in OVS changes its implementation depending
> on the availability of the SSE 4.2  ISA (and other defines for other architectures).
> 
> The mismatch comes from upcall code being compiled without SSE4.2 (so using mhash hash code)
> while the AVX512 lookup hash routines have SSE4.2 enabled (so using CRC32 hash code).
> As a result, hashing identical data in different .c files produces a different hash values.
> 
> From OVS docs (http://docs.openvswitch.org/en/latest/intro/install/general/) the following
> enables native ISA for your build, or else just enable SSE4.2 and popcount:
> ./configure CFLAGS="-g -O2 -march=native"
> ./configure CFLAGS="-g -O2 -march=nehalem"

Hi Harry,
Thanks for the info!
I can make it work now, with 
./configure CFLAGS="-g -O2 -msse4.2 -march=native"

using similar setup
ovs-ofctl add-flow br0 'actions=drop'
ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk \
  options:dpdk-devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1

The performance seems a little worse (9.7Mpps -> 8.7Mpps).
I wonder whether it's due to running it in a VM (however I don't
have a physical machine).

=== Enable AVX512 ===
Drop rate: 8.7Mpps
2020-05-29T01:03:15.740Z|00049|dpif_netdev_lookup|INFO|Subtable function 'avx512_gather' set priority to 5
  21.93%  pmd-c00/id:10  ovs-vswitchd        [.] dp_netdev_input__
  19.38%  pmd-c00/id:10  ovs-vswitchd        [.] miniflow_extract
  19.08%  pmd-c00/id:10  ovs-vswitchd        [.] eth_pcap_rx_infinite
  10.24%  pmd-c00/id:10  ovs-vswitchd        [.] miniflow_hash_5tuple
   9.63%  pmd-c00/id:10  libc-2.27.so        [.] __memcmp_avx2_movbe
   8.46%  pmd-c00/id:10  ovs-vswitchd        [.] free_dpdk_buf
   1.83%  pmd-c00/id:10  ovs-vswitchd        [.] dpcls_avx512_gather_skx_mf_4_1
   1.65%  pmd-c00/id:10  ovs-vswitchd        [.] odp_execute_actions
   1.17%  pmd-c00/id:10  ovs-vswitchd        [.] fast_path_processing
   1.12%  pmd-c00/id:10  ovs-vswitchd        [.] netdev_dpdk_rxq_recv
   0.99%  pmd-c00/id:10  ovs-vswitchd        [.] pmd_perf_end_iteration
   0.87%  pmd-c00/id:10  ovs-vswitchd        [.] dp_netdev_process_rxq_port
   0.51%  pmd-c00/id:10  ovs-vswitchd        [.] cmap_find_batch

root@instance-3:~/ovs# ovs-appctl dpif-netdev/pmd-stats-show
pmd thread numa_id 0 core_id 0:
  packets received: 167704800
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 152452853
  smc hits: 0
  megaflow hits: 15251600
  avg. subtable lookups per megaflow hit: 1.00
  miss with success upcall: 1
  miss with failed upcall: 346 
  avg. packets per output batch: 0.00
  idle cycles: 0 (0.00%)
  processing cycles: 38399744430 (100.00%)
  avg cycles per packet: 228.97 (38399744430/167704800)
  avg processing cycles per packet: 228.97 (38399744430/167704800)

=== Generic lookup ===
Drop rate: 9.7Mpps
2020-05-29T01:07:05.781Z|00049|dpif_netdev_lookup|INFO|Subtable function 'generic' set priority to 5

pmd thread numa_id 0 core_id 1:
  packets received: 332413344
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 302178098
  smc hits: 0
  megaflow hits: 30234893
  avg. subtable lookups per megaflow hit: 1.00
  miss with success upcall: 1
  miss with failed upcall: 320 
  avg. packets per output batch: 0.00
  idle cycles: 0 (0.00%)
  processing cycles: 68605925782 (100.00%)
  avg cycles per packet: 206.39 (68605925782/332413344)
  avg processing cycles per packet: 206.39 (68605925782/332413344)

  22.04%  pmd-c01/id:10  ovs-vswitchd        [.] dp_netdev_input__
  19.87%  pmd-c01/id:10  ovs-vswitchd        [.] miniflow_extract
  18.24%  pmd-c01/id:10  ovs-vswitchd        [.] eth_pcap_rx_infinite
   9.84%  pmd-c01/id:10  libc-2.27.so        [.] __memcmp_avx2_movbe
   9.76%  pmd-c01/id:10  ovs-vswitchd        [.] miniflow_hash_5tuple
   8.16%  pmd-c01/id:10  ovs-vswitchd        [.] free_dpdk_buf
   2.27%  pmd-c01/id:10  ovs-vswitchd        [.] dpcls_subtable_lookup_mf_u0w4_u1w1
   1.71%  pmd-c01/id:10  ovs-vswitchd        [.] odp_execute_actions
   1.39%  pmd-c01/id:10  ovs-vswitchd        [.] fast_path_processing
   1.10%  pmd-c01/id:10  ovs-vswitchd        [.] netdev_dpdk_rxq_recv
   0.99%  pmd-c01/id:10  ovs-vswitchd        [.] dp_netdev_process_rxq_port
   0.87%  pmd-c01/id:10  ovs-vswitchd        [.] pmd_perf_end_iteration
   0.55%  pmd-c01/id:10  ovs-vswitchd        [.] cmap_find_batch

Is there anything I should double check?
Regards,
William

> 
> To continue your testing William, I suggest using the above workaround - compile OVS and explicitly
> enable SSE4.2, aligning all hashing code in OVS to use the more performant CRC32 hashing.
> 
> I will work on a proper solution to avoid this issue in the v3 patchset.
> 
> Thanks for reporting, -Harry
> 
> > > > There is an alternative - set the "autovalidator" DPCLS implementation to
> > > > the highest priority, and it should ovs_assert() if the scalar/AVX512
> > > implementations
> > > > mismatch. Then a dump of the OVS miniflow will give what's needed to verify
> > > root cause.
> > > >
> > > that's a cool feature.
> > > When setting
> > > ovs-appctl dpif-netdev/subtable-lookup-set autovalidator 100
> > > log shows
> > > 2020-05-21T22:28:55.964Z|77007|dpif_lookup_autovalidator(pmd-
> > > c00/id:9)|ERR|matches_good
> > > 7 != matches_test 0 for func avx512_gather
> > 
> > Brilliant - this is exactly why the autovalidator is there. It has correctly flagged
> > an issue here - I've reproduced using a pcap and your commands above. I will
> > investigate a fix and include in the v3.
> > 
> > Thanks for the details - will keep you all posted on progress. -Harry
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Van Haaren, Harry May 29, 2020, 11:47 a.m. UTC | #18
> -----Original Message-----
> From: William Tu <u9012063@gmail.com>
> Sent: Friday, May 29, 2020 2:19 AM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> implementation
> 
> On Wed, May 27, 2020 at 12:21:43PM +0000, Van Haaren, Harry wrote:
<snip hashing details>
> > As a result, hashing identical data in different .c files produces a different hash
> values.
> >
> > From OVS docs (http://docs.openvswitch.org/en/latest/intro/install/general/)
> the following
> > enables native ISA for your build, or else just enable SSE4.2 and popcount:
> > ./configure CFLAGS="-g -O2 -march=native"
> > ./configure CFLAGS="-g -O2 -march=nehalem"
> 
> Hi Harry,
> Thanks for the info!
> I can make it work now, with
> ./configure CFLAGS="-g -O2 -msse4.2 -march=native"

OK - that's good - the root cause of the bug/hash-mismatch is confirmed!


> using similar setup
> ovs-ofctl add-flow br0 'actions=drop'
> ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
> ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk \
>   options:dpdk-
> devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1
> 
> The performance seems a little worse (9.7Mpps -> 8.7Mpps).
> I wonder whether it's due to running it in VM (however I don't
> have physical machine).

Performance degradations are not expected - let me try to understand
the performance data posted below, and work through it.

Agree that isolating the hardware and being able to verify
the environment would help in removing potential noise... but
let us work with the setup you have. Do you know what CPU
it is you're running on?

It seems you have EMC enabled (as per OVS defaults). The stats posted show
an approx 10:1 ratio on hits in EMC and DPCLS. This likely adds noise to the
measurements - as only 10% of the packets hit the changes in DPCLS.

Also in the perf top profile dp_netdev_input__ takes more cycles than
miniflow_extract, and the memcmp() is present, indicating EMC is consuming
CPU cycles to perform its duties.

I guess our simple test case is failing to show what we're trying to measure -
as you know, the EMC likes low flow counts, which explains why DPCLS is
only ~2% of CPU time.

<snip>
Details of the CPU profiles & PMD stats for AVX512 and Generic DPCLS have been
removed to trim the conversation. Very helpful to see into your system, and I'm
a big fan of perf top and friends - so this was useful to see, thanks!
(Future readers, check the mailing list "thread" view for previous post's details).


> Is there any thing I should double check?

Would you mind re-testing with EMC disabled? Likely DPCLS will show up as a
much larger % in the CPU profile, and this might provide some new insights.
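
For reference, EMC insertion can be switched off via the insertion-probability setting - per the OVS
docs a value of 0 disables EMC insertion, so lookups fall through to DPCLS:

    ovs-vsctl set Open_vSwitch . other_config:emc-insert-inv-prob=0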

Regards, -Harry

<snip context/backlog of hashing debug and resolution>
William Tu May 29, 2020, 6:49 p.m. UTC | #19
On Fri, May 29, 2020 at 4:47 AM Van Haaren, Harry
<harry.van.haaren@intel.com> wrote:
>
> > -----Original Message-----
> > From: William Tu <u9012063@gmail.com>
> > Sent: Friday, May 29, 2020 2:19 AM
> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> > Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> > implementation
> >
> > On Wed, May 27, 2020 at 12:21:43PM +0000, Van Haaren, Harry wrote:
> <snip hashing details>
> > > As a result, hashing identical data in different .c files produces a different hash
> > values.
> > >
> > > From OVS docs (http://docs.openvswitch.org/en/latest/intro/install/general/)
> > the following
> > > enables native ISA for your build, or else just enable SSE4.2 and popcount:
> > > ./configure CFLAGS="-g -O2 -march=native"
> > > ./configure CFLAGS="-g -O2 -march=nehalem"
> >
> > Hi Harry,
> > Thanks for the info!
> > I can make it work now, with
> > ./configure CFLAGS="-g -O2 -msse4.2 -march=native"
>
> OK - that's good - the root cause of the bug/hash-mismatch is confirmed!
>
>
> > using similar setup
> > ovs-ofctl add-flow br0 'actions=drop'
> > ovs-appctl dpif-netdev/subtable-lookup-set avx512_gather 5
> > ovs-vsctl add-port br0 tg0 -- set int tg0 type=dpdk \
> >   options:dpdk-
> > devargs=vdev:net_pcap0,rx_pcap=/root/ovs/p0.pcap,infinite_rx=1
> >
> > The performance seems a little worse (9.7Mpps -> 8.7Mpps).
> > I wonder whether it's due to running it in VM (however I don't
> > have physical machine).
>
> Performance degradations are not expected, let me try understand
> the below performance data posted, and work through it.
>
> Agree that isolating the hardware and being able to verify
> environment would help in removing potential noise.. but
> let us work with the setup you have. Do you know what CPU
> it is you're running on?

Thanks! I think it's Skylake:
root@instance-3:~/ovs# lscpu
Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              4
On-line CPU(s) list: 0-3
Thread(s) per core:  2
Core(s) per socket:  2
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               85
Model name:          Intel(R) Xeon(R) CPU @ 2.00GHz
Stepping:            3
CPU MHz:             2000.176
BogoMIPS:            4000.35
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            1024K
L3 cache:            39424K
NUMA node0 CPU(s):   0-3
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr
pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx
pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc
cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2
x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm
3dnowprefetch invpcid_single pti ssbd ibrs ibpb stibp fsgsbase
tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f
avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl
xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities

lspci
00:00.0 Host bridge: Intel Corporation 440FX - 82441FX PMC [Natoma] (rev 02)
00:01.0 ISA bridge: Intel Corporation 82371AB/EB/MB PIIX4 ISA (rev 03)
00:01.3 Bridge: Intel Corporation 82371AB/EB/MB PIIX4 ACPI (rev 03)

>
> It seems you have EMC enabled (as per OVS defaults). The stats posted show
> an approx 10:1 ratio on hits in EMC and DPCLS. This likely adds noise to the
> measurements - as only 10% of the packets hit the changes in DPCLS.
>
> Also in the perf top profile dp_netdev_input__ takes more cycles than
> miniflow_extract, and the memcmp() is present, indicating EMC is consuming
> CPU cycles to perform its duties.
>
> I guess our simple test case is failing to show what we're trying to measure,
> as you know a EMC likes low flow counts, all explaining why DPCLS is
> only ~2% of CPU time.
>
> <snip>
> Removed details of CPU profiles & PMD stats for AVX512 and Generic DPCLS
> removed to trim conversation. Very helpful to see into your system, and I'm
> a big fan of perf top and friends - so this was useful to see, thanks!
> (Future readers, check the mailing list "thread" view for previous post's details).
>
>
> > Is there any thing I should double check?
>
> Would you mind re-testing with EMC disabled? Likely DPCLS will show up as a
> much larger % in the CPU profile, and this might provide some new insights.
>
OK, with EMC disabled, the performance gap is a little smaller.
Now we don't see memcmp.

=== generic ===
drop rate: 8.65Mpps
pmd thread numa_id 0 core_id 1:
  packets received: 223168512
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 0
  smc hits: 0
  megaflow hits: 223167820
  avg. subtable lookups per megaflow hit: 1.00
  miss with success upcall: 1
  miss with failed upcall: 659
  avg. packets per output batch: 0.00
  idle cycles: 0 (0.00%)
  processing cycles: 51969566520 (100.00%)
  avg cycles per packet: 232.87 (51969566520/223168512)
  avg processing cycles per packet: 232.87 (51969566520/223168512)

  19.17%  pmd-c01/id:9  ovs-vswitchd        [.]
dpcls_subtable_lookup_mf_u0w4_u1w1
  18.93%  pmd-c01/id:9  ovs-vswitchd        [.] miniflow_extract
  16.15%  pmd-c01/id:9  ovs-vswitchd        [.] eth_pcap_rx_infinite
  11.34%  pmd-c01/id:9  ovs-vswitchd        [.] dp_netdev_input__
  10.51%  pmd-c01/id:9  ovs-vswitchd        [.] miniflow_hash_5tuple
   6.88%  pmd-c01/id:9  ovs-vswitchd        [.] free_dpdk_buf
   5.63%  pmd-c01/id:9  ovs-vswitchd        [.] fast_path_processing
   4.95%  pmd-c01/id:9  ovs-vswitchd        [.] cmap_find_batch

=== AVX512 ===
drop rate: 8.28Mpps
pmd thread numa_id 0 core_id 1:
  packets received: 138495296
  packet recirculations: 0
  avg. datapath passes per packet: 1.00
  emc hits: 0
  smc hits: 0
  megaflow hits: 138494847
  avg. subtable lookups per megaflow hit: 1.00
  miss with success upcall: 1
  miss with failed upcall: 416
  avg. packets per output batch: 0.00
  idle cycles: 0 (0.00%)
  processing cycles: 33452482260 (100.00%)
  avg cycles per packet: 241.54 (33452482260/138495296)
  avg processing cycles per packet: 241.54 (33452482260/138495296)

  19.78%  pmd-c01/id:9  ovs-vswitchd        [.] miniflow_extract
  17.73%  pmd-c01/id:9  ovs-vswitchd        [.] eth_pcap_rx_infinite
  13.53%  pmd-c01/id:9  ovs-vswitchd        [.] dpcls_avx512_gather_skx_mf_4_1
  12.00%  pmd-c01/id:9  ovs-vswitchd        [.] dp_netdev_input__
  10.94%  pmd-c01/id:9  ovs-vswitchd        [.] miniflow_hash_5tuple
   7.80%  pmd-c01/id:9  ovs-vswitchd        [.] free_dpdk_buf
   5.97%  pmd-c01/id:9  ovs-vswitchd        [.] fast_path_processing
   5.23%  pmd-c01/id:9  ovs-vswitchd        [.] cmap_find_batch

I'm not able to get the current CPU frequency, probably due to running in a VM?
root@instance-3:~/ovs# modprobe acpi-cpufreq
root@instance-3:~/ovs# cpufreq-info
cpufrequtils 008: cpufreq-info (C) Dominik Brodowski 2004-2009
Report errors and bugs to cpufreq@vger.kernel.org, please.
analyzing CPU 0:
  no or unknown cpufreq driver is active on this CPU
  maximum transition latency: 4294.55 ms.

Regards,
William
Van Haaren, Harry June 3, 2020, 5:36 p.m. UTC | #20
> -----Original Message-----
> From: William Tu <u9012063@gmail.com>
> Sent: Friday, May 29, 2020 7:49 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: ovs-dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [PATCH v2 5/5] dpif-lookup: add avx512 gather
> implementation
> 
> On Fri, May 29, 2020 at 4:47 AM Van Haaren, Harry
> <harry.van.haaren@intel.com> wrote:
<snip old discussion>
> > Agree that isolating the hardware and being able to verify
> > environment would help in removing potential noise.. but
> > let us work with the setup you have. Do you know what CPU
> > it is you're running on?
> 
> Thanks! I think it's skylake
> root@instance-3:~/ovs# lscpu
> Architecture:        x86_64
<snip>

Yep looks like Skylake, and has AVX512, so requirements met.

<snip>
> > Would you mind re-testing with EMC disabled? Likely DPCLS will show up as a
> > much larger % in the CPU profile, and this might provide some new insights.
> >
> OK, with EMC disabled, the performance gap is a little better.
> Now we don't see memcmp.
> 
> === generic ===
> drop rate: 8.65Mpps
> pmd thread numa_id 0 core_id 1:
>   packets received: 223168512
>   packet recirculations: 0
>   avg. datapath passes per packet: 1.00
>   emc hits: 0
>   smc hits: 0
>   megaflow hits: 223167820
>   avg. subtable lookups per megaflow hit: 1.00
>   miss with success upcall: 1
>   miss with failed upcall: 659
>   avg. packets per output batch: 0.00
>   idle cycles: 0 (0.00%)
>   processing cycles: 51969566520 (100.00%)
>   avg cycles per packet: 232.87 (51969566520/223168512)
>   avg processing cycles per packet: 232.87 (51969566520/223168512)
> 
>   19.17%  pmd-c01/id:9  ovs-vswitchd        [.] dpcls_subtable_lookup_mf_u0w4_u1w1
>   18.93%  pmd-c01/id:9  ovs-vswitchd        [.] miniflow_extract
>   16.15%  pmd-c01/id:9  ovs-vswitchd        [.] eth_pcap_rx_infinite
>   11.34%  pmd-c01/id:9  ovs-vswitchd        [.] dp_netdev_input__
>   10.51%  pmd-c01/id:9  ovs-vswitchd        [.] miniflow_hash_5tuple
>    6.88%  pmd-c01/id:9  ovs-vswitchd        [.] free_dpdk_buf
>    5.63%  pmd-c01/id:9  ovs-vswitchd        [.] fast_path_processing
>    4.95%  pmd-c01/id:9  ovs-vswitchd        [.] cmap_find_batch
> 
> === AVX512 ===
> drop rate: 8.28Mpps
> pmd thread numa_id 0 core_id 1:
>   packets received: 138495296
>   packet recirculations: 0
>   avg. datapath passes per packet: 1.00
>   emc hits: 0
>   smc hits: 0
>   megaflow hits: 138494847
>   avg. subtable lookups per megaflow hit: 1.00
>   miss with success upcall: 1
>   miss with failed upcall: 416
>   avg. packets per output batch: 0.00
>   idle cycles: 0 (0.00%)
>   processing cycles: 33452482260 (100.00%)
>   avg cycles per packet: 241.54 (33452482260/138495296)
>   avg processing cycles per packet: 241.54 (33452482260/138495296)
> 
>   19.78%  pmd-c01/id:9  ovs-vswitchd        [.] miniflow_extract
>   17.73%  pmd-c01/id:9  ovs-vswitchd        [.] eth_pcap_rx_infinite
>   13.53%  pmd-c01/id:9  ovs-vswitchd        [.] dpcls_avx512_gather_skx_mf_4_1
>   12.00%  pmd-c01/id:9  ovs-vswitchd        [.] dp_netdev_input__
>   10.94%  pmd-c01/id:9  ovs-vswitchd        [.] miniflow_hash_5tuple
>    7.80%  pmd-c01/id:9  ovs-vswitchd        [.] free_dpdk_buf
>    5.97%  pmd-c01/id:9  ovs-vswitchd        [.] fast_path_processing
>    5.23%  pmd-c01/id:9  ovs-vswitchd        [.] cmap_find_batch

Discussing the details posted above, we do see a cycle reduction in DPCLS:
Scalar (232.87 cyc/pkt, ~19% in dpcls) ~= 45 cyc/pkt spent in DPCLS
AVX512 (241.54 cyc/pkt, ~13.5% in dpcls) ~= 33 cyc/pkt spent in DPCLS

Re-stating the obvious strangeness above, the overall performance decreases.
This seems to show that somehow, despite DPCLS running faster, the overall
rate of work is reduced. This has not been my experience: testing the AVX512
DPCLS code in a baremetal (not VM) environment with HW NICs has
shown good performance uplift here.


> I'm not able to get current cpu frequency, probably due to running in VM?
> root@instance-3:~/ovs# modprobe acpi-cpufreq
> root@instance-3:~/ovs# cpufreq-info
> cpufrequtils 008: cpufreq-info (C) Dominik Brodowski 2004-2009
> Report errors and bugs to cpufreq@vger.kernel.org, please.
> analyzing CPU 0:
>   no or unknown cpufreq driver is active on this CPU
>   maximum transition latency: 4294.55 ms.

Yes, a likely cause for not being able to read the frequency etc. is running in a VM.
Logical next steps would be to remove noise or environmental issues to identify
exactly what the root cause of the slowdown is - unfortunately it seems that
might not be possible due to the environment.
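
(On baremetal, something like "grep MHz /proc/cpuinfo" or turbostat would show the per-core clocks,
but inside a VM the hypervisor generally doesn't expose the real frequency, so the numbers - if
reported at all - aren't reliable.)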

I'm preparing a v3 of the patchset, including a number of usability and general
improvements - fixing issues present in the v2 like the "subtable reprobe" at
one-second intervals, as well as adding a command to print the available
lookup functions and their current priorities. Hoping to get the v3 up early
next week.

Regards, -Harry

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index 19e454c4b..d8a05b384 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -8,13 +8,16 @@ 
 # libopenvswitch.la is the library to link against for binaries like vswitchd.
 # The code itself is built as two seperate static libraries;
 # - core: Core files, always compiled with distro provided CFLAGS
+# - lookupavx512: ISA optimized routines that require CPUID checks at runtime
 lib_LTLIBRARIES += lib/libopenvswitch.la
 lib_LTLIBRARIES += lib/libopenvswitchcore.la
+lib_LTLIBRARIES += lib/libopenvswitchlookupavx512.la
 
 # Dummy library to link against doesn't have any sources, but does
 # depend on libopenvswitchcore static library
 lib_libopenvswitch_la_SOURCES =
 lib_libopenvswitch_la_LIBADD = lib/libopenvswitchcore.la
+lib_libopenvswitch_la_LIBADD += lib/libopenvswitchlookupavx512.la
 
 # Dummy library continues to depend on external libraries as before
 lib_libopenvswitch_la_LIBADD += $(SSL_LIBS)
@@ -31,6 +34,19 @@  lib_libopenvswitch_la_LDFLAGS = \
         $(lib_libopenvswitchcore_la_LIBS) \
         $(AM_LDFLAGS)
 
+
+# Build lookupavx512 library with extra CFLAGS enabled. This allows the
+# compiler to use the ISA features required for the ISA optimized code-paths.
+lib_libopenvswitchlookupavx512_la_CFLAGS = \
+	-mavx512f \
+	-mavx512bw \
+	-mavx512dq \
+	-mbmi2 \
+	$(AM_CFLAGS)
+lib_libopenvswitchlookupavx512_la_SOURCES = \
+	lib/dpif-netdev-lookup-avx512-gather.c
+
+
 # Build core vswitch libraries as before
 lib_libopenvswitchcore_la_SOURCES = \
 	lib/aes128.c \
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
new file mode 100644
index 000000000..52348041b
--- /dev/null
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -0,0 +1,255 @@ 
+/*
+ * Copyright (c) 2020, Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+
+#include <config.h>
+
+#include "dpif-netdev.h"
+#include "dpif-netdev-lookup.h"
+#include "dpif-netdev-private.h"
+#include "cmap.h"
+#include "flow.h"
+#include "pvector.h"
+#include "openvswitch/vlog.h"
+
+#include <immintrin.h>
+
+VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
+
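+/* Emulated per-u64-lane popcount: the AVX512-VPOPCNTDQ instruction that
+ * would do this directly is not assumed to be available. Each byte is split
+ * into nibbles, a shuffle-based lookup table provides per-nibble popcounts,
+ * and a SAD against zero sums the byte counts within each 64-bit lane. */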
+static inline __m512i
+_mm512_popcnt_epi64_manual(__m512i v_in)
+{
+    static const uint8_t pop_lut[64] = {
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    };
+    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
+
+    __m512i v_in_srl4 = _mm512_srli_epi64(v_in, 4);
+    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
+    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
+    __m512i v_in_hi = _mm512_and_si512(v_in_srl4, v_nibble_mask);
+
+    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
+    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
+    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
+
+    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
+}
+
+static inline uint64_t
+netdev_rule_matches_key(const struct dpcls_rule *rule,
+                        const uint32_t mf_bits_total,
+                        const uint64_t *block_cache)
+{
+    ovs_assert(mf_bits_total <= 8);
+    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
+    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
+    const uint32_t lane_mask = (1 << mf_bits_total) - 1;
+
+    /* Always load a full cache line from block_cache. Other loads must be
+     * trimmed to the amount of data required for mf_bits_total blocks.
+     */
+    __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
+    __m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
+    __m512i v_key    = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
+
+    __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+    uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+
+    /* Return 1 if the masked SIMD compare matched on all required blocks. */
+    return res_mask == lane_mask;
+}
+
+static inline uint32_t ALWAYS_INLINE
+avx512_lookup_impl(struct dpcls_subtable *subtable,
+                   uint32_t keys_map,
+                   const struct netdev_flow_key *keys[],
+                   struct dpcls_rule **rules,
+                   const uint32_t bit_count_u0,
+                   const uint32_t bit_count_u1)
+{
+    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
+    int i;
+    uint32_t hashes[NETDEV_MAX_BURST];
+    const uint32_t n_pkts = __builtin_popcountll(keys_map);
+    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
+
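+    /* Scratch area for the gathered miniflow blocks: 8 u64 blocks per
+     * packet, i.e. one 64-byte cache line per packet in the burst. */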
+    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) uint64_t block_cache[NETDEV_MAX_BURST * 8];
+
+    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
+    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
+    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
+    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
+
+    /* Load subtable blocks for masking later */
+    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
+    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
+
+    /* Load pre-created subtable masks for each block in subtable */
+    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
+    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
+                                                        subtable->mf_masks);
+
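+    /* Per-packet gather index derivation (a sketch of the intent, assuming
+     * subtable->mf_masks holds, for each subtable block, a mask of all map
+     * bits below that block's bit position): AND-ing a packet's map bits
+     * with each mask and popcounting the result yields that block's index
+     * in the packet's miniflow value array, with u1 indexes further offset
+     * by the packet's u0 popcount. */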
+    ULLONG_FOR_EACH_1 (i, keys_map) {
+        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
+        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
+
+        /* Pre-create register with *PER PACKET* u0 offset */
+        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
+        const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
+                                                                pkt_mf_u0_pop);
+
+        /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
+        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
+        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
+                                         keys[i]->mf.map.bits[1]);
+
+        /* Bitmask by pre-created masks */
+        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
+
+        /* Manual AVX512 popcount for u64 lanes */
+        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+
+        /* Offset popcounts for u1 with pre-created offset register */
+        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
+
+        /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
+        const __m512i v_zeros = _mm512_setzero_si512();
+        const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
+        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
+                                 bit_count_total_mask, v_indexes, pkt_data, 8);
+
+        /* Zero out bits that pkt doesn't have:
+         * - 2x pext() to extract bits from packet miniflow as needed by TBL
+         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
+         */
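+        /* Illustrative example (assumed values): with tbl_u0 = 0b0110 and
+         * packet u0 bits = 0b0100, _pext_u64() returns 0b10: the higher of
+         * the two subtable blocks is present in the packet, the lower one
+         * gets zeroed. */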
+        uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
+        uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
+        uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
+
+        /* Mask blocks using AND with the subtable blocks, using the k-mask
+         * to zero out lanes for blocks that this packet does not have.
+         */
+        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
+                                                v_all_blocks, v_tbl_blocks);
+
+        /* Store to block_cache: a full cache line, aligned. */
+        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
+    }
+
+    /* Hash the now linearized blocks of packet metadata. */
+    ULLONG_FOR_EACH_1 (i, keys_map) {
+        uint64_t *block_ptr = &block_cache[i * 8];
+        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
+        hashes[i] = hash_finish(hash, bit_count_total * 8);
+    }
+
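+    /* Note: the hash above must equal the hash used when rules were
+     * inserted into the subtable cmap (assumed to be the same
+     * hash_add_words64 / hash_finish combination used by the scalar
+     * lookup); otherwise the batch lookup below would miss valid rules. */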
+    /* Lookup: this returns a bitmask of packets where the hash table had
+     * an entry for the given hash key. Presence of a hash key does not
+     * guarantee matching the key, as there can be hash collisions.
+     */
+    uint32_t found_map;
+    const struct cmap_node *nodes[NETDEV_MAX_BURST];
+    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
+
+    /* Verify that packet actually matched rule. If not found, a hash
+     * collision has taken place, so continue searching with the next node.
+     */
+    ULLONG_FOR_EACH_1 (i, found_map) {
+        struct dpcls_rule *rule;
+
+        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
+            const uint32_t cidx = i * 8;
+            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
+                                                     &block_cache[cidx]);
+            if (OVS_LIKELY(match)) {
+                rules[i] = rule;
+                subtable->hit_cnt++;
+                goto next;
+            }
+        }
+
+        /* None of the found rules was a match.  Clear the i-th bit to
+         * search for this key in the next subtable. */
+        ULLONG_SET0(found_map, i);
+    next:
+        ;                     /* Keep Sparse happy. */
+    }
+
+    return found_map;
+}
+
+/* Expand out specialized functions with U0 and U1 bit attributes. */
+#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                             \
+    static uint32_t                                                           \
+    dpcls_avx512_gather_skx_mf_##U0##_##U1(                                   \
+                                         struct dpcls_subtable *subtable,     \
+                                         uint32_t keys_map,                   \
+                                         const struct netdev_flow_key *keys[],\
+                                         struct dpcls_rule **rules)           \
+    {                                                                         \
+        return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
+    }                                                                         \
+
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
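+
+/* The (5,1), (4,1) and (4,0) shapes above are assumed to cover the most
+ * common subtable miniflow fingerprints; any other shape falls back to the
+ * generic avx512_gather_mf_any() via the probe function below. */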
+
+/* Check if a specialized function is valid for the required subtable. */
+#define CHECK_LOOKUP_FUNCTION(U0, U1)                                         \
+    if (!f && u0_bits == U0 && u1_bits == U1) {                               \
+        f = dpcls_avx512_gather_skx_mf_##U0##_##U1;                           \
+    }
+
+static uint32_t
+avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
+                     const struct netdev_flow_key *keys[],
+                     struct dpcls_rule **rules)
+{
+    return avx512_lookup_impl(subtable, keys_map, keys, rules,
+                              subtable->mf_bits_set_unit0,
+                              subtable->mf_bits_set_unit1);
+}
+
+dpcls_subtable_lookup_func
+dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
+{
+    dpcls_subtable_lookup_func f = NULL;
+
+    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+    if (!avx512f_available || !bmi2_available) {
+        return NULL;
+    }
+
+    CHECK_LOOKUP_FUNCTION(5, 1);
+    CHECK_LOOKUP_FUNCTION(4, 1);
+    CHECK_LOOKUP_FUNCTION(4, 0);
+
+    if (!f && (u0_bits + u1_bits) < 8) {
+        f = avx512_gather_mf_any;
+        VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)",
+                  u0_bits, u1_bits);
+    }
+
+    return f;
+}
+
+#endif /* __x86_64__ */
diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
index 2e9fb0abd..b22a26b8c 100644
--- a/lib/dpif-netdev-lookup.c
+++ b/lib/dpif-netdev-lookup.c
@@ -18,6 +18,13 @@  static struct dpcls_subtable_lookup_info_t subtable_lookups[] = {
     { .prio = 1,
       .probe = dpcls_subtable_generic_probe,
       .name = "generic", },
+
+#ifdef __x86_64__
+    /* Only available on 64-bit x86 builds. */
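+    /* Registered with a lower priority than "generic" (0 vs 1); assuming
+     * the highest-priority probe that succeeds is selected, this
+     * implementation is only used once its priority is raised at runtime. */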
+    { .prio = 0,
+      .probe = dpcls_subtable_avx512_gather_probe,
+      .name = "avx512_gather", },
+#endif
 };
 
 int32_t
diff --git a/lib/dpif-netdev-lookup.h b/lib/dpif-netdev-lookup.h
index 61f44b9e8..07a9bf694 100644
--- a/lib/dpif-netdev-lookup.h
+++ b/lib/dpif-netdev-lookup.h
@@ -21,6 +21,9 @@ 
 #include "dpif-netdev.h"
 #include "dpif-netdev-private.h"
 
+/* Extreme debugging for developers only */
+#define DPIF_NETDEV_LOOKUP_DATAPATH_DEBUG 1
+
 /* Function to perform a probe for the subtable bit fingerprint.
  * Returns NULL if not valid, or a valid function pointer to call for this
  * subtable on success.
@@ -42,6 +45,10 @@  dpcls_subtable_autovalidator_probe(uint32_t u0_bit_count,
 dpcls_subtable_lookup_func
 dpcls_subtable_generic_probe(uint32_t u0_bit_count, uint32_t u1_bit_count);
 
+/* Probe function for AVX-512 gather implementation */
+dpcls_subtable_lookup_func
+dpcls_subtable_avx512_gather_probe(uint32_t u0_bit_cnt, uint32_t u1_bit_cnt);
+
 
 /* Subtable registration and iteration helpers */
 struct dpcls_subtable_lookup_info_t {
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 5d22e3aaa..6fae584a1 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1264,6 +1264,10 @@  static void
 dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc,
                                 const char *argv[], void *aux OVS_UNUSED)
 {
+    /* TODO: If fewer than 2 parameters are provided, return a list of the
+     * dpcls implementations that were compiled in?
+     */
+
     /* This function requires 2 parameters (argv[1] and argv[2]) to execute.
      *   argv[1] is subtable name
      *   argv[2] is priority