@@ -32,6 +32,7 @@ lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la
lib_libopenvswitchavx512_la_CFLAGS = \
-mavx512f \
-mavx512bw \
+ -mavx512vl \
-mavx512dq \
-mbmi \
-mbmi2 \
@@ -39,6 +40,7 @@ lib_libopenvswitchavx512_la_CFLAGS = \
$(AM_CFLAGS)
lib_libopenvswitchavx512_la_SOURCES = \
lib/dpif-netdev-lookup-avx512-gather.c \
+ lib/dpif-netdev-avx512-extract.c \
lib/dpif-netdev-avx512.c
lib_libopenvswitchavx512_la_LDFLAGS = \
-static
@@ -107,6 +109,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dp-packet.h \
lib/dp-packet.c \
lib/dpdk.h \
+ lib/dpif-netdev-avx512-extract.h \
lib/dpif-netdev-lookup.h \
lib/dpif-netdev-lookup.c \
lib/dpif-netdev-lookup-autovalidator.c \
@@ -117,6 +120,8 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-private-dpcls.h \
lib/dpif-netdev-private-dpif.c \
lib/dpif-netdev-private-dpif.h \
+ lib/dpif-netdev-private-extract.c \
+ lib/dpif-netdev-private-extract.h \
lib/dpif-netdev-private-flow.h \
lib/dpif-netdev-private-hwol.h \
lib/dpif-netdev-private-thread.h \
new file mode 100644
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions */
+#if !defined(__CHECKER__)
+
+#include <config.h>
+#include <errno.h>
+
+#include "dpif-netdev-avx512-extract.h"
+#include "dpif-netdev-private-thread.h"
+
+#include "immintrin.h"
+
+/* This file contains optimized implementations of miniflow_extract()
+ * for specific common traffic patterns. The optimizations allow for
+ * quick probing of a specific packet type, and if a match with a specific
+ * type is found, a shuffle-like procedure builds up the required miniflow.
+ */
+
+#define MAX_PATTERN_COUNT (8)
+#define MAX_PATTERN_SIZE (128)
+#define MAX_SHUFFLE_COUNT (MAX_PATTERN_SIZE / 64)
+
+/* A structure to represent each matched on packet pattern. The struct is
+ * aligned to MAX_PATTERN_SIZE, presumably so mask[] and data[] each start at
+ * a vector-friendly boundary -- TODO confirm the alignment requirement.
+ */
+struct __attribute__((aligned(MAX_PATTERN_SIZE))) packet_pattern {
+    /* A bitmask to apply to the packet before comparing it to the pattern.
+     * This results in only bits that matter to packet layout remaining.
+     */
+    uint8_t mask[MAX_PATTERN_SIZE];
+
+    /* The data values to compare the masked packet against. This is the known
+     * fields of the packet which are required for a specific layout. E.g. an
+     * Ether/IPv4 packet has a 0x0800 ethertype, and the 0x0800 is stored here.
+     */
+    uint8_t data[MAX_PATTERN_SIZE];
+
+};
+
+/* Improvement: create this struct in dp-packet.h, and reuse it here. That
+ * would avoid the requirement of the packed attribute.
+ */
+struct __attribute__((packed)) packet_offsets {
+    /* NOTE(review): the field order and widths must mirror the dp_packet
+     * members starting at l2_pad_size, because this struct is memcpy()d
+     * directly over them (see packet_pattern_avx512()).
+     */
+    uint8_t l2_pad_size;
+    uint16_t l2_5_ofs;
+    uint16_t l3_ofs;
+    uint16_t l4_ofs;
+};
+
+/* Structure to represent the data-movement from pattern to miniflow. */
+struct packet_pattern_shuffle {
+    /* Per-64-byte k-masks selecting which shuffled output bytes are kept;
+     * kmasks[0] covers miniflow bytes 0-63, kmasks[1] bytes 64-127. A zero
+     * kmasks[1] means the second half is not computed at all.
+     */
+    uint64_t kmasks[MAX_SHUFFLE_COUNT];
+
+    /* dp_packet layer offsets to apply on a pattern hit. */
+    struct packet_offsets offsets;
+
+    /* The input data to the data-movement shuffle. This shuffle changes the
+     * layout of the packet data into the miniflow blocks shape.
+     */
+    uint8_t shuffle[MAX_PATTERN_SIZE];
+
+    /* Data to be merged into the resulting miniflow blocks. This is required
+     * for e.g. VLAN TCI, which generates a bit in the block even if the packet
+     * didn't originally have it.
+     */
+    uint8_t insert[MAX_PATTERN_SIZE];
+};
+
+/* Structure that represents all per-thread pattern data. */
+struct packet_pattern_cache {
+    /* Minimum packet len for this pattern index to be a valid candidate. */
+    uint8_t min_len[MAX_PATTERN_COUNT];
+
+    /* Number of active patterns to match against. */
+    uint8_t active_pattern_count;
+
+    /* The mask and compare data itself. */
+    struct packet_pattern patterns[MAX_PATTERN_COUNT];
+
+    /* Miniflow bits that need to be set for each pattern. */
+    struct miniflow miniflow_bits[MAX_PATTERN_COUNT];
+
+    /* Structure to represent the data-movement from pattern to miniflow. */
+    struct packet_pattern_shuffle shuffles[MAX_PATTERN_COUNT];
+
+};
+
+/* Single copy of control-path owned patterns. The contents of this struct will
+ * be updated when the user runs a miniflow-pattern-add command. The contents
+ * of this struct are only read in the datapath during the "study" phase, and
+ * copied into a thread-local memory for the PMD threads for datapath usage.
+ */
+static struct packet_pattern_cache patterns_control_path;
+
+/* Generator for EtherType masks and values. Expands to the 14 bytes of an
+ * Ethernet header: DST MAC and SRC MAC are wildcarded (zero mask), only the
+ * two EtherType bytes are supplied by the caller.
+ */
+#define PATTERN_ETHERTYPE_GEN(type_b0, type_b1) \
+  0, 0, 0, 0, 0, 0, /* Ether MAC DST */                                 \
+  0, 0, 0, 0, 0, 0, /* Ether MAC SRC */                                 \
+  type_b0, type_b1, /* EtherType */
+
+#define PATTERN_ETHERTYPE_MASK PATTERN_ETHERTYPE_GEN(0xFF, 0xFF)
+#define PATTERN_ETHERTYPE_IPV4 PATTERN_ETHERTYPE_GEN(0x08, 0x00)
+
+/* Generator for the 4 bytes of an 802.1Q VLAN tag (TPID + TCI). The macro
+ * body must expand its own parameters: the original referenced
+ * 'pcp_dei_vid0'/'vid1', which are not parameters and would not compile
+ * when the macro is used.
+ */
+#define PATTERN_VLAN_GEN(tpid0, tpid1, tci0, tci1) \
+  tpid0, tpid1, tci0, tci1, /* Whole VLAN header */
+
+#define PATTERN_VLAN_MASK PATTERN_VLAN_GEN(0xFF, 0xFF, (~0x4), 0xFF)
+/* Data must hold the real 802.1Q TPID (0x8100), not the all-ones mask value;
+ * with a full TPID mask, 0xFF,0xFF could never match a real VLAN packet.
+ * NOTE(review): confirm which TCI bit (~0x4) is intended to be wildcarded.
+ */
+#define PATTERN_VLAN_DATA PATTERN_VLAN_GEN(0x81, 0x00, 0, 0)
+
+/* Generator for checking IPv4 ver, ihl, and proto. Expands to the 20 bytes
+ * of a no-options IPv4 header; only the fields supplied by the caller are
+ * matched, the rest are wildcarded.
+ */
+#define PATTERN_IPV4_GEN(VER_IHL, FLAG_OFF_B0, FLAG_OFF_B1, PROTO) \
+  VER_IHL, /* Version and IHL */                                        \
+  0, 0, 0, /* DSCP, ECN, Total Length */                                \
+  0, 0, /* Identification */                                            \
+  /* Flags/Fragment offset: don't match MoreFrag (MF) or FragOffset */  \
+  FLAG_OFF_B0, FLAG_OFF_B1,                                             \
+  0, /* TTL */                                                          \
+  PROTO, /* Protocol */                                                 \
+  0, 0, /* Header checksum */                                           \
+  0, 0, 0, 0, /* Src IP */                                              \
+  0, 0, 0, 0, /* Dst IP */
+
+#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF)
+#define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
+
+
+/* Length of Ether(14) + IPv4(20) + UDP(8) headers. */
+#define ETHER_IPV4_UDP_LEN (42)
+
+/* "NU" marks shuffle output bytes that are Not Used; the corresponding
+ * k-mask bit zeroes them rather than sourcing packet data.
+ * NOTE(review): in the IPv4 lane the run "20, 15, 22, 23" mixes index 15
+ * (an Ethernet SRC MAC byte) in with 20..23 -- confirm this is intended
+ * and not a typo for 21.
+ */
+#define NU 0
+#define PATTERN_IPV4_UDP_SHUFFLE \
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, NU, NU, /* Ether */ \
+  26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */ \
+  34, 35, 36, 37, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* UDP */
+
+/* Cached result of the runtime AVX-512 VBMI ISA probe; set by
+ * miniflow_extract_avx512_insert() and read when study() completes.
+ */
+static int avx512vbmi_available;
+
+/* Enable Icelake AVX-512 VBMI ISA for only this function. That allows the
+ * compiler to emit the instruction here, but not use AVX-512 VBMI outside
+ * of this function.
+ */
+static inline __m512i __attribute__((__target__("avx512vbmi")))
+packet_shuffle_avx512_icx(__mmask64 k_mask, __m512i v_pkt_data_0,
+                          __m512i v_shuf_mask, __m512i v_pkt_data_1)
+{
+    /* Native byte-granularity two-source permute: each byte of v_shuf_mask
+     * selects a byte from the 128-byte concatenation of v_pkt_data_0 and
+     * v_pkt_data_1; k_mask zeroes unselected output bytes.
+     */
+    return _mm512_maskz_permutex2var_epi8(k_mask, v_pkt_data_0,
+                                          v_shuf_mask, v_pkt_data_1);
+}
+
+/* This function provides a Skylake and higher fallback for the byte-shuffle
+ * that is required to implement miniflow extract correctly.
+ */
+static inline __m512i
+packet_shuffle_avx512(__mmask64 k_mask, __m512i v_data_0, __m512i v_shuf_idxs,
+                      __m512i v_data_1, uint32_t use_vbmi)
+{
+    /* Emulates the byte-granularity permutex2var of the VBMI path above
+     * using only AVX-512BW 16-bit permutes: shuffle at u16 width twice
+     * (once per byte parity of the index), then rotate and blend the two
+     * results back into byte positions.
+     */
+    if (use_vbmi) {
+        return packet_shuffle_avx512_icx(k_mask, v_data_0,
+                                         v_shuf_idxs, v_data_1);
+    }
+
+    /* Clear away ODD lane bytes, shift down by 1 to get u8 to u16 idxs. */
+    const __mmask64 k_mask_odd_lanes = 0xAAAAAAAAAAAAAAAA;
+    __m512i v_shuf_idx_evn = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                                    v_shuf_idxs, _mm512_setzero_si512());
+    v_shuf_idx_evn = _mm512_srli_epi16(v_shuf_idx_evn, 1);
+
+    /* Clear away EVEN lane bytes by shifting out. Shift EVEN lane indexes down
+     * by one bit too to achieve u8 to u16 conversion.
+     */
+    __m512i v_shuf_idx_odd = _mm512_srli_epi16(v_shuf_idxs, 9);
+
+    /* Shuffle each of odd/even at 16-bit width. */
+    __m512i v_shuf1 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_evn,
+                                                v_data_1);
+    __m512i v_shuf2 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_odd,
+                                                v_data_1);
+
+    /* Find if the shuffle index was odd, via mask and compare. */
+    uint16_t index_odd_mask = 0x1;
+    const __m512i v_index_mask_u16 = _mm512_set1_epi16(index_odd_mask);
+
+    /* EVEN lanes, find if u8 index was odd, result as u16 bitmask. */
+    __m512i v_idx_even_masked = _mm512_and_si512(v_shuf_idxs,
+                                                 v_index_mask_u16);
+    __mmask32 evn_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_even_masked,
+                                                        v_index_mask_u16);
+
+    /* ODD lanes, find if u8 index was odd, result as u16 bitmask. */
+    __m512i v_shuf_idx_srli8 = _mm512_srli_epi16(v_shuf_idxs, 8);
+    __m512i v_idx_odd_masked = _mm512_and_si512(v_shuf_idx_srli8,
+                                                v_index_mask_u16);
+    __mmask32 odd_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_odd_masked,
+                                                        v_index_mask_u16);
+    /* NOTE(review): the odd-lane mask is inverted relative to the even-lane
+     * one; presumably because an even source index already leaves the byte
+     * in the high half for odd destination lanes -- confirm with a test.
+     */
+    odd_rotate_mask = ~odd_rotate_mask;
+
+    /* Rotate based on low-bit-set bitmask, and blend results. */
+    __m512i v_shuf_res_evn = _mm512_mask_srli_epi16(v_shuf1,
+                                        evn_rotate_mask, v_shuf1, 8);
+    __m512i v_shuf_res_odd = _mm512_mask_slli_epi16(v_shuf2,
+                                        odd_rotate_mask, v_shuf2, 8);
+
+    /* Blend results of two halves back together. */
+    __m512i v_shuf_result = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                                        v_shuf_res_evn, v_shuf_res_odd);
+
+    /* k-mask the final result as requested. This is not easy to do before
+     * here, as the instructions operate at u16 size, meaning the k-mask would
+     * be interpreted as the wrong size.
+     */
+    __m512i v_zeros = _mm512_setzero_si512();
+    __m512i v_shuf_res_masked = _mm512_mask_blend_epi8(k_mask, v_zeros,
+                                                       v_shuf_result);
+    return v_shuf_res_masked;
+}
+
+
+/* Matches all patterns provided, building the appropriate miniflow for a hit.
+ *
+ * Note that this function is compile-time specialized into two variants, one
+ * for CPUs that support AVX-512 Vector Bit Manipulation Instructions (VBMI),
+ * and another for those that support AVX-512 but not AVX-512 VBMI.
+ */
+static inline __attribute__((always_inline)) uint32_t
+packet_pattern_avx512(struct dp_packet *dp_pkt, struct miniflow *mf,
+                      struct packet_pattern_cache *cache,
+                      const uint32_t num_patterns,
+                      const uint32_t use_vbmi)
+{
+    uint8_t *pkt = dp_packet_data(dp_pkt);
+    const uint32_t pkt_len = dp_packet_size(dp_pkt);
+    uint32_t in_port = odp_to_u32(dp_pkt->md.in_port.odp_port);
+
+    /* Masked loads to only read the valid packet data. The shift amounts
+     * are guarded with explicit branches: shifting a 64-bit value by >= 64
+     * (or by a negative amount, as (pkt_len - 64) is for short packets) is
+     * undefined behavior in C, even if the result is later overwritten.
+     */
+    uint64_t mask1 = UINT64_MAX;
+    if (pkt_len < 64) {
+        mask1 = (1ULL << pkt_len) - 1;
+    }
+    __mmask64 pkt_len_mask_0 = mask1;
+
+    uint64_t mask2 = 0;
+    if (pkt_len >= 128) {
+        mask2 = UINT64_MAX;
+    } else if (pkt_len > 64) {
+        mask2 = (1ULL << (pkt_len - 64)) - 1;
+    }
+    __mmask64 pkt_len_mask_1 = mask2;
+
+    __m512i v_pkt_data_0 = _mm512_maskz_loadu_epi8(pkt_len_mask_0, &pkt[0]);
+    __m512i v_pkt_data_1 = _mm512_maskz_loadu_epi8(pkt_len_mask_1, &pkt[64]);
+
+    /* Loop over the patterns provided. Note that this loop can be compile-time
+     * unrolled for specialized versions with set numbers of patterns.
+     */
+    uint32_t hitmask = 0;
+
+    for (uint32_t i = 0; i < num_patterns; i++) {
+        struct packet_pattern *patterns = cache->patterns;
+
+        /* Mask and match the packet data and pattern, results in hit bit. */
+        __m512i v_mask_0 = _mm512_loadu_si512(&patterns[i].mask[0]);
+        __m512i v_data_0 = _mm512_loadu_si512(&patterns[i].data[0]);
+        __m512i v_pkt_masked = _mm512_and_si512(v_pkt_data_0, v_mask_0);
+        __mmask64 cmp_mask = _mm512_cmpeq_epi8_mask(v_pkt_masked, v_data_0);
+
+        uint32_t hit = (cmp_mask == UINT64_MAX);
+        hitmask |= (hit << i);
+    }
+
+    /* Check packet len to ensure the packet data filled the whole pattern.
+     * Saturate the length at UINT8_MAX first: the compare below operates at
+     * u8 width, and plain truncation would wrongly reject e.g. a 256 byte
+     * packet (256 truncates to 0, which is "shorter" than any min_len).
+     */
+    uint8_t pkt_len_u8 = pkt_len > UINT8_MAX ? UINT8_MAX : pkt_len;
+    __mmask16 min_len_mask = (1 << num_patterns) - 1;
+    __m128i v_pattern_min_lens = _mm_maskz_loadu_epi8(min_len_mask,
+                                        (void *)cache->min_len);
+    __m128i v_pkt_len = _mm_maskz_set1_epi8(min_len_mask, pkt_len_u8);
+    uint32_t pkt_len_valid_mask = _mm_mask_cmpge_epu8_mask(min_len_mask,
+                                                           v_pkt_len,
+                                                           v_pattern_min_lens);
+
+    /* Strip away hit if packet was too short for the pattern. */
+    hitmask &= pkt_len_valid_mask;
+
+    /* If a pattern was hit, build the miniflow using the pattern shuffle. */
+    if (OVS_LIKELY(hitmask)) {
+        uint32_t idx = __builtin_ctzll(hitmask);
+
+        /* Copy the pattern miniflow bits to the destination miniflow. Use an
+         * unaligned load: the cache entry is not guaranteed 16-byte aligned,
+         * and loadu costs the same when the data happens to be aligned.
+         */
+        struct miniflow *pattern_mf_bits = &cache->miniflow_bits[idx];
+        __m128i v_pattern_mf_bits = _mm_loadu_si128((void *)pattern_mf_bits);
+        _mm_storeu_si128((void *)mf, v_pattern_mf_bits);
+
+        /* Load miniflow building metadata. */
+        struct packet_pattern_shuffle *shuffle = &cache->shuffles[idx];
+        __mmask64 k_shuf_0 = shuffle->kmasks[0];
+        __m512i v_shuf_mask_0 = _mm512_loadu_si512(&shuffle->shuffle[0]);
+        __m512i v_ins_0 = _mm512_loadu_si512(&shuffle->insert[0]);
+
+        /* Compute bytes 0-63 and merge in pattern-required bits. */
+        __m512i v_mf_blocks_0 = packet_shuffle_avx512(k_shuf_0, v_pkt_data_0,
+                                    v_shuf_mask_0, v_pkt_data_1, use_vbmi);
+        __m512i v_mf_blocks_ins_0 = _mm512_or_si512(v_mf_blocks_0, v_ins_0);
+
+        /* If required, compute bytes 64-127 and merge in pattern bits. */
+        __m512i v_mf_blocks_ins_1 = _mm512_setzero_si512();
+        __mmask64 k_shuf_1 = shuffle->kmasks[1];
+        if (k_shuf_1) {
+            __m512i v_shuf_mask_1 = _mm512_loadu_si512(&shuffle->shuffle[64]);
+            __m512i v_mf_blocks_1 = packet_shuffle_avx512(k_shuf_1,
+                                        v_pkt_data_0, v_shuf_mask_1,
+                                        v_pkt_data_1, use_vbmi);
+            __m512i v_ins_1 = _mm512_loadu_si512(&shuffle->insert[64]);
+            v_mf_blocks_ins_1 = _mm512_or_si512(v_mf_blocks_1, v_ins_1);
+        }
+
+        /* Miniflow Blocks contains first 2 blocks of non-packet-parsed data,
+         * such as the dp hash, in port, ct_mark, and packet_type. On outer
+         * packets, they are always zero except for in_port.
+         */
+        uint64_t *mf_blocks = miniflow_values(mf);
+        __m128i v_blocks_01 = _mm_setzero_si128();
+        v_blocks_01 = _mm_insert_epi32(v_blocks_01, in_port, 1);
+        _mm_storeu_si128((void *)&mf_blocks[0], v_blocks_01);
+
+        /* Store the computed miniflow blocks. NOTE(review): this writes a
+         * fixed 2 + 16 blocks; confirm the caller's miniflow buffer (the
+         * netdev_flow_key buf) is always large enough for 18 u64 blocks.
+         */
+        _mm512_storeu_si512(&mf_blocks[2], v_mf_blocks_ins_0);
+        _mm512_storeu_si512(&mf_blocks[2 + 8], v_mf_blocks_ins_1);
+
+        /* Set dp packet offsets from the pattern metadata. */
+        memcpy(&dp_pkt->l2_pad_size, &shuffle->offsets,
+               sizeof(struct packet_offsets));
+    }
+
+    return hitmask;
+}
+
+/* TODO: This function accepts a string, which represents the pattern and
+ * shuffles required for the users traffic type. Today this function has a
+ * hard-coded pattern for Ether()/IP()/UDP() packets.
+ *
+ * A future revision of this patchset will include the parsing of the input
+ * string to create the patterns, providing runtime flexibility in parsing
+ * packets into miniflows.
+ */
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string)
+{
+    /* Check that the runtime CPU has the required ISA available. Also check
+     * for AVX-512 Vector Bit Manipulation Instructions (VBMI), which allow a
+     * faster code-path to be used due to a native byte permute instruction.
+     */
+    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+    avx512vbmi_available = dpdk_get_cpu_has_isa("x86_64", "avx512vbmi");
+
+    uint32_t min_isa_ok = avx512f_available && bmi2_available;
+    printf("%s : minimum ISA available: %s, AVX-512 VBMI available: %s\n",
+           __func__, min_isa_ok ? "yes" : "no",
+           avx512vbmi_available ? "yes" : "no");
+    if (!min_isa_ok) {
+        return -ENOTSUP;
+    }
+
+    /* The pattern string is not parsed yet; only the hard-coded pattern
+     * below is installed today.
+     */
+    (void)pattern_string;
+
+    /* Add hard-coded Ether/IPv4/UDP implementation for demonstration. */
+    patterns_control_path.active_pattern_count = 1;
+
+    /* Ether/IPv4/UDP pattern metadata. */
+    patterns_control_path.patterns[0] = (struct packet_pattern) {
+        .mask = { PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK },
+        .data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_UDP },
+    };
+
+    printf("%s: pattern 0 mask:\n", __func__);
+    ovs_hex_dump(stdout, &patterns_control_path.patterns[0].mask,
+                 MAX_PATTERN_SIZE, 0, false);
+    printf("%s: pattern 0 data:\n", __func__);
+    ovs_hex_dump(stdout, &patterns_control_path.patterns[0].data,
+                 MAX_PATTERN_SIZE, 0, false);
+
+    patterns_control_path.miniflow_bits[0] = (struct miniflow) {
+        .map = { .bits = {0x18a0000000000000, 0x0000000000040401}, }
+    };
+    /* Cast to unsigned long long so the format specifier is portable for
+     * uint64_t on both LP64 and LLP64 platforms.
+     */
+    struct miniflow *mf_bits = &patterns_control_path.miniflow_bits[0];
+    printf("pattern[0] mf bits %08llx %08llx\n",
+           (unsigned long long) mf_bits->map.bits[0],
+           (unsigned long long) mf_bits->map.bits[1]);
+
+    patterns_control_path.min_len[0] = ETHER_IPV4_UDP_LEN;
+
+    /* Kmask and Shuffle for Ether/IPv4/UDP. Created by inspecting miniflow
+     * built from packet data, and reproduced using AVX-512 instructions with
+     * k-masks to zero parts of the miniflow as required.
+     */
+    patterns_control_path.shuffles[0] = (struct packet_pattern_shuffle) {
+        .kmasks = { 0b0000111111110000111111110011111111111111, 0 },
+        .offsets = {
+            .l2_pad_size = 0,
+            .l2_5_ofs = UINT16_MAX,
+            .l3_ofs = 14,
+            .l4_ofs = 34,
+        },
+        .shuffle = {PATTERN_IPV4_UDP_SHUFFLE},
+    };
+    printf("pattern[0] kmask[0] %08llx, kmask[1] %08llx, shuffle hexdump:\n",
+           (unsigned long long) patterns_control_path.shuffles[0].kmasks[0],
+           (unsigned long long) patterns_control_path.shuffles[0].kmasks[1]);
+    ovs_hex_dump(stdout, &patterns_control_path.shuffles[0], MAX_PATTERN_SIZE,
+                 0, false);
+
+    return 0;
+}
+
+/* Datapath entry point for CPUs with AVX-512 but without VBMI; runs the
+ * pattern matcher with the slower 16-bit permute based shuffle.
+ * Returns non-zero if a pattern hit built the miniflow, zero on miss.
+ */
+static uint32_t
+miniflow_extract_avx512(struct dp_netdev_pmd_thread *pmd,
+                        struct dp_packet *packet,
+                        struct miniflow *mf)
+{
+    /* TODO: alloc pattern cache per PMD thread. */
+    (void)pmd;
+
+    /* Execute the pattern matching using the control-path pattern cache,
+     * with the VBMI fast path disabled.
+     */
+    const uint32_t pattern_count = 1;
+    const uint32_t vbmi_enabled = 0;
+
+    return packet_pattern_avx512(packet, mf, &patterns_control_path,
+                                 pattern_count, vbmi_enabled);
+}
+
+/* This function will only be used if AVX-512 VBMI instructions are available
+ * on the CPU. As such, we use the __target__ attribute to enable VBMI ISA.
+ */
+/* Datapath entry point for CPUs supporting AVX-512 VBMI; identical to
+ * miniflow_extract_avx512() except the native byte permute path is enabled.
+ * Returns non-zero if a pattern hit built the miniflow, zero on miss.
+ */
+static uint32_t __attribute__((__target__("avx512vbmi")))
+miniflow_extract_avx512_vbmi(struct dp_netdev_pmd_thread *pmd,
+                             struct dp_packet *packet,
+                             struct miniflow *mf)
+{
+    /* TODO: alloc pattern cache per PMD thread. */
+    (void)pmd;
+
+    /* Execute the pattern matching using the control-path pattern cache,
+     * with the VBMI fast path enabled.
+     */
+    const uint32_t pattern_count = 1;
+    const uint32_t vbmi_enabled = 1;
+
+    return packet_pattern_avx512(packet, mf, &patterns_control_path,
+                                 pattern_count, vbmi_enabled);
+}
+
+/* The study function runs the patterns from the control-path, and based on
+ * some hit statistics can copy the pattern to the per-PMD pattern cache. Part
+ * of the study() functionality is also to validate that hits on a pattern
+ * result in an identical miniflow as the scalar miniflow_extract() function.
+ * This is validated by calling the scalar version, and comparing output.
+ */
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst)
+{
+    static volatile int debug = 0;
+
+    /* Run using the user supplied patterns. */
+    uint32_t match = miniflow_extract_avx512(pmd, packet, dst);
+
+    if (debug || match) {
+        /* Save off AVX512 created dp_packet offsets for verification. */
+        struct packet_offsets vec_offsets;
+        memcpy(&vec_offsets, &packet->l2_pad_size,
+               sizeof(struct packet_offsets));
+
+        /* Check the result vs the scalar miniflow-extract for correctness. */
+        struct netdev_flow_key scalar_mf_key = {0};
+        struct miniflow *scalar_mf = &scalar_mf_key.mf;
+        miniflow_extract(packet, scalar_mf);
+
+        /* Validate miniflow data is identical. */
+        uint32_t mf_bit_count = count_1bits(scalar_mf->map.bits[0]) +
+                                count_1bits(scalar_mf->map.bits[1]);
+        size_t compare_size = sizeof(uint64_t) * (2 + mf_bit_count);
+        if (memcmp(scalar_mf, dst, compare_size)) {
+            printf("%s: Scalar miniflow output:\n", __func__);
+            ovs_hex_dump(stdout, scalar_mf, compare_size, 0, false);
+            printf("%s: AVX512 miniflow output:\n", __func__);
+            ovs_hex_dump(stdout, dst, compare_size, 0, false);
+            printf("error in miniflow compare, see hexdumps() above\n");
+        }
+
+        /* Validate that dp_packet offsets are identical. */
+        if (memcmp(&vec_offsets, &packet->l2_pad_size,
+                   sizeof(struct packet_offsets))) {
+            printf("VECTOR code DP packet properties: %d, %d, %d, %d\n",
+                   vec_offsets.l2_pad_size, vec_offsets.l2_5_ofs,
+                   vec_offsets.l3_ofs, vec_offsets.l4_ofs);
+            printf("Scalar code DP packet properties: %d, %d, %d, %d\n",
+                   packet->l2_pad_size, packet->l2_5_ofs, packet->l3_ofs,
+                   packet->l4_ofs);
+            /* A bare string literal is always true, so the previous
+             * ovs_assert(<string>) could never fire; negate the literal so
+             * the assert actually triggers on mismatch.
+             */
+            ovs_assert(!"error in packet offsets, see printf()s above");
+        }
+
+    }
+
+    /* Check if the study function should study more packets, or if it is
+     * done. When done, we change the per-PMD function pointer to the datapath
+     * implementation without study for better performance. The decrement is
+     * guarded so an already-zero (unsigned) counter cannot wrap around.
+     */
+    if (pmd->miniflow_study_pkts > 0 && --pmd->miniflow_study_pkts == 0) {
+        printf("%s : setting func ptr to remove study(), study_pkts = 0\n",
+               __func__);
+        pmd->miniflow_extract_opt = miniflow_extract_avx512;
+        if (avx512vbmi_available) {
+            pmd->miniflow_extract_opt = miniflow_extract_avx512_vbmi;
+        }
+    }
+
+    return match;
+}
+
+#endif /* SPARSE */
+#endif /* __x86_64__ */
new file mode 100644
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Include guard added: this header is pulled in from multiple translation
+ * units (dpif-netdev-avx512.c and the extract implementation itself).
+ */
+#ifndef DPIF_NETDEV_AVX512_EXTRACT_H
+#define DPIF_NETDEV_AVX512_EXTRACT_H 1
+
+#include <stdint.h>
+
+#include "flow.h"
+#include "dpif-netdev-private-thread.h"
+
+/* TODO: This function accepts a string, which represents the pattern and
+ * shuffles required for the users traffic type. Today this function has a
+ * hard-coded pattern for Ether()/IP()/UDP() packets.
+ *
+ * A future revision of this patchset will include the parsing of the input
+ * string to create the patterns, providing runtime flexibility in parsing
+ * packets into miniflows.
+ *
+ * Returns 0 on success, or -ENOTSUP if the CPU lacks the required ISA.
+ */
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string);
+
+/* The study function runs the patterns from the control-path, and based on
+ * some hit statistics can copy the pattern to the per-PMD pattern cache. Part
+ * of the study() functionality is also to validate that hits on a pattern
+ * result in an identical miniflow as the scalar miniflow_extract() function.
+ * This is validated by calling the scalar version, and comparing output.
+ *
+ * Returns non-zero if a pattern matched the packet, zero on miss.
+ */
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst);
+
+#endif /* DPIF_NETDEV_AVX512_EXTRACT_H */
@@ -34,6 +34,7 @@
#include "immintrin.h"
+#include "dpif-netdev-avx512-extract.h"
/* Structure to contain per-packet metadata that must be attributed to the
* dp netdev flow. This is unfortunate to have to track per packet, however
@@ -116,7 +117,16 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
struct dp_packet *packet = packets->packets[i];
pkt_metadata_init(&packet->md, in_port);
struct netdev_flow_key *key = &keys[i];
- miniflow_extract(packet, &key->mf);
+
+ if (pmd->miniflow_extract_opt) {
+ uint32_t matched = pmd->miniflow_extract_opt(pmd, packet,
+ &key->mf);
+ if (!matched) {
+ miniflow_extract(packet, &key->mf);
+ }
+ } else {
+ miniflow_extract(packet, &key->mf);
+ }
/* Cache TCP and byte values for all packets */
pkt_meta[i].bytes = dp_packet_size(packet);
new file mode 100644
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "dpif-netdev-private-extract.h"
+#include "openvswitch/vlog.h"
+#include "util.h"
+
+VLOG_DEFINE_THIS_MODULE(dpif_netdev_extract);
+
+/* Prototypes of the optimized implementations, defined in
+ * lib/dpif-netdev-avx512-extract.c.
+ * NOTE(review): miniflow_extract_avx512_probe() is declared here but never
+ * referenced in this file -- confirm it is still needed, or remove it.
+ */
+int32_t
+miniflow_extract_avx512_probe(void);
+
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string);
+
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst);
+
+/* Implementations of available extract opts. Entry 0 ("disable") has NULL
+ * function pointers, which callers use to turn the optimized path off.
+ */
+static struct dpif_miniflow_extract_opt mfex_impl[] = {
+    {
+        .extract_func = NULL,
+        .insert_func = NULL,
+        .name = "disable",
+    },
+
+/* Only enable AVX512 if compile time criteria are met. */
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD)
+    {
+        /* The study variant is installed first; it swaps itself out for the
+         * plain extract function once enough packets have been validated.
+         */
+        .extract_func = miniflow_extract_avx512_study,
+        .insert_func = miniflow_extract_avx512_insert,
+        .name = "avx512",
+    },
+#endif
+};
+
+
+/* Looks up an extract implementation by name.
+ * On success returns 0 and stores the implementation in '*opt'; returns
+ * -EINVAL (leaving '*opt' untouched) when no implementation matches.
+ */
+int32_t
+dpif_miniflow_extract_opt_get(const char *name,
+                              struct dpif_miniflow_extract_opt **opt)
+{
+    ovs_assert(opt);
+
+    /* Linear scan is fine: the implementation table is tiny. */
+    for (size_t idx = 0; idx < ARRAY_SIZE(mfex_impl); idx++) {
+        if (!strcmp(mfex_impl[idx].name, name)) {
+            *opt = &mfex_impl[idx];
+            return 0;
+        }
+    }
+
+    return -EINVAL;
+}
new file mode 100644
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Guard renamed to match this file (dpif-netdev-private-extract.h); the
+ * previous DPIF_NETDEV_AVX512_EXTRACT name belonged to a different header
+ * and risked a collision once that header gains its own guard.
+ */
+#ifndef DPIF_NETDEV_PRIVATE_EXTRACT
+#define DPIF_NETDEV_PRIVATE_EXTRACT 1
+
+/* For int32_t / uint32_t used below; keeps this header self-contained. */
+#include <stdint.h>
+
+/* Forward declarations. */
+struct dp_packet;
+struct miniflow;
+struct dp_netdev_pmd_thread;
+
+/* Function pointer prototype to be implemented in the optimized miniflow
+ * extract code.
+ */
+typedef uint32_t (*miniflow_extract_func)(struct dp_netdev_pmd_thread *pmd,
+                                          struct dp_packet *packet,
+                                          struct miniflow *mf);
+
+/* Function pointer prototype to be implemented by optimized miniflow extract
+ * code, to implement handling a new traffic pattern.
+ * Returns 0 on success
+ * Returns -ENOTSUP if the CPU does not support the required ISA
+ */
+typedef int32_t (*template_insert_func)(const char *pattern_string);
+
+/* Structure representing the attributes of an optimized implementation. */
+struct dpif_miniflow_extract_opt {
+    /* Function to call to extract miniflows from a packet. */
+    miniflow_extract_func extract_func;
+
+    /* Function called to insert a new traffic pattern. */
+    template_insert_func insert_func;
+
+    /* Name of the optimized implementation. Points at a string literal in
+     * the implementation table, hence const qualified.
+     */
+    const char *name;
+};
+
+/* Returns the opt structure for the requested implementation by name.
+ * Returns zero on success, and opt points to a valid struct, or
+ * returns a negative failure status.
+ * -EINVAL : invalid name requested
+ */
+int32_t
+dpif_miniflow_extract_opt_get(const char *name,
+                              struct dpif_miniflow_extract_opt **opt);
+
+#endif /* DPIF_NETDEV_PRIVATE_EXTRACT */
@@ -147,6 +147,7 @@ struct dp_netdev_actions {
struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
};
+
#ifdef __cplusplus
}
#endif
@@ -29,6 +29,7 @@
#include "openvswitch/thread.h"
#include "dpif-netdev-private-dpif.h"
+#include "dpif-netdev-private-extract.h"
#ifdef __cplusplus
extern "C" {
@@ -108,6 +109,14 @@ struct dp_netdev_pmd_thread {
/* Function pointer to call for dp_netdev_input() functionality */
dp_netdev_input_func netdev_input_func;
+ /* Function pointer to call for miniflow_extract() functionality */
+ miniflow_extract_func miniflow_extract_opt;
+ /* Number of miniflow packets to study before selecting miniflow
+ * implementation. Depending on variability in traffic, a higher number
+ * allows longer inspection of traffic to ensure all are covered.
+ */
+ uint32_t miniflow_study_pkts;
+
struct seq *reload_seq;
uint64_t last_reload_seq;
@@ -46,6 +46,7 @@
#include "dpif-netdev-lookup.h"
#include "dpif-netdev-perf.h"
#include "dpif-netdev-private-dfc.h"
+#include "dpif-netdev-private-extract.h"
#include "dpif-provider.h"
#include "dummy.h"
#include "fat-rwlock.h"
@@ -990,6 +991,109 @@ dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc,
ds_destroy(&reply);
}
+static void
+dpif_miniflow_extract_template_add(struct unixctl_conn *conn, int argc,
+                                   const char *argv[], void *aux OVS_UNUSED)
+{
+    /* This function requires just one mandatory parameter, the
+     * implementation name. Optional parameters are the template string, the
+     * study packet count, and the datapath instance.
+     */
+    const char *mfex_impl_name = argv[1];
+
+    struct dpif_miniflow_extract_opt *mf_opt = NULL;
+    int err = dpif_miniflow_extract_opt_get(mfex_impl_name, &mf_opt);
+    if (err) {
+        struct ds reply = DS_EMPTY_INITIALIZER;
+        ds_put_format(&reply, "Miniflow Extract %s not found.",
+                      mfex_impl_name);
+        const char *reply_str = ds_cstr(&reply);
+        unixctl_command_reply(conn, reply_str);
+        VLOG_INFO("%s", reply_str);
+        ds_destroy(&reply);
+        return;
+    }
+
+    /* Providing "disable" as implementation name has no insert func. */
+    if (mf_opt->insert_func) {
+        /* The command is registered with a minimum of one argument, so
+         * argv[2] must not be read unless it was actually supplied.
+         */
+        if (argc < 3) {
+            unixctl_command_reply_error(conn,
+                                        "please specify a template to insert");
+            return;
+        }
+
+        /* Insert the new pattern. There is ongoing work on designing the
+         * interaction between the string here, and the patterns in the
+         * miniflow extract optimized code.
+         */
+        const char *pattern_string = argv[2];
+        int32_t insert_err = mf_opt->insert_func(pattern_string);
+        if (OVS_UNLIKELY(insert_err)) {
+            struct ds reply = DS_EMPTY_INITIALIZER;
+
+            if (insert_err == -ENOTSUP) {
+                ds_put_format(&reply, "Miniflow Extract %s not available. "
+                              "This CPU does not support the required ISA.\n",
+                              mfex_impl_name);
+            } else {
+                ds_put_format(&reply, "Miniflow Extract %s insert failed. "
+                              "Check the pattern data and command arguments.\n",
+                              mfex_impl_name);
+            }
+
+            const char *reply_str = ds_cstr(&reply);
+            unixctl_command_reply(conn, reply_str);
+            VLOG_INFO("%s", reply_str);
+            ds_destroy(&reply);
+            return;
+        }
+    }
+
+    ovs_mutex_lock(&dp_netdev_mutex);
+    struct dp_netdev *dp = NULL;
+
+    /* Optional argument, if passed, study this number of packets. Defaults
+     * to 10k. Non-numeric or non-positive input keeps the default.
+     */
+    uint32_t study_pkts = 10000;
+    if (argc >= 4) {
+        int study_arg = atoi(argv[3]);
+        if (study_arg > 0) {
+            study_pkts = study_arg;
+        }
+    }
+
+    if (argc == 5) {
+        dp = shash_find_data(&dp_netdevs, argv[4]);
+    } else if (shash_count(&dp_netdevs) == 1) {
+        dp = shash_first(&dp_netdevs)->data;
+    }
+
+    if (!dp) {
+        ovs_mutex_unlock(&dp_netdev_mutex);
+        unixctl_command_reply_error(conn,
+                                    "please specify an existing datapath");
+        return;
+    }
+
+    /* Get PMD threads list. */
+    size_t n;
+    struct dp_netdev_pmd_thread **pmd_list;
+    sorted_poll_thread_list(dp, &pmd_list, &n);
+
+    for (size_t i = 0; i < n; i++) {
+        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
+        if (pmd->core_id == NON_PMD_CORE_ID) {
+            continue;
+        }
+
+        /* Set PMD context to study N packets. */
+        pmd->miniflow_study_pkts = study_pkts;
+
+        /* Set PMD threads DPIF implementation to requested one. */
+        pmd->miniflow_extract_opt = mf_opt->extract_func;
+    }
+    /* sorted_poll_thread_list() allocates the list; free it as other
+     * callers do -- TODO confirm ownership against the existing callers.
+     */
+    free(pmd_list);
+    ovs_mutex_unlock(&dp_netdev_mutex);
+
+    /* Reply with success to command. */
+    struct ds reply = DS_EMPTY_INITIALIZER;
+    ds_put_format(&reply, "miniflow extract opt impl %s.\n", mfex_impl_name);
+    const char *reply_str = ds_cstr(&reply);
+    unixctl_command_reply(conn, reply_str);
+    VLOG_INFO("%s", reply_str);
+    ds_destroy(&reply);
+}
+
static void
dpif_netdev_impl_set(struct unixctl_conn *conn, int argc,
const char *argv[], void *aux OVS_UNUSED)
@@ -1288,6 +1392,10 @@ dpif_netdev_init(void)
"[dpif implementation name] [dp]",
1, 2, dpif_netdev_impl_set,
NULL);
+ unixctl_command_register("dpif-netdev/miniflow-template-add",
+ "[impl name] [template] [study pkt count] [dp]",
+ 1, 4, dpif_miniflow_extract_template_add,
+ NULL);
return 0;
}
@@ -6127,6 +6235,9 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
/* Initialize the DPIF function pointer to the default scalar version */
pmd->netdev_input_func = dp_netdev_impl_get_default();
+ /* Initialize the miniflow extract function pointer not set */
+ pmd->miniflow_extract_opt = NULL;
+
/* init the 'flow_cache' since there is no
* actual thread created for NON_PMD_CORE_ID. */
if (core_id == NON_PMD_CORE_ID) {
This commit refactors the way in which the DPIF component can call the miniflow-extract function. It creates flexibility in the DPIF component by adding a function pointer at the pmd level. A new miniflow extract implementation is created which allows the AVX-512 SIMD instructions to perform the packet matching and building of the miniflow data-structure. All AVX-512 capable CPUs will be able to run the miniflow extract, however CPUs that support the AVX-512 Vector Bit Manipulation Instructions (VBMI) will benefit more as the native byte permute instruction gives extra performance. Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com> --- lib/automake.mk | 5 + lib/dpif-netdev-avx512-extract.c | 528 ++++++++++++++++++++++++++++++ lib/dpif-netdev-avx512-extract.h | 40 +++ lib/dpif-netdev-avx512.c | 12 +- lib/dpif-netdev-private-extract.c | 72 ++++ lib/dpif-netdev-private-extract.h | 60 ++++ lib/dpif-netdev-private-flow.h | 1 + lib/dpif-netdev-private-thread.h | 9 + lib/dpif-netdev.c | 111 +++++++ 9 files changed, 837 insertions(+), 1 deletion(-) create mode 100644 lib/dpif-netdev-avx512-extract.c create mode 100644 lib/dpif-netdev-avx512-extract.h create mode 100644 lib/dpif-netdev-private-extract.c create mode 100644 lib/dpif-netdev-private-extract.h