@@ -32,6 +32,7 @@ lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la
lib_libopenvswitchavx512_la_CFLAGS = \
-mavx512f \
-mavx512bw \
+ -mavx512vl \
-mavx512dq \
-mbmi \
-mbmi2 \
@@ -39,6 +40,7 @@ lib_libopenvswitchavx512_la_CFLAGS = \
$(AM_CFLAGS)
lib_libopenvswitchavx512_la_SOURCES = \
lib/dpif-netdev-lookup-avx512-gather.c \
+ lib/dpif-netdev-avx512-extract.c \
lib/dpif-netdev-avx512.c
lib_libopenvswitchavx512_la_LDFLAGS = \
-static
@@ -107,6 +109,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dp-packet.h \
lib/dp-packet.c \
lib/dpdk.h \
+ lib/dpif-netdev-avx512-extract.h \
lib/dpif-netdev-lookup.h \
lib/dpif-netdev-lookup.c \
lib/dpif-netdev-lookup-autovalidator.c \
@@ -117,6 +120,8 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-private-dpcls.h \
lib/dpif-netdev-private-dpif.c \
lib/dpif-netdev-private-dpif.h \
+ lib/dpif-netdev-private-extract.c \
+ lib/dpif-netdev-private-extract.h \
lib/dpif-netdev-private-flow.h \
lib/dpif-netdev-private-hwol.h \
lib/dpif-netdev-private-thread.h \
new file mode 100644
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions */
+#if !defined(__CHECKER__)
+
+#include <config.h>
+#include <errno.h>
+
+#include "dpif-netdev-avx512-extract.h"
+#include "dpif-netdev-private-thread.h"
+
+#include "immintrin.h"
+
+/* This file contains optimized implementations of miniflow_extract()
+ * for specific common traffic patterns. The optimizations allow for
+ * quick probing of a specific packet type, and if a match with a specific
+ * type is found, a shuffle-like procedure builds up the required miniflow.
+ */
+
+#define MAX_PATTERN_COUNT (8)
+#define MAX_PATTERN_SIZE (128)
+#define MAX_SHUFFLE_COUNT (MAX_PATTERN_SIZE / 64)
+
+/* A structure to represent each matched on packet pattern. The struct is
+ * aligned to MAX_PATTERN_SIZE, presumably so mask[] and data[] each start at
+ * a vector-friendly boundary -- TODO confirm the alignment requirement.
+ */
+struct __attribute__((aligned(MAX_PATTERN_SIZE))) packet_pattern {
+    /* A bitmask to apply to the packet before comparing it to the pattern.
+     * This results in only bits that matter to packet layout remaining.
+     */
+    uint8_t mask[MAX_PATTERN_SIZE];
+
+    /* The data values to compare the masked packet against. This is the known
+     * fields of the packet which are required for a specific layout. E.g. an
+     * Ether/IPv4 packet has a 0x0800 ethertype, and the 0x0800 is stored here.
+     */
+    uint8_t data[MAX_PATTERN_SIZE];
+
+};
+
+/* Improvement: create this struct in dp-packet.h, and reuse it here. That
+ * would avoid the requirement of the packed attribute.
+ */
+struct __attribute__((packed)) packet_offsets {
+    /* NOTE(review): the field order and widths must mirror the dp_packet
+     * members starting at l2_pad_size, because this struct is memcpy()d
+     * directly over them (see packet_pattern_avx512()).
+     */
+    uint8_t l2_pad_size;
+    uint16_t l2_5_ofs;
+    uint16_t l3_ofs;
+    uint16_t l4_ofs;
+};
+
+/* Structure to represent the data-movement from pattern to miniflow. */
+struct packet_pattern_shuffle {
+    /* Per-64-byte k-masks selecting which shuffled output bytes are kept;
+     * kmasks[0] covers miniflow bytes 0-63, kmasks[1] bytes 64-127. A zero
+     * kmasks[1] means the second half is not computed at all.
+     */
+    uint64_t kmasks[MAX_SHUFFLE_COUNT];
+
+    /* dp_packet layer offsets to apply on a pattern hit. */
+    struct packet_offsets offsets;
+
+    /* The input data to the data-movement shuffle. This shuffle changes the
+     * layout of the packet data into the miniflow blocks shape.
+     */
+    uint8_t shuffle[MAX_PATTERN_SIZE];
+
+    /* Data to be merged into the resulting miniflow blocks. This is required
+     * for e.g. VLAN TCI, which generates a bit in the block even if the packet
+     * didn't originally have it.
+     */
+    uint8_t insert[MAX_PATTERN_SIZE];
+};
+
+/* Structure that represents all per-thread pattern data. */
+struct packet_pattern_cache {
+    /* Minimum packet len for this pattern index to be a valid candidate. */
+    uint8_t min_len[MAX_PATTERN_COUNT];
+
+    /* Number of active patterns to match against. */
+    uint8_t active_pattern_count;
+
+    /* The mask and compare data itself. */
+    struct packet_pattern patterns[MAX_PATTERN_COUNT];
+
+    /* Miniflow bits that need to be set for each pattern. */
+    struct miniflow miniflow_bits[MAX_PATTERN_COUNT];
+
+    /* Structure to represent the data-movement from pattern to miniflow. */
+    struct packet_pattern_shuffle shuffles[MAX_PATTERN_COUNT];
+
+};
+
+/* Single copy of control-path owned patterns. The contents of this struct will
+ * be updated when the user runs a miniflow-pattern-add command. The contents
+ * of this struct are only read in the datapath during the "study" phase, and
+ * copied into a thread-local memory for the PMD threads for datapath usage.
+ */
+static struct packet_pattern_cache patterns_control_path;
+
+/* Generator for EtherType masks and values. Expands to the 14 bytes of an
+ * Ethernet header: DST MAC and SRC MAC are wildcarded (zero mask), only the
+ * two EtherType bytes are supplied by the caller.
+ */
+#define PATTERN_ETHERTYPE_GEN(type_b0, type_b1) \
+  0, 0, 0, 0, 0, 0, /* Ether MAC DST */                                 \
+  0, 0, 0, 0, 0, 0, /* Ether MAC SRC */                                 \
+  type_b0, type_b1, /* EtherType */
+
+#define PATTERN_ETHERTYPE_MASK PATTERN_ETHERTYPE_GEN(0xFF, 0xFF)
+#define PATTERN_ETHERTYPE_IPV4 PATTERN_ETHERTYPE_GEN(0x08, 0x00)
+
+/* Generator for the 4 bytes of an 802.1Q VLAN tag (TPID + TCI). The macro
+ * body must expand its own parameters: the original referenced
+ * 'pcp_dei_vid0'/'vid1', which are not parameters and would not compile
+ * when the macro is used.
+ */
+#define PATTERN_VLAN_GEN(tpid0, tpid1, tci0, tci1) \
+  tpid0, tpid1, tci0, tci1, /* Whole VLAN header */
+
+#define PATTERN_VLAN_MASK PATTERN_VLAN_GEN(0xFF, 0xFF, (~0x4), 0xFF)
+/* Data must hold the real 802.1Q TPID (0x8100), not the all-ones mask value;
+ * with a full TPID mask, 0xFF,0xFF could never match a real VLAN packet.
+ * NOTE(review): confirm which TCI bit (~0x4) is intended to be wildcarded.
+ */
+#define PATTERN_VLAN_DATA PATTERN_VLAN_GEN(0x81, 0x00, 0, 0)
+
+/* Generator for checking IPv4 ver, ihl, and proto. Expands to the 20 bytes
+ * of a no-options IPv4 header; only the fields supplied by the caller are
+ * matched, the rest are wildcarded.
+ */
+#define PATTERN_IPV4_GEN(VER_IHL, FLAG_OFF_B0, FLAG_OFF_B1, PROTO) \
+  VER_IHL, /* Version and IHL */                                        \
+  0, 0, 0, /* DSCP, ECN, Total Length */                                \
+  0, 0, /* Identification */                                            \
+  /* Flags/Fragment offset: don't match MoreFrag (MF) or FragOffset */  \
+  FLAG_OFF_B0, FLAG_OFF_B1,                                             \
+  0, /* TTL */                                                          \
+  PROTO, /* Protocol */                                                 \
+  0, 0, /* Header checksum */                                           \
+  0, 0, 0, 0, /* Src IP */                                              \
+  0, 0, 0, 0, /* Dst IP */
+
+#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF)
+#define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
+
+
+/* Length of Ether(14) + IPv4(20) + UDP(8) headers. */
+#define ETHER_IPV4_UDP_LEN (42)
+
+/* "NU" marks shuffle output bytes that are Not Used; the corresponding
+ * k-mask bit zeroes them rather than sourcing packet data.
+ * NOTE(review): in the IPv4 lane the run "20, 15, 22, 23" mixes index 15
+ * (an Ethernet SRC MAC byte) in with 20..23 -- confirm this is intended
+ * and not a typo for 21.
+ */
+#define NU 0
+#define PATTERN_IPV4_UDP_SHUFFLE \
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, NU, NU, /* Ether */ \
+  26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */ \
+  34, 35, 36, 37, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* UDP */
+
+/* Cached result of the runtime AVX-512 VBMI ISA probe; set by
+ * miniflow_extract_avx512_insert() and read when study() completes.
+ */
+static int avx512vbmi_available;
+
+/* Enable Icelake AVX-512 VBMI ISA for only this function. That allows the
+ * compiler to emit the instruction here, but not use AVX-512 VBMI outside
+ * of this function.
+ */
+static inline __m512i __attribute__((__target__("avx512vbmi")))
+packet_shuffle_avx512_icx(__mmask64 k_mask, __m512i v_pkt_data_0,
+                          __m512i v_shuf_mask, __m512i v_pkt_data_1)
+{
+    /* Native byte-granularity two-source permute: each byte of v_shuf_mask
+     * selects a byte from the 128-byte concatenation of v_pkt_data_0 and
+     * v_pkt_data_1; k_mask zeroes unselected output bytes.
+     */
+    return _mm512_maskz_permutex2var_epi8(k_mask, v_pkt_data_0,
+                                          v_shuf_mask, v_pkt_data_1);
+}
+
+/* This function provides a Skylake and higher fallback for the byte-shuffle
+ * that is required to implement miniflow extract correctly.
+ */
+static inline __m512i
+packet_shuffle_avx512(__mmask64 k_mask, __m512i v_data_0, __m512i v_shuf_idxs,
+                      __m512i v_data_1, uint32_t use_vbmi)
+{
+    /* Emulates the byte-granularity permutex2var of the VBMI path above
+     * using only AVX-512BW 16-bit permutes: shuffle at u16 width twice
+     * (once per byte parity of the index), then rotate and blend the two
+     * results back into byte positions.
+     */
+    if (use_vbmi) {
+        return packet_shuffle_avx512_icx(k_mask, v_data_0,
+                                         v_shuf_idxs, v_data_1);
+    }
+
+    /* Clear away ODD lane bytes, shift down by 1 to get u8 to u16 idxs. */
+    const __mmask64 k_mask_odd_lanes = 0xAAAAAAAAAAAAAAAA;
+    __m512i v_shuf_idx_evn = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                                    v_shuf_idxs, _mm512_setzero_si512());
+    v_shuf_idx_evn = _mm512_srli_epi16(v_shuf_idx_evn, 1);
+
+    /* Clear away EVEN lane bytes by shifting out. Shift EVEN lane indexes down
+     * by one bit too to achieve u8 to u16 conversion.
+     */
+    __m512i v_shuf_idx_odd = _mm512_srli_epi16(v_shuf_idxs, 9);
+
+    /* Shuffle each of odd/even at 16-bit width. */
+    __m512i v_shuf1 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_evn,
+                                                v_data_1);
+    __m512i v_shuf2 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_odd,
+                                                v_data_1);
+
+    /* Find if the shuffle index was odd, via mask and compare. */
+    uint16_t index_odd_mask = 0x1;
+    const __m512i v_index_mask_u16 = _mm512_set1_epi16(index_odd_mask);
+
+    /* EVEN lanes, find if u8 index was odd, result as u16 bitmask. */
+    __m512i v_idx_even_masked = _mm512_and_si512(v_shuf_idxs,
+                                                 v_index_mask_u16);
+    __mmask32 evn_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_even_masked,
+                                                        v_index_mask_u16);
+
+    /* ODD lanes, find if u8 index was odd, result as u16 bitmask. */
+    __m512i v_shuf_idx_srli8 = _mm512_srli_epi16(v_shuf_idxs, 8);
+    __m512i v_idx_odd_masked = _mm512_and_si512(v_shuf_idx_srli8,
+                                                v_index_mask_u16);
+    __mmask32 odd_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_odd_masked,
+                                                        v_index_mask_u16);
+    /* NOTE(review): the odd-lane mask is inverted relative to the even-lane
+     * one; presumably because an even source index already leaves the byte
+     * in the high half for odd destination lanes -- confirm with a test.
+     */
+    odd_rotate_mask = ~odd_rotate_mask;
+
+    /* Rotate based on low-bit-set bitmask, and blend results. */
+    __m512i v_shuf_res_evn = _mm512_mask_srli_epi16(v_shuf1,
+                                        evn_rotate_mask, v_shuf1, 8);
+    __m512i v_shuf_res_odd = _mm512_mask_slli_epi16(v_shuf2,
+                                        odd_rotate_mask, v_shuf2, 8);
+
+    /* Blend results of two halves back together. */
+    __m512i v_shuf_result = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                                        v_shuf_res_evn, v_shuf_res_odd);
+
+    /* k-mask the final result as requested. This is not easy to do before
+     * here, as the instructions operate at u16 size, meaning the k-mask would
+     * be interpreted as the wrong size.
+     */
+    __m512i v_zeros = _mm512_setzero_si512();
+    __m512i v_shuf_res_masked = _mm512_mask_blend_epi8(k_mask, v_zeros,
+                                                       v_shuf_result);
+    return v_shuf_res_masked;
+}
+
+
+/* Matches all patterns provided, building the appropriate miniflow for a hit.
+ *
+ * Note that this function is compile-time specialized into two variants, one
+ * for CPUs that support AVX-512 Vector Bit Manipulation Instructions (VBMI),
+ * and another for those that support AVX-512 but not AVX-512 VBMI.
+ */
+static inline __attribute__((always_inline)) uint32_t
+packet_pattern_avx512(struct dp_packet *dp_pkt, struct miniflow *mf,
+                      struct packet_pattern_cache *cache,
+                      const uint32_t num_patterns,
+                      const uint32_t use_vbmi)
+{
+    uint8_t *pkt = dp_packet_data(dp_pkt);
+    const uint32_t pkt_len = dp_packet_size(dp_pkt);
+    uint32_t in_port = odp_to_u32(dp_pkt->md.in_port.odp_port);
+
+    /* Masked loads to only read the valid packet data. The shift amounts
+     * are guarded with explicit branches: shifting a 64-bit value by >= 64
+     * (or by a negative amount, as (pkt_len - 64) is for short packets) is
+     * undefined behavior in C, even if the result is later overwritten.
+     */
+    uint64_t mask1 = UINT64_MAX;
+    if (pkt_len < 64) {
+        mask1 = (1ULL << pkt_len) - 1;
+    }
+    __mmask64 pkt_len_mask_0 = mask1;
+
+    uint64_t mask2 = 0;
+    if (pkt_len >= 128) {
+        mask2 = UINT64_MAX;
+    } else if (pkt_len > 64) {
+        mask2 = (1ULL << (pkt_len - 64)) - 1;
+    }
+    __mmask64 pkt_len_mask_1 = mask2;
+
+    __m512i v_pkt_data_0 = _mm512_maskz_loadu_epi8(pkt_len_mask_0, &pkt[0]);
+    __m512i v_pkt_data_1 = _mm512_maskz_loadu_epi8(pkt_len_mask_1, &pkt[64]);
+
+    /* Loop over the patterns provided. Note that this loop can be compile-time
+     * unrolled for specialized versions with set numbers of patterns.
+     */
+    uint32_t hitmask = 0;
+
+    for (uint32_t i = 0; i < num_patterns; i++) {
+        struct packet_pattern *patterns = cache->patterns;
+
+        /* Mask and match the packet data and pattern, results in hit bit. */
+        __m512i v_mask_0 = _mm512_loadu_si512(&patterns[i].mask[0]);
+        __m512i v_data_0 = _mm512_loadu_si512(&patterns[i].data[0]);
+        __m512i v_pkt_masked = _mm512_and_si512(v_pkt_data_0, v_mask_0);
+        __mmask64 cmp_mask = _mm512_cmpeq_epi8_mask(v_pkt_masked, v_data_0);
+
+        uint32_t hit = (cmp_mask == UINT64_MAX);
+        hitmask |= (hit << i);
+    }
+
+    /* Check packet len to ensure the packet data filled the whole pattern.
+     * Saturate the length at UINT8_MAX first: the compare below operates at
+     * u8 width, and plain truncation would wrongly reject e.g. a 256 byte
+     * packet (256 truncates to 0, which is "shorter" than any min_len).
+     */
+    uint8_t pkt_len_u8 = pkt_len > UINT8_MAX ? UINT8_MAX : pkt_len;
+    __mmask16 min_len_mask = (1 << num_patterns) - 1;
+    __m128i v_pattern_min_lens = _mm_maskz_loadu_epi8(min_len_mask,
+                                        (void *)cache->min_len);
+    __m128i v_pkt_len = _mm_maskz_set1_epi8(min_len_mask, pkt_len_u8);
+    uint32_t pkt_len_valid_mask = _mm_mask_cmpge_epu8_mask(min_len_mask,
+                                                           v_pkt_len,
+                                                           v_pattern_min_lens);
+
+    /* Strip away hit if packet was too short for the pattern. */
+    hitmask &= pkt_len_valid_mask;
+
+    /* If a pattern was hit, build the miniflow using the pattern shuffle. */
+    if (OVS_LIKELY(hitmask)) {
+        uint32_t idx = __builtin_ctzll(hitmask);
+
+        /* Copy the pattern miniflow bits to the destination miniflow. Use an
+         * unaligned load: the cache entry is not guaranteed 16-byte aligned,
+         * and loadu costs the same when the data happens to be aligned.
+         */
+        struct miniflow *pattern_mf_bits = &cache->miniflow_bits[idx];
+        __m128i v_pattern_mf_bits = _mm_loadu_si128((void *)pattern_mf_bits);
+        _mm_storeu_si128((void *)mf, v_pattern_mf_bits);
+
+        /* Load miniflow building metadata. */
+        struct packet_pattern_shuffle *shuffle = &cache->shuffles[idx];
+        __mmask64 k_shuf_0 = shuffle->kmasks[0];
+        __m512i v_shuf_mask_0 = _mm512_loadu_si512(&shuffle->shuffle[0]);
+        __m512i v_ins_0 = _mm512_loadu_si512(&shuffle->insert[0]);
+
+        /* Compute bytes 0-63 and merge in pattern-required bits. */
+        __m512i v_mf_blocks_0 = packet_shuffle_avx512(k_shuf_0, v_pkt_data_0,
+                                    v_shuf_mask_0, v_pkt_data_1, use_vbmi);
+        __m512i v_mf_blocks_ins_0 = _mm512_or_si512(v_mf_blocks_0, v_ins_0);
+
+        /* If required, compute bytes 64-127 and merge in pattern bits. */
+        __m512i v_mf_blocks_ins_1 = _mm512_setzero_si512();
+        __mmask64 k_shuf_1 = shuffle->kmasks[1];
+        if (k_shuf_1) {
+            __m512i v_shuf_mask_1 = _mm512_loadu_si512(&shuffle->shuffle[64]);
+            __m512i v_mf_blocks_1 = packet_shuffle_avx512(k_shuf_1,
+                                        v_pkt_data_0, v_shuf_mask_1,
+                                        v_pkt_data_1, use_vbmi);
+            __m512i v_ins_1 = _mm512_loadu_si512(&shuffle->insert[64]);
+            v_mf_blocks_ins_1 = _mm512_or_si512(v_mf_blocks_1, v_ins_1);
+        }
+
+        /* Miniflow Blocks contains first 2 blocks of non-packet-parsed data,
+         * such as the dp hash, in port, ct_mark, and packet_type. On outer
+         * packets, they are always zero except for in_port.
+         */
+        uint64_t *mf_blocks = miniflow_values(mf);
+        __m128i v_blocks_01 = _mm_setzero_si128();
+        v_blocks_01 = _mm_insert_epi32(v_blocks_01, in_port, 1);
+        _mm_storeu_si128((void *)&mf_blocks[0], v_blocks_01);
+
+        /* Store the computed miniflow blocks. NOTE(review): this writes a
+         * fixed 2 + 16 blocks; confirm the caller's miniflow buffer (the
+         * netdev_flow_key buf) is always large enough for 18 u64 blocks.
+         */
+        _mm512_storeu_si512(&mf_blocks[2], v_mf_blocks_ins_0);
+        _mm512_storeu_si512(&mf_blocks[2 + 8], v_mf_blocks_ins_1);
+
+        /* Set dp packet offsets from the pattern metadata. */
+        memcpy(&dp_pkt->l2_pad_size, &shuffle->offsets,
+               sizeof(struct packet_offsets));
+    }
+
+    return hitmask;
+}
+
+/* TODO: This function accepts a string, which represents the pattern and
+ * shuffles required for the users traffic type. Today this function has a
+ * hard-coded pattern for Ether()/IP()/UDP() packets.
+ *
+ * A future revision of this patchset will include the parsing of the input
+ * string to create the patterns, providing runtime flexibility in parsing
+ * packets into miniflows.
+ */
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string)
+{
+    /* Check that the runtime CPU has the required ISA available. Also check
+     * for AVX-512 Vector Bit Manipulation Instructions (VBMI), which allow a
+     * faster code-path to be used due to a native byte permute instruction.
+     */
+    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+    avx512vbmi_available = dpdk_get_cpu_has_isa("x86_64", "avx512vbmi");
+
+    uint32_t min_isa_ok = avx512f_available && bmi2_available;
+    printf("%s : minimum ISA available: %s, AVX-512 VBMI available: %s\n",
+           __func__, min_isa_ok ? "yes" : "no",
+           avx512vbmi_available ? "yes" : "no");
+    if (!min_isa_ok) {
+        return -ENOTSUP;
+    }
+
+    /* The pattern string is not parsed yet; only the hard-coded pattern
+     * below is installed today.
+     */
+    (void)pattern_string;
+
+    /* Add hard-coded Ether/IPv4/UDP implementation for demonstration. */
+    patterns_control_path.active_pattern_count = 1;
+
+    /* Ether/IPv4/UDP pattern metadata. */
+    patterns_control_path.patterns[0] = (struct packet_pattern) {
+        .mask = { PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK },
+        .data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_UDP },
+    };
+
+    printf("%s: pattern 0 mask:\n", __func__);
+    ovs_hex_dump(stdout, &patterns_control_path.patterns[0].mask,
+                 MAX_PATTERN_SIZE, 0, false);
+    printf("%s: pattern 0 data:\n", __func__);
+    ovs_hex_dump(stdout, &patterns_control_path.patterns[0].data,
+                 MAX_PATTERN_SIZE, 0, false);
+
+    patterns_control_path.miniflow_bits[0] = (struct miniflow) {
+        .map = { .bits = {0x18a0000000000000, 0x0000000000040401}, }
+    };
+    /* Cast to unsigned long long so the format specifier is portable for
+     * uint64_t on both LP64 and LLP64 platforms.
+     */
+    struct miniflow *mf_bits = &patterns_control_path.miniflow_bits[0];
+    printf("pattern[0] mf bits %08llx %08llx\n",
+           (unsigned long long) mf_bits->map.bits[0],
+           (unsigned long long) mf_bits->map.bits[1]);
+
+    patterns_control_path.min_len[0] = ETHER_IPV4_UDP_LEN;
+
+    /* Kmask and Shuffle for Ether/IPv4/UDP. Created by inspecting miniflow
+     * built from packet data, and reproduced using AVX-512 instructions with
+     * k-masks to zero parts of the miniflow as required.
+     */
+    patterns_control_path.shuffles[0] = (struct packet_pattern_shuffle) {
+        .kmasks = { 0b0000111111110000111111110011111111111111, 0 },
+        .offsets = {
+            .l2_pad_size = 0,
+            .l2_5_ofs = UINT16_MAX,
+            .l3_ofs = 14,
+            .l4_ofs = 34,
+        },
+        .shuffle = {PATTERN_IPV4_UDP_SHUFFLE},
+    };
+    printf("pattern[0] kmask[0] %08llx, kmask[1] %08llx, shuffle hexdump:\n",
+           (unsigned long long) patterns_control_path.shuffles[0].kmasks[0],
+           (unsigned long long) patterns_control_path.shuffles[0].kmasks[1]);
+    ovs_hex_dump(stdout, &patterns_control_path.shuffles[0], MAX_PATTERN_SIZE,
+                 0, false);
+
+    return 0;
+}
+
+/* Datapath entry point for CPUs with AVX-512 but without VBMI; runs the
+ * pattern matcher with the slower 16-bit permute based shuffle.
+ * Returns non-zero if a pattern hit built the miniflow, zero on miss.
+ */
+static uint32_t
+miniflow_extract_avx512(struct dp_netdev_pmd_thread *pmd,
+                        struct dp_packet *packet,
+                        struct miniflow *mf)
+{
+    /* TODO: alloc pattern cache per PMD thread. */
+    (void)pmd;
+
+    /* Execute the pattern matching using the control-path pattern cache,
+     * with the VBMI fast path disabled.
+     */
+    const uint32_t pattern_count = 1;
+    const uint32_t vbmi_enabled = 0;
+
+    return packet_pattern_avx512(packet, mf, &patterns_control_path,
+                                 pattern_count, vbmi_enabled);
+}
+
+/* This function will only be used if AVX-512 VBMI instructions are available
+ * on the CPU. As such, we use the __target__ attribute to enable VBMI ISA.
+ */
+/* Datapath entry point for CPUs supporting AVX-512 VBMI; identical to
+ * miniflow_extract_avx512() except the native byte permute path is enabled.
+ * Returns non-zero if a pattern hit built the miniflow, zero on miss.
+ */
+static uint32_t __attribute__((__target__("avx512vbmi")))
+miniflow_extract_avx512_vbmi(struct dp_netdev_pmd_thread *pmd,
+                             struct dp_packet *packet,
+                             struct miniflow *mf)
+{
+    /* TODO: alloc pattern cache per PMD thread. */
+    (void)pmd;
+
+    /* Execute the pattern matching using the control-path pattern cache,
+     * with the VBMI fast path enabled.
+     */
+    const uint32_t pattern_count = 1;
+    const uint32_t vbmi_enabled = 1;
+
+    return packet_pattern_avx512(packet, mf, &patterns_control_path,
+                                 pattern_count, vbmi_enabled);
+}
+
+/* The study function runs the patterns from the control-path, and based on
+ * some hit statistics can copy the pattern to the per-PMD pattern cache. Part
+ * of the study() functionality is also to validate that hits on a pattern
+ * result in an identical miniflow as the scalar miniflow_extract() function.
+ * This is validated by calling the scalar version, and comparing output.
+ */
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst)
+{
+    static volatile int debug = 0;
+
+    /* Run using the user supplied patterns. */
+    uint32_t match = miniflow_extract_avx512(pmd, packet, dst);
+
+    if (debug || match) {
+        /* Save off AVX512 created dp_packet offsets for verification. */
+        struct packet_offsets vec_offsets;
+        memcpy(&vec_offsets, &packet->l2_pad_size,
+               sizeof(struct packet_offsets));
+
+        /* Check the result vs the scalar miniflow-extract for correctness. */
+        struct netdev_flow_key scalar_mf_key = {0};
+        struct miniflow *scalar_mf = &scalar_mf_key.mf;
+        miniflow_extract(packet, scalar_mf);
+
+        /* Validate miniflow data is identical. */
+        uint32_t mf_bit_count = count_1bits(scalar_mf->map.bits[0]) +
+                                count_1bits(scalar_mf->map.bits[1]);
+        size_t compare_size = sizeof(uint64_t) * (2 + mf_bit_count);
+        if (memcmp(scalar_mf, dst, compare_size)) {
+            printf("%s: Scalar miniflow output:\n", __func__);
+            ovs_hex_dump(stdout, scalar_mf, compare_size, 0, false);
+            printf("%s: AVX512 miniflow output:\n", __func__);
+            ovs_hex_dump(stdout, dst, compare_size, 0, false);
+            printf("error in miniflow compare, see hexdumps() above\n");
+        }
+
+        /* Validate that dp_packet offsets are identical. */
+        if (memcmp(&vec_offsets, &packet->l2_pad_size,
+                   sizeof(struct packet_offsets))) {
+            printf("VECTOR code DP packet properties: %d, %d, %d, %d\n",
+                   vec_offsets.l2_pad_size, vec_offsets.l2_5_ofs,
+                   vec_offsets.l3_ofs, vec_offsets.l4_ofs);
+            printf("Scalar code DP packet properties: %d, %d, %d, %d\n",
+                   packet->l2_pad_size, packet->l2_5_ofs, packet->l3_ofs,
+                   packet->l4_ofs);
+            /* A bare string literal is always true, so the previous
+             * ovs_assert(<string>) could never fire; negate the literal so
+             * the assert actually triggers on mismatch.
+             */
+            ovs_assert(!"error in packet offsets, see printf()s above");
+        }
+
+    }
+
+    /* Check if the study function should study more packets, or if it is
+     * done. When done, we change the per-PMD function pointer to the datapath
+     * implementation without study for better performance. The decrement is
+     * guarded so an already-zero (unsigned) counter cannot wrap around.
+     */
+    if (pmd->miniflow_study_pkts > 0 && --pmd->miniflow_study_pkts == 0) {
+        printf("%s : setting func ptr to remove study(), study_pkts = 0\n",
+               __func__);
+        pmd->miniflow_extract_opt = miniflow_extract_avx512;
+        if (avx512vbmi_available) {
+            pmd->miniflow_extract_opt = miniflow_extract_avx512_vbmi;
+        }
+    }
+
+    return match;
+}
+
+#endif /* SPARSE */
+#endif /* __x86_64__ */
new file mode 100644
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Include guard added: this header is pulled in from multiple translation
+ * units (dpif-netdev-avx512.c and the extract implementation itself).
+ */
+#ifndef DPIF_NETDEV_AVX512_EXTRACT_H
+#define DPIF_NETDEV_AVX512_EXTRACT_H 1
+
+#include <stdint.h>
+
+#include "flow.h"
+#include "dpif-netdev-private-thread.h"
+
+/* TODO: This function accepts a string, which represents the pattern and
+ * shuffles required for the users traffic type. Today this function has a
+ * hard-coded pattern for Ether()/IP()/UDP() packets.
+ *
+ * A future revision of this patchset will include the parsing of the input
+ * string to create the patterns, providing runtime flexibility in parsing
+ * packets into miniflows.
+ *
+ * Returns 0 on success, or -ENOTSUP if the CPU lacks the required ISA.
+ */
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string);
+
+/* The study function runs the patterns from the control-path, and based on
+ * some hit statistics can copy the pattern to the per-PMD pattern cache. Part
+ * of the study() functionality is also to validate that hits on a pattern
+ * result in an identical miniflow as the scalar miniflow_extract() function.
+ * This is validated by calling the scalar version, and comparing output.
+ *
+ * Returns non-zero if a pattern matched the packet, zero on miss.
+ */
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst);
+
+#endif /* DPIF_NETDEV_AVX512_EXTRACT_H */
@@ -34,6 +34,7 @@
#include "immintrin.h"
+#include "dpif-netdev-avx512-extract.h"
/* Structure to contain per-packet metadata that must be attributed to the
* dp netdev flow. This is unfortunate to have to track per packet, however
@@ -116,7 +117,16 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
struct dp_packet *packet = packets->packets[i];
pkt_metadata_init(&packet->md, in_port);
struct netdev_flow_key *key = &keys[i];
- miniflow_extract(packet, &key->mf);
+
+ if (pmd->miniflow_extract_opt) {
+ uint32_t matched = pmd->miniflow_extract_opt(pmd, packet,
+ &key->mf);
+ if (!matched) {
+ miniflow_extract(packet, &key->mf);
+ }
+ } else {
+ miniflow_extract(packet, &key->mf);
+ }
/* Cache TCP and byte values for all packets */
pkt_meta[i].bytes = dp_packet_size(packet);
new file mode 100644
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "dpif-netdev-private-extract.h"
+#include "openvswitch/vlog.h"
+#include "util.h"
+
+VLOG_DEFINE_THIS_MODULE(dpif_netdev_extract);
+
+/* Prototypes of the optimized implementations, defined in
+ * lib/dpif-netdev-avx512-extract.c.
+ * NOTE(review): miniflow_extract_avx512_probe() is declared here but never
+ * referenced in this file -- confirm it is still needed, or remove it.
+ */
+int32_t
+miniflow_extract_avx512_probe(void);
+
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string);
+
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst);
+
+/* Implementations of available extract opts. Entry 0 ("disable") has NULL
+ * function pointers, which callers use to turn the optimized path off.
+ */
+static struct dpif_miniflow_extract_opt mfex_impl[] = {
+    {
+        .extract_func = NULL,
+        .insert_func = NULL,
+        .name = "disable",
+    },
+
+/* Only enable AVX512 if compile time criteria are met. */
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD)
+    {
+        /* The study variant is installed first; it swaps itself out for the
+         * plain extract function once enough packets have been validated.
+         */
+        .extract_func = miniflow_extract_avx512_study,
+        .insert_func = miniflow_extract_avx512_insert,
+        .name = "avx512",
+    },
+#endif
+};
+
+
+/* Looks up an extract implementation by name.
+ * On success returns 0 and stores the implementation in '*opt'; returns
+ * -EINVAL (leaving '*opt' untouched) when no implementation matches.
+ */
+int32_t
+dpif_miniflow_extract_opt_get(const char *name,
+                              struct dpif_miniflow_extract_opt **opt)
+{
+    ovs_assert(opt);
+
+    /* Linear scan is fine: the implementation table is tiny. */
+    for (size_t idx = 0; idx < ARRAY_SIZE(mfex_impl); idx++) {
+        if (!strcmp(mfex_impl[idx].name, name)) {
+            *opt = &mfex_impl[idx];
+            return 0;
+        }
+    }
+
+    return -EINVAL;
+}
new file mode 100644
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Guard renamed to match this file (dpif-netdev-private-extract.h); the
+ * previous DPIF_NETDEV_AVX512_EXTRACT name belonged to a different header
+ * and risked a collision once that header gains its own guard.
+ */
+#ifndef DPIF_NETDEV_PRIVATE_EXTRACT
+#define DPIF_NETDEV_PRIVATE_EXTRACT 1
+
+/* For int32_t / uint32_t used below; keeps this header self-contained. */
+#include <stdint.h>
+
+/* Forward declarations. */
+struct dp_packet;
+struct miniflow;
+struct dp_netdev_pmd_thread;
+
+/* Function pointer prototype to be implemented in the optimized miniflow
+ * extract code.
+ */
+typedef uint32_t (*miniflow_extract_func)(struct dp_netdev_pmd_thread *pmd,
+                                          struct dp_packet *packet,
+                                          struct miniflow *mf);
+
+/* Function pointer prototype to be implemented by optimized miniflow extract
+ * code, to implement handling a new traffic pattern.
+ * Returns 0 on success
+ * Returns -ENOTSUP if the CPU does not support the required ISA
+ */
+typedef int32_t (*template_insert_func)(const char *pattern_string);
+
+/* Structure representing the attributes of an optimized implementation. */
+struct dpif_miniflow_extract_opt {
+    /* Function to call to extract miniflows from a packet. */
+    miniflow_extract_func extract_func;
+
+    /* Function called to insert a new traffic pattern. */
+    template_insert_func insert_func;
+
+    /* Name of the optimized implementation. Points at a string literal in
+     * the implementation table, hence const qualified.
+     */
+    const char *name;
+};
+
+/* Returns the opt structure for the requested implementation by name.
+ * Returns zero on success, and opt points to a valid struct, or
+ * returns a negative failure status.
+ * -EINVAL : invalid name requested
+ */
+int32_t
+dpif_miniflow_extract_opt_get(const char *name,
+                              struct dpif_miniflow_extract_opt **opt);
+
+#endif /* DPIF_NETDEV_PRIVATE_EXTRACT */
@@ -147,6 +147,7 @@ struct dp_netdev_actions {
struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
};
+
#ifdef __cplusplus
}
#endif
@@ -29,6 +29,7 @@
#include "openvswitch/thread.h"
#include "dpif-netdev-private-dpif.h"
+#include "dpif-netdev-private-extract.h"
#ifdef __cplusplus
extern "C" {
@@ -108,6 +109,14 @@ struct dp_netdev_pmd_thread {
/* Function pointer to call for dp_netdev_input() functionality */
dp_netdev_input_func netdev_input_func;
+ /* Function pointer to call for miniflow_extract() functionality */
+ miniflow_extract_func miniflow_extract_opt;
+ /* Number of miniflow packets to study before selecting miniflow
+ * implementation. Depending on variability in traffic, a higher number
+ * allows longer inspection of traffic to ensure all are covered.
+ */
+ uint32_t miniflow_study_pkts;
+
struct seq *reload_seq;
uint64_t last_reload_seq;
@@ -46,6 +46,7 @@
#include "dpif-netdev-lookup.h"
#include "dpif-netdev-perf.h"
#include "dpif-netdev-private-dfc.h"
+#include "dpif-netdev-private-extract.h"
#include "dpif-provider.h"
#include "dummy.h"
#include "fat-rwlock.h"
@@ -990,6 +991,109 @@ dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc,
ds_destroy(&reply);
}
+static void
+dpif_miniflow_extract_template_add(struct unixctl_conn *conn, int argc,
+                                   const char *argv[], void *aux OVS_UNUSED)
+{
+    /* This function requires just one mandatory parameter, the
+     * implementation name. Optional parameters are the template string, the
+     * study packet count, and the datapath instance.
+     */
+    const char *mfex_impl_name = argv[1];
+
+    struct dpif_miniflow_extract_opt *mf_opt = NULL;
+    int err = dpif_miniflow_extract_opt_get(mfex_impl_name, &mf_opt);
+    if (err) {
+        struct ds reply = DS_EMPTY_INITIALIZER;
+        ds_put_format(&reply, "Miniflow Extract %s not found.",
+                      mfex_impl_name);
+        const char *reply_str = ds_cstr(&reply);
+        unixctl_command_reply(conn, reply_str);
+        VLOG_INFO("%s", reply_str);
+        ds_destroy(&reply);
+        return;
+    }
+
+    /* Providing "disable" as implementation name has no insert func. */
+    if (mf_opt->insert_func) {
+        /* The command is registered with a minimum of one argument, so
+         * argv[2] must not be read unless it was actually supplied.
+         */
+        if (argc < 3) {
+            unixctl_command_reply_error(conn,
+                                        "please specify a template to insert");
+            return;
+        }
+
+        /* Insert the new pattern. There is ongoing work on designing the
+         * interaction between the string here, and the patterns in the
+         * miniflow extract optimized code.
+         */
+        const char *pattern_string = argv[2];
+        int32_t insert_err = mf_opt->insert_func(pattern_string);
+        if (OVS_UNLIKELY(insert_err)) {
+            struct ds reply = DS_EMPTY_INITIALIZER;
+
+            if (insert_err == -ENOTSUP) {
+                ds_put_format(&reply, "Miniflow Extract %s not available. "
+                              "This CPU does not support the required ISA.\n",
+                              mfex_impl_name);
+            } else {
+                ds_put_format(&reply, "Miniflow Extract %s insert failed. "
+                              "Check the pattern data and command arguments.\n",
+                              mfex_impl_name);
+            }
+
+            const char *reply_str = ds_cstr(&reply);
+            unixctl_command_reply(conn, reply_str);
+            VLOG_INFO("%s", reply_str);
+            ds_destroy(&reply);
+            return;
+        }
+    }
+
+    ovs_mutex_lock(&dp_netdev_mutex);
+    struct dp_netdev *dp = NULL;
+
+    /* Optional argument, if passed, study this number of packets. Defaults
+     * to 10k. Non-numeric or non-positive input keeps the default.
+     */
+    uint32_t study_pkts = 10000;
+    if (argc >= 4) {
+        int study_arg = atoi(argv[3]);
+        if (study_arg > 0) {
+            study_pkts = study_arg;
+        }
+    }
+
+    if (argc == 5) {
+        dp = shash_find_data(&dp_netdevs, argv[4]);
+    } else if (shash_count(&dp_netdevs) == 1) {
+        dp = shash_first(&dp_netdevs)->data;
+    }
+
+    if (!dp) {
+        ovs_mutex_unlock(&dp_netdev_mutex);
+        unixctl_command_reply_error(conn,
+                                    "please specify an existing datapath");
+        return;
+    }
+
+    /* Get PMD threads list. */
+    size_t n;
+    struct dp_netdev_pmd_thread **pmd_list;
+    sorted_poll_thread_list(dp, &pmd_list, &n);
+
+    for (size_t i = 0; i < n; i++) {
+        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
+        if (pmd->core_id == NON_PMD_CORE_ID) {
+            continue;
+        }
+
+        /* Set PMD context to study N packets. */
+        pmd->miniflow_study_pkts = study_pkts;
+
+        /* Set PMD threads DPIF implementation to requested one. */
+        pmd->miniflow_extract_opt = mf_opt->extract_func;
+    }
+    /* sorted_poll_thread_list() allocates the list; free it as other
+     * callers do -- TODO confirm ownership against the existing callers.
+     */
+    free(pmd_list);
+    ovs_mutex_unlock(&dp_netdev_mutex);
+
+    /* Reply with success to command. */
+    struct ds reply = DS_EMPTY_INITIALIZER;
+    ds_put_format(&reply, "miniflow extract opt impl %s.\n", mfex_impl_name);
+    const char *reply_str = ds_cstr(&reply);
+    unixctl_command_reply(conn, reply_str);
+    VLOG_INFO("%s", reply_str);
+    ds_destroy(&reply);
+}
+
static void
dpif_netdev_impl_set(struct unixctl_conn *conn, int argc,
const char *argv[], void *aux OVS_UNUSED)
@@ -1288,6 +1392,10 @@ dpif_netdev_init(void)
"[dpif implementation name] [dp]",
1, 2, dpif_netdev_impl_set,
NULL);
+ unixctl_command_register("dpif-netdev/miniflow-template-add",
+ "[impl name] [template] [study pkt count] [dp]",
+ 1, 4, dpif_miniflow_extract_template_add,
+ NULL);
return 0;
}
@@ -6127,6 +6235,9 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
/* Initialize the DPIF function pointer to the default scalar version */
pmd->netdev_input_func = dp_netdev_impl_get_default();
+ /* Initialize the miniflow extract function pointer not set */
+ pmd->miniflow_extract_opt = NULL;
+
/* init the 'flow_cache' since there is no
* actual thread created for NON_PMD_CORE_ID. */
if (core_id == NON_PMD_CORE_ID) {
This commit refactors the way in which the DPIF component can call the miniflow-extract function. It creates flexibility in the DPIF component by adding a function pointer at the pmd level. A new miniflow extract implementation is created which allows the AVX-512 SIMD instructions to perform the packet matching and building of the miniflow data-structure. All AVX-512 capable CPUs will be able to run the miniflow extract, however CPUs that support the AVX-512 Vector Bit Manipulation Instructions (VBMI) will benefit more as the native byte permute instruction gives extra performance. Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com> --- lib/automake.mk | 5 + lib/dpif-netdev-avx512-extract.c | 528 ++++++++++++++++++++++++++++++ lib/dpif-netdev-avx512-extract.h | 40 +++ lib/dpif-netdev-avx512.c | 12 +- lib/dpif-netdev-private-extract.c | 72 ++++ lib/dpif-netdev-private-extract.h | 60 ++++ lib/dpif-netdev-private-flow.h | 1 + lib/dpif-netdev-private-thread.h | 9 + lib/dpif-netdev.c | 111 +++++++ 9 files changed, 837 insertions(+), 1 deletion(-) create mode 100644 lib/dpif-netdev-avx512-extract.c create mode 100644 lib/dpif-netdev-avx512-extract.h create mode 100644 lib/dpif-netdev-private-extract.c create mode 100644 lib/dpif-netdev-private-extract.h