Message ID | 20220614115743.1143341-9-emma.finn@intel.com |
---|---|
State | Changes Requested |
Headers | show |
Series | [ovs-dev,v7,01/11] ofproto-dpif: Fix incorrect checksums in input packets | expand |
Context | Check | Description |
---|---|---|
ovsrobot/apply-robot | success | apply and check: success |
ovsrobot/github-robot-_Build_and_Test | success | github build: passed |
ovsrobot/intel-ovs-compilation | success | test: success |
On 14 Jun 2022, at 13:57, Emma Finn wrote: > This commit adds the AVX512 implementation of the > pop_vlan action. > > Signed-off-by: Emma Finn <emma.finn@intel.com> > --- > lib/odp-execute-avx512.c | 91 ++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 91 insertions(+) > > diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c > index 1fb334689..f9e2b1727 100644 > --- a/lib/odp-execute-avx512.c > +++ b/lib/odp-execute-avx512.c > @@ -14,6 +14,11 @@ > * limitations under the License. > */ > > +#ifdef __x86_64__ > +/* Sparse cannot handle the AVX512 instructions. */ > +#if !defined(__CHECKER__) > + > + > #include <config.h> > #include <errno.h> > > @@ -24,6 +29,86 @@ > #include "odp-netlink.h" > #include "openvswitch/vlog.h" > > +VLOG_DEFINE_THIS_MODULE(odp_execute_avx512); I'd asked to add comment so that anyone without avx512 knowledge can read and understand the code. Guess this is not fully accomplished. I've attached a diff at the end, with what I think would help people to understand what's going on. Also you do not catch the issue if some one adds a member between l2_pad_size and l2_5_ofs. I'll add some code for this to my patch also. > +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) + > + MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) == > + offsetof(struct dp_packet, l3_ofs)); > + > +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) + > + MEMBER_SIZEOF(struct dp_packet, l3_ofs) == > + offsetof(struct dp_packet, l4_ofs)); > + We also need a build assert to make sure we can safely read 128 bytes from l2_pad_size. I will add it together with my comment changes below. > +/* Adjust the size of the l2 portion of the dp_packet, updating the l2 > + * pointer and the layer offsets. The function will broadcast resize_by_bytes > + * across a register and uses a kmask to identify which lanes should be > + * incremented/decremented. Either an add or subtract will be performed > + * and the result is stored back to the original packet. */ > +static inline void ALWAYS_INLINE > +avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) > +{ > + /* Update packet size/data pointers */ > + if (resize_by_bytes >= 0) { > + dp_packet_prealloc_headroom(b, resize_by_bytes); > + } else { > + ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >= > + -resize_by_bytes); > + } > + > + dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes); > + dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes); > + Please use the available macro's for this as suggested earlier to make the code more common: @@ -48,15 +48,11 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) { /* Update packet size/data pointers */ if (resize_by_bytes >= 0) { - dp_packet_prealloc_headroom(b, resize_by_bytes); + dp_packet_push_uninit(b, resize_by_bytes); } else { - ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >= - -resize_by_bytes); + dp_packet_pull(b, -resize_by_bytes); } - dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes); - dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes); - > + const __m128i v_zeros = _mm_setzero_si128(); > + const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros); > + > + const uint8_t k_lanes = 0b1110; > + __m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes)); > + > + /* Load 128 bits from the dp_packet structure starting at the l2_pad_size > + * offset. */ > + void *adjust_ptr = &b->l2_pad_size; > + __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr); > + > + __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src, > + v_u16_max); > + > + __m128i v_adjust_wip; > + > + if (resize_by_bytes >= 0) { > + v_adjust_wip = _mm_mask_add_epi16(v_adjust_src, k_cmp, > + v_adjust_src, v_offset); > + } else { > + v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp, > + v_adjust_src, v_offset); > + } > + > + _mm_storeu_si128(adjust_ptr, v_adjust_wip); > +} > + > +/* This function will load the entire vlan_eth_header into a 128-bit wide > + * register. Then use an 8-byte realign to shift the header right by 12 bytes > + * to remove the vlan header and store the results back to the orginal header. > + */ > +static void > +action_avx512_pop_vlan(struct dp_packet_batch *batch, > + const struct nlattr *a OVS_UNUSED) > +{ > + struct dp_packet *packet; > + > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + struct vlan_eth_header *veh = dp_packet_eth(packet); > + > + if (veh && dp_packet_size(packet) >= sizeof *veh && > + eth_type_vlan(veh->veth_type)) { > + > + __m128i v_ether = _mm_loadu_si128((void *) veh); > + __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(), As I'm not a AVX expert, but would it be more effective to load a register with _mm_setzero_si128(), and reuse it in every iteration? > + 16 - VLAN_HEADER_LEN); I think we should replace the 16 with sizeof(__m128i) as it give an idea what it represents, i.e. number of bits from b. > + _mm_storeu_si128((void *) veh, v_realign); > + avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN); > + } > + } > +} > + > /* Probe functions to check ISA requirements. */ > static bool > avx512_isa_probe(void) > @@ -52,5 +137,11 @@ action_avx512_init(struct odp_execute_action_impl *self) > return -ENOTSUP; > } > > + /* Set function pointers for actions that can be applied directly, these > + * are identified by OVS_ACTION_ATTR_*. */ > + self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan; > return 0; > } > + > +#endif > +#endif > -- > 2.32.0 Here is my full diff with the enhanced comments: diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 473d2c666..9b68ee6b3 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -30,6 +30,15 @@ #include "openvswitch/vlog.h" VLOG_DEFINE_THIS_MODULE(odp_execute_avx512); + +/* The below three build asserts make sure that l2_5_ofs, l3_ofs, and l4_ofs + * fields remain in the same order and offset to l2_padd_size. This is needed + * as the avx512_dp_packet_resize_l2() function will manipulate those fields at + * a fixed memory index based on the l2_padd_size offset. */ +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_pad_size) + + MEMBER_SIZEOF(struct dp_packet, l2_pad_size) == + offsetof(struct dp_packet, l2_5_ofs)); + BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) + MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) == offsetof(struct dp_packet, l3_ofs)); @@ -38,39 +47,61 @@ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) + MEMBER_SIZEOF(struct dp_packet, l3_ofs) == offsetof(struct dp_packet, l4_ofs)); -/* Adjust the size of the l2 portion of the dp_packet, updating the l2 - * pointer and the layer offsets. The function will broadcast resize_by_bytes - * across a register and uses a kmask to identify which lanes should be - * incremented/decremented. Either an add or subtract will be performed - * and the result is stored back to the original packet. */ +/* The below build assert makes sure it's safe to read/write 128-bits starting + * at the l2_pad_size location. */ +BUILD_ASSERT_DECL(sizeof(struct dp_packet) - + offsetof(struct dp_packet, l2_pad_size) >= sizeof(__m128i)); + + static inline void ALWAYS_INLINE avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) { - /* Update packet size/data pointers */ + /* Update packet size/data pointers, same as the scalar implementation. */ if (resize_by_bytes >= 0) { - dp_packet_prealloc_headroom(b, resize_by_bytes); + dp_packet_push_uninit(b, resize_by_bytes); } else { - ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >= - -resize_by_bytes); + dp_packet_pull(b, -resize_by_bytes); } - dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes); - dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes); + /* The next step is to update the l2_5_ofs, l3_ofs and l4_ofs fields which + * the scalar implementation does with the dp_packet_adjust_layer_offset() + * function. */ + /* Set the v_zero register to all zero's. */ const __m128i v_zeros = _mm_setzero_si128(); + + /* Set the v_u16_max register to all one's. */ const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros); + /* Each lane represents 16 bits in a 12- bit register. In this case the + * first three 16-bit values, which will map to the l2_5_ofs, l3_ofs and + * l4_ofs fields. */ const uint8_t k_lanes = 0b1110; + + /* Set all 16-bit words in the 128-bits v_offset register to the value we + * need to add/substract from the l2_5_ofs, l3_ofs, and l4_ofs fields. */ __m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes)); - /* Load 128 bits from the dp_packet structure starting at the l2_pad_size + /* Load 128-bits from the dp_packet structure starting at the l2_pad_size * offset. */ void *adjust_ptr = &b->l2_pad_size; __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr); + /* Here is the tricky part, we only need to update the value of the three + * fields if they are not UINT16_MAX. The following function will return + * a mask of lanes (read fields) that are not UINT16_MAX. It will do this + * by comparing only the lanes we requested, k_lanes, and if they match + * v_u16_max, the bit will be set. */ __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src, v_u16_max); + /* Based on the bytes adjust (positive, or negative) it will do the actual + * add or subtraction. These functions will only operate on the lanes + * (fields) requested based on k_cmp, i.e: + * k_cmp = [l2_5_ofs, l3_ofs, l4_ofs] + * for field in kcmp + * v_adjust_src[field] = v_adjust_src[field] + v_offset + */ __m128i v_adjust_wip; if (resize_by_bytes >= 0) { @@ -81,13 +112,12 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) v_adjust_src, v_offset); } + /* Here we write back the full 128-bits. */ _mm_storeu_si128(adjust_ptr, v_adjust_wip); } -/* This function will load the entire vlan_eth_header into a 128-bit wide - * register. Then use an 8-byte realign to shift the header right by 12 bytes - * to remove the vlan header and store the results back to the orginal header. - */ +/* This function performance the same operation on each packet in the batch as + * the scalar eth_pop_vlan() function. */ static void action_avx512_pop_vlan(struct dp_packet_batch *batch, const struct nlattr *a OVS_UNUSED) @@ -100,10 +130,25 @@ action_avx512_pop_vlan(struct dp_packet_batch *batch, if (veh && dp_packet_size(packet) >= sizeof *veh && eth_type_vlan(veh->veth_type)) { + /* Load the first 128-bits of l2 header into the v_ether register. + * This result in the veth_dst/src and veth_type/tci of the + * vlan_eth_header structure to be loaded. */ __m128i v_ether = _mm_loadu_si128((void *) veh); + + /* This creates a 256-bit value containing the first four fields + * of the vlan_eth_header plus 128 zero-bit. The result will be the + * lowest 128-bits after the right shift, hence we shift the data + * 128(zero)-bits minus the VLAN_HEADER_LEN, so we are left with + * only the veth_dst and veth_src fields. */ __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(), - 16 - VLAN_HEADER_LEN); + sizeof(__m128i) - + VLAN_HEADER_LEN); + + /* Write back the modified ethernet header. */ _mm_storeu_si128((void *) veh, v_realign); + + /* As we removed the VLAN_HEADER we now need to adjust all the + * offsets. */ avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN); } }
diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 1fb334689..f9e2b1727 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -14,6 +14,11 @@ * limitations under the License. */ +#ifdef __x86_64__ +/* Sparse cannot handle the AVX512 instructions. */ +#if !defined(__CHECKER__) + + #include <config.h> #include <errno.h> @@ -24,6 +29,86 @@ #include "odp-netlink.h" #include "openvswitch/vlog.h" +VLOG_DEFINE_THIS_MODULE(odp_execute_avx512); +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) + + MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) == + offsetof(struct dp_packet, l3_ofs)); + +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) + + MEMBER_SIZEOF(struct dp_packet, l3_ofs) == + offsetof(struct dp_packet, l4_ofs)); + +/* Adjust the size of the l2 portion of the dp_packet, updating the l2 + * pointer and the layer offsets. The function will broadcast resize_by_bytes + * across a register and uses a kmask to identify which lanes should be + * incremented/decremented. Either an add or subtract will be performed + * and the result is stored back to the original packet. */ +static inline void ALWAYS_INLINE +avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) +{ + /* Update packet size/data pointers */ + if (resize_by_bytes >= 0) { + dp_packet_prealloc_headroom(b, resize_by_bytes); + } else { + ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >= + -resize_by_bytes); + } + + dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes); + dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes); + + const __m128i v_zeros = _mm_setzero_si128(); + const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros); + + const uint8_t k_lanes = 0b1110; + __m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes)); + + /* Load 128 bits from the dp_packet structure starting at the l2_pad_size + * offset. */ + void *adjust_ptr = &b->l2_pad_size; + __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr); + + __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src, + v_u16_max); + + __m128i v_adjust_wip; + + if (resize_by_bytes >= 0) { + v_adjust_wip = _mm_mask_add_epi16(v_adjust_src, k_cmp, + v_adjust_src, v_offset); + } else { + v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp, + v_adjust_src, v_offset); + } + + _mm_storeu_si128(adjust_ptr, v_adjust_wip); +} + +/* This function will load the entire vlan_eth_header into a 128-bit wide + * register. Then use an 8-byte realign to shift the header right by 12 bytes + * to remove the vlan header and store the results back to the orginal header. + */ +static void +action_avx512_pop_vlan(struct dp_packet_batch *batch, + const struct nlattr *a OVS_UNUSED) +{ + struct dp_packet *packet; + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + struct vlan_eth_header *veh = dp_packet_eth(packet); + + if (veh && dp_packet_size(packet) >= sizeof *veh && + eth_type_vlan(veh->veth_type)) { + + __m128i v_ether = _mm_loadu_si128((void *) veh); + __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(), + 16 - VLAN_HEADER_LEN); + _mm_storeu_si128((void *) veh, v_realign); + avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN); + } + } +} + /* Probe functions to check ISA requirements. */ static bool avx512_isa_probe(void) @@ -52,5 +137,11 @@ action_avx512_init(struct odp_execute_action_impl *self) return -ENOTSUP; } + /* Set function pointers for actions that can be applied directly, these + * are identified by OVS_ACTION_ATTR_*. */ + self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan; return 0; } + +#endif +#endif
This commit adds the AVX512 implementation of the pop_vlan action. Signed-off-by: Emma Finn <emma.finn@intel.com> --- lib/odp-execute-avx512.c | 91 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+)