[committed] aarch64: Restore vectorisation of vld1 inputs [PR109072]

Message ID	mpt4jq5w1io.fsf@arm.com
State	New
Headers	show Return-Path: <gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org> DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 209883858C54 To: gcc-patches@gcc.gnu.org Mail-Followup-To: gcc-patches@gcc.gnu.org, richard.sandiford@arm.com Subject: [committed] aarch64: Restore vectorisation of vld1 inputs [PR109072] Date: Tue, 28 Mar 2023 12:36:47 +0100 Message-ID: <mpt4jq5w1io.fsf@arm.com> User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/26.3 (gnu/linux) MIME-Version: 1.0 Content-Type: text/plain Precedence: list From: Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> Reply-To: Richard Sandiford <richard.sandiford@arm.com> Errors-To: gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org Sender: "Gcc-patches" <gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org>
Series	[committed] aarch64: Restore vectorisation of vld1 inputs [PR109072] \| expand [committed] aarch64: Restore vectorisation of vld1 inputs [PR109072]

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index eaefbf6774b..cc6b7c01fd1 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -2994,6 +2994,19 @@ get_mem_type_for_load_store (unsigned int fcode) } } +/* We've seen a vector load from address ADDR. Record it in + vector_load_decls, if appropriate. */ +static void +aarch64_record_vector_load_arg (tree addr) +{ + tree decl = aarch64_vector_load_decl (addr); + if (!decl) + return; + if (!cfun->machine->vector_load_decls) + cfun->machine->vector_load_decls = hash_set<tree>::create_ggc (31); + cfun->machine->vector_load_decls->add (decl); +} + /* Try to fold STMT, given that it's a call to the built-in function with subcode FCODE. Return the new statement on success and null on failure. */ @@ -3051,6 +3064,11 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD) BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD) + /* Punt until after inlining, so that we stand more chance of + recording something meaningful in vector_load_decls. */ + if (!cfun->after_inlining) + break; + aarch64_record_vector_load_arg (args[0]); if (!BYTES_BIG_ENDIAN) { enum aarch64_simd_type mem_type @@ -3069,6 +3087,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, fold_build2 (MEM_REF, access_type, args[0], zero)); + gimple_set_vuse (new_stmt, gimple_vuse (stmt)); + gimple_set_vdef (new_stmt, gimple_vdef (stmt)); } break; @@ -3092,6 +3112,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, = gimple_build_assign (fold_build2 (MEM_REF, access_type, args[0], zero), args[1]); + gimple_set_vuse (new_stmt, gimple_vuse (stmt)); + gimple_set_vdef (new_stmt, gimple_vdef (stmt)); } break; diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index f75eb892f3d..63339fa47df 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -761,6 +761,7 @@ bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT, bool aarch64_constant_address_p (rtx); bool aarch64_emit_approx_div (rtx, rtx, rtx); bool aarch64_emit_approx_sqrt (rtx, rtx, bool); +tree aarch64_vector_load_decl (tree); void aarch64_expand_call (rtx, rtx, rtx, bool); bool aarch64_expand_cpymem (rtx *); bool aarch64_expand_setmem (rtx *); diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 661fff65cea..cc119d0acdd 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -15661,6 +15661,33 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, /* Vectorizer cost model target hooks. */ +/* If a vld1 from address ADDR should be recorded in vector_load_decls, + return the decl that should be recorded. Return null otherwise. */ +tree +aarch64_vector_load_decl (tree addr) +{ + if (TREE_CODE (addr) != ADDR_EXPR) + return NULL_TREE; + tree base = get_base_address (TREE_OPERAND (addr, 0)); + if (TREE_CODE (base) != VAR_DECL) + return NULL_TREE; + return base; +} + +/* Return true if STMT_INFO accesses a decl that is known to be the + argument to a vld1 in the same function. */ +static bool +aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info) +{ + if (!cfun->machine->vector_load_decls) + return false; + auto dr = STMT_VINFO_DATA_REF (stmt_info); + if (!dr) + return false; + tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr)); + return decl && cfun->machine->vector_load_decls->contains (decl); +} + /* Information about how the CPU would issue the scalar, Advanced SIMD or SVE version of a vector loop, using the scheme defined by the aarch64_base_vec_issue_info hierarchy of structures. */ @@ -15891,6 +15918,20 @@ private: supported by Advanced SIMD and SVE2. */ bool m_has_avg = false; + /* True if the vector body contains a store to a decl and if the + function is known to have a vld1 from the same decl. + + In the Advanced SIMD ACLE, the recommended endian-agnostic way of + initializing a vector is: + + float f[4] = { elts }; + float32x4_t x = vld1q_f32(f); + + We should strongly prefer vectorization of the initialization of f, + so that the store to f and the load back can be optimized away, + leaving a vectorization of { elts }. */ + bool m_stores_to_vector_load_decl = false; + /* - If M_VEC_FLAGS is zero then we're costing the original scalar code. - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced SIMD code. @@ -16907,6 +16948,18 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, } } } + + /* If the statement stores to a decl that is known to be the argument + to a vld1 in the same function, ignore the store for costing purposes. + See the comment above m_stores_to_vector_load_decl for more details. */ + if (stmt_info + && (kind == vector_store || kind == unaligned_store) + && aarch64_accesses_vector_load_decl_p (stmt_info)) + { + stmt_cost = 0; + m_stores_to_vector_load_decl = true; + } + return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ()); } @@ -17196,12 +17249,21 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs) /* Apply the heuristic described above m_stp_sequence_cost. Prefer the scalar code in the event of a tie, since there is more chance - of scalar code being optimized with surrounding operations. */ + of scalar code being optimized with surrounding operations. + + In addition, if the vector body is a simple store to a decl that + is elsewhere loaded using vld1, strongly prefer the vector form, + to the extent of giving the prologue a zero cost. See the comment + above m_stores_to_vector_load_decl for details. */ if (!loop_vinfo && scalar_costs - && m_stp_sequence_cost != ~0U - && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost) - m_costs[vect_body] = 2 * scalar_costs->total_cost (); + && m_stp_sequence_cost != ~0U) + { + if (m_stores_to_vector_load_decl) + m_costs[vect_prologue] = 0; + else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost) + m_costs[vect_body] = 2 * scalar_costs->total_cost (); + } vector_costs::finish_cost (scalar_costs); } diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 2403795e836..155cace6afe 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -860,6 +860,7 @@ struct GTY (()) aarch64_frame bool is_scs_enabled; }; +#ifdef hash_set_h typedef struct GTY (()) machine_function { struct aarch64_frame frame; @@ -868,8 +869,12 @@ typedef struct GTY (()) machine_function /* One entry for each general purpose register. */ rtx call_via[SP_REGNUM]; bool label_is_assembled; + /* A set of all decls that have been passed to a vld1 intrinsic in the + current function. This is used to help guide the vector cost model. */ + hash_set<tree> *vector_load_decls; } machine_function; #endif +#endif /* Which ABI to use. */ enum aarch64_abi_type diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c new file mode 100644 index 00000000000..6c1d2b0bdcc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c @@ -0,0 +1,281 @@ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" "" { target aarch64_little_endian } } } */ + +#include <arm_neon.h> + +/* +** s32x2_1: +** dup v0\.2s, w0 +** ret +*/ +int32x2_t +s32x2_1 (int32_t x) +{ + int32_t arr[] = { x, x }; + return vld1_s32 (arr); +} + +/* +** s32x2_2: +** fmov s0, w0 +** ret +*/ +int32x2_t +s32x2_2 (int32_t x) +{ + int32_t arr[] = { x, 0 }; + return vld1_s32 (arr); +} + +/* +** s32x2_3: +** fmov s0, w0 +** ins v0\.s\[1\], w1 +** ret +*/ +int32x2_t +s32x2_3 (int32_t x, int32_t y) +{ + int32_t arr[] = { x, y }; + return vld1_s32 (arr); +} + +/* +** f32x2_1: +** dup v0\.2s, v0.s\[0\] +** ret +*/ +float32x2_t +f32x2_1 (float32_t x) +{ + float32_t arr[] = { x, x }; + return vld1_f32 (arr); +} + +/* +** f32x2_2: +** ins v0\.s\[1\], v1.s\[0\] +** ret +*/ +float32x2_t +f32x2_2 (float32_t x, float32_t y) +{ + float32_t arr[] = { x, y }; + return vld1_f32 (arr); +} + +/* +** s16x4_1: +** dup v0\.4h, w0 +** ret +*/ +int16x4_t +s16x4_1 (int16_t x) +{ + int16_t arr[] = { x, x, x, x }; + return vld1_s16 (arr); +} + +/* +** s16x4_2: +** ... +** fmov [dsh]0, [wx][0-9]+ +** ret +*/ +int16x4_t +s16x4_2 (int16_t x) +{ + int16_t arr[] = { x, 0, 0, 0 }; + return vld1_s16 (arr); +} + +/* +** s16x4_3: +** dup v0\.4h, w1 +** ins v0.h\[0\], w0 +** ret +*/ +int16x4_t +s16x4_3 (int16_t x, int16_t y) +{ + int16_t arr[] = { x, y, y, y }; + return vld1_s16 (arr); +} + +/* +** f16x4_1: +** dup v0\.4h, v0.h\[0\] +** ret +*/ +float16x4_t +f16x4_1 (float16_t x) +{ + float16_t arr[] = { x, x, x, x }; + return vld1_f16 (arr); +} + +/* +** s64x2_1: +** dup v0\.2d, x0 +** ret +*/ +int64x2_t +s64x2_1 (int64_t x) +{ + int64_t arr[] = { x, x }; + return vld1q_s64 (arr); +} + +/* +** s64x2_2: { xfail *-*-* } +** fmov d0, x0 +** ret +*/ +int64x2_t +s64x2_2 (int64_t x) +{ + int64_t arr[] = { x, 0 }; + return vld1q_s64 (arr); +} + +/* +** s64x2_3: +** fmov d0, x0 +** ins v0\.d\[1\], x1 +** ret +*/ +int64x2_t +s64x2_3 (int64_t x, int64_t y) +{ + int64_t arr[] = { x, y }; + return vld1q_s64 (arr); +} + +/* +** f64x2_1: +** dup v0\.2d, v0.d\[0\] +** ret +*/ +float64x2_t +f64x2_1 (float64_t x) +{ + float64_t arr[] = { x, x }; + return vld1q_f64 (arr); +} + +/* +** f64x2_2: +** ins v0\.d\[1\], v1.d\[0\] +** ret +*/ +float64x2_t +f64x2_2 (float64_t x, float64_t y) +{ + float64_t arr[] = { x, y }; + return vld1q_f64 (arr); +} + +/* +** s32x4_1: +** dup v0\.4s, w0 +** ret +*/ +int32x4_t +s32x4_1 (int32_t x) +{ + int32_t arr[] = { x, x, x, x }; + return vld1q_s32 (arr); +} + +/* +** s32x4_2: { xfail *-*-* } +** fmov s0, w0 +** ret +*/ +int32x4_t +s32x4_2 (int32_t x) +{ + int32_t arr[] = { x, 0, 0, 0 }; + return vld1q_s32 (arr); +} + +/* +** s32x4_3: +** dup v0\.4s, w1 +** ins v0.s\[0\], w0 +** ret +*/ +int32x4_t +s32x4_3 (int32_t x, int32_t y) +{ + int32_t arr[] = { x, y, y, y }; + return vld1q_s32 (arr); +} + +/* +** f32x4_1: +** dup v0\.4s, v0.s\[0\] +** ret +*/ +float32x4_t +f32x4_1 (float32_t x) +{ + float32_t arr[] = { x, x, x, x }; + return vld1q_f32 (arr); +} + +void consume (float32x4_t, float32x4_t, float32x4_t, float32x4_t); + +/* +** produce_1: +** ( +** dup v0\.4s, v0\.s\[0\] +** dup v1\.4s, v1\.s\[0\] +** dup v2\.4s, v2\.s\[0\] +** dup v3\.4s, v3\.s\[0\] +** | +** dup v3\.4s, v3\.s\[0\] +** dup v2\.4s, v2\.s\[0\] +** dup v1\.4s, v1\.s\[0\] +** dup v0\.4s, v0\.s\[0\] +** ) +** b consume +*/ +void +produce_1 (float32_t a, float32_t b, float32_t c, float32_t d) +{ + float arr[4][4] = { + { a, a, a, a }, + { b, b, b, b }, + { c, c, c, c }, + { d, d, d, d } + }; + consume (vld1q_f32 (arr[0]), vld1q_f32 (arr[1]), + vld1q_f32 (arr[2]), vld1q_f32 (arr[3])); +} + +/* +** produce_2: +** ( +** dup v0\.4s, v0\.s\[0\] +** dup v1\.4s, v1\.s\[0\] +** dup v2\.4s, v2\.s\[0\] +** dup v3\.4s, v3\.s\[0\] +** | +** dup v3\.4s, v3\.s\[0\] +** dup v2\.4s, v2\.s\[0\] +** dup v1\.4s, v1\.s\[0\] +** dup v0\.4s, v0\.s\[0\] +** ) +** b consume +*/ +void +produce_2 (float32_t a, float32_t b, float32_t c, float32_t d) +{ + float arr0[] = { a, a, a, a }; + float arr1[] = { b, b, b, b }; + float arr2[] = { c, c, c, c }; + float arr3[] = { d, d, d, d }; + consume (vld1q_f32 (arr0), vld1q_f32 (arr1), + vld1q_f32 (arr2), vld1q_f32 (arr3)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_2.c b/gcc/testsuite/gcc.target/aarch64/pr109072_2.c new file mode 100644 index 00000000000..d532f08aa0c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr109072_2.c @@ -0,0 +1,60 @@ +/* { dg-options "-O" } */ + +#pragma GCC target "arch=armv8.2-a+dotprod" + +#include <arm_neon.h> + +static inline uint32_t horizontal_add_uint32x4(const uint32x4_t a) { + return vaddvq_u32(a); +} + +static inline unsigned int sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + + + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +uint32_t vpx_sad32x16_avg_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { return sad32xh_avg_neon(src, src_stride, ref, ref_stride, (16), second_pred); }

[committed] aarch64: Restore vectorisation of vld1 inputs [PR109072]

Commit Message

Patch