diff mbox series

[committed] aarch64: Restore vectorisation of vld1 inputs [PR109072]

Message ID mpt4jq5w1io.fsf@arm.com
State New
Headers show
Series [committed] aarch64: Restore vectorisation of vld1 inputs [PR109072] | expand

Commit Message

Richard Sandiford March 28, 2023, 11:36 a.m. UTC
Before GCC 12, we would vectorize:

  int32_t arr[] = { x, x, x, x };

at -O3.  Vectorizing the store on its own is often a loss, particularly
for integers, so g:4963079769c99c4073adfd799885410ad484cbbe suppressed it.
This was necessary to fix regressions from enabling vectorisation at -O2,

However, the vectorisation is important if the code subsequently loads
from the array using vld1:

  return vld1q_s32 (arr);

This approach of initialising an array and loading from it is the
recommend endian-agnostic way of constructing an ACLE vector.

As discussed in the PR notes, the general fix would be to fold the
store and load-back to a constructor (preferably before vectorisation).
But that's clearly not stage 4 material.

This patch instead delays folding vld1 until after inlining and
records which decls a vld1 loads from.  It then treats vector
stores to those decls as free, on the optimistic assumption that
they will be removed later.  The patch also brute-forces
vectorization of plain constructor+store sequences, since some
of the CPU costs make that (dubiously) expensive even when the
store is discounted.

Delaying folding showed that we were failing to update the vops.
The patch fixes that too.

Tested on aarch64-linux-gnu & pushed.

Thanks to Tamar for discussion & help with testing.

Richard


gcc/
	PR target/109072
	* config/aarch64/aarch64-protos.h (aarch64_vector_load_decl): Declare.
	* config/aarch64/aarch64.h (machine_function::vector_load_decls): New
	variable.
	* config/aarch64/aarch64-builtins.cc (aarch64_record_vector_load_arg):
	New function.
	(aarch64_general_gimple_fold_builtin): Delay folding of vld1 until
	after inlining.  Record which decls are loaded from.  Fix handling
	of vops for loads and stores.
	* config/aarch64/aarch64.cc (aarch64_vector_load_decl): New function.
	(aarch64_accesses_vector_load_decl_p): Likewise.
	(aarch64_vector_costs::m_stores_to_vector_load_decl): New member
	variable.
	(aarch64_vector_costs::add_stmt_cost): If the function has a vld1
	that loads from a decl, treat vector stores to those decls as
	zero cost.
	(aarch64_vector_costs::finish_cost): ...and in that case,
	if the vector code does nothing more than a store, give the
	prologue a zero cost as well.

gcc/testsuite/
	PR target/109072
	* gcc.target/aarch64/pr109072_1.c: New test.
	* gcc.target/aarch64/pr109072_2.c: Likewise.
---
 gcc/config/aarch64/aarch64-builtins.cc        |  22 ++
 gcc/config/aarch64/aarch64-protos.h           |   1 +
 gcc/config/aarch64/aarch64.cc                 |  70 ++++-
 gcc/config/aarch64/aarch64.h                  |   5 +
 gcc/testsuite/gcc.target/aarch64/pr109072_1.c | 281 ++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/pr109072_2.c |  60 ++++
 6 files changed, 435 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr109072_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr109072_2.c
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index eaefbf6774b..cc6b7c01fd1 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2994,6 +2994,19 @@  get_mem_type_for_load_store (unsigned int fcode)
   }
 }
 
+/* We've seen a vector load from address ADDR.  Record it in
+   vector_load_decls, if appropriate.  */
+static void
+aarch64_record_vector_load_arg (tree addr)
+{
+  tree decl = aarch64_vector_load_decl (addr);
+  if (!decl)
+    return;
+  if (!cfun->machine->vector_load_decls)
+    cfun->machine->vector_load_decls = hash_set<tree>::create_ggc (31);
+  cfun->machine->vector_load_decls->add (decl);
+}
+
 /* Try to fold STMT, given that it's a call to the built-in function with
    subcode FCODE.  Return the new statement on success and null on
    failure.  */
@@ -3051,6 +3064,11 @@  aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
      BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
      BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
      BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
+	/* Punt until after inlining, so that we stand more chance of
+	   recording something meaningful in vector_load_decls.  */
+	if (!cfun->after_inlining)
+	  break;
+	aarch64_record_vector_load_arg (args[0]);
 	if (!BYTES_BIG_ENDIAN)
 	  {
 	    enum aarch64_simd_type mem_type
@@ -3069,6 +3087,8 @@  aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
 				     fold_build2 (MEM_REF,
 						  access_type,
 						  args[0], zero));
+	    gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+	    gimple_set_vdef (new_stmt, gimple_vdef (stmt));
 	  }
 	break;
 
@@ -3092,6 +3112,8 @@  aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
 	      = gimple_build_assign (fold_build2 (MEM_REF, access_type,
 						  args[0], zero),
 				     args[1]);
+	    gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+	    gimple_set_vdef (new_stmt, gimple_vdef (stmt));
 	  }
 	break;
 
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f75eb892f3d..63339fa47df 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -761,6 +761,7 @@  bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem (rtx *);
 bool aarch64_expand_setmem (rtx *);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 661fff65cea..cc119d0acdd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15661,6 +15661,33 @@  aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
 
 /* Vectorizer cost model target hooks.  */
 
+/* If a vld1 from address ADDR should be recorded in vector_load_decls,
+   return the decl that should be recorded.  Return null otherwise.  */
+tree
+aarch64_vector_load_decl (tree addr)
+{
+  if (TREE_CODE (addr) != ADDR_EXPR)
+    return NULL_TREE;
+  tree base = get_base_address (TREE_OPERAND (addr, 0));
+  if (TREE_CODE (base) != VAR_DECL)
+    return NULL_TREE;
+  return base;
+}
+
+/* Return true if STMT_INFO accesses a decl that is known to be the
+   argument to a vld1 in the same function.  */
+static bool
+aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
+{
+  if (!cfun->machine->vector_load_decls)
+    return false;
+  auto dr = STMT_VINFO_DATA_REF (stmt_info);
+  if (!dr)
+    return false;
+  tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
+  return decl && cfun->machine->vector_load_decls->contains (decl);
+}
+
 /* Information about how the CPU would issue the scalar, Advanced SIMD
    or SVE version of a vector loop, using the scheme defined by the
    aarch64_base_vec_issue_info hierarchy of structures.  */
@@ -15891,6 +15918,20 @@  private:
      supported by Advanced SIMD and SVE2.  */
   bool m_has_avg = false;
 
+  /* True if the vector body contains a store to a decl and if the
+     function is known to have a vld1 from the same decl.
+
+     In the Advanced SIMD ACLE, the recommended endian-agnostic way of
+     initializing a vector is:
+
+       float f[4] = { elts };
+       float32x4_t x = vld1q_f32(f);
+
+     We should strongly prefer vectorization of the initialization of f,
+     so that the store to f and the load back can be optimized away,
+     leaving a vectorization of { elts }.  */
+  bool m_stores_to_vector_load_decl = false;
+
   /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
      - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
        SIMD code.
@@ -16907,6 +16948,18 @@  aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 	    }
 	}
     }
+
+  /* If the statement stores to a decl that is known to be the argument
+     to a vld1 in the same function, ignore the store for costing purposes.
+     See the comment above m_stores_to_vector_load_decl for more details.  */
+  if (stmt_info
+      && (kind == vector_store || kind == unaligned_store)
+      && aarch64_accesses_vector_load_decl_p (stmt_info))
+    {
+      stmt_cost = 0;
+      m_stores_to_vector_load_decl = true;
+    }
+
   return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
 }
 
@@ -17196,12 +17249,21 @@  aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
 
   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
      the scalar code in the event of a tie, since there is more chance
-     of scalar code being optimized with surrounding operations.  */
+     of scalar code being optimized with surrounding operations.
+
+     In addition, if the vector body is a simple store to a decl that
+     is elsewhere loaded using vld1, strongly prefer the vector form,
+     to the extent of giving the prologue a zero cost.  See the comment
+     above m_stores_to_vector_load_decl for details.  */
   if (!loop_vinfo
       && scalar_costs
-      && m_stp_sequence_cost != ~0U
-      && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
-    m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+      && m_stp_sequence_cost != ~0U)
+    {
+      if (m_stores_to_vector_load_decl)
+	m_costs[vect_prologue] = 0;
+      else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
+	m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+    }
 
   vector_costs::finish_cost (scalar_costs);
 }
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2403795e836..155cace6afe 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -860,6 +860,7 @@  struct GTY (()) aarch64_frame
   bool is_scs_enabled;
 };
 
+#ifdef hash_set_h
 typedef struct GTY (()) machine_function
 {
   struct aarch64_frame frame;
@@ -868,8 +869,12 @@  typedef struct GTY (()) machine_function
   /* One entry for each general purpose register.  */
   rtx call_via[SP_REGNUM];
   bool label_is_assembled;
+  /* A set of all decls that have been passed to a vld1 intrinsic in the
+     current function.  This is used to help guide the vector cost model.  */
+  hash_set<tree> *vector_load_decls;
 } machine_function;
 #endif
+#endif
 
 /* Which ABI to use.  */
 enum aarch64_abi_type
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
new file mode 100644
index 00000000000..6c1d2b0bdcc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
@@ -0,0 +1,281 @@ 
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target aarch64_little_endian } } } */
+
+#include <arm_neon.h>
+
+/*
+** s32x2_1:
+**	dup	v0\.2s, w0
+**	ret
+*/
+int32x2_t
+s32x2_1 (int32_t x)
+{
+  int32_t arr[] = { x, x };
+  return vld1_s32 (arr);
+}
+
+/*
+** s32x2_2:
+**	fmov	s0, w0
+**	ret
+*/
+int32x2_t
+s32x2_2 (int32_t x)
+{
+  int32_t arr[] = { x, 0 };
+  return vld1_s32 (arr);
+}
+
+/*
+** s32x2_3:
+**	fmov	s0, w0
+**	ins	v0\.s\[1\], w1
+**	ret
+*/
+int32x2_t
+s32x2_3 (int32_t x, int32_t y)
+{
+  int32_t arr[] = { x, y };
+  return vld1_s32 (arr);
+}
+
+/*
+** f32x2_1:
+**	dup	v0\.2s, v0.s\[0\]
+**	ret
+*/
+float32x2_t
+f32x2_1 (float32_t x)
+{
+  float32_t arr[] = { x, x };
+  return vld1_f32 (arr);
+}
+
+/*
+** f32x2_2:
+**	ins	v0\.s\[1\], v1.s\[0\]
+**	ret
+*/
+float32x2_t
+f32x2_2 (float32_t x, float32_t y)
+{
+  float32_t arr[] = { x, y };
+  return vld1_f32 (arr);
+}
+
+/*
+** s16x4_1:
+**	dup	v0\.4h, w0
+**	ret
+*/
+int16x4_t
+s16x4_1 (int16_t x)
+{
+  int16_t arr[] = { x, x, x, x };
+  return vld1_s16 (arr);
+}
+
+/*
+** s16x4_2:
+**	...
+**	fmov	[dsh]0, [wx][0-9]+
+**	ret
+*/
+int16x4_t
+s16x4_2 (int16_t x)
+{
+  int16_t arr[] = { x, 0, 0, 0 };
+  return vld1_s16 (arr);
+}
+
+/*
+** s16x4_3:
+**	dup	v0\.4h, w1
+**	ins	v0.h\[0\], w0
+**	ret
+*/
+int16x4_t
+s16x4_3 (int16_t x, int16_t y)
+{
+  int16_t arr[] = { x, y, y, y };
+  return vld1_s16 (arr);
+}
+
+/*
+** f16x4_1:
+**	dup	v0\.4h, v0.h\[0\]
+**	ret
+*/
+float16x4_t
+f16x4_1 (float16_t x)
+{
+  float16_t arr[] = { x, x, x, x };
+  return vld1_f16 (arr);
+}
+
+/*
+** s64x2_1:
+**	dup	v0\.2d, x0
+**	ret
+*/
+int64x2_t
+s64x2_1 (int64_t x)
+{
+  int64_t arr[] = { x, x };
+  return vld1q_s64 (arr);
+}
+
+/*
+** s64x2_2: { xfail *-*-* }
+**	fmov	d0, x0
+**	ret
+*/
+int64x2_t
+s64x2_2 (int64_t x)
+{
+  int64_t arr[] = { x, 0 };
+  return vld1q_s64 (arr);
+}
+
+/*
+** s64x2_3:
+**	fmov	d0, x0
+**	ins	v0\.d\[1\], x1
+**	ret
+*/
+int64x2_t
+s64x2_3 (int64_t x, int64_t y)
+{
+  int64_t arr[] = { x, y };
+  return vld1q_s64 (arr);
+}
+
+/*
+** f64x2_1:
+**	dup	v0\.2d, v0.d\[0\]
+**	ret
+*/
+float64x2_t
+f64x2_1 (float64_t x)
+{
+  float64_t arr[] = { x, x };
+  return vld1q_f64 (arr);
+}
+
+/*
+** f64x2_2:
+**	ins	v0\.d\[1\], v1.d\[0\]
+**	ret
+*/
+float64x2_t
+f64x2_2 (float64_t x, float64_t y)
+{
+  float64_t arr[] = { x, y };
+  return vld1q_f64 (arr);
+}
+
+/*
+** s32x4_1:
+**	dup	v0\.4s, w0
+**	ret
+*/
+int32x4_t
+s32x4_1 (int32_t x)
+{
+  int32_t arr[] = { x, x, x, x };
+  return vld1q_s32 (arr);
+}
+
+/*
+** s32x4_2: { xfail *-*-* }
+**	fmov	s0, w0
+**	ret
+*/
+int32x4_t
+s32x4_2 (int32_t x)
+{
+  int32_t arr[] = { x, 0, 0, 0 };
+  return vld1q_s32 (arr);
+}
+
+/*
+** s32x4_3:
+**	dup	v0\.4s, w1
+**	ins	v0.s\[0\], w0
+**	ret
+*/
+int32x4_t
+s32x4_3 (int32_t x, int32_t y)
+{
+  int32_t arr[] = { x, y, y, y };
+  return vld1q_s32 (arr);
+}
+
+/*
+** f32x4_1:
+**	dup	v0\.4s, v0.s\[0\]
+**	ret
+*/
+float32x4_t
+f32x4_1 (float32_t x)
+{
+  float32_t arr[] = { x, x, x, x };
+  return vld1q_f32 (arr);
+}
+
+void consume (float32x4_t, float32x4_t, float32x4_t, float32x4_t);
+
+/*
+** produce_1:
+** (
+**	dup	v0\.4s, v0\.s\[0\]
+**	dup	v1\.4s, v1\.s\[0\]
+**	dup	v2\.4s, v2\.s\[0\]
+**	dup	v3\.4s, v3\.s\[0\]
+** |
+**	dup	v3\.4s, v3\.s\[0\]
+**	dup	v2\.4s, v2\.s\[0\]
+**	dup	v1\.4s, v1\.s\[0\]
+**	dup	v0\.4s, v0\.s\[0\]
+** )
+**	b	consume
+*/
+void
+produce_1 (float32_t a, float32_t b, float32_t c, float32_t d)
+{
+  float arr[4][4] = {
+    { a, a, a, a },
+    { b, b, b, b },
+    { c, c, c, c },
+    { d, d, d, d }
+  };
+  consume (vld1q_f32 (arr[0]), vld1q_f32 (arr[1]),
+	   vld1q_f32 (arr[2]), vld1q_f32 (arr[3]));
+}
+
+/*
+** produce_2:
+** (
+**	dup	v0\.4s, v0\.s\[0\]
+**	dup	v1\.4s, v1\.s\[0\]
+**	dup	v2\.4s, v2\.s\[0\]
+**	dup	v3\.4s, v3\.s\[0\]
+** |
+**	dup	v3\.4s, v3\.s\[0\]
+**	dup	v2\.4s, v2\.s\[0\]
+**	dup	v1\.4s, v1\.s\[0\]
+**	dup	v0\.4s, v0\.s\[0\]
+** )
+**	b	consume
+*/
+void
+produce_2 (float32_t a, float32_t b, float32_t c, float32_t d)
+{
+  float arr0[] = { a, a, a, a };
+  float arr1[] = { b, b, b, b };
+  float arr2[] = { c, c, c, c };
+  float arr3[] = { d, d, d, d };
+  consume (vld1q_f32 (arr0), vld1q_f32 (arr1),
+	   vld1q_f32 (arr2), vld1q_f32 (arr3));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_2.c b/gcc/testsuite/gcc.target/aarch64/pr109072_2.c
new file mode 100644
index 00000000000..d532f08aa0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_2.c
@@ -0,0 +1,60 @@ 
+/* { dg-options "-O" } */
+
+#pragma GCC target "arch=armv8.2-a+dotprod"
+
+#include <arm_neon.h>
+
+static inline uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+  return vaddvq_u32(a);
+}
+
+static inline unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int w, int h,
+                                           const uint8_t *second_pred) {
+
+
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      p0 = vld1q_u8(second_pred);
+      avg0 = vrhaddq_u8(r0, p0);
+      diff0 = vabdq_u8(s0, avg0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      p1 = vld1q_u8(second_pred + 16);
+      avg1 = vrhaddq_u8(r1, p1);
+      diff1 = vabdq_u8(s1, avg1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+      second_pred += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+                         second_pred);
+}
+
+uint32_t vpx_sad32x16_avg_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { return sad32xh_avg_neon(src, src_stride, ref, ref_stride, (16), second_pred); }