@@ -2994,6 +2994,19 @@ get_mem_type_for_load_store (unsigned int fcode)
}
}
+/* We've seen a vector load from address ADDR. Record it in
+ vector_load_decls, if appropriate. */
+static void
+aarch64_record_vector_load_arg (tree addr)
+{
+ tree decl = aarch64_vector_load_decl (addr);
+ if (!decl)
+ return;
+ if (!cfun->machine->vector_load_decls)
+ cfun->machine->vector_load_decls = hash_set<tree>::create_ggc (31);
+ cfun->machine->vector_load_decls->add (decl);
+}
+
/* Try to fold STMT, given that it's a call to the built-in function with
subcode FCODE. Return the new statement on success and null on
failure. */
@@ -3051,6 +3064,11 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
+ /* Punt until after inlining, so that we stand more chance of
+ recording something meaningful in vector_load_decls. */
+ if (!cfun->after_inlining)
+ break;
+ aarch64_record_vector_load_arg (args[0]);
if (!BYTES_BIG_ENDIAN)
{
enum aarch64_simd_type mem_type
@@ -3069,6 +3087,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
fold_build2 (MEM_REF,
access_type,
args[0], zero));
+ gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+ gimple_set_vdef (new_stmt, gimple_vdef (stmt));
}
break;
@@ -3092,6 +3112,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
= gimple_build_assign (fold_build2 (MEM_REF, access_type,
args[0], zero),
args[1]);
+ gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+ gimple_set_vdef (new_stmt, gimple_vdef (stmt));
}
break;
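
At the source level, the LOAD1/STORE1 folds above boil down to replacing the intrinsic call with a plain full-vector MEM_REF access (the vops are copied across so the new statement keeps valid virtual operands). A rough, purely illustrative C sketch of the little-endian behaviour, with made-up function names and ignoring vld1's weaker alignment guarantee:

#include <arm_neon.h>

/* Sketch only: vld1q_f32 (p) is folded to a full-vector MEM_REF load,
   morally equivalent to this cast-and-dereference.  */
float32x4_t
folded_vld1q_f32 (const float32_t *p)
{
  return *(const float32x4_t *) p;
}

/* Likewise, vst1q_f32 (p, x) becomes a full-vector MEM_REF store.  */
void
folded_vst1q_f32 (float32_t *p, float32x4_t x)
{
  *(float32x4_t *) p = x;
}
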
@@ -761,6 +761,7 @@ bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
bool aarch64_constant_address_p (rtx);
bool aarch64_emit_approx_div (rtx, rtx, rtx);
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+tree aarch64_vector_load_decl (tree);
void aarch64_expand_call (rtx, rtx, rtx, bool);
bool aarch64_expand_cpymem (rtx *);
bool aarch64_expand_setmem (rtx *);
@@ -15661,6 +15661,33 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
/* Vectorizer cost model target hooks. */
+/* If a vld1 from address ADDR should be recorded in vector_load_decls,
+ return the decl that should be recorded. Return null otherwise. */
+tree
+aarch64_vector_load_decl (tree addr)
+{
+ if (TREE_CODE (addr) != ADDR_EXPR)
+ return NULL_TREE;
+ tree base = get_base_address (TREE_OPERAND (addr, 0));
+ if (TREE_CODE (base) != VAR_DECL)
+ return NULL_TREE;
+ return base;
+}
+
+/* Return true if STMT_INFO accesses a decl that is known to be the
+ argument to a vld1 in the same function. */
+static bool
+aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
+{
+ if (!cfun->machine->vector_load_decls)
+ return false;
+ auto dr = STMT_VINFO_DATA_REF (stmt_info);
+ if (!dr)
+ return false;
+ tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
+ return decl && cfun->machine->vector_load_decls->contains (decl);
+}
+
/* Information about how the CPU would issue the scalar, Advanced SIMD
or SVE version of a vector loop, using the scheme defined by the
aarch64_base_vec_issue_info hierarchy of structures. */
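
To make the matching in aarch64_vector_load_decl concrete, here is a hedged source-level sketch of what does and does not get recorded (illustrative functions only; the first mirrors the idiom exercised by the new tests):

#include <arm_neon.h>

/* Recorded: the address argument is the address of the local array
   `arr`, i.e. an ADDR_EXPR whose base is a VAR_DECL, so
   aarch64_vector_load_decl returns that decl.  */
int32x4_t
recorded_case (int32_t x)
{
  int32_t arr[4] = { x, x, x, x };
  return vld1q_s32 (arr);
}

/* Not recorded: the address is an incoming pointer (an SSA name rather
   than an ADDR_EXPR), so aarch64_vector_load_decl returns NULL_TREE.  */
int32x4_t
not_recorded_case (const int32_t *p)
{
  return vld1q_s32 (p);
}
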
@@ -15891,6 +15918,20 @@ private:
supported by Advanced SIMD and SVE2. */
bool m_has_avg = false;
+ /* True if the vector body contains a store to a decl and if the
+ function is known to have a vld1 from the same decl.
+
+ In the Advanced SIMD ACLE, the recommended endian-agnostic way of
+ initializing a vector is:
+
+ float f[4] = { elts };
+ float32x4_t x = vld1q_f32(f);
+
+ We should strongly prefer vectorization of the initialization of f,
+ so that the store to f and the load back can be optimized away,
+ leaving a vectorization of { elts }. */
+ bool m_stores_to_vector_load_decl = false;
+
/* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
- If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
SIMD code.
@@ -16907,6 +16948,18 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
}
}
}
+
+ /* If the statement stores to a decl that is known to be the argument
+ to a vld1 in the same function, ignore the store for costing purposes.
+ See the comment above m_stores_to_vector_load_decl for more details. */
+ if (stmt_info
+ && (kind == vector_store || kind == unaligned_store)
+ && aarch64_accesses_vector_load_decl_p (stmt_info))
+ {
+ stmt_cost = 0;
+ m_stores_to_vector_load_decl = true;
+ }
+
return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
}
@@ -17196,12 +17249,21 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
/* Apply the heuristic described above m_stp_sequence_cost. Prefer
the scalar code in the event of a tie, since there is more chance
- of scalar code being optimized with surrounding operations. */
+ of scalar code being optimized with surrounding operations.
+
+ In addition, if the vector body is a simple store to a decl that
+ is elsewhere loaded using vld1, strongly prefer the vector form,
+ to the extent of giving the prologue a zero cost. See the comment
+ above m_stores_to_vector_load_decl for details. */
if (!loop_vinfo
&& scalar_costs
- && m_stp_sequence_cost != ~0U
- && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
- m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+ && m_stp_sequence_cost != ~0U)
+ {
+ if (m_stores_to_vector_load_decl)
+ m_costs[vect_prologue] = 0;
+ else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
+ m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+ }
vector_costs::finish_cost (scalar_costs);
}
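
Putting the costing pieces together: the target of this heuristic is the ACLE initialization idiom quoted above m_stores_to_vector_load_decl. A compilable version (names illustrative; the same shape as the new tests) is:

#include <arm_neon.h>

/* The scalar stores that initialize f form an SLP group.  With the
   vector store costed at zero in add_stmt_cost and the prologue zeroed
   in finish_cost, the vector form is chosen, after which the store to
   f and the vld1 load back can be optimized away, leaving just a
   vector constructor of the four elements.  */
float32x4_t
make_vector (float32_t a, float32_t b, float32_t c, float32_t d)
{
  float32_t f[4] = { a, b, c, d };
  return vld1q_f32 (f);
}
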
@@ -860,6 +860,7 @@ struct GTY (()) aarch64_frame
bool is_scs_enabled;
};
+#ifdef hash_set_h
typedef struct GTY (()) machine_function
{
struct aarch64_frame frame;
@@ -868,8 +869,12 @@ typedef struct GTY (()) machine_function
/* One entry for each general purpose register. */
rtx call_via[SP_REGNUM];
bool label_is_assembled;
+ /* A set of all decls that have been passed to a vld1 intrinsic in the
+ current function. This is used to help guide the vector cost model. */
+ hash_set<tree> *vector_load_decls;
} machine_function;
#endif
+#endif
/* Which ABI to use. */
enum aarch64_abi_type
new file mode 100644
@@ -0,0 +1,281 @@
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target aarch64_little_endian } } } */
+
+#include <arm_neon.h>
+
+/*
+** s32x2_1:
+** dup v0\.2s, w0
+** ret
+*/
+int32x2_t
+s32x2_1 (int32_t x)
+{
+ int32_t arr[] = { x, x };
+ return vld1_s32 (arr);
+}
+
+/*
+** s32x2_2:
+** fmov s0, w0
+** ret
+*/
+int32x2_t
+s32x2_2 (int32_t x)
+{
+ int32_t arr[] = { x, 0 };
+ return vld1_s32 (arr);
+}
+
+/*
+** s32x2_3:
+** fmov s0, w0
+** ins v0\.s\[1\], w1
+** ret
+*/
+int32x2_t
+s32x2_3 (int32_t x, int32_t y)
+{
+ int32_t arr[] = { x, y };
+ return vld1_s32 (arr);
+}
+
+/*
+** f32x2_1:
+** dup v0\.2s, v0.s\[0\]
+** ret
+*/
+float32x2_t
+f32x2_1 (float32_t x)
+{
+ float32_t arr[] = { x, x };
+ return vld1_f32 (arr);
+}
+
+/*
+** f32x2_2:
+** ins v0\.s\[1\], v1.s\[0\]
+** ret
+*/
+float32x2_t
+f32x2_2 (float32_t x, float32_t y)
+{
+ float32_t arr[] = { x, y };
+ return vld1_f32 (arr);
+}
+
+/*
+** s16x4_1:
+** dup v0\.4h, w0
+** ret
+*/
+int16x4_t
+s16x4_1 (int16_t x)
+{
+ int16_t arr[] = { x, x, x, x };
+ return vld1_s16 (arr);
+}
+
+/*
+** s16x4_2:
+** ...
+** fmov [dsh]0, [wx][0-9]+
+** ret
+*/
+int16x4_t
+s16x4_2 (int16_t x)
+{
+ int16_t arr[] = { x, 0, 0, 0 };
+ return vld1_s16 (arr);
+}
+
+/*
+** s16x4_3:
+** dup v0\.4h, w1
+** ins v0.h\[0\], w0
+** ret
+*/
+int16x4_t
+s16x4_3 (int16_t x, int16_t y)
+{
+ int16_t arr[] = { x, y, y, y };
+ return vld1_s16 (arr);
+}
+
+/*
+** f16x4_1:
+** dup v0\.4h, v0.h\[0\]
+** ret
+*/
+float16x4_t
+f16x4_1 (float16_t x)
+{
+ float16_t arr[] = { x, x, x, x };
+ return vld1_f16 (arr);
+}
+
+/*
+** s64x2_1:
+** dup v0\.2d, x0
+** ret
+*/
+int64x2_t
+s64x2_1 (int64_t x)
+{
+ int64_t arr[] = { x, x };
+ return vld1q_s64 (arr);
+}
+
+/*
+** s64x2_2: { xfail *-*-* }
+** fmov d0, x0
+** ret
+*/
+int64x2_t
+s64x2_2 (int64_t x)
+{
+ int64_t arr[] = { x, 0 };
+ return vld1q_s64 (arr);
+}
+
+/*
+** s64x2_3:
+** fmov d0, x0
+** ins v0\.d\[1\], x1
+** ret
+*/
+int64x2_t
+s64x2_3 (int64_t x, int64_t y)
+{
+ int64_t arr[] = { x, y };
+ return vld1q_s64 (arr);
+}
+
+/*
+** f64x2_1:
+** dup v0\.2d, v0.d\[0\]
+** ret
+*/
+float64x2_t
+f64x2_1 (float64_t x)
+{
+ float64_t arr[] = { x, x };
+ return vld1q_f64 (arr);
+}
+
+/*
+** f64x2_2:
+** ins v0\.d\[1\], v1.d\[0\]
+** ret
+*/
+float64x2_t
+f64x2_2 (float64_t x, float64_t y)
+{
+ float64_t arr[] = { x, y };
+ return vld1q_f64 (arr);
+}
+
+/*
+** s32x4_1:
+** dup v0\.4s, w0
+** ret
+*/
+int32x4_t
+s32x4_1 (int32_t x)
+{
+ int32_t arr[] = { x, x, x, x };
+ return vld1q_s32 (arr);
+}
+
+/*
+** s32x4_2: { xfail *-*-* }
+** fmov s0, w0
+** ret
+*/
+int32x4_t
+s32x4_2 (int32_t x)
+{
+ int32_t arr[] = { x, 0, 0, 0 };
+ return vld1q_s32 (arr);
+}
+
+/*
+** s32x4_3:
+** dup v0\.4s, w1
+** ins v0.s\[0\], w0
+** ret
+*/
+int32x4_t
+s32x4_3 (int32_t x, int32_t y)
+{
+ int32_t arr[] = { x, y, y, y };
+ return vld1q_s32 (arr);
+}
+
+/*
+** f32x4_1:
+** dup v0\.4s, v0.s\[0\]
+** ret
+*/
+float32x4_t
+f32x4_1 (float32_t x)
+{
+ float32_t arr[] = { x, x, x, x };
+ return vld1q_f32 (arr);
+}
+
+void consume (float32x4_t, float32x4_t, float32x4_t, float32x4_t);
+
+/*
+** produce_1:
+** (
+** dup v0\.4s, v0\.s\[0\]
+** dup v1\.4s, v1\.s\[0\]
+** dup v2\.4s, v2\.s\[0\]
+** dup v3\.4s, v3\.s\[0\]
+** |
+** dup v3\.4s, v3\.s\[0\]
+** dup v2\.4s, v2\.s\[0\]
+** dup v1\.4s, v1\.s\[0\]
+** dup v0\.4s, v0\.s\[0\]
+** )
+** b consume
+*/
+void
+produce_1 (float32_t a, float32_t b, float32_t c, float32_t d)
+{
+ float arr[4][4] = {
+ { a, a, a, a },
+ { b, b, b, b },
+ { c, c, c, c },
+ { d, d, d, d }
+ };
+ consume (vld1q_f32 (arr[0]), vld1q_f32 (arr[1]),
+ vld1q_f32 (arr[2]), vld1q_f32 (arr[3]));
+}
+
+/*
+** produce_2:
+** (
+** dup v0\.4s, v0\.s\[0\]
+** dup v1\.4s, v1\.s\[0\]
+** dup v2\.4s, v2\.s\[0\]
+** dup v3\.4s, v3\.s\[0\]
+** |
+** dup v3\.4s, v3\.s\[0\]
+** dup v2\.4s, v2\.s\[0\]
+** dup v1\.4s, v1\.s\[0\]
+** dup v0\.4s, v0\.s\[0\]
+** )
+** b consume
+*/
+void
+produce_2 (float32_t a, float32_t b, float32_t c, float32_t d)
+{
+ float arr0[] = { a, a, a, a };
+ float arr1[] = { b, b, b, b };
+ float arr2[] = { c, c, c, c };
+ float arr3[] = { d, d, d, d };
+ consume (vld1q_f32 (arr0), vld1q_f32 (arr1),
+ vld1q_f32 (arr2), vld1q_f32 (arr3));
+}
new file mode 100644
@@ -0,0 +1,60 @@
+/* { dg-options "-O" } */
+
+#pragma GCC target "arch=armv8.2-a+dotprod"
+
+#include <arm_neon.h>
+
+static inline uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+ return vaddvq_u32(a);
+}
+
+static inline unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+
+
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ second_pred += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
+
+uint32_t vpx_sad32x16_avg_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { return sad32xh_avg_neon(src, src_stride, ref, ref_stride, (16), second_pred); }