@@ -6501,6 +6501,8 @@ gcn_dwarf_register_span (rtx rtl)
gcn_vector_alignment_reachable
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
+#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER true

 struct gcc_target targetm = TARGET_INITIALIZER;

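Because this is a POD hook, the one-line #define above is the entire GCN
implementation; no gcn_* wrapper function is needed.  As a sketch only
(not part of the patch; the helper name is invented), middle-end code
then reads the flag straight off the global targetm vtable:

/* Sketch: DEFHOOKPOD hooks are plain data members of targetm, so
   consumers test them directly instead of calling through a function
   pointer.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "target.h"

static bool
target_prefers_gather_scatter (void)
{
  return targetm.vectorize.prefer_gather_scatter;
}
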
@@ -6122,6 +6122,11 @@ The default is @code{NULL_TREE} which means to not vectorize scatter
stores.
 @end deftypefn

+@deftypevr {Target Hook} bool TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+This hook is set to TRUE if gather loads or scatter stores are cheaper on
+this target than a sequence of elementwise loads or stores.
+@end deftypevr
+
@deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int})
This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float}
fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also
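
To make the documented trade-off concrete, here is a hedged
illustration (not from the patch; the function name is invented and a
vectorization factor of 4 is assumed) of the "sequence of elementwise
loads" that the hook steers the vectorizer away from:

#include <stddef.h>

typedef float v4sf __attribute__ ((vector_size (16)));

/* The VMAT_ELEMENTWISE shape: one strided vector read becomes four
   scalar loads plus a vector construction.  On a target such as AMD
   GCN the same access is a single hardware gather from a vector of
   offsets, which is why such a target reports gather as cheaper.  */
static v4sf
load_strided_elementwise (const float *base, ptrdiff_t stride)
{
  return (v4sf) { base[0], base[1 * stride],
		  base[2 * stride], base[3 * stride] };
}
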
@@ -4195,6 +4195,8 @@ address; but often a machine-dependent strategy can generate better code.

 @hook TARGET_VECTORIZE_BUILTIN_SCATTER

+@hook TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN

@hook TARGET_SIMD_CLONE_ADJUST
@@ -2027,6 +2027,14 @@ all zeros. GCC can then try to branch around the instruction instead.",
(unsigned ifn),
 default_empty_mask_is_expensive)

+/* Prefer gather/scatter loads/stores to e.g. elementwise accesses if
+   we cannot use a contiguous access.  */
+DEFHOOKPOD
+(prefer_gather_scatter,
+ "This hook is set to TRUE if gather loads or scatter stores are cheaper on\n\
+this target than a sequence of elementwise loads or stores.",
+ bool, false)
+
/* Target builtin that implements vector gather operation. */
DEFHOOK
(builtin_gather,
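
Unlike DEFHOOK, DEFHOOKPOD declares a plain data field rather than a
function pointer.  A simplified sketch, assuming the usual genhooks
expansion and not itself part of the patch, of what the new entry
turns into:

/* In struct gcc_target (target.h), a data member of the vectorize
   sub-struct:  */
bool prefer_gather_scatter;

/* In the generated target-hooks-def.h, the default consumed by
   TARGET_INITIALIZER; the #ifndef guard is what lets gcn.cc override
   the value with a plain #define:  */
#ifndef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER false
#endif
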
@@ -2444,9 +2444,14 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp,
it probably isn't a win to use separate strided accesses based
on nearby locations. Or, even if it's a win over scalar code,
it might not be a win over vectorizing at a lower VF, if that
- allows us to use contiguous accesses. */
+ allows us to use contiguous accesses.
+
+ On some targets (e.g. AMD GCN), always use gather/scatter accesses
+ here since those are the only types of vector loads/stores available,
+ and the fallback case of using elementwise accesses is very
+ inefficient. */
if (*memory_access_type == VMAT_ELEMENTWISE
- && single_element_p
+ && (targetm.vectorize.prefer_gather_scatter || single_element_p)
&& loop_vinfo
&& vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
masked_p, gs_info))
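
As an illustration (a hypothetical loop, not taken from the patch) of
the path being changed: the x/y accesses below form a two-element
interleaving group with a gap, so single_element_p is false and a
contiguous access is unavailable.  Previously that meant falling back
to VMAT_ELEMENTWISE; with prefer_gather_scatter set, the condition
above lets vect_use_strided_gather_scatters_p switch the group to
VMAT_GATHER_SCATTER instead.

/* Hypothetical test case.  The access stride is four floats and z/pad
   are never touched, so the stores cannot be done as one contiguous
   vector store.  */
struct point { float x, y, z, pad; };

void
scale_xy (struct point *restrict pts, int n)
{
  for (int i = 0; i < n; i++)
    {
      pts[i].x *= 2.0f;
      pts[i].y *= 2.0f;
    }
}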