@@ -1039,6 +1039,8 @@ namespace aarch64_sve {
#ifdef GCC_TARGET_H
bool verify_type_context (location_t, type_context_kind, const_tree, bool);
#endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+ const char *, const char *);
}
extern void aarch64_split_combinev16qi (rtx operands[3]);
@@ -951,14 +951,16 @@ static GTY(()) hash_map<tree, registered_function *> *overload_names[2];
/* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
and NUM_PR SVE predicates. MANGLED_NAME, if nonnull, is the ABI-defined
mangling of the type. ACLE_NAME is the <arm_sve.h> name of the type. */
-static void
+void
add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
const char *mangled_name, const char *acle_name)
{
tree mangled_name_tree
= (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+ tree acle_name_tree
+ = (acle_name ? get_identifier (acle_name) : NULL_TREE);
- tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+ tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
value = tree_cons (NULL_TREE, mangled_name_tree, value);
value = tree_cons (NULL_TREE, size_int (num_pr), value);
value = tree_cons (NULL_TREE, size_int (num_zr), value);
@@ -29022,7 +29022,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
int num, bool explicit_p)
{
tree t, ret_type;
- unsigned int nds_elt_bits;
+ unsigned int nds_elt_bits, wds_elt_bits;
unsigned HOST_WIDE_INT const_simdlen;
if (!TARGET_SIMD)
@@ -29067,10 +29067,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
if (TREE_CODE (ret_type) != VOID_TYPE)
{
nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
+ wds_elt_bits = nds_elt_bits;
vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
}
else
- nds_elt_bits = POINTER_SIZE;
+ {
+ nds_elt_bits = POINTER_SIZE;
+ wds_elt_bits = 0;
+ }
int i;
tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
@@ -29078,44 +29082,65 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
t && t != void_list_node; t = TREE_CHAIN (t), i++)
{
- tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
+ tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
- && !supported_simd_type (arg_type))
+ && !supported_simd_type (type))
{
if (!explicit_p)
;
- else if (COMPLEX_FLOAT_TYPE_P (ret_type))
+ else if (COMPLEX_FLOAT_TYPE_P (type))
warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
"GCC does not currently support argument type %qT "
- "for simd", arg_type);
+ "for simd", type);
else
warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
"unsupported argument type %qT for simd",
- arg_type);
+ type);
return 0;
}
- unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
+ unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
- vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
+ vec_elts.safe_push (std::make_pair (type, lane_bits));
if (nds_elt_bits > lane_bits)
nds_elt_bits = lane_bits;
+ if (wds_elt_bits < lane_bits)
+ wds_elt_bits = lane_bits;
}
- clonei->vecsize_mangle = 'n';
+ /* If we could not determine the WDS type from available parameters/return,
+ then fallback to using uintptr_t. */
+ if (wds_elt_bits == 0)
+ wds_elt_bits = POINTER_SIZE;
+
clonei->mask_mode = VOIDmode;
poly_uint64 simdlen;
- auto_vec<poly_uint64> simdlens (2);
+ typedef struct
+ {
+ poly_uint64 len;
+ char mangle;
+ } aarch64_clone_info;
+ auto_vec<aarch64_clone_info> clones (3);
+
/* Keep track of the possible simdlens the clones of this function can have,
and check them later to see if we support them. */
if (known_eq (clonei->simdlen, 0U))
{
simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
if (maybe_ne (simdlen, 1U))
- simdlens.safe_push (simdlen);
- simdlens.safe_push (simdlen * 2);
+ clones.safe_push ({simdlen, 'n'});
+ clones.safe_push ({simdlen * 2, 'n'});
+      /* Only create an SVE simd clone if we aren't dealing with an unprototyped
+ function.
+ We have also disabled support for creating SVE simdclones for functions
+ with function bodies and any simdclones when -msve-vector-bits is used.
+ TODO: add support for these. */
+ if (prototype_p (TREE_TYPE (node->decl))
+ && !node->definition
+ && !aarch64_sve_vg.is_constant ())
+ clones.safe_push ({exact_div (BITS_PER_SVE_VECTOR, wds_elt_bits), 's'});
}
else
- simdlens.safe_push (clonei->simdlen);
+ clones.safe_push ({clonei->simdlen, 'n'});
clonei->vecsize_int = 0;
clonei->vecsize_float = 0;
@@ -29129,11 +29154,12 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
simdclone would cause a vector type to be larger than 128-bits, and reject
such a clone. */
unsigned j = 0;
- while (j < simdlens.length ())
+ while (j < clones.length ())
{
bool remove_simdlen = false;
for (auto elt : vec_elts)
- if (known_gt (simdlens[j] * elt.second, 128U))
+ if (clones[j].mangle == 'n'
+ && known_gt (clones[j].len * elt.second, 128U))
{
/* Don't issue a warning for every simdclone when there is no
specific simdlen clause. */
@@ -29141,18 +29167,17 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
"GCC does not currently support simdlen %wd for "
"type %qT",
- constant_lower_bound (simdlens[j]), elt.first);
+ constant_lower_bound (clones[j].len), elt.first);
remove_simdlen = true;
break;
}
if (remove_simdlen)
- simdlens.ordered_remove (j);
+ clones.ordered_remove (j);
else
j++;
}
-
- int count = simdlens.length ();
+ int count = clones.length ();
if (count == 0)
{
if (explicit_p && known_eq (clonei->simdlen, 0U))
@@ -29169,21 +29194,118 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
}
gcc_assert (num < count);
- clonei->simdlen = simdlens[num];
+ clonei->simdlen = clones[num].len;
+ clonei->vecsize_mangle = clones[num].mangle;
+ /* SVE simdclones always have a Mask, so set inbranch to 1. */
+ if (clonei->vecsize_mangle == 's')
+ clonei->inbranch = 1;
return count;
}
-/* Implement TARGET_SIMD_CLONE_ADJUST. */
+/* Helper function to adjust an SVE vector type of an SVE simd clone.  Returns
+   an SVE vector type based on the element type of the vector TYPE, with
+   SIMDLEN number of elements.  If IS_MASK, returns an SVE mask type
+   appropriate for use with the SVE type it would otherwise return.  */
+
+static tree
+simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
+{
+ unsigned int num_zr = 0;
+ unsigned int num_pr = 0;
+ machine_mode vector_mode;
+ type = TREE_TYPE (type);
+ scalar_mode scalar_m = SCALAR_TYPE_MODE (type);
+ vector_mode = aarch64_sve_data_mode (scalar_m, simdlen).require ();
+ type = build_vector_type_for_mode (type, vector_mode);
+ if (is_mask)
+ {
+ type = truth_type_for (type);
+ num_pr = 1;
+ }
+ else
+ num_zr = 1;
+ /* We create new types here with the SVE type attribute instead of using ACLE
+ types as we need to support unpacked vectors which aren't available as
+ ACLE SVE types. */
+ type = build_distinct_type_copy (type);
+ aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, NULL);
+ return type;
+}
+
+/* Implement TARGET_SIMD_CLONE_ADJUST. */
static void
aarch64_simd_clone_adjust (struct cgraph_node *node)
{
- /* Add aarch64_vector_pcs target attribute to SIMD clones so they
- use the correct ABI. */
-
tree t = TREE_TYPE (node->decl);
- TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
- TYPE_ATTRIBUTES (t));
+ cl_target_option cur_target;
+ bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
+
+ if (node->simdclone->vecsize_mangle == 's')
+ {
+ /* This is additive and has no effect if SVE, or a superset thereof, is
+ already enabled. */
+ tree target = build_string (strlen ("+sve") + 1, "+sve");
+ if (!aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0))
+ gcc_unreachable ();
+ cl_target_option_save (&cur_target, &global_options, &global_options_set);
+ tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
+ cl_target_option_restore (&global_options, &global_options_set,
+ TREE_TARGET_OPTION (new_target));
+ aarch64_override_options_internal (&global_options);
+ memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
+ sizeof (have_regs_of_mode));
+ for (int i = 0; i < NUM_MACHINE_MODES; ++i)
+ if (aarch64_sve_mode_p ((machine_mode) i))
+ have_regs_of_mode[i] = true;
+ }
+ else
+ {
+ /* Add aarch64_vector_pcs target attribute to SIMD clones so they
+ use the correct ABI. */
+ TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
+ TYPE_ATTRIBUTES (t));
+ }
+ cgraph_simd_clone *sc = node->simdclone;
+
+ for (unsigned i = 0; i < sc->nargs; ++i)
+ {
+ bool is_mask = false;
+ tree type;
+ switch (sc->args[i].arg_type)
+ {
+ case SIMD_CLONE_ARG_TYPE_MASK:
+ is_mask = true;
+ gcc_fallthrough ();
+ case SIMD_CLONE_ARG_TYPE_VECTOR:
+ case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
+ case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
+ type = sc->args[i].vector_type;
+ gcc_assert (VECTOR_TYPE_P (type));
+ if (node->simdclone->vecsize_mangle == 's')
+ type = simd_clone_adjust_sve_vector_type (type, is_mask,
+ sc->simdlen);
+ else if (is_mask)
+ type = truth_type_for (type);
+ sc->args[i].vector_type = type;
+ break;
+ default:
+ continue;
+ }
+ }
+ if (node->simdclone->vecsize_mangle == 's')
+ {
+ tree ret_type = TREE_TYPE (t);
+ if (VECTOR_TYPE_P (ret_type))
+ TREE_TYPE (t)
+ = simd_clone_adjust_sve_vector_type (ret_type, false,
+ node->simdclone->simdlen);
+ /* Restore current options. */
+ cl_target_option_restore (&global_options, &global_options_set, &cur_target);
+ aarch64_override_options_internal (&global_options);
+ memcpy (have_regs_of_mode, m_old_have_regs_of_mode,
+ sizeof (have_regs_of_mode));
+ }
}
/* Implement TARGET_SIMD_CLONE_USABLE. */
@@ -29197,6 +29319,11 @@ aarch64_simd_clone_usable (struct cgraph_node *node, machine_mode vector_mode)
if (!TARGET_SIMD || aarch64_sve_mode_p (vector_mode))
return -1;
return 0;
+ case 's':
+ if (!TARGET_SVE
+ || !aarch64_sve_mode_p (vector_mode))
+ return -1;
+ return 0;
default:
gcc_unreachable ();
}
@@ -542,9 +542,12 @@ simd_clone_mangle (struct cgraph_node *node,
pp_string (&pp, "_ZGV");
pp_character (&pp, vecsize_mangle);
pp_character (&pp, mask);
- /* For now, simdlen is always constant, while variable simdlen pp 'n'. */
- unsigned int len = simdlen.to_constant ();
- pp_decimal_int (&pp, (len));
+
+ unsigned HOST_WIDE_INT len;
+ if (simdlen.is_constant (&len))
+ pp_decimal_int (&pp, (int) (len));
+ else
+ pp_character (&pp, 'x');
for (n = 0; n < clone_info->nargs; ++n)
{
@@ -1534,8 +1537,8 @@ simd_clone_adjust (struct cgraph_node *node)
below). */
loop = alloc_loop ();
cfun->has_force_vectorize_loops = true;
- /* For now, simlen is always constant. */
- loop->safelen = node->simdclone->simdlen.to_constant ();
+ /* We can assert that safelen is the 'minimum' simdlen. */
+ loop->safelen = constant_lower_bound (node->simdclone->simdlen);
loop->force_vectorize = true;
loop->header = body_bb;
}
@@ -43,6 +43,7 @@ float f04 (double a)
}
/* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */
/* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */
+/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */
#pragma omp declare simd uniform(a) linear (b)
void f05 (short a, short *b, short c)
new file mode 100644
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c99" } */
+/* { dg-additional-options "-O3 -march=armv8-a" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Ensure correct creation of SVE Vector-length agnostic (VLA SVE) vector
+ function calls from scalar versions in accordance with the Vector Function
+ Application Binary Interface Specification for AArch64 (AAVPCS).
+
+ We check for correctness in:
+ - Vector function name mangling, with the grammar:
+
+ vector name := prefix "_" name
+ prefix := "_ZGV" isa mask <len> <parameters>
+
+ Whereby:
+ - <isa> := "s" for SVE
+ - <mask> := "M" for Mask
+ - <len> := "x" for VLA SVE
+
+ resulting in:
+ <prefix> := "_ZGVsMx" <parameters>
+
+ with each vector parameter contributing a "v" to the prefix.
+
+ - Parameter and return value mapping:
+ - Unless marked with uniform or linear OpenMP clauses, parameters and
+ return values are expected to map to vectors.
+ - Where the lane-size of a parameter is less than the widest data size
+ for a given function, the resulting vector should be unpacked and
+     populated via extending loads.
+
+ - Finally, we also make sure we can correctly generate calls to the same
+ function, differing only in the target architecture (i.e. SVE vs SIMD),
+ ensuring that each call points to the correctly-mangled vector function
+ and employs the correct ABI. For example, for `fn' we may expect:
+
+ for #pragma GCC target("+sve"): _ZGVsMxvv_fn
+   for #pragma GCC target("+simd"): _ZGVnN4vv_fn  */
+
+#pragma GCC target ("+sve")
+/* { dg-final { scan-assembler {_ZGVsMxv_fn0} } } */
+extern int __attribute__ ((simd, const)) fn0 (int);
+void test_fn0 (int *a, int *b, int n)
+{
+ for (int i = 0; i < n; ++i)
+ a[i] += fn0 (b[i]);
+}
+
+/*
+** test_fn1:
+** ...
+** ld1w z1\.s, p7/z, \[x22, x19, lsl 2\]
+** ld1h z0\.s, p7/z, \[x23, x19, lsl 1\]
+** mov p0\.b, p7\.b
+** bl _ZGVsMxvv_fn1
+** st1w z0\.s, p7, \[x21, x19, lsl 2\]
+** ...
+*/
+extern int __attribute__ ((simd, const)) fn1 (short, int);
+void test_fn1 (int *a, int *b, short *c, int n)
+{
+ for (int i = 0; i < n; ++i)
+ a[i] = fn1 (c[i], b[i]);
+}
+
+/*
+** test_fn2:
+** ...
+** ld1w z1\.s, p7/z, \[x23, x19, lsl 2\]
+** ld1h z0\.s, p7/z, \[x22, x19, lsl 1\]
+**	mov	p0\.b, p7\.b
+** bl _ZGVsMxvv_fn2
+** st1h z0\.s, p7, \[x21, x19, lsl 1\]
+** ...
+*/
+extern short __attribute__ ((simd, const)) fn2 (short, int);
+void test_fn2 (short *a, int *b, short *c, int n)
+{
+ for (int i = 0; i < n; ++i)
+ a[i] = fn2 (c[i], b[i]);
+}
+
+/*
+** test_fn3:
+** ...
+** ld1b z23\.s, p7/z, \[x22, x19\]
+** ld1w z0\.s, p7/z, \[x23, x19, lsl 2\]
+** ptrue p6\.b, all
+** mov p0\.b, p7\.b
+** mov z1\.d, z23\.d
+** uxtb z23\.h, p6/m, z23\.h
+** bl _ZGVsMxvv_fn3
+** uxtb z0\.h, p6/m, z0\.h
+** add z0\.h, z0\.h, z23\.h
+** uxth z0\.s, p6/m, z0\.s
+** st1w z0\.s, p7, \[x20, x19, lsl 2\]
+** ...
+*/
+extern char __attribute__ ((simd, const)) fn3 (int, char);
+void test_fn3 (int *a, int *b, char *c, int n)
+{
+ for (int i = 0; i < n; ++i)
+ a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
+}
+
+/*
+** test_fn4:
+** ...
+** ld1h z23\.s, p7/z, \[x23, x19, lsl 1\]
+** ld1w z0\.s, p7/z, \[x22, x19, lsl 2\]
+** mov p0\.b, p7\.b
+** mov z1\.d, z23\.d
+** ptrue p6\.b, all
+** bl _ZGVsMxvv_fn4
+** sxth z23\.s, p6/m, z23\.s
+** sxth z0\.s, p6/m, z0\.s
+** add z0\.s, z0\.s, z23\.s
+** st1w z0\.s, p7, \[x21, x19, lsl 2\]
+** ...
+*/
+extern short __attribute__ ((simd, const)) fn4 (int, short);
+void test_fn4 (int *a, int *b, short *c, int n)
+{
+ for (int i = 0; i < n; ++i)
+ a[i] = (int) (fn4 (b[i], c[i]) + c[i]);
+}
+
+#pragma GCC reset_options
+#pragma GCC target ("+simd")
+/* { dg-final { scan-assembler {_ZGVnN4vv_fn4} } } */
+extern short __attribute__ ((simd, const)) fn4 (int, short);
+void test_fn5 (int *a, int *b, short *c, int n)
+{
+ for (int i = 0; i < n; ++i)
+ a[i] = (int) (fn4 (b[i], c[i]) + c[i]);
+}