Message ID | 20240130143132.9575-4-andre.simoesdiasvieira@arm.com |
---|---|
State | New |
Headers | show |
Series | vect, aarch64: Add SVE support for simdclones | expand |
Andre Vieira <andre.simoesdiasvieira@arm.com> writes: > This patch finalizes adding support for the generation of SVE simd clones when > no simdlen is provided, following the ABI rules where the widest data type > determines the minimum amount of elements in a length agnostic vector. > > gcc/ChangeLog: > > * config/aarch64/aarch64-protos.h (add_sve_type_attribute): Declare. > * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): Make > visibility global and support use for non_acle types. > * config/aarch64/aarch64.cc > (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd clone > when no simdlen is provided, according to ABI rules. > (simd_clone_adjust_sve_vector_type): New helper function. > (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones > and modify types to use SVE types. > * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen. > (simd_clone_adjust): Adapt safelen check to be compatible with VLA > simdlen. > > gcc/testsuite/ChangeLog: > > * c-c++-common/gomp/declare-variant-14.c: Make i?86 and x86_64 target > only test. > * gfortran.dg/gomp/declare-variant-14.f90: Likewise. > * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan. > * gcc.target/aarch64/vect-simd-clone-1.c: New test. 
> > diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h > index a0b142e0b94..207396de0ff 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -1031,6 +1031,8 @@ namespace aarch64_sve { > #ifdef GCC_TARGET_H > bool verify_type_context (location_t, type_context_kind, const_tree, bool); > #endif > + void add_sve_type_attribute (tree, unsigned int, unsigned int, > + const char *, const char *); > } > > extern void aarch64_split_combinev16qi (rtx operands[3]); > diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc > index 11f5c5c500c..747131e684e 100644 > --- a/gcc/config/aarch64/aarch64-sve-builtins.cc > +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc > @@ -953,14 +953,16 @@ static bool reported_missing_registers_p; > /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors > and NUM_PR SVE predicates. MANGLED_NAME, if nonnull, is the ABI-defined > mangling of the type. ACLE_NAME is the <arm_sve.h> name of the type. */ > -static void > +void > add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr, > const char *mangled_name, const char *acle_name) > { > tree mangled_name_tree > = (mangled_name ? get_identifier (mangled_name) : NULL_TREE); > + tree acle_name_tree > + = (acle_name ? 
get_identifier (acle_name) : NULL_TREE); > > - tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE); > + tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE); > value = tree_cons (NULL_TREE, mangled_name_tree, value); > value = tree_cons (NULL_TREE, size_int (num_pr), value); > value = tree_cons (NULL_TREE, size_int (num_zr), value); > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index 31617510160..cba8879ab33 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -28527,7 +28527,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, > int num, bool explicit_p) > { > tree t, ret_type; > - unsigned int nds_elt_bits; > + unsigned int nds_elt_bits, wds_elt_bits; > unsigned HOST_WIDE_INT const_simdlen; > > if (!TARGET_SIMD) > @@ -28572,10 +28572,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, > if (TREE_CODE (ret_type) != VOID_TYPE) > { > nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type); > + wds_elt_bits = nds_elt_bits; > vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits)); > } > else > - nds_elt_bits = POINTER_SIZE; > + { > + nds_elt_bits = POINTER_SIZE; > + wds_elt_bits = 0; > + } > > int i; > tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl)); > @@ -28583,44 +28587,72 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, > for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0; > t && t != void_list_node; t = TREE_CHAIN (t), i++) > { > - tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t); > + tree type = decl_arg_p ? 
TREE_TYPE (t) : TREE_VALUE (t); > if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM > - && !supported_simd_type (arg_type)) > + && !supported_simd_type (type)) > { > if (!explicit_p) > ; > - else if (COMPLEX_FLOAT_TYPE_P (ret_type)) > + else if (COMPLEX_FLOAT_TYPE_P (type)) > warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > "GCC does not currently support argument type %qT " > - "for simd", arg_type); > + "for simd", type); > else > warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > "unsupported argument type %qT for simd", > - arg_type); > + type); > return 0; > } > - unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type); > + unsigned lane_bits = lane_size (clonei->args[i].arg_type, type); > if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR) > - vec_elts.safe_push (std::make_pair (arg_type, lane_bits)); > + vec_elts.safe_push (std::make_pair (type, lane_bits)); > if (nds_elt_bits > lane_bits) > nds_elt_bits = lane_bits; > + if (wds_elt_bits < lane_bits) > + wds_elt_bits = lane_bits; > } > > - clonei->vecsize_mangle = 'n'; > + /* If we could not determine the WDS type from available parameters/return, > + then fallback to using uintptr_t. */ > + if (wds_elt_bits == 0) > + wds_elt_bits = POINTER_SIZE; > + > clonei->mask_mode = VOIDmode; > poly_uint64 simdlen; > - auto_vec<poly_uint64> simdlens (2); > + auto_vec<poly_uint64> simdlens (3); > + auto_vec<char> simdmangle (3); Minor, but I think it'd be neater to use an ad-hoc structure that contains the mangling prefix and simdlen together, so that only one vector is needed. Brace initialization should make it a bit shorter too. > /* Keep track of the possible simdlens the clones of this function can have, > and check them later to see if we support them. 
*/ > if (known_eq (clonei->simdlen, 0U)) > { > simdlen = exact_div (poly_uint64 (64), nds_elt_bits); > if (maybe_ne (simdlen, 1U)) > - simdlens.safe_push (simdlen); > + { > + simdlens.safe_push (simdlen); > + simdmangle.safe_push ('n'); > + } > simdlens.safe_push (simdlen * 2); > + simdmangle.safe_push ('n'); > + /* Only create a SVE simd clone if we aren't dealing with an unprototyped > + function. > + We have also disabled support for creating SVE simdclones for functions > + with function bodies and any simdclones when -msve-vector-bits is used. > + TODO: add support for these. */ > + if ((DECL_ARGUMENTS (node->decl) != 0 > + || type_arg_types != 0) I think my comment from the previous review still stands: This check feels a bit indirect. Does it work to use: if (prototype_p (TREE_TYPE (node->decl))) instead? Or does that not work? > + && !node->definition > + && !aarch64_sve_vg.is_constant ()) > + { > + poly_uint64 sve_simdlen = aarch64_sve_vg * 64; > + simdlens.safe_push (exact_div (sve_simdlen, wds_elt_bits)); Simpler as: simdlens.safe_push (exact_div (BITS_PER_SVE_VECTOR, wds_elt_bits)); > + simdmangle.safe_push ('s'); > + } > } > else > - simdlens.safe_push (clonei->simdlen); > + { > + simdlens.safe_push (clonei->simdlen); > + simdmangle.safe_push ('n'); > + } > > clonei->vecsize_int = 0; > clonei->vecsize_float = 0; > @@ -28638,7 +28670,8 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, > { > bool remove_simdlen = false; > for (auto elt : vec_elts) > - if (known_gt (simdlens[j] * elt.second, 128U)) > + if (simdmangle[j] == 'n' > + && known_gt (simdlens[j] * elt.second, 128U)) > { > /* Don't issue a warning for every simdclone when there is no > specific simdlen clause. 
*/ > @@ -28651,12 +28684,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, > break; > } > if (remove_simdlen) > - simdlens.ordered_remove (j); > + { > + simdlens.ordered_remove (j); > + simdmangle.ordered_remove (j); > + } > else > j++; > } > > - > int count = simdlens.length (); > if (count == 0) > { > @@ -28675,20 +28710,107 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, > > gcc_assert (num < count); > clonei->simdlen = simdlens[num]; > + clonei->vecsize_mangle = simdmangle[num]; > + /* SVE simdclones always have a Mask, so set inbranch to 1. */ > + if (clonei->vecsize_mangle == 's') > + clonei->inbranch = 1; > return count; > } > > +static tree > +simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen) > +{ > + unsigned int num_zr = 0; From the previous review: Nits: missing function comment. The body is indented by too many columns. > + unsigned int num_pr = 0; > + machine_mode vector_mode; > + type = TREE_TYPE (type); > + scalar_mode scalar_m = as_a <scalar_mode> (TYPE_MODE (type)); SCALAR_TYPE_MODE > + gcc_assert (aarch64_sve_data_mode (scalar_m, > + simdlen).exists (&vector_mode)); Better to use require () instead, since gcc_asserts can be compiled out. > + type = build_vector_type_for_mode (type, vector_mode); > + if (is_mask) > + { > + type = truth_type_for (type); > + num_pr = 1; > + } > + else > + num_zr = 1; > + > + aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, > + NULL); The comment from my previous review still stands: Before adding the attribute, I think we should call: type = build_distinct_type_copy (type); so that we don't change a type that is already in use, or associate any new types with this one. I think it'd also be worth adding a comment to say why we take this approach instead of reusing ACLE types.
> + return type; > +} > + > /* Implement TARGET_SIMD_CLONE_ADJUST. */ > > static void > aarch64_simd_clone_adjust (struct cgraph_node *node) > { > - /* Add aarch64_vector_pcs target attribute to SIMD clones so they > - use the correct ABI. */ > - > tree t = TREE_TYPE (node->decl); > - TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default", > - TYPE_ATTRIBUTES (t)); > + cl_target_option cur_target; > + bool m_old_have_regs_of_mode[MAX_MACHINE_MODE]; > + > + if (node->simdclone->vecsize_mangle == 's') > + { > + tree target = build_string (strlen ("+sve"), "+sve"); Probably worth adding a comment here to say (as you noted in the reply to the last review) that this is additive and has no effect if SVE (or higher) is already enabled. > + aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0); I still think it'd be better to assert that this succeeds (via a gcc_unreachable). It looks weird to call a _p function and not test the result. > + cl_target_option_save (&cur_target, &global_options, &global_options_set); > + tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl); > + cl_target_option_restore (&global_options, &global_options_set, > + TREE_TARGET_OPTION (new_target)); > + aarch64_override_options_internal (&global_options); > + memcpy (m_old_have_regs_of_mode, have_regs_of_mode, > + sizeof (have_regs_of_mode)); > + for (int i = 0; i < NUM_MACHINE_MODES; ++i) > + if (aarch64_sve_mode_p ((machine_mode) i)) > + have_regs_of_mode[i] = true; Sorry, just realised I never replied to your question about the push_cfun/pop_cfun suggestion. I think the function we'd push is node->decl, i.e. the one that received the +sve target attribute. I.e. could we do: push_cfun (node->decl); after aarch64_option_valid_attribute_p and skip the rest? Then do pop_cfun as the restoration step. 
Does the above work with the: /* If what we're processing is the current pragma string then the target option node is already stored in target_option_current_node by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid having to re-parse the string. This is especially useful to keep arm_neon.h compile times down since that header contains a lot of intrinsics enclosed in pragmas. */ if (!existing_target && args == current_target_pragma) shortcut in aarch64_override_options_internal? I have no particular reason to believe that it wouldn't, just wanted to check... > + } > + else > + { > + /* Add aarch64_vector_pcs target attribute to SIMD clones so they > + use the correct ABI. */ > + TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default", > + TYPE_ATTRIBUTES (t)); > + } > + cgraph_simd_clone *sc = node->simdclone; > + > + for (unsigned i = 0; i < sc->nargs; ++i) > + { > + bool is_mask = false; > + tree type; > + switch (sc->args[i].arg_type) > + { > + case SIMD_CLONE_ARG_TYPE_MASK: > + is_mask = true; > + gcc_fallthrough (); > + case SIMD_CLONE_ARG_TYPE_VECTOR: > + case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP: > + case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP: > + type = sc->args[i].vector_type; > + gcc_assert (VECTOR_TYPE_P (type)); > + if (node->simdclone->vecsize_mangle == 's') > + type = simd_clone_adjust_sve_vector_type (type, is_mask, > + sc->simdlen); > + else if (is_mask) > + type = truth_type_for (type); > + sc->args[i].vector_type = type; Probably best to add a break here (or a fall-through if you prefer). > + default: > + continue; Nit: over-indented continue. But it might as well be a break. > + } > + } > + if (node->simdclone->vecsize_mangle == 's') > + { > + tree ret_type = TREE_TYPE (t); > + if (VECTOR_TYPE_P (ret_type)) > + TREE_TYPE (t) > + = simd_clone_adjust_sve_vector_type (ret_type, false, > + node->simdclone->simdlen); > + /* Restore current options. 
*/ > + cl_target_option_restore (&global_options, &global_options_set, &cur_target); > + aarch64_override_options_internal (&global_options); > + memcpy (have_regs_of_mode, m_old_have_regs_of_mode, > + sizeof (have_regs_of_mode)); > + } > } > > /* Implement TARGET_SIMD_CLONE_USABLE. */ > @@ -28705,6 +28827,10 @@ aarch64_simd_clone_usable (struct cgraph_node *node, stmt_vec_info stmt_vinfo) > && aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo)))) > return -1; > return 0; > + case 's': > + if (!TARGET_SVE) > + return -1; > + return 0; > default: > gcc_unreachable (); > } > diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc > index 864586207ee..066b6217253 100644 > --- a/gcc/omp-simd-clone.cc > +++ b/gcc/omp-simd-clone.cc > @@ -541,9 +541,12 @@ simd_clone_mangle (struct cgraph_node *node, > pp_string (&pp, "_ZGV"); > pp_character (&pp, vecsize_mangle); > pp_character (&pp, mask); > - /* For now, simdlen is always constant, while variable simdlen pp 'n'. */ > - unsigned int len = simdlen.to_constant (); > - pp_decimal_int (&pp, (len)); > + > + unsigned long long len = 0; unsigned HOST_WIDE_INT > + if (simdlen.is_constant (&len)) > + pp_decimal_int (&pp, (int) (len)); > + else > + pp_character (&pp, 'x'); > > for (n = 0; n < clone_info->nargs; ++n) > { > @@ -1533,8 +1536,8 @@ simd_clone_adjust (struct cgraph_node *node) > below). */ > loop = alloc_loop (); > cfun->has_force_vectorize_loops = true; > - /* For now, simlen is always constant. */ > - loop->safelen = node->simdclone->simdlen.to_constant (); > + /* We can assert that safelen is the 'minimum' simdlen. 
*/ > + loop->safelen = constant_lower_bound (node->simdclone->simdlen); > loop->force_vectorize = true; > loop->header = body_bb; > } > diff --git a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c > index e3668893afe..2b71869787e 100644 > --- a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c > +++ b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c > @@ -1,6 +1,6 @@ > -/* { dg-do compile { target vect_simd_clones } } */ > +/* { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */ > /* { dg-additional-options "-fdump-tree-gimple -fdump-tree-optimized" } */ > -/* { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } */ > +/* { dg-additional-options "-mno-sse3" } */ Please get Jakub's OK for this part. Similarly for the Fortran test. > > int f01 (int); > int f02 (int); > @@ -15,15 +15,13 @@ int > test1 (int x) > { > /* At gimplification time, we can't decide yet which function to call. */ > - /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target { !aarch64*-*-* } } } } */ > + /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */ > /* After simd clones are created, the original non-clone test1 shall > call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones > shall call f01 with score 8. 
*/ > /* { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } */ > - /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } */ > - /* { dg-final { scan-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } */ > - /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } */ > - /* { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } */ > + /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" } } */ > + /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" } } */ > int a = f04 (x); > int b = f04 (x); > return a + b; This part I feel safer with :) > diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c > index e2e80f0c663..2f4d3a866e5 100644 > --- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c > +++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c > @@ -43,6 +43,7 @@ float f04 (double a) > } > /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */ > /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */ > +/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */ > > #pragma omp declare simd uniform(a) linear (b) > void f05 (short a, short *b, short c) > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c > new file mode 100644 > index 00000000000..71fd361acec > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c > @@ -0,0 +1,52 @@ > +/* { dg-do compile } */ > +/* { dg-options "-std=c99" } */ > +/* { dg-additional-options "-O3 -march=armv8-a+sve -mcpu=neoverse-n2" } */ > +extern int __attribute__ ((simd, const)) fn0 (int); > + > +void test_fn0 (int *a, int *b, int n) > +{ > + for (int i = 0; i < n; ++i) > + a[i] += fn0 (b[i]); > +} > + > +/* { dg-final { scan-assembler {_ZGVsMxv_fn0} } } */ > + > +extern int __attribute__ ((simd, const)) fn1 
(short, int); > + > +void test_fn1 (int *a, int *b, short *c, int n) > +{ > + for (int i = 0; i < n; ++i) > + a[i] = fn1 (c[i], b[i]); > +} > + > +/* { dg-final { scan-assembler {_ZGVsMxvv_fn1} } } */ > + > +extern short __attribute__ ((simd, const)) fn2 (short, int); > + > +void test_fn2 (short *a, int *b, short *c, int n) > +{ > + for (int i = 0; i < n; ++i) > + a[i] = fn2 (c[i], b[i]); > +} > + > +/* { dg-final { scan-assembler {_ZGVsMxvv_fn2} } } */ > + > +extern char __attribute__ ((simd, const)) fn3 (int, char); > + > +void test_fn3 (int *a, int *b, char *c, int n) > +{ > + for (int i = 0; i < n; ++i) > + a[i] = (int) (fn3 (b[i], c[i]) + c[i]); > +} > + > +/* { dg-final { scan-assembler {_ZGVsMxvv_fn3} } } */ > + > +extern short __attribute__ ((simd, const)) fn4 (int, short); > + > +void test_fn4 (int *a, int *b, short *c, int n) > +{ > + for (int i = 0; i < n; ++i) > + a[i] = (int) (fn4 (b[i], c[i]) + c[i]); > +} > + > +/* { dg-final { scan-assembler {_ZGVsMxvv_fn4} } } */ It'd be nice to have some more specific testing here. Although there are 5 different signatures, the last 4 are interchangeable as far as the test goes. E.g. maybe it would be possible to have some partial check-function-bodies tests that match the inner loop. Do we use extending loads for the unpacked vectors? (Hope so.) Thanks, Richard > diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 > index 6319df0558f..3c7d093c5c6 100644 > --- a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 > +++ b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 > @@ -1,6 +1,6 @@ > -! { dg-do compile { target vect_simd_clones } } > +! { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */ > ! { dg-additional-options "-O0 -fdump-tree-gimple -fdump-tree-optimized" } > -! { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } > +! 
{ dg-additional-options "-mno-sse3" } > > module main > implicit none > @@ -41,7 +41,7 @@ contains > ! shall call f01 with score 8. > ! { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } > ! { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } > - ! { dg-final { scan-tree-dump-times "f03 \\\(x" 6 "optimized" { target { aarch64*-*-* } } } } > + ! { dg-final { scan-tree-dump-times "f03 \\\(x" 8 "optimized" { target { aarch64*-*-* } } } } > ! { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } > ! { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } > a = f04 (x)
On 2/1/24 21:59, Richard Sandiford wrote: > Andre Vieira <andre.simoesdiasvieira@arm.com> writes: >> This patch finalizes adding support for the generation of SVE simd clones when >> no simdlen is provided, following the ABI rules where the widest data type >> determines the minimum amount of elements in a length agnostic vector. >> >> gcc/ChangeLog: >> >> * config/aarch64/aarch64-protos.h (add_sve_type_attribute): Declare. >> * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): Make >> visibility global and support use for non_acle types. >> * config/aarch64/aarch64.cc >> (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd clone >> when no simdlen is provided, according to ABI rules. >> (simd_clone_adjust_sve_vector_type): New helper function. >> (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones >> and modify types to use SVE types. >> * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen. >> (simd_clone_adjust): Adapt safelen check to be compatible with VLA >> simdlen. >> >> gcc/testsuite/ChangeLog: >> >> * c-c++-common/gomp/declare-variant-14.c: Make i?86 and x86_64 target >> only test. >> * gfortran.dg/gomp/declare-variant-14.f90: Likewise. >> * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan. >> * gcc.target/aarch64/vect-simd-clone-1.c: New test. 
>> >> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h >> index a0b142e0b94..207396de0ff 100644 >> --- a/gcc/config/aarch64/aarch64-protos.h >> +++ b/gcc/config/aarch64/aarch64-protos.h >> @@ -1031,6 +1031,8 @@ namespace aarch64_sve { >> #ifdef GCC_TARGET_H >> bool verify_type_context (location_t, type_context_kind, const_tree, bool); >> #endif >> + void add_sve_type_attribute (tree, unsigned int, unsigned int, >> + const char *, const char *); >> } >> >> extern void aarch64_split_combinev16qi (rtx operands[3]); >> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc >> index 11f5c5c500c..747131e684e 100644 >> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc >> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc >> @@ -953,14 +953,16 @@ static bool reported_missing_registers_p; >> /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors >> and NUM_PR SVE predicates. MANGLED_NAME, if nonnull, is the ABI-defined >> mangling of the type. ACLE_NAME is the <arm_sve.h> name of the type. */ >> -static void >> +void >> add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr, >> const char *mangled_name, const char *acle_name) >> { >> tree mangled_name_tree >> = (mangled_name ? get_identifier (mangled_name) : NULL_TREE); >> + tree acle_name_tree >> + = (acle_name ? 
get_identifier (acle_name) : NULL_TREE); >> >> - tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE); >> + tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE); >> value = tree_cons (NULL_TREE, mangled_name_tree, value); >> value = tree_cons (NULL_TREE, size_int (num_pr), value); >> value = tree_cons (NULL_TREE, size_int (num_zr), value); >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc >> index 31617510160..cba8879ab33 100644 >> --- a/gcc/config/aarch64/aarch64.cc >> +++ b/gcc/config/aarch64/aarch64.cc >> @@ -28527,7 +28527,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, >> int num, bool explicit_p) >> { >> tree t, ret_type; >> - unsigned int nds_elt_bits; >> + unsigned int nds_elt_bits, wds_elt_bits; >> unsigned HOST_WIDE_INT const_simdlen; >> >> if (!TARGET_SIMD) >> @@ -28572,10 +28572,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, >> if (TREE_CODE (ret_type) != VOID_TYPE) >> { >> nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type); >> + wds_elt_bits = nds_elt_bits; >> vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits)); >> } >> else >> - nds_elt_bits = POINTER_SIZE; >> + { >> + nds_elt_bits = POINTER_SIZE; >> + wds_elt_bits = 0; >> + } >> >> int i; >> tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl)); >> @@ -28583,44 +28587,72 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, >> for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0; >> t && t != void_list_node; t = TREE_CHAIN (t), i++) >> { >> - tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t); >> + tree type = decl_arg_p ? 
TREE_TYPE (t) : TREE_VALUE (t); >> if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM >> - && !supported_simd_type (arg_type)) >> + && !supported_simd_type (type)) >> { >> if (!explicit_p) >> ; >> - else if (COMPLEX_FLOAT_TYPE_P (ret_type)) >> + else if (COMPLEX_FLOAT_TYPE_P (type)) >> warning_at (DECL_SOURCE_LOCATION (node->decl), 0, >> "GCC does not currently support argument type %qT " >> - "for simd", arg_type); >> + "for simd", type); >> else >> warning_at (DECL_SOURCE_LOCATION (node->decl), 0, >> "unsupported argument type %qT for simd", >> - arg_type); >> + type); >> return 0; >> } >> - unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type); >> + unsigned lane_bits = lane_size (clonei->args[i].arg_type, type); >> if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR) >> - vec_elts.safe_push (std::make_pair (arg_type, lane_bits)); >> + vec_elts.safe_push (std::make_pair (type, lane_bits)); >> if (nds_elt_bits > lane_bits) >> nds_elt_bits = lane_bits; >> + if (wds_elt_bits < lane_bits) >> + wds_elt_bits = lane_bits; >> } >> >> - clonei->vecsize_mangle = 'n'; >> + /* If we could not determine the WDS type from available parameters/return, >> + then fallback to using uintptr_t. */ >> + if (wds_elt_bits == 0) >> + wds_elt_bits = POINTER_SIZE; >> + >> clonei->mask_mode = VOIDmode; >> poly_uint64 simdlen; >> - auto_vec<poly_uint64> simdlens (2); >> + auto_vec<poly_uint64> simdlens (3); >> + auto_vec<char> simdmangle (3); > > Minor, but I think it'd be neater to use an ad-hoc structure that > contains the mangling prefix and simdlen together, so that only one > vector is needed. Brace initialization should make it a bit shorter too. > >> /* Keep track of the possible simdlens the clones of this function can have, >> and check them later to see if we support them. 
*/ >> if (known_eq (clonei->simdlen, 0U)) >> { >> simdlen = exact_div (poly_uint64 (64), nds_elt_bits); >> if (maybe_ne (simdlen, 1U)) >> - simdlens.safe_push (simdlen); >> + { >> + simdlens.safe_push (simdlen); >> + simdmangle.safe_push ('n'); >> + } >> simdlens.safe_push (simdlen * 2); >> + simdmangle.safe_push ('n'); >> + /* Only create a SVE simd clone if we aren't dealing with an unprototyped >> + function. >> + We have also disabled support for creating SVE simdclones for functions >> + with function bodies and any simdclones when -msve-vector-bits is used. >> + TODO: add support for these. */ >> + if ((DECL_ARGUMENTS (node->decl) != 0 >> + || type_arg_types != 0) > > I think my comment from the previous review still stands: > > This check feels a bit indirect. Does it work to use: > > if (prototype_p (TREE_TYPE (node->decl))) > > instead? > > Or does that not work? > >> + && !node->definition >> + && !aarch64_sve_vg.is_constant ()) >> + { >> + poly_uint64 sve_simdlen = aarch64_sve_vg * 64; >> + simdlens.safe_push (exact_div (sve_simdlen, wds_elt_bits)); > > Simpler as: > > simdlens.safe_push (exact_div (BITS_PER_SVE_VECTOR, wds_elt_bits)); > >> + simdmangle.safe_push ('s'); >> + } >> } >> else >> - simdlens.safe_push (clonei->simdlen); >> + { >> + simdlens.safe_push (clonei->simdlen); >> + simdmangle.safe_push ('n'); >> + } >> >> clonei->vecsize_int = 0; >> clonei->vecsize_float = 0; >> @@ -28638,7 +28670,8 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, >> { >> bool remove_simdlen = false; >> for (auto elt : vec_elts) >> - if (known_gt (simdlens[j] * elt.second, 128U)) >> + if (simdmangle[j] == 'n' >> + && known_gt (simdlens[j] * elt.second, 128U)) >> { >> /* Don't issue a warning for every simdclone when there is no >> specific simdlen clause. 
*/ >> @@ -28651,12 +28684,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, >> break; >> } >> if (remove_simdlen) >> - simdlens.ordered_remove (j); >> + { >> + simdlens.ordered_remove (j); >> + simdmangle.ordered_remove (j); >> + } >> else >> j++; >> } >> >> - >> int count = simdlens.length (); >> if (count == 0) >> { >> @@ -28675,20 +28710,107 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, >> >> gcc_assert (num < count); >> clonei->simdlen = simdlens[num]; >> + clonei->vecsize_mangle = simdmangle[num]; >> + /* SVE simdclones always have a Mask, so set inbranch to 1. */ >> + if (clonei->vecsize_mangle == 's') >> + clonei->inbranch = 1; >> return count; >> } >> >> +static tree >> +simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen) >> +{ >> + unsigned int num_zr = 0; > > From the previous review: > > Nits: missing function comment. The body is indented by too many columns. > >> + unsigned int num_pr = 0; >> + machine_mode vector_mode; >> + type = TREE_TYPE (type); >> + scalar_mode scalar_m = as_a <scalar_mode> (TYPE_MODE (type)); > > SCALAR_TYPE_MODE > >> + gcc_assert (aarch64_sve_data_mode (scalar_m, >> + simdlen).exists (&vector_mode)); > > Better to use require () instead, since gcc_asserts can be compiled out. > >> + type = build_vector_type_for_mode (type, vector_mode); >> + if (is_mask) >> + { >> + type = truth_type_for (type); >> + num_pr = 1; >> + } >> + else >> + num_zr = 1; >> + >> + aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, >> + NULL); > > The comment from my previous review still stands: > > Before adding the atttribute, I think we should call: > > type = build_distinct_type_copy (type); > > so that we don't change a type that is already in use, or associate > any new types with this one. > > I think it'd also be worth adding a comment to say why we take this > approach instead of reusing ACLE types. 
(The reason being that we need > to handle unpacked vectors as well, which the ACLE doesn't provide.) > >> + return type; >> +} >> + >> /* Implement TARGET_SIMD_CLONE_ADJUST. */ >> >> static void >> aarch64_simd_clone_adjust (struct cgraph_node *node) >> { >> - /* Add aarch64_vector_pcs target attribute to SIMD clones so they >> - use the correct ABI. */ >> - >> tree t = TREE_TYPE (node->decl); >> - TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default", >> - TYPE_ATTRIBUTES (t)); >> + cl_target_option cur_target; >> + bool m_old_have_regs_of_mode[MAX_MACHINE_MODE]; >> + >> + if (node->simdclone->vecsize_mangle == 's') >> + { >> + tree target = build_string (strlen ("+sve"), "+sve"); > > Probably worth adding a comment here to say (as you noted in the reply > to the last review) that this is additive and has no effect if SVE (or > higher) is already enabled. > >> + aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0); > > I still think it'd be better to assert that this succeeds (via > a gcc_unreachable). It looks weird to call a _p function and not test > the result. > >> + cl_target_option_save (&cur_target, &global_options, &global_options_set); >> + tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl); >> + cl_target_option_restore (&global_options, &global_options_set, >> + TREE_TARGET_OPTION (new_target)); >> + aarch64_override_options_internal (&global_options); >> + memcpy (m_old_have_regs_of_mode, have_regs_of_mode, >> + sizeof (have_regs_of_mode)); >> + for (int i = 0; i < NUM_MACHINE_MODES; ++i) >> + if (aarch64_sve_mode_p ((machine_mode) i)) >> + have_regs_of_mode[i] = true; > > Sorry, just realised I never replied to your question about the > push_cfun/pop_cfun suggestion. I think the function we'd push is > node->decl, i.e. the one that received the +sve target attribute. > > I.e. could we do: > > push_cfun (node->decl); > > after aarch64_option_valid_attribute_p and skip the rest? 
Then do > pop_cfun as the restoration step. On this matter, I will just quote Andre verbatim, as he's correct: push_cfun expects a function structure and DECL_STRUCT_FUNCTION (node->decl) returns 0 for functions without function bodies, so I can't use it here. > > Does the above work with the: > > /* If what we're processing is the current pragma string then the > target option node is already stored in target_option_current_node > by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid > having to re-parse the string. This is especially useful to keep > arm_neon.h compile times down since that header contains a lot > of intrinsics enclosed in pragmas. */ > if (!existing_target && args == current_target_pragma) > > shortcut in aarch64_override_options_internal? I have no particular > reason to believe that it wouldn't, just wanted to check... I'm afraid this won't work for a few reasons. The first is that args is a STRING_CST while current_target_pragma is a TREE_LIST. We could, in principle, fix the issue as follows: else if (TREE_CODE (args) == STRING_CST && list_length (current_target_pragma) == 1) { if (TREE_STRING_LENGTH (TREE_VALUE (current_target_pragma)) == TREE_STRING_LENGTH (args)) if (!strcmp (TREE_STRING_POINTER (args), TREE_STRING_POINTER (TREE_VALUE (current_target_pragma)))) return true; } But this exposes another issue. `current_target_pragma' gets updated as we parse the source code from top to bottom. Its state coming out of the frontend therefore reflects the last `#pragma GCC target' seen and doesn't appear to change from that point onward in the compilation process. Now suppose for a moment that the last pragma target in the file is "+simd". `pass_omp_simd_clone' then processes our functions in the reverse order (effectively bottom of source code upwards). When it runs the above check, it will always see "+simd", which, after we pass the `#pragma GCC target ("+simd")' statement in bottom-up fashion, will be wrong anyway.
Therefore, it's a good thing that the ` if (!existing_target && args == current_target_pragma)' test always fails. Regards, Victor >> + } >> + else >> + { >> + /* Add aarch64_vector_pcs target attribute to SIMD clones so they >> + use the correct ABI. */ >> + TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default", >> + TYPE_ATTRIBUTES (t)); >> + } >> + cgraph_simd_clone *sc = node->simdclone; >> + >> + for (unsigned i = 0; i < sc->nargs; ++i) >> + { >> + bool is_mask = false; >> + tree type; >> + switch (sc->args[i].arg_type) >> + { >> + case SIMD_CLONE_ARG_TYPE_MASK: >> + is_mask = true; >> + gcc_fallthrough (); >> + case SIMD_CLONE_ARG_TYPE_VECTOR: >> + case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP: >> + case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP: >> + type = sc->args[i].vector_type; >> + gcc_assert (VECTOR_TYPE_P (type)); >> + if (node->simdclone->vecsize_mangle == 's') >> + type = simd_clone_adjust_sve_vector_type (type, is_mask, >> + sc->simdlen); >> + else if (is_mask) >> + type = truth_type_for (type); >> + sc->args[i].vector_type = type; > > Probably best to add a break here (or a fall-through if you prefer). > >> + default: >> + continue; > > Nit: over-indented continue. But it might as well be a break. > >> + } >> + } >> + if (node->simdclone->vecsize_mangle == 's') >> + { >> + tree ret_type = TREE_TYPE (t); >> + if (VECTOR_TYPE_P (ret_type)) >> + TREE_TYPE (t) >> + = simd_clone_adjust_sve_vector_type (ret_type, false, >> + node->simdclone->simdlen); >> + /* Restore current options. */ >> + cl_target_option_restore (&global_options, &global_options_set, &cur_target); >> + aarch64_override_options_internal (&global_options); >> + memcpy (have_regs_of_mode, m_old_have_regs_of_mode, >> + sizeof (have_regs_of_mode)); >> + } >> } >> >> /* Implement TARGET_SIMD_CLONE_USABLE. 
*/ >> @@ -28705,6 +28827,10 @@ aarch64_simd_clone_usable (struct cgraph_node *node, stmt_vec_info stmt_vinfo) >> && aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo)))) >> return -1; >> return 0; >> + case 's': >> + if (!TARGET_SVE) >> + return -1; >> + return 0; >> default: >> gcc_unreachable (); >> } >> diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc >> index 864586207ee..066b6217253 100644 >> --- a/gcc/omp-simd-clone.cc >> +++ b/gcc/omp-simd-clone.cc >> @@ -541,9 +541,12 @@ simd_clone_mangle (struct cgraph_node *node, >> pp_string (&pp, "_ZGV"); >> pp_character (&pp, vecsize_mangle); >> pp_character (&pp, mask); >> - /* For now, simdlen is always constant, while variable simdlen pp 'n'. */ >> - unsigned int len = simdlen.to_constant (); >> - pp_decimal_int (&pp, (len)); >> + >> + unsigned long long len = 0; > > unsigned HOST_WIDE_INT > >> + if (simdlen.is_constant (&len)) >> + pp_decimal_int (&pp, (int) (len)); >> + else >> + pp_character (&pp, 'x'); >> >> for (n = 0; n < clone_info->nargs; ++n) >> { >> @@ -1533,8 +1536,8 @@ simd_clone_adjust (struct cgraph_node *node) >> below). */ >> loop = alloc_loop (); >> cfun->has_force_vectorize_loops = true; >> - /* For now, simlen is always constant. */ >> - loop->safelen = node->simdclone->simdlen.to_constant (); >> + /* We can assert that safelen is the 'minimum' simdlen. 
*/ >> + loop->safelen = constant_lower_bound (node->simdclone->simdlen); >> loop->force_vectorize = true; >> loop->header = body_bb; >> } >> diff --git a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c >> index e3668893afe..2b71869787e 100644 >> --- a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c >> +++ b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c >> @@ -1,6 +1,6 @@ >> -/* { dg-do compile { target vect_simd_clones } } */ >> +/* { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */ >> /* { dg-additional-options "-fdump-tree-gimple -fdump-tree-optimized" } */ >> -/* { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } */ >> +/* { dg-additional-options "-mno-sse3" } */ > > Please get Jakub's OK for this part. Similarly for the Fortran test. > >> >> int f01 (int); >> int f02 (int); >> @@ -15,15 +15,13 @@ int >> test1 (int x) >> { >> /* At gimplification time, we can't decide yet which function to call. */ >> - /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target { !aarch64*-*-* } } } } */ >> + /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */ >> /* After simd clones are created, the original non-clone test1 shall >> call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones >> shall call f01 with score 8. 
*/ >> /* { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } */ >> - /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } */ >> - /* { dg-final { scan-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } */ >> - /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } */ >> - /* { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } */ >> + /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" } } */ >> + /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" } } */ >> int a = f04 (x); >> int b = f04 (x); >> return a + b; > > This part I feel safer with :) > >> diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c >> index e2e80f0c663..2f4d3a866e5 100644 >> --- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c >> +++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c >> @@ -43,6 +43,7 @@ float f04 (double a) >> } >> /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */ >> /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */ >> +/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */ >> >> #pragma omp declare simd uniform(a) linear (b) >> void f05 (short a, short *b, short c) >> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c >> new file mode 100644 >> index 00000000000..71fd361acec >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c >> @@ -0,0 +1,52 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-std=c99" } */ >> +/* { dg-additional-options "-O3 -march=armv8-a+sve -mcpu=neoverse-n2" } */ >> +extern int __attribute__ ((simd, const)) fn0 (int); >> + >> +void test_fn0 (int *a, int *b, int n) >> +{ >> + for (int i = 0; i < n; ++i) >> + a[i] += fn0 (b[i]); >> +} >> + >> +/* { dg-final { scan-assembler {_ZGVsMxv_fn0} } } */ >> + >> 
+extern int __attribute__ ((simd, const)) fn1 (short, int); >> + >> +void test_fn1 (int *a, int *b, short *c, int n) >> +{ >> + for (int i = 0; i < n; ++i) >> + a[i] = fn1 (c[i], b[i]); >> +} >> + >> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn1} } } */ >> + >> +extern short __attribute__ ((simd, const)) fn2 (short, int); >> + >> +void test_fn2 (short *a, int *b, short *c, int n) >> +{ >> + for (int i = 0; i < n; ++i) >> + a[i] = fn2 (c[i], b[i]); >> +} >> + >> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn2} } } */ >> + >> +extern char __attribute__ ((simd, const)) fn3 (int, char); >> + >> +void test_fn3 (int *a, int *b, char *c, int n) >> +{ >> + for (int i = 0; i < n; ++i) >> + a[i] = (int) (fn3 (b[i], c[i]) + c[i]); >> +} >> + >> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn3} } } */ >> + >> +extern short __attribute__ ((simd, const)) fn4 (int, short); >> + >> +void test_fn4 (int *a, int *b, short *c, int n) >> +{ >> + for (int i = 0; i < n; ++i) >> + a[i] = (int) (fn4 (b[i], c[i]) + c[i]); >> +} >> + >> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn4} } } */ > > It'd be nice to have some more specific testing here. Although there > are 5 different signatures, the last 4 are interchangeable as far as > the test goes. E.g. maybe it would be possible to have some partial > check-function-bodies tests that match the inner loop. Do we > use extending loads for the unpacked vectors? (Hope so.) > > Thanks, > Richard > >> diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 >> index 6319df0558f..3c7d093c5c6 100644 >> --- a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 >> +++ b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 >> @@ -1,6 +1,6 @@ >> -! { dg-do compile { target vect_simd_clones } } >> +! { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */ >> ! { dg-additional-options "-O0 -fdump-tree-gimple -fdump-tree-optimized" } >> -! 
{ dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } >> +! { dg-additional-options "-mno-sse3" } >> >> module main >> implicit none >> @@ -41,7 +41,7 @@ contains >> ! shall call f01 with score 8. >> ! { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } >> ! { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } >> - ! { dg-final { scan-tree-dump-times "f03 \\\(x" 6 "optimized" { target { aarch64*-*-* } } } } >> + ! { dg-final { scan-tree-dump-times "f03 \\\(x" 8 "optimized" { target { aarch64*-*-* } } } } >> ! { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } >> ! { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } >> a = f04 (x)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index a0b142e0b94..207396de0ff 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1031,6 +1031,8 @@ namespace aarch64_sve { #ifdef GCC_TARGET_H bool verify_type_context (location_t, type_context_kind, const_tree, bool); #endif + void add_sve_type_attribute (tree, unsigned int, unsigned int, + const char *, const char *); } extern void aarch64_split_combinev16qi (rtx operands[3]); diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc index 11f5c5c500c..747131e684e 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -953,14 +953,16 @@ static bool reported_missing_registers_p; /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors and NUM_PR SVE predicates. MANGLED_NAME, if nonnull, is the ABI-defined mangling of the type. ACLE_NAME is the <arm_sve.h> name of the type. */ -static void +void add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr, const char *mangled_name, const char *acle_name) { tree mangled_name_tree = (mangled_name ? get_identifier (mangled_name) : NULL_TREE); + tree acle_name_tree + = (acle_name ? 
get_identifier (acle_name) : NULL_TREE); - tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE); + tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE); value = tree_cons (NULL_TREE, mangled_name_tree, value); value = tree_cons (NULL_TREE, size_int (num_pr), value); value = tree_cons (NULL_TREE, size_int (num_zr), value); diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 31617510160..cba8879ab33 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -28527,7 +28527,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, int num, bool explicit_p) { tree t, ret_type; - unsigned int nds_elt_bits; + unsigned int nds_elt_bits, wds_elt_bits; unsigned HOST_WIDE_INT const_simdlen; if (!TARGET_SIMD) @@ -28572,10 +28572,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, if (TREE_CODE (ret_type) != VOID_TYPE) { nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type); + wds_elt_bits = nds_elt_bits; vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits)); } else - nds_elt_bits = POINTER_SIZE; + { + nds_elt_bits = POINTER_SIZE; + wds_elt_bits = 0; + } int i; tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl)); @@ -28583,44 +28587,72 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0; t && t != void_list_node; t = TREE_CHAIN (t), i++) { - tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t); + tree type = decl_arg_p ? 
TREE_TYPE (t) : TREE_VALUE (t); if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM - && !supported_simd_type (arg_type)) + && !supported_simd_type (type)) { if (!explicit_p) ; - else if (COMPLEX_FLOAT_TYPE_P (ret_type)) + else if (COMPLEX_FLOAT_TYPE_P (type)) warning_at (DECL_SOURCE_LOCATION (node->decl), 0, "GCC does not currently support argument type %qT " - "for simd", arg_type); + "for simd", type); else warning_at (DECL_SOURCE_LOCATION (node->decl), 0, "unsupported argument type %qT for simd", - arg_type); + type); return 0; } - unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type); + unsigned lane_bits = lane_size (clonei->args[i].arg_type, type); if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR) - vec_elts.safe_push (std::make_pair (arg_type, lane_bits)); + vec_elts.safe_push (std::make_pair (type, lane_bits)); if (nds_elt_bits > lane_bits) nds_elt_bits = lane_bits; + if (wds_elt_bits < lane_bits) + wds_elt_bits = lane_bits; } - clonei->vecsize_mangle = 'n'; + /* If we could not determine the WDS type from available parameters/return, + then fallback to using uintptr_t. */ + if (wds_elt_bits == 0) + wds_elt_bits = POINTER_SIZE; + clonei->mask_mode = VOIDmode; poly_uint64 simdlen; - auto_vec<poly_uint64> simdlens (2); + auto_vec<poly_uint64> simdlens (3); + auto_vec<char> simdmangle (3); /* Keep track of the possible simdlens the clones of this function can have, and check them later to see if we support them. */ if (known_eq (clonei->simdlen, 0U)) { simdlen = exact_div (poly_uint64 (64), nds_elt_bits); if (maybe_ne (simdlen, 1U)) - simdlens.safe_push (simdlen); + { + simdlens.safe_push (simdlen); + simdmangle.safe_push ('n'); + } simdlens.safe_push (simdlen * 2); + simdmangle.safe_push ('n'); + /* Only create a SVE simd clone if we aren't dealing with an unprototyped + function. + We have also disabled support for creating SVE simdclones for functions + with function bodies and any simdclones when -msve-vector-bits is used. 
+ TODO: add support for these. */ + if ((DECL_ARGUMENTS (node->decl) != 0 + || type_arg_types != 0) + && !node->definition + && !aarch64_sve_vg.is_constant ()) + { + poly_uint64 sve_simdlen = aarch64_sve_vg * 64; + simdlens.safe_push (exact_div (sve_simdlen, wds_elt_bits)); + simdmangle.safe_push ('s'); + } } else - simdlens.safe_push (clonei->simdlen); + { + simdlens.safe_push (clonei->simdlen); + simdmangle.safe_push ('n'); + } clonei->vecsize_int = 0; clonei->vecsize_float = 0; @@ -28638,7 +28670,8 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, { bool remove_simdlen = false; for (auto elt : vec_elts) - if (known_gt (simdlens[j] * elt.second, 128U)) + if (simdmangle[j] == 'n' + && known_gt (simdlens[j] * elt.second, 128U)) { /* Don't issue a warning for every simdclone when there is no specific simdlen clause. */ @@ -28651,12 +28684,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, break; } if (remove_simdlen) - simdlens.ordered_remove (j); + { + simdlens.ordered_remove (j); + simdmangle.ordered_remove (j); + } else j++; } - int count = simdlens.length (); if (count == 0) { @@ -28675,20 +28710,107 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, gcc_assert (num < count); clonei->simdlen = simdlens[num]; + clonei->vecsize_mangle = simdmangle[num]; + /* SVE simdclones always have a Mask, so set inbranch to 1. 
*/ + if (clonei->vecsize_mangle == 's') + clonei->inbranch = 1; return count; } +static tree +simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen) +{ + unsigned int num_zr = 0; + unsigned int num_pr = 0; + machine_mode vector_mode; + type = TREE_TYPE (type); + scalar_mode scalar_m = as_a <scalar_mode> (TYPE_MODE (type)); + gcc_assert (aarch64_sve_data_mode (scalar_m, + simdlen).exists (&vector_mode)); + type = build_vector_type_for_mode (type, vector_mode); + if (is_mask) + { + type = truth_type_for (type); + num_pr = 1; + } + else + num_zr = 1; + + aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, + NULL); + return type; +} + /* Implement TARGET_SIMD_CLONE_ADJUST. */ static void aarch64_simd_clone_adjust (struct cgraph_node *node) { - /* Add aarch64_vector_pcs target attribute to SIMD clones so they - use the correct ABI. */ - tree t = TREE_TYPE (node->decl); - TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default", - TYPE_ATTRIBUTES (t)); + cl_target_option cur_target; + bool m_old_have_regs_of_mode[MAX_MACHINE_MODE]; + + if (node->simdclone->vecsize_mangle == 's') + { + tree target = build_string (strlen ("+sve"), "+sve"); + aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0); + cl_target_option_save (&cur_target, &global_options, &global_options_set); + tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl); + cl_target_option_restore (&global_options, &global_options_set, + TREE_TARGET_OPTION (new_target)); + aarch64_override_options_internal (&global_options); + memcpy (m_old_have_regs_of_mode, have_regs_of_mode, + sizeof (have_regs_of_mode)); + for (int i = 0; i < NUM_MACHINE_MODES; ++i) + if (aarch64_sve_mode_p ((machine_mode) i)) + have_regs_of_mode[i] = true; + } + else + { + /* Add aarch64_vector_pcs target attribute to SIMD clones so they + use the correct ABI. 
*/ + TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default", + TYPE_ATTRIBUTES (t)); + } + cgraph_simd_clone *sc = node->simdclone; + + for (unsigned i = 0; i < sc->nargs; ++i) + { + bool is_mask = false; + tree type; + switch (sc->args[i].arg_type) + { + case SIMD_CLONE_ARG_TYPE_MASK: + is_mask = true; + gcc_fallthrough (); + case SIMD_CLONE_ARG_TYPE_VECTOR: + case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP: + case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP: + type = sc->args[i].vector_type; + gcc_assert (VECTOR_TYPE_P (type)); + if (node->simdclone->vecsize_mangle == 's') + type = simd_clone_adjust_sve_vector_type (type, is_mask, + sc->simdlen); + else if (is_mask) + type = truth_type_for (type); + sc->args[i].vector_type = type; + default: + continue; + } + } + if (node->simdclone->vecsize_mangle == 's') + { + tree ret_type = TREE_TYPE (t); + if (VECTOR_TYPE_P (ret_type)) + TREE_TYPE (t) + = simd_clone_adjust_sve_vector_type (ret_type, false, + node->simdclone->simdlen); + /* Restore current options. */ + cl_target_option_restore (&global_options, &global_options_set, &cur_target); + aarch64_override_options_internal (&global_options); + memcpy (have_regs_of_mode, m_old_have_regs_of_mode, + sizeof (have_regs_of_mode)); + } } /* Implement TARGET_SIMD_CLONE_USABLE. */ @@ -28705,6 +28827,10 @@ aarch64_simd_clone_usable (struct cgraph_node *node, stmt_vec_info stmt_vinfo) && aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo)))) return -1; return 0; + case 's': + if (!TARGET_SVE) + return -1; + return 0; default: gcc_unreachable (); } diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc index 864586207ee..066b6217253 100644 --- a/gcc/omp-simd-clone.cc +++ b/gcc/omp-simd-clone.cc @@ -541,9 +541,12 @@ simd_clone_mangle (struct cgraph_node *node, pp_string (&pp, "_ZGV"); pp_character (&pp, vecsize_mangle); pp_character (&pp, mask); - /* For now, simdlen is always constant, while variable simdlen pp 'n'. 
*/ - unsigned int len = simdlen.to_constant (); - pp_decimal_int (&pp, (len)); + + unsigned long long len = 0; + if (simdlen.is_constant (&len)) + pp_decimal_int (&pp, (int) (len)); + else + pp_character (&pp, 'x'); for (n = 0; n < clone_info->nargs; ++n) { @@ -1533,8 +1536,8 @@ simd_clone_adjust (struct cgraph_node *node) below). */ loop = alloc_loop (); cfun->has_force_vectorize_loops = true; - /* For now, simlen is always constant. */ - loop->safelen = node->simdclone->simdlen.to_constant (); + /* We can assert that safelen is the 'minimum' simdlen. */ + loop->safelen = constant_lower_bound (node->simdclone->simdlen); loop->force_vectorize = true; loop->header = body_bb; } diff --git a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c index e3668893afe..2b71869787e 100644 --- a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c +++ b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c @@ -1,6 +1,6 @@ -/* { dg-do compile { target vect_simd_clones } } */ +/* { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */ /* { dg-additional-options "-fdump-tree-gimple -fdump-tree-optimized" } */ -/* { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } */ +/* { dg-additional-options "-mno-sse3" } */ int f01 (int); int f02 (int); @@ -15,15 +15,13 @@ int test1 (int x) { /* At gimplification time, we can't decide yet which function to call. */ - /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target { !aarch64*-*-* } } } } */ + /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */ /* After simd clones are created, the original non-clone test1 shall call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones shall call f01 with score 8. 
*/ /* { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } */ - /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } */ - /* { dg-final { scan-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } */ - /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } */ - /* { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } */ + /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" } } */ + /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" } } */ int a = f04 (x); int b = f04 (x); return a + b; diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c index e2e80f0c663..2f4d3a866e5 100644 --- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c +++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c @@ -43,6 +43,7 @@ float f04 (double a) } /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */ /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */ +/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */ #pragma omp declare simd uniform(a) linear (b) void f05 (short a, short *b, short c) diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c new file mode 100644 index 00000000000..71fd361acec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ +/* { dg-additional-options "-O3 -march=armv8-a+sve -mcpu=neoverse-n2" } */ +extern int __attribute__ ((simd, const)) fn0 (int); + +void test_fn0 (int *a, int *b, int n) +{ + for (int i = 0; i < n; ++i) + a[i] += fn0 (b[i]); +} + +/* { dg-final { scan-assembler {_ZGVsMxv_fn0} } } */ + +extern int __attribute__ ((simd, const)) fn1 (short, int); + +void test_fn1 (int *a, int *b, short *c, int n) +{ + for (int i = 0; i < n; ++i) + a[i] = fn1 
(c[i], b[i]); +} + +/* { dg-final { scan-assembler {_ZGVsMxvv_fn1} } } */ + +extern short __attribute__ ((simd, const)) fn2 (short, int); + +void test_fn2 (short *a, int *b, short *c, int n) +{ + for (int i = 0; i < n; ++i) + a[i] = fn2 (c[i], b[i]); +} + +/* { dg-final { scan-assembler {_ZGVsMxvv_fn2} } } */ + +extern char __attribute__ ((simd, const)) fn3 (int, char); + +void test_fn3 (int *a, int *b, char *c, int n) +{ + for (int i = 0; i < n; ++i) + a[i] = (int) (fn3 (b[i], c[i]) + c[i]); +} + +/* { dg-final { scan-assembler {_ZGVsMxvv_fn3} } } */ + +extern short __attribute__ ((simd, const)) fn4 (int, short); + +void test_fn4 (int *a, int *b, short *c, int n) +{ + for (int i = 0; i < n; ++i) + a[i] = (int) (fn4 (b[i], c[i]) + c[i]); +} + +/* { dg-final { scan-assembler {_ZGVsMxvv_fn4} } } */ diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 index 6319df0558f..3c7d093c5c6 100644 --- a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 +++ b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 @@ -1,6 +1,6 @@ -! { dg-do compile { target vect_simd_clones } } +! { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */ ! { dg-additional-options "-O0 -fdump-tree-gimple -fdump-tree-optimized" } -! { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } +! { dg-additional-options "-mno-sse3" } module main implicit none @@ -41,7 +41,7 @@ contains ! shall call f01 with score 8. ! { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } ! { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } - ! { dg-final { scan-tree-dump-times "f03 \\\(x" 6 "optimized" { target { aarch64*-*-* } } } } + ! { dg-final { scan-tree-dump-times "f03 \\\(x" 8 "optimized" { target { aarch64*-*-* } } } } ! { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } ! 
{ dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } a = f04 (x)