diff mbox series

[8/8] aarch64: Add SVE support for simd clones [PR 96342]

Message ID 25cccf6c-1b3c-a032-7930-aba25a311dca@arm.com
State New
Headers show
Series [1/8] parloops: Copy target and optimizations when creating a function clone | expand

Commit Message

Andre Vieira (lists) Aug. 30, 2023, 9:19 a.m. UTC
This patch finalizes adding support for the generation of SVE simd 
clones when no simdlen is provided, following the ABI rules where the 
widest data type determines the minimum amount of elements in a length 
agnostic vector.

gcc/ChangeLog:

         * config/aarch64/aarch64-protos.h (add_sve_type_attribute): 
Declare.
	* config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): Make
	visibility global.
	* config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
	chosen over SIMD ABI if a SVE type is used in return or arguments.
	(aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd clone
	when no simdlen is provided, according to ABI rules.
	(aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
	(aarch64_simd_clone_adjust_ret_or_param): New.
	(TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
	* omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
	(simd_clone_adjust): Adapt safelen check to be compatible with VLA
	simdlen.

gcc/testsuite/ChangeLog:

	* c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
	* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
	* gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
	longer necessary.
	* gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.

Comments

Andre Vieira (lists) Oct. 18, 2023, 2:41 p.m. UTC | #1
Rebased, no major changes, still needs review.

On 30/08/2023 10:19, Andre Vieira (lists) via Gcc-patches wrote:
> This patch finalizes adding support for the generation of SVE simd 
> clones when no simdlen is provided, following the ABI rules where the 
> widest data type determines the minimum amount of elements in a length 
> agnostic vector.
> 
> gcc/ChangeLog:
> 
>          * config/aarch64/aarch64-protos.h (add_sve_type_attribute): 
> Declare.
>      * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): 
> Make
>      visibility global.
>      * config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
>      chosen over SIMD ABI if a SVE type is used in return or arguments.
>      (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd 
> clone
>      when no simdlen is provided, according to ABI rules.
>      (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
>      (aarch64_simd_clone_adjust_ret_or_param): New.
>      (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
>      * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
>      (simd_clone_adjust): Adapt safelen check to be compatible with VLA
>      simdlen.
> 
> gcc/testsuite/ChangeLog:
> 
>      * c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
>      * gfortran.dg/gomp/declare-variant-14.f90: Likewise.
>      * gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
>      longer necessary.
>      * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..769d637f63724a7f0044f48f3dd683e0fb46049c 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1005,6 +1005,8 @@ namespace aarch64_sve {
 #ifdef GCC_TARGET_H
   bool verify_type_context (location_t, type_context_kind, const_tree, bool);
 #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+			      const char *, const char *);
 }
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -569,14 +569,16 @@ static bool reported_missing_registers_p;
 /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
    and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
    mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
-static void
+void
 add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
 			const char *mangled_name, const char *acle_name)
 {
   tree mangled_name_tree
     = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+    = (acle_name ? get_identifier (acle_name) : NULL_TREE);
 
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
   value = tree_cons (NULL_TREE, mangled_name_tree, value);
   value = tree_cons (NULL_TREE, size_int (num_pr), value);
   value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 37507f091c2a6154fa944c3a9fad6a655ab5d5a1..cb0947b18c6a611d55579b5b08d93f6a4a9c3b2c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4080,13 +4080,13 @@ aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
 static const predefined_function_abi &
 aarch64_fntype_abi (const_tree fntype)
 {
-  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
-    return aarch64_simd_abi ();
-
   if (aarch64_returns_value_in_sve_regs_p (fntype)
       || aarch64_takes_arguments_in_sve_regs_p (fntype))
     return aarch64_sve_abi ();
 
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
+    return aarch64_simd_abi ();
+
   return default_function_abi;
 }
 
@@ -27467,7 +27467,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 					int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
   int count;
   unsigned HOST_WIDE_INT const_simdlen;
 
@@ -27513,10 +27513,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
   if (TREE_CODE (ret_type) != VOID_TYPE)
     {
       nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
+      wds_elt_bits = nds_elt_bits;
       vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
     }
   else
-    nds_elt_bits = POINTER_SIZE;
+    {
+      nds_elt_bits = POINTER_SIZE;
+      wds_elt_bits = 0;
+    }
 
   int i;
   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
@@ -27524,30 +27528,36 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
   for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
        t && t != void_list_node; t = TREE_CHAIN (t), i++)
     {
-      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
+      tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
       if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
-	  && !supported_simd_type (arg_type))
+	  && !supported_simd_type (type))
 	{
 	  if (!explicit_p)
 	    ;
-	  else if (COMPLEX_FLOAT_TYPE_P (ret_type))
+	  else if (COMPLEX_FLOAT_TYPE_P (type))
 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
 			"GCC does not currently support argument type %qT "
-			"for simd", arg_type);
+			"for simd", type);
 	  else
 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
 			"unsupported argument type %qT for simd",
-			arg_type);
+			type);
 	  return 0;
 	}
-      unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
+      unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
       if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
-	vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
+	vec_elts.safe_push (std::make_pair (type, lane_bits));
       if (nds_elt_bits > lane_bits)
 	nds_elt_bits = lane_bits;
+      else if (wds_elt_bits < lane_bits)
+	wds_elt_bits = lane_bits;
     }
 
-  clonei->vecsize_mangle = 'n';
+  /* If we could not determine the WDS type from available parameters/return,
+     then fallback to using uintptr_t.  */
+  if (wds_elt_bits == 0)
+    wds_elt_bits = POINTER_SIZE;
+
   clonei->mask_mode = VOIDmode;
   poly_uint64 simdlen;
   auto_vec<poly_uint64> simdlens (2);
@@ -27558,6 +27568,11 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
       simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
       simdlens.safe_push (simdlen);
       simdlens.safe_push (simdlen * 2);
+      /* Only create a SVE simd clone if we aren't dealing with an unprototyped
+	 function.  */
+      if (DECL_ARGUMENTS (node->decl) != 0
+	  || type_arg_types != 0)
+	simdlens.safe_push (exact_div (poly_uint64 (128, 128), wds_elt_bits));
     }
   else
     simdlens.safe_push (clonei->simdlen);
@@ -27578,19 +27593,20 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
   while (j < count && !simdlens.is_empty ())
     {
       bool remove_simdlen = false;
-      for (auto elt : vec_elts)
-	if (known_gt (simdlens[j] * elt.second, 128U))
-	  {
-	    /* Don't issue a warning for every simdclone when there is no
-	       specific simdlen clause.  */
-	    if (explicit_p && known_ne (clonei->simdlen, 0U))
-	      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
-			  "GCC does not currently support simdlen %wd for "
-			  "type %qT",
-			  constant_lower_bound (simdlens[j]), elt.first);
-	    remove_simdlen = true;
-	    break;
-	  }
+      if (simdlens[j].is_constant ())
+	for (auto elt : vec_elts)
+	  if (known_gt (simdlens[j] * elt.second, 128U))
+	    {
+	      /* Don't issue a warning for every simdclone when there is no
+		 specific simdlen clause.  */
+	      if (explicit_p && known_ne (clonei->simdlen, 0U))
+		warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+			    "GCC does not currently support simdlen %wd for "
+			    "type %qT",
+			    constant_lower_bound (simdlens[j]), elt.first);
+	      remove_simdlen = true;
+	      break;
+	    }
       if (remove_simdlen)
 	{
 	  count--;
@@ -27618,9 +27634,36 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 
   gcc_assert (num < count);
   clonei->simdlen = simdlens[num];
+  if (clonei->simdlen.is_constant ())
+    clonei->vecsize_mangle = 'n';
+  else
+    {
+      clonei->vecsize_mangle = 's';
+      clonei->inbranch = 1;
+    }
   return count;
 }
 
+static tree
+simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
+{
+    unsigned int num_zr = 0;
+    unsigned int num_pr = 0;
+    type = TREE_TYPE (type);
+    type = build_vector_type (type, simdlen);
+    if (is_mask)
+      {
+	type = truth_type_for (type);
+	num_pr = 1;
+      }
+    else
+      num_zr = 1;
+
+    aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL,
+					 NULL);
+    return type;
+}
+
 /* Implement TARGET_SIMD_CLONE_ADJUST.  */
 
 static void
@@ -27632,6 +27675,69 @@ aarch64_simd_clone_adjust (struct cgraph_node *node)
   tree t = TREE_TYPE (node->decl);
   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
 					TYPE_ATTRIBUTES (t));
+
+  cl_target_option cur_target;
+  poly_uint16 old_sve_vg;
+  bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
+
+  if (node->simdclone->vecsize_mangle == 's')
+    {
+      tree target = build_string (strlen ("+sve"), "+sve");
+      aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0);
+      cl_target_option_save (&cur_target, &global_options, &global_options_set);
+      tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
+      cl_target_option_restore (&global_options, &global_options_set,
+				TREE_TARGET_OPTION (new_target));
+      aarch64_override_options_internal (&global_options);
+      memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
+	      sizeof (have_regs_of_mode));
+      for (int i = 0; i < NUM_MACHINE_MODES; ++i)
+	if (aarch64_sve_mode_p ((machine_mode) i))
+	  have_regs_of_mode[i] = true;
+      old_sve_vg = aarch64_sve_vg;
+      if (!node->simdclone->simdlen.is_constant ())
+	aarch64_sve_vg = poly_uint16 (2, 2);
+    }
+  cgraph_simd_clone *sc = node->simdclone;
+
+  for (unsigned i = 0; i < sc->nargs; ++i)
+    {
+      bool is_mask = false;
+      tree type;
+      switch (sc->args[i].arg_type)
+	{
+	case SIMD_CLONE_ARG_TYPE_MASK:
+	  is_mask = true;
+	  gcc_fallthrough ();
+	case SIMD_CLONE_ARG_TYPE_VECTOR:
+	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
+	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
+	  type = sc->args[i].vector_type;
+	  gcc_assert (VECTOR_TYPE_P (type));
+	  if (node->simdclone->vecsize_mangle == 's')
+	    type = simd_clone_adjust_sve_vector_type (type, is_mask,
+						      sc->simdlen);
+	  else if (is_mask)
+	    type = truth_type_for (type);
+	  sc->args[i].vector_type = type;
+	default:
+	    continue;
+	}
+    }
+  if (node->simdclone->vecsize_mangle == 's')
+    {
+      tree ret_type = TREE_TYPE (t);
+      if (VECTOR_TYPE_P (ret_type))
+	TREE_TYPE (t)
+	  = simd_clone_adjust_sve_vector_type (ret_type, false,
+					       node->simdclone->simdlen);
+      /* Restore current options and AARCH64_SVE_VG.  */
+      cl_target_option_restore (&global_options, &global_options_set, &cur_target);
+      aarch64_override_options_internal (&global_options);
+      memcpy (have_regs_of_mode, m_old_have_regs_of_mode,
+	      sizeof (have_regs_of_mode));
+      aarch64_sve_vg = old_sve_vg;
+    }
 }
 
 /* Implement TARGET_SIMD_CLONE_USABLE.  */
@@ -27645,6 +27751,10 @@ aarch64_simd_clone_usable (struct cgraph_node *node)
       if (!TARGET_SIMD)
 	return -1;
       return 0;
+    case 's':
+      if (!TARGET_SVE)
+	return -1;
+      return 0;
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index fb80888190c88e29895ecfbbe1b17d390c9a9dfe..150af24c5bc52b6737f3ca46ba73d1c890b143e9 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -541,9 +541,12 @@ simd_clone_mangle (struct cgraph_node *node,
   pp_string (&pp, "_ZGV");
   pp_character (&pp, vecsize_mangle);
   pp_character (&pp, mask);
-  /* For now, simdlen is always constant, while variable simdlen pp 'n'.  */
-  unsigned int len = simdlen.to_constant ();
-  pp_decimal_int (&pp, (len));
+
+  unsigned long long len = 0;
+  if (simdlen.is_constant (&len))
+    pp_decimal_int (&pp, (int) (len));
+  else
+    pp_character (&pp, 'x');
 
   for (n = 0; n < clone_info->nargs; ++n)
     {
@@ -1541,8 +1544,8 @@ simd_clone_adjust (struct cgraph_node *node)
 	 below).  */
       loop = alloc_loop ();
       cfun->has_force_vectorize_loops = true;
-      /* For now, simlen is always constant.  */
-      loop->safelen = node->simdclone->simdlen.to_constant ();
+      /* We can assert that safelen is the 'minimum' simdlen.  */
+      loop->safelen = constant_lower_bound (node->simdclone->simdlen);
       loop->force_vectorize = true;
       loop->header = body_bb;
     }
diff --git a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
index e3668893afe33a58c029cddd433d9bf43cce2bfa..12f8b3b839b7f3ff9e4f99768e59c0e1c5339062 100644
--- a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
+++ b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
@@ -21,7 +21,7 @@ test1 (int x)
      shall call f01 with score 8.  */
   /* { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } */
   /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } */
-  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } */
+  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 12 "optimized" { target { aarch64*-*-* } } } } */
   /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } */
   /* { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } */
   int a = f04 (x);
diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c
index aab8c17f0c442a7cda4dce23cc18162a0b7f676e..add6e7c93019834fbd5bed5ead18b52d4cdd0a37 100644
--- a/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c
@@ -4,28 +4,39 @@
 extern "C" {
 #endif
 #pragma omp declare simd
-int __attribute__ ((const)) f00 (int a , char b) /* { dg-warning {GCC does not currently support a simdclone with simdlens 8 and 16 for these types.} } */
+int __attribute__ ((const)) f00 (int a , char b)
 {
   return a + b;
 }
 
+/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f00} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvv_f00} } } */
+
 #pragma omp declare simd
-long long __attribute__ ((const)) f01 (int a , short b) /* { dg-warning {GCC does not currently support a simdclone with simdlens 4 and 8 for these types.} } */
+long long __attribute__ ((const)) f01 (int a , short b)
 {
   return a + b;
 }
+/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f01} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvv_f01} } } */
 
 #pragma omp declare simd linear(b)
-long long __attribute__ ((const)) f02 (short *b, int a) /* { dg-warning {GCC does not currently support a simdclone with simdlens 4 and 8 for these types.} } */
+long long __attribute__ ((const)) f02 (short *b, int a)
 {
   return a + *b;
 }
 
+/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f02} } } */
+/* { dg-final { scan-assembler {_ZGVsMxl2v_f02} } } */
+
 #pragma omp declare simd uniform(b)
-void f03 (char b, int a) /* { dg-warning {GCC does not currently support a simdclone with simdlens 8 and 16 for these types.} } */
+void f03 (char b, int a)
 {
 }
 
+/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f03} } } */
+/* { dg-final { scan-assembler {_ZGVsMxuv_f03} } } */
+
 #pragma omp declare simd simdlen(4)
 double f04 (void) /* { dg-warning {GCC does not currently support simdlen 4 for type 'double'} } */
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
index abb128ffc9cd2c1353b99eb38aae72377746e6d6..604869a30456e4db988bba86e059a27f19dda589 100644
--- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
@@ -10,6 +10,7 @@ short __attribute__ ((const)) f00 (short a , char b)
 }
 /* { dg-final { scan-assembler {_ZGVnN8vv_f00:} } } */
 /* { dg-final { scan-assembler {_ZGVnM8vv_f00:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvv_f00:} } } */
 
 #pragma omp declare simd notinbranch
 short __attribute__ ((const)) f01 (int a , short b)
@@ -17,6 +18,7 @@ short __attribute__ ((const)) f01 (int a , short b)
   return a + b;
 }
 /* { dg-final { scan-assembler {_ZGVnN4vv_f01:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvv_f01:} } } */
 
 #pragma omp declare simd linear(b) inbranch
 int __attribute__ ((const)) f02 (int a, short *b)
@@ -24,6 +26,7 @@ int __attribute__ ((const)) f02 (int a, short *b)
   return a + *b;
 }
 /* { dg-final { scan-assembler {_ZGVnM4vl2_f02:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvl2_f02:} } } */
 
 #pragma omp declare simd uniform(a) notinbranch
 void f03 (char b, int a)
@@ -31,6 +34,7 @@ void f03 (char b, int a)
 }
 /* { dg-final { scan-assembler {_ZGVnN8vu_f03:} } } */
 /* { dg-final { scan-assembler {_ZGVnN16vu_f03:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvu_f03:} } } */
 
 #pragma omp declare simd simdlen(2)
 float f04 (double a)
@@ -39,6 +43,7 @@ float f04 (double a)
 }
 /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */
 /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */
+/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */
 
 #pragma omp declare simd uniform(a) linear (b)
 void f05 (short a, short *b, short c)
@@ -50,6 +55,7 @@ void f05 (short a, short *b, short c)
 /* { dg-final { scan-assembler {_ZGVnN4ul2v_f05:} } } */
 /* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */
 /* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxul2v_f05:} } } */
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
index 6319df0558f37b95f1b2eb17374bdb4ecbc33295..38677b8f7a76b960ce9363b1c0cabf6fc5086ab6 100644
--- a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
+++ b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
@@ -41,7 +41,7 @@ contains
     ! shall call f01 with score 8.
     ! { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } }
     ! { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } }
-    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 6 "optimized" { target { aarch64*-*-* } } } }
+    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 8 "optimized" { target { aarch64*-*-* } } } }
     ! { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } }
     ! { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } }
     a = f04 (x)
diff --git a/libgomp/testsuite/libgomp.c/declare-variant-1.c b/libgomp/testsuite/libgomp.c/declare-variant-1.c
index 6129f23a0f80585246957022d63608dc3a68f1ff..591867bdc97f28bcb64fbe8bb9db39ccee5fe643 100644
--- a/libgomp/testsuite/libgomp.c/declare-variant-1.c
+++ b/libgomp/testsuite/libgomp.c/declare-variant-1.c
@@ -48,7 +48,7 @@ test1 (int x)
   /* { dg-final { scan-ltrans-tree-dump-not "f04 \\\(x" "optimized" } } */
   /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } } */
   /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 12 "optimized" { target { aarch64*-*-* } } } } } */
   /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { aarch64*-*-* } } } } } */
   int a = f04 (x);
   int b = f04 (x);
Richard Sandiford Nov. 29, 2023, 5:01 p.m. UTC | #2
"Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
> Rebased, no major changes, still needs review.
>
> On 30/08/2023 10:19, Andre Vieira (lists) via Gcc-patches wrote:
>> This patch finalizes adding support for the generation of SVE simd 
>> clones when no simdlen is provided, following the ABI rules where the 
>> widest data type determines the minimum amount of elements in a length 
>> agnostic vector.
>> 
>> gcc/ChangeLog:
>> 
>>          * config/aarch64/aarch64-protos.h (add_sve_type_attribute): 
>> Declare.
>>      * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): 
>> Make
>>      visibility global.
>>      * config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
>>      chosen over SIMD ABI if a SVE type is used in return or arguments.
>>      (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd 
>> clone
>>      when no simdlen is provided, according to ABI rules.
>>      (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
>>      (aarch64_simd_clone_adjust_ret_or_param): New.
>>      (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
>>      * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
>>      (simd_clone_adjust): Adapt safelen check to be compatible with VLA
>>      simdlen.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>      * c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
>>      * gfortran.dg/gomp/declare-variant-14.f90: Likewise.
>>      * gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
>>      longer necessary.
>>      * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..769d637f63724a7f0044f48f3dd683e0fb46049c 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -1005,6 +1005,8 @@ namespace aarch64_sve {
>  #ifdef GCC_TARGET_H
>    bool verify_type_context (location_t, type_context_kind, const_tree, bool);
>  #endif
> + void add_sve_type_attribute (tree, unsigned int, unsigned int,
> +			      const char *, const char *);
>  }
>  
>  extern void aarch64_split_combinev16qi (rtx operands[3]);
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index 161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -569,14 +569,16 @@ static bool reported_missing_registers_p;
>  /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
>     and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
>     mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
> -static void
> +void
>  add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
>  			const char *mangled_name, const char *acle_name)
>  {
>    tree mangled_name_tree
>      = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
> +  tree acle_name_tree
> +    = (acle_name ? get_identifier (acle_name) : NULL_TREE);
>  
> -  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
> +  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
>    value = tree_cons (NULL_TREE, mangled_name_tree, value);
>    value = tree_cons (NULL_TREE, size_int (num_pr), value);
>    value = tree_cons (NULL_TREE, size_int (num_zr), value);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 37507f091c2a6154fa944c3a9fad6a655ab5d5a1..cb0947b18c6a611d55579b5b08d93f6a4a9c3b2c 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -4080,13 +4080,13 @@ aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
>  static const predefined_function_abi &
>  aarch64_fntype_abi (const_tree fntype)
>  {
> -  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
> -    return aarch64_simd_abi ();
> -
>    if (aarch64_returns_value_in_sve_regs_p (fntype)
>        || aarch64_takes_arguments_in_sve_regs_p (fntype))
>      return aarch64_sve_abi ();
>  
> +  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
> +    return aarch64_simd_abi ();
> +
>    return default_function_abi;
>  }
>  

I think we discussed this off-list later, but the change above shouldn't
be necessary.  aarch64_vector_pcs must not be attached to SVE PCS functions,
so the two cases should be mutually exclusive.

> @@ -27467,7 +27467,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>  					int num, bool explicit_p)
>  {
>    tree t, ret_type;
> -  unsigned int nds_elt_bits;
> +  unsigned int nds_elt_bits, wds_elt_bits;
>    int count;
>    unsigned HOST_WIDE_INT const_simdlen;
>  
> @@ -27513,10 +27513,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>    if (TREE_CODE (ret_type) != VOID_TYPE)
>      {
>        nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
> +      wds_elt_bits = nds_elt_bits;
>        vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
>      }
>    else
> -    nds_elt_bits = POINTER_SIZE;
> +    {
> +      nds_elt_bits = POINTER_SIZE;
> +      wds_elt_bits = 0;
> +    }
>  
>    int i;
>    tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
> @@ -27524,30 +27528,36 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>    for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
>         t && t != void_list_node; t = TREE_CHAIN (t), i++)
>      {
> -      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
> +      tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);

Not sure renaming arg_type is worth it.  The original was probably
more descriptive.

>        if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
> -	  && !supported_simd_type (arg_type))
> +	  && !supported_simd_type (type))
>  	{
>  	  if (!explicit_p)
>  	    ;
> -	  else if (COMPLEX_FLOAT_TYPE_P (ret_type))
> +	  else if (COMPLEX_FLOAT_TYPE_P (type))
>  	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>  			"GCC does not currently support argument type %qT "
> -			"for simd", arg_type);
> +			"for simd", type);
>  	  else
>  	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>  			"unsupported argument type %qT for simd",
> -			arg_type);
> +			type);
>  	  return 0;
>  	}
> -      unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
> +      unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
>        if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
> -	vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
> +	vec_elts.safe_push (std::make_pair (type, lane_bits));
>        if (nds_elt_bits > lane_bits)
>  	nds_elt_bits = lane_bits;
> +      else if (wds_elt_bits < lane_bits)
> +	wds_elt_bits = lane_bits;
>      }
>  
> -  clonei->vecsize_mangle = 'n';
> +  /* If we could not determine the WDS type from available parameters/return,
> +     then fallback to using uintptr_t.  */
> +  if (wds_elt_bits == 0)
> +    wds_elt_bits = POINTER_SIZE;
> +
>    clonei->mask_mode = VOIDmode;
>    poly_uint64 simdlen;
>    auto_vec<poly_uint64> simdlens (2);
> @@ -27558,6 +27568,11 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>        simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
>        simdlens.safe_push (simdlen);
>        simdlens.safe_push (simdlen * 2);
> +      /* Only create a SVE simd clone if we aren't dealing with an unprototyped
> +	 function.  */
> +      if (DECL_ARGUMENTS (node->decl) != 0
> +	  || type_arg_types != 0)
> +	simdlens.safe_push (exact_div (poly_uint64 (128, 128), wds_elt_bits));

This check feels a bit indirect.  Does it work to use:

  if (prototype_p (TREE_TYPE (node->decl)))

instead?

>      }
>    else
>      simdlens.safe_push (clonei->simdlen);
> @@ -27578,19 +27593,20 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>    while (j < count && !simdlens.is_empty ())
>      {
>        bool remove_simdlen = false;
> -      for (auto elt : vec_elts)
> -	if (known_gt (simdlens[j] * elt.second, 128U))
> -	  {
> -	    /* Don't issue a warning for every simdclone when there is no
> -	       specific simdlen clause.  */
> -	    if (explicit_p && known_ne (clonei->simdlen, 0U))
> -	      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
> -			  "GCC does not currently support simdlen %wd for "
> -			  "type %qT",
> -			  constant_lower_bound (simdlens[j]), elt.first);
> -	    remove_simdlen = true;
> -	    break;
> -	  }
> +      if (simdlens[j].is_constant ())
> +	for (auto elt : vec_elts)
> +	  if (known_gt (simdlens[j] * elt.second, 128U))
> +	    {
> +	      /* Don't issue a warning for every simdclone when there is no
> +		 specific simdlen clause.  */
> +	      if (explicit_p && known_ne (clonei->simdlen, 0U))
> +		warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
> +			    "GCC does not currently support simdlen %wd for "
> +			    "type %qT",
> +			    constant_lower_bound (simdlens[j]), elt.first);
> +	      remove_simdlen = true;
> +	      break;
> +	    }
>        if (remove_simdlen)
>  	{
>  	  count--;
> @@ -27618,9 +27634,36 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>  
>    gcc_assert (num < count);
>    clonei->simdlen = simdlens[num];
> +  if (clonei->simdlen.is_constant ())
> +    clonei->vecsize_mangle = 'n';
> +  else
> +    {
> +      clonei->vecsize_mangle = 's';
> +      clonei->inbranch = 1;
> +    }
>    return count;
>  }
>  
> +static tree
> +simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
> +{
> +    unsigned int num_zr = 0;

Nits: missing function comment.  The body is indented by too many columns.

> +    unsigned int num_pr = 0;
> +    type = TREE_TYPE (type);
> +    type = build_vector_type (type, simdlen);

Is simdlen ever different from the original TYPE_VECTOR_SUBPARTS?
I think a comment is needed if so.

> +    if (is_mask)
> +      {
> +	type = truth_type_for (type);
> +	num_pr = 1;
> +      }
> +    else
> +      num_zr = 1;
> +
> +    aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL,
> +					 NULL);

Before adding the atttribute, I think we should call:

  type = build_distinct_type_copy (type);

so that we don't change a type that is already in use, or associate
any new types with this one.  If simdlen is always equal to the original
TYPE_VECTOR_SUBPARTS then the build_distinct_type_copy would replace the
build_vector_type.

> +    return type;
> +}
> +
>  /* Implement TARGET_SIMD_CLONE_ADJUST.  */
>  
>  static void
> @@ -27632,6 +27675,69 @@ aarch64_simd_clone_adjust (struct cgraph_node *node)
>    tree t = TREE_TYPE (node->decl);
>    TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
>  					TYPE_ATTRIBUTES (t));
> +
> +  cl_target_option cur_target;
> +  poly_uint16 old_sve_vg;
> +  bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
> +
> +  if (node->simdclone->vecsize_mangle == 's')
> +    {
> +      tree target = build_string (strlen ("+sve"), "+sve");
> +      aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0);

It would be good to assert that this succeeds.  It's unfortunate that we
have a predicate with side-effects, but that's obviously not your fault. :)

> +      cl_target_option_save (&cur_target, &global_options, &global_options_set);
> +      tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
> +      cl_target_option_restore (&global_options, &global_options_set,
> +				TREE_TARGET_OPTION (new_target));
> +      aarch64_override_options_internal (&global_options);

Does this preserve any existing target attributes too, to the extent
possible?  E.g. if the function has:

  #pragma GCC target "+sve2"

then I think we should honour that rather than dial down to "+sve".
Same if the code is compiled with -march=armv9-a+sve2: we should
compile the clone as SVE2 rather than SVE.

> +      memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
> +	      sizeof (have_regs_of_mode));
> +      for (int i = 0; i < NUM_MACHINE_MODES; ++i)
> +	if (aarch64_sve_mode_p ((machine_mode) i))
> +	  have_regs_of_mode[i] = true;

Would it be possible to use push_cfun and pop_cfun instead, so that
we do a proper target switch?

(The SVE ACLE code uses a similar technique to the above, but that's
because it's run in a non-function context.)

> +      old_sve_vg = aarch64_sve_vg;
> +      if (!node->simdclone->simdlen.is_constant ())
> +	aarch64_sve_vg = poly_uint16 (2, 2);

I'm not sure we should change VG here.

The basis for that and for allowing SVE2 above is that the ODR requires
that all comdat instances of a function are compiled in the same way.
That applies to all functions, not just simd clones.  So IMO it's user
error if a clone is compiled multiple times with different target options,
or if it's compiled with target options that the runtime target doesn't in
fact support.

We're already implicitly assuming the same thing for Advanced SIMD,
since we'll use whatever post-Armv8-A features happen to be enabled.

> +    }
> +  cgraph_simd_clone *sc = node->simdclone;
> +
> +  for (unsigned i = 0; i < sc->nargs; ++i)
> +    {
> +      bool is_mask = false;
> +      tree type;
> +      switch (sc->args[i].arg_type)
> +	{
> +	case SIMD_CLONE_ARG_TYPE_MASK:
> +	  is_mask = true;
> +	  gcc_fallthrough ();
> +	case SIMD_CLONE_ARG_TYPE_VECTOR:
> +	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
> +	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
> +	  type = sc->args[i].vector_type;
> +	  gcc_assert (VECTOR_TYPE_P (type));
> +	  if (node->simdclone->vecsize_mangle == 's')
> +	    type = simd_clone_adjust_sve_vector_type (type, is_mask,
> +						      sc->simdlen);
> +	  else if (is_mask)
> +	    type = truth_type_for (type);
> +	  sc->args[i].vector_type = type;
> +	default:
> +	    continue;
> +	}
> +    }
> +  if (node->simdclone->vecsize_mangle == 's')
> +    {
> +      tree ret_type = TREE_TYPE (t);
> +      if (VECTOR_TYPE_P (ret_type))
> +	TREE_TYPE (t)
> +	  = simd_clone_adjust_sve_vector_type (ret_type, false,
> +					       node->simdclone->simdlen);
> +      /* Restore current options and AARCH64_SVE_VG.  */
> +      cl_target_option_restore (&global_options, &global_options_set, &cur_target);
> +      aarch64_override_options_internal (&global_options);
> +      memcpy (have_regs_of_mode, m_old_have_regs_of_mode,
> +	      sizeof (have_regs_of_mode));
> +      aarch64_sve_vg = old_sve_vg;
> +    }
>  }
>  
>  /* Implement TARGET_SIMD_CLONE_USABLE.  */
> @@ -27645,6 +27751,10 @@ aarch64_simd_clone_usable (struct cgraph_node *node)
>        if (!TARGET_SIMD)
>  	return -1;
>        return 0;
> +    case 's':
> +      if (!TARGET_SVE)
> +	return -1;
> +      return 0;
>      default:
>        gcc_unreachable ();
>      }
> diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
> index fb80888190c88e29895ecfbbe1b17d390c9a9dfe..150af24c5bc52b6737f3ca46ba73d1c890b143e9 100644
> --- a/gcc/omp-simd-clone.cc
> +++ b/gcc/omp-simd-clone.cc
> @@ -541,9 +541,12 @@ simd_clone_mangle (struct cgraph_node *node,
>    pp_string (&pp, "_ZGV");
>    pp_character (&pp, vecsize_mangle);
>    pp_character (&pp, mask);
> -  /* For now, simdlen is always constant, while variable simdlen pp 'n'.  */
> -  unsigned int len = simdlen.to_constant ();
> -  pp_decimal_int (&pp, (len));
> +
> +  unsigned long long len = 0;
> +  if (simdlen.is_constant (&len))
> +    pp_decimal_int (&pp, (int) (len));
> +  else
> +    pp_character (&pp, 'x');

FWIW, I agree this is the right approach for now.  We can put the 'x' behind
a target hook later if another VLA target uses a different convention.

Thanks,
Richard

>    for (n = 0; n < clone_info->nargs; ++n)
>      {
> @@ -1541,8 +1544,8 @@ simd_clone_adjust (struct cgraph_node *node)
>  	 below).  */
>        loop = alloc_loop ();
>        cfun->has_force_vectorize_loops = true;
> -      /* For now, simlen is always constant.  */
> -      loop->safelen = node->simdclone->simdlen.to_constant ();
> +      /* We can assert that safelen is the 'minimum' simdlen.  */
> +      loop->safelen = constant_lower_bound (node->simdclone->simdlen);
>        loop->force_vectorize = true;
>        loop->header = body_bb;
>      }
> diff --git a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
> index e3668893afe33a58c029cddd433d9bf43cce2bfa..12f8b3b839b7f3ff9e4f99768e59c0e1c5339062 100644
> --- a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
> +++ b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
> @@ -21,7 +21,7 @@ test1 (int x)
>       shall call f01 with score 8.  */
>    /* { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } */
>    /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } */
> -  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } */
> +  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 12 "optimized" { target { aarch64*-*-* } } } } */
>    /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } */
>    /* { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } */
>    int a = f04 (x);
> diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c
> index aab8c17f0c442a7cda4dce23cc18162a0b7f676e..add6e7c93019834fbd5bed5ead18b52d4cdd0a37 100644
> --- a/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c
> @@ -4,28 +4,39 @@
>  extern "C" {
>  #endif
>  #pragma omp declare simd
> -int __attribute__ ((const)) f00 (int a , char b) /* { dg-warning {GCC does not currently support a simdclone with simdlens 8 and 16 for these types.} } */
> +int __attribute__ ((const)) f00 (int a , char b)
>  {
>    return a + b;
>  }
>  
> +/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f00} } } */
> +/* { dg-final { scan-assembler {_ZGVsMxvv_f00} } } */
> +
>  #pragma omp declare simd
> -long long __attribute__ ((const)) f01 (int a , short b) /* { dg-warning {GCC does not currently support a simdclone with simdlens 4 and 8 for these types.} } */
> +long long __attribute__ ((const)) f01 (int a , short b)
>  {
>    return a + b;
>  }
> +/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f01} } } */
> +/* { dg-final { scan-assembler {_ZGVsMxvv_f01} } } */
>  
>  #pragma omp declare simd linear(b)
> -long long __attribute__ ((const)) f02 (short *b, int a) /* { dg-warning {GCC does not currently support a simdclone with simdlens 4 and 8 for these types.} } */
> +long long __attribute__ ((const)) f02 (short *b, int a)
>  {
>    return a + *b;
>  }
>  
> +/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f02} } } */
> +/* { dg-final { scan-assembler {_ZGVsMxl2v_f02} } } */
> +
>  #pragma omp declare simd uniform(b)
> -void f03 (char b, int a) /* { dg-warning {GCC does not currently support a simdclone with simdlens 8 and 16 for these types.} } */
> +void f03 (char b, int a)
>  {
>  }
>  
> +/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f03} } } */
> +/* { dg-final { scan-assembler {_ZGVsMxuv_f03} } } */
> +
>  #pragma omp declare simd simdlen(4)
>  double f04 (void) /* { dg-warning {GCC does not currently support simdlen 4 for type 'double'} } */
>  {
> diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
> index abb128ffc9cd2c1353b99eb38aae72377746e6d6..604869a30456e4db988bba86e059a27f19dda589 100644
> --- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
> @@ -10,6 +10,7 @@ short __attribute__ ((const)) f00 (short a , char b)
>  }
>  /* { dg-final { scan-assembler {_ZGVnN8vv_f00:} } } */
>  /* { dg-final { scan-assembler {_ZGVnM8vv_f00:} } } */
> +/* { dg-final { scan-assembler {_ZGVsMxvv_f00:} } } */
>  
>  #pragma omp declare simd notinbranch
>  short __attribute__ ((const)) f01 (int a , short b)
> @@ -17,6 +18,7 @@ short __attribute__ ((const)) f01 (int a , short b)
>    return a + b;
>  }
>  /* { dg-final { scan-assembler {_ZGVnN4vv_f01:} } } */
> +/* { dg-final { scan-assembler {_ZGVsMxvv_f01:} } } */
>  
>  #pragma omp declare simd linear(b) inbranch
>  int __attribute__ ((const)) f02 (int a, short *b)
> @@ -24,6 +26,7 @@ int __attribute__ ((const)) f02 (int a, short *b)
>    return a + *b;
>  }
>  /* { dg-final { scan-assembler {_ZGVnM4vl2_f02:} } } */
> +/* { dg-final { scan-assembler {_ZGVsMxvl2_f02:} } } */
>  
>  #pragma omp declare simd uniform(a) notinbranch
>  void f03 (char b, int a)
> @@ -31,6 +34,7 @@ void f03 (char b, int a)
>  }
>  /* { dg-final { scan-assembler {_ZGVnN8vu_f03:} } } */
>  /* { dg-final { scan-assembler {_ZGVnN16vu_f03:} } } */
> +/* { dg-final { scan-assembler {_ZGVsMxvu_f03:} } } */
>  
>  #pragma omp declare simd simdlen(2)
>  float f04 (double a)
> @@ -39,6 +43,7 @@ float f04 (double a)
>  }
>  /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */
>  /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */
> +/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */
>  
>  #pragma omp declare simd uniform(a) linear (b)
>  void f05 (short a, short *b, short c)
> @@ -50,6 +55,7 @@ void f05 (short a, short *b, short c)
>  /* { dg-final { scan-assembler {_ZGVnN4ul2v_f05:} } } */
>  /* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */
>  /* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */
> +/* { dg-final { scan-assembler {_ZGVsMxul2v_f05:} } } */
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
> index 6319df0558f37b95f1b2eb17374bdb4ecbc33295..38677b8f7a76b960ce9363b1c0cabf6fc5086ab6 100644
> --- a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
> +++ b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
> @@ -41,7 +41,7 @@ contains
>      ! shall call f01 with score 8.
>      ! { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } }
>      ! { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } }
> -    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 6 "optimized" { target { aarch64*-*-* } } } }
> +    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 8 "optimized" { target { aarch64*-*-* } } } }
>      ! { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } }
>      ! { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } }
>      a = f04 (x)
> diff --git a/libgomp/testsuite/libgomp.c/declare-variant-1.c b/libgomp/testsuite/libgomp.c/declare-variant-1.c
> index 6129f23a0f80585246957022d63608dc3a68f1ff..591867bdc97f28bcb64fbe8bb9db39ccee5fe643 100644
> --- a/libgomp/testsuite/libgomp.c/declare-variant-1.c
> +++ b/libgomp/testsuite/libgomp.c/declare-variant-1.c
> @@ -48,7 +48,7 @@ test1 (int x)
>    /* { dg-final { scan-ltrans-tree-dump-not "f04 \\\(x" "optimized" } } */
>    /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } } */
>    /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } } */
> -  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } } */
> +  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 12 "optimized" { target { aarch64*-*-* } } } } } */
>    /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { aarch64*-*-* } } } } } */
>    int a = f04 (x);
>    int b = f04 (x);
Andre Vieira (lists) Dec. 1, 2023, 4:39 p.m. UTC | #3
On 29/11/2023 17:01, Richard Sandiford wrote:
> "Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
>> Rebased, no major changes, still needs review.
>>
>> On 30/08/2023 10:19, Andre Vieira (lists) via Gcc-patches wrote:
>>> This patch finalizes adding support for the generation of SVE simd
>>> clones when no simdlen is provided, following the ABI rules where the
>>> widest data type determines the minimum amount of elements in a length
>>> agnostic vector.
>>>
>>> gcc/ChangeLog:
>>>
>>>           * config/aarch64/aarch64-protos.h (add_sve_type_attribute):
>>> Declare.
>>>       * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute):
>>> Make
>>>       visibility global.
>>>       * config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
>>>       chosen over SIMD ABI if a SVE type is used in return or arguments.
>>>       (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd
>>> clone
>>>       when no simdlen is provided, according to ABI rules.
>>>       (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
>>>       (aarch64_simd_clone_adjust_ret_or_param): New.
>>>       (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
>>>       * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
>>>       (simd_clone_adjust): Adapt safelen check to be compatible with VLA
>>>       simdlen.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>>       * c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
>>>       * gfortran.dg/gomp/declare-variant-14.f90: Likewise.
>>>       * gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
>>>       longer necessary.
>>>       * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.
>>
>> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
>> index 60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..769d637f63724a7f0044f48f3dd683e0fb46049c 100644
>> --- a/gcc/config/aarch64/aarch64-protos.h
>> +++ b/gcc/config/aarch64/aarch64-protos.h
>> @@ -1005,6 +1005,8 @@ namespace aarch64_sve {
>>   #ifdef GCC_TARGET_H
>>     bool verify_type_context (location_t, type_context_kind, const_tree, bool);
>>   #endif
>> + void add_sve_type_attribute (tree, unsigned int, unsigned int,
>> +			      const char *, const char *);
>>   }
>>   
>>   extern void aarch64_split_combinev16qi (rtx operands[3]);
>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
>> index 161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c 100644
>> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
>> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
>> @@ -569,14 +569,16 @@ static bool reported_missing_registers_p;
>>   /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
>>      and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
>>      mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
>> -static void
>> +void
>>   add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
>>   			const char *mangled_name, const char *acle_name)
>>   {
>>     tree mangled_name_tree
>>       = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
>> +  tree acle_name_tree
>> +    = (acle_name ? get_identifier (acle_name) : NULL_TREE);
>>   
>> -  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
>> +  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
>>     value = tree_cons (NULL_TREE, mangled_name_tree, value);
>>     value = tree_cons (NULL_TREE, size_int (num_pr), value);
>>     value = tree_cons (NULL_TREE, size_int (num_zr), value);
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index 37507f091c2a6154fa944c3a9fad6a655ab5d5a1..cb0947b18c6a611d55579b5b08d93f6a4a9c3b2c 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -4080,13 +4080,13 @@ aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
>>   static const predefined_function_abi &
>>   aarch64_fntype_abi (const_tree fntype)
>>   {
>> -  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
>> -    return aarch64_simd_abi ();
>> -
>>     if (aarch64_returns_value_in_sve_regs_p (fntype)
>>         || aarch64_takes_arguments_in_sve_regs_p (fntype))
>>       return aarch64_sve_abi ();
>>   
>> +  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
>> +    return aarch64_simd_abi ();
>> +
>>     return default_function_abi;
>>   }
>>   
> 
> I think we discussed this off-list later, but the change above shouldn't
> be necessary.  aarch64_vector_pcs must not be attached to SVE PCS functions,
> so the two cases should be mutually exclusive.

Yeah I had made the changes locally, but not updated the patch yet.
> 
>> @@ -27467,7 +27467,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>   					int num, bool explicit_p)
>>   {
>>     tree t, ret_type;
>> -  unsigned int nds_elt_bits;
>> +  unsigned int nds_elt_bits, wds_elt_bits;
>>     int count;
>>     unsigned HOST_WIDE_INT const_simdlen;
>>   
>> @@ -27513,10 +27513,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>     if (TREE_CODE (ret_type) != VOID_TYPE)
>>       {
>>         nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
>> +      wds_elt_bits = nds_elt_bits;
>>         vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
>>       }
>>     else
>> -    nds_elt_bits = POINTER_SIZE;
>> +    {
>> +      nds_elt_bits = POINTER_SIZE;
>> +      wds_elt_bits = 0;
>> +    }
>>   
>>     int i;
>>     tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
>> @@ -27524,30 +27528,36 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>     for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
>>          t && t != void_list_node; t = TREE_CHAIN (t), i++)
>>       {
>> -      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
>> +      tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
> 
> Not sure renaming arg_type is worth it.  The original was probably
> more descriptive.
> 
>>         if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
>> -	  && !supported_simd_type (arg_type))
>> +	  && !supported_simd_type (type))
>>   	{
>>   	  if (!explicit_p)
>>   	    ;
>> -	  else if (COMPLEX_FLOAT_TYPE_P (ret_type))
>> +	  else if (COMPLEX_FLOAT_TYPE_P (type))
>>   	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>>   			"GCC does not currently support argument type %qT "
>> -			"for simd", arg_type);
>> +			"for simd", type);
>>   	  else
>>   	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>>   			"unsupported argument type %qT for simd",
>> -			arg_type);
>> +			type);
>>   	  return 0;
>>   	}
>> -      unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
>> +      unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
>>         if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
>> -	vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
>> +	vec_elts.safe_push (std::make_pair (type, lane_bits));
>>         if (nds_elt_bits > lane_bits)
>>   	nds_elt_bits = lane_bits;
>> +      else if (wds_elt_bits < lane_bits)
>> +	wds_elt_bits = lane_bits;
>>       }
>>   
>> -  clonei->vecsize_mangle = 'n';
>> +  /* If we could not determine the WDS type from available parameters/return,
>> +     then fallback to using uintptr_t.  */
>> +  if (wds_elt_bits == 0)
>> +    wds_elt_bits = POINTER_SIZE;
>> +
>>     clonei->mask_mode = VOIDmode;
>>     poly_uint64 simdlen;
>>     auto_vec<poly_uint64> simdlens (2);
>> @@ -27558,6 +27568,11 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>         simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
>>         simdlens.safe_push (simdlen);
>>         simdlens.safe_push (simdlen * 2);
>> +      /* Only create a SVE simd clone if we aren't dealing with an unprototyped
>> +	 function.  */
>> +      if (DECL_ARGUMENTS (node->decl) != 0
>> +	  || type_arg_types != 0)
>> +	simdlens.safe_push (exact_div (poly_uint64 (128, 128), wds_elt_bits));
> 
> This check feels a bit indirect.  Does it work to use:
> 
>    if (prototype_p (TREE_TYPE (node->decl)))
> 
> instead?
> 
>>       }
>>     else
>>       simdlens.safe_push (clonei->simdlen);
>> @@ -27578,19 +27593,20 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>     while (j < count && !simdlens.is_empty ())
>>       {
>>         bool remove_simdlen = false;
>> -      for (auto elt : vec_elts)
>> -	if (known_gt (simdlens[j] * elt.second, 128U))
>> -	  {
>> -	    /* Don't issue a warning for every simdclone when there is no
>> -	       specific simdlen clause.  */
>> -	    if (explicit_p && known_ne (clonei->simdlen, 0U))
>> -	      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>> -			  "GCC does not currently support simdlen %wd for "
>> -			  "type %qT",
>> -			  constant_lower_bound (simdlens[j]), elt.first);
>> -	    remove_simdlen = true;
>> -	    break;
>> -	  }
>> +      if (simdlens[j].is_constant ())
>> +	for (auto elt : vec_elts)
>> +	  if (known_gt (simdlens[j] * elt.second, 128U))
>> +	    {
>> +	      /* Don't issue a warning for every simdclone when there is no
>> +		 specific simdlen clause.  */
>> +	      if (explicit_p && known_ne (clonei->simdlen, 0U))
>> +		warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>> +			    "GCC does not currently support simdlen %wd for "
>> +			    "type %qT",
>> +			    constant_lower_bound (simdlens[j]), elt.first);
>> +	      remove_simdlen = true;
>> +	      break;
>> +	    }
>>         if (remove_simdlen)
>>   	{
>>   	  count--;
>> @@ -27618,9 +27634,36 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>   
>>     gcc_assert (num < count);
>>     clonei->simdlen = simdlens[num];
>> +  if (clonei->simdlen.is_constant ())
>> +    clonei->vecsize_mangle = 'n';
>> +  else
>> +    {
>> +      clonei->vecsize_mangle = 's';
>> +      clonei->inbranch = 1;
>> +    }
>>     return count;
>>   }
>>   
>> +static tree
>> +simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
>> +{
>> +    unsigned int num_zr = 0;
> 
> Nits: missing function comment.  The body is indented by too many columns.
> 
>> +    unsigned int num_pr = 0;
>> +    type = TREE_TYPE (type);
>> +    type = build_vector_type (type, simdlen);
> 
> Is simdlen ever different from the original TYPE_VECTOR_SUBPARTS?
> I think a comment is needed if so.

Not right now, but I'm not sure why you are asking. So the reason why I 
say not right now is because we don't support multi-{register,argument} 
vector mappings, so as soon as simdlen means it wouldn't fit in a single 
register we reject, that's for both Advanced SIMD and SVE. If we would 
want to, then simdlen could be larger than type's TYPE_VECTOR_SUBPARTS.


>> +
>>   /* Implement TARGET_SIMD_CLONE_ADJUST.  */
>>   
>>   static void
>> @@ -27632,6 +27675,69 @@ aarch64_simd_clone_adjust (struct cgraph_node *node)
>>     tree t = TREE_TYPE (node->decl);
>>     TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
>>   					TYPE_ATTRIBUTES (t));
>> +
>> +  cl_target_option cur_target;
>> +  poly_uint16 old_sve_vg;
>> +  bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
>> +
>> +  if (node->simdclone->vecsize_mangle == 's')
>> +    {
>> +      tree target = build_string (strlen ("+sve"), "+sve");
>> +      aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0);
> 
> It would be good to assert that this succeeds.  It's unfortunate that we
> have a predicate with side-effects, but that's obviously not your fault. :)
Scared to think of how it wouldn't...
> 
>> +      cl_target_option_save (&cur_target, &global_options, &global_options_set);
>> +      tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
>> +      cl_target_option_restore (&global_options, &global_options_set,
>> +				TREE_TARGET_OPTION (new_target));
>> +      aarch64_override_options_internal (&global_options);
> 
> Does this preserve any existing target attributes too, to the extent
> possible?  E.g. if the function has:
> 
>    #pragma GCC target "+sve2"
> 
> then I think we should honour that rather than dial down to "+sve".
> Same if the code is compiled with -march=armv9-a+sve2: we should
> compile the clone as SVE2 rather than SVE.

Yes it adds to it, so for sve2 this actually has no effect, because 
obviously sve is already enabled.
> 
>> +      memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
>> +	      sizeof (have_regs_of_mode));
>> +      for (int i = 0; i < NUM_MACHINE_MODES; ++i)
>> +	if (aarch64_sve_mode_p ((machine_mode) i))
>> +	  have_regs_of_mode[i] = true;
> 
> Would it be possible to use push_cfun and pop_cfun instead, so that
> we do a proper target switch?

Where would I get the new cfun from? I'll go have a bit of a look around 
see if I can make snse of what this would do.
> 
> (The SVE ACLE code uses a similar technique to the above, but that's
> because it's run in a non-function context.)
> 
>> +      old_sve_vg = aarch64_sve_vg;
>> +      if (!node->simdclone->simdlen.is_constant ())
>> +	aarch64_sve_vg = poly_uint16 (2, 2);
> 
> I'm not sure we should change VG here.
Agree. I forgot about the decision to accept that -msve-vector-bits 
would influence the VG of simdclones. I'll also make the changes to the 
code to compute simdlen.

> 
> The basis for that and for allowing SVE2 above is that the ODR requires
> that all comdat instances of a function are compiled in the same way.
> That applies to all functions, not just simd clones.  So IMO it's user
> error if a clone is compiled multiple times with different target options,
> or if it's compiled with target options that the runtime target doesn't in
> fact support.
> 
> We're already implicitly assuming the same thing for Advanced SIMD,
> since we'll use whatever post-Armv8-A features happen to be enabled.
> 
Agree.
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 70303d6fd953e0c397b9138ede8858c2db2e53db..d7888c95a4999fad1a4c55d5cd2287c2040302c8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1001,6 +1001,8 @@  namespace aarch64_sve {
 #ifdef GCC_TARGET_H
   bool verify_type_context (location_t, type_context_kind, const_tree, bool);
 #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+			      const char *, const char *);
 }
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -569,14 +569,16 @@  static bool reported_missing_registers_p;
 /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
    and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
    mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
-static void
+void
 add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
 			const char *mangled_name, const char *acle_name)
 {
   tree mangled_name_tree
     = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+    = (acle_name ? get_identifier (acle_name) : NULL_TREE);
 
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
   value = tree_cons (NULL_TREE, mangled_name_tree, value);
   value = tree_cons (NULL_TREE, size_int (num_pr), value);
   value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a13d3fba05f9f9d2989b36c681bc77d71e943e0d..492acb9ce081866162faa8dfca777e4cb943797f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4034,13 +4034,13 @@  aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
 static const predefined_function_abi &
 aarch64_fntype_abi (const_tree fntype)
 {
-  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
-    return aarch64_simd_abi ();
-
   if (aarch64_returns_value_in_sve_regs_p (fntype)
       || aarch64_takes_arguments_in_sve_regs_p (fntype))
     return aarch64_sve_abi ();
 
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
+    return aarch64_simd_abi ();
+
   return default_function_abi;
 }
 
@@ -27327,7 +27327,7 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 					int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
   int count;
   unsigned HOST_WIDE_INT const_simdlen;
   poly_uint64 vec_bits;
@@ -27374,10 +27374,14 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
   if (TREE_CODE (ret_type) != VOID_TYPE)
     {
       nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
+      wds_elt_bits = nds_elt_bits;
       vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
     }
   else
-    nds_elt_bits = POINTER_SIZE;
+    {
+      nds_elt_bits = POINTER_SIZE;
+      wds_elt_bits = 0;
+    }
 
   int i;
   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
@@ -27385,30 +27389,36 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
   for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
        t && t != void_list_node; t = TREE_CHAIN (t), i++)
     {
-      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
+      tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
       if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
-	  && !supported_simd_type (arg_type))
+	  && !supported_simd_type (type))
 	{
 	  if (!explicit_p)
 	    ;
-	  else if (COMPLEX_FLOAT_TYPE_P (ret_type))
+	  else if (COMPLEX_FLOAT_TYPE_P (type))
 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
 			"GCC does not currently support argument type %qT "
-			"for simd", arg_type);
+			"for simd", type);
 	  else
 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
 			"unsupported argument type %qT for simd",
-			arg_type);
+			type);
 	  return 0;
 	}
-      unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
+      unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
       if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
-	vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
+	vec_elts.safe_push (std::make_pair (type, lane_bits));
       if (nds_elt_bits > lane_bits)
 	nds_elt_bits = lane_bits;
+      else if (wds_elt_bits < lane_bits)
+	wds_elt_bits = lane_bits;
     }
 
-  clonei->vecsize_mangle = 'n';
+  /* If we could not determine the WDS type from available parameters/return,
+     then fallback to using uintptr_t.  */
+  if (wds_elt_bits == 0)
+    wds_elt_bits = POINTER_SIZE;
+
   clonei->mask_mode = VOIDmode;
   poly_uint64 simdlen;
   auto_vec<poly_uint64> simdlens (2);
@@ -27419,6 +27429,11 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
       simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
       simdlens.safe_push (simdlen);
       simdlens.safe_push (simdlen * 2);
+      /* Only create a SVE simd clone if we aren't dealing with an unprototyped
+	 function.  */
+      if (DECL_ARGUMENTS (node->decl) != 0
+	  || type_arg_types != 0)
+	simdlens.safe_push (exact_div (poly_uint64 (128, 128), wds_elt_bits));
     }
   else
     simdlens.safe_push (clonei->simdlen);
@@ -27439,19 +27454,20 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
   while (j < count && !simdlens.is_empty ())
     {
       bool remove_simdlen = false;
-      for (auto elt : vec_elts)
-	if (known_gt (simdlens[j] * elt.second, 128U))
-	  {
-	    /* Don't issue a warning for every simdclone when there is no
-	       specific simdlen clause.  */
-	    if (explicit_p && known_ne (clonei->simdlen, 0U))
-	      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
-			  "GCC does not currently support simdlen %wd for "
-			  "type %qT",
-			  constant_lower_bound (simdlens[j]), elt.first);
-	    remove_simdlen = true;
-	    break;
-	  }
+      if (simdlens[j].is_constant ())
+	for (auto elt : vec_elts)
+	  if (known_gt (simdlens[j] * elt.second, 128U))
+	    {
+	      /* Don't issue a warning for every simdclone when there is no
+		 specific simdlen clause.  */
+	      if (explicit_p && known_ne (clonei->simdlen, 0U))
+		warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+			    "GCC does not currently support simdlen %wd for "
+			    "type %qT",
+			    constant_lower_bound (simdlens[j]), elt.first);
+	      remove_simdlen = true;
+	      break;
+	    }
       if (remove_simdlen)
 	{
 	  count--;
@@ -27479,6 +27495,13 @@  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 
   gcc_assert (num < count);
   clonei->simdlen = simdlens[num];
+  if (clonei->simdlen.is_constant ())
+    clonei->vecsize_mangle = 'n';
+  else
+    {
+      clonei->vecsize_mangle = 's';
+      clonei->inbranch = 1;
+    }
   return count;
 }
 
@@ -27493,6 +27516,11 @@  aarch64_simd_clone_adjust (struct cgraph_node *node)
   tree t = TREE_TYPE (node->decl);
   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
 					TYPE_ATTRIBUTES (t));
+  if (node->simdclone->vecsize_mangle == 's')
+    {
+      tree target = build_string (strlen ("+sve"), "+sve");
+      aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0);
+    }
 }
 
 /* Implement TARGET_SIMD_CLONE_USABLE.  */
@@ -27517,6 +27545,57 @@  aarch64_simd_clone_usable (struct cgraph_node *node, machine_mode vector_mode)
     }
 }
 
+/* Implement TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM.  */
+
+static tree
+aarch64_simd_clone_adjust_ret_or_param (cgraph_node *node, tree type,
+					bool is_mask)
+{
+  if (type
+      && VECTOR_TYPE_P (type)
+      && node->simdclone->vecsize_mangle == 's')
+    {
+      cl_target_option cur_target;
+      cl_target_option_save (&cur_target, &global_options, &global_options_set);
+      tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
+      cl_target_option_restore (&global_options, &global_options_set,
+				TREE_TARGET_OPTION (new_target));
+      aarch64_override_options_internal (&global_options);
+      bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
+      memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
+	      sizeof (have_regs_of_mode));
+      for (int i = 0; i < NUM_MACHINE_MODES; ++i)
+	if (aarch64_sve_mode_p ((machine_mode) i))
+	  have_regs_of_mode[i] = true;
+      poly_uint16 old_sve_vg = aarch64_sve_vg;
+      if (!node->simdclone->simdlen.is_constant ())
+	aarch64_sve_vg = poly_uint16 (2, 2);
+      unsigned int num_zr = 0;
+      unsigned int num_pr = 0;
+      type = TREE_TYPE (type);
+      type = build_vector_type (type, node->simdclone->simdlen);
+      if (is_mask)
+	{
+	  type = truth_type_for (type);
+	  num_pr = 1;
+	}
+      else
+	num_zr = 1;
+
+      aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, NULL);
+      cl_target_option_restore (&global_options, &global_options_set, &cur_target);
+      aarch64_override_options_internal (&global_options);
+      memcpy (have_regs_of_mode, m_old_have_regs_of_mode,
+	      sizeof (have_regs_of_mode));
+      aarch64_sve_vg = old_sve_vg;
+    }
+  else if (type
+	   && VECTOR_TYPE_P (type)
+	   && is_mask)
+    type = truth_type_for (type);
+  return type;
+}
+
 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
 
 static int
@@ -28590,6 +28669,10 @@  aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_SIMD_CLONE_ADJUST
 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
 
+#undef TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM
+#define TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM \
+  aarch64_simd_clone_adjust_ret_or_param
+
 #undef TARGET_SIMD_CLONE_USABLE
 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
 
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index c2fd4d3be878e56b6394e34097d2de826a0ba1ff..091f194f1829fb9f70827d8674fd4dae44282d55 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -541,9 +541,12 @@  simd_clone_mangle (struct cgraph_node *node,
   pp_string (&pp, "_ZGV");
   pp_character (&pp, vecsize_mangle);
   pp_character (&pp, mask);
-  /* For now, simdlen is always constant, while variable simdlen pp 'n'.  */
-  unsigned int len = simdlen.to_constant ();
-  pp_decimal_int (&pp, (len));
+
+  unsigned long long len = 0;
+  if (simdlen.is_constant (&len))
+    pp_decimal_int (&pp, (int) (len));
+  else
+    pp_character (&pp, 'x');
 
   for (n = 0; n < clone_info->nargs; ++n)
     {
@@ -1499,8 +1502,8 @@  simd_clone_adjust (struct cgraph_node *node)
 	 below).  */
       loop = alloc_loop ();
       cfun->has_force_vectorize_loops = true;
-      /* For now, simlen is always constant.  */
-      loop->safelen = node->simdclone->simdlen.to_constant ();
+      /* We can assert that safelen is the 'minimum' simdlen.  */
+      loop->safelen = constant_lower_bound (node->simdclone->simdlen);
       loop->force_vectorize = true;
       loop->header = body_bb;
     }
diff --git a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
index e3668893afe33a58c029cddd433d9bf43cce2bfa..12f8b3b839b7f3ff9e4f99768e59c0e1c5339062 100644
--- a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
+++ b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
@@ -21,7 +21,7 @@  test1 (int x)
      shall call f01 with score 8.  */
   /* { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } */
   /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } */
-  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } */
+  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 12 "optimized" { target { aarch64*-*-* } } } } */
   /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } */
   /* { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } */
   int a = f04 (x);
diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c
index aab8c17f0c442a7cda4dce23cc18162a0b7f676e..add6e7c93019834fbd5bed5ead18b52d4cdd0a37 100644
--- a/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-1.c
@@ -4,28 +4,39 @@ 
 extern "C" {
 #endif
 #pragma omp declare simd
-int __attribute__ ((const)) f00 (int a , char b) /* { dg-warning {GCC does not currently support a simdclone with simdlens 8 and 16 for these types.} } */
+int __attribute__ ((const)) f00 (int a , char b)
 {
   return a + b;
 }
 
+/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f00} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvv_f00} } } */
+
 #pragma omp declare simd
-long long __attribute__ ((const)) f01 (int a , short b) /* { dg-warning {GCC does not currently support a simdclone with simdlens 4 and 8 for these types.} } */
+long long __attribute__ ((const)) f01 (int a , short b)
 {
   return a + b;
 }
+/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f01} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvv_f01} } } */
 
 #pragma omp declare simd linear(b)
-long long __attribute__ ((const)) f02 (short *b, int a) /* { dg-warning {GCC does not currently support a simdclone with simdlens 4 and 8 for these types.} } */
+long long __attribute__ ((const)) f02 (short *b, int a)
 {
   return a + *b;
 }
 
+/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f02} } } */
+/* { dg-final { scan-assembler {_ZGVsMxl2v_f02} } } */
+
 #pragma omp declare simd uniform(b)
-void f03 (char b, int a) /* { dg-warning {GCC does not currently support a simdclone with simdlens 8 and 16 for these types.} } */
+void f03 (char b, int a)
 {
 }
 
+/* { dg-final { scan-assembler-not {_ZGVn[a-z0-9]+_f03} } } */
+/* { dg-final { scan-assembler {_ZGVsMxuv_f03} } } */
+
 #pragma omp declare simd simdlen(4)
 double f04 (void) /* { dg-warning {GCC does not currently support simdlen 4 for type 'double'} } */
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
index abb128ffc9cd2c1353b99eb38aae72377746e6d6..604869a30456e4db988bba86e059a27f19dda589 100644
--- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
@@ -10,6 +10,7 @@  short __attribute__ ((const)) f00 (short a , char b)
 }
 /* { dg-final { scan-assembler {_ZGVnN8vv_f00:} } } */
 /* { dg-final { scan-assembler {_ZGVnM8vv_f00:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvv_f00:} } } */
 
 #pragma omp declare simd notinbranch
 short __attribute__ ((const)) f01 (int a , short b)
@@ -17,6 +18,7 @@  short __attribute__ ((const)) f01 (int a , short b)
   return a + b;
 }
 /* { dg-final { scan-assembler {_ZGVnN4vv_f01:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvv_f01:} } } */
 
 #pragma omp declare simd linear(b) inbranch
 int __attribute__ ((const)) f02 (int a, short *b)
@@ -24,6 +26,7 @@  int __attribute__ ((const)) f02 (int a, short *b)
   return a + *b;
 }
 /* { dg-final { scan-assembler {_ZGVnM4vl2_f02:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvl2_f02:} } } */
 
 #pragma omp declare simd uniform(a) notinbranch
 void f03 (char b, int a)
@@ -31,6 +34,7 @@  void f03 (char b, int a)
 }
 /* { dg-final { scan-assembler {_ZGVnN8vu_f03:} } } */
 /* { dg-final { scan-assembler {_ZGVnN16vu_f03:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxvu_f03:} } } */
 
 #pragma omp declare simd simdlen(2)
 float f04 (double a)
@@ -39,6 +43,7 @@  float f04 (double a)
 }
 /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */
 /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */
+/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */
 
 #pragma omp declare simd uniform(a) linear (b)
 void f05 (short a, short *b, short c)
@@ -50,6 +55,7 @@  void f05 (short a, short *b, short c)
 /* { dg-final { scan-assembler {_ZGVnN4ul2v_f05:} } } */
 /* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */
 /* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */
+/* { dg-final { scan-assembler {_ZGVsMxul2v_f05:} } } */
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
index 6319df0558f37b95f1b2eb17374bdb4ecbc33295..38677b8f7a76b960ce9363b1c0cabf6fc5086ab6 100644
--- a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
+++ b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
@@ -41,7 +41,7 @@  contains
     ! shall call f01 with score 8.
     ! { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } }
     ! { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } }
-    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 6 "optimized" { target { aarch64*-*-* } } } }
+    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 8 "optimized" { target { aarch64*-*-* } } } }
     ! { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } }
     ! { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } }
     a = f04 (x)