diff mbox series

[AArch64] Implement ACLE Data Intrinsics

Message ID c9db158d-b0d5-3f4e-97dd-0f779195ad1d@arm.com
State New
Headers show
Series [AArch64] Implement ACLE Data Intrinsics | expand

Commit Message

Andre Vieira (lists) June 10, 2022, 1:37 p.m. UTC
Hi,

This patch adds support for the ACLE Data Intrinsics to the AArch64 port.

Bootstrapped and regression tested on aarch64-none-linux.

OK for trunk?

gcc/ChangeLog:

2022-06-10  Andre Vieira  <andre.simoesdiasvieira@arm.com>

         * config/aarch64/aarch64.md (rbit<mode>2): Rename this ...
         (@aarch64_rbit<mode>): ... to this and change it in...
         (ffs<mode>2,ctz<mode>2): ... here.
         (@aarch64_rev16<mode>): New.
         * config/aarch64/aarch64-builtins.cc (aarch64_builtins):
         Define the following enum values: AARCH64_REV16, AARCH64_REV16L,
         AARCH64_REV16LL, AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL.
         (aarch64_init_data_intrinsics): New.
         (handle_arm_acle_h): Add call to aarch64_init_data_intrinsics.
         (aarch64_expand_builtin_data_intrinsic): New.
         (aarch64_general_expand_builtin): Add call to
         aarch64_expand_builtin_data_intrinsic.
         * config/aarch64/arm_acle.h (__clz, __clzl, __clzll, __cls,
         __clsl, __clsll, __rbit, __rbitl, __rbitll, __rev, __revl,
         __revll, __rev16, __rev16l, __rev16ll, __ror, __rorl, __rorll,
         __revsh): New.

gcc/testsuite/ChangeLog:

2022-06-10  Andre Vieira  <andre.simoesdiasvieira@arm.com>

     * gcc.target/aarch64/acle/data-intrinsics.c: New test.

Comments

Richard Sandiford June 17, 2022, 10:54 a.m. UTC | #1
"Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
> Hi,
>
> This patch adds support for the ACLE Data Intrinsics to the AArch64 port.
>
> Bootstrapped and regression tested on aarch64-none-linux.
>
> OK for trunk?

Sorry for the slow review.

>
> gcc/ChangeLog:
>
> 2022-06-10  Andre Vieira  <andre.simoesdiasvieira@arm.com>
>
>          * config/aarch64/aarch64.md (rbit<mode>2): Rename this ...
>          (@aarch64_rbit<mode>): ... this and change it in...
>          (ffs<mode>2,ctz<mode>2): ... here.
>          (@aarch64_rev16<mode>): New.
>          * config/aarch64/aarch64-builtins.cc: (aarch64_builtins):
>          Define the following enum AARCH64_REV16, AARCH64_REV16L, 
> AARCH64_REV16LL,
>          AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL.
>          (aarch64_init_data_intrinsics): New.
>          (handle_arm_acle_h): Add call to aarch64_init_data_intrinsics.
>          (aarch64_expand_builtin_data_intrinsic): New.
>          (aarch64_general_expand_builtin): Add call to 
> aarch64_expand_builtin_data_intrinsic.
>          * config/aarch64/arm_acle.h (__clz, __clzl, __clzll, __cls, 
> __clsl, __clsll, __rbit,
>          __rbitl, __rbitll, __rev, __revl, __revll, __rev16, __rev16l, 
> __rev16ll, __ror, __rorl,
>          __rorll, __revsh): New.
>
> gcc/testsuite/ChangeLog:
>
> 2022-06-10  Andre Vieira  <andre.simoesdiasvieira@arm.com>
>
>      * gcc.target/aarch64/acle/data-intrinsics.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> index e0a741ac663188713e21f457affa57217d074783..91a687dee13a27c21f0c50de9ba777aa900d6096 100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -613,6 +613,12 @@ enum aarch64_builtins
>    AARCH64_LS64_BUILTIN_ST64B,
>    AARCH64_LS64_BUILTIN_ST64BV,
>    AARCH64_LS64_BUILTIN_ST64BV0,
> +  AARCH64_REV16,
> +  AARCH64_REV16L,
> +  AARCH64_REV16LL,
> +  AARCH64_RBIT,
> +  AARCH64_RBITL,
> +  AARCH64_RBITLL,
>    AARCH64_BUILTIN_MAX
>  };
>  
> @@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
>        = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
>  }
>  
> +static void
> +aarch64_init_data_intrinsics (void)
> +{
> +  tree uint32_fntype = build_function_type_list (uint32_type_node,
> +						 uint32_type_node, NULL_TREE);
> +  tree long_fntype = build_function_type_list (long_unsigned_type_node,
> +					       long_unsigned_type_node,
> +					       NULL_TREE);

Very minor, but ulong_fntype might be clearer, since the other two
variable names are explicitly unsigned.

> +  tree uint64_fntype = build_function_type_list (uint64_type_node,
> +						 uint64_type_node, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_REV16]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
> +				   AARCH64_REV16);
> +  aarch64_builtin_decls[AARCH64_REV16L]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16l", long_fntype,
> +				   AARCH64_REV16L);
> +  aarch64_builtin_decls[AARCH64_REV16LL]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
> +				   AARCH64_REV16LL);
> +  aarch64_builtin_decls[AARCH64_RBIT]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
> +				   AARCH64_RBIT);
> +  aarch64_builtin_decls[AARCH64_RBITL]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitl", long_fntype,
> +				   AARCH64_RBITL);
> +  aarch64_builtin_decls[AARCH64_RBITLL]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
> +				   AARCH64_RBITLL);
> +}
> +
>  /* Implement #pragma GCC aarch64 "arm_acle.h".  */
>  void
>  handle_arm_acle_h (void)
>  {
> +  aarch64_init_data_intrinsics ();
>    if (TARGET_LS64)
>      aarch64_init_ls64_builtins ();
>  }
> @@ -2393,6 +2430,32 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target)
>    emit_insn (pat);
>    return target;
>  }
> +/* Function to expand an expression EXP which calls one of the ACLE Data
> +   Intrinsic builtins FCODE with the result going to TARGET.  */
> +static rtx
> +aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
> +{
> +  rtx op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
> +  machine_mode mode = GET_MODE (op0);
> +  rtx pat;
> +  switch (fcode)
> +    {
> +    case AARCH64_REV16:
> +    case AARCH64_REV16L:
> +    case AARCH64_REV16LL:
> +      pat = gen_aarch64_rev16 (mode, target, op0);

Does this work when op0 is a constant or comes from memory?
Same for when target is a memory.  E.g. does:

void test_rev16 (uint32_t *ptr)
{
  *ptr = __rev16 (*ptr);
}

work?

It'd be more robust to use the expand_insn interface instead;
see aarch64_expand_builtin_ls64 for an example.

> +      break;
> +    case AARCH64_RBIT:
> +    case AARCH64_RBITL:
> +    case AARCH64_RBITLL:
> +      pat = gen_aarch64_rbit (mode, target, op0);
> +      break;
> +    default:
> +      gcc_unreachable ();
> +    }
> +  emit_insn (pat);
> +  return target;
> +}
>  
>  /* Expand an expression EXP as fpsr or fpcr setter (depending on
>     UNSPEC) using MODE.  */
> @@ -2551,6 +2614,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target,
>    if (fcode >= AARCH64_MEMTAG_BUILTIN_START
>        && fcode <= AARCH64_MEMTAG_BUILTIN_END)
>      return aarch64_expand_builtin_memtag (fcode, exp, target);
> +  if (fcode >= AARCH64_REV16
> +      && fcode <= AARCH64_RBITLL)
> +    return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
>  
>    gcc_unreachable ();
>  }
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index acec8c1146765c0fac73c15351853324b8f03209..ef0aed25c6b26eff61f9f6030dc5921a534e3d19 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4950,7 +4950,7 @@ (define_expand "ffs<mode>2"
>      rtx ccreg = aarch64_gen_compare_reg (EQ, operands[1], const0_rtx);
>      rtx x = gen_rtx_NE (VOIDmode, ccreg, const0_rtx);
>  
> -    emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
> +    emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
>      emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
>      emit_insn (gen_csinc3<mode>_insn (operands[0], x, operands[0], const0_rtx));
>      DONE;
> @@ -4996,7 +4996,7 @@ (define_insn "clrsb<mode>2"
>    [(set_attr "type" "clz")]
>  )
>  
> -(define_insn "rbit<mode>2"
> +(define_insn "@aarch64_rbit<mode>"
>    [(set (match_operand:GPI 0 "register_operand" "=r")
>  	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_RBIT))]
>    ""
> @@ -5017,7 +5017,7 @@ (define_insn_and_split "ctz<mode>2"
>    "reload_completed"
>    [(const_int 0)]
>    "
> -  emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
> +  emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
>    emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
>    DONE;
>  ")
> @@ -6022,6 +6022,13 @@ (define_insn "bswaphi2"
>    [(set_attr "type" "rev")]
>  )
>  
> +(define_insn "@aarch64_rev16<mode>"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_REV))]
> +  ""
> +  "rev16\\t%<w>0, %<w>1"
> +  [(set_attr "type" "rev")])
> +
>  (define_insn "*aarch64_bfxil<mode>"
>    [(set (match_operand:GPI 0 "register_operand" "=r,r")
>      (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
> index 9775a48c65825b424d3eb442384f5ab87b734fd7..faddd5d0a780c5d65ba430bd3174c701e848c794 100644
> --- a/gcc/config/aarch64/arm_acle.h
> +++ b/gcc/config/aarch64/arm_acle.h
> @@ -28,6 +28,7 @@
>  #define _GCC_ARM_ACLE_H
>  
>  #include <stdint.h>
> +#include <stddef.h>
>  
>  #pragma GCC aarch64 "arm_acle.h"
>  
> @@ -35,6 +36,54 @@
>  extern "C" {
>  #endif
>  
> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)			      \
> +__extension__ extern __inline TYPE				      \
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))   \
> +NAME (TYPE value, uint32_t rotate)				      \

The names of the parameters and local variables need the same __
uglification as __revl.  Same for _GCC_ARM_ACLE_DATA_FN.

> +{								      \
> +  size_t size = sizeof (TYPE) * __CHAR_BIT__;			      \
> +  rotate = rotate % size;					      \
> +  return value >> rotate | value << (size - rotate);		      \

This runs into UB for rotate == 0.

> +}
> +
> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)

Would be good to undef the macro once we're done with it, to reduce
noise in things like -dM.

> +
> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, TYPE)		    \
> +__extension__ extern __inline TYPE				    \
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
> +__##NAME (TYPE value)						    \
> +{								    \
> +  return __builtin_##BUILTIN (value);				    \
> +}
> +
> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long)
> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t)
> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long)
> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t)

The ACLE spec says that these should return unsigned int, so I guess
either these functions should have their own macro or the macro above
should have a separate parameter for the return type.

Thanks,
Richard

> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long)
> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t)
> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long)
> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t)
> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t)
> +_GCC_ARM_ACLE_DATA_FN (rev, bswap32, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (revll, bswap64, uint64_t)
> +
> +__extension__ extern __inline unsigned long
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +__revl (unsigned long __value)
> +{
> +  if (sizeof (unsigned long) == 8)
> +    return __revll (__value);
> +  else
> +    return __rev (__value);
> +}
> +
>  #pragma GCC push_options
>  #pragma GCC target ("arch=armv8.3-a")
>  __extension__ extern __inline int32_t
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..90813184704dfcdaf2d24d523ff744aa6cbedf1a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
> @@ -0,0 +1,215 @@
> +/* Test the ACLE data intrinsics.  */
> +/* { dg-do assemble } */
> +/* { dg-additional-options "--save-temps -O1" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include "arm_acle.h"
> +
> +/*
> +** test_clz:
> +**	clz	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_clz (uint32_t a)
> +{
> +  return __clz (a);
> +}
> +
> +/*
> +** test_clzl:
> +**	clz	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_clzl (unsigned long a)
> +{
> +  return __clzl (a);
> +}
> +
> +/*
> +** test_clzll:
> +**	clz	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_clzll (uint64_t a)
> +{
> +  return __clzll (a);
> +}
> +
> +/*
> +** test_cls:
> +**	cls	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_cls (uint32_t a)
> +{
> +  return __cls (a);
> +}
> +
> +/*
> +** test_clsl:
> +**	cls	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_clsl (unsigned long a)
> +{
> +  return __clsl (a);
> +}
> +
> +/*
> +** test_clsll:
> +**	cls	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_clsll (uint64_t a)
> +{
> +  return __clsll (a);
> +}
> +
> +/*
> +** test_rbit:
> +**	rbit	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_rbit (uint32_t a)
> +{
> +  return __rbit (a);
> +}
> +
> +/*
> +** test_rbitl:
> +**	rbit	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_rbitl (unsigned long a)
> +{
> +  return __rbitl (a);
> +}
> +
> +/*
> +** test_rbitll:
> +**	rbit	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_rbitll (uint64_t a)
> +{
> +  return __rbitll (a);
> +}
> +
> +/*
> +** test_rev:
> +**	rev	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_rev (uint32_t a)
> +{
> +  return __builtin_bswap32 (a);
> +}
> +
> +/*
> +** test_revl:
> +**	rev	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_revl (unsigned long a)
> +{
> +  return __revl (a);
> +}
> +
> +/*
> +** test_revll:
> +**	rev	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_revll (uint64_t a)
> +{
> +  return __revll (a);
> +}
> +
> +/*
> +** test_rev16:
> +**	rev16	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_rev16 (uint32_t a)
> +{
> +  return __rev16 (a);
> +}
> +
> +/*
> +** test_rev16l:
> +**	rev16	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_rev16l (unsigned long a)
> +{
> +  return __rev16l (a);
> +}
> +
> +/*
> +** test_rev16ll:
> +**	rev16	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_rev16ll (uint64_t a)
> +{
> +  return __rev16ll (a);
> +}
> +
> +/*
> +** test_ror:
> +**	ror	w0, w0, w1
> +**	ret
> +*/
> +
> +uint32_t test_ror (uint32_t a, uint32_t r)
> +{
> +  return __ror (a, r);
> +}
> +
> +/*
> +** test_rorl:
> +**	ror	[wx]0, [wx]0, [wx]1
> +**	ret
> +*/
> +
> +unsigned long test_rorl (unsigned long a, uint32_t r)
> +{
> +  return __rorl (a, r);
> +}
> +
> +/*
> +** test_rorll:
> +**	ror	x0, x0, x1
> +**	ret
> +*/
> +
> +uint64_t test_rorll (uint64_t a, uint32_t r)
> +{
> +  return __rorll (a, r);
> +}
> +
> +/*
> +** test_revsh:
> +**	rev16	w0, w0
> +**	ret
> +*/
> +
> +int16_t test_revsh (int16_t a)
> +{
> +  return __revsh (a);
> +}
Andre Vieira (lists) June 28, 2022, 2:04 p.m. UTC | #2
On 17/06/2022 11:54, Richard Sandiford wrote:
> "Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
>> Hi,
>>
>> This patch adds support for the ACLE Data Intrinsics to the AArch64 port.
>>
>> Bootstrapped and regression tested on aarch64-none-linux.
>>
>> OK for trunk?
> Sorry for the slow review.
No worries :)
>
>> +{								      \
>> +  size_t size = sizeof (TYPE) * __CHAR_BIT__;			      \
>> +  rotate = rotate % size;					      \
>> +  return value >> rotate | value << (size - rotate);		      \
> This runs into UB for rotate == 0.
I assume that's because of the `value << size` when rotate is 0, no? I have
added a modulo to the shift amount; I assume it's legal to shift by 0?

This OK?
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index e0a741ac663188713e21f457affa57217d074783..69f1fb3604a481fa378d105cf3ee98edec1ba619 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -613,6 +613,12 @@ enum aarch64_builtins
   AARCH64_LS64_BUILTIN_ST64B,
   AARCH64_LS64_BUILTIN_ST64BV,
   AARCH64_LS64_BUILTIN_ST64BV0,
+  AARCH64_REV16,
+  AARCH64_REV16L,
+  AARCH64_REV16LL,
+  AARCH64_RBIT,
+  AARCH64_RBITL,
+  AARCH64_RBITLL,
   AARCH64_BUILTIN_MAX
 };
 
@@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
       = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
 }
 
+static void
+aarch64_init_data_intrinsics (void)
+{
+  tree uint32_fntype = build_function_type_list (uint32_type_node,
+						 uint32_type_node, NULL_TREE);
+  tree ulong_fntype = build_function_type_list (long_unsigned_type_node,
+						long_unsigned_type_node,
+						NULL_TREE);
+  tree uint64_fntype = build_function_type_list (uint64_type_node,
+						 uint64_type_node, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_REV16]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
+				   AARCH64_REV16);
+  aarch64_builtin_decls[AARCH64_REV16L]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rev16l", ulong_fntype,
+				   AARCH64_REV16L);
+  aarch64_builtin_decls[AARCH64_REV16LL]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
+				   AARCH64_REV16LL);
+  aarch64_builtin_decls[AARCH64_RBIT]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
+				   AARCH64_RBIT);
+  aarch64_builtin_decls[AARCH64_RBITL]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rbitl", ulong_fntype,
+				   AARCH64_RBITL);
+  aarch64_builtin_decls[AARCH64_RBITLL]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
+				   AARCH64_RBITLL);
+}
+
 /* Implement #pragma GCC aarch64 "arm_acle.h".  */
 void
 handle_arm_acle_h (void)
 {
+  aarch64_init_data_intrinsics ();
   if (TARGET_LS64)
     aarch64_init_ls64_builtins ();
 }
@@ -2393,6 +2430,40 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target)
   emit_insn (pat);
   return target;
 }
+/* Function to expand an expression EXP which calls one of the ACLE Data
+   Intrinsic builtins FCODE with the result going to TARGET.  */
+static rtx
+aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
+{
+  expand_operand ops[2];
+  machine_mode mode = GET_MODE (target);
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], expand_normal (CALL_EXPR_ARG (exp, 0)), mode);
+  enum insn_code icode;
+  switch (fcode)
+    {
+    case AARCH64_REV16:
+    case AARCH64_REV16L:
+    case AARCH64_REV16LL:
+      if (mode == SImode)
+	icode = CODE_FOR_aarch64_rev16si;
+      else
+	icode = CODE_FOR_aarch64_rev16di;
+      break;
+    case AARCH64_RBIT:
+    case AARCH64_RBITL:
+    case AARCH64_RBITLL:
+      if (mode == SImode)
+	icode = CODE_FOR_aarch64_rbitsi;
+      else
+	icode = CODE_FOR_aarch64_rbitdi;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  expand_insn (icode, 2, ops);
+  return target;
+}
 
 /* Expand an expression EXP as fpsr or fpcr setter (depending on
    UNSPEC) using MODE.  */
@@ -2551,6 +2622,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target,
   if (fcode >= AARCH64_MEMTAG_BUILTIN_START
       && fcode <= AARCH64_MEMTAG_BUILTIN_END)
     return aarch64_expand_builtin_memtag (fcode, exp, target);
+  if (fcode >= AARCH64_REV16
+      && fcode <= AARCH64_RBITLL)
+    return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
 
   gcc_unreachable ();
 }
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index acec8c1146765c0fac73c15351853324b8f03209..ef0aed25c6b26eff61f9f6030dc5921a534e3d19 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4950,7 +4950,7 @@ (define_expand "ffs<mode>2"
     rtx ccreg = aarch64_gen_compare_reg (EQ, operands[1], const0_rtx);
     rtx x = gen_rtx_NE (VOIDmode, ccreg, const0_rtx);
 
-    emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
+    emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
     emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
     emit_insn (gen_csinc3<mode>_insn (operands[0], x, operands[0], const0_rtx));
     DONE;
@@ -4996,7 +4996,7 @@ (define_insn "clrsb<mode>2"
   [(set_attr "type" "clz")]
 )
 
-(define_insn "rbit<mode>2"
+(define_insn "@aarch64_rbit<mode>"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_RBIT))]
   ""
@@ -5017,7 +5017,7 @@ (define_insn_and_split "ctz<mode>2"
   "reload_completed"
   [(const_int 0)]
   "
-  emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
+  emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
   emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
   DONE;
 ")
@@ -6022,6 +6022,13 @@ (define_insn "bswaphi2"
   [(set_attr "type" "rev")]
 )
 
+(define_insn "@aarch64_rev16<mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_REV))]
+  ""
+  "rev16\\t%<w>0, %<w>1"
+  [(set_attr "type" "rev")])
+
 (define_insn "*aarch64_bfxil<mode>"
   [(set (match_operand:GPI 0 "register_operand" "=r,r")
     (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290083f3f072fd5a2ce 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -28,6 +28,7 @@
 #define _GCC_ARM_ACLE_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 #pragma GCC aarch64 "arm_acle.h"
 
@@ -35,6 +36,58 @@
 extern "C" {
 #endif
 
+#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)				  \
+__extension__ extern __inline TYPE					  \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	  \
+NAME (TYPE __value, uint32_t __rotate)					  \
+{									  \
+  size_t __size = sizeof (TYPE) * __CHAR_BIT__;				  \
+  __rotate = __rotate % __size;						  \
+  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
+}
+
+_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
+_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
+_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
+
+#undef _GCC_ARM_ACLE_ROR_FN
+
+#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)	    \
+__extension__ extern __inline RTYPE				    \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+__##NAME (ITYPE __value)					    \
+{								    \
+  return __builtin_##BUILTIN (__value);				    \
+}
+
+_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, uint16_t)
+_GCC_ARM_ACLE_DATA_FN (rev, bswap32, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (revll, bswap64, uint64_t, uint64_t)
+
+#undef _GCC_ARM_ACLE_DATA_FN
+
+__extension__ extern __inline unsigned long
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__revl (unsigned long __value)
+{
+  if (sizeof (unsigned long) == 8)
+    return __revll (__value);
+  else
+    return __rev (__value);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("arch=armv8.3-a")
 __extension__ extern __inline int32_t
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
new file mode 100644
index 0000000000000000000000000000000000000000..90813184704dfcdaf2d24d523ff744aa6cbedf1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
@@ -0,0 +1,215 @@
+/* Test the ACLE data intrinsics.  */
+/* { dg-do assemble } */
+/* { dg-additional-options "--save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include "arm_acle.h"
+
+/*
+** test_clz:
+**	clz	w0, w0
+**	ret
+*/
+
+uint32_t test_clz (uint32_t a)
+{
+  return __clz (a);
+}
+
+/*
+** test_clzl:
+**	clz	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_clzl (unsigned long a)
+{
+  return __clzl (a);
+}
+
+/*
+** test_clzll:
+**	clz	x0, x0
+**	ret
+*/
+
+uint64_t test_clzll (uint64_t a)
+{
+  return __clzll (a);
+}
+
+/*
+** test_cls:
+**	cls	w0, w0
+**	ret
+*/
+
+uint32_t test_cls (uint32_t a)
+{
+  return __cls (a);
+}
+
+/*
+** test_clsl:
+**	cls	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_clsl (unsigned long a)
+{
+  return __clsl (a);
+}
+
+/*
+** test_clsll:
+**	cls	x0, x0
+**	ret
+*/
+
+uint64_t test_clsll (uint64_t a)
+{
+  return __clsll (a);
+}
+
+/*
+** test_rbit:
+**	rbit	w0, w0
+**	ret
+*/
+
+uint32_t test_rbit (uint32_t a)
+{
+  return __rbit (a);
+}
+
+/*
+** test_rbitl:
+**	rbit	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_rbitl (unsigned long a)
+{
+  return __rbitl (a);
+}
+
+/*
+** test_rbitll:
+**	rbit	x0, x0
+**	ret
+*/
+
+uint64_t test_rbitll (uint64_t a)
+{
+  return __rbitll (a);
+}
+
+/*
+** test_rev:
+**	rev	w0, w0
+**	ret
+*/
+
+uint32_t test_rev (uint32_t a)
+{
+  return __builtin_bswap32 (a);
+}
+
+/*
+** test_revl:
+**	rev	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_revl (unsigned long a)
+{
+  return __revl (a);
+}
+
+/*
+** test_revll:
+**	rev	x0, x0
+**	ret
+*/
+
+uint64_t test_revll (uint64_t a)
+{
+  return __revll (a);
+}
+
+/*
+** test_rev16:
+**	rev16	w0, w0
+**	ret
+*/
+
+uint32_t test_rev16 (uint32_t a)
+{
+  return __rev16 (a);
+}
+
+/*
+** test_rev16l:
+**	rev16	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_rev16l (unsigned long a)
+{
+  return __rev16l (a);
+}
+
+/*
+** test_rev16ll:
+**	rev16	x0, x0
+**	ret
+*/
+
+uint64_t test_rev16ll (uint64_t a)
+{
+  return __rev16ll (a);
+}
+
+/*
+** test_ror:
+**	ror	w0, w0, w1
+**	ret
+*/
+
+uint32_t test_ror (uint32_t a, uint32_t r)
+{
+  return __ror (a, r);
+}
+
+/*
+** test_rorl:
+**	ror	[wx]0, [wx]0, [wx]1
+**	ret
+*/
+
+unsigned long test_rorl (unsigned long a, uint32_t r)
+{
+  return __rorl (a, r);
+}
+
+/*
+** test_rorll:
+**	ror	x0, x0, x1
+**	ret
+*/
+
+uint64_t test_rorll (uint64_t a, uint32_t r)
+{
+  return __rorll (a, r);
+}
+
+/*
+** test_revsh:
+**	rev16	w0, w0
+**	ret
+*/
+
+int16_t test_revsh (int16_t a)
+{
+  return __revsh (a);
+}
Richard Sandiford June 29, 2022, 7:18 a.m. UTC | #3
"Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
> On 17/06/2022 11:54, Richard Sandiford wrote:
>> "Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
>>> Hi,
>>>
>>> This patch adds support for the ACLE Data Intrinsics to the AArch64 port.
>>>
>>> Bootstrapped and regression tested on aarch64-none-linux.
>>>
>>> OK for trunk?
>> Sorry for the slow review.
> No worries :)
>>
>>> +{								      \
>>> +  size_t size = sizeof (TYPE) * __CHAR_BIT__;			      \
>>> +  rotate = rotate % size;					      \
>>> +  return value >> rotate | value << (size - rotate);		      \
>> This runs into UB for rotate == 0.
> I assume it's because of the value << size no?

Yeah.

> I added a modulo, I assume it's legal to shift by 0?

Thanks, and yeah, shifting by zero is fine.

> This OK?
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> index e0a741ac663188713e21f457affa57217d074783..69f1fb3604a481fa378d105cf3ee98edec1ba619 100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -613,6 +613,12 @@ enum aarch64_builtins
>    AARCH64_LS64_BUILTIN_ST64B,
>    AARCH64_LS64_BUILTIN_ST64BV,
>    AARCH64_LS64_BUILTIN_ST64BV0,
> +  AARCH64_REV16,
> +  AARCH64_REV16L,
> +  AARCH64_REV16LL,
> +  AARCH64_RBIT,
> +  AARCH64_RBITL,
> +  AARCH64_RBITLL,
>    AARCH64_BUILTIN_MAX
>  };
>  
> @@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
>        = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
>  }
>  
> +static void
> +aarch64_init_data_intrinsics (void)
> +{
> +  tree uint32_fntype = build_function_type_list (uint32_type_node,
> +						 uint32_type_node, NULL_TREE);
> +  tree ulong_fntype = build_function_type_list (long_unsigned_type_node,
> +						long_unsigned_type_node,
> +						NULL_TREE);
> +  tree uint64_fntype = build_function_type_list (uint64_type_node,
> +						 uint64_type_node, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_REV16]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
> +				   AARCH64_REV16);
> +  aarch64_builtin_decls[AARCH64_REV16L]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16l", ulong_fntype,
> +				   AARCH64_REV16L);
> +  aarch64_builtin_decls[AARCH64_REV16LL]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
> +				   AARCH64_REV16LL);
> +  aarch64_builtin_decls[AARCH64_RBIT]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
> +				   AARCH64_RBIT);
> +  aarch64_builtin_decls[AARCH64_RBITL]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitl", ulong_fntype,
> +				   AARCH64_RBITL);
> +  aarch64_builtin_decls[AARCH64_RBITLL]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
> +				   AARCH64_RBITLL);
> +}
> +
>  /* Implement #pragma GCC aarch64 "arm_acle.h".  */
>  void
>  handle_arm_acle_h (void)
>  {
> +  aarch64_init_data_intrinsics ();
>    if (TARGET_LS64)
>      aarch64_init_ls64_builtins ();
>  }
> @@ -2393,6 +2430,40 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target)
>    emit_insn (pat);
>    return target;
>  }

Nit: missing blank line here.

> +/* Function to expand an expression EXP which calls one of the ACLE Data
> +   Intrinsic builtins FCODE with the result going to TARGET.  */
> +static rtx
> +aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
> +{
> +  expand_operand ops[2];
> +  machine_mode mode = GET_MODE (target);
> +  create_output_operand (&ops[0], target, mode);
> +  create_input_operand (&ops[1], expand_normal (CALL_EXPR_ARG (exp, 0)), mode);
> +  enum insn_code icode;
> +  switch (fcode)
> +    {
> +    case AARCH64_REV16:
> +    case AARCH64_REV16L:
> +    case AARCH64_REV16LL:
> +      if (mode == SImode)
> +	icode = CODE_FOR_aarch64_rev16si;
> +      else
> +	icode = CODE_FOR_aarch64_rev16di;

You should be able to do:

  icode = code_for_aarch64_rev16 (mode);

instead.  Same for the next cases.

> +      break;
> +    case AARCH64_RBIT:
> +    case AARCH64_RBITL:
> +    case AARCH64_RBITLL:
> +      if (mode == SImode)
> +	icode = CODE_FOR_aarch64_rbitsi;
> +      else
> +	icode = CODE_FOR_aarch64_rbitdi;
> +      break;
> +    default:
> +      gcc_unreachable ();
> +    }
> +  expand_insn (icode, 2, ops);
> +  return target;

This needs to return ops[0].value instead, since "target" just suggests
a possible location.

Could you add tests for a memory source and memory destination, e.g.:

void test_clz_mem (uint32_t *a)
{
  *a = __clz (*a);
}

Without tests like that, these comments probably just sound like a paper
exercise, but they should make a difference for memory sources (previous
review) and memory destinations (this round).

> +}
>  
>  /* Expand an expression EXP as fpsr or fpcr setter (depending on
>     UNSPEC) using MODE.  */
> @@ -2551,6 +2622,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target,
>    if (fcode >= AARCH64_MEMTAG_BUILTIN_START
>        && fcode <= AARCH64_MEMTAG_BUILTIN_END)
>      return aarch64_expand_builtin_memtag (fcode, exp, target);
> +  if (fcode >= AARCH64_REV16
> +      && fcode <= AARCH64_RBITLL)
> +    return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
>  
>    gcc_unreachable ();
>  }
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index acec8c1146765c0fac73c15351853324b8f03209..ef0aed25c6b26eff61f9f6030dc5921a534e3d19 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4950,7 +4950,7 @@ (define_expand "ffs<mode>2"
>      rtx ccreg = aarch64_gen_compare_reg (EQ, operands[1], const0_rtx);
>      rtx x = gen_rtx_NE (VOIDmode, ccreg, const0_rtx);
>  
> -    emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
> +    emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
>      emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
>      emit_insn (gen_csinc3<mode>_insn (operands[0], x, operands[0], const0_rtx));
>      DONE;
> @@ -4996,7 +4996,7 @@ (define_insn "clrsb<mode>2"
>    [(set_attr "type" "clz")]
>  )
>  
> -(define_insn "rbit<mode>2"
> +(define_insn "@aarch64_rbit<mode>"
>    [(set (match_operand:GPI 0 "register_operand" "=r")
>  	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_RBIT))]
>    ""
> @@ -5017,7 +5017,7 @@ (define_insn_and_split "ctz<mode>2"
>    "reload_completed"
>    [(const_int 0)]
>    "
> -  emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
> +  emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
>    emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
>    DONE;
>  ")
> @@ -6022,6 +6022,13 @@ (define_insn "bswaphi2"
>    [(set_attr "type" "rev")]
>  )
>  
> +(define_insn "@aarch64_rev16<mode>"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_REV))]
> +  ""
> +  "rev16\\t%<w>0, %<w>1"
> +  [(set_attr "type" "rev")])
> +
>  (define_insn "*aarch64_bfxil<mode>"
>    [(set (match_operand:GPI 0 "register_operand" "=r,r")
>      (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
> index 9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290083f3f072fd5a2ce 100644
> --- a/gcc/config/aarch64/arm_acle.h
> +++ b/gcc/config/aarch64/arm_acle.h
> @@ -28,6 +28,7 @@
>  #define _GCC_ARM_ACLE_H
>  
>  #include <stdint.h>
> +#include <stddef.h>
>  
>  #pragma GCC aarch64 "arm_acle.h"
>  
> @@ -35,6 +36,58 @@
>  extern "C" {
>  #endif
>  
> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)				  \
> +__extension__ extern __inline TYPE					  \
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	  \
> +NAME (TYPE __value, uint32_t __rotate)					  \
> +{									  \
> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;				  \
> +  __rotate = __rotate % __size;						  \
> +  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
> +}
> +
> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
> +
> +#undef _GCC_ARM_ACLE_ROR_FN
> +
> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)	    \
> +__extension__ extern __inline RTYPE				    \
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
> +__##NAME (ITYPE __value)					    \
> +{								    \
> +  return __builtin_##BUILTIN (__value);				    \
> +}
> +
> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, uint16_t)

The return type should be int16_t.

The clz and cls tests have the old return types (same as the argument
types), but I guess that's a good thing, since it shows that we avoid
the redundant zero-extend in clzll and clsll.

Thanks,
Richard

> +_GCC_ARM_ACLE_DATA_FN (rev, bswap32, uint32_t, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (revll, bswap64, uint64_t, uint64_t)
> +
> +#undef _GCC_ARM_ACLE_DATA_FN
> +
> +__extension__ extern __inline unsigned long
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +__revl (unsigned long __value)
> +{
> +  if (sizeof (unsigned long) == 8)
> +    return __revll (__value);
> +  else
> +    return __rev (__value);
> +}
> +
>  #pragma GCC push_options
>  #pragma GCC target ("arch=armv8.3-a")
>  __extension__ extern __inline int32_t
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..90813184704dfcdaf2d24d523ff744aa6cbedf1a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
> @@ -0,0 +1,215 @@
> +/* Test the ACLE data intrinsics.  */
> +/* { dg-do assemble } */
> +/* { dg-additional-options "--save-temps -O1" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include "arm_acle.h"
> +
> +/*
> +** test_clz:
> +**	clz	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_clz (uint32_t a)
> +{
> +  return __clz (a);
> +}
> +
> +/*
> +** test_clzl:
> +**	clz	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_clzl (unsigned long a)
> +{
> +  return __clzl (a);
> +}
> +
> +/*
> +** test_clzll:
> +**	clz	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_clzll (uint64_t a)
> +{
> +  return __clzll (a);
> +}
> +
> +/*
> +** test_cls:
> +**	cls	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_cls (uint32_t a)
> +{
> +  return __cls (a);
> +}
> +
> +/*
> +** test_clsl:
> +**	cls	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_clsl (unsigned long a)
> +{
> +  return __clsl (a);
> +}
> +
> +/*
> +** test_clsll:
> +**	cls	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_clsll (uint64_t a)
> +{
> +  return __clsll (a);
> +}
> +
> +/*
> +** test_rbit:
> +**	rbit	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_rbit (uint32_t a)
> +{
> +  return __rbit (a);
> +}
> +
> +/*
> +** test_rbitl:
> +**	rbit	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_rbitl (unsigned long a)
> +{
> +  return __rbitl (a);
> +}
> +
> +/*
> +** test_rbitll:
> +**	rbit	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_rbitll (uint64_t a)
> +{
> +  return __rbitll (a);
> +}
> +
> +/*
> +** test_rev:
> +**	rev	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_rev (uint32_t a)
> +{
> +  return __builtin_bswap32 (a);
> +}
> +
> +/*
> +** test_revl:
> +**	rev	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_revl (unsigned long a)
> +{
> +  return __revl (a);
> +}
> +
> +/*
> +** test_revll:
> +**	rev	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_revll (uint64_t a)
> +{
> +  return __revll (a);
> +}
> +
> +/*
> +** test_rev16:
> +**	rev16	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_rev16 (uint32_t a)
> +{
> +  return __rev16 (a);
> +}
> +
> +/*
> +** test_rev16l:
> +**	rev16	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_rev16l (unsigned long a)
> +{
> +  return __rev16l (a);
> +}
> +
> +/*
> +** test_rev16ll:
> +**	rev16	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_rev16ll (uint64_t a)
> +{
> +  return __rev16ll (a);
> +}
> +
> +/*
> +** test_ror:
> +**	ror	w0, w0, w1
> +**	ret
> +*/
> +
> +uint32_t test_ror (uint32_t a, uint32_t r)
> +{
> +  return __ror (a, r);
> +}
> +
> +/*
> +** test_rorl:
> +**	ror	[wx]0, [wx]0, [wx]1
> +**	ret
> +*/
> +
> +unsigned long test_rorl (unsigned long a, uint32_t r)
> +{
> +  return __rorl (a, r);
> +}
> +
> +/*
> +** test_rorll:
> +**	ror	x0, x0, x1
> +**	ret
> +*/
> +
> +uint64_t test_rorll (uint64_t a, uint32_t r)
> +{
> +  return __rorll (a, r);
> +}
> +
> +/*
> +** test_revsh:
> +**	rev16	w0, w0
> +**	ret
> +*/
> +
> +int16_t test_revsh (int16_t a)
> +{
> +  return __revsh (a);
> +}
Andre Vieira (lists) July 1, 2022, 10:25 a.m. UTC | #4
On 29/06/2022 08:18, Richard Sandiford wrote:
>> +      break;
>> +    case AARCH64_RBIT:
>> +    case AARCH64_RBITL:
>> +    case AARCH64_RBITLL:
>> +      if (mode == SImode)
>> +	icode = CODE_FOR_aarch64_rbitsi;
>> +      else
>> +	icode = CODE_FOR_aarch64_rbitdi;
>> +      break;
>> +    default:
>> +      gcc_unreachable ();
>> +    }
>> +  expand_insn (icode, 2, ops);
>> +  return target;
> This needs to return ops[0].value instead, since "target" just suggests
> a possible location.
>
> Could you add tests for a memory source and memory destination, e.g.:
>
> void test_clz_mem (uint32_t *a)
> {
>    *a = __clz (*a);
> }
>
> Without tests like that, these comments probably just sound like a paper
> exercise, but they should make a difference for memory sources (previous
> review) and memory destinations (this round).
I had locally tested it (with rev though because clz doesn't use that 
code) and strangely it does seem to work for the memory destinations, 
but that's just a simple test.
It could very well go wrong with some more complex codegen, so I'll just 
take your word and use ops[0].value.

And yeah, I didn't add the tests at the time; I don't really know why, I'll 
chalk it up to laziness :P
>
>> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
>> index 9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290083f3f072fd5a2ce 100644
>> --- a/gcc/config/aarch64/arm_acle.h
>> +++ b/gcc/config/aarch64/arm_acle.h
>> @@ -28,6 +28,7 @@
>>   #define _GCC_ARM_ACLE_H
>>   
>>   #include <stdint.h>
>> +#include <stddef.h>
>>   
>>   #pragma GCC aarch64 "arm_acle.h"
>>   
>> @@ -35,6 +36,58 @@
>>   extern "C" {
>>   #endif
>>   
>> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)				  \
>> +__extension__ extern __inline TYPE					  \
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	  \
>> +NAME (TYPE __value, uint32_t __rotate)					  \
>> +{									  \
>> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;				  \
>> +  __rotate = __rotate % __size;						  \
>> +  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
>> +}
>> +
>> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
>> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
>> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
>> +
>> +#undef _GCC_ARM_ACLE_ROR_FN
>> +
>> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)	    \
>> +__extension__ extern __inline RTYPE				    \
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
>> +__##NAME (ITYPE __value)					    \
>> +{								    \
>> +  return __builtin_##BUILTIN (__value);				    \
>> +}
>> +
>> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
>> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
>> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
>> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
>> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
>> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
>> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, uint16_t)
> The return type should be int16_t.
Nice catch!
> The clz and cls tests have the old return types (same as the argument
> types), but I guess that's a good thing, since it shows that we avoid
> the redundant zero-extend in clzll and clsll.
Yeah, I noticed that too when I was adding the mem tests. I did change 
them, because at the time it just felt like an oversight, though I too 
was pleasantly surprised that GCC was managing to avoid the 
zero-extending :)
I then saw your comment, and it made me wonder whether I should keep the 
wrong return types in... I haven't, but I'm happy to change them back if 
you think it's a nice 'test' to have.

Regards,
Andre
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index e0a741ac663188713e21f457affa57217d074783..bb5d97c8fc6402635270df851a949cabeecaa5e8 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -613,6 +613,12 @@ enum aarch64_builtins
   AARCH64_LS64_BUILTIN_ST64B,
   AARCH64_LS64_BUILTIN_ST64BV,
   AARCH64_LS64_BUILTIN_ST64BV0,
+  AARCH64_REV16,
+  AARCH64_REV16L,
+  AARCH64_REV16LL,
+  AARCH64_RBIT,
+  AARCH64_RBITL,
+  AARCH64_RBITLL,
   AARCH64_BUILTIN_MAX
 };
 
@@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
       = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
 }
 
+static void
+aarch64_init_data_intrinsics (void)
+{
+  tree uint32_fntype = build_function_type_list (uint32_type_node,
+						 uint32_type_node, NULL_TREE);
+  tree ulong_fntype = build_function_type_list (long_unsigned_type_node,
+						long_unsigned_type_node,
+						NULL_TREE);
+  tree uint64_fntype = build_function_type_list (uint64_type_node,
+						 uint64_type_node, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_REV16]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
+				   AARCH64_REV16);
+  aarch64_builtin_decls[AARCH64_REV16L]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rev16l", ulong_fntype,
+				   AARCH64_REV16L);
+  aarch64_builtin_decls[AARCH64_REV16LL]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
+				   AARCH64_REV16LL);
+  aarch64_builtin_decls[AARCH64_RBIT]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
+				   AARCH64_RBIT);
+  aarch64_builtin_decls[AARCH64_RBITL]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rbitl", ulong_fntype,
+				   AARCH64_RBITL);
+  aarch64_builtin_decls[AARCH64_RBITLL]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
+				   AARCH64_RBITLL);
+}
+
 /* Implement #pragma GCC aarch64 "arm_acle.h".  */
 void
 handle_arm_acle_h (void)
 {
+  aarch64_init_data_intrinsics ();
   if (TARGET_LS64)
     aarch64_init_ls64_builtins ();
 }
@@ -2394,6 +2431,37 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target)
   return target;
 }
 
+/* Expand the call EXP to ACLE data intrinsic builtin FCODE and return the rtx
+   holding the result.  TARGET is only a suggested location and need not have
+   the result's mode, so the operand mode is taken from EXP's type.  */
+static rtx
+aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
+{
+  expand_operand ops[2];
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], expand_normal (CALL_EXPR_ARG (exp, 0)), mode);
+  enum insn_code icode;
+  switch (fcode)
+    {
+    case AARCH64_REV16:
+    case AARCH64_REV16L:
+    case AARCH64_REV16LL:
+      icode = code_for_aarch64_rev16 (mode);
+      break;
+    case AARCH64_RBIT:
+    case AARCH64_RBITL:
+    case AARCH64_RBITLL:
+      icode = code_for_aarch64_rbit (mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  expand_insn (icode, 2, ops);
+  return ops[0].value;
+}
+
 /* Expand an expression EXP as fpsr or fpcr setter (depending on
    UNSPEC) using MODE.  */
 static void
@@ -2551,6 +2619,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target,
   if (fcode >= AARCH64_MEMTAG_BUILTIN_START
       && fcode <= AARCH64_MEMTAG_BUILTIN_END)
     return aarch64_expand_builtin_memtag (fcode, exp, target);
+  if (fcode >= AARCH64_REV16
+      && fcode <= AARCH64_RBITLL)
+    return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
 
   gcc_unreachable ();
 }
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index acec8c1146765c0fac73c15351853324b8f03209..ef0aed25c6b26eff61f9f6030dc5921a534e3d19 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4950,7 +4950,7 @@ (define_expand "ffs<mode>2"
     rtx ccreg = aarch64_gen_compare_reg (EQ, operands[1], const0_rtx);
     rtx x = gen_rtx_NE (VOIDmode, ccreg, const0_rtx);
 
-    emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
+    emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
     emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
     emit_insn (gen_csinc3<mode>_insn (operands[0], x, operands[0], const0_rtx));
     DONE;
@@ -4996,7 +4996,7 @@ (define_insn "clrsb<mode>2"
   [(set_attr "type" "clz")]
 )
 
-(define_insn "rbit<mode>2"
+(define_insn "@aarch64_rbit<mode>"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_RBIT))]
   ""
@@ -5017,7 +5017,7 @@ (define_insn_and_split "ctz<mode>2"
   "reload_completed"
   [(const_int 0)]
   "
-  emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
+  emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
   emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
   DONE;
 ")
@@ -6022,6 +6022,13 @@ (define_insn "bswaphi2"
   [(set_attr "type" "rev")]
 )
 
+(define_insn "@aarch64_rev16<mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_REV))]
+  ""
+  "rev16\\t%<w>0, %<w>1"
+  [(set_attr "type" "rev")])
+
 (define_insn "*aarch64_bfxil<mode>"
   [(set (match_operand:GPI 0 "register_operand" "=r,r")
     (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 9775a48c65825b424d3eb442384f5ab87b734fd7..d26e269cb843fe37ba789db09c40d06f53438cda 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -28,6 +28,7 @@
 #define _GCC_ARM_ACLE_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 #pragma GCC aarch64 "arm_acle.h"
 
@@ -35,6 +36,58 @@
 extern "C" {
 #endif
 
+#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)				  \
+__extension__ extern __inline TYPE					  \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	  \
+NAME (TYPE __value, uint32_t __rotate)					  \
+{									  \
+  size_t __size = sizeof (TYPE) * __CHAR_BIT__;				  \
+  __rotate = __rotate % __size;						  \
+  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
+}
+
+_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
+_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
+_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
+
+#undef _GCC_ARM_ACLE_ROR_FN
+
+#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)	    \
+__extension__ extern __inline RTYPE				    \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+__##NAME (ITYPE __value)					    \
+{								    \
+  return __builtin_##BUILTIN (__value);				    \
+}
+
+_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, int16_t)
+_GCC_ARM_ACLE_DATA_FN (rev, bswap32, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (revll, bswap64, uint64_t, uint64_t)
+
+#undef _GCC_ARM_ACLE_DATA_FN
+
+__extension__ extern __inline unsigned long
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__revl (unsigned long __value)
+{
+  if (sizeof (unsigned long) == 8)
+    return __revll (__value);
+  else
+    return __rev (__value);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("arch=armv8.3-a")
 __extension__ extern __inline int32_t
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
new file mode 100644
index 0000000000000000000000000000000000000000..e067ef20bbdc8993865b541aa99dccac6b03e6a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
@@ -0,0 +1,468 @@
+/* Test the ACLE data intrinsics.  */
+/* { dg-do assemble } */
+/* { dg-additional-options "--save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include "arm_acle.h"
+
+/*
+** test_clz:
+**	clz	w0, w0
+**	ret
+*/
+
+unsigned int test_clz (uint32_t a)
+{
+  return __clz (a);
+}
+
+/*
+** test_clzl:
+**	clz	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned int test_clzl (unsigned long a)
+{
+  return __clzl (a);
+}
+
+/*
+** test_clzll:
+**	clz	x0, x0
+**	ret
+*/
+
+unsigned int test_clzll (uint64_t a)
+{
+  return __clzll (a);
+}
+
+/*
+** test_cls:
+**	cls	w0, w0
+**	ret
+*/
+
+unsigned int test_cls (uint32_t a)
+{
+  return __cls (a);
+}
+
+/*
+** test_clsl:
+**	cls	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned int test_clsl (unsigned long a)
+{
+  return __clsl (a);
+}
+
+/*
+** test_clsll:
+**	cls	x0, x0
+**	ret
+*/
+
+unsigned int test_clsll (uint64_t a)
+{
+  return __clsll (a);
+}
+
+/*
+** test_rbit:
+**	rbit	w0, w0
+**	ret
+*/
+
+uint32_t test_rbit (uint32_t a)
+{
+  return __rbit (a);
+}
+
+/*
+** test_rbitl:
+**	rbit	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_rbitl (unsigned long a)
+{
+  return __rbitl (a);
+}
+
+/*
+** test_rbitll:
+**	rbit	x0, x0
+**	ret
+*/
+
+uint64_t test_rbitll (uint64_t a)
+{
+  return __rbitll (a);
+}
+
+/*
+** test_rev:
+**	rev	w0, w0
+**	ret
+*/
+
+uint32_t test_rev (uint32_t a)
+{
+  return __rev (a);
+}
+
+/*
+** test_revl:
+**	rev	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_revl (unsigned long a)
+{
+  return __revl (a);
+}
+
+/*
+** test_revll:
+**	rev	x0, x0
+**	ret
+*/
+
+uint64_t test_revll (uint64_t a)
+{
+  return __revll (a);
+}
+
+/*
+** test_rev16:
+**	rev16	w0, w0
+**	ret
+*/
+
+uint32_t test_rev16 (uint32_t a)
+{
+  return __rev16 (a);
+}
+
+/*
+** test_rev16l:
+**	rev16	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_rev16l (unsigned long a)
+{
+  return __rev16l (a);
+}
+
+/*
+** test_rev16ll:
+**	rev16	x0, x0
+**	ret
+*/
+
+uint64_t test_rev16ll (uint64_t a)
+{
+  return __rev16ll (a);
+}
+
+/*
+** test_ror:
+**	ror	w0, w0, w1
+**	ret
+*/
+
+uint32_t test_ror (uint32_t a, uint32_t r)
+{
+  return __ror (a, r);
+}
+
+/*
+** test_rorl:
+**	ror	[wx]0, [wx]0, [wx]1
+**	ret
+*/
+
+unsigned long test_rorl (unsigned long a, uint32_t r)
+{
+  return __rorl (a, r);
+}
+
+/*
+** test_rorll:
+**	ror	x0, x0, x1
+**	ret
+*/
+
+uint64_t test_rorll (uint64_t a, uint32_t r)
+{
+  return __rorll (a, r);
+}
+
+/*
+** test_revsh:
+**	rev16	w0, w0
+**	ret
+*/
+
+int16_t test_revsh (int16_t a)
+{
+  return __revsh (a);
+}
+
+uint32_t *g32;
+unsigned long *gul;
+uint64_t *g64;
+unsigned int *gui;
+int16_t *g16;
+
+/*
+** test_clz_mem:
+**	...
+**	clz	w[0-9]+, w[0-9]+
+**	...
+**	ret
+*/
+
+void test_clz_mem (uint32_t *a)
+{
+  *gui = __clz (*a);
+}
+
+/*
+** test_clzl_mem:
+**	...
+**	clz	[wx][0-9]+, [wx][0-9]+
+**	...
+**	ret
+*/
+
+void test_clzl_mem (unsigned long *a)
+{
+  *gui = __clzl (*a);
+}
+
+/*
+** test_clzll_mem:
+**	...
+**	clz	x[0-9]+, x[0-9]+
+**	...
+**	ret
+*/
+
+void test_clzll_mem (uint64_t *a)
+{
+  *gui = __clzll (*a);
+}
+
+/*
+** test_cls_mem:
+**	...
+**	cls	w[0-9]+, w[0-9]+
+**	...
+**	ret
+*/
+
+void test_cls_mem (uint32_t *a)
+{
+  *gui = __cls (*a);
+}
+
+/*
+** test_clsl_mem:
+**	...
+**	cls	[wx][0-9]+, [wx][0-9]+
+**	...
+**	ret
+*/
+
+void test_clsl_mem (unsigned long *a)
+{
+  *gui = __clsl (*a);
+}
+
+/*
+** test_clsll_mem:
+**	...
+**	cls	x[0-9]+, x[0-9]+
+**	...
+**	ret
+*/
+
+void test_clsll_mem (uint64_t *a)
+{
+  *gui = __clsll (*a);
+}
+
+/*
+** test_rbit_mem:
+**	...
+**	rbit	w[0-9]+, w[0-9]+
+**	...
+**	ret
+*/
+
+void test_rbit_mem (uint32_t *a)
+{
+  *g32 = __rbit (*a);
+}
+
+/*
+** test_rbitl_mem:
+**	...
+**	rbit	[wx][0-9]+, [wx][0-9]+
+**	...
+**	ret
+*/
+
+void test_rbitl_mem (unsigned long *a)
+{
+  *gul = __rbitl (*a);
+}
+
+/*
+** test_rbitll_mem:
+**	...
+**	rbit	x[0-9]+, x[0-9]+
+**	...
+**	ret
+*/
+
+void test_rbitll_mem (uint64_t *a)
+{
+  *g64 = __rbitll (*a);
+}
+
+/*
+** test_rev_mem:
+**	...
+**	rev	w[0-9]+, w[0-9]+
+**	...
+**	ret
+*/
+
+void test_rev_mem (uint32_t *a)
+{
+  *g32 = __rev (*a);
+}
+
+/*
+** test_revl_mem:
+**	...
+**	rev	[wx][0-9]+, [wx][0-9]+
+**	...
+**	ret
+*/
+
+void test_revl_mem (unsigned long *a)
+{
+  *gul = __revl (*a);
+}
+
+/*
+** test_revll_mem:
+**	...
+**	rev	x[0-9]+, x[0-9]+
+**	...
+**	ret
+*/
+
+void test_revll_mem (uint64_t *a)
+{
+  *g64 = __revll (*a);
+}
+
+/*
+** test_rev16_mem:
+**	...
+**	rev16	w[0-9]+, w[0-9]+
+**	...
+**	ret
+*/
+
+void test_rev16_mem (uint32_t *a)
+{
+  *g32 = __rev16 (*a);
+}
+
+/*
+** test_rev16l_mem:
+**	...
+**	rev16	[wx][0-9]+, [wx][0-9]+
+**	...
+**	ret
+*/
+
+void test_rev16l_mem (unsigned long *a)
+{
+  *gul = __rev16l (*a);
+}
+
+/*
+** test_rev16ll_mem:
+**	...
+**	rev16	x[0-9]+, x[0-9]+
+**	...
+**	ret
+*/
+
+void test_rev16ll_mem (uint64_t *a)
+{
+  *g64 = __rev16ll (*a);
+}
+
+/*
+** test_ror_mem:
+**	...
+**	ror	w[0-9]+, w[0-9]+, w[0-9]+
+**	...
+**	ret
+*/
+
+void test_ror_mem (uint32_t *a, uint32_t *r)
+{
+  *g32 = __ror (*a, *r);
+}
+
+/*
+** test_rorl_mem:
+**	...
+**	ror	[wx][0-9]+, [wx][0-9]+, [wx][0-9]+
+**	...
+**	ret
+*/
+
+void test_rorl_mem (unsigned long *a, uint32_t *r)
+{
+  *gul = __rorl (*a, *r);
+}
+
+/*
+** test_rorll_mem:
+**	...
+**	ror	x[0-9]+, x[0-9]+, x[0-9]+
+**	...
+**	ret
+*/
+
+void test_rorll_mem (uint64_t *a, uint32_t *r)
+{
+  *g64 = __rorll (*a, *r);
+}
+
+/*
+** test_revsh_mem:
+**	...
+**	rev16	w[0-9]+, w[0-9]+
+**	...
+**	ret
+*/
+
+void test_revsh_mem (int16_t *a)
+{
+  *g16 = __revsh (*a);
+}
Richard Sandiford July 1, 2022, 11:26 a.m. UTC | #5
"Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
> On 29/06/2022 08:18, Richard Sandiford wrote:
>>> +      break;
>>> +    case AARCH64_RBIT:
>>> +    case AARCH64_RBITL:
>>> +    case AARCH64_RBITLL:
>>> +      if (mode == SImode)
>>> +	icode = CODE_FOR_aarch64_rbitsi;
>>> +      else
>>> +	icode = CODE_FOR_aarch64_rbitdi;
>>> +      break;
>>> +    default:
>>> +      gcc_unreachable ();
>>> +    }
>>> +  expand_insn (icode, 2, ops);
>>> +  return target;
>> This needs to return ops[0].value instead, since "target" just suggests
>> a possible location.
>>
>> Could you add tests for a memory source and memory destination, e.g.:
>>
>> void test_clz_mem (uint32_t *a)
>> {
>>    *a = __clz (*a);
>> }
>>
>> Without tests like that, these comments probably just sound like a paper
>> exercise, but they should make a difference for memory sources (previous
>> review) and memory destinations (this round).
> I had locally tested it (with rev though because clz doesn't use that 
> code) and strangely it does seem to work for the memory destinations, 
> but that's just a simple test.
> It could very well go wrong with some more complex codegen, so I'll just 
> take your word and use ops[0].value.
>
> And yeah I didn't add the tests at the time, don't really know why, I'll 
> chuck it down to laziness :P
>>
>>> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
>>> index 9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290083f3f072fd5a2ce 100644
>>> --- a/gcc/config/aarch64/arm_acle.h
>>> +++ b/gcc/config/aarch64/arm_acle.h
>>> @@ -28,6 +28,7 @@
>>>   #define _GCC_ARM_ACLE_H
>>>   
>>>   #include <stdint.h>
>>> +#include <stddef.h>
>>>   
>>>   #pragma GCC aarch64 "arm_acle.h"
>>>   
>>> @@ -35,6 +36,58 @@
>>>   extern "C" {
>>>   #endif
>>>   
>>> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)				  \
>>> +__extension__ extern __inline TYPE					  \
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	  \
>>> +NAME (TYPE __value, uint32_t __rotate)					  \
>>> +{									  \
>>> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;				  \
>>> +  __rotate = __rotate % __size;						  \
>>> +  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
>>> +}
>>> +
>>> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
>>> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
>>> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
>>> +
>>> +#undef _GCC_ARM_ACLE_ROR_FN
>>> +
>>> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)	    \
>>> +__extension__ extern __inline RTYPE				    \
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
>>> +__##NAME (ITYPE __value)					    \
>>> +{								    \
>>> +  return __builtin_##BUILTIN (__value);				    \
>>> +}
>>> +
>>> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
>>> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
>>> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
>>> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
>>> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
>>> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
>>> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, uint16_t)
>> The return type should be int16_t.
> Nice catch!
>> The clz and cls tests have the old return types (same as the argument
>> types), but I guess that's a good thing, since it shows that we avoid
>> the redundant zero-extend in clzll and clsll.
> Yeah I noticed that too when I was adding the mem tests, but I did 
> change them though because at the time it just felt like an oversight, 
> though I too was pleasantly surprised GCC was managing to avoid the 
> zero-extending :)
> I then saw your comment and made me wonder whether I should keep the 
> wrong return types in... I haven't but happy to change them back if you 
> think it's a nice 'test' to have.

I thought it was OK/useful as it was, but I don't mind either way.

BTW, while trying it out locally, I noticed:

  aarch64_init_data_intrinsics

was called from the wrong place.  Since it's adding normal __builtin
functions, it should be called from aarch64_general_init_builtins
instead of handle_arm_acle_h.

handle_arm_acle_h is instead for cases where we want to simulate
C/C++ definitions of the ACLE intrinsics themselves (i.e. so that
the intrinsics themselves are built-in functions, rather than
inline wrappers around built-in functions).

OK with that change, thanks.

Thanks,
Richard

> Regards,
> Andre
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> index e0a741ac663188713e21f457affa57217d074783..bb5d97c8fc6402635270df851a949cabeecaa5e8 100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -613,6 +613,12 @@ enum aarch64_builtins
>    AARCH64_LS64_BUILTIN_ST64B,
>    AARCH64_LS64_BUILTIN_ST64BV,
>    AARCH64_LS64_BUILTIN_ST64BV0,
> +  AARCH64_REV16,
> +  AARCH64_REV16L,
> +  AARCH64_REV16LL,
> +  AARCH64_RBIT,
> +  AARCH64_RBITL,
> +  AARCH64_RBITLL,
>    AARCH64_BUILTIN_MAX
>  };
>  
> @@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
>        = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
>  }
>  
> +static void
> +aarch64_init_data_intrinsics (void)
> +{
> +  tree uint32_fntype = build_function_type_list (uint32_type_node,
> +						 uint32_type_node, NULL_TREE);
> +  tree ulong_fntype = build_function_type_list (long_unsigned_type_node,
> +						long_unsigned_type_node,
> +						NULL_TREE);
> +  tree uint64_fntype = build_function_type_list (uint64_type_node,
> +						 uint64_type_node, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_REV16]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
> +				   AARCH64_REV16);
> +  aarch64_builtin_decls[AARCH64_REV16L]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16l", ulong_fntype,
> +				   AARCH64_REV16L);
> +  aarch64_builtin_decls[AARCH64_REV16LL]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
> +				   AARCH64_REV16LL);
> +  aarch64_builtin_decls[AARCH64_RBIT]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
> +				   AARCH64_RBIT);
> +  aarch64_builtin_decls[AARCH64_RBITL]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitl", ulong_fntype,
> +				   AARCH64_RBITL);
> +  aarch64_builtin_decls[AARCH64_RBITLL]
> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
> +				   AARCH64_RBITLL);
> +}
> +
>  /* Implement #pragma GCC aarch64 "arm_acle.h".  */
>  void
>  handle_arm_acle_h (void)
>  {
> +  aarch64_init_data_intrinsics ();
>    if (TARGET_LS64)
>      aarch64_init_ls64_builtins ();
>  }
> @@ -2394,6 +2431,37 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target)
>    return target;
>  }
>  
> +/* Function to expand an expression EXP which calls one of the ACLE Data
> +   Intrinsic builtins FCODE with the result going to TARGET.  */
> +static rtx
> +aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
> +{
> +  expand_operand ops[2];
> +  machine_mode mode = GET_MODE (target);
> +  create_output_operand (&ops[0], target, mode);
> +  create_input_operand (&ops[1], expand_normal (CALL_EXPR_ARG (exp, 0)), mode);
> +  enum insn_code icode;
> +
> +  switch (fcode)
> +    {
> +    case AARCH64_REV16:
> +    case AARCH64_REV16L:
> +    case AARCH64_REV16LL:
> +      icode = code_for_aarch64_rev16 (mode);
> +      break;
> +    case AARCH64_RBIT:
> +    case AARCH64_RBITL:
> +    case AARCH64_RBITLL:
> +      icode = code_for_aarch64_rbit (mode);
> +      break;
> +    default:
> +      gcc_unreachable ();
> +    }
> +
> +  expand_insn (icode, 2, ops);
> +  return ops[0].value;
> +}
> +
>  /* Expand an expression EXP as fpsr or fpcr setter (depending on
>     UNSPEC) using MODE.  */
>  static void
> @@ -2551,6 +2619,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target,
>    if (fcode >= AARCH64_MEMTAG_BUILTIN_START
>        && fcode <= AARCH64_MEMTAG_BUILTIN_END)
>      return aarch64_expand_builtin_memtag (fcode, exp, target);
> +  if (fcode >= AARCH64_REV16
> +      && fcode <= AARCH64_RBITLL)
> +    return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
>  
>    gcc_unreachable ();
>  }
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index acec8c1146765c0fac73c15351853324b8f03209..ef0aed25c6b26eff61f9f6030dc5921a534e3d19 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4950,7 +4950,7 @@ (define_expand "ffs<mode>2"
>      rtx ccreg = aarch64_gen_compare_reg (EQ, operands[1], const0_rtx);
>      rtx x = gen_rtx_NE (VOIDmode, ccreg, const0_rtx);
>  
> -    emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
> +    emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
>      emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
>      emit_insn (gen_csinc3<mode>_insn (operands[0], x, operands[0], const0_rtx));
>      DONE;
> @@ -4996,7 +4996,7 @@ (define_insn "clrsb<mode>2"
>    [(set_attr "type" "clz")]
>  )
>  
> -(define_insn "rbit<mode>2"
> +(define_insn "@aarch64_rbit<mode>"
>    [(set (match_operand:GPI 0 "register_operand" "=r")
>  	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_RBIT))]
>    ""
> @@ -5017,7 +5017,7 @@ (define_insn_and_split "ctz<mode>2"
>    "reload_completed"
>    [(const_int 0)]
>    "
> -  emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
> +  emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
>    emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
>    DONE;
>  ")
> @@ -6022,6 +6022,13 @@ (define_insn "bswaphi2"
>    [(set_attr "type" "rev")]
>  )
>  
> +(define_insn "@aarch64_rev16<mode>"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_REV))]
> +  ""
> +  "rev16\\t%<w>0, %<w>1"
> +  [(set_attr "type" "rev")])
> +
>  (define_insn "*aarch64_bfxil<mode>"
>    [(set (match_operand:GPI 0 "register_operand" "=r,r")
>      (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
> index 9775a48c65825b424d3eb442384f5ab87b734fd7..d26e269cb843fe37ba789db09c40d06f53438cda 100644
> --- a/gcc/config/aarch64/arm_acle.h
> +++ b/gcc/config/aarch64/arm_acle.h
> @@ -28,6 +28,7 @@
>  #define _GCC_ARM_ACLE_H
>  
>  #include <stdint.h>
> +#include <stddef.h>
>  
>  #pragma GCC aarch64 "arm_acle.h"
>  
> @@ -35,6 +36,58 @@
>  extern "C" {
>  #endif
>  
> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)				  \
> +__extension__ extern __inline TYPE					  \
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	  \
> +NAME (TYPE __value, uint32_t __rotate)					  \
> +{									  \
> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;				  \
> +  __rotate = __rotate % __size;						  \
> +  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
> +}
> +
> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
> +
> +#undef _GCC_ARM_ACLE_ROR_FN
> +
> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)	    \
> +__extension__ extern __inline RTYPE				    \
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
> +__##NAME (ITYPE __value)					    \
> +{								    \
> +  return __builtin_##BUILTIN (__value);				    \
> +}
> +
> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, int16_t)
> +_GCC_ARM_ACLE_DATA_FN (rev, bswap32, uint32_t, uint32_t)
> +_GCC_ARM_ACLE_DATA_FN (revll, bswap64, uint64_t, uint64_t)
> +
> +#undef _GCC_ARM_ACLE_DATA_FN
> +
> +__extension__ extern __inline unsigned long
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +__revl (unsigned long __value)
> +{
> +  if (sizeof (unsigned long) == 8)
> +    return __revll (__value);
> +  else
> +    return __rev (__value);
> +}
> +
>  #pragma GCC push_options
>  #pragma GCC target ("arch=armv8.3-a")
>  __extension__ extern __inline int32_t
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e067ef20bbdc8993865b541aa99dccac6b03e6a0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
> @@ -0,0 +1,468 @@
> +/* Test the ACLE data intrinsics.  */
> +/* { dg-do assemble } */
> +/* { dg-additional-options "--save-temps -O1" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include "arm_acle.h"
> +
> +/*
> +** test_clz:
> +**	clz	w0, w0
> +**	ret
> +*/
> +
> +unsigned int test_clz (uint32_t a)
> +{
> +  return __clz (a);
> +}
> +
> +/*
> +** test_clzl:
> +**	clz	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned int test_clzl (unsigned long a)
> +{
> +  return __clzl (a);
> +}
> +
> +/*
> +** test_clzll:
> +**	clz	x0, x0
> +**	ret
> +*/
> +
> +unsigned int test_clzll (uint64_t a)
> +{
> +  return __clzll (a);
> +}
> +
> +/*
> +** test_cls:
> +**	cls	w0, w0
> +**	ret
> +*/
> +
> +unsigned int test_cls (uint32_t a)
> +{
> +  return __cls (a);
> +}
> +
> +/*
> +** test_clsl:
> +**	cls	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned int test_clsl (unsigned long a)
> +{
> +  return __clsl (a);
> +}
> +
> +/*
> +** test_clsll:
> +**	cls	x0, x0
> +**	ret
> +*/
> +
> +unsigned int test_clsll (uint64_t a)
> +{
> +  return __clsll (a);
> +}
> +
> +/*
> +** test_rbit:
> +**	rbit	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_rbit (uint32_t a)
> +{
> +  return __rbit (a);
> +}
> +
> +/*
> +** test_rbitl:
> +**	rbit	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_rbitl (unsigned long a)
> +{
> +  return __rbitl (a);
> +}
> +
> +/*
> +** test_rbitll:
> +**	rbit	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_rbitll (uint64_t a)
> +{
> +  return __rbitll (a);
> +}
> +
> +/*
> +** test_rev:
> +**	rev	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_rev (uint32_t a)
> +{
> +  return __rev (a);
> +}
> +
> +/*
> +** test_revl:
> +**	rev	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_revl (unsigned long a)
> +{
> +  return __revl (a);
> +}
> +
> +/*
> +** test_revll:
> +**	rev	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_revll (uint64_t a)
> +{
> +  return __revll (a);
> +}
> +
> +/*
> +** test_rev16:
> +**	rev16	w0, w0
> +**	ret
> +*/
> +
> +uint32_t test_rev16 (uint32_t a)
> +{
> +  return __rev16 (a);
> +}
> +
> +/*
> +** test_rev16l:
> +**	rev16	[wx]0, [wx]0
> +**	ret
> +*/
> +
> +unsigned long test_rev16l (unsigned long a)
> +{
> +  return __rev16l (a);
> +}
> +
> +/*
> +** test_rev16ll:
> +**	rev16	x0, x0
> +**	ret
> +*/
> +
> +uint64_t test_rev16ll (uint64_t a)
> +{
> +  return __rev16ll (a);
> +}
> +
> +/*
> +** test_ror:
> +**	ror	w0, w0, w1
> +**	ret
> +*/
> +
> +uint32_t test_ror (uint32_t a, uint32_t r)
> +{
> +  return __ror (a, r);
> +}
> +
> +/*
> +** test_rorl:
> +**	ror	[wx]0, [wx]0, [wx]1
> +**	ret
> +*/
> +
> +unsigned long test_rorl (unsigned long a, uint32_t r)
> +{
> +  return __rorl (a, r);
> +}
> +
> +/*
> +** test_rorll:
> +**	ror	x0, x0, x1
> +**	ret
> +*/
> +
> +uint64_t test_rorll (uint64_t a, uint32_t r)
> +{
> +  return __rorll (a, r);
> +}
> +
> +/*
> +** test_revsh:
> +**	rev16	w0, w0
> +**	ret
> +*/
> +
> +int16_t test_revsh (int16_t a)
> +{
> +  return __revsh (a);
> +}
> +
> +uint32_t *g32;
> +unsigned long *gul;
> +uint64_t *g64;
> +unsigned int *gui;
> +int16_t *g16;
> +
> +/*
> +** test_clz_mem:
> +**	...
> +**	clz	w[0-9]+, w[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_clz_mem (uint32_t *a)
> +{
> +  *gui = __clz (*a);
> +}
> +
> +/*
> +** test_clzl_mem:
> +**	...
> +**	clz	[wx][0-9]+, [wx][0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_clzl_mem (unsigned long *a)
> +{
> +  *gui = __clzl (*a);
> +}
> +
> +/*
> +** test_clzll_mem:
> +**	...
> +**	clz	x[0-9]+, x[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_clzll_mem (uint64_t *a)
> +{
> +  *gui = __clzll (*a);
> +}
> +
> +/*
> +** test_cls_mem:
> +**	...
> +**	cls	w[0-9]+, w[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_cls_mem (uint32_t *a)
> +{
> +  *gui = __cls (*a);
> +}
> +
> +/*
> +** test_clsl_mem:
> +**	...
> +**	cls	[wx][0-9]+, [wx][0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_clsl_mem (unsigned long *a)
> +{
> +  *gui = __clsl (*a);
> +}
> +
> +/*
> +** test_clsll_mem:
> +**	...
> +**	cls	x[0-9]+, x[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_clsll_mem (uint64_t *a)
> +{
> +  *gui = __clsll (*a);
> +}
> +
> +/*
> +** test_rbit_mem:
> +**	...
> +**	rbit	w[0-9]+, w[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_rbit_mem (uint32_t *a)
> +{
> +  *g32 = __rbit (*a);
> +}
> +
> +/*
> +** test_rbitl_mem:
> +**	...
> +**	rbit	[wx][0-9]+, [wx][0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_rbitl_mem (unsigned long *a)
> +{
> +  *gul = __rbitl (*a);
> +}
> +
> +/*
> +** test_rbitll_mem:
> +**	...
> +**	rbit	x[0-9]+, x[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_rbitll_mem (uint64_t *a)
> +{
> +  *g64 = __rbitll (*a);
> +}
> +
> +/*
> +** test_rev_mem:
> +**	...
> +**	rev	w[0-9]+, w[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_rev_mem (uint32_t *a)
> +{
> +  *g32 = __rev (*a);
> +}
> +
> +/*
> +** test_revl_mem:
> +**	...
> +**	rev	[wx][0-9]+, [wx][0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_revl_mem (unsigned long *a)
> +{
> +  *gul = __revl (*a);
> +}
> +
> +/*
> +** test_revll_mem:
> +**	...
> +**	rev	x[0-9]+, x[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_revll_mem (uint64_t *a)
> +{
> +  *g64 = __revll (*a);
> +}
> +
> +/*
> +** test_rev16_mem:
> +**	...
> +**	rev16	w[0-9]+, w[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_rev16_mem (uint32_t *a)
> +{
> +  *g32 = __rev16 (*a);
> +}
> +
> +/*
> +** test_rev16l_mem:
> +**	...
> +**	rev16	[wx][0-9]+, [wx][0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_rev16l_mem (unsigned long *a)
> +{
> +  *gul = __rev16l (*a);
> +}
> +
> +/*
> +** test_rev16ll_mem:
> +**	...
> +**	rev16	x[0-9]+, x[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_rev16ll_mem (uint64_t *a)
> +{
> +  *g64 = __rev16ll (*a);
> +}
> +
> +/*
> +** test_ror_mem:
> +**	...
> +**	ror	w[0-9]+, w[0-9]+, w[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_ror_mem (uint32_t *a, uint32_t *r)
> +{
> +  *g32 = __ror (*a, *r);
> +}
> +
> +/*
> +** test_rorl_mem:
> +**	...
> +**	ror	[wx][0-9]+, [wx][0-9]+, [wx][0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_rorl_mem (unsigned long *a, uint32_t *r)
> +{
> +  *gul = __rorl (*a, *r);
> +}
> +
> +/*
> +** test_rorll_mem:
> +**	...
> +**	ror	x[0-9]+, x[0-9]+, x[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_rorll_mem (uint64_t *a, uint32_t *r)
> +{
> +  *g64 = __rorll (*a, *r);
> +}
> +
> +/*
> +** test_revsh_mem:
> +**	...
> +**	rev16	w[0-9]+, w[0-9]+
> +**	...
> +**	ret
> +*/
> +
> +void test_revsh_mem (int16_t *a)
> +{
> +  *g16 = __revsh (*a);
> +}
Andre Vieira (lists) Aug. 11, 2022, 3:11 p.m. UTC | #6
OK to backport this to gcc-12? It applies cleanly, and I did a bootstrap and
regression test on aarch64-linux-gnu.

Regards,
Andre

On 01/07/2022 12:26, Richard Sandiford wrote:
> "Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
>> On 29/06/2022 08:18, Richard Sandiford wrote:
>>>> +      break;
>>>> +    case AARCH64_RBIT:
>>>> +    case AARCH64_RBITL:
>>>> +    case AARCH64_RBITLL:
>>>> +      if (mode == SImode)
>>>> +	icode = CODE_FOR_aarch64_rbitsi;
>>>> +      else
>>>> +	icode = CODE_FOR_aarch64_rbitdi;
>>>> +      break;
>>>> +    default:
>>>> +      gcc_unreachable ();
>>>> +    }
>>>> +  expand_insn (icode, 2, ops);
>>>> +  return target;
>>> This needs to return ops[0].value instead, since "target" just suggests
>>> a possible location.
>>>
>>> Could you add tests for a memory source and memory destination, e.g.:
>>>
>>> void test_clz_mem (uint32_t *a)
>>> {
>>>     *a = __clz (*a);
>>> }
>>>
>>> Without tests like that, these comments probably just sound like a paper
>>> exercise, but they should make a difference for memory sources (previous
>>> review) and memory destinations (this round).
>> I had locally tested it (with rev though because clz doesn't use that
>> code) and strangely it does seem to work for the memory destinations,
>> but that's just a simple test.
>> It could very well go wrong with some more complex codegen, so I'll just
>> take your word and use ops[0].value.
>>
>> And yeah, I didn't add the tests at the time; I don't really know why, I'll
>> chalk it up to laziness :P
>>>> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
>>>> index 9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290083f3f072fd5a2ce 100644
>>>> --- a/gcc/config/aarch64/arm_acle.h
>>>> +++ b/gcc/config/aarch64/arm_acle.h
>>>> @@ -28,6 +28,7 @@
>>>>    #define _GCC_ARM_ACLE_H
>>>>    
>>>>    #include <stdint.h>
>>>> +#include <stddef.h>
>>>>    
>>>>    #pragma GCC aarch64 "arm_acle.h"
>>>>    
>>>> @@ -35,6 +36,58 @@
>>>>    extern "C" {
>>>>    #endif
>>>>    
>>>> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)				  \
>>>> +__extension__ extern __inline TYPE					  \
>>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	  \
>>>> +NAME (TYPE __value, uint32_t __rotate)					  \
>>>> +{									  \
>>>> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;				  \
>>>> +  __rotate = __rotate % __size;						  \
>>>> +  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
>>>> +}
>>>> +
>>>> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
>>>> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
>>>> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
>>>> +
>>>> +#undef _GCC_ARM_ACLE_ROR_FN
>>>> +
>>>> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)	    \
>>>> +__extension__ extern __inline RTYPE				    \
>>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
>>>> +__##NAME (ITYPE __value)					    \
>>>> +{								    \
>>>> +  return __builtin_##BUILTIN (__value);				    \
>>>> +}
>>>> +
>>>> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
>>>> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
>>>> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
>>>> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
>>>> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
>>>> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
>>>> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
>>>> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
>>>> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
>>>> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
>>>> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
>>>> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
>>>> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, uint16_t)
>>> The return type should be int16_t.
>> Nice catch!
>>> The clz and cls tests have the old return types (same as the argument
>>> types), but I guess that's a good thing, since it shows that we avoid
>>> the redundant zero-extend in clzll and clsll.
>> Yeah, I noticed that too when I was adding the mem tests, but I did
>> change them because at the time it just felt like an oversight,
>> though I too was pleasantly surprised GCC was managing to avoid the
>> zero-extending :)
>> I then saw your comment and it made me wonder whether I should keep the
>> wrong return types in... I haven't, but I'm happy to change them back if you
>> think it's a nice 'test' to have.
> I thought it was OK/useful as it was, but I don't mind either way.
>
> BTW, while trying it out locally, I noticed:
>
>    aarch64_init_data_intrinsics
>
> was called from the wrong place.  Since it's adding normal __builtin
> functions, it should be called from aarch64_general_init_builtins
> instead of handle_arm_acle_h.
>
> handle_arm_acle_h is instead for cases where we want to simulate
> C/C++ definitions of the ACLE intrinsics themselves (i.e. so that
> the intrinsics themselves are built-in functions, rather than
> inline wrappers around built-in functions).
>
> OK with that change, thanks.
>
> Thanks,
> Richard
>
>> Regards,
>> Andre
>>
>> diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
>> index e0a741ac663188713e21f457affa57217d074783..bb5d97c8fc6402635270df851a949cabeecaa5e8 100644
>> --- a/gcc/config/aarch64/aarch64-builtins.cc
>> +++ b/gcc/config/aarch64/aarch64-builtins.cc
>> @@ -613,6 +613,12 @@ enum aarch64_builtins
>>     AARCH64_LS64_BUILTIN_ST64B,
>>     AARCH64_LS64_BUILTIN_ST64BV,
>>     AARCH64_LS64_BUILTIN_ST64BV0,
>> +  AARCH64_REV16,
>> +  AARCH64_REV16L,
>> +  AARCH64_REV16LL,
>> +  AARCH64_RBIT,
>> +  AARCH64_RBITL,
>> +  AARCH64_RBITLL,
>>     AARCH64_BUILTIN_MAX
>>   };
>>   
>> @@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
>>         = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
>>   }
>>   
>> +static void
>> +aarch64_init_data_intrinsics (void)
>> +{
>> +  tree uint32_fntype = build_function_type_list (uint32_type_node,
>> +						 uint32_type_node, NULL_TREE);
>> +  tree ulong_fntype = build_function_type_list (long_unsigned_type_node,
>> +						long_unsigned_type_node,
>> +						NULL_TREE);
>> +  tree uint64_fntype = build_function_type_list (uint64_type_node,
>> +						 uint64_type_node, NULL_TREE);
>> +  aarch64_builtin_decls[AARCH64_REV16]
>> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
>> +				   AARCH64_REV16);
>> +  aarch64_builtin_decls[AARCH64_REV16L]
>> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16l", ulong_fntype,
>> +				   AARCH64_REV16L);
>> +  aarch64_builtin_decls[AARCH64_REV16LL]
>> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
>> +				   AARCH64_REV16LL);
>> +  aarch64_builtin_decls[AARCH64_RBIT]
>> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
>> +				   AARCH64_RBIT);
>> +  aarch64_builtin_decls[AARCH64_RBITL]
>> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitl", ulong_fntype,
>> +				   AARCH64_RBITL);
>> +  aarch64_builtin_decls[AARCH64_RBITLL]
>> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
>> +				   AARCH64_RBITLL);
>> +}
>> +
>>   /* Implement #pragma GCC aarch64 "arm_acle.h".  */
>>   void
>>   handle_arm_acle_h (void)
>>   {
>> +  aarch64_init_data_intrinsics ();
>>     if (TARGET_LS64)
>>       aarch64_init_ls64_builtins ();
>>   }
>> @@ -2394,6 +2431,37 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target)
>>     return target;
>>   }
>>   
>> +/* Function to expand an expression EXP which calls one of the ACLE Data
>> +   Intrinsic builtins FCODE with the result going to TARGET.  */
>> +static rtx
>> +aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
>> +{
>> +  expand_operand ops[2];
>> +  machine_mode mode = GET_MODE (target);
>> +  create_output_operand (&ops[0], target, mode);
>> +  create_input_operand (&ops[1], expand_normal (CALL_EXPR_ARG (exp, 0)), mode);
>> +  enum insn_code icode;
>> +
>> +  switch (fcode)
>> +    {
>> +    case AARCH64_REV16:
>> +    case AARCH64_REV16L:
>> +    case AARCH64_REV16LL:
>> +      icode = code_for_aarch64_rev16 (mode);
>> +      break;
>> +    case AARCH64_RBIT:
>> +    case AARCH64_RBITL:
>> +    case AARCH64_RBITLL:
>> +      icode = code_for_aarch64_rbit (mode);
>> +      break;
>> +    default:
>> +      gcc_unreachable ();
>> +    }
>> +
>> +  expand_insn (icode, 2, ops);
>> +  return ops[0].value;
>> +}
>> +
>>   /* Expand an expression EXP as fpsr or fpcr setter (depending on
>>      UNSPEC) using MODE.  */
>>   static void
>> @@ -2551,6 +2619,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target,
>>     if (fcode >= AARCH64_MEMTAG_BUILTIN_START
>>         && fcode <= AARCH64_MEMTAG_BUILTIN_END)
>>       return aarch64_expand_builtin_memtag (fcode, exp, target);
>> +  if (fcode >= AARCH64_REV16
>> +      && fcode <= AARCH64_RBITLL)
>> +    return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
>>   
>>     gcc_unreachable ();
>>   }
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index acec8c1146765c0fac73c15351853324b8f03209..ef0aed25c6b26eff61f9f6030dc5921a534e3d19 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -4950,7 +4950,7 @@ (define_expand "ffs<mode>2"
>>       rtx ccreg = aarch64_gen_compare_reg (EQ, operands[1], const0_rtx);
>>       rtx x = gen_rtx_NE (VOIDmode, ccreg, const0_rtx);
>>   
>> -    emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
>> +    emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
>>       emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
>>       emit_insn (gen_csinc3<mode>_insn (operands[0], x, operands[0], const0_rtx));
>>       DONE;
>> @@ -4996,7 +4996,7 @@ (define_insn "clrsb<mode>2"
>>     [(set_attr "type" "clz")]
>>   )
>>   
>> -(define_insn "rbit<mode>2"
>> +(define_insn "@aarch64_rbit<mode>"
>>     [(set (match_operand:GPI 0 "register_operand" "=r")
>>   	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_RBIT))]
>>     ""
>> @@ -5017,7 +5017,7 @@ (define_insn_and_split "ctz<mode>2"
>>     "reload_completed"
>>     [(const_int 0)]
>>     "
>> -  emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
>> +  emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
>>     emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
>>     DONE;
>>   ")
>> @@ -6022,6 +6022,13 @@ (define_insn "bswaphi2"
>>     [(set_attr "type" "rev")]
>>   )
>>   
>> +(define_insn "@aarch64_rev16<mode>"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r")
>> +	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_REV))]
>> +  ""
>> +  "rev16\\t%<w>0, %<w>1"
>> +  [(set_attr "type" "rev")])
>> +
>>   (define_insn "*aarch64_bfxil<mode>"
>>     [(set (match_operand:GPI 0 "register_operand" "=r,r")
>>       (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
>> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
>> index 9775a48c65825b424d3eb442384f5ab87b734fd7..d26e269cb843fe37ba789db09c40d06f53438cda 100644
>> --- a/gcc/config/aarch64/arm_acle.h
>> +++ b/gcc/config/aarch64/arm_acle.h
>> @@ -28,6 +28,7 @@
>>   #define _GCC_ARM_ACLE_H
>>   
>>   #include <stdint.h>
>> +#include <stddef.h>
>>   
>>   #pragma GCC aarch64 "arm_acle.h"
>>   
>> @@ -35,6 +36,58 @@
>>   extern "C" {
>>   #endif
>>   
>> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)				  \
>> +__extension__ extern __inline TYPE					  \
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	  \
>> +NAME (TYPE __value, uint32_t __rotate)					  \
>> +{									  \
>> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;				  \
>> +  __rotate = __rotate % __size;						  \
>> +  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
>> +}
>> +
>> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
>> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
>> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
>> +
>> +#undef _GCC_ARM_ACLE_ROR_FN
>> +
>> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)	    \
>> +__extension__ extern __inline RTYPE				    \
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
>> +__##NAME (ITYPE __value)					    \
>> +{								    \
>> +  return __builtin_##BUILTIN (__value);				    \
>> +}
>> +
>> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
>> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
>> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
>> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
>> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
>> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
>> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
>> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, int16_t)
>> +_GCC_ARM_ACLE_DATA_FN (rev, bswap32, uint32_t, uint32_t)
>> +_GCC_ARM_ACLE_DATA_FN (revll, bswap64, uint64_t, uint64_t)
>> +
>> +#undef _GCC_ARM_ACLE_DATA_FN
>> +
>> +__extension__ extern __inline unsigned long
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +__revl (unsigned long __value)
>> +{
>> +  if (sizeof (unsigned long) == 8)
>> +    return __revll (__value);
>> +  else
>> +    return __rev (__value);
>> +}
>> +
>>   #pragma GCC push_options
>>   #pragma GCC target ("arch=armv8.3-a")
>>   __extension__ extern __inline int32_t
>> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..e067ef20bbdc8993865b541aa99dccac6b03e6a0
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
>> @@ -0,0 +1,468 @@
>> +/* Test the ACLE data intrinsics.  */
>> +/* { dg-do assemble } */
>> +/* { dg-additional-options "--save-temps -O1" } */
>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>> +
>> +#include "arm_acle.h"
>> +
>> +/*
>> +** test_clz:
>> +**	clz	w0, w0
>> +**	ret
>> +*/
>> +
>> +unsigned int test_clz (uint32_t a)
>> +{
>> +  return __clz (a);
>> +}
>> +
>> +/*
>> +** test_clzl:
>> +**	clz	[wx]0, [wx]0
>> +**	ret
>> +*/
>> +
>> +unsigned int test_clzl (unsigned long a)
>> +{
>> +  return __clzl (a);
>> +}
>> +
>> +/*
>> +** test_clzll:
>> +**	clz	x0, x0
>> +**	ret
>> +*/
>> +
>> +unsigned int test_clzll (uint64_t a)
>> +{
>> +  return __clzll (a);
>> +}
>> +
>> +/*
>> +** test_cls:
>> +**	cls	w0, w0
>> +**	ret
>> +*/
>> +
>> +unsigned int test_cls (uint32_t a)
>> +{
>> +  return __cls (a);
>> +}
>> +
>> +/*
>> +** test_clsl:
>> +**	cls	[wx]0, [wx]0
>> +**	ret
>> +*/
>> +
>> +unsigned int test_clsl (unsigned long a)
>> +{
>> +  return __clsl (a);
>> +}
>> +
>> +/*
>> +** test_clsll:
>> +**	cls	x0, x0
>> +**	ret
>> +*/
>> +
>> +unsigned int test_clsll (uint64_t a)
>> +{
>> +  return __clsll (a);
>> +}
>> +
>> +/*
>> +** test_rbit:
>> +**	rbit	w0, w0
>> +**	ret
>> +*/
>> +
>> +uint32_t test_rbit (uint32_t a)
>> +{
>> +  return __rbit (a);
>> +}
>> +
>> +/*
>> +** test_rbitl:
>> +**	rbit	[wx]0, [wx]0
>> +**	ret
>> +*/
>> +
>> +unsigned long test_rbitl (unsigned long a)
>> +{
>> +  return __rbitl (a);
>> +}
>> +
>> +/*
>> +** test_rbitll:
>> +**	rbit	x0, x0
>> +**	ret
>> +*/
>> +
>> +uint64_t test_rbitll (uint64_t a)
>> +{
>> +  return __rbitll (a);
>> +}
>> +
>> +/*
>> +** test_rev:
>> +**	rev	w0, w0
>> +**	ret
>> +*/
>> +
>> +uint32_t test_rev (uint32_t a)
>> +{
>> +  return __rev (a);
>> +}
>> +
>> +/*
>> +** test_revl:
>> +**	rev	[wx]0, [wx]0
>> +**	ret
>> +*/
>> +
>> +unsigned long test_revl (unsigned long a)
>> +{
>> +  return __revl (a);
>> +}
>> +
>> +/*
>> +** test_revll:
>> +**	rev	x0, x0
>> +**	ret
>> +*/
>> +
>> +uint64_t test_revll (uint64_t a)
>> +{
>> +  return __revll (a);
>> +}
>> +
>> +/*
>> +** test_rev16:
>> +**	rev16	w0, w0
>> +**	ret
>> +*/
>> +
>> +uint32_t test_rev16 (uint32_t a)
>> +{
>> +  return __rev16 (a);
>> +}
>> +
>> +/*
>> +** test_rev16l:
>> +**	rev16	[wx]0, [wx]0
>> +**	ret
>> +*/
>> +
>> +unsigned long test_rev16l (unsigned long a)
>> +{
>> +  return __rev16l (a);
>> +}
>> +
>> +/*
>> +** test_rev16ll:
>> +**	rev16	x0, x0
>> +**	ret
>> +*/
>> +
>> +uint64_t test_rev16ll (uint64_t a)
>> +{
>> +  return __rev16ll (a);
>> +}
>> +
>> +/*
>> +** test_ror:
>> +**	ror	w0, w0, w1
>> +**	ret
>> +*/
>> +
>> +uint32_t test_ror (uint32_t a, uint32_t r)
>> +{
>> +  return __ror (a, r);
>> +}
>> +
>> +/*
>> +** test_rorl:
>> +**	ror	[wx]0, [wx]0, [wx]1
>> +**	ret
>> +*/
>> +
>> +unsigned long test_rorl (unsigned long a, uint32_t r)
>> +{
>> +  return __rorl (a, r);
>> +}
>> +
>> +/*
>> +** test_rorll:
>> +**	ror	x0, x0, x1
>> +**	ret
>> +*/
>> +
>> +uint64_t test_rorll (uint64_t a, uint32_t r)
>> +{
>> +  return __rorll (a, r);
>> +}
>> +
>> +/*
>> +** test_revsh:
>> +**	rev16	w0, w0
>> +**	ret
>> +*/
>> +
>> +int16_t test_revsh (int16_t a)
>> +{
>> +  return __revsh (a);
>> +}
>> +
>> +uint32_t *g32;
>> +unsigned long *gul;
>> +uint64_t *g64;
>> +unsigned int *gui;
>> +int16_t *g16;
>> +
>> +/*
>> +** test_clz_mem:
>> +**	...
>> +**	clz	w[0-9]+, w[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_clz_mem (uint32_t *a)
>> +{
>> +  *gui = __clz (*a);
>> +}
>> +
>> +/*
>> +** test_clzl_mem:
>> +**	...
>> +**	clz	[wx][0-9]+, [wx][0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_clzl_mem (unsigned long *a)
>> +{
>> +  *gui = __clzl (*a);
>> +}
>> +
>> +/*
>> +** test_clzll_mem:
>> +**	...
>> +**	clz	x[0-9]+, x[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_clzll_mem (uint64_t *a)
>> +{
>> +  *gui = __clzll (*a);
>> +}
>> +
>> +/*
>> +** test_cls_mem:
>> +**	...
>> +**	cls	w[0-9]+, w[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_cls_mem (uint32_t *a)
>> +{
>> +  *gui = __cls (*a);
>> +}
>> +
>> +/*
>> +** test_clsl_mem:
>> +**	...
>> +**	cls	[wx][0-9]+, [wx][0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_clsl_mem (unsigned long *a)
>> +{
>> +  *gui = __clsl (*a);
>> +}
>> +
>> +/*
>> +** test_clsll_mem:
>> +**	...
>> +**	cls	x[0-9]+, x[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_clsll_mem (uint64_t *a)
>> +{
>> +  *gui = __clsll (*a);
>> +}
>> +
>> +/*
>> +** test_rbit_mem:
>> +**	...
>> +**	rbit	w[0-9]+, w[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_rbit_mem (uint32_t *a)
>> +{
>> +  *g32 = __rbit (*a);
>> +}
>> +
>> +/*
>> +** test_rbitl_mem:
>> +**	...
>> +**	rbit	[wx][0-9]+, [wx][0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_rbitl_mem (unsigned long *a)
>> +{
>> +  *gul = __rbitl (*a);
>> +}
>> +
>> +/*
>> +** test_rbitll_mem:
>> +**	...
>> +**	rbit	x[0-9]+, x[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_rbitll_mem (uint64_t *a)
>> +{
>> +  *g64 = __rbitll (*a);
>> +}
>> +
>> +/*
>> +** test_rev_mem:
>> +**	...
>> +**	rev	w[0-9]+, w[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_rev_mem (uint32_t *a)
>> +{
>> +  *g32 = __rev (*a);
>> +}
>> +
>> +/*
>> +** test_revl_mem:
>> +**	...
>> +**	rev	[wx][0-9]+, [wx][0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_revl_mem (unsigned long *a)
>> +{
>> +  *gul = __revl (*a);
>> +}
>> +
>> +/*
>> +** test_revll_mem:
>> +**	...
>> +**	rev	x[0-9]+, x[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_revll_mem (uint64_t *a)
>> +{
>> +  *g64 = __revll (*a);
>> +}
>> +
>> +/*
>> +** test_rev16_mem:
>> +**	...
>> +**	rev16	w[0-9]+, w[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_rev16_mem (uint32_t *a)
>> +{
>> +  *g32 = __rev16 (*a);
>> +}
>> +
>> +/*
>> +** test_rev16l_mem:
>> +**	...
>> +**	rev16	[wx][0-9]+, [wx][0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_rev16l_mem (unsigned long *a)
>> +{
>> +  *gul = __rev16l (*a);
>> +}
>> +
>> +/*
>> +** test_rev16ll_mem:
>> +**	...
>> +**	rev16	x[0-9]+, x[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_rev16ll_mem (uint64_t *a)
>> +{
>> +  *g64 = __rev16ll (*a);
>> +}
>> +
>> +/*
>> +** test_ror_mem:
>> +**	...
>> +**	ror	w[0-9]+, w[0-9]+, w[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_ror_mem (uint32_t *a, uint32_t *r)
>> +{
>> +  *g32 = __ror (*a, *r);
>> +}
>> +
>> +/*
>> +** test_rorl_mem:
>> +**	...
>> +**	ror	[wx][0-9]+, [wx][0-9]+, [wx][0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_rorl_mem (unsigned long *a, uint32_t *r)
>> +{
>> +  *gul = __rorl (*a, *r);
>> +}
>> +
>> +/*
>> +** test_rorll_mem:
>> +**	...
>> +**	ror	x[0-9]+, x[0-9]+, x[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_rorll_mem (uint64_t *a, uint32_t *r)
>> +{
>> +  *g64 = __rorll (*a, *r);
>> +}
>> +
>> +/*
>> +** test_revsh_mem:
>> +**	...
>> +**	rev16	w[0-9]+, w[0-9]+
>> +**	...
>> +**	ret
>> +*/
>> +
>> +void test_revsh_mem (int16_t *a)
>> +{
>> +  *g16 = __revsh (*a);
>> +}
Kyrylo Tkachov Aug. 11, 2022, 3:21 p.m. UTC | #7
> -----Original Message-----
> From: Andre Vieira (lists) <andre.simoesdiasvieira@arm.com>
> Sent: Thursday, August 11, 2022 4:11 PM
> To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>;
> Richard Sandiford <Richard.Sandiford@arm.com>; Richard Biener
> <rguenther@suse.de>
> Subject: Re: [PATCH][AArch64] Implement ACLE Data Intrinsics
> 
> OK to backport this to gcc-12? Applies cleanly and did a bootstrap and
> regression test on aarch64-linux-gnu

Ok as long as it's before the branch freeze.
Thanks,
Kyrill

> 
> Regards,
> Andre
> 
> On 01/07/2022 12:26, Richard Sandiford wrote:
> > "Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
> >> On 29/06/2022 08:18, Richard Sandiford wrote:
> >>>> +      break;
> >>>> +    case AARCH64_RBIT:
> >>>> +    case AARCH64_RBITL:
> >>>> +    case AARCH64_RBITLL:
> >>>> +      if (mode == SImode)
> >>>> +	icode = CODE_FOR_aarch64_rbitsi;
> >>>> +      else
> >>>> +	icode = CODE_FOR_aarch64_rbitdi;
> >>>> +      break;
> >>>> +    default:
> >>>> +      gcc_unreachable ();
> >>>> +    }
> >>>> +  expand_insn (icode, 2, ops);
> >>>> +  return target;
> >>> This needs to return ops[0].value instead, since "target" just suggests
> >>> a possible location.
> >>>
> >>> Could you add tests for a memory source and memory destination, e.g.:
> >>>
> >>> void test_clz_mem (uint32_t *a)
> >>> {
> >>>     *a = __clz (*a);
> >>> }
> >>>
> >>> Without tests like that, these comments probably just sound like a paper
> >>> exercise, but they should make a difference for memory sources (previous
> >>> review) and memory destinations (this round).
> >> I had locally tested it (with rev though because clz doesn't use that
> >> code) and strangely it does seem to work for the memory destinations,
> >> but that's just a simple test.
> >> It could very well go wrong with some more complex codegen, so I'll just
> >> take your word and use ops[0].value.
> >>
> >> And yeah I didn't add the tests at the time, don't really know why, I'll
> >> chalk it up to laziness :P
> >>>> diff --git a/gcc/config/aarch64/arm_acle.h
> b/gcc/config/aarch64/arm_acle.h
> >>>> index
> 9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290
> 083f3f072fd5a2ce 100644
> >>>> --- a/gcc/config/aarch64/arm_acle.h
> >>>> +++ b/gcc/config/aarch64/arm_acle.h
> >>>> @@ -28,6 +28,7 @@
> >>>>    #define _GCC_ARM_ACLE_H
> >>>>
> >>>>    #include <stdint.h>
> >>>> +#include <stddef.h>
> >>>>
> >>>>    #pragma GCC aarch64 "arm_acle.h"
> >>>>
> >>>> @@ -35,6 +36,58 @@
> >>>>    extern "C" {
> >>>>    #endif
> >>>>
> >>>> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)
> 		  \
> >>>> +__extension__ extern __inline TYPE
> 	  \
> >>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> 	  \
> >>>> +NAME (TYPE __value, uint32_t __rotate)
> 	  \
> >>>> +{									  \
> >>>> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;
> 	  \
> >>>> +  __rotate = __rotate % __size;
> 	  \
> >>>> +  return __value >> __rotate | __value << ((__size - __rotate) % __size);
> \
> >>>> +}
> >>>> +
> >>>> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
> >>>> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
> >>>> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
> >>>> +
> >>>> +#undef _GCC_ARM_ACLE_ROR_FN
> >>>> +
> >>>> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)
> 	    \
> >>>> +__extension__ extern __inline RTYPE				    \
> >>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
> >>>> +__##NAME (ITYPE __value)					    \
> >>>> +{								    \
> >>>> +  return __builtin_##BUILTIN (__value);				    \
> >>>> +}
> >>>> +
> >>>> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
> >>>> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
> >>>> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
> >>>> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
> >>>> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
> >>>> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
> >>>> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t,
> uint32_t)
> >>>> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long,
> unsigned long)
> >>>> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t,
> uint64_t)
> >>>> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
> >>>> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long,
> unsigned long)
> >>>> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
> >>>> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, uint16_t)
> >>> The return type should be int16_t.
> >> Nice catch!
> >>> The clz and cls tests have the old return types (same as the argument
> >>> types), but I guess that's a good thing, since it shows that we avoid
> >>> the redundant zero-extend in clzll and clsll.
> >> Yeah I noticed that too when I was adding the mem tests, but I did
> >> change them though because at the time it just felt like an oversight,
> >> though I too was pleasantly surprised GCC was managing to avoid the
> >> zero-extending :)
> >> I then saw your comment and it made me wonder whether I should keep the
> >> wrong return types in... I haven't, but I'm happy to change them back if you
> >> think it's a nice 'test' to have.
> > I thought it was OK/useful as it was, but I don't mind either way.
> >
> > BTW, while trying it out locally, I noticed:
> >
> >    aarch64_init_data_intrinsics
> >
> > was called from the wrong place.  Since it's adding normal __builtin
> > functions, it should be called from aarch64_general_init_builtins
> > instead of handle_arm_acle_h.
> >
> > handle_arm_acle_h is instead for cases where we want to simulate
> > C/C++ definitions of the ACLE intrinsics themselves (i.e. so that
> > the intrinsics themselves are built-in functions, rather than
> > inline wrappers around built-in functions).
> >
> > OK with that change, thanks.
> >
> > Thanks,
> > Richard
> >
> >> Regards,
> >> Andre
> >>
> >> diff --git a/gcc/config/aarch64/aarch64-builtins.cc
> b/gcc/config/aarch64/aarch64-builtins.cc
> >> index
> e0a741ac663188713e21f457affa57217d074783..bb5d97c8fc6402635270df85
> 1a949cabeecaa5e8 100644
> >> --- a/gcc/config/aarch64/aarch64-builtins.cc
> >> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> >> @@ -613,6 +613,12 @@ enum aarch64_builtins
> >>     AARCH64_LS64_BUILTIN_ST64B,
> >>     AARCH64_LS64_BUILTIN_ST64BV,
> >>     AARCH64_LS64_BUILTIN_ST64BV0,
> >> +  AARCH64_REV16,
> >> +  AARCH64_REV16L,
> >> +  AARCH64_REV16LL,
> >> +  AARCH64_RBIT,
> >> +  AARCH64_RBITL,
> >> +  AARCH64_RBITLL,
> >>     AARCH64_BUILTIN_MAX
> >>   };
> >>
> >> @@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
> >>         = aarch64_general_add_builtin (data[i].name, data[i].type,
> data[i].code);
> >>   }
> >>
> >> +static void
> >> +aarch64_init_data_intrinsics (void)
> >> +{
> >> +  tree uint32_fntype = build_function_type_list (uint32_type_node,
> >> +						 uint32_type_node,
> NULL_TREE);
> >> +  tree ulong_fntype = build_function_type_list
> (long_unsigned_type_node,
> >> +						long_unsigned_type_node,
> >> +						NULL_TREE);
> >> +  tree uint64_fntype = build_function_type_list (uint64_type_node,
> >> +						 uint64_type_node,
> NULL_TREE);
> >> +  aarch64_builtin_decls[AARCH64_REV16]
> >> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16",
> uint32_fntype,
> >> +				   AARCH64_REV16);
> >> +  aarch64_builtin_decls[AARCH64_REV16L]
> >> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16l",
> ulong_fntype,
> >> +				   AARCH64_REV16L);
> >> +  aarch64_builtin_decls[AARCH64_REV16LL]
> >> +    = aarch64_general_add_builtin ("__builtin_aarch64_rev16ll",
> uint64_fntype,
> >> +				   AARCH64_REV16LL);
> >> +  aarch64_builtin_decls[AARCH64_RBIT]
> >> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbit",
> uint32_fntype,
> >> +				   AARCH64_RBIT);
> >> +  aarch64_builtin_decls[AARCH64_RBITL]
> >> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitl",
> ulong_fntype,
> >> +				   AARCH64_RBITL);
> >> +  aarch64_builtin_decls[AARCH64_RBITLL]
> >> +    = aarch64_general_add_builtin ("__builtin_aarch64_rbitll",
> uint64_fntype,
> >> +				   AARCH64_RBITLL);
> >> +}
> >> +
> >>   /* Implement #pragma GCC aarch64 "arm_acle.h".  */
> >>   void
> >>   handle_arm_acle_h (void)
> >>   {
> >> +  aarch64_init_data_intrinsics ();
> >>     if (TARGET_LS64)
> >>       aarch64_init_ls64_builtins ();
> >>   }
> >> @@ -2394,6 +2431,37 @@ aarch64_expand_builtin_memtag (int fcode,
> tree exp, rtx target)
> >>     return target;
> >>   }
> >>
> >> +/* Function to expand an expression EXP which calls one of the ACLE
> Data
> >> +   Intrinsic builtins FCODE with the result going to TARGET.  */
> >> +static rtx
> >> +aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx
> target)
> >> +{
> >> +  expand_operand ops[2];
> >> +  machine_mode mode = GET_MODE (target);
> >> +  create_output_operand (&ops[0], target, mode);
> >> +  create_input_operand (&ops[1], expand_normal (CALL_EXPR_ARG
> (exp, 0)), mode);
> >> +  enum insn_code icode;
> >> +
> >> +  switch (fcode)
> >> +    {
> >> +    case AARCH64_REV16:
> >> +    case AARCH64_REV16L:
> >> +    case AARCH64_REV16LL:
> >> +      icode = code_for_aarch64_rev16 (mode);
> >> +      break;
> >> +    case AARCH64_RBIT:
> >> +    case AARCH64_RBITL:
> >> +    case AARCH64_RBITLL:
> >> +      icode = code_for_aarch64_rbit (mode);
> >> +      break;
> >> +    default:
> >> +      gcc_unreachable ();
> >> +    }
> >> +
> >> +  expand_insn (icode, 2, ops);
> >> +  return ops[0].value;
> >> +}
> >> +
> >>   /* Expand an expression EXP as fpsr or fpcr setter (depending on
> >>      UNSPEC) using MODE.  */
> >>   static void
> >> @@ -2551,6 +2619,9 @@ aarch64_general_expand_builtin (unsigned int
> fcode, tree exp, rtx target,
> >>     if (fcode >= AARCH64_MEMTAG_BUILTIN_START
> >>         && fcode <= AARCH64_MEMTAG_BUILTIN_END)
> >>       return aarch64_expand_builtin_memtag (fcode, exp, target);
> >> +  if (fcode >= AARCH64_REV16
> >> +      && fcode <= AARCH64_RBITLL)
> >> +    return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
> >>
> >>     gcc_unreachable ();
> >>   }
> >> diff --git a/gcc/config/aarch64/aarch64.md
> b/gcc/config/aarch64/aarch64.md
> >> index
> acec8c1146765c0fac73c15351853324b8f03209..ef0aed25c6b26eff61f9f6030d
> c5921a534e3d19 100644
> >> --- a/gcc/config/aarch64/aarch64.md
> >> +++ b/gcc/config/aarch64/aarch64.md
> >> @@ -4950,7 +4950,7 @@ (define_expand "ffs<mode>2"
> >>       rtx ccreg = aarch64_gen_compare_reg (EQ, operands[1], const0_rtx);
> >>       rtx x = gen_rtx_NE (VOIDmode, ccreg, const0_rtx);
> >>
> >> -    emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
> >> +    emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0],
> operands[1]));
> >>       emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
> >>       emit_insn (gen_csinc3<mode>_insn (operands[0], x, operands[0],
> const0_rtx));
> >>       DONE;
> >> @@ -4996,7 +4996,7 @@ (define_insn "clrsb<mode>2"
> >>     [(set_attr "type" "clz")]
> >>   )
> >>
> >> -(define_insn "rbit<mode>2"
> >> +(define_insn "@aarch64_rbit<mode>"
> >>     [(set (match_operand:GPI 0 "register_operand" "=r")
> >>   	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")]
> UNSPEC_RBIT))]
> >>     ""
> >> @@ -5017,7 +5017,7 @@ (define_insn_and_split "ctz<mode>2"
> >>     "reload_completed"
> >>     [(const_int 0)]
> >>     "
> >> -  emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
> >> +  emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0],
> operands[1]));
> >>     emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
> >>     DONE;
> >>   ")
> >> @@ -6022,6 +6022,13 @@ (define_insn "bswaphi2"
> >>     [(set_attr "type" "rev")]
> >>   )
> >>
> >> +(define_insn "@aarch64_rev16<mode>"
> >> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> >> +	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")]
> UNSPEC_REV))]
> >> +  ""
> >> +  "rev16\\t%<w>0, %<w>1"
> >> +  [(set_attr "type" "rev")])
> >> +
> >>   (define_insn "*aarch64_bfxil<mode>"
> >>     [(set (match_operand:GPI 0 "register_operand" "=r,r")
> >>       (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
> >> diff --git a/gcc/config/aarch64/arm_acle.h
> b/gcc/config/aarch64/arm_acle.h
> >> index
> 9775a48c65825b424d3eb442384f5ab87b734fd7..d26e269cb843fe37ba789db
> 09c40d06f53438cda 100644
> >> --- a/gcc/config/aarch64/arm_acle.h
> >> +++ b/gcc/config/aarch64/arm_acle.h
> >> @@ -28,6 +28,7 @@
> >>   #define _GCC_ARM_ACLE_H
> >>
> >>   #include <stdint.h>
> >> +#include <stddef.h>
> >>
> >>   #pragma GCC aarch64 "arm_acle.h"
> >>
> >> @@ -35,6 +36,58 @@
> >>   extern "C" {
> >>   #endif
> >>
> >> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)
> 	  \
> >> +__extension__ extern __inline TYPE
> 	  \
> >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))	  \
> >> +NAME (TYPE __value, uint32_t __rotate)
> 	  \
> >> +{									  \
> >> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;
> 	  \
> >> +  __rotate = __rotate % __size;
> 	  \
> >> +  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
> >> +}
> >> +
> >> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
> >> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
> >> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
> >> +
> >> +#undef _GCC_ARM_ACLE_ROR_FN
> >> +
> >> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)
> 	    \
> >> +__extension__ extern __inline RTYPE				    \
> >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
> >> +__##NAME (ITYPE __value)					    \
> >> +{								    \
> >> +  return __builtin_##BUILTIN (__value);				    \
> >> +}
> >> +
> >> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
> >> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
> >> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
> >> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
> >> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
> >> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
> >> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
> >> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long,
> unsigned long)
> >> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
> >> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
> >> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long,
> unsigned long)
> >> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
> >> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, int16_t)
> >> +_GCC_ARM_ACLE_DATA_FN (rev, bswap32, uint32_t, uint32_t)
> >> +_GCC_ARM_ACLE_DATA_FN (revll, bswap64, uint64_t, uint64_t)
> >> +
> >> +#undef _GCC_ARM_ACLE_DATA_FN
> >> +
> >> +__extension__ extern __inline unsigned long
> >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >> +__revl (unsigned long __value)
> >> +{
> >> +  if (sizeof (unsigned long) == 8)
> >> +    return __revll (__value);
> >> +  else
> >> +    return __rev (__value);
> >> +}
> >> +
> >>   #pragma GCC push_options
> >>   #pragma GCC target ("arch=armv8.3-a")
> >>   __extension__ extern __inline int32_t
> >> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
> b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
> >> new file mode 100644
> >> index
> 0000000000000000000000000000000000000000..e067ef20bbdc8993865b541
> aa99dccac6b03e6a0
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
> >> @@ -0,0 +1,468 @@
> >> +/* Test the ACLE data intrinsics.  */
> >> +/* { dg-do assemble } */
> >> +/* { dg-additional-options "--save-temps -O1" } */
> >> +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> +
> >> +#include "arm_acle.h"
> >> +
> >> +/*
> >> +** test_clz:
> >> +**	clz	w0, w0
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned int test_clz (uint32_t a)
> >> +{
> >> +  return __clz (a);
> >> +}
> >> +
> >> +/*
> >> +** test_clzl:
> >> +**	clz	[wx]0, [wx]0
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned int test_clzl (unsigned long a)
> >> +{
> >> +  return __clzl (a);
> >> +}
> >> +
> >> +/*
> >> +** test_clzll:
> >> +**	clz	x0, x0
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned int test_clzll (uint64_t a)
> >> +{
> >> +  return __clzll (a);
> >> +}
> >> +
> >> +/*
> >> +** test_cls:
> >> +**	cls	w0, w0
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned int test_cls (uint32_t a)
> >> +{
> >> +  return __cls (a);
> >> +}
> >> +
> >> +/*
> >> +** test_clsl:
> >> +**	cls	[wx]0, [wx]0
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned int test_clsl (unsigned long a)
> >> +{
> >> +  return __clsl (a);
> >> +}
> >> +
> >> +/*
> >> +** test_clsll:
> >> +**	cls	x0, x0
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned int test_clsll (uint64_t a)
> >> +{
> >> +  return __clsll (a);
> >> +}
> >> +
> >> +/*
> >> +** test_rbit:
> >> +**	rbit	w0, w0
> >> +**	ret
> >> +*/
> >> +
> >> +uint32_t test_rbit (uint32_t a)
> >> +{
> >> +  return __rbit (a);
> >> +}
> >> +
> >> +/*
> >> +** test_rbitl:
> >> +**	rbit	[wx]0, [wx]0
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned long test_rbitl (unsigned long a)
> >> +{
> >> +  return __rbitl (a);
> >> +}
> >> +
> >> +/*
> >> +** test_rbitll:
> >> +**	rbit	x0, x0
> >> +**	ret
> >> +*/
> >> +
> >> +uint64_t test_rbitll (uint64_t a)
> >> +{
> >> +  return __rbitll (a);
> >> +}
> >> +
> >> +/*
> >> +** test_rev:
> >> +**	rev	w0, w0
> >> +**	ret
> >> +*/
> >> +
> >> +uint32_t test_rev (uint32_t a)
> >> +{
> >> +  return __rev (a);
> >> +}
> >> +
> >> +/*
> >> +** test_revl:
> >> +**	rev	[wx]0, [wx]0
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned long test_revl (unsigned long a)
> >> +{
> >> +  return __revl (a);
> >> +}
> >> +
> >> +/*
> >> +** test_revll:
> >> +**	rev	x0, x0
> >> +**	ret
> >> +*/
> >> +
> >> +uint64_t test_revll (uint64_t a)
> >> +{
> >> +  return __revll (a);
> >> +}
> >> +
> >> +/*
> >> +** test_rev16:
> >> +**	rev16	w0, w0
> >> +**	ret
> >> +*/
> >> +
> >> +uint32_t test_rev16 (uint32_t a)
> >> +{
> >> +  return __rev16 (a);
> >> +}
> >> +
> >> +/*
> >> +** test_rev16l:
> >> +**	rev16	[wx]0, [wx]0
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned long test_rev16l (unsigned long a)
> >> +{
> >> +  return __rev16l (a);
> >> +}
> >> +
> >> +/*
> >> +** test_rev16ll:
> >> +**	rev16	x0, x0
> >> +**	ret
> >> +*/
> >> +
> >> +uint64_t test_rev16ll (uint64_t a)
> >> +{
> >> +  return __rev16ll (a);
> >> +}
> >> +
> >> +/*
> >> +** test_ror:
> >> +**	ror	w0, w0, w1
> >> +**	ret
> >> +*/
> >> +
> >> +uint32_t test_ror (uint32_t a, uint32_t r)
> >> +{
> >> +  return __ror (a, r);
> >> +}
> >> +
> >> +/*
> >> +** test_rorl:
> >> +**	ror	[wx]0, [wx]0, [wx]1
> >> +**	ret
> >> +*/
> >> +
> >> +unsigned long test_rorl (unsigned long a, uint32_t r)
> >> +{
> >> +  return __rorl (a, r);
> >> +}
> >> +
> >> +/*
> >> +** test_rorll:
> >> +**	ror	x0, x0, x1
> >> +**	ret
> >> +*/
> >> +
> >> +uint64_t test_rorll (uint64_t a, uint32_t r)
> >> +{
> >> +  return __rorll (a, r);
> >> +}
> >> +
> >> +/*
> >> +** test_revsh:
> >> +**	rev16	w0, w0
> >> +**	ret
> >> +*/
> >> +
> >> +int16_t test_revsh (int16_t a)
> >> +{
> >> +  return __revsh (a);
> >> +}
> >> +
> >> +uint32_t *g32;
> >> +unsigned long *gul;
> >> +uint64_t *g64;
> >> +unsigned int *gui;
> >> +int16_t *g16;
> >> +
> >> +/*
> >> +** test_clz_mem:
> >> +**	...
> >> +**	clz	w[0-9]+, w[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_clz_mem (uint32_t *a)
> >> +{
> >> +  *gui = __clz (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_clzl_mem:
> >> +**	...
> >> +**	clz	[wx][0-9]+, [wx][0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_clzl_mem (unsigned long *a)
> >> +{
> >> +  *gui = __clzl (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_clzll_mem:
> >> +**	...
> >> +**	clz	x[0-9]+, x[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_clzll_mem (uint64_t *a)
> >> +{
> >> +  *gui = __clzll (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_cls_mem:
> >> +**	...
> >> +**	cls	w[0-9]+, w[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_cls_mem (uint32_t *a)
> >> +{
> >> +  *gui = __cls (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_clsl_mem:
> >> +**	...
> >> +**	cls	[wx][0-9]+, [wx][0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_clsl_mem (unsigned long *a)
> >> +{
> >> +  *gui = __clsl (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_clsll_mem:
> >> +**	...
> >> +**	cls	x[0-9]+, x[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_clsll_mem (uint64_t *a)
> >> +{
> >> +  *gui = __clsll (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_rbit_mem:
> >> +**	...
> >> +**	rbit	w[0-9]+, w[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_rbit_mem (uint32_t *a)
> >> +{
> >> +  *g32 = __rbit (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_rbitl_mem:
> >> +**	...
> >> +**	rbit	[wx][0-9]+, [wx][0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_rbitl_mem (unsigned long *a)
> >> +{
> >> +  *gul = __rbitl (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_rbitll_mem:
> >> +**	...
> >> +**	rbit	x[0-9]+, x[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_rbitll_mem (uint64_t *a)
> >> +{
> >> +  *g64 = __rbitll (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_rev_mem:
> >> +**	...
> >> +**	rev	w[0-9]+, w[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_rev_mem (uint32_t *a)
> >> +{
> >> +  *g32 = __rev (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_revl_mem:
> >> +**	...
> >> +**	rev	[wx][0-9]+, [wx][0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_revl_mem (unsigned long *a)
> >> +{
> >> +  *gul = __revl (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_revll_mem:
> >> +**	...
> >> +**	rev	x[0-9]+, x[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_revll_mem (uint64_t *a)
> >> +{
> >> +  *g64 = __revll (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_rev16_mem:
> >> +**	...
> >> +**	rev16	w[0-9]+, w[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_rev16_mem (uint32_t *a)
> >> +{
> >> +  *g32 = __rev16 (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_rev16l_mem:
> >> +**	...
> >> +**	rev16	[wx][0-9]+, [wx][0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_rev16l_mem (unsigned long *a)
> >> +{
> >> +  *gul = __rev16l (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_rev16ll_mem:
> >> +**	...
> >> +**	rev16	x[0-9]+, x[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_rev16ll_mem (uint64_t *a)
> >> +{
> >> +  *g64 = __rev16ll (*a);
> >> +}
> >> +
> >> +/*
> >> +** test_ror_mem:
> >> +**	...
> >> +**	ror	w[0-9]+, w[0-9]+, w[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_ror_mem (uint32_t *a, uint32_t *r)
> >> +{
> >> +  *g32 = __ror (*a, *r);
> >> +}
> >> +
> >> +/*
> >> +** test_rorl_mem:
> >> +**	...
> >> +**	ror	[wx][0-9]+, [wx][0-9]+, [wx][0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_rorl_mem (unsigned long *a, uint32_t *r)
> >> +{
> >> +  *gul = __rorl (*a, *r);
> >> +}
> >> +
> >> +/*
> >> +** test_rorll_mem:
> >> +**	...
> >> +**	ror	x[0-9]+, x[0-9]+, x[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_rorll_mem (uint64_t *a, uint32_t *r)
> >> +{
> >> +  *g64 = __rorll (*a, *r);
> >> +}
> >> +
> >> +/*
> >> +** test_revsh_mem:
> >> +**	...
> >> +**	rev16	w[0-9]+, w[0-9]+
> >> +**	...
> >> +**	ret
> >> +*/
> >> +
> >> +void test_revsh_mem (int16_t *a)
> >> +{
> >> +  *g16 = __revsh (*a);
> >> +}
Andre Vieira (lists) Oct. 4, 2022, 10:33 a.m. UTC | #8
Hi all,

Can I backport this to the gcc-11 branch? It also applies cleanly (with the
exception of the file extensions being different: 'aarch64-builtins.cc'
vs 'aarch64-builtins.c').

Bootstrapped and regression tested on aarch64-linux-gnu.

Kind regards,
Andre Vieira
Kyrylo Tkachov Oct. 5, 2022, 8:47 a.m. UTC | #9
> -----Original Message-----
> From: Andre Vieira (lists) <andre.simoesdiasvieira@arm.com>
> Sent: Tuesday, October 4, 2022 11:34 AM
> To: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; gcc-patches@gcc.gnu.org;
> Richard Sandiford <Richard.Sandiford@arm.com>; Richard Biener
> <rguenther@suse.de>
> Subject: Re: [PATCH][AArch64] Implement ACLE Data Intrinsics
> 
> Hi all,
> 
> Can I backport this to gcc-11 branch? Also applies cleanly (with the
> exception of the file extensions being different: 'aarch64-builtins.cc
> vs aarch64-builtins.c').
> 

Ok by me if testing is clean.
Thanks,
Kyrill

> Bootstrapped and regression tested on aarch64-linux-gnu.
> 
> Kind regards,
> Andre Vieira
Richard Biener Oct. 6, 2022, 7:23 a.m. UTC | #10
On Wed, 5 Oct 2022, Kyrylo Tkachov wrote:

> 
> 
> > -----Original Message-----
> > From: Andre Vieira (lists) <andre.simoesdiasvieira@arm.com>
> > Sent: Tuesday, October 4, 2022 11:34 AM
> > To: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; gcc-patches@gcc.gnu.org;
> > Richard Sandiford <Richard.Sandiford@arm.com>; Richard Biener
> > <rguenther@suse.de>
> > Subject: Re: [PATCH][AArch64] Implement ACLE Data Intrinsics
> > 
> > Hi all,
> > 
> > Can I backport this to gcc-11 branch? Also applies cleanly (with the
> > exception of the file extensions being different: 'aarch64-builtins.cc
> > vs aarch64-builtins.c').
> > 
> 
> Ok by me if testing is clean.

Target patches like this are really up to the maintainers to decide
for backporting.

Richard.
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index e0a741ac663188713e21f457affa57217d074783..91a687dee13a27c21f0c50de9ba777aa900d6096 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -613,6 +613,12 @@  enum aarch64_builtins
   AARCH64_LS64_BUILTIN_ST64B,
   AARCH64_LS64_BUILTIN_ST64BV,
   AARCH64_LS64_BUILTIN_ST64BV0,
+  AARCH64_REV16,
+  AARCH64_REV16L,
+  AARCH64_REV16LL,
+  AARCH64_RBIT,
+  AARCH64_RBITL,
+  AARCH64_RBITLL,
   AARCH64_BUILTIN_MAX
 };
 
@@ -1664,10 +1670,41 @@  aarch64_init_ls64_builtins (void)
       = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
 }
 
+static void
+aarch64_init_data_intrinsics (void)
+{
+  tree uint32_fntype = build_function_type_list (uint32_type_node,
+						 uint32_type_node, NULL_TREE);
+  tree long_fntype = build_function_type_list (long_unsigned_type_node,
+					       long_unsigned_type_node,
+					       NULL_TREE);
+  tree uint64_fntype = build_function_type_list (uint64_type_node,
+						 uint64_type_node, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_REV16]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
+				   AARCH64_REV16);
+  aarch64_builtin_decls[AARCH64_REV16L]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rev16l", long_fntype,
+				   AARCH64_REV16L);
+  aarch64_builtin_decls[AARCH64_REV16LL]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
+				   AARCH64_REV16LL);
+  aarch64_builtin_decls[AARCH64_RBIT]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
+				   AARCH64_RBIT);
+  aarch64_builtin_decls[AARCH64_RBITL]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rbitl", long_fntype,
+				   AARCH64_RBITL);
+  aarch64_builtin_decls[AARCH64_RBITLL]
+    = aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
+				   AARCH64_RBITLL);
+}
+
 /* Implement #pragma GCC aarch64 "arm_acle.h".  */
 void
 handle_arm_acle_h (void)
 {
+  aarch64_init_data_intrinsics ();
   if (TARGET_LS64)
     aarch64_init_ls64_builtins ();
 }
@@ -2393,6 +2430,32 @@  aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target)
   emit_insn (pat);
   return target;
 }
+/* Expand EXP, a call to ACLE data intrinsic builtin FCODE, using TARGET as a
+   hint for the result.  The mode comes from EXP as constants are VOIDmode.  */
+static rtx
+aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
+{
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+  expand_operand ops[2];
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], expand_normal (CALL_EXPR_ARG (exp, 0)), mode);
+  switch (fcode)
+    {
+    case AARCH64_REV16:
+    case AARCH64_REV16L:
+    case AARCH64_REV16LL:
+      expand_insn (code_for_aarch64_rev16 (mode), 2, ops);
+      break;
+    case AARCH64_RBIT:
+    case AARCH64_RBITL:
+    case AARCH64_RBITLL:
+      expand_insn (code_for_aarch64_rbit (mode), 2, ops);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  return ops[0].value;
+}
 
 /* Expand an expression EXP as fpsr or fpcr setter (depending on
    UNSPEC) using MODE.  */
@@ -2551,6 +2614,9 @@  aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target,
   if (fcode >= AARCH64_MEMTAG_BUILTIN_START
       && fcode <= AARCH64_MEMTAG_BUILTIN_END)
     return aarch64_expand_builtin_memtag (fcode, exp, target);
+  if (fcode >= AARCH64_REV16
+      && fcode <= AARCH64_RBITLL)
+    return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
 
   gcc_unreachable ();
 }
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index acec8c1146765c0fac73c15351853324b8f03209..ef0aed25c6b26eff61f9f6030dc5921a534e3d19 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4950,7 +4950,7 @@  (define_expand "ffs<mode>2"
     rtx ccreg = aarch64_gen_compare_reg (EQ, operands[1], const0_rtx);
     rtx x = gen_rtx_NE (VOIDmode, ccreg, const0_rtx);
 
-    emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
+    emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
     emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
     emit_insn (gen_csinc3<mode>_insn (operands[0], x, operands[0], const0_rtx));
     DONE;
@@ -4996,7 +4996,7 @@  (define_insn "clrsb<mode>2"
   [(set_attr "type" "clz")]
 )
 
-(define_insn "rbit<mode>2"
+(define_insn "@aarch64_rbit<mode>"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_RBIT))]
   ""
@@ -5017,7 +5017,7 @@  (define_insn_and_split "ctz<mode>2"
   "reload_completed"
   [(const_int 0)]
   "
-  emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
+  emit_insn (gen_aarch64_rbit (<MODE>mode, operands[0], operands[1]));
   emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
   DONE;
 ")
@@ -6022,6 +6022,13 @@  (define_insn "bswaphi2"
   [(set_attr "type" "rev")]
 )
 
+(define_insn "@aarch64_rev16<mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_REV))]
+  ""
+  "rev16\\t%<w>0, %<w>1"
+  [(set_attr "type" "rev")])
+
 (define_insn "*aarch64_bfxil<mode>"
   [(set (match_operand:GPI 0 "register_operand" "=r,r")
     (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 9775a48c65825b424d3eb442384f5ab87b734fd7..faddd5d0a780c5d65ba430bd3174c701e848c794 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -28,6 +28,7 @@ 
 #define _GCC_ARM_ACLE_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 #pragma GCC aarch64 "arm_acle.h"
 
@@ -35,6 +36,54 @@ 
 extern "C" {
 #endif
 
+#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)                                  \
+__extension__ extern __inline TYPE                                        \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))       \
+NAME (TYPE __value, uint32_t __rotate)                                    \
+{ /* Rotate right.  The last "%" avoids an undefined shift by __size.  */ \
+  size_t __size = sizeof (TYPE) * __CHAR_BIT__;                           \
+  __rotate = __rotate % __size;                                           \
+  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
+}
+
+_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
+_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
+_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
+
+#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, TYPE)                   \
+__extension__ extern __inline TYPE                                   \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))  \
+__##NAME (TYPE __value)                                              \
+{ /* Define __NAME as a wrapper forwarding to __builtin_BUILTIN.  */ \
+  return __builtin_##BUILTIN (__value);                              \
+}
+
+_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t)
+_GCC_ARM_ACLE_DATA_FN (rev, bswap32, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (revll, bswap64, uint64_t)
+
+/* Forward to __revll when unsigned long is 8 bytes wide, else to __rev.  */
+__extension__ extern __inline unsigned long
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__revl (unsigned long __value)
+{
+  return (sizeof (unsigned long) == 8
+	  ? __revll (__value)
+	  : __rev (__value));
+}
+
 #pragma GCC push_options
 #pragma GCC target ("arch=armv8.3-a")
 __extension__ extern __inline int32_t
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
new file mode 100644
index 0000000000000000000000000000000000000000..90813184704dfcdaf2d24d523ff744aa6cbedf1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/data-intrinsics.c
@@ -0,0 +1,215 @@ 
+/* Test the ACLE data intrinsics.  */
+/* { dg-do assemble } */
+/* { dg-additional-options "--save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include "arm_acle.h"
+
+/*
+** test_clz:
+**	clz	w0, w0
+**	ret
+*/
+
+uint32_t test_clz (uint32_t a)
+{
+  return __clz (a);
+}
+
+/*
+** test_clzl:
+**	clz	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_clzl (unsigned long a)
+{
+  return __clzl (a);
+}
+
+/*
+** test_clzll:
+**	clz	x0, x0
+**	ret
+*/
+
+uint64_t test_clzll (uint64_t a)
+{
+  return __clzll (a);
+}
+
+/*
+** test_cls:
+**	cls	w0, w0
+**	ret
+*/
+
+uint32_t test_cls (uint32_t a)
+{
+  return __cls (a);
+}
+
+/*
+** test_clsl:
+**	cls	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_clsl (unsigned long a)
+{
+  return __clsl (a);
+}
+
+/*
+** test_clsll:
+**	cls	x0, x0
+**	ret
+*/
+
+uint64_t test_clsll (uint64_t a)
+{
+  return __clsll (a);
+}
+
+/*
+** test_rbit:
+**	rbit	w0, w0
+**	ret
+*/
+
+uint32_t test_rbit (uint32_t a)
+{
+  return __rbit (a);
+}
+
+/*
+** test_rbitl:
+**	rbit	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_rbitl (unsigned long a)
+{
+  return __rbitl (a);
+}
+
+/*
+** test_rbitll:
+**	rbit	x0, x0
+**	ret
+*/
+
+uint64_t test_rbitll (uint64_t a)
+{
+  return __rbitll (a);
+}
+
+/*
+** test_rev:
+**	rev	w0, w0
+**	ret
+*/
+
+uint32_t test_rev (uint32_t a)
+{
+  return __rev (a);
+}
+
+/*
+** test_revl:
+**	rev	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_revl (unsigned long a)
+{
+  return __revl (a);
+}
+
+/*
+** test_revll:
+**	rev	x0, x0
+**	ret
+*/
+
+uint64_t test_revll (uint64_t a)
+{
+  return __revll (a);
+}
+
+/*
+** test_rev16:
+**	rev16	w0, w0
+**	ret
+*/
+
+uint32_t test_rev16 (uint32_t a)
+{
+  return __rev16 (a);
+}
+
+/*
+** test_rev16l:
+**	rev16	[wx]0, [wx]0
+**	ret
+*/
+
+unsigned long test_rev16l (unsigned long a)
+{
+  return __rev16l (a);
+}
+
+/*
+** test_rev16ll:
+**	rev16	x0, x0
+**	ret
+*/
+
+uint64_t test_rev16ll (uint64_t a)
+{
+  return __rev16ll (a);
+}
+
+/*
+** test_ror:
+**	ror	w0, w0, w1
+**	ret
+*/
+
+uint32_t test_ror (uint32_t a, uint32_t r)
+{
+  return __ror (a, r);
+}
+
+/*
+** test_rorl:
+**	ror	[wx]0, [wx]0, [wx]1
+**	ret
+*/
+
+unsigned long test_rorl (unsigned long a, uint32_t r)
+{
+  return __rorl (a, r);
+}
+
+/*
+** test_rorll:
+**	ror	x0, x0, x1
+**	ret
+*/
+
+uint64_t test_rorll (uint64_t a, uint32_t r)
+{
+  return __rorll (a, r);
+}
+
+/*
+** test_revsh:
+**	rev16	w0, w0
+**	ret
+*/
+
+int16_t test_revsh (int16_t a)
+{
+  return __revsh (a);
+}