diff mbox series

[v2] libatomic: Add rcpc3 128-bit atomic operations for AArch64

Message ID 20240612123505.837005-1-victor.donascimento@arm.com
State New
Headers show
Series [v2] libatomic: Add rcpc3 128-bit atomic operations for AArch64 | expand

Commit Message

Victor Do Nascimento June 12, 2024, 12:35 p.m. UTC
The introduction of the optional RCPC3 architectural extension for
Armv8.2-A upwards provides additional support for the release
consistency model, introducing the Load-Acquire RCpc Pair Ordered, and
Store-Release Pair Ordered operations in the form of LDIAPP and STILP.

These operations are single-copy atomic on cores which also implement
LSE2 and, as such, support for these operations is added to Libatomic
and employed accordingly when the LSE2 and RCPC3 features are detected
in a given core at runtime.

libatomic/ChangeLog:

	* config/linux/aarch64/atomic_16.S (libat_load_16): Add LRCPC3
	variant.
	(libat_store_16): Likewise.
	* config/linux/aarch64/host-config.h (HWCAP2_LRCPC3): New.
	(LSE2_LRCPC3_ATOP): Previously LSE2_ATOP.  New ifuncs guarded
	under it.
	(has_rcpc3): New.
---
 libatomic/config/linux/aarch64/atomic_16.S   | 46 +++++++++++++++++++-
 libatomic/config/linux/aarch64/host-config.h | 34 +++++++++++++--
 2 files changed, 74 insertions(+), 6 deletions(-)

Comments

Richard Sandiford June 20, 2024, 8:25 a.m. UTC | #1
Victor Do Nascimento <victor.donascimento@arm.com> writes:
> The introduction of the optional RCPC3 architectural extension for
> Armv8.2-A upwards provides additional support for the release
> consistency model, introducing the Load-Acquire RCpc Pair Ordered, and
> Store-Release Pair Ordered operations in the form of LDIAPP and STILP.
>
> These operations are single-copy atomic on cores which also implement
> LSE2 and, as such, support for these operations is added to Libatomic
> and employed accordingly when the LSE2 and RCPC3 features are detected
> in a given core at runtime.
>
> libatomic/ChangeLog:
>
> 	* config/linux/aarch64/atomic_16.S (libat_load_16): Add LRCPC3
> 	variant.
> 	(libat_store_16): Likewise.
> 	* config/linux/aarch64/host-config.h (HWCAP2_LRCPC3): New.
> 	(LSE2_LRCPC3_ATOP): Previously LSE2_ATOP.  New ifuncs guarded
> 	under it.
> 	(has_rcpc3): New.
> ---
>  libatomic/config/linux/aarch64/atomic_16.S   | 46 +++++++++++++++++++-
>  libatomic/config/linux/aarch64/host-config.h | 34 +++++++++++++--
>  2 files changed, 74 insertions(+), 6 deletions(-)
>
> diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
> index c44c31c6418..5767fba5c03 100644
> --- a/libatomic/config/linux/aarch64/atomic_16.S
> +++ b/libatomic/config/linux/aarch64/atomic_16.S
> @@ -35,16 +35,21 @@
>     writes, this will be true when using atomics in actual code.
>  
>     The libat_<op>_16 entry points are ARMv8.0.
> -   The libat_<op>_16_i1 entry points are used when LSE128 is available.
> +   The libat_<op>_16_i1 entry points are used when LSE128 or LRCPC3 is available.
>     The libat_<op>_16_i2 entry points are used when LSE2 is available.  */
>  
>  #include "auto-config.h"
>  
>  	.arch	armv8-a+lse
>  
> +/* There is overlap in atomic instructions implemented in RCPC3 and LSE2.
> +   Consequently, both _i1 and _i2 suffixes are needed for functions using these.
> +   Elsewhere, all extension-specific implementations are mapped to _i1.  */
> +
> +#define LRCPC3(NAME)	libat_##NAME##_i1
>  #define LSE128(NAME)	libat_##NAME##_i1
>  #define LSE(NAME)	libat_##NAME##_i1
> -#define LSE2(NAME)	libat_##NAME##_i1
> +#define LSE2(NAME)	libat_##NAME##_i2
>  #define CORE(NAME)	libat_##NAME
>  #define ATOMIC(NAME)	__atomic_##NAME
>  
> @@ -513,6 +518,43 @@ END (test_and_set_16)
>  /* ifunc implementations: Carries run-time dependence on the presence of further
>     architectural extensions.  */
>  
> +ENTRY_FEAT (load_16, LRCPC3)
> +	cbnz	w1, 1f
> +
> +	/* RELAXED.  */
> +	ldp	res0, res1, [x0]
> +	ret
> +1:
> +	cmp	w1, SEQ_CST
> +	b.eq	2f
> +
> +	/* ACQUIRE/CONSUME (Load-AcquirePC semantics).  */
> +	/* ldiapp res0, res1, [x0]  */
> +	.inst	0xd9411800
> +	ret
> +
> +	/* SEQ_CST.  */
> +2:	ldar	tmp0, [x0]	/* Block reordering with Store-Release instr.  */
> +	/* ldiapp res0, res1, [x0]  */
> +	.inst	0xd9411800
> +	ret
> +END_FEAT (load_16, LRCPC3)
> +
> +
> +ENTRY_FEAT (store_16, LRCPC3)
> +	cbnz	w4, 1f
> +
> +	/* RELAXED.  */
> +	stp	in0, in1, [x0]
> +	ret
> +
> +	/* RELEASE/SEQ_CST.  */
> +1:	/* stilp in0, in1, [x0]  */
> +	.inst	0xd9031802
> +	ret
> +END_FEAT (store_16, LRCPC3)
> +
> +
>  ENTRY_FEAT (exchange_16, LSE128)
>  	mov	tmp0, x0
>  	mov	res0, in0
> diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
> index d05e9eb628f..8adf0563001 100644
> --- a/libatomic/config/linux/aarch64/host-config.h
> +++ b/libatomic/config/linux/aarch64/host-config.h
> @@ -33,6 +33,9 @@
>  #ifndef HWCAP_USCAT
>  # define HWCAP_USCAT	(1 << 25)
>  #endif
> +#ifndef HWCAP2_LRCPC3
> +# define HWCAP2_LRCPC3	(1UL << 46)
> +#endif
>  #ifndef HWCAP2_LSE128
>  # define HWCAP2_LSE128	(1UL << 47)
>  #endif
> @@ -54,7 +57,7 @@ typedef struct __ifunc_arg_t {
>  #if defined (LAT_CAS_N)
>  # define LSE_ATOP
>  #elif defined (LAT_LOAD_N) || defined (LAT_STORE_N)
> -# define LSE2_ATOP
> +# define LSE2_LRCPC3_ATOP
>  #elif defined (LAT_EXCH_N) || defined (LAT_FIOR_N) || defined (LAT_FAND_N)
>  # define LSE128_ATOP
>  #endif
> @@ -63,9 +66,10 @@ typedef struct __ifunc_arg_t {
>  #  if defined (LSE_ATOP)
>  #   define IFUNC_NCOND(N)	1
>  #   define IFUNC_COND_1	(hwcap & HWCAP_ATOMICS)
> -#  elif defined (LSE2_ATOP)
> -#   define IFUNC_NCOND(N)	1
> -#   define IFUNC_COND_1	(has_lse2 (hwcap, features))
> +#  elif defined (LSE2_LRCPC3_ATOP)
> +#   define IFUNC_NCOND(N)	2
> +#   define IFUNC_COND_1	(has_rcpc3 (hwcap, features))
> +#   define IFUNC_COND_2	(has_lse2 (hwcap, features))
>  #  elif defined (LSE128_ATOP)
>  #   define IFUNC_NCOND(N)	1
>  #   define IFUNC_COND_1	(has_lse128 (hwcap, features))
> @@ -131,6 +135,28 @@ has_lse128 (unsigned long hwcap, const __ifunc_arg_t *features)
>    return false;
>  }
>  
> +/* LRCPC atomic support encoded in ID_AA64ISAR1_EL1.Atomic, bits[23:20].  The
> +   expected value is 0b0011.  Check that.  */
> +
> +static inline bool
> +has_rcpc3 (unsigned long hwcap, const __ifunc_arg_t *features)
> +{
> +  if (hwcap & _IFUNC_ARG_HWCAP
> +      && features->_hwcap2 & HWCAP2_LRCPC3)
> +    return true;
> +  /* Try fallback feature check method to guarantee LRCPC3 is not implemented.
> +
> +     In the absence of HWCAP_CPUID, we are unable to check for RCPC3, return.
> +     If feature check available, check LSE2 prerequisite before proceeding.  */

It seems unfortunate that one of the things we do as part of this
function is check for the presence of LSE2, which is also what the
second ifunc does.  It might be clearer to have a single resolver
that selects the appropriate routine number.

But that would be another change to the target-independent code,
and you've already had to do one of those as part of this series.
I also can't be 100% sure that having a single resolver would make
things clearer.  So I agree the current approach is ok for now.

> +  if (!(hwcap & HWCAP_CPUID)  || !(hwcap & HWCAP_USCAT))

Nit: should be one fewer space before "||".

OK with that change, thanks.

Richard

> +    return false;
> +  unsigned long isar1;
> +  asm volatile ("mrs %0, ID_AA64ISAR1_EL1" : "=r" (isar1));
> +  if (AT_FEAT_FIELD (isar1) >= 3)
> +    return true;
> +  return false;
> +}
> +
>  #endif /* HAVE_IFUNC */
>  
>  /* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
diff mbox series

Patch

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index c44c31c6418..5767fba5c03 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -35,16 +35,21 @@ 
    writes, this will be true when using atomics in actual code.
 
    The libat_<op>_16 entry points are ARMv8.0.
-   The libat_<op>_16_i1 entry points are used when LSE128 is available.
+   The libat_<op>_16_i1 entry points are used when LSE128 or LRCPC3 is available.
    The libat_<op>_16_i2 entry points are used when LSE2 is available.  */
 
 #include "auto-config.h"
 
 	.arch	armv8-a+lse
 
+/* Both RCPC3 and LSE2 provide variants of the same operations (load_16 and
+   store_16), so those functions need distinct _i1 and _i2 suffixes.
+   Elsewhere, all extension-specific implementations are mapped to _i1.  */
+
+#define LRCPC3(NAME)	libat_##NAME##_i1
 #define LSE128(NAME)	libat_##NAME##_i1
 #define LSE(NAME)	libat_##NAME##_i1
-#define LSE2(NAME)	libat_##NAME##_i1
+#define LSE2(NAME)	libat_##NAME##_i2
 #define CORE(NAME)	libat_##NAME
 #define ATOMIC(NAME)	__atomic_##NAME
 
@@ -513,6 +518,43 @@  END (test_and_set_16)
 /* ifunc implementations: Carries run-time dependence on the presence of further
    architectural extensions.  */
 
+ENTRY_FEAT (load_16, LRCPC3)
+	cbnz	w1, 1f
+
+	/* RELAXED.  */
+	ldp	res0, res1, [x0]
+	ret
+1:
+	cmp	w1, SEQ_CST
+	b.eq	2f
+
+	/* ACQUIRE/CONSUME (Load-AcquirePC semantics).  */
+	/* ldiapp res0, res1, [x0]  */
+	.inst	0xd9411800
+	ret
+
+	/* SEQ_CST.  */
+2:	ldar	tmp0, [x0]	/* Block reordering with Store-Release instr.  */
+	/* ldiapp res0, res1, [x0]  */
+	.inst	0xd9411800
+	ret
+END_FEAT (load_16, LRCPC3)
+
+
+ENTRY_FEAT (store_16, LRCPC3)
+	cbnz	w4, 1f
+
+	/* RELAXED.  */
+	stp	in0, in1, [x0]
+	ret
+
+	/* RELEASE/SEQ_CST.  */
+1:	/* stilp in0, in1, [x0]  */
+	.inst	0xd9031802
+	ret
+END_FEAT (store_16, LRCPC3)
+
+
 ENTRY_FEAT (exchange_16, LSE128)
 	mov	tmp0, x0
 	mov	res0, in0
diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
index d05e9eb628f..8adf0563001 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -33,6 +33,9 @@ 
 #ifndef HWCAP_USCAT
 # define HWCAP_USCAT	(1 << 25)
 #endif
+#ifndef HWCAP2_LRCPC3
+# define HWCAP2_LRCPC3	(1UL << 46)
+#endif
 #ifndef HWCAP2_LSE128
 # define HWCAP2_LSE128	(1UL << 47)
 #endif
@@ -54,7 +57,7 @@  typedef struct __ifunc_arg_t {
 #if defined (LAT_CAS_N)
 # define LSE_ATOP
 #elif defined (LAT_LOAD_N) || defined (LAT_STORE_N)
-# define LSE2_ATOP
+# define LSE2_LRCPC3_ATOP
 #elif defined (LAT_EXCH_N) || defined (LAT_FIOR_N) || defined (LAT_FAND_N)
 # define LSE128_ATOP
 #endif
@@ -63,9 +66,10 @@  typedef struct __ifunc_arg_t {
 #  if defined (LSE_ATOP)
 #   define IFUNC_NCOND(N)	1
 #   define IFUNC_COND_1	(hwcap & HWCAP_ATOMICS)
-#  elif defined (LSE2_ATOP)
-#   define IFUNC_NCOND(N)	1
-#   define IFUNC_COND_1	(has_lse2 (hwcap, features))
+#  elif defined (LSE2_LRCPC3_ATOP)
+#   define IFUNC_NCOND(N)	2
+#   define IFUNC_COND_1	(has_rcpc3 (hwcap, features))
+#   define IFUNC_COND_2	(has_lse2 (hwcap, features))
 #  elif defined (LSE128_ATOP)
 #   define IFUNC_NCOND(N)	1
 #   define IFUNC_COND_1	(has_lse128 (hwcap, features))
@@ -131,6 +135,28 @@  has_lse128 (unsigned long hwcap, const __ifunc_arg_t *features)
   return false;
 }
 
+/* LRCPC3 support is encoded in ID_AA64ISAR1_EL1.LRCPC, bits[23:20].  A field
+   value of 0b0011 or greater indicates that LRCPC3 is implemented.  */
+
+static inline bool
+has_rcpc3 (unsigned long hwcap, const __ifunc_arg_t *features)
+{
+  if (hwcap & _IFUNC_ARG_HWCAP
+      && features->_hwcap2 & HWCAP2_LRCPC3)
+    return true;
+  /* Fall back to reading the ID register when HWCAP2 does not report LRCPC3.
+
+     Without HWCAP_CPUID the ID register cannot be queried, so return false.
+     Also return false if the LSE2 prerequisite (HWCAP_USCAT) is missing.  */
+  if (!(hwcap & HWCAP_CPUID)  || !(hwcap & HWCAP_USCAT))
+    return false;
+  unsigned long isar1;
+  asm volatile ("mrs %0, ID_AA64ISAR1_EL1" : "=r" (isar1));
+  if (AT_FEAT_FIELD (isar1) >= 3)
+    return true;
+  return false;
+}
+
 #endif /* HAVE_IFUNC */
 
 /* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */