
[PATCH] AArch64: Fix cpu features initialization [PR115342]

Message ID PAWPR08MB8982E391F1465FF711AB32D783F82@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Series [PATCH] AArch64: Fix cpu features initialization [PR115342]

Commit Message

Wilco Dijkstra June 4, 2024, 12:55 p.m. UTC
Fix CPU features initialization.  Use HWCAP rather than explicit accesses
to CPUID registers.  Perform the initialization atomically to avoid multi-
threading issues.

Passes regress, OK for commit and backport?

libgcc:
        PR target/115342
        * config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
        Use HWCAP where possible.  Use atomic write for initialization.
        (__init_cpu_features_resolver): Use atomic load for correct
        initialization.
        (__init_cpu_features): Likewise.

---

Comments

Richard Sandiford June 4, 2024, 1:28 p.m. UTC | #1
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Fix CPU features initialization.  Use HWCAP rather than explicit accesses
> to CPUID registers.  Perform the initialization atomically to avoid multi-
> threading issues.

Please describe the problem that the patch is fixing.  I think the
PR description would make a better commit message:

-----------------------------------------------------------------------
The CPU features initialization code uses CPUID registers. It uses
incorrect comparisons so that for example SVE is not set if SVE2 is
available. Using HWCAPs for these is both simpler and works
correctly. The initialization must also be done atomically to avoid
multiple threads causing corruption due to non-atomic RMW of the global.
-----------------------------------------------------------------------
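
As a side note, the race being fixed can be pictured with a minimal sketch (names below are illustrative placeholders, not the exact cpuinfo.c definitions, and the HWCAP bit values are assumptions): the old code OR-ed each bit straight into the global, so the constructor and an early ifunc resolver on another thread could interleave non-atomic read-modify-writes; the fix accumulates the bits in a local and publishes them with a single atomic store.

/* Minimal sketch of the initialization pattern the patch adopts.
   Struct, enum and function names are illustrative only.  */
#include <stdint.h>

static struct { uint64_t features; } cpu_features_sketch;

enum { FEAT_INIT, FEAT_SVE, FEAT_SVE2 };

static void
init_features_sketch (unsigned long hwcap, unsigned long hwcap2)
{
  uint64_t feat = 0;			/* Accumulate bits locally...  */

  feat |= 1UL << FEAT_INIT;
  if (hwcap & (1UL << 22))		/* HWCAP_SVE bit value assumed.  */
    feat |= 1UL << FEAT_SVE;
  if (hwcap2 & (1UL << 1))		/* HWCAP2_SVE2 bit value assumed.  */
    feat |= 1UL << FEAT_SVE2;

  /* ...then publish with one atomic store.  Readers use an atomic load
     of .features, so they observe either 0 or the complete set, never
     a partially written value.  */
  __atomic_store_n (&cpu_features_sketch.features, feat, __ATOMIC_RELAXED);
}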

What criteria did you use for choosing whether to keep or remove
the system register checks?

> Passes regress, OK for commit and backport?
>
> libgcc:
>         PR target/115342
>         * config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
>         Use HWCAP where possible.  Use atomic write for initialization.

It'd be good to mention the fix for the FEAT_PREDRES system register check
as well.

>         (__init_cpu_features_resolver): Use atomic load for correct
>         initialization.
>         (__init_cpu_features): Likewise.

Thanks,
Richard

>
> ---
>
> diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
> index 4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9 100644
> --- a/libgcc/config/aarch64/cpuinfo.c
> +++ b/libgcc/config/aarch64/cpuinfo.c
> @@ -227,14 +227,22 @@ struct {
>  #ifndef HWCAP2_SVE_EBF16
>  #define HWCAP2_SVE_EBF16 (1UL << 33)
>  #endif
> +#ifndef HWCAP2_SME2
> +#define HWCAP2_SME2 (1UL << 37)
> +#endif
> +#ifndef HWCAP2_LRCPC3
> +#define HWCAP2_LRCPC3	(1UL << 46)
> +#endif
>  
>  static void
> -__init_cpu_features_constructor(unsigned long hwcap,
> -				const __ifunc_arg_t *arg) {
> -#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F
> +__init_cpu_features_constructor (unsigned long hwcap,
> +				 const __ifunc_arg_t *arg)
> +{
> +  unsigned long feat = 0;
> +#define setCPUFeature(F) feat |= 1UL << F
>  #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr))
>  #define extractBits(val, start, number) \
> -  (val & ((1ULL << number) - 1ULL) << start) >> start
> +  (val & ((1UL << number) - 1UL) << start) >> start
>    unsigned long hwcap2 = 0;
>    if (hwcap & _IFUNC_ARG_HWCAP)
>      hwcap2 = arg->_hwcap2;
> @@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap,
>      setCPUFeature(FEAT_PMULL);
>    if (hwcap & HWCAP_FLAGM)
>      setCPUFeature(FEAT_FLAGM);
> -  if (hwcap2 & HWCAP2_FLAGM2) {
> -    setCPUFeature(FEAT_FLAGM);
> +  if (hwcap2 & HWCAP2_FLAGM2)
>      setCPUFeature(FEAT_FLAGM2);
> -  }
> -  if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4)
> +  if (hwcap & HWCAP_SM4)
>      setCPUFeature(FEAT_SM4);
>    if (hwcap & HWCAP_ASIMDDP)
>      setCPUFeature(FEAT_DOTPROD);
>    if (hwcap & HWCAP_ASIMDFHM)
>      setCPUFeature(FEAT_FP16FML);
> -  if (hwcap & HWCAP_FPHP) {
> +  if (hwcap & HWCAP_FPHP)
>      setCPUFeature(FEAT_FP16);
> -    setCPUFeature(FEAT_FP);
> -  }
>    if (hwcap & HWCAP_DIT)
>      setCPUFeature(FEAT_DIT);
>    if (hwcap & HWCAP_ASIMDRDM)
>      setCPUFeature(FEAT_RDM);
> -  if (hwcap & HWCAP_ILRCPC)
> -    setCPUFeature(FEAT_RCPC2);
>    if (hwcap & HWCAP_AES)
>      setCPUFeature(FEAT_AES);
>    if (hwcap & HWCAP_SHA1)
> @@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap,
>    if (hwcap & HWCAP_SB)
>      setCPUFeature(FEAT_SB);
>    if (hwcap & HWCAP_SSBS)
> -    setCPUFeature(FEAT_SSBS2);
> -  if (hwcap2 & HWCAP2_MTE) {
> -    setCPUFeature(FEAT_MEMTAG);
> -    setCPUFeature(FEAT_MEMTAG2);
> -  }
> -  if (hwcap2 & HWCAP2_MTE3) {
> -    setCPUFeature(FEAT_MEMTAG);
> -    setCPUFeature(FEAT_MEMTAG2);
> +    {
> +      setCPUFeature(FEAT_SSBS);
> +      setCPUFeature(FEAT_SSBS2);
> +    }
> +  if (hwcap2 & HWCAP2_MTE)
> +    {
> +      setCPUFeature(FEAT_MEMTAG);
> +      setCPUFeature(FEAT_MEMTAG2);
> +    }
> +  if (hwcap2 & HWCAP2_MTE3)
>      setCPUFeature(FEAT_MEMTAG3);
> -  }
>    if (hwcap2 & HWCAP2_SVEAES)
>      setCPUFeature(FEAT_SVE_AES);
> -  if (hwcap2 & HWCAP2_SVEPMULL) {
> -    setCPUFeature(FEAT_SVE_AES);
> +  if (hwcap2 & HWCAP2_SVEPMULL)
>      setCPUFeature(FEAT_SVE_PMULL128);
> -  }
>    if (hwcap2 & HWCAP2_SVEBITPERM)
>      setCPUFeature(FEAT_SVE_BITPERM);
>    if (hwcap2 & HWCAP2_SVESHA3)
> @@ -329,108 +330,76 @@ __init_cpu_features_constructor(unsigned long hwcap,
>      setCPUFeature(FEAT_WFXT);
>    if (hwcap2 & HWCAP2_SME)
>      setCPUFeature(FEAT_SME);
> +  if (hwcap2 & HWCAP2_SME2)
> +    setCPUFeature(FEAT_SME2);
>    if (hwcap2 & HWCAP2_SME_I16I64)
>      setCPUFeature(FEAT_SME_I64);
>    if (hwcap2 & HWCAP2_SME_F64F64)
>      setCPUFeature(FEAT_SME_F64);
> -  if (hwcap & HWCAP_CPUID) {
> -    unsigned long ftr;
> -    getCPUFeature(ID_AA64PFR1_EL1, ftr);
> -    /* ID_AA64PFR1_EL1.MTE >= 0b0001  */
> -    if (extractBits(ftr, 8, 4) >= 0x1)
> -      setCPUFeature(FEAT_MEMTAG);
> -    /* ID_AA64PFR1_EL1.SSBS == 0b0001  */
> -    if (extractBits(ftr, 4, 4) == 0x1)
> -      setCPUFeature(FEAT_SSBS);
> -    /* ID_AA64PFR1_EL1.SME == 0b0010  */
> -    if (extractBits(ftr, 24, 4) == 0x2)
> -      setCPUFeature(FEAT_SME2);
> -    getCPUFeature(ID_AA64PFR0_EL1, ftr);
> -    /* ID_AA64PFR0_EL1.FP != 0b1111  */
> -    if (extractBits(ftr, 16, 4) != 0xF) {
> -      setCPUFeature(FEAT_FP);
> -      /* ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP  */
> -      setCPUFeature(FEAT_SIMD);
> -    }
> -    /* ID_AA64PFR0_EL1.SVE != 0b0000  */
> -    if (extractBits(ftr, 32, 4) != 0x0) {
> -      /* get ID_AA64ZFR0_EL1, that name supported if sve enabled only  */
> -      getCPUFeature(S3_0_C0_C4_4, ftr);
> -      /* ID_AA64ZFR0_EL1.SVEver == 0b0000  */
> -      if (extractBits(ftr, 0, 4) == 0x0)
> -	setCPUFeature(FEAT_SVE);
> -      /* ID_AA64ZFR0_EL1.SVEver == 0b0001  */
> -      if (extractBits(ftr, 0, 4) == 0x1)
> -	setCPUFeature(FEAT_SVE2);
> -      /* ID_AA64ZFR0_EL1.BF16 != 0b0000  */
> -      if (extractBits(ftr, 20, 4) != 0x0)
> -	setCPUFeature(FEAT_SVE_BF16);
> +  if (hwcap & HWCAP_CPUID)
> +    {
> +      unsigned long ftr;
> +
> +      getCPUFeature(ID_AA64ISAR1_EL1, ftr);
> +      /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001  */
> +      if (extractBits(ftr, 40, 4) >= 0x1)
> +	setCPUFeature(FEAT_PREDRES);
> +      /* ID_AA64ISAR1_EL1.LS64 >= 0b0001  */
> +      if (extractBits(ftr, 60, 4) >= 0x1)
> +	setCPUFeature(FEAT_LS64);
> +      /* ID_AA64ISAR1_EL1.LS64 >= 0b0010  */
> +      if (extractBits(ftr, 60, 4) >= 0x2)
> +	setCPUFeature(FEAT_LS64_V);
> +      /* ID_AA64ISAR1_EL1.LS64 >= 0b0011  */
> +      if (extractBits(ftr, 60, 4) >= 0x3)
> +	setCPUFeature(FEAT_LS64_ACCDATA);
>      }
> -    getCPUFeature(ID_AA64ISAR0_EL1, ftr);
> -    /* ID_AA64ISAR0_EL1.SHA3 != 0b0000  */
> -    if (extractBits(ftr, 32, 4) != 0x0)
> -      setCPUFeature(FEAT_SHA3);
> -    getCPUFeature(ID_AA64ISAR1_EL1, ftr);
> -    /* ID_AA64ISAR1_EL1.DPB >= 0b0001  */
> -    if (extractBits(ftr, 0, 4) >= 0x1)
> -      setCPUFeature(FEAT_DPB);
> -    /* ID_AA64ISAR1_EL1.LRCPC != 0b0000  */
> -    if (extractBits(ftr, 20, 4) != 0x0)
> -      setCPUFeature(FEAT_RCPC);
> -    /* ID_AA64ISAR1_EL1.LRCPC == 0b0011  */
> -    if (extractBits(ftr, 20, 4) == 0x3)
> -      setCPUFeature(FEAT_RCPC3);
> -    /* ID_AA64ISAR1_EL1.SPECRES == 0b0001  */
> -    if (extractBits(ftr, 40, 4) == 0x2)
> -      setCPUFeature(FEAT_PREDRES);
> -    /* ID_AA64ISAR1_EL1.BF16 != 0b0000  */
> -    if (extractBits(ftr, 44, 4) != 0x0)
> -      setCPUFeature(FEAT_BF16);
> -    /* ID_AA64ISAR1_EL1.LS64 >= 0b0001  */
> -    if (extractBits(ftr, 60, 4) >= 0x1)
> -      setCPUFeature(FEAT_LS64);
> -    /* ID_AA64ISAR1_EL1.LS64 >= 0b0010  */
> -    if (extractBits(ftr, 60, 4) >= 0x2)
> -      setCPUFeature(FEAT_LS64_V);
> -    /* ID_AA64ISAR1_EL1.LS64 >= 0b0011  */
> -    if (extractBits(ftr, 60, 4) >= 0x3)
> -      setCPUFeature(FEAT_LS64_ACCDATA);
> -  } else {
> -    /* Set some features in case of no CPUID support.  */
> -    if (hwcap & (HWCAP_FP | HWCAP_FPHP)) {
> +
> +  if (hwcap & HWCAP_FP)
> +    {
>        setCPUFeature(FEAT_FP);
>        /* FP and AdvSIMD fields have the same value.  */
>        setCPUFeature(FEAT_SIMD);
>      }
> -    if (hwcap & HWCAP_DCPOP || hwcap2 & HWCAP2_DCPODP)
> -      setCPUFeature(FEAT_DPB);
> -    if (hwcap & HWCAP_LRCPC || hwcap & HWCAP_ILRCPC)
> -      setCPUFeature(FEAT_RCPC);
> -    if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16)
> -      setCPUFeature(FEAT_BF16);
> -    if (hwcap2 & HWCAP2_SVEBF16)
> -      setCPUFeature(FEAT_SVE_BF16);
> -    if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE)
> -      setCPUFeature(FEAT_SVE2);
> -    if (hwcap & HWCAP_SHA3)
> -      setCPUFeature(FEAT_SHA3);
> -  }
> +  if (hwcap & HWCAP_DCPOP)
> +    setCPUFeature(FEAT_DPB);
> +  if (hwcap & HWCAP_LRCPC)
> +    setCPUFeature(FEAT_RCPC);
> +  if (hwcap & HWCAP_ILRCPC)
> +    setCPUFeature(FEAT_RCPC2);
> +  if (hwcap2 & HWCAP2_LRCPC3)
> +    setCPUFeature(FEAT_RCPC3);
> +  if (hwcap2 & HWCAP2_BF16)
> +    setCPUFeature(FEAT_BF16);
> +  if (hwcap2 & HWCAP2_SVEBF16)
> +    setCPUFeature(FEAT_SVE_BF16);
> +  if (hwcap & HWCAP_SVE)
> +    setCPUFeature(FEAT_SVE);
> +  if (hwcap2 & HWCAP2_SVE2)
> +    setCPUFeature(FEAT_SVE2);
> +  if (hwcap & HWCAP_SHA3)
> +    setCPUFeature(FEAT_SHA3);
>    setCPUFeature(FEAT_INIT);
> +
> +  __atomic_store_n (&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED);
>  }
>  
>  void
> -__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) {
> -  if (__aarch64_cpu_features.features)
> +__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg)
> +{
> +  if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED))
>      return;
>    __init_cpu_features_constructor(hwcap, arg);
>  }
>  
>  void __attribute__ ((constructor))
> -__init_cpu_features(void) {
> +__init_cpu_features(void)
> +{
>    unsigned long hwcap;
>    unsigned long hwcap2;
> +
>    /* CPU features already initialized.  */
> -  if (__aarch64_cpu_features.features)
> +  if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED))
>      return;
>    hwcap = getauxval(AT_HWCAP);
>    hwcap2 = getauxval(AT_HWCAP2);
Wilco Dijkstra June 4, 2024, 5:09 p.m. UTC | #2
Hi Richard,

I've reworded the commit message a bit:

The CPU features initialization code uses CPUID registers (rather than
HWCAP).  The equality comparisons it uses are incorrect: for example FEAT_SVE
is not set if SVE2 is available.  Using HWCAPs for these is both simpler and
correct.  The initialization must also be done atomically to avoid multiple
threads causing corruption due to non-atomic RMW accesses to the global.

> What criteria did you use for choosing whether to keep or remove
> the system register checks?

Essentially anything covered by HWCAP doesn't need an explicit check. So I kept
the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not
entirely convinced we need these, let alone having 3 individual bits for LS64, but
that's something for the ACLE spec to sort out). The goal here is to fix all obvious
bugs so one can use FMV as intended.
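
To make "covered by HWCAP" concrete, here is a minimal user-space sketch of the kind of check involved: one independent getauxval bit test per feature, with no CPUID field comparisons and no implied ordering between SVE and SVE2.  The fallback bit values are assumptions for older headers, in the same spirit as cpuinfo.c's own #defines.

#include <stdbool.h>
#include <sys/auxv.h>

#ifndef HWCAP_SVE
#define HWCAP_SVE   (1UL << 22)	/* assumed AT_HWCAP bit  */
#endif
#ifndef HWCAP2_SVE2
#define HWCAP2_SVE2 (1UL << 1)		/* assumed AT_HWCAP2 bit  */
#endif

static bool
have_sve (void)
{
  /* SVE no longer depends on reading ID_AA64ZFR0_EL1.SVEver, so it is
     still reported when SVE2 (or later) is implemented.  */
  return (getauxval (AT_HWCAP) & HWCAP_SVE) != 0;
}

static bool
have_sve2 (void)
{
  return (getauxval (AT_HWCAP2) & HWCAP2_SVE2) != 0;
}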

> Passes regress, OK for commit and backport?
>
> libgcc:
>         PR target/115342
>         * config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
>         Use HWCAP where possible.  Use atomic write for initialization.

> It'd be good to mention the fix for the FEAT_PREDRES system register check
> as well.

Done, see below.

Cheers,
Wilco


v2: Update commit message and mention PREDRES.

The CPU features initialization code uses CPUID registers (rather than
HWCAP).  The equality comparisons it uses are incorrect: for example FEAT_SVE
is not set if SVE2 is available.  Using HWCAPs for these is both simpler and
correct.  The initialization must also be done atomically to avoid multiple
threads causing corruption due to non-atomic RMW accesses to the global.

Passes regress, OK for commit and backport?

libgcc:
        PR target/115342
        * config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
        Use HWCAP where possible.  Use atomic write for initialization.
        Fix FEAT_PREDRES comparison.
        (__init_cpu_features_resolver): Use atomic load for correct
        initialization.
        (__init_cpu_features): Likewise.

---

diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
index 4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9 100644
--- a/libgcc/config/aarch64/cpuinfo.c
+++ b/libgcc/config/aarch64/cpuinfo.c
@@ -227,14 +227,22 @@ struct {
 #ifndef HWCAP2_SVE_EBF16
 #define HWCAP2_SVE_EBF16 (1UL << 33)
 #endif
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+#ifndef HWCAP2_LRCPC3
+#define HWCAP2_LRCPC3	(1UL << 46)
+#endif
 
 static void
-__init_cpu_features_constructor(unsigned long hwcap,
-				const __ifunc_arg_t *arg) {
-#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F
+__init_cpu_features_constructor (unsigned long hwcap,
+				 const __ifunc_arg_t *arg)
+{
+  unsigned long feat = 0;
+#define setCPUFeature(F) feat |= 1UL << F
 #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr))
 #define extractBits(val, start, number) \
-  (val & ((1ULL << number) - 1ULL) << start) >> start
+  (val & ((1UL << number) - 1UL) << start) >> start
   unsigned long hwcap2 = 0;
   if (hwcap & _IFUNC_ARG_HWCAP)
     hwcap2 = arg->_hwcap2;
@@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap,
     setCPUFeature(FEAT_PMULL);
   if (hwcap & HWCAP_FLAGM)
     setCPUFeature(FEAT_FLAGM);
-  if (hwcap2 & HWCAP2_FLAGM2) {
-    setCPUFeature(FEAT_FLAGM);
+  if (hwcap2 & HWCAP2_FLAGM2)
     setCPUFeature(FEAT_FLAGM2);
-  }
-  if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4)
+  if (hwcap & HWCAP_SM4)
     setCPUFeature(FEAT_SM4);
   if (hwcap & HWCAP_ASIMDDP)
     setCPUFeature(FEAT_DOTPROD);
   if (hwcap & HWCAP_ASIMDFHM)
     setCPUFeature(FEAT_FP16FML);
-  if (hwcap & HWCAP_FPHP) {
+  if (hwcap & HWCAP_FPHP)
     setCPUFeature(FEAT_FP16);
-    setCPUFeature(FEAT_FP);
-  }
   if (hwcap & HWCAP_DIT)
     setCPUFeature(FEAT_DIT);
   if (hwcap & HWCAP_ASIMDRDM)
     setCPUFeature(FEAT_RDM);
-  if (hwcap & HWCAP_ILRCPC)
-    setCPUFeature(FEAT_RCPC2);
   if (hwcap & HWCAP_AES)
     setCPUFeature(FEAT_AES);
   if (hwcap & HWCAP_SHA1)
@@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap,
   if (hwcap & HWCAP_SB)
     setCPUFeature(FEAT_SB);
   if (hwcap & HWCAP_SSBS)
-    setCPUFeature(FEAT_SSBS2);
-  if (hwcap2 & HWCAP2_MTE) {
-    setCPUFeature(FEAT_MEMTAG);
-    setCPUFeature(FEAT_MEMTAG2);
-  }
-  if (hwcap2 & HWCAP2_MTE3) {
-    setCPUFeature(FEAT_MEMTAG);
-    setCPUFeature(FEAT_MEMTAG2);
+    {
+      setCPUFeature(FEAT_SSBS);
+      setCPUFeature(FEAT_SSBS2);
+    }
+  if (hwcap2 & HWCAP2_MTE)
+    {
+      setCPUFeature(FEAT_MEMTAG);
+      setCPUFeature(FEAT_MEMTAG2);
+    }
+  if (hwcap2 & HWCAP2_MTE3)
     setCPUFeature(FEAT_MEMTAG3);
-  }
   if (hwcap2 & HWCAP2_SVEAES)
     setCPUFeature(FEAT_SVE_AES);
-  if (hwcap2 & HWCAP2_SVEPMULL) {
-    setCPUFeature(FEAT_SVE_AES);
+  if (hwcap2 & HWCAP2_SVEPMULL)
     setCPUFeature(FEAT_SVE_PMULL128);
-  }
   if (hwcap2 & HWCAP2_SVEBITPERM)
     setCPUFeature(FEAT_SVE_BITPERM);
   if (hwcap2 & HWCAP2_SVESHA3)
@@ -329,108 +330,76 @@ __init_cpu_features_constructor(unsigned long hwcap,
     setCPUFeature(FEAT_WFXT);
   if (hwcap2 & HWCAP2_SME)
     setCPUFeature(FEAT_SME);
+  if (hwcap2 & HWCAP2_SME2)
+    setCPUFeature(FEAT_SME2);
   if (hwcap2 & HWCAP2_SME_I16I64)
     setCPUFeature(FEAT_SME_I64);
   if (hwcap2 & HWCAP2_SME_F64F64)
     setCPUFeature(FEAT_SME_F64);
-  if (hwcap & HWCAP_CPUID) {
-    unsigned long ftr;
-    getCPUFeature(ID_AA64PFR1_EL1, ftr);
-    /* ID_AA64PFR1_EL1.MTE >= 0b0001  */
-    if (extractBits(ftr, 8, 4) >= 0x1)
-      setCPUFeature(FEAT_MEMTAG);
-    /* ID_AA64PFR1_EL1.SSBS == 0b0001  */
-    if (extractBits(ftr, 4, 4) == 0x1)
-      setCPUFeature(FEAT_SSBS);
-    /* ID_AA64PFR1_EL1.SME == 0b0010  */
-    if (extractBits(ftr, 24, 4) == 0x2)
-      setCPUFeature(FEAT_SME2);
-    getCPUFeature(ID_AA64PFR0_EL1, ftr);
-    /* ID_AA64PFR0_EL1.FP != 0b1111  */
-    if (extractBits(ftr, 16, 4) != 0xF) {
-      setCPUFeature(FEAT_FP);
-      /* ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP  */
-      setCPUFeature(FEAT_SIMD);
-    }
-    /* ID_AA64PFR0_EL1.SVE != 0b0000  */
-    if (extractBits(ftr, 32, 4) != 0x0) {
-      /* get ID_AA64ZFR0_EL1, that name supported if sve enabled only  */
-      getCPUFeature(S3_0_C0_C4_4, ftr);
-      /* ID_AA64ZFR0_EL1.SVEver == 0b0000  */
-      if (extractBits(ftr, 0, 4) == 0x0)
-	setCPUFeature(FEAT_SVE);
-      /* ID_AA64ZFR0_EL1.SVEver == 0b0001  */
-      if (extractBits(ftr, 0, 4) == 0x1)
-	setCPUFeature(FEAT_SVE2);
-      /* ID_AA64ZFR0_EL1.BF16 != 0b0000  */
-      if (extractBits(ftr, 20, 4) != 0x0)
-	setCPUFeature(FEAT_SVE_BF16);
+  if (hwcap & HWCAP_CPUID)
+    {
+      unsigned long ftr;
+
+      getCPUFeature(ID_AA64ISAR1_EL1, ftr);
+      /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001  */
+      if (extractBits(ftr, 40, 4) >= 0x1)
+	setCPUFeature(FEAT_PREDRES);
+      /* ID_AA64ISAR1_EL1.LS64 >= 0b0001  */
+      if (extractBits(ftr, 60, 4) >= 0x1)
+	setCPUFeature(FEAT_LS64);
+      /* ID_AA64ISAR1_EL1.LS64 >= 0b0010  */
+      if (extractBits(ftr, 60, 4) >= 0x2)
+	setCPUFeature(FEAT_LS64_V);
+      /* ID_AA64ISAR1_EL1.LS64 >= 0b0011  */
+      if (extractBits(ftr, 60, 4) >= 0x3)
+	setCPUFeature(FEAT_LS64_ACCDATA);
     }
-    getCPUFeature(ID_AA64ISAR0_EL1, ftr);
-    /* ID_AA64ISAR0_EL1.SHA3 != 0b0000  */
-    if (extractBits(ftr, 32, 4) != 0x0)
-      setCPUFeature(FEAT_SHA3);
-    getCPUFeature(ID_AA64ISAR1_EL1, ftr);
-    /* ID_AA64ISAR1_EL1.DPB >= 0b0001  */
-    if (extractBits(ftr, 0, 4) >= 0x1)
-      setCPUFeature(FEAT_DPB);
-    /* ID_AA64ISAR1_EL1.LRCPC != 0b0000  */
-    if (extractBits(ftr, 20, 4) != 0x0)
-      setCPUFeature(FEAT_RCPC);
-    /* ID_AA64ISAR1_EL1.LRCPC == 0b0011  */
-    if (extractBits(ftr, 20, 4) == 0x3)
-      setCPUFeature(FEAT_RCPC3);
-    /* ID_AA64ISAR1_EL1.SPECRES == 0b0001  */
-    if (extractBits(ftr, 40, 4) == 0x2)
-      setCPUFeature(FEAT_PREDRES);
-    /* ID_AA64ISAR1_EL1.BF16 != 0b0000  */
-    if (extractBits(ftr, 44, 4) != 0x0)
-      setCPUFeature(FEAT_BF16);
-    /* ID_AA64ISAR1_EL1.LS64 >= 0b0001  */
-    if (extractBits(ftr, 60, 4) >= 0x1)
-      setCPUFeature(FEAT_LS64);
-    /* ID_AA64ISAR1_EL1.LS64 >= 0b0010  */
-    if (extractBits(ftr, 60, 4) >= 0x2)
-      setCPUFeature(FEAT_LS64_V);
-    /* ID_AA64ISAR1_EL1.LS64 >= 0b0011  */
-    if (extractBits(ftr, 60, 4) >= 0x3)
-      setCPUFeature(FEAT_LS64_ACCDATA);
-  } else {
-    /* Set some features in case of no CPUID support.  */
-    if (hwcap & (HWCAP_FP | HWCAP_FPHP)) {
+
+  if (hwcap & HWCAP_FP)
+    {
       setCPUFeature(FEAT_FP);
       /* FP and AdvSIMD fields have the same value.  */
       setCPUFeature(FEAT_SIMD);
     }
-    if (hwcap & HWCAP_DCPOP || hwcap2 & HWCAP2_DCPODP)
-      setCPUFeature(FEAT_DPB);
-    if (hwcap & HWCAP_LRCPC || hwcap & HWCAP_ILRCPC)
-      setCPUFeature(FEAT_RCPC);
-    if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16)
-      setCPUFeature(FEAT_BF16);
-    if (hwcap2 & HWCAP2_SVEBF16)
-      setCPUFeature(FEAT_SVE_BF16);
-    if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE)
-      setCPUFeature(FEAT_SVE2);
-    if (hwcap & HWCAP_SHA3)
-      setCPUFeature(FEAT_SHA3);
-  }
+  if (hwcap & HWCAP_DCPOP)
+    setCPUFeature(FEAT_DPB);
+  if (hwcap & HWCAP_LRCPC)
+    setCPUFeature(FEAT_RCPC);
+  if (hwcap & HWCAP_ILRCPC)
+    setCPUFeature(FEAT_RCPC2);
+  if (hwcap2 & HWCAP2_LRCPC3)
+    setCPUFeature(FEAT_RCPC3);
+  if (hwcap2 & HWCAP2_BF16)
+    setCPUFeature(FEAT_BF16);
+  if (hwcap2 & HWCAP2_SVEBF16)
+    setCPUFeature(FEAT_SVE_BF16);
+  if (hwcap & HWCAP_SVE)
+    setCPUFeature(FEAT_SVE);
+  if (hwcap2 & HWCAP2_SVE2)
+    setCPUFeature(FEAT_SVE2);
+  if (hwcap & HWCAP_SHA3)
+    setCPUFeature(FEAT_SHA3);
   setCPUFeature(FEAT_INIT);
+
+  __atomic_store_n (&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED);
 }
 
 void
-__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) {
-  if (__aarch64_cpu_features.features)
+__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg)
+{
+  if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED))
     return;
   __init_cpu_features_constructor(hwcap, arg);
 }
 
 void __attribute__ ((constructor))
-__init_cpu_features(void) {
+__init_cpu_features(void)
+{
   unsigned long hwcap;
   unsigned long hwcap2;
+
   /* CPU features already initialized.  */
-  if (__aarch64_cpu_features.features)
+  if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED))
     return;
   hwcap = getauxval(AT_HWCAP);
   hwcap2 = getauxval(AT_HWCAP2);
Richard Sandiford June 4, 2024, 6:13 p.m. UTC | #3
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
> I've reworded the commit message a bit:
>
> The CPU features initialization code uses CPUID registers (rather than
> HWCAP).  The equality comparisons it uses are incorrect: for example FEAT_SVE
> is not set if SVE2 is available.  Using HWCAPs for these is both simpler and
> correct.  The initialization must also be done atomically to avoid multiple
> threads causing corruption due to non-atomic RMW accesses to the global.

Thanks, sounds good.

>> What criteria did you use for choosing whether to keep or remove
>> the system register checks?
>
> Essentially anything covered by HWCAP doesn't need an explicit check. So I kept
> the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not
> entirely convinced we need these, let alone having 3 individual bits for LS64, but
> that's something for the ACLE spec to sort out). The goal here is to fix all obvious
> bugs so one can use FMV as intended.

Didn't we take the opposite approach for libatomic though?

/* LSE128 atomic support encoded in ID_AA64ISAR0_EL1.Atomic,
   bits[23:20].  The expected value is 0b0011.  Check that.  */

#define AT_FEAT_FIELD(isar0)	(((isar0) >> 20) & 15)

static inline bool
has_lse128 (unsigned long hwcap, const __ifunc_arg_t *features)
{
  if (hwcap & _IFUNC_ARG_HWCAP
      && features->_hwcap2 & HWCAP2_LSE128)
    return true;
  /* A 0 HWCAP2_LSE128 bit may be just as much a sign of missing HWCAP2 bit
     support in older kernels as it is of CPU feature absence.  Try fallback
     method to guarantee LSE128 is not implemented.

     In the absence of HWCAP_CPUID, we are unable to check for LSE128.
     If feature check available, check LSE2 prerequisite before proceeding.  */
  if (!(hwcap & HWCAP_CPUID) || !(hwcap & HWCAP_USCAT))
     return false;

  unsigned long isar0;
  asm volatile ("mrs %0, ID_AA64ISAR0_EL1" : "=r" (isar0));
  if (AT_FEAT_FIELD (isar0) >= 3)
    return true;
  return false;
}

I suppose one difference is that the libatomic code is gating a
choice between a well-defined, curated set of routines, whereas the
libgcc code is providing a general user-facing feature.  So maybe
libgcc should be more conservative for that reason?

Thanks,
Richard
Wilco Dijkstra June 5, 2024, 11:24 a.m. UTC | #4
Hi Richard,

>> Essentially anything covered by HWCAP doesn't need an explicit check. So I kept
>> the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not
>> entirely convinced we need these, let alone having 3 individual bits for LS64, but
>> that's something for the ACLE spec to sort out). The goal here is to fix all obvious
>> bugs so one can use FMV as intended.
>
> Didn't we take the opposite approach for libatomic though?

We started the work before LSE128/RCPC3 HWCAPs were added, so there was no
alternative at the time. Checking both means a higher QoI, but once most distros
use modern kernels, the CPUID checks become unnecessary and will be removed.

> I suppose one difference is that the libatomic code is gating a
> choice between a well-defined, curated set of routines, whereas the
> libgcc code is providing a general user-facing feature.  So maybe
> libgcc should be more conservative for that reason?

Indeed. Using HWCAP means it's trivially correct and works identically between GCC
and LLVM.

I don't rule out adding extra CPUID checks for some features. However unlike libatomic,
the selected features are very user visible, so we would need to specify for which features
this is both useful and correct, and make sure GCC and LLVM behave in the same way.
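
For illustration of the user-facing side, here is a hypothetical FMV use (not part of the patch) that depends on these bits being set correctly.  It assumes AArch64 function multi-versioning support via the target_clones attribute and the ACLE feature name "sve2".

/* Hypothetical example, assuming AArch64 FMV (target_clones) support
   and the ACLE "sve2" feature name.  The compiler-generated ifunc
   resolver is expected to initialize and test the runtime feature
   bits this patch fixes, so the "sve2" clone is only selected when
   FEAT_SVE2 is really set.  */
__attribute__ ((target_clones ("sve2", "default")))
long
count_even (const long *p, long n)
{
  long c = 0;
  for (long i = 0; i < n; i++)
    c += (p[i] & 1) == 0;
  return c;
}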

Cheers,
Wilco
Richard Sandiford June 5, 2024, 12:38 p.m. UTC | #5
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>>> Essentially anything covered by HWCAP doesn't need an explicit check. So I kept
>>> the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not
>>> entirely convinced we need these, let alone having 3 individual bits for LS64, but
>>> that's something for the ACLE spec to sort out). The goal here is to fix all obvious
>>> bugs so one can use FMV as intended.
>>
>> Didn't we take the opposite approach for libatomic though?
>
> We started the work before LSE128/RCPC3 HWCAPs were added, so there was no
> alternative at the time. Checking both means a higher QoI, but once most distros
> use modern kernels, the CPUID checks become unnecessary and will be removed.
>
>> I suppose one difference is that the libatomic code is gating a
>> choice between a well-defined, curated set of routines, whereas the
>> libgcc code is providing a general user-facing feature.  So maybe
>> libgcc should be more conservative for that reason?
>
> Indeed. Using HWCAP means it's trivially correct and works identically between GCC
> and LLVM.
>
> I don't rule out adding extra CPUID checks for some features. However unlike libatomic,
> the selected features are very user visible, so we would need to specify for which features
> this is both useful and correct, and make sure GCC and LLVM behave in the same way.

Thanks, makes sense.  On that basis, the patch is OK for trunk and GCC
14 branch with the previously discussed changes to the changelog &
commit message.

Richard