Message ID | PAWPR08MB8982E391F1465FF711AB32D783F82@PAWPR08MB8982.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | PATCH] AArch64: Fix cpu features initialization [PR115342] | expand |
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes: > Fix CPU features initialization. Use HWCAP rather than explicit accesses > to CPUID registers. Perform the initialization atomically to avoid multi- > threading issues. Please describe the problem that the patch is fixing. I think the PR description would make a better commit message: ----------------------------------------------------------------------- The CPU features initialization code uses CPUID registers. It uses incorrect comparisons so that for example SVE is not set if SVE2 is available. Using HWCAPs for these is both simpler and works correctly. The initialization must also be done atomically so to avoid multiple threads causing corruption due to non-atomic RMW of the global. ----------------------------------------------------------------------- What criteria did you use for choosing whether to keep or remove the system register checks? > Passes regress, OK for commit and backport? > > libgcc: > PR target/115342 > * config/aarch64/cpuinfo.c (__init_cpu_features_constructor): > Use HWCAP where possible. Use atomic write for initialization. It'd be good to mention the fix for the FEAT_PREDRES system register check as well. > (__init_cpu_features_resolver): Use atomic load for correct > initialization. > (__init_cpu_features): Likewise. Thanks, Richard > > --- > > diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c > index 4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9 100644 > --- a/libgcc/config/aarch64/cpuinfo.c > +++ b/libgcc/config/aarch64/cpuinfo.c > @@ -227,14 +227,22 @@ struct { > #ifndef HWCAP2_SVE_EBF16 > #define HWCAP2_SVE_EBF16 (1UL << 33) > #endif > +#ifndef HWCAP2_SME2 > +#define HWCAP2_SME2 (1UL << 37) > +#endif > +#ifndef HWCAP2_LRCPC3 > +#define HWCAP2_LRCPC3 (1UL << 46) > +#endif > > static void > -__init_cpu_features_constructor(unsigned long hwcap, > - const __ifunc_arg_t *arg) { > -#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F > +__init_cpu_features_constructor (unsigned long hwcap, > + const __ifunc_arg_t *arg) > +{ > + unsigned long feat = 0; > +#define setCPUFeature(F) feat |= 1UL << F > #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr)) > #define extractBits(val, start, number) \ > - (val & ((1ULL << number) - 1ULL) << start) >> start > + (val & ((1UL << number) - 1UL) << start) >> start > unsigned long hwcap2 = 0; > if (hwcap & _IFUNC_ARG_HWCAP) > hwcap2 = arg->_hwcap2; > @@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap, > setCPUFeature(FEAT_PMULL); > if (hwcap & HWCAP_FLAGM) > setCPUFeature(FEAT_FLAGM); > - if (hwcap2 & HWCAP2_FLAGM2) { > - setCPUFeature(FEAT_FLAGM); > + if (hwcap2 & HWCAP2_FLAGM2) > setCPUFeature(FEAT_FLAGM2); > - } > - if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4) > + if (hwcap & HWCAP_SM4) > setCPUFeature(FEAT_SM4); > if (hwcap & HWCAP_ASIMDDP) > setCPUFeature(FEAT_DOTPROD); > if (hwcap & HWCAP_ASIMDFHM) > setCPUFeature(FEAT_FP16FML); > - if (hwcap & HWCAP_FPHP) { > + if (hwcap & HWCAP_FPHP) > setCPUFeature(FEAT_FP16); > - setCPUFeature(FEAT_FP); > - } > if (hwcap & HWCAP_DIT) > setCPUFeature(FEAT_DIT); > if (hwcap & HWCAP_ASIMDRDM) > setCPUFeature(FEAT_RDM); > - if (hwcap & HWCAP_ILRCPC) > - setCPUFeature(FEAT_RCPC2); > if (hwcap & HWCAP_AES) > setCPUFeature(FEAT_AES); > if (hwcap & HWCAP_SHA1) > @@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap, > if (hwcap & HWCAP_SB) > setCPUFeature(FEAT_SB); > if (hwcap & HWCAP_SSBS) > - setCPUFeature(FEAT_SSBS2); > - if (hwcap2 & HWCAP2_MTE) { > - setCPUFeature(FEAT_MEMTAG); > - setCPUFeature(FEAT_MEMTAG2); > - } > - if (hwcap2 & HWCAP2_MTE3) { > - setCPUFeature(FEAT_MEMTAG); > - setCPUFeature(FEAT_MEMTAG2); > + { > + setCPUFeature(FEAT_SSBS); > + setCPUFeature(FEAT_SSBS2); > + } > + if (hwcap2 & HWCAP2_MTE) > + { > + setCPUFeature(FEAT_MEMTAG); > + setCPUFeature(FEAT_MEMTAG2); > + } > + if (hwcap2 & HWCAP2_MTE3) > setCPUFeature(FEAT_MEMTAG3); > - } > if (hwcap2 & HWCAP2_SVEAES) > setCPUFeature(FEAT_SVE_AES); > - if (hwcap2 & HWCAP2_SVEPMULL) { > - setCPUFeature(FEAT_SVE_AES); > + if (hwcap2 & HWCAP2_SVEPMULL) > setCPUFeature(FEAT_SVE_PMULL128); > - } > if (hwcap2 & HWCAP2_SVEBITPERM) > setCPUFeature(FEAT_SVE_BITPERM); > if (hwcap2 & HWCAP2_SVESHA3) > @@ -329,108 +330,76 @@ __init_cpu_features_constructor(unsigned long hwcap, > setCPUFeature(FEAT_WFXT); > if (hwcap2 & HWCAP2_SME) > setCPUFeature(FEAT_SME); > + if (hwcap2 & HWCAP2_SME2) > + setCPUFeature(FEAT_SME2); > if (hwcap2 & HWCAP2_SME_I16I64) > setCPUFeature(FEAT_SME_I64); > if (hwcap2 & HWCAP2_SME_F64F64) > setCPUFeature(FEAT_SME_F64); > - if (hwcap & HWCAP_CPUID) { > - unsigned long ftr; > - getCPUFeature(ID_AA64PFR1_EL1, ftr); > - /* ID_AA64PFR1_EL1.MTE >= 0b0001 */ > - if (extractBits(ftr, 8, 4) >= 0x1) > - setCPUFeature(FEAT_MEMTAG); > - /* ID_AA64PFR1_EL1.SSBS == 0b0001 */ > - if (extractBits(ftr, 4, 4) == 0x1) > - setCPUFeature(FEAT_SSBS); > - /* ID_AA64PFR1_EL1.SME == 0b0010 */ > - if (extractBits(ftr, 24, 4) == 0x2) > - setCPUFeature(FEAT_SME2); > - getCPUFeature(ID_AA64PFR0_EL1, ftr); > - /* ID_AA64PFR0_EL1.FP != 0b1111 */ > - if (extractBits(ftr, 16, 4) != 0xF) { > - setCPUFeature(FEAT_FP); > - /* ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP */ > - setCPUFeature(FEAT_SIMD); > - } > - /* ID_AA64PFR0_EL1.SVE != 0b0000 */ > - if (extractBits(ftr, 32, 4) != 0x0) { > - /* get ID_AA64ZFR0_EL1, that name supported if sve enabled only */ > - getCPUFeature(S3_0_C0_C4_4, ftr); > - /* ID_AA64ZFR0_EL1.SVEver == 0b0000 */ > - if (extractBits(ftr, 0, 4) == 0x0) > - setCPUFeature(FEAT_SVE); > - /* ID_AA64ZFR0_EL1.SVEver == 0b0001 */ > - if (extractBits(ftr, 0, 4) == 0x1) > - setCPUFeature(FEAT_SVE2); > - /* ID_AA64ZFR0_EL1.BF16 != 0b0000 */ > - if (extractBits(ftr, 20, 4) != 0x0) > - setCPUFeature(FEAT_SVE_BF16); > + if (hwcap & HWCAP_CPUID) > + { > + unsigned long ftr; > + > + getCPUFeature(ID_AA64ISAR1_EL1, ftr); > + /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001 */ > + if (extractBits(ftr, 40, 4) >= 0x1) > + setCPUFeature(FEAT_PREDRES); > + /* ID_AA64ISAR1_EL1.LS64 >= 0b0001 */ > + if (extractBits(ftr, 60, 4) >= 0x1) > + setCPUFeature(FEAT_LS64); > + /* ID_AA64ISAR1_EL1.LS64 >= 0b0010 */ > + if (extractBits(ftr, 60, 4) >= 0x2) > + setCPUFeature(FEAT_LS64_V); > + /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ > + if (extractBits(ftr, 60, 4) >= 0x3) > + setCPUFeature(FEAT_LS64_ACCDATA); > } > - getCPUFeature(ID_AA64ISAR0_EL1, ftr); > - /* ID_AA64ISAR0_EL1.SHA3 != 0b0000 */ > - if (extractBits(ftr, 32, 4) != 0x0) > - setCPUFeature(FEAT_SHA3); > - getCPUFeature(ID_AA64ISAR1_EL1, ftr); > - /* ID_AA64ISAR1_EL1.DPB >= 0b0001 */ > - if (extractBits(ftr, 0, 4) >= 0x1) > - setCPUFeature(FEAT_DPB); > - /* ID_AA64ISAR1_EL1.LRCPC != 0b0000 */ > - if (extractBits(ftr, 20, 4) != 0x0) > - setCPUFeature(FEAT_RCPC); > - /* ID_AA64ISAR1_EL1.LRCPC == 0b0011 */ > - if (extractBits(ftr, 20, 4) == 0x3) > - setCPUFeature(FEAT_RCPC3); > - /* ID_AA64ISAR1_EL1.SPECRES == 0b0001 */ > - if (extractBits(ftr, 40, 4) == 0x2) > - setCPUFeature(FEAT_PREDRES); > - /* ID_AA64ISAR1_EL1.BF16 != 0b0000 */ > - if (extractBits(ftr, 44, 4) != 0x0) > - setCPUFeature(FEAT_BF16); > - /* ID_AA64ISAR1_EL1.LS64 >= 0b0001 */ > - if (extractBits(ftr, 60, 4) >= 0x1) > - setCPUFeature(FEAT_LS64); > - /* ID_AA64ISAR1_EL1.LS64 >= 0b0010 */ > - if (extractBits(ftr, 60, 4) >= 0x2) > - setCPUFeature(FEAT_LS64_V); > - /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ > - if (extractBits(ftr, 60, 4) >= 0x3) > - setCPUFeature(FEAT_LS64_ACCDATA); > - } else { > - /* Set some features in case of no CPUID support. */ > - if (hwcap & (HWCAP_FP | HWCAP_FPHP)) { > + > + if (hwcap & HWCAP_FP) > + { > setCPUFeature(FEAT_FP); > /* FP and AdvSIMD fields have the same value. */ > setCPUFeature(FEAT_SIMD); > } > - if (hwcap & HWCAP_DCPOP || hwcap2 & HWCAP2_DCPODP) > - setCPUFeature(FEAT_DPB); > - if (hwcap & HWCAP_LRCPC || hwcap & HWCAP_ILRCPC) > - setCPUFeature(FEAT_RCPC); > - if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16) > - setCPUFeature(FEAT_BF16); > - if (hwcap2 & HWCAP2_SVEBF16) > - setCPUFeature(FEAT_SVE_BF16); > - if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE) > - setCPUFeature(FEAT_SVE2); > - if (hwcap & HWCAP_SHA3) > - setCPUFeature(FEAT_SHA3); > - } > + if (hwcap & HWCAP_DCPOP) > + setCPUFeature(FEAT_DPB); > + if (hwcap & HWCAP_LRCPC) > + setCPUFeature(FEAT_RCPC); > + if (hwcap & HWCAP_ILRCPC) > + setCPUFeature(FEAT_RCPC2); > + if (hwcap2 & HWCAP2_LRCPC3) > + setCPUFeature(FEAT_RCPC3); > + if (hwcap2 & HWCAP2_BF16) > + setCPUFeature(FEAT_BF16); > + if (hwcap2 & HWCAP2_SVEBF16) > + setCPUFeature(FEAT_SVE_BF16); > + if (hwcap & HWCAP_SVE) > + setCPUFeature(FEAT_SVE); > + if (hwcap2 & HWCAP2_SVE2) > + setCPUFeature(FEAT_SVE2); > + if (hwcap & HWCAP_SHA3) > + setCPUFeature(FEAT_SHA3); > setCPUFeature(FEAT_INIT); > + > + __atomic_store_n (&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED); > } > > void > -__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) { > - if (__aarch64_cpu_features.features) > +__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) > +{ > + if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) > return; > __init_cpu_features_constructor(hwcap, arg); > } > > void __attribute__ ((constructor)) > -__init_cpu_features(void) { > +__init_cpu_features(void) > +{ > unsigned long hwcap; > unsigned long hwcap2; > + > /* CPU features already initialized. */ > - if (__aarch64_cpu_features.features) > + if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) > return; > hwcap = getauxval(AT_HWCAP); > hwcap2 = getauxval(AT_HWCAP2);
Hi Richard, I've reworded the commit message a bit: The CPU features initialization code uses CPUID registers (rather than HWCAP). The equality comparisons it uses are incorrect: for example FEAT_SVE is not set if SVE2 is available. Using HWCAPs for these is both simpler and correct. The initialization must also be done atomically to avoid multiple threads causing corruption due to non-atomic RMW accesses to the global. > What criteria did you use for choosing whether to keep or remove > the system register checks? Essentially anything covered by HWCAP doesn't need an explicit check. So I kept the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not entirely convinced we need these, let alone having 3 individual bits for LS64, but that's something for the ACLE spec to sort out). The goal here is to fix all obvious bugs so one can use FMV as intended. > Passes regress, OK for commit and backport? > > libgcc: > PR target/115342 > * config/aarch64/cpuinfo.c (__init_cpu_features_constructor): > Use HWCAP where possible. Use atomic write for initialization. > It'd be good to mention the fix for the FEAT_PREDRES system register check > as well. Done, see below. Cheers, Wilco v2: Update commit message and mention PREDRES. The CPU features initialization code uses CPUID registers (rather than HWCAP). The equality comparisons it uses are incorrect: for example FEAT_SVE is not set if SVE2 is available. Using HWCAPs for these is both simpler and correct. The initialization must also be done atomically to avoid multiple threads causing corruption due to non-atomic RMW accesses to the global. Passes regress, OK for commit and backport? libgcc: PR target/115342 * config/aarch64/cpuinfo.c (__init_cpu_features_constructor): Use HWCAP where possible. Use atomic write for initialization. Fix FEAT_PREDRES comparison. (__init_cpu_features_resolver): Use atomic load for correct initialization. (__init_cpu_features): Likewise. --- diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c index 4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9 100644 --- a/libgcc/config/aarch64/cpuinfo.c +++ b/libgcc/config/aarch64/cpuinfo.c @@ -227,14 +227,22 @@ struct { #ifndef HWCAP2_SVE_EBF16 #define HWCAP2_SVE_EBF16 (1UL << 33) #endif +#ifndef HWCAP2_SME2 +#define HWCAP2_SME2 (1UL << 37) +#endif +#ifndef HWCAP2_LRCPC3 +#define HWCAP2_LRCPC3 (1UL << 46) +#endif static void -__init_cpu_features_constructor(unsigned long hwcap, - const __ifunc_arg_t *arg) { -#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F +__init_cpu_features_constructor (unsigned long hwcap, + const __ifunc_arg_t *arg) +{ + unsigned long feat = 0; +#define setCPUFeature(F) feat |= 1UL << F #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr)) #define extractBits(val, start, number) \ - (val & ((1ULL << number) - 1ULL) << start) >> start + (val & ((1UL << number) - 1UL) << start) >> start unsigned long hwcap2 = 0; if (hwcap & _IFUNC_ARG_HWCAP) hwcap2 = arg->_hwcap2; @@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_PMULL); if (hwcap & HWCAP_FLAGM) setCPUFeature(FEAT_FLAGM); - if (hwcap2 & HWCAP2_FLAGM2) { - setCPUFeature(FEAT_FLAGM); + if (hwcap2 & HWCAP2_FLAGM2) setCPUFeature(FEAT_FLAGM2); - } - if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4) + if (hwcap & HWCAP_SM4) setCPUFeature(FEAT_SM4); if (hwcap & HWCAP_ASIMDDP) setCPUFeature(FEAT_DOTPROD); if (hwcap & HWCAP_ASIMDFHM) setCPUFeature(FEAT_FP16FML); - if (hwcap & HWCAP_FPHP) { + if (hwcap & HWCAP_FPHP) setCPUFeature(FEAT_FP16); - setCPUFeature(FEAT_FP); - } if (hwcap & HWCAP_DIT) setCPUFeature(FEAT_DIT); if (hwcap & HWCAP_ASIMDRDM) setCPUFeature(FEAT_RDM); - if (hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC2); if (hwcap & HWCAP_AES) setCPUFeature(FEAT_AES); if (hwcap & HWCAP_SHA1) @@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap, if (hwcap & HWCAP_SB) setCPUFeature(FEAT_SB); if (hwcap & HWCAP_SSBS) - setCPUFeature(FEAT_SSBS2); - if (hwcap2 & HWCAP2_MTE) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - } - if (hwcap2 & HWCAP2_MTE3) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); + { + setCPUFeature(FEAT_SSBS); + setCPUFeature(FEAT_SSBS2); + } + if (hwcap2 & HWCAP2_MTE) + { + setCPUFeature(FEAT_MEMTAG); + setCPUFeature(FEAT_MEMTAG2); + } + if (hwcap2 & HWCAP2_MTE3) setCPUFeature(FEAT_MEMTAG3); - } if (hwcap2 & HWCAP2_SVEAES) setCPUFeature(FEAT_SVE_AES); - if (hwcap2 & HWCAP2_SVEPMULL) { - setCPUFeature(FEAT_SVE_AES); + if (hwcap2 & HWCAP2_SVEPMULL) setCPUFeature(FEAT_SVE_PMULL128); - } if (hwcap2 & HWCAP2_SVEBITPERM) setCPUFeature(FEAT_SVE_BITPERM); if (hwcap2 & HWCAP2_SVESHA3) @@ -329,108 +330,76 @@ __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_WFXT); if (hwcap2 & HWCAP2_SME) setCPUFeature(FEAT_SME); + if (hwcap2 & HWCAP2_SME2) + setCPUFeature(FEAT_SME2); if (hwcap2 & HWCAP2_SME_I16I64) setCPUFeature(FEAT_SME_I64); if (hwcap2 & HWCAP2_SME_F64F64) setCPUFeature(FEAT_SME_F64); - if (hwcap & HWCAP_CPUID) { - unsigned long ftr; - getCPUFeature(ID_AA64PFR1_EL1, ftr); - /* ID_AA64PFR1_EL1.MTE >= 0b0001 */ - if (extractBits(ftr, 8, 4) >= 0x1) - setCPUFeature(FEAT_MEMTAG); - /* ID_AA64PFR1_EL1.SSBS == 0b0001 */ - if (extractBits(ftr, 4, 4) == 0x1) - setCPUFeature(FEAT_SSBS); - /* ID_AA64PFR1_EL1.SME == 0b0010 */ - if (extractBits(ftr, 24, 4) == 0x2) - setCPUFeature(FEAT_SME2); - getCPUFeature(ID_AA64PFR0_EL1, ftr); - /* ID_AA64PFR0_EL1.FP != 0b1111 */ - if (extractBits(ftr, 16, 4) != 0xF) { - setCPUFeature(FEAT_FP); - /* ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP */ - setCPUFeature(FEAT_SIMD); - } - /* ID_AA64PFR0_EL1.SVE != 0b0000 */ - if (extractBits(ftr, 32, 4) != 0x0) { - /* get ID_AA64ZFR0_EL1, that name supported if sve enabled only */ - getCPUFeature(S3_0_C0_C4_4, ftr); - /* ID_AA64ZFR0_EL1.SVEver == 0b0000 */ - if (extractBits(ftr, 0, 4) == 0x0) - setCPUFeature(FEAT_SVE); - /* ID_AA64ZFR0_EL1.SVEver == 0b0001 */ - if (extractBits(ftr, 0, 4) == 0x1) - setCPUFeature(FEAT_SVE2); - /* ID_AA64ZFR0_EL1.BF16 != 0b0000 */ - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_SVE_BF16); + if (hwcap & HWCAP_CPUID) + { + unsigned long ftr; + + getCPUFeature(ID_AA64ISAR1_EL1, ftr); + /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001 */ + if (extractBits(ftr, 40, 4) >= 0x1) + setCPUFeature(FEAT_PREDRES); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0001 */ + if (extractBits(ftr, 60, 4) >= 0x1) + setCPUFeature(FEAT_LS64); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0010 */ + if (extractBits(ftr, 60, 4) >= 0x2) + setCPUFeature(FEAT_LS64_V); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ + if (extractBits(ftr, 60, 4) >= 0x3) + setCPUFeature(FEAT_LS64_ACCDATA); } - getCPUFeature(ID_AA64ISAR0_EL1, ftr); - /* ID_AA64ISAR0_EL1.SHA3 != 0b0000 */ - if (extractBits(ftr, 32, 4) != 0x0) - setCPUFeature(FEAT_SHA3); - getCPUFeature(ID_AA64ISAR1_EL1, ftr); - /* ID_AA64ISAR1_EL1.DPB >= 0b0001 */ - if (extractBits(ftr, 0, 4) >= 0x1) - setCPUFeature(FEAT_DPB); - /* ID_AA64ISAR1_EL1.LRCPC != 0b0000 */ - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_RCPC); - /* ID_AA64ISAR1_EL1.LRCPC == 0b0011 */ - if (extractBits(ftr, 20, 4) == 0x3) - setCPUFeature(FEAT_RCPC3); - /* ID_AA64ISAR1_EL1.SPECRES == 0b0001 */ - if (extractBits(ftr, 40, 4) == 0x2) - setCPUFeature(FEAT_PREDRES); - /* ID_AA64ISAR1_EL1.BF16 != 0b0000 */ - if (extractBits(ftr, 44, 4) != 0x0) - setCPUFeature(FEAT_BF16); - /* ID_AA64ISAR1_EL1.LS64 >= 0b0001 */ - if (extractBits(ftr, 60, 4) >= 0x1) - setCPUFeature(FEAT_LS64); - /* ID_AA64ISAR1_EL1.LS64 >= 0b0010 */ - if (extractBits(ftr, 60, 4) >= 0x2) - setCPUFeature(FEAT_LS64_V); - /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ - if (extractBits(ftr, 60, 4) >= 0x3) - setCPUFeature(FEAT_LS64_ACCDATA); - } else { - /* Set some features in case of no CPUID support. */ - if (hwcap & (HWCAP_FP | HWCAP_FPHP)) { + + if (hwcap & HWCAP_FP) + { setCPUFeature(FEAT_FP); /* FP and AdvSIMD fields have the same value. */ setCPUFeature(FEAT_SIMD); } - if (hwcap & HWCAP_DCPOP || hwcap2 & HWCAP2_DCPODP) - setCPUFeature(FEAT_DPB); - if (hwcap & HWCAP_LRCPC || hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC); - if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16) - setCPUFeature(FEAT_BF16); - if (hwcap2 & HWCAP2_SVEBF16) - setCPUFeature(FEAT_SVE_BF16); - if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE) - setCPUFeature(FEAT_SVE2); - if (hwcap & HWCAP_SHA3) - setCPUFeature(FEAT_SHA3); - } + if (hwcap & HWCAP_DCPOP) + setCPUFeature(FEAT_DPB); + if (hwcap & HWCAP_LRCPC) + setCPUFeature(FEAT_RCPC); + if (hwcap & HWCAP_ILRCPC) + setCPUFeature(FEAT_RCPC2); + if (hwcap2 & HWCAP2_LRCPC3) + setCPUFeature(FEAT_RCPC3); + if (hwcap2 & HWCAP2_BF16) + setCPUFeature(FEAT_BF16); + if (hwcap2 & HWCAP2_SVEBF16) + setCPUFeature(FEAT_SVE_BF16); + if (hwcap & HWCAP_SVE) + setCPUFeature(FEAT_SVE); + if (hwcap2 & HWCAP2_SVE2) + setCPUFeature(FEAT_SVE2); + if (hwcap & HWCAP_SHA3) + setCPUFeature(FEAT_SHA3); setCPUFeature(FEAT_INIT); + + __atomic_store_n (&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED); } void -__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) { - if (__aarch64_cpu_features.features) +__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) +{ + if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; __init_cpu_features_constructor(hwcap, arg); } void __attribute__ ((constructor)) -__init_cpu_features(void) { +__init_cpu_features(void) +{ unsigned long hwcap; unsigned long hwcap2; + /* CPU features already initialized. */ - if (__aarch64_cpu_features.features) + if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; hwcap = getauxval(AT_HWCAP); hwcap2 = getauxval(AT_HWCAP2);
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes: > Hi Richard, > > I've reworded the commit message a bit: > > The CPU features initialization code uses CPUID registers (rather than > HWCAP). The equality comparisons it uses are incorrect: for example FEAT_SVE > is not set if SVE2 is available. Using HWCAPs for these is both simpler and > correct. The initialization must also be done atomically to avoid multiple > threads causing corruption due to non-atomic RMW accesses to the global. Thanks, sounds good. >> What criteria did you use for choosing whether to keep or remove >> the system register checks? > > Essentially anything covered by HWCAP doesn't need an explicit check. So I kept > the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not > entirely convinced we need these, let alone having 3 individual bits for LS64, but > that's something for the ACLE spec to sort out). The goal here is to fix all obvious > bugs so one can use FMV as intended. Didn't we take the opposite approach for libatomic though? /* LSE128 atomic support encoded in ID_AA64ISAR0_EL1.Atomic, bits[23:20]. The expected value is 0b0011. Check that. */ #define AT_FEAT_FIELD(isar0) (((isar0) >> 20) & 15) static inline bool has_lse128 (unsigned long hwcap, const __ifunc_arg_t *features) { if (hwcap & _IFUNC_ARG_HWCAP && features->_hwcap2 & HWCAP2_LSE128) return true; /* A 0 HWCAP2_LSE128 bit may be just as much a sign of missing HWCAP2 bit support in older kernels as it is of CPU feature absence. Try fallback method to guarantee LSE128 is not implemented. In the absence of HWCAP_CPUID, we are unable to check for LSE128. If feature check available, check LSE2 prerequisite before proceeding. */ if (!(hwcap & HWCAP_CPUID) || !(hwcap & HWCAP_USCAT)) return false; unsigned long isar0; asm volatile ("mrs %0, ID_AA64ISAR0_EL1" : "=r" (isar0)); if (AT_FEAT_FIELD (isar0) >= 3) return true; return false; } I suppose one difference is that the libatomic code is gating a choice between a well-defined, curated set of routines, whereas the libgcc code is providing a general user-facing feature. So maybe libgcc should be more conservative for that reason? Thanks, Richard
Hi Richard, >> Essentially anything covered by HWCAP doesn't need an explicit check. So I kept >> the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not >> entirely convinced we need these, let alone having 3 individual bits for LS64, but >> that's something for the ACLE spec to sort out). The goal here is to fix all obvious >> bugs so one can use FMV as intended. > > Didn't we take the opposite approach for libatomic though? We started the work before LSE128/RCPC3 HWCAPs were added, so there was no alternative at the time. Checking both means a higher QoI, but once most distros use modern kernels, the CPUID checks become unnecessary and will be removed. > I suppose one difference is that the libatomic code is gating a > choice between a well-defined, curated set of routines, whereas the > libgcc code is providing a general user-facing feature. So maybe > libgcc should be more conservative for that reason? Indeed. Using HWCAP means it's trivially correct and working identically between GCC and LLVM. I don't rule out adding extra CPUID checks for some features. However unlike libatomic, the selected features are very user visible, so we would need to specify for which features this is both useful and correct, and make sure GCC and LLVM behave in the same way. Cheers, Wilco
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes: > Hi Richard, > >>> Essentially anything covered by HWCAP doesn't need an explicit check. So I kept >>> the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not >>> entirely convinced we need these, let alone having 3 individual bits for LS64, but >>> that's something for the ACLE spec to sort out). The goal here is to fix all obvious >>> bugs so one can use FMV as intended. >> >> Didn't we take the opposite approach for libatomic though? > > We started the work before LSE128/RCPC3 HWCAPs were added, so there was no > alternative at the time. Checking both means a higher QoI, but once most distros > use modern kernels, the CPUID checks become unnecessary and will be removed. > >> I suppose one difference is that the libatomic code is gating a >> choice between a well-defined, curated set of routines, whereas the >> libgcc code is providing a general user-facing feature. So maybe >> libgcc should be more conservative for that reason? > > Indeed. Using HWCAP means it's trivially correct and working identically between GCC > and LLVM. > > I don't rule out adding extra CPUID checks for some features. However unlike libatomic, > the selected features are very user visible, so we would need to specify for which features > this is both useful and correct, and make sure GCC and LLVM behave in the same way. Thanks, makes sense. On that basis, the patch is OK for trunk and GCC 14 branch with the previously discussed changes to the changelog & commit message. Richard
diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c index 4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9 100644 --- a/libgcc/config/aarch64/cpuinfo.c +++ b/libgcc/config/aarch64/cpuinfo.c @@ -227,14 +227,22 @@ struct { #ifndef HWCAP2_SVE_EBF16 #define HWCAP2_SVE_EBF16 (1UL << 33) #endif +#ifndef HWCAP2_SME2 +#define HWCAP2_SME2 (1UL << 37) +#endif +#ifndef HWCAP2_LRCPC3 +#define HWCAP2_LRCPC3 (1UL << 46) +#endif static void -__init_cpu_features_constructor(unsigned long hwcap, - const __ifunc_arg_t *arg) { -#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F +__init_cpu_features_constructor (unsigned long hwcap, + const __ifunc_arg_t *arg) +{ + unsigned long feat = 0; +#define setCPUFeature(F) feat |= 1UL << F #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr)) #define extractBits(val, start, number) \ - (val & ((1ULL << number) - 1ULL) << start) >> start + (val & ((1UL << number) - 1UL) << start) >> start unsigned long hwcap2 = 0; if (hwcap & _IFUNC_ARG_HWCAP) hwcap2 = arg->_hwcap2; @@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_PMULL); if (hwcap & HWCAP_FLAGM) setCPUFeature(FEAT_FLAGM); - if (hwcap2 & HWCAP2_FLAGM2) { - setCPUFeature(FEAT_FLAGM); + if (hwcap2 & HWCAP2_FLAGM2) setCPUFeature(FEAT_FLAGM2); - } - if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4) + if (hwcap & HWCAP_SM4) setCPUFeature(FEAT_SM4); if (hwcap & HWCAP_ASIMDDP) setCPUFeature(FEAT_DOTPROD); if (hwcap & HWCAP_ASIMDFHM) setCPUFeature(FEAT_FP16FML); - if (hwcap & HWCAP_FPHP) { + if (hwcap & HWCAP_FPHP) setCPUFeature(FEAT_FP16); - setCPUFeature(FEAT_FP); - } if (hwcap & HWCAP_DIT) setCPUFeature(FEAT_DIT); if (hwcap & HWCAP_ASIMDRDM) setCPUFeature(FEAT_RDM); - if (hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC2); if (hwcap & HWCAP_AES) setCPUFeature(FEAT_AES); if (hwcap & HWCAP_SHA1) @@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap, if (hwcap & HWCAP_SB) setCPUFeature(FEAT_SB); if (hwcap & HWCAP_SSBS) - setCPUFeature(FEAT_SSBS2); - if (hwcap2 & HWCAP2_MTE) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - } - if (hwcap2 & HWCAP2_MTE3) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); + { + setCPUFeature(FEAT_SSBS); + setCPUFeature(FEAT_SSBS2); + } + if (hwcap2 & HWCAP2_MTE) + { + setCPUFeature(FEAT_MEMTAG); + setCPUFeature(FEAT_MEMTAG2); + } + if (hwcap2 & HWCAP2_MTE3) setCPUFeature(FEAT_MEMTAG3); - } if (hwcap2 & HWCAP2_SVEAES) setCPUFeature(FEAT_SVE_AES); - if (hwcap2 & HWCAP2_SVEPMULL) { - setCPUFeature(FEAT_SVE_AES); + if (hwcap2 & HWCAP2_SVEPMULL) setCPUFeature(FEAT_SVE_PMULL128); - } if (hwcap2 & HWCAP2_SVEBITPERM) setCPUFeature(FEAT_SVE_BITPERM); if (hwcap2 & HWCAP2_SVESHA3) @@ -329,108 +330,76 @@ __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_WFXT); if (hwcap2 & HWCAP2_SME) setCPUFeature(FEAT_SME); + if (hwcap2 & HWCAP2_SME2) + setCPUFeature(FEAT_SME2); if (hwcap2 & HWCAP2_SME_I16I64) setCPUFeature(FEAT_SME_I64); if (hwcap2 & HWCAP2_SME_F64F64) setCPUFeature(FEAT_SME_F64); - if (hwcap & HWCAP_CPUID) { - unsigned long ftr; - getCPUFeature(ID_AA64PFR1_EL1, ftr); - /* ID_AA64PFR1_EL1.MTE >= 0b0001 */ - if (extractBits(ftr, 8, 4) >= 0x1) - setCPUFeature(FEAT_MEMTAG); - /* ID_AA64PFR1_EL1.SSBS == 0b0001 */ - if (extractBits(ftr, 4, 4) == 0x1) - setCPUFeature(FEAT_SSBS); - /* ID_AA64PFR1_EL1.SME == 0b0010 */ - if (extractBits(ftr, 24, 4) == 0x2) - setCPUFeature(FEAT_SME2); - getCPUFeature(ID_AA64PFR0_EL1, ftr); - /* ID_AA64PFR0_EL1.FP != 0b1111 */ - if (extractBits(ftr, 16, 4) != 0xF) { - setCPUFeature(FEAT_FP); - /* ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP */ - setCPUFeature(FEAT_SIMD); - } - /* ID_AA64PFR0_EL1.SVE != 0b0000 */ - if (extractBits(ftr, 32, 4) != 0x0) { - /* get ID_AA64ZFR0_EL1, that name supported if sve enabled only */ - getCPUFeature(S3_0_C0_C4_4, ftr); - /* ID_AA64ZFR0_EL1.SVEver == 0b0000 */ - if (extractBits(ftr, 0, 4) == 0x0) - setCPUFeature(FEAT_SVE); - /* ID_AA64ZFR0_EL1.SVEver == 0b0001 */ - if (extractBits(ftr, 0, 4) == 0x1) - setCPUFeature(FEAT_SVE2); - /* ID_AA64ZFR0_EL1.BF16 != 0b0000 */ - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_SVE_BF16); + if (hwcap & HWCAP_CPUID) + { + unsigned long ftr; + + getCPUFeature(ID_AA64ISAR1_EL1, ftr); + /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001 */ + if (extractBits(ftr, 40, 4) >= 0x1) + setCPUFeature(FEAT_PREDRES); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0001 */ + if (extractBits(ftr, 60, 4) >= 0x1) + setCPUFeature(FEAT_LS64); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0010 */ + if (extractBits(ftr, 60, 4) >= 0x2) + setCPUFeature(FEAT_LS64_V); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ + if (extractBits(ftr, 60, 4) >= 0x3) + setCPUFeature(FEAT_LS64_ACCDATA); } - getCPUFeature(ID_AA64ISAR0_EL1, ftr); - /* ID_AA64ISAR0_EL1.SHA3 != 0b0000 */ - if (extractBits(ftr, 32, 4) != 0x0) - setCPUFeature(FEAT_SHA3); - getCPUFeature(ID_AA64ISAR1_EL1, ftr); - /* ID_AA64ISAR1_EL1.DPB >= 0b0001 */ - if (extractBits(ftr, 0, 4) >= 0x1) - setCPUFeature(FEAT_DPB); - /* ID_AA64ISAR1_EL1.LRCPC != 0b0000 */ - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_RCPC); - /* ID_AA64ISAR1_EL1.LRCPC == 0b0011 */ - if (extractBits(ftr, 20, 4) == 0x3) - setCPUFeature(FEAT_RCPC3); - /* ID_AA64ISAR1_EL1.SPECRES == 0b0001 */ - if (extractBits(ftr, 40, 4) == 0x2) - setCPUFeature(FEAT_PREDRES); - /* ID_AA64ISAR1_EL1.BF16 != 0b0000 */ - if (extractBits(ftr, 44, 4) != 0x0) - setCPUFeature(FEAT_BF16); - /* ID_AA64ISAR1_EL1.LS64 >= 0b0001 */ - if (extractBits(ftr, 60, 4) >= 0x1) - setCPUFeature(FEAT_LS64); - /* ID_AA64ISAR1_EL1.LS64 >= 0b0010 */ - if (extractBits(ftr, 60, 4) >= 0x2) - setCPUFeature(FEAT_LS64_V); - /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ - if (extractBits(ftr, 60, 4) >= 0x3) - setCPUFeature(FEAT_LS64_ACCDATA); - } else { - /* Set some features in case of no CPUID support. */ - if (hwcap & (HWCAP_FP | HWCAP_FPHP)) { + + if (hwcap & HWCAP_FP) + { setCPUFeature(FEAT_FP); /* FP and AdvSIMD fields have the same value. */ setCPUFeature(FEAT_SIMD); } - if (hwcap & HWCAP_DCPOP || hwcap2 & HWCAP2_DCPODP) - setCPUFeature(FEAT_DPB); - if (hwcap & HWCAP_LRCPC || hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC); - if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16) - setCPUFeature(FEAT_BF16); - if (hwcap2 & HWCAP2_SVEBF16) - setCPUFeature(FEAT_SVE_BF16); - if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE) - setCPUFeature(FEAT_SVE2); - if (hwcap & HWCAP_SHA3) - setCPUFeature(FEAT_SHA3); - } + if (hwcap & HWCAP_DCPOP) + setCPUFeature(FEAT_DPB); + if (hwcap & HWCAP_LRCPC) + setCPUFeature(FEAT_RCPC); + if (hwcap & HWCAP_ILRCPC) + setCPUFeature(FEAT_RCPC2); + if (hwcap2 & HWCAP2_LRCPC3) + setCPUFeature(FEAT_RCPC3); + if (hwcap2 & HWCAP2_BF16) + setCPUFeature(FEAT_BF16); + if (hwcap2 & HWCAP2_SVEBF16) + setCPUFeature(FEAT_SVE_BF16); + if (hwcap & HWCAP_SVE) + setCPUFeature(FEAT_SVE); + if (hwcap2 & HWCAP2_SVE2) + setCPUFeature(FEAT_SVE2); + if (hwcap & HWCAP_SHA3) + setCPUFeature(FEAT_SHA3); setCPUFeature(FEAT_INIT); + + __atomic_store_n (&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED); } void -__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) { - if (__aarch64_cpu_features.features) +__init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) +{ + if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; __init_cpu_features_constructor(hwcap, arg); } void __attribute__ ((constructor)) -__init_cpu_features(void) { +__init_cpu_features(void) +{ unsigned long hwcap; unsigned long hwcap2; + /* CPU features already initialized. */ - if (__aarch64_cpu_features.features) + if (__atomic_load_n (&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; hwcap = getauxval(AT_HWCAP); hwcap2 = getauxval(AT_HWCAP2);