Message ID | 20230706122544.175643-1-bmahi496@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [v3] PowerPC: Influence cpu/arch hwcap features via GLIBC_TUNABLES. | expand |
On 06/07/23 09:25, bmahi496--- via Libc-alpha wrote: > From: Mahesh Bodapati <mahesh.bodapati@ibm.com> > > This patch enables the option to influence hwcaps used by PowerPC. > The environment variable, GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,-zzz...., > can be used to enable CPU/ARCH feature yyy, disable CPU/ARCH feature xxx > and zzz, where the feature name is case-sensitive and has to match the ones > mentioned in the file{sysdeps/powerpc/dl-procinfo.c}. > > Note that the tunable only handles the features which are really used > in the IFUNC selection. All others are ignored as the values are only > used inside glibc. > --- > manual/tunables.texi | 5 +- > sysdeps/powerpc/cpu-features.c | 91 ++++++++++++++++++- > sysdeps/powerpc/cpu-features.h | 57 ++++++++++++ > sysdeps/powerpc/dl-tunables.list | 3 + > sysdeps/powerpc/hwcapinfo.c | 4 + > .../power4/multiarch/ifunc-impl-list.c | 4 +- > .../powerpc32/power4/multiarch/init-arch.h | 10 +- > sysdeps/powerpc/powerpc64/dl-machine.h | 2 - > .../powerpc64/multiarch/ifunc-impl-list.c | 7 +- > 9 files changed, 171 insertions(+), 12 deletions(-) > > diff --git a/manual/tunables.texi b/manual/tunables.texi > index 4ca0e42a11..776fd93fd9 100644 > --- a/manual/tunables.texi > +++ b/manual/tunables.texi > @@ -513,7 +513,10 @@ On s390x, the supported HWCAP and STFLE features can be found in > @code{sysdeps/s390/cpu-features.c}. In addition the user can also set > a CPU arch-level like @code{z13} instead of single HWCAP and STFLE features. > > -This tunable is specific to i386, x86-64 and s390x. > +On powerpc, the supported HWCAP and HWCAP2 features can be found in > +@code{sysdeps/powerpc/dl-procinfo.c}. > + > +This tunable is specific to i386, x86-64, s390x and powerpc. > @end deftp > > @deftp Tunable glibc.cpu.cached_memopt > diff --git a/sysdeps/powerpc/cpu-features.c b/sysdeps/powerpc/cpu-features.c > index 0ef3cf89d2..40d945ba03 100644 > --- a/sysdeps/powerpc/cpu-features.c > +++ b/sysdeps/powerpc/cpu-features.c > @@ -19,14 +19,103 @@ > #include <stdint.h> > #include <cpu-features.h> > #include <elf/dl-tunables.h> > +#include <unistd.h> > +#include <string.h> > +#define MEMCMP_DEFAULT memcmp > +#define STRLEN_DEFAULT strlen > + > +static void > +TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) > +{ > + /* The current IFUNC selection is always using the most recent > + features which are available via AT_HWCAP or AT_HWCAP2. But in > + some scenarios it is useful to adjust this selection. > + > + The environment variable: > + > + GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,.... > + > + Can be used to enable HWCAP/HWCAP2 feature yyy, disable HWCAP/HWCAP2 > + feature xxx, where the feature name is case-sensitive and has to match > + the ones mentioned in the file{sysdeps/powerpc/dl-procinfo.c}. */ > + > + /* Copy the features from dl_powerpc_cpu_features, which contains the > + features provided by AT_HWCAP and AT_HWCAP2. */ > + struct cpu_features *cpu_features = &GLRO(dl_powerpc_cpu_features); > + unsigned long int tcbv_hwcap = cpu_features->hwcap; > + unsigned long int tcbv_hwcap2 = cpu_features->hwcap2; > + const char *token = valp->strval; > + do > + { > + const char *token_end, *feature; > + bool disable; > + size_t token_len, i, feature_len; > + /* Find token separator or end of string. */ > + for (token_end = token; *token_end != ','; token_end++) > + if (*token_end == '\0') > + break; > + > + /* Determine feature. */ > + token_len = token_end - token; > + if (*token == '-') > + { > + disable = true; > + feature = token + 1; > + feature_len = token_len - 1; > + } > + else > + { > + disable = false; > + feature = token; > + feature_len = token_len; > + } > + for (i = 0; hwcap_tunables[i].name != NULL; ++i) It is not clear to me how this is true since there is no NULL sentinel on hwcap_tunables. > + { > + /* Check the tunable name on the supported list. */ > + if (STRLEN_DEFAULT (hwcap_tunables[i].name) == feature_len > + && MEMCMP_DEFAULT (feature, hwcap_tunables[i].name, feature_len) > + == 0) > + { > + /* Update the hwcap and hwcap2 bits. */ > + if (disable) > + { > + /* Id is 1 for hwcap2 tunable. */ > + if (hwcap_tunables[i].id) > + cpu_features->hwcap2 &= ~(hwcap_tunables[i].mask); > + else > + cpu_features->hwcap &= ~(hwcap_tunables[i].mask); > + } > + else > + { > + /* Enable the features and also checking that no unsupported > + features were enabled by user. */ > + if (hwcap_tunables[i].id) > + cpu_features->hwcap2 |= (tcbv_hwcap2 & hwcap_tunables[i].mask); > + else > + cpu_features->hwcap |= (tcbv_hwcap & hwcap_tunables[i].mask); > + } > + } > + } > + token += token_len; > + /* ... and skip token separator for next round. */ > + if (*token == ',') token++; > + } > + while (*token != '\0'); > +} > > static inline void > -init_cpu_features (struct cpu_features *cpu_features) > +init_cpu_features (struct cpu_features *cpu_features, uint64_t hwcaps[]) > { > + /* Fill the cpu_features with the supported hwcaps > + which are set by __tcb_parse_hwcap_and_convert_at_platform. */ > + cpu_features->hwcap = hwcaps[0]; > + cpu_features->hwcap2 = hwcaps[1]; > /* Default is to use aligned memory access on optimized function unless > tunables is enable, since for this case user can explicit disable > unaligned optimizations. */ > int32_t cached_memfunc = TUNABLE_GET (glibc, cpu, cached_memopt, int32_t, > NULL); > cpu_features->use_cached_memopt = (cached_memfunc > 0); > + TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *, > + TUNABLE_CALLBACK (set_hwcaps)); > } > diff --git a/sysdeps/powerpc/cpu-features.h b/sysdeps/powerpc/cpu-features.h > index d316dc3d64..6839c3085b 100644 > --- a/sysdeps/powerpc/cpu-features.h > +++ b/sysdeps/powerpc/cpu-features.h > @@ -19,10 +19,67 @@ > # define __CPU_FEATURES_POWERPC_H > > #include <stdbool.h> > +#include <sys/auxv.h> > > struct cpu_features > { > bool use_cached_memopt; > + unsigned long int hwcap; > + unsigned long int hwcap2; > +}; > + > +static const struct > +{ > + const char *name; > + int mask; > + bool id; > +} hwcap_tunables[] = { > + /* AT_HWCAP tunable masks. */ > + { "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0 }, This creates one extra dynamic relocation per entry: powerpc64le-linux-gnu-base$ powerpc64le-linux-gnu-readelf -a elf/ld.so [...] Relocation section '.relr.dyn' at offset 0xc68 contains 3 entries: 10 offsets [...] powerpc64le-linux-gnu-patch$ powerpc64le-linux-gnu-readelf -a elf/ld.so [...] Relocation section '.relr.dyn' at offset 0xc68 contains 5 entries: 53 offsets [...] Which I think we should avoid since is a small slow down on every program invocation. You can either define the name with a predefine size that fits for every name (say 32), but this will waste a some of space. Or you can specify the hwcap_tunables struct as pointing to an offset: static const char hwcap_names[] = "4xxmac\0" "altivec\0" [...] static const struct { unsigned short off; int mask; bool id; } hwcap_tunables[] = { { 0, PPC_FEATURE_HAS_4xxMAC, 0 }, { 7, PPC_FEATURE_HAS_ALTIVEC, 0 }, [...] } And then you check the name as: for (i = 0; array_length (hwcap_tunables); ++i) { const char *hwcap_name = hwcap_names + hwcap_tunables[i].off; [...] } The drowback is to get the offsets right it would require some preprocessor phase (something like we do for the signal and errno list). In any case I think it should add some testing. > + { "altivec", PPC_FEATURE_HAS_ALTIVEC, 0 }, > + { "arch_2_05", PPC_FEATURE_ARCH_2_05, 0 }, > + { "arch_2_06", PPC_FEATURE_ARCH_2_06, 0 }, > + { "archpmu", PPC_FEATURE_PSERIES_PERFMON_COMPAT, 0 }, > + { "booke", PPC_FEATURE_BOOKE, 0 }, > + { "cellbe", PPC_FEATURE_CELL_BE, 0 }, > + { "dfp", PPC_FEATURE_HAS_DFP, 0 }, > + { "efpdouble", PPC_FEATURE_HAS_EFP_DOUBLE, 0 }, > + { "efpsingle", PPC_FEATURE_HAS_EFP_SINGLE, 0 }, > + { "fpu", PPC_FEATURE_HAS_FPU, 0 }, > + { "ic_snoop", PPC_FEATURE_ICACHE_SNOOP, 0 }, > + { "mmu", PPC_FEATURE_HAS_MMU, 0 }, > + { "notb", PPC_FEATURE_NO_TB, 0 }, > + { "pa6t", PPC_FEATURE_PA6T, 0 }, > + { "power4", PPC_FEATURE_POWER4, 0 }, > + { "power5", PPC_FEATURE_POWER5, 0 }, > + { "power5+", PPC_FEATURE_POWER5_PLUS, 0 }, > + { "power6x", PPC_FEATURE_POWER6_EXT, 0 }, > + { "ppc32", PPC_FEATURE_32, 0 }, > + { "ppc601", PPC_FEATURE_601_INSTR, 0 }, > + { "ppc64", PPC_FEATURE_64, 0 }, > + { "ppcle", PPC_FEATURE_PPC_LE, 0 }, > + { "smt", PPC_FEATURE_SMT, 0 }, > + { "spe", PPC_FEATURE_HAS_SPE, 0 }, > + { "true_le", PPC_FEATURE_TRUE_LE, 0 }, > + { "ucache", PPC_FEATURE_UNIFIED_CACHE, 0 }, > + { "vsx", PPC_FEATURE_HAS_VSX, 0 }, > + > + /* AT_HWCAP2 tunable masks. */ > + { "arch_2_07", PPC_FEATURE2_ARCH_2_07, 1 }, > + { "dscr", PPC_FEATURE2_HAS_DSCR, 1 }, > + { "ebb", PPC_FEATURE2_HAS_EBB, 1 }, > + { "htm", PPC_FEATURE2_HAS_HTM, 1 }, > + { "htm-nosc", PPC_FEATURE2_HTM_NOSC, 1 }, > + { "htm-no-suspend", PPC_FEATURE2_HTM_NO_SUSPEND, 1 }, > + { "isel", PPC_FEATURE2_HAS_ISEL, 1 }, > + { "tar", PPC_FEATURE2_HAS_TAR, 1 }, > + { "vcrypto", PPC_FEATURE2_HAS_VEC_CRYPTO, 1 }, > + { "arch_3_00", PPC_FEATURE2_ARCH_3_00, 1 }, > + { "ieee128", PPC_FEATURE2_HAS_IEEE128, 1 }, > + { "darn", PPC_FEATURE2_DARN, 1 }, > + { "scv", PPC_FEATURE2_SCV, 1 }, > + { "arch_3_1", PPC_FEATURE2_ARCH_3_1, 1 }, > + { "mma", PPC_FEATURE2_MMA, 1 }, > }; > > #endif /* __CPU_FEATURES_H */ > diff --git a/sysdeps/powerpc/dl-tunables.list b/sysdeps/powerpc/dl-tunables.list > index 87d6235c75..807b7f8013 100644 > --- a/sysdeps/powerpc/dl-tunables.list > +++ b/sysdeps/powerpc/dl-tunables.list > @@ -24,5 +24,8 @@ glibc { > maxval: 1 > default: 0 > } > + hwcaps { > + type: STRING > + } > } > } > diff --git a/sysdeps/powerpc/hwcapinfo.c b/sysdeps/powerpc/hwcapinfo.c > index e26e64d99e..f2c473c556 100644 > --- a/sysdeps/powerpc/hwcapinfo.c > +++ b/sysdeps/powerpc/hwcapinfo.c > @@ -19,6 +19,7 @@ > #include <unistd.h> > #include <shlib-compat.h> > #include <dl-procinfo.h> > +#include <cpu-features.c> > > tcbhead_t __tcb __attribute__ ((visibility ("hidden"))); > > @@ -63,6 +64,9 @@ __tcb_parse_hwcap_and_convert_at_platform (void) > else if (h1 & PPC_FEATURE_POWER5) > h1 |= PPC_FEATURE_POWER4; > > + uint64_t array_hwcaps[] = { h1, h2 }; > + init_cpu_features (&GLRO(dl_powerpc_cpu_features), array_hwcaps); > + > /* Consolidate both HWCAP and HWCAP2 into a single doubleword so that > we can read both in a single load later. */ > __tcb.hwcap = (h1 << 32) | (h2 & 0xffffffff); > diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c > index b4f80539e7..986c37d71e 100644 > --- a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c > @@ -21,6 +21,7 @@ > #include <wchar.h> > #include <ldsodefs.h> > #include <ifunc-impl-list.h> > +#include <cpu-features.h> > > size_t > __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > @@ -28,7 +29,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > { > size_t i = max; > > - unsigned long int hwcap = GLRO(dl_hwcap); > + const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); > + unsigned long int hwcap = features->hwcap; > /* hwcap contains only the latest supported ISA, the code checks which is > and fills the previous supported ones. */ > if (hwcap & PPC_FEATURE_ARCH_2_06) > diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h > index 3dd00e02ee..a0bbd12012 100644 > --- a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h > +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h > @@ -16,6 +16,7 @@ > <https://www.gnu.org/licenses/>. */ > > #include <ldsodefs.h> > +#include <cpu-features.h> > > /* The code checks if _rtld_global_ro was realocated before trying to access > the dl_hwcap field. The assembly is to make the compiler not optimize the > @@ -32,11 +33,12 @@ > # define __GLRO(value) GLRO(value) > #endif > > -/* dl_hwcap contains only the latest supported ISA, the macro checks which is > - and fills the previous ones. */ > +/* Get the hardware information post the tunables set , the macro checks > + it and fills the previous ones. */ > #define INIT_ARCH() \ > - unsigned long int hwcap = __GLRO(dl_hwcap); \ > - unsigned long int __attribute__((unused)) hwcap2 = __GLRO(dl_hwcap2); \ > + const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); \ > + unsigned long int hwcap = features->hwcap; \ > + unsigned long int __attribute__((unused)) hwcap2 = features->hwcap2; \ > bool __attribute__((unused)) use_cached_memopt = \ > __GLRO(dl_powerpc_cpu_features.use_cached_memopt); \ > if (hwcap & PPC_FEATURE_ARCH_2_06) \ > diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h > index 9b8943bc91..449208e86f 100644 > --- a/sysdeps/powerpc/powerpc64/dl-machine.h > +++ b/sysdeps/powerpc/powerpc64/dl-machine.h > @@ -27,7 +27,6 @@ > #include <dl-tls.h> > #include <sysdep.h> > #include <hwcapinfo.h> > -#include <cpu-features.c> > #include <dl-static-tls.h> > #include <dl-funcdesc.h> > #include <dl-machine-rel.h> > @@ -297,7 +296,6 @@ static inline void __attribute__ ((unused)) > dl_platform_init (void) > { > __tcb_parse_hwcap_and_convert_at_platform (); > - init_cpu_features (&GLRO(dl_powerpc_cpu_features)); > } > #endif > > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > index ebe9434052..fc26dd0e17 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > @@ -17,6 +17,7 @@ > <https://www.gnu.org/licenses/>. */ > > #include <assert.h> > +#include <cpu-features.h> > #include <string.h> > #include <wchar.h> > #include <ldsodefs.h> > @@ -27,9 +28,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > size_t max) > { > size_t i = max; > - > - unsigned long int hwcap = GLRO(dl_hwcap); > - unsigned long int hwcap2 = GLRO(dl_hwcap2); > + const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); > + unsigned long int hwcap = features->hwcap; > + unsigned long int hwcap2 = features->hwcap2; > #ifdef SHARED > int cacheline_size = GLRO(dl_cache_line_size); > #endif
On 7/6/23 8:16 AM, Adhemerval Zanella Netto wrote: > On 06/07/23 09:25, bmahi496--- via Libc-alpha wrote: >> struct cpu_features >> { >> bool use_cached_memopt; >> + unsigned long int hwcap; >> + unsigned long int hwcap2; >> +}; >> + >> +static const struct >> +{ >> + const char *name; >> + int mask; >> + bool id; >> +} hwcap_tunables[] = { >> + /* AT_HWCAP tunable masks. */ >> + { "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0 }, > > This creates one extra dynamic relocation per entry: > > powerpc64le-linux-gnu-base$ powerpc64le-linux-gnu-readelf -a elf/ld.so > [...] > Relocation section '.relr.dyn' at offset 0xc68 contains 3 entries: > 10 offsets > [...] > > powerpc64le-linux-gnu-patch$ powerpc64le-linux-gnu-readelf -a elf/ld.so > [...] > Relocation section '.relr.dyn' at offset 0xc68 contains 5 entries: > 53 offsets > [...] Good catch! Especially since the plan is to add space for hwcap3 and hwcap4 (possibly more) in the near future. We don't need the extra hwcaps yet, but we want to reserve the space in the TCB for them now, where __builtin_cpu_supports() uses them, so that when we need them in the future, the space is already there to use in distro glibcs. > Which I think we should avoid since is a small slow down on every program > invocation. You can either define the name with a predefine size that > fits for every name (say 32), but this will waste a some of space. Or > you can specify the hwcap_tunables struct as pointing to an offset: > > static const char hwcap_names[] = > "4xxmac\0" > "altivec\0" > [...] > > static const struct > { > unsigned short off; > int mask; > bool id; > } hwcap_tunables[] = { > { 0, PPC_FEATURE_HAS_4xxMAC, 0 }, > { 7, PPC_FEATURE_HAS_ALTIVEC, 0 }, > [...] > } > And then you check the name as: > > for (i = 0; array_length (hwcap_tunables); ++i) > { > const char *hwcap_name = hwcap_names + hwcap_tunables[i].off; > [...] > } > > The drowback is to get the offsets right it would require some preprocessor > phase (something like we do for the signal and errno list). I don't think we need the offset in the struct, since Mahesh's loop is already calculating the strlen of hwcap_names[].name, so we can just update the pointer as we go. Ala... const char *hwcap_name = &hwcap_names[0]; for (i = 0; array_length (hwcap_tunables); ++i) { size_t tunable_len = STRLEN_DEFAULT (hwcap_name); /* Check the tunable name on the supported list. */ if (tunable_len == feature_len && MEMCMP_DEFAULT (feature, hwcap_name, feature_len) == 0) { ... } hwcap_name += tunable_len + 1; } Peter
On 7/6/23 7:25 AM, bmahi496@linux.ibm.com wrote: > + /* Copy the features from dl_powerpc_cpu_features, which contains the > + features provided by AT_HWCAP and AT_HWCAP2. */ > + struct cpu_features *cpu_features = &GLRO(dl_powerpc_cpu_features); > + unsigned long int tcbv_hwcap = cpu_features->hwcap; > + unsigned long int tcbv_hwcap2 = cpu_features->hwcap2; [snip] > + else > + { > + /* Enable the features and also checking that no unsupported > + features were enabled by user. */ > + if (hwcap_tunables[i].id) > + cpu_features->hwcap2 |= (tcbv_hwcap2 & hwcap_tunables[i].mask); > + else > + cpu_features->hwcap |= (tcbv_hwcap & hwcap_tunables[i].mask); > + } I don't see how this code can ever "add" new bits to cpu_features->hwcap or cpu_features->hwcap2. We cache their values at the top of the loop and then OR in the tunable mask, but only if they're not already set in the cached values. If the tunable mask has a bit that isn't already present in cpu_features->hwcap/cpu_features->hwcap2, we'll never set them. It seems your code as is, can only ever remove bits from hwcap/hwcap2. Question for you or anyone else, is there a scenario where we can execute our set_hwcaps tunable callback function where the set of HWCAP/2 feature bits is a subset of the HWCAP/2 bits that the kernel passed to us? For example, could our set_hwcaps tunable callback function be called multiple times in some scenario like via a fork or ??? If yes, then tcbv_hwcap and tcbv_hwcap2 really should be set to the HWCAP/2 values the kernel gives us and not some possibly modified version in cpu_features->hwcap/2. If no, then I think we probably don't even need to support trying to enable/add bits to the cpu_features->hwcap/2 masks at all, since we'd only try setting bits that are already set or bits we're not supposed to set since they aren't in the kernel's version of the hwcaps. Peter
On 06/07/23 6:46 pm, Adhemerval Zanella Netto wrote: > > On 06/07/23 09:25, bmahi496--- via Libc-alpha wrote: >> From: Mahesh Bodapati <mahesh.bodapati@ibm.com> >> >> This patch enables the option to influence hwcaps used by PowerPC. >> The environment variable, GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,-zzz...., >> can be used to enable CPU/ARCH feature yyy, disable CPU/ARCH feature xxx >> and zzz, where the feature name is case-sensitive and has to match the ones >> mentioned in the file{sysdeps/powerpc/dl-procinfo.c}. >> >> Note that the tunable only handles the features which are really used >> in the IFUNC selection. All others are ignored as the values are only >> used inside glibc. >> --- >> manual/tunables.texi | 5 +- >> sysdeps/powerpc/cpu-features.c | 91 ++++++++++++++++++- >> sysdeps/powerpc/cpu-features.h | 57 ++++++++++++ >> sysdeps/powerpc/dl-tunables.list | 3 + >> sysdeps/powerpc/hwcapinfo.c | 4 + >> .../power4/multiarch/ifunc-impl-list.c | 4 +- >> .../powerpc32/power4/multiarch/init-arch.h | 10 +- >> sysdeps/powerpc/powerpc64/dl-machine.h | 2 - >> .../powerpc64/multiarch/ifunc-impl-list.c | 7 +- >> 9 files changed, 171 insertions(+), 12 deletions(-) >> >> diff --git a/manual/tunables.texi b/manual/tunables.texi >> index 4ca0e42a11..776fd93fd9 100644 >> --- a/manual/tunables.texi >> +++ b/manual/tunables.texi >> @@ -513,7 +513,10 @@ On s390x, the supported HWCAP and STFLE features can be found in >> @code{sysdeps/s390/cpu-features.c}. In addition the user can also set >> a CPU arch-level like @code{z13} instead of single HWCAP and STFLE features. >> >> -This tunable is specific to i386, x86-64 and s390x. >> +On powerpc, the supported HWCAP and HWCAP2 features can be found in >> +@code{sysdeps/powerpc/dl-procinfo.c}. >> + >> +This tunable is specific to i386, x86-64, s390x and powerpc. >> @end deftp >> >> @deftp Tunable glibc.cpu.cached_memopt >> diff --git a/sysdeps/powerpc/cpu-features.c b/sysdeps/powerpc/cpu-features.c >> index 0ef3cf89d2..40d945ba03 100644 >> --- a/sysdeps/powerpc/cpu-features.c >> +++ b/sysdeps/powerpc/cpu-features.c >> @@ -19,14 +19,103 @@ >> #include <stdint.h> >> #include <cpu-features.h> >> #include <elf/dl-tunables.h> >> +#include <unistd.h> >> +#include <string.h> >> +#define MEMCMP_DEFAULT memcmp >> +#define STRLEN_DEFAULT strlen >> + >> +static void >> +TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) >> +{ >> + /* The current IFUNC selection is always using the most recent >> + features which are available via AT_HWCAP or AT_HWCAP2. But in >> + some scenarios it is useful to adjust this selection. >> + >> + The environment variable: >> + >> + GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,.... >> + >> + Can be used to enable HWCAP/HWCAP2 feature yyy, disable HWCAP/HWCAP2 >> + feature xxx, where the feature name is case-sensitive and has to match >> + the ones mentioned in the file{sysdeps/powerpc/dl-procinfo.c}. */ >> + >> + /* Copy the features from dl_powerpc_cpu_features, which contains the >> + features provided by AT_HWCAP and AT_HWCAP2. */ >> + struct cpu_features *cpu_features = &GLRO(dl_powerpc_cpu_features); >> + unsigned long int tcbv_hwcap = cpu_features->hwcap; >> + unsigned long int tcbv_hwcap2 = cpu_features->hwcap2; >> + const char *token = valp->strval; >> + do >> + { >> + const char *token_end, *feature; >> + bool disable; >> + size_t token_len, i, feature_len; >> + /* Find token separator or end of string. */ >> + for (token_end = token; *token_end != ','; token_end++) >> + if (*token_end == '\0') >> + break; >> + >> + /* Determine feature. */ >> + token_len = token_end - token; >> + if (*token == '-') >> + { >> + disable = true; >> + feature = token + 1; >> + feature_len = token_len - 1; >> + } >> + else >> + { >> + disable = false; >> + feature = token; >> + feature_len = token_len; >> + } >> + for (i = 0; hwcap_tunables[i].name != NULL; ++i) > It is not clear to me how this is true since there is no NULL sentinel on hwcap_tunables. > >> + { >> + /* Check the tunable name on the supported list. */ >> + if (STRLEN_DEFAULT (hwcap_tunables[i].name) == feature_len >> + && MEMCMP_DEFAULT (feature, hwcap_tunables[i].name, feature_len) >> + == 0) >> + { >> + /* Update the hwcap and hwcap2 bits. */ >> + if (disable) >> + { >> + /* Id is 1 for hwcap2 tunable. */ >> + if (hwcap_tunables[i].id) >> + cpu_features->hwcap2 &= ~(hwcap_tunables[i].mask); >> + else >> + cpu_features->hwcap &= ~(hwcap_tunables[i].mask); >> + } >> + else >> + { >> + /* Enable the features and also checking that no unsupported >> + features were enabled by user. */ >> + if (hwcap_tunables[i].id) >> + cpu_features->hwcap2 |= (tcbv_hwcap2 & hwcap_tunables[i].mask); >> + else >> + cpu_features->hwcap |= (tcbv_hwcap & hwcap_tunables[i].mask); >> + } >> + } >> + } >> + token += token_len; >> + /* ... and skip token separator for next round. */ >> + if (*token == ',') token++; >> + } >> + while (*token != '\0'); >> +} >> >> static inline void >> -init_cpu_features (struct cpu_features *cpu_features) >> +init_cpu_features (struct cpu_features *cpu_features, uint64_t hwcaps[]) >> { >> + /* Fill the cpu_features with the supported hwcaps >> + which are set by __tcb_parse_hwcap_and_convert_at_platform. */ >> + cpu_features->hwcap = hwcaps[0]; >> + cpu_features->hwcap2 = hwcaps[1]; >> /* Default is to use aligned memory access on optimized function unless >> tunables is enable, since for this case user can explicit disable >> unaligned optimizations. */ >> int32_t cached_memfunc = TUNABLE_GET (glibc, cpu, cached_memopt, int32_t, >> NULL); >> cpu_features->use_cached_memopt = (cached_memfunc > 0); >> + TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *, >> + TUNABLE_CALLBACK (set_hwcaps)); >> } >> diff --git a/sysdeps/powerpc/cpu-features.h b/sysdeps/powerpc/cpu-features.h >> index d316dc3d64..6839c3085b 100644 >> --- a/sysdeps/powerpc/cpu-features.h >> +++ b/sysdeps/powerpc/cpu-features.h >> @@ -19,10 +19,67 @@ >> # define __CPU_FEATURES_POWERPC_H >> >> #include <stdbool.h> >> +#include <sys/auxv.h> >> >> struct cpu_features >> { >> bool use_cached_memopt; >> + unsigned long int hwcap; >> + unsigned long int hwcap2; >> +}; >> + >> +static const struct >> +{ >> + const char *name; >> + int mask; >> + bool id; >> +} hwcap_tunables[] = { >> + /* AT_HWCAP tunable masks. */ >> + { "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0 }, > This creates one extra dynamic relocation per entry: > > powerpc64le-linux-gnu-base$ powerpc64le-linux-gnu-readelf -a elf/ld.so > [...] > Relocation section '.relr.dyn' at offset 0xc68 contains 3 entries: > 10 offsets > [...] > > powerpc64le-linux-gnu-patch$ powerpc64le-linux-gnu-readelf -a elf/ld.so > [...] > Relocation section '.relr.dyn' at offset 0xc68 contains 5 entries: > 53 offsets > [...] > > Which I think we should avoid since is a small slow down on every program > invocation. You can either define the name with a predefine size that > fits for every name (say 32), but this will waste a some of space. Or > you can specify the hwcap_tunables struct as pointing to an offset: > > static const char hwcap_names[] = > "4xxmac\0" > "altivec\0" > [...] > > static const struct > { > unsigned short off; > int mask; > bool id; > } hwcap_tunables[] = { > { 0, PPC_FEATURE_HAS_4xxMAC, 0 }, > { 7, PPC_FEATURE_HAS_ALTIVEC, 0 }, > [...] > } > > And then you check the name as: > > for (i = 0; array_length (hwcap_tunables); ++i) > { > const char *hwcap_name = hwcap_names + hwcap_tunables[i].off; > [...] > } > > The drowback is to get the offsets right it would require some preprocessor > phase (something like we do for the signal and errno list). > > In any case I think it should add some testing. Thanks Adhemerval for finding it. Yeah ,I can see additional 43 RELA entries with my patch. I started implemented the code in the way that you were suggested. I will share the patch once the testing has been done. >> + { "altivec", PPC_FEATURE_HAS_ALTIVEC, 0 }, >> + { "arch_2_05", PPC_FEATURE_ARCH_2_05, 0 }, >> + { "arch_2_06", PPC_FEATURE_ARCH_2_06, 0 }, >> + { "archpmu", PPC_FEATURE_PSERIES_PERFMON_COMPAT, 0 }, >> + { "booke", PPC_FEATURE_BOOKE, 0 }, >> + { "cellbe", PPC_FEATURE_CELL_BE, 0 }, >> + { "dfp", PPC_FEATURE_HAS_DFP, 0 }, >> + { "efpdouble", PPC_FEATURE_HAS_EFP_DOUBLE, 0 }, >> + { "efpsingle", PPC_FEATURE_HAS_EFP_SINGLE, 0 }, >> + { "fpu", PPC_FEATURE_HAS_FPU, 0 }, >> + { "ic_snoop", PPC_FEATURE_ICACHE_SNOOP, 0 }, >> + { "mmu", PPC_FEATURE_HAS_MMU, 0 }, >> + { "notb", PPC_FEATURE_NO_TB, 0 }, >> + { "pa6t", PPC_FEATURE_PA6T, 0 }, >> + { "power4", PPC_FEATURE_POWER4, 0 }, >> + { "power5", PPC_FEATURE_POWER5, 0 }, >> + { "power5+", PPC_FEATURE_POWER5_PLUS, 0 }, >> + { "power6x", PPC_FEATURE_POWER6_EXT, 0 }, >> + { "ppc32", PPC_FEATURE_32, 0 }, >> + { "ppc601", PPC_FEATURE_601_INSTR, 0 }, >> + { "ppc64", PPC_FEATURE_64, 0 }, >> + { "ppcle", PPC_FEATURE_PPC_LE, 0 }, >> + { "smt", PPC_FEATURE_SMT, 0 }, >> + { "spe", PPC_FEATURE_HAS_SPE, 0 }, >> + { "true_le", PPC_FEATURE_TRUE_LE, 0 }, >> + { "ucache", PPC_FEATURE_UNIFIED_CACHE, 0 }, >> + { "vsx", PPC_FEATURE_HAS_VSX, 0 }, >> + >> + /* AT_HWCAP2 tunable masks. */ >> + { "arch_2_07", PPC_FEATURE2_ARCH_2_07, 1 }, >> + { "dscr", PPC_FEATURE2_HAS_DSCR, 1 }, >> + { "ebb", PPC_FEATURE2_HAS_EBB, 1 }, >> + { "htm", PPC_FEATURE2_HAS_HTM, 1 }, >> + { "htm-nosc", PPC_FEATURE2_HTM_NOSC, 1 }, >> + { "htm-no-suspend", PPC_FEATURE2_HTM_NO_SUSPEND, 1 }, >> + { "isel", PPC_FEATURE2_HAS_ISEL, 1 }, >> + { "tar", PPC_FEATURE2_HAS_TAR, 1 }, >> + { "vcrypto", PPC_FEATURE2_HAS_VEC_CRYPTO, 1 }, >> + { "arch_3_00", PPC_FEATURE2_ARCH_3_00, 1 }, >> + { "ieee128", PPC_FEATURE2_HAS_IEEE128, 1 }, >> + { "darn", PPC_FEATURE2_DARN, 1 }, >> + { "scv", PPC_FEATURE2_SCV, 1 }, >> + { "arch_3_1", PPC_FEATURE2_ARCH_3_1, 1 }, >> + { "mma", PPC_FEATURE2_MMA, 1 }, >> }; >> >> #endif /* __CPU_FEATURES_H */ >> diff --git a/sysdeps/powerpc/dl-tunables.list b/sysdeps/powerpc/dl-tunables.list >> index 87d6235c75..807b7f8013 100644 >> --- a/sysdeps/powerpc/dl-tunables.list >> +++ b/sysdeps/powerpc/dl-tunables.list >> @@ -24,5 +24,8 @@ glibc { >> maxval: 1 >> default: 0 >> } >> + hwcaps { >> + type: STRING >> + } >> } >> } >> diff --git a/sysdeps/powerpc/hwcapinfo.c b/sysdeps/powerpc/hwcapinfo.c >> index e26e64d99e..f2c473c556 100644 >> --- a/sysdeps/powerpc/hwcapinfo.c >> +++ b/sysdeps/powerpc/hwcapinfo.c >> @@ -19,6 +19,7 @@ >> #include <unistd.h> >> #include <shlib-compat.h> >> #include <dl-procinfo.h> >> +#include <cpu-features.c> >> >> tcbhead_t __tcb __attribute__ ((visibility ("hidden"))); >> >> @@ -63,6 +64,9 @@ __tcb_parse_hwcap_and_convert_at_platform (void) >> else if (h1 & PPC_FEATURE_POWER5) >> h1 |= PPC_FEATURE_POWER4; >> >> + uint64_t array_hwcaps[] = { h1, h2 }; >> + init_cpu_features (&GLRO(dl_powerpc_cpu_features), array_hwcaps); >> + >> /* Consolidate both HWCAP and HWCAP2 into a single doubleword so that >> we can read both in a single load later. */ >> __tcb.hwcap = (h1 << 32) | (h2 & 0xffffffff); >> diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c >> index b4f80539e7..986c37d71e 100644 >> --- a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c >> +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c >> @@ -21,6 +21,7 @@ >> #include <wchar.h> >> #include <ldsodefs.h> >> #include <ifunc-impl-list.h> >> +#include <cpu-features.h> >> >> size_t >> __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> @@ -28,7 +29,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> { >> size_t i = max; >> >> - unsigned long int hwcap = GLRO(dl_hwcap); >> + const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); >> + unsigned long int hwcap = features->hwcap; >> /* hwcap contains only the latest supported ISA, the code checks which is >> and fills the previous supported ones. */ >> if (hwcap & PPC_FEATURE_ARCH_2_06) >> diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h >> index 3dd00e02ee..a0bbd12012 100644 >> --- a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h >> +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h >> @@ -16,6 +16,7 @@ >> <https://www.gnu.org/licenses/>. */ >> >> #include <ldsodefs.h> >> +#include <cpu-features.h> >> >> /* The code checks if _rtld_global_ro was realocated before trying to access >> the dl_hwcap field. The assembly is to make the compiler not optimize the >> @@ -32,11 +33,12 @@ >> # define __GLRO(value) GLRO(value) >> #endif >> >> -/* dl_hwcap contains only the latest supported ISA, the macro checks which is >> - and fills the previous ones. */ >> +/* Get the hardware information post the tunables set , the macro checks >> + it and fills the previous ones. */ >> #define INIT_ARCH() \ >> - unsigned long int hwcap = __GLRO(dl_hwcap); \ >> - unsigned long int __attribute__((unused)) hwcap2 = __GLRO(dl_hwcap2); \ >> + const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); \ >> + unsigned long int hwcap = features->hwcap; \ >> + unsigned long int __attribute__((unused)) hwcap2 = features->hwcap2; \ >> bool __attribute__((unused)) use_cached_memopt = \ >> __GLRO(dl_powerpc_cpu_features.use_cached_memopt); \ >> if (hwcap & PPC_FEATURE_ARCH_2_06) \ >> diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h >> index 9b8943bc91..449208e86f 100644 >> --- a/sysdeps/powerpc/powerpc64/dl-machine.h >> +++ b/sysdeps/powerpc/powerpc64/dl-machine.h >> @@ -27,7 +27,6 @@ >> #include <dl-tls.h> >> #include <sysdep.h> >> #include <hwcapinfo.h> >> -#include <cpu-features.c> >> #include <dl-static-tls.h> >> #include <dl-funcdesc.h> >> #include <dl-machine-rel.h> >> @@ -297,7 +296,6 @@ static inline void __attribute__ ((unused)) >> dl_platform_init (void) >> { >> __tcb_parse_hwcap_and_convert_at_platform (); >> - init_cpu_features (&GLRO(dl_powerpc_cpu_features)); >> } >> #endif >> >> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >> index ebe9434052..fc26dd0e17 100644 >> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >> @@ -17,6 +17,7 @@ >> <https://www.gnu.org/licenses/>. */ >> >> #include <assert.h> >> +#include <cpu-features.h> >> #include <string.h> >> #include <wchar.h> >> #include <ldsodefs.h> >> @@ -27,9 +28,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> size_t max) >> { >> size_t i = max; >> - >> - unsigned long int hwcap = GLRO(dl_hwcap); >> - unsigned long int hwcap2 = GLRO(dl_hwcap2); >> + const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); >> + unsigned long int hwcap = features->hwcap; >> + unsigned long int hwcap2 = features->hwcap2; >> #ifdef SHARED >> int cacheline_size = GLRO(dl_cache_line_size); >> #endif
On 07/07/23 2:35 am, Peter Bergner wrote: > On 7/6/23 7:25 AM, bmahi496@linux.ibm.com wrote: >> + /* Copy the features from dl_powerpc_cpu_features, which contains the >> + features provided by AT_HWCAP and AT_HWCAP2. */ >> + struct cpu_features *cpu_features = &GLRO(dl_powerpc_cpu_features); >> + unsigned long int tcbv_hwcap = cpu_features->hwcap; >> + unsigned long int tcbv_hwcap2 = cpu_features->hwcap2; > [snip] >> + else >> + { >> + /* Enable the features and also checking that no unsupported >> + features were enabled by user. */ >> + if (hwcap_tunables[i].id) >> + cpu_features->hwcap2 |= (tcbv_hwcap2 & hwcap_tunables[i].mask); >> + else >> + cpu_features->hwcap |= (tcbv_hwcap & hwcap_tunables[i].mask); >> + } > I don't see how this code can ever "add" new bits to cpu_features->hwcap > or cpu_features->hwcap2. We cache their values at the top of the loop > and then OR in the tunable mask, but only if they're not already set > in the cached values. If the tunable mask has a bit that isn't already > present in cpu_features->hwcap/cpu_features->hwcap2, we'll never set > them. It seems your code as is, can only ever remove bits from hwcap/hwcap2. > > Question for you or anyone else, is there a scenario where we can execute > our set_hwcaps tunable callback function where the set of HWCAP/2 feature > bits is a subset of the HWCAP/2 bits that the kernel passed to us? I don't see any scenario like that. It will be helpful when user set multiple entries of tunable. For example "-arch_3_1,-arch_3_00,-vsx,arch_3_00" here we have 2 entries of arch_3_00 where it's disabling at first place and enabling at second place. > > For example, could our set_hwcaps tunable callback function be called multiple > times in some scenario like via a fork or ??? If yes, then tcbv_hwcap and > tcbv_hwcap2 really should be set to the HWCAP/2 values the kernel gives us > and not some possibly modified version in cpu_features->hwcap/2. If no, then > I think we probably don't even need to support trying to enable/add bits to > the cpu_features->hwcap/2 masks at all, since we'd only try setting bits > that are already set or bits we're not supposed to set since they aren't > in the kernel's version of the hwcaps. > > > Peter > >
On 06/07/23 18:05, Peter Bergner via Libc-alpha wrote: > On 7/6/23 7:25 AM, bmahi496@linux.ibm.com wrote: >> + /* Copy the features from dl_powerpc_cpu_features, which contains the >> + features provided by AT_HWCAP and AT_HWCAP2. */ >> + struct cpu_features *cpu_features = &GLRO(dl_powerpc_cpu_features); >> + unsigned long int tcbv_hwcap = cpu_features->hwcap; >> + unsigned long int tcbv_hwcap2 = cpu_features->hwcap2; > [snip] >> + else >> + { >> + /* Enable the features and also checking that no unsupported >> + features were enabled by user. */ >> + if (hwcap_tunables[i].id) >> + cpu_features->hwcap2 |= (tcbv_hwcap2 & hwcap_tunables[i].mask); >> + else >> + cpu_features->hwcap |= (tcbv_hwcap & hwcap_tunables[i].mask); >> + } > > I don't see how this code can ever "add" new bits to cpu_features->hwcap > or cpu_features->hwcap2. We cache their values at the top of the loop > and then OR in the tunable mask, but only if they're not already set > in the cached values. If the tunable mask has a bit that isn't already > present in cpu_features->hwcap/cpu_features->hwcap2, we'll never set > them. It seems your code as is, can only ever remove bits from hwcap/hwcap2. > > Question for you or anyone else, is there a scenario where we can execute > our set_hwcaps tunable callback function where the set of HWCAP/2 feature > bits is a subset of the HWCAP/2 bits that the kernel passed to us? > > For example, could our set_hwcaps tunable callback function be called multiple > times in some scenario like via a fork or ??? If yes, then tcbv_hwcap and > tcbv_hwcap2 really should be set to the HWCAP/2 values the kernel gives us > and not some possibly modified version in cpu_features->hwcap/2. If no, then > I think we probably don't even need to support trying to enable/add bits to > the cpu_features->hwcap/2 masks at all, since we'd only try setting bits > that are already set or bits we're not supposed to set since they aren't > in the kernel's version of the hwcaps. The set_hwcaps is called once only at process execution. Adding bits might make sense in a scenario that you are running a new glibc on older kernels and you know that some string routines are safe to run on that hardware (although it might come with some caveats, since it might still require some kernel support). With this patch GLIBC_TUNABLES=glibc.cpu.hwcaps=feature is ignored if 'feature' is not support by HWCAP/2, which is not explicit state in documentation and it slight differ from s390x/x86. The s390x only considers features that are used in ifunc resolver, but since the idea is to use __builtin_cpu_supports maybe powerpc should handle everything. It also bothers me that our documentation references to a source file to actually get the supported 'features' strings. I think we can make it better and proper document the correct list of features and maybe add a ld.so option to also print the supported values.
On 7/7/23 9:25 AM, Adhemerval Zanella Netto wrote: > The set_hwcaps is called once only at process execution. Adding bits might > make sense in a scenario that you are running a new glibc on older kernels > and you know that some string routines are safe to run on that hardware > (although it might come with some caveats, since it might still require some > kernel support). AIUI, on Power, the firmware (which knows about the "new" hardware features on the cpu we're running on) passes the HWCAP/HWCAP2 masks to the kernel. The kernel (old or new) then places those masks (even the new ones it doesn't know about/understand) into the AUXV that is passed to glibc, so we don't have a problem with "old" kernels and the HWCAP/HWCAP2 masks glibc receives from the kernel should be "full-featured" wrt the cpu it is running on. This is why I was mentioning I'm not sure supporting adding bits to the masks makes any sense on Power, unless there is a mechanism where we might have removed some earlier. It sounds like from your comment that there isn't a way for that to happen. That said, keeping the tunable API the same across the different architectures the same is important, so we should at least accept the user trying...it just won't do anything. :-) > With this patch GLIBC_TUNABLES=glibc.cpu.hwcaps=feature is ignored if 'feature' > is not support by HWCAP/2, which is not explicit state in documentation and > it slight differ from s390x/x86. The s390x only considers features that are > used in ifunc resolver, but since the idea is to use __builtin_cpu_supports > maybe powerpc should handle everything. Since our (old or new) kernel supplied HWCAP/HWCAP2 masks are full-featured, it makes sense on Power not to allow users to set any bits that were not set in the original hwcap masks, since we "know" the underlying hardware doesn't support them. I believe Mahesh's current patch only supports modifying the internal to glibc ifunc resolvers, but I do want them to eventually be made externally available for use by the __builtin_cpu_supports() built-in. > It also bothers me that our documentation references to a source file to > actually get the supported 'features' strings. I think we can make it better > and proper document the correct list of features and maybe add a ld.so > option to also print the supported values. I like that idea. That said, GCC documents the 'features' as part of the __builtin_cpu_supports built-in documentation: https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html ...but the user should be able to get this from glibc documentation since the tunables is a glibc feature (wow, too many features :-). Peter
On 7/7/23 5:52 AM, MAHESH BODAPATI wrote: > On 07/07/23 2:35 am, Peter Bergner wrote: >> I don't see how this code can ever "add" new bits to cpu_features->hwcap >> or cpu_features->hwcap2. We cache their values at the top of the loop >> and then OR in the tunable mask, but only if they're not already set >> in the cached values. If the tunable mask has a bit that isn't already >> present in cpu_features->hwcap/cpu_features->hwcap2, we'll never set >> them. It seems your code as is, can only ever remove bits from hwcap/hwcap2. >> >> Question for you or anyone else, is there a scenario where we can execute >> our set_hwcaps tunable callback function where the set of HWCAP/2 feature >> bits is a subset of the HWCAP/2 bits that the kernel passed to us? > > I don't see any scenario like that. > It will be helpful when user set multiple entries of tunable. > For example "-arch_3_1,-arch_3_00,-vsx,arch_3_00" here we have 2 entries of arch_3_00 > where it's disabling at first place and enabling at second place. What I meant was that if you look at the bits in cpu_features->hwcap and cpu_features->hwcap2 before this code is executed and their bits after, you will not see any bits that are set that were not already set before. That's what I mean by your code can only ever remove bits, not add new bits. Your example above is just cleaning up dumb user usage, since that is exactly equivalent (because our original mask is full-featured) to the user instead saying "-arch_3_1,-vsx". Ie, we only remove bits in the end. Adhemerval confirmed that this code is only run once, so my concern that your code can't add new bits to the hwcap masks isn't a problem, since our hwcap masks start out full-featured and there is no way for them to have hwcap bits removed before your code is executed. Peter
diff --git a/manual/tunables.texi b/manual/tunables.texi index 4ca0e42a11..776fd93fd9 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -513,7 +513,10 @@ On s390x, the supported HWCAP and STFLE features can be found in @code{sysdeps/s390/cpu-features.c}. In addition the user can also set a CPU arch-level like @code{z13} instead of single HWCAP and STFLE features. -This tunable is specific to i386, x86-64 and s390x. +On powerpc, the supported HWCAP and HWCAP2 features can be found in +@code{sysdeps/powerpc/dl-procinfo.c}. + +This tunable is specific to i386, x86-64, s390x and powerpc. @end deftp @deftp Tunable glibc.cpu.cached_memopt diff --git a/sysdeps/powerpc/cpu-features.c b/sysdeps/powerpc/cpu-features.c index 0ef3cf89d2..40d945ba03 100644 --- a/sysdeps/powerpc/cpu-features.c +++ b/sysdeps/powerpc/cpu-features.c @@ -19,14 +19,103 @@ #include <stdint.h> #include <cpu-features.h> #include <elf/dl-tunables.h> +#include <unistd.h> +#include <string.h> +#define MEMCMP_DEFAULT memcmp +#define STRLEN_DEFAULT strlen + +static void +TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) +{ + /* The current IFUNC selection is always using the most recent + features which are available via AT_HWCAP or AT_HWCAP2. But in + some scenarios it is useful to adjust this selection. + + The environment variable: + + GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,.... + + Can be used to enable HWCAP/HWCAP2 feature yyy, disable HWCAP/HWCAP2 + feature xxx, where the feature name is case-sensitive and has to match + the ones mentioned in the file{sysdeps/powerpc/dl-procinfo.c}. */ + + /* Copy the features from dl_powerpc_cpu_features, which contains the + features provided by AT_HWCAP and AT_HWCAP2. */ + struct cpu_features *cpu_features = &GLRO(dl_powerpc_cpu_features); + unsigned long int tcbv_hwcap = cpu_features->hwcap; + unsigned long int tcbv_hwcap2 = cpu_features->hwcap2; + const char *token = valp->strval; + do + { + const char *token_end, *feature; + bool disable; + size_t token_len, i, feature_len; + /* Find token separator or end of string. */ + for (token_end = token; *token_end != ','; token_end++) + if (*token_end == '\0') + break; + + /* Determine feature. */ + token_len = token_end - token; + if (*token == '-') + { + disable = true; + feature = token + 1; + feature_len = token_len - 1; + } + else + { + disable = false; + feature = token; + feature_len = token_len; + } + for (i = 0; hwcap_tunables[i].name != NULL; ++i) + { + /* Check the tunable name on the supported list. */ + if (STRLEN_DEFAULT (hwcap_tunables[i].name) == feature_len + && MEMCMP_DEFAULT (feature, hwcap_tunables[i].name, feature_len) + == 0) + { + /* Update the hwcap and hwcap2 bits. */ + if (disable) + { + /* Id is 1 for hwcap2 tunable. */ + if (hwcap_tunables[i].id) + cpu_features->hwcap2 &= ~(hwcap_tunables[i].mask); + else + cpu_features->hwcap &= ~(hwcap_tunables[i].mask); + } + else + { + /* Enable the features and also checking that no unsupported + features were enabled by user. */ + if (hwcap_tunables[i].id) + cpu_features->hwcap2 |= (tcbv_hwcap2 & hwcap_tunables[i].mask); + else + cpu_features->hwcap |= (tcbv_hwcap & hwcap_tunables[i].mask); + } + } + } + token += token_len; + /* ... and skip token separator for next round. */ + if (*token == ',') token++; + } + while (*token != '\0'); +} static inline void -init_cpu_features (struct cpu_features *cpu_features) +init_cpu_features (struct cpu_features *cpu_features, uint64_t hwcaps[]) { + /* Fill the cpu_features with the supported hwcaps + which are set by __tcb_parse_hwcap_and_convert_at_platform. */ + cpu_features->hwcap = hwcaps[0]; + cpu_features->hwcap2 = hwcaps[1]; /* Default is to use aligned memory access on optimized function unless tunables is enable, since for this case user can explicit disable unaligned optimizations. */ int32_t cached_memfunc = TUNABLE_GET (glibc, cpu, cached_memopt, int32_t, NULL); cpu_features->use_cached_memopt = (cached_memfunc > 0); + TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *, + TUNABLE_CALLBACK (set_hwcaps)); } diff --git a/sysdeps/powerpc/cpu-features.h b/sysdeps/powerpc/cpu-features.h index d316dc3d64..6839c3085b 100644 --- a/sysdeps/powerpc/cpu-features.h +++ b/sysdeps/powerpc/cpu-features.h @@ -19,10 +19,67 @@ # define __CPU_FEATURES_POWERPC_H #include <stdbool.h> +#include <sys/auxv.h> struct cpu_features { bool use_cached_memopt; + unsigned long int hwcap; + unsigned long int hwcap2; +}; + +static const struct +{ + const char *name; + int mask; + bool id; +} hwcap_tunables[] = { + /* AT_HWCAP tunable masks. */ + { "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0 }, + { "altivec", PPC_FEATURE_HAS_ALTIVEC, 0 }, + { "arch_2_05", PPC_FEATURE_ARCH_2_05, 0 }, + { "arch_2_06", PPC_FEATURE_ARCH_2_06, 0 }, + { "archpmu", PPC_FEATURE_PSERIES_PERFMON_COMPAT, 0 }, + { "booke", PPC_FEATURE_BOOKE, 0 }, + { "cellbe", PPC_FEATURE_CELL_BE, 0 }, + { "dfp", PPC_FEATURE_HAS_DFP, 0 }, + { "efpdouble", PPC_FEATURE_HAS_EFP_DOUBLE, 0 }, + { "efpsingle", PPC_FEATURE_HAS_EFP_SINGLE, 0 }, + { "fpu", PPC_FEATURE_HAS_FPU, 0 }, + { "ic_snoop", PPC_FEATURE_ICACHE_SNOOP, 0 }, + { "mmu", PPC_FEATURE_HAS_MMU, 0 }, + { "notb", PPC_FEATURE_NO_TB, 0 }, + { "pa6t", PPC_FEATURE_PA6T, 0 }, + { "power4", PPC_FEATURE_POWER4, 0 }, + { "power5", PPC_FEATURE_POWER5, 0 }, + { "power5+", PPC_FEATURE_POWER5_PLUS, 0 }, + { "power6x", PPC_FEATURE_POWER6_EXT, 0 }, + { "ppc32", PPC_FEATURE_32, 0 }, + { "ppc601", PPC_FEATURE_601_INSTR, 0 }, + { "ppc64", PPC_FEATURE_64, 0 }, + { "ppcle", PPC_FEATURE_PPC_LE, 0 }, + { "smt", PPC_FEATURE_SMT, 0 }, + { "spe", PPC_FEATURE_HAS_SPE, 0 }, + { "true_le", PPC_FEATURE_TRUE_LE, 0 }, + { "ucache", PPC_FEATURE_UNIFIED_CACHE, 0 }, + { "vsx", PPC_FEATURE_HAS_VSX, 0 }, + + /* AT_HWCAP2 tunable masks. */ + { "arch_2_07", PPC_FEATURE2_ARCH_2_07, 1 }, + { "dscr", PPC_FEATURE2_HAS_DSCR, 1 }, + { "ebb", PPC_FEATURE2_HAS_EBB, 1 }, + { "htm", PPC_FEATURE2_HAS_HTM, 1 }, + { "htm-nosc", PPC_FEATURE2_HTM_NOSC, 1 }, + { "htm-no-suspend", PPC_FEATURE2_HTM_NO_SUSPEND, 1 }, + { "isel", PPC_FEATURE2_HAS_ISEL, 1 }, + { "tar", PPC_FEATURE2_HAS_TAR, 1 }, + { "vcrypto", PPC_FEATURE2_HAS_VEC_CRYPTO, 1 }, + { "arch_3_00", PPC_FEATURE2_ARCH_3_00, 1 }, + { "ieee128", PPC_FEATURE2_HAS_IEEE128, 1 }, + { "darn", PPC_FEATURE2_DARN, 1 }, + { "scv", PPC_FEATURE2_SCV, 1 }, + { "arch_3_1", PPC_FEATURE2_ARCH_3_1, 1 }, + { "mma", PPC_FEATURE2_MMA, 1 }, }; #endif /* __CPU_FEATURES_H */ diff --git a/sysdeps/powerpc/dl-tunables.list b/sysdeps/powerpc/dl-tunables.list index 87d6235c75..807b7f8013 100644 --- a/sysdeps/powerpc/dl-tunables.list +++ b/sysdeps/powerpc/dl-tunables.list @@ -24,5 +24,8 @@ glibc { maxval: 1 default: 0 } + hwcaps { + type: STRING + } } } diff --git a/sysdeps/powerpc/hwcapinfo.c b/sysdeps/powerpc/hwcapinfo.c index e26e64d99e..f2c473c556 100644 --- a/sysdeps/powerpc/hwcapinfo.c +++ b/sysdeps/powerpc/hwcapinfo.c @@ -19,6 +19,7 @@ #include <unistd.h> #include <shlib-compat.h> #include <dl-procinfo.h> +#include <cpu-features.c> tcbhead_t __tcb __attribute__ ((visibility ("hidden"))); @@ -63,6 +64,9 @@ __tcb_parse_hwcap_and_convert_at_platform (void) else if (h1 & PPC_FEATURE_POWER5) h1 |= PPC_FEATURE_POWER4; + uint64_t array_hwcaps[] = { h1, h2 }; + init_cpu_features (&GLRO(dl_powerpc_cpu_features), array_hwcaps); + /* Consolidate both HWCAP and HWCAP2 into a single doubleword so that we can read both in a single load later. */ __tcb.hwcap = (h1 << 32) | (h2 & 0xffffffff); diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c index b4f80539e7..986c37d71e 100644 --- a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c @@ -21,6 +21,7 @@ #include <wchar.h> #include <ldsodefs.h> #include <ifunc-impl-list.h> +#include <cpu-features.h> size_t __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, @@ -28,7 +29,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, { size_t i = max; - unsigned long int hwcap = GLRO(dl_hwcap); + const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); + unsigned long int hwcap = features->hwcap; /* hwcap contains only the latest supported ISA, the code checks which is and fills the previous supported ones. */ if (hwcap & PPC_FEATURE_ARCH_2_06) diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h index 3dd00e02ee..a0bbd12012 100644 --- a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h @@ -16,6 +16,7 @@ <https://www.gnu.org/licenses/>. */ #include <ldsodefs.h> +#include <cpu-features.h> /* The code checks if _rtld_global_ro was realocated before trying to access the dl_hwcap field. The assembly is to make the compiler not optimize the @@ -32,11 +33,12 @@ # define __GLRO(value) GLRO(value) #endif -/* dl_hwcap contains only the latest supported ISA, the macro checks which is - and fills the previous ones. */ +/* Get the hardware information post the tunables set , the macro checks + it and fills the previous ones. */ #define INIT_ARCH() \ - unsigned long int hwcap = __GLRO(dl_hwcap); \ - unsigned long int __attribute__((unused)) hwcap2 = __GLRO(dl_hwcap2); \ + const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); \ + unsigned long int hwcap = features->hwcap; \ + unsigned long int __attribute__((unused)) hwcap2 = features->hwcap2; \ bool __attribute__((unused)) use_cached_memopt = \ __GLRO(dl_powerpc_cpu_features.use_cached_memopt); \ if (hwcap & PPC_FEATURE_ARCH_2_06) \ diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h index 9b8943bc91..449208e86f 100644 --- a/sysdeps/powerpc/powerpc64/dl-machine.h +++ b/sysdeps/powerpc/powerpc64/dl-machine.h @@ -27,7 +27,6 @@ #include <dl-tls.h> #include <sysdep.h> #include <hwcapinfo.h> -#include <cpu-features.c> #include <dl-static-tls.h> #include <dl-funcdesc.h> #include <dl-machine-rel.h> @@ -297,7 +296,6 @@ static inline void __attribute__ ((unused)) dl_platform_init (void) { __tcb_parse_hwcap_and_convert_at_platform (); - init_cpu_features (&GLRO(dl_powerpc_cpu_features)); } #endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index ebe9434052..fc26dd0e17 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -17,6 +17,7 @@ <https://www.gnu.org/licenses/>. */ #include <assert.h> +#include <cpu-features.h> #include <string.h> #include <wchar.h> #include <ldsodefs.h> @@ -27,9 +28,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, size_t max) { size_t i = max; - - unsigned long int hwcap = GLRO(dl_hwcap); - unsigned long int hwcap2 = GLRO(dl_hwcap2); + const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); + unsigned long int hwcap = features->hwcap; + unsigned long int hwcap2 = features->hwcap2; #ifdef SHARED int cacheline_size = GLRO(dl_cache_line_size); #endif
From: Mahesh Bodapati <mahesh.bodapati@ibm.com> This patch enables the option to influence hwcaps used by PowerPC. The environment variable, GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,-zzz...., can be used to enable CPU/ARCH feature yyy, disable CPU/ARCH feature xxx and zzz, where the feature name is case-sensitive and has to match the ones mentioned in the file{sysdeps/powerpc/dl-procinfo.c}. Note that the tunable only handles the features which are really used in the IFUNC selection. All others are ignored as the values are only used inside glibc. --- manual/tunables.texi | 5 +- sysdeps/powerpc/cpu-features.c | 91 ++++++++++++++++++- sysdeps/powerpc/cpu-features.h | 57 ++++++++++++ sysdeps/powerpc/dl-tunables.list | 3 + sysdeps/powerpc/hwcapinfo.c | 4 + .../power4/multiarch/ifunc-impl-list.c | 4 +- .../powerpc32/power4/multiarch/init-arch.h | 10 +- sysdeps/powerpc/powerpc64/dl-machine.h | 2 - .../powerpc64/multiarch/ifunc-impl-list.c | 7 +- 9 files changed, 171 insertions(+), 12 deletions(-)