Message ID | PAWPR08MB898285FB64F7186879479C94832A2@PAWPR08MB8982.eurprd08.prod.outlook.com |
---|---|
State | New |
Series | AArch64: Check kernel version for SVE ifuncs |
On 13/03/24 11:31, Wilco Dijkstra wrote:
>
> Older Linux kernels may disable SVE after certain syscalls. Calling the
> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> As a result, applications with a high use of syscalls may run slower with
> the SVE memcpy. Avoid this by checking the kernel version and enable the
> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.
>
> Passes regress, OK for commit?

How does it handle backports? Parsing kernel version is really not ideal
(and that's why we removed it on previous version), can't kernel provide
this information through any means (as powerpc does for HTM without
suspend, PPC_FEATURE2_HTM_NO_SUSPEND)?
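For comparison, the powerpc mechanism referred to here is a plain hwcap bit, so
user space never has to parse a version string. A minimal sketch of such a
check (powerpc-only, since PPC_FEATURE2_HTM_NO_SUSPEND is only defined there;
this snippet is not part of the patch under review):

  #include <stdbool.h>
  #include <sys/auxv.h>   /* getauxval, AT_HWCAP2, and PPC_FEATURE2_* on powerpc.  */

  /* True if the kernel advertises HTM without the suspend problem; the
     kernel sets the bit itself, so no uname parsing is needed.  */
  static bool
  htm_no_suspend (void)
  {
    return (getauxval (AT_HWCAP2) & PPC_FEATURE2_HTM_NO_SUSPEND) != 0;
  }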
The 03/13/2024 14:31, Wilco Dijkstra wrote:
>
> Older Linux kernels may disable SVE after certain syscalls. Calling the
> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> As a result, applications with a high use of syscalls may run slower with
> the SVE memcpy. Avoid this by checking the kernel version and enable the
> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.

a kernel version check at startup is a bit ugly. i think we should use
the aarch64 kernel-features.h to define an __ASSUME_FAST_SVE or similar
based on the __LINUX_KERNEL_VERSION (so once we raise the min kernel
version to 6.2 the startup check is not done anymore or users can
config --enable-kernel=6.2.0).

(fast_sve may not be the best name, 'no_excessive_sve_traps' or
'sve_stays_enabled' may be better, but i'm not clear on the exact kernel
behaviour we want here so let you decide)

> Passes regress, OK for commit?

patch looks ok otherwise.

> +/* Parse kernel version without calling any library functions.
> +   Allow 2 digits for kernel version and 3 digits for major version,
> +   separated by '.': "kk.mmm.".

looks reasonable.

> +  /* Prefer using SVE in string ifuncs from Linux 6.2 onwards.  */
> +  cpu_features->prefer_sve_ifuncs =
> +    cpu_features->sve && kernel_version () >= 6002;

e.g. this can be '&& fast_sve ()' and define fast_sve based on
__ASSUME_FAST_SVE.
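A rough sketch of the kernel-features.h side of that suggestion; the macro name
__ASSUME_FAST_SVE is only the placeholder used in this thread, not an existing
glibc symbol, and the file contents shown are illustrative:

  /* sysdeps/unix/sysv/linux/aarch64/kernel-features.h (sketch).  */
  #include_next <kernel-features.h>

  /* __LINUX_KERNEL_VERSION encodes the configured minimum kernel as
     0xMMmmpp, so --enable-kernel=6.2.0 gives 0x060200.  From Linux 6.2
     the kernel no longer disables SVE across syscalls, so the SVE
     ifuncs can be preferred without a runtime check.  */
  #if __LINUX_KERNEL_VERSION >= 0x060200
  # define __ASSUME_FAST_SVE 1
  #endif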
The 03/13/2024 15:12, Adhemerval Zanella Netto wrote:
> On 13/03/24 11:31, Wilco Dijkstra wrote:
> >
> > Older Linux kernels may disable SVE after certain syscalls. Calling the
> > SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> > As a result, applications with a high use of syscalls may run slower with
> > the SVE memcpy. Avoid this by checking the kernel version and enable the
> > SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.
> >
> > Passes regress, OK for commit?
>
> How does it handle backports? Parsing kernel version is really not ideal
> (and that's why we removed it on previous version), can't kernel provide
> this information through any means (as powerpc does for HTM without
> suspend, PPC_FEATURE2_HTM_NO_SUSPEND)?

the check is for performance only, not correctness.

if we detect a bad kernel as good, that can cause significant
slowdown due to the traps depending on the workload.
(at least this is the claim, i haven't verified, but plausible)

if we detect a good kernel as bad (e.g. because of kernel
backports), that is acceptable amount of slowdown (the fallback
memcpy is reasonably fast).

i think the kernel version check is ugly, but in this case it
makes sense: direct detection of the behaviour is hard, the
version check should reliably avoid detecting a bad kernel as
good, and it is better than always assuming a bad kernel.
On Wed, Mar 13, 2024 at 7:32 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Older Linux kernels may disable SVE after certain syscalls. Calling the
> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> As a result, applications with a high use of syscalls may run slower with
> the SVE memcpy. Avoid this by checking the kernel version and enable the
> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.
>
> Passes regress, OK for commit?
>
> @@ -47,7 +47,7 @@ select_memcpy_ifunc (void)
>      {
>        if (IS_A64FX (midr))
>          return __memcpy_a64fx;
> -      return __memcpy_sve;
> +      return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;

Does it make sense to check prefer_sve_ifuncs before checking IS_A64FX
because this is about the use of SVE registers and the kernel version
and the a64fx versions use SVE too?

That is doing:

  if (!prefer_sve_ifuncs)
    return __memcpy_generic;
  if (IS_A64FX (midr))
    return __memcpy_a64fx;
  return __memcpy_sve;

Thanks,
Andrew Pinski
On 13/03/24 16:25, Szabolcs Nagy wrote:
> The 03/13/2024 15:12, Adhemerval Zanella Netto wrote:
>> How does it handle backports? Parsing kernel version is really not ideal
>> (and that's why we removed it on previous version), can't kernel provide
>> this information through any means (as powerpc does for HTM without
>> suspend, PPC_FEATURE2_HTM_NO_SUSPEND)?
>
> the check is for performance only, not correctness.
>
> if we detect a bad kernel as good, that can cause significant
> slowdown due to the traps depending on the workload.
> (at least this is the claim, i haven't verified, but plausible)
>
> if we detect a good kernel as bad (e.g. because of kernel
> backports), that is acceptable amount of slowdown (the fallback
> memcpy is reasonably fast).
>
> i think the kernel version check is ugly, but in this case it
> makes sense: direct detection of the behaviour is hard, the
> version check should reliably avoid detecting a bad kernel as
> good, and it is better than always assuming a bad kernel.

Yes, I understand this. My point is it won't be possible to backport this
kernel fix to get the performance improvement of the SVE routines without
hacking glibc as well. It is not really a blocker, but I would expect the
kernel to properly advertise such a performance change that might interfere
with ifunc selection. Maybe we can add a tunable to force SVE selection,
but I don't have a strong opinion.

And I don't think __ASSUME_FAST_SVE would work well here, it means it
would always detect a good kernel even when running on an older one
(I am not sure how usual this is). The minimum supported kernel
version can work to ensure that this check won't be necessary, but in
this case we won't really need this test anyway.

The _dl_discover_osversion (removed by b46d250656794e63a2946c481fda)
used to check the PT_NOTE, and fall back to uname or
/proc/sys/kernel/osrelease. I think we will need at least the osrelease
fallback in the case of uname syscall filtering.
Hi Andrew,

> Does it make sense to check prefer_sve_ifuncs before checking IS_A64FX
> because this is about the use of SVE registers and the kernel version
> and the a64fx versions use SVE too?
>
> That is doing:
>
>   if (!prefer_sve_ifuncs)
>     return __memcpy_generic;
>   if (IS_A64FX (midr))
>     return __memcpy_a64fx;
>   return __memcpy_sve;

The A64FX memcpy shows a major speedup due to the very wide SVE registers.
It runs floating point code, not general purpose server code. Since the
benefit of SVE is larger and the likelihood of system calls far lower,
there is no reason to change the default for A64FX.

Cheers,
Wilco
The 03/13/2024 16:55, Adhemerval Zanella Netto wrote:
> Yes, I understand this. My point is it won't be possible to backport this
> kernel fix to get the performance improvement of the SVE routines without
> hacking glibc as well. It is not really a blocker, but I would expect the
> kernel to properly advertise such a performance change that might interfere
> with ifunc selection. Maybe we can add a tunable to force SVE selection,
> but I don't have a strong opinion.

right.

i can ask linux devs, if they are happy introducing an easy
check in future kernels or if the change is easy to backport
at all or we don't want to encourage that.

> And I don't think __ASSUME_FAST_SVE would work well here, it means it
> would always detect a good kernel even when running on an older one

why? if it is set based on min supported kernel version then
running on older kernel is a bug. if it can be overridden by
a tunable then it is a user error if the tunable is wrong.

> (I am not sure how usual this is). The minimum supported kernel
> version can work to ensure that this check won't be necessary, but in
> this case we won't really need this test anyway.

you mean after min version is increased the check can be removed?
i expect all __ASSUME* based on min linux version works like that
and it's useful to have the __ASSUME exactly to be able to find
which code can be removed after a min version increase.

i don't know if distros actually adjust the min version in their
glibc, i guess that would be risky if it should work in containers,
so 6.2 min version is probably far in the future.

> The _dl_discover_osversion (removed by b46d250656794e63a2946c481fda)
> used to check the PT_NOTE, and fall back to uname or
> /proc/sys/kernel/osrelease. I think we will need at least the osrelease
> fallback in the case of uname syscall filtering.

i didn't know about the vdso PT_NOTE.

we could check osrelease there i guess. fallback is not critical
as slightly slower memcpy is not the end of the world, and there
is a balance how complicated code we want to maintain here.
* Adhemerval Zanella Netto:

> Yes, I understand this. My point is it won't be possible to backport this
> kernel fix to get the performance improvement of the SVE routines without
> hacking glibc as well. It is not really a blocker, but I would expect the
> kernel to properly advertise such a performance change that might interfere
> with ifunc selection. Maybe we can add a tunable to force SVE selection,
> but I don't have a strong opinion.

I think the kernel change went in the other direction: it eliminated the
SVE trap after system calls.

Thanks,
Florian
* Wilco Dijkstra:

> Older Linux kernels may disable SVE after certain syscalls. Calling the
> SVE-optimized memcpy afterwards will then cause a trap to reenable SVE.
> As a result, applications with a high use of syscalls may run slower with
> the SVE memcpy. Avoid this by checking the kernel version and enable the
> SVE-optimized memcpy/memmove ifuncs only on Linux kernel 6.2 or newer.

Given that the existing SVE users have not complained about this, is it
really a good idea to disable SVE string functions for them? Not
everyone interleaves system calls and string functions in inner loops.

> +/* Parse kernel version without calling any library functions.
> +   Allow 2 digits for kernel version and 3 digits for major version,
> +   separated by '.': "kk.mmm.".
> +   Return kernel version * 1000 + major version, or -1 on failure.  */

I think you should parse the versions you exclude because you know their
syntax. We tried to predict the layout of future kernel version strings
before and failed.

Thanks,
Florian
On 14/03/24 05:35, Szabolcs Nagy wrote:
> The 03/13/2024 16:55, Adhemerval Zanella Netto wrote:
>> And I don't think __ASSUME_FAST_SVE would work well here, it means it
>> would always detect a good kernel even when running on an older one
>
> why? if it is set based on min supported kernel version then
> running on older kernel is a bug. if it can be overridden by
> a tunable then it is a user error if the tunable is wrong.
>
>> (I am not sure how usual this is). The minimum supported kernel
>> version can work to ensure that this check won't be necessary, but in
>> this case we won't really need this test anyway.
>
> you mean after min version is increased the check can be removed?
> i expect all __ASSUME* based on min linux version works like that
> and it's useful to have the __ASSUME exactly to be able to find
> which code can be removed after a min version increase.
>
> i don't know if distros actually adjust the min version in their
> glibc, i guess that would be risky if it should work in containers,
> so 6.2 min version is probably far in the future.

All major distros I am aware of do not set --enable-kernel, so SVE will
only be selected if someone builds glibc with --enable-kernel=6.2 or when
we raise the minimum version to 6.2. Not a deal breaker, but the SVE
routines will end up not being actively used for a long time.

>> The _dl_discover_osversion (removed by b46d250656794e63a2946c481fda)
>> used to check the PT_NOTE, and fall back to uname or
>> /proc/sys/kernel/osrelease. I think we will need at least the osrelease
>> fallback in the case of uname syscall filtering.
>
> i didn't know about the vdso PT_NOTE.
>
> we could check osrelease there i guess. fallback is not critical
> as slightly slower memcpy is not the end of the world, and there
> is a balance how complicated code we want to maintain here.

I agree, maybe uname alone should suffice.
The 03/14/2024 10:47, Adhemerval Zanella Netto wrote:
> All major distros I am aware of do not set --enable-kernel, so SVE will
> only be selected if someone builds glibc with --enable-kernel=6.2 or when
> we raise the minimum version to 6.2. Not a deal breaker, but the SVE
> routines will end up not being actively used for a long time.

the __ASSUME would just gate the runtime version check,
we would still do a kernel version check so sve will be
used on new kernels, the __ASSUME is there so we don't
forget to remove the check when the min version is high
enough.
On 14/03/24 11:26, Szabolcs Nagy wrote:
> the __ASSUME would just gate the runtime version check,
> we would still do a kernel version check so sve will be
> used on new kernels, the __ASSUME is there so we don't
> forget to remove the check when the min version is high
> enough.

Ah right, I assumed you were suggesting to use only the __ASSUME check
instead of the kernel version. Sounds reasonable to me.
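Putting the two pieces of that suggestion together, the gating in
init_cpu_features could look roughly like the sketch below. This is
illustrative only: __ASSUME_FAST_SVE is still the hypothetical macro from
earlier in the thread, and kernel_version () is the helper from the patch.

  /* Sketch: skip the runtime uname check once the configured minimum
     kernel already implies the SVE fix is present.  */
  static inline bool
  fast_sve (void)
  {
  #ifdef __ASSUME_FAST_SVE
    return true;                       /* --enable-kernel=6.2.0 or newer.  */
  #else
    return kernel_version () >= 6002;  /* Runtime check on older minimums.  */
  #endif
  }

  /* ... and in init_cpu_features:  */
  cpu_features->prefer_sve_ifuncs = cpu_features->sve && fast_sve ();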
Hi Florian,

> Given that the existing SVE users have not complained about this, is it
> really a good idea to disable SVE string functions for them? Not
> everyone interleaves system calls and string functions in inner loops.

People have found significant regressions when trying the SVE memcpy [1],
and the kernel commit mentions 70% overhead in microbenchmarks [2].

Also distros appear to use a wide range of kernel versions. I've seen the
same GLIBC version used with a 5.11 kernel all the way up to 6.5 in the
same version of a distro - and these are all standard installations...

Hence we need some kind of workaround to deal with old kernels before
adding more uses of SVE (new SVE ifuncs or backports of existing ones).

> I think you should parse the versions you exclude because you know their
> syntax. We tried to predict the layout of future kernel version strings
> before and failed.

Yes, that is a good idea. So we could just scan for the "k.m." format and,
if it doesn't match or if uname fails, treat it like a new kernel. It's not
a correctness issue, so if someone uses a different format or blocks
uname() in a container then it's fine to assume it is a new kernel and
enable the SVE ifuncs.

Cheers,
Wilco

[1] https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1999551/comments/45
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/arm64/kernel/fpsimd.c?id=8c845e2731041f0fdf9287dea80b039b26332c9f
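A minimal sketch of that "fail open" policy (not the committed glibc code;
the function name and structure are illustrative): only a release string that
clearly parses as "major.minor" and is known to be old disables the SVE
ifuncs, while an unparseable string or a filtered uname() counts as a new
kernel.

  #include <stdbool.h>
  #include <sys/utsname.h>

  static bool
  old_kernel_without_sve_fix (void)
  {
    struct utsname buf;
    unsigned int major = 0, minor = 0;
    const char *p;

    if (uname (&buf) < 0)
      return false;               /* uname blocked: assume a new kernel.  */

    for (p = buf.release; *p >= '0' && *p <= '9'; p++)
      major = major * 10 + (*p - '0');
    if (p == buf.release || *p++ != '.')
      return false;               /* Unknown format: assume a new kernel.  */
    for (; *p >= '0' && *p <= '9'; p++)
      minor = minor * 10 + (*p - '0');

    /* Linux 6.2 stopped disabling SVE across syscalls.  */
    return major < 6 || (major == 6 && minor < 2);
  }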
* Wilco Dijkstra:

> People have found significant regressions when trying the SVE memcpy [1],
> and the kernel commit mentions 70% overhead in microbenchmarks [2].
>
> Also distros appear to use a wide range of kernel versions. I've seen the
> same GLIBC version used with a 5.11 kernel all the way up to 6.5 in the
> same version of a distro - and these are all standard installations...

I generally prefer we fix the component that has the bug. With that
approach, you'd have to use your distribution contacts to request a
backport.

The workaround introduces a performance hit for those users who today
benefit from the SVE string operations, but do not suffer from the
syscall interaction issue.

In the end, it is your port, but version-based workarounds are really
unusual for glibc.

Thanks,
Florian
Hi Florian,

> I generally prefer we fix the component that has the bug. With that
> approach, you'd have to use your distribution contacts to request a
> backport.

The issue is present since SVE was added in 4.18, so it affects many
kernels. It is unlikely to be easy to backport since it relies on
various other changes to how syscalls and register state are dealt
with.

Originally it was thought to be OK - and it would be if you only ever
use SVE in vectorized loops. However, using a few SVE instructions
everywhere in applications breaks that model. Plus all the security
features have increased the overhead of kernel traps in recent years...

> The workaround introduces a performance hit for those users who today
> benefit from the SVE string operations, but do not suffer from the
> syscall interaction issue.

Today everybody has the performance hit of not getting the SVE memcpy at
all, as this issue has blocked backports. For example I don't see the SVE
memcpy being used on Neoverse V1.

> In the end, it is your port, but version-based workarounds are really
> unusual for glibc.

Agreed, I'm hoping we can remove the check in a few years' time when all
distros have moved to a minimum kernel >= 6.2 in their supported versions.

Cheers,
Wilco
* Wilco Dijkstra:

> The issue is present since SVE was added in 4.18, so it affects many
> kernels. It is unlikely to be easy to backport since it relies on
> various other changes to how syscalls and register state are dealt
> with.
>
> Originally it was thought to be OK - and it would be if you only ever
> use SVE in vectorized loops. However, using a few SVE instructions
> everywhere in applications breaks that model. Plus all the security
> features have increased the overhead of kernel traps in recent years...

Yes, that might be the case.

Can we hold off merging this for a bit? I want to cross-check a few
things internally before we go with the version check as proposed, if
that's possible.

Thanks,
Florian
* Florian Weimer:

> * Wilco Dijkstra:
>
>> The issue is present since SVE was added in 4.18, so it affects many
>> kernels. It is unlikely to be easy to backport since it relies on
>> various other changes to how syscalls and register state are dealt
>> with.
>
> Yes, that might be the case.
>
> Can we hold off merging this for a bit? I want to cross-check a few
> things internally before we go with the version check as proposed, if
> that's possible.

I would suggest to check for version >= 6.2 || version == 5.14.0. At
this point, people running 5.14 are very likely on the el9 kernel or a
derivative, and we have backported the upstream fix into it:

  arm64/sve: Leave SVE enabled on syscall if we don't context switch
  <https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/commit/564deeeb8038b29b91aafe2871c06aaa299145b1>

There are no plans to backport this into the el8 kernel that I know of,
and the window for such changes is more or less closed at this point, so
a similar check for 4.18.0 is not needed.

I understand that this makes the check even uglier, but that's the
nature of version checks. 8-(

Thanks,
Florian
Hi Florian,

> I would suggest to check for version >= 6.2 || version == 5.14.0. At
> this point, people running 5.14 are very likely on the el9 kernel or a
> derivative, and we have backported the upstream fix into it:

Thanks for checking that, I've updated the patch to exclude 5.14.0.

> I understand that this makes the check even uglier, but that's the
> nature of version checks. 8-(

Yes, but it's unavoidable if the kernel doesn't add performance-related
HWCAPs for us to check. I cleaned up the parser a little by using the
same 8:8:8 format as used by __LINUX_KERNEL_VERSION.

Cheers,
Wilco
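The updated patch is not shown in this thread (the version below is still the
original one using "kernel * 1000 + major"). A rough sketch of the check being
described, with the version encoded 8:8:8 the way __LINUX_KERNEL_VERSION is;
the helper name and exact structure are illustrative, not the committed code:

  /* Assumes kernel_version () returns the release encoded as
     (major << 16) | (minor << 8) | patch, or -1 on failure.  */
  static inline bool
  prefer_sve_kernel (int version)
  {
    /* An unparseable release string (or a blocked uname) is treated as a
       new kernel, as discussed earlier in the thread.  */
    if (version < 0)
      return true;
    /* Linux 6.2+ keeps SVE enabled across syscalls; 5.14.0 is assumed to
       be the el9 kernel, which carries a backport of the fix.  */
    return version >= 0x060200 || version == 0x050e00;
  }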
diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
index 77a782422af1b6e4b2af32bfebfda37874111510..5f2da91ebbd0adafb0d84ec503b0f902f566da5a 100644
--- a/sysdeps/aarch64/cpu-features.h
+++ b/sysdeps/aarch64/cpu-features.h
@@ -71,6 +71,7 @@ struct cpu_features
   /* Currently, the GLIBC memory tagging tunable only defines 8 bits. */
   uint8_t mte_state;
   bool sve;
+  bool prefer_sve_ifuncs;
   bool mops;
 };
 
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index c52860efb22d70eb4bdf356781f51c7de8ec67dc..61dc40088f4d9e5e06b57bdc7d54bde1e2a686a4 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -36,5 +36,7 @@
     MTE_ENABLED ();                                                          \
   bool __attribute__((unused)) sve =                                         \
     GLRO(dl_aarch64_cpu_features).sve;                                       \
+  bool __attribute__((unused)) prefer_sve_ifuncs =                           \
+    GLRO(dl_aarch64_cpu_features).prefer_sve_ifuncs;                         \
   bool __attribute__((unused)) mops =                                        \
     GLRO(dl_aarch64_cpu_features).mops;
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index d12eccfca51f4bcfef6ccf5aa286edb301e361ac..ce53567dab33c2f00b89b4069235abd4651811a6 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -47,7 +47,7 @@ select_memcpy_ifunc (void)
     {
       if (IS_A64FX (midr))
         return __memcpy_a64fx;
-      return __memcpy_sve;
+      return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
     }
 
   if (IS_THUNDERX (midr))
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 2081eeb4d40e0240e67a7b26b64576f44eaf18e3..fe95037be391896c7670ef606bf4d3ba7dfb6a00 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -47,7 +47,7 @@ select_memmove_ifunc (void)
     {
       if (IS_A64FX (midr))
         return __memmove_a64fx;
-      return __memmove_sve;
+      return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
     }
 
   if (IS_THUNDERX (midr))
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index b1a3f673f067280bdacfddd92723a81e418023e5..13b02c45df80b493516b3c9d4acbbbffaa47af92 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -21,6 +21,7 @@
 #include <sys/auxv.h>
 #include <elf/dl-hwcaps.h>
 #include <sys/prctl.h>
+#include <sys/utsname.h>
 #include <dl-tunables-parse.h>
 
 #define DCZID_DZP_MASK (1 << 4)
@@ -62,6 +63,41 @@ get_midr_from_mcpu (const struct tunable_str_t *mcpu)
   return UINT64_MAX;
 }
 
+/* Parse kernel version without calling any library functions.
+   Allow 2 digits for kernel version and 3 digits for major version,
+   separated by '.': "kk.mmm.".
+   Return kernel version * 1000 + major version, or -1 on failure.  */
+
+static inline int
+kernel_version (void)
+{
+  struct utsname buf;
+  const char *p = &buf.release[0];
+  int kernel = 0;
+  int major = 0;
+
+  if (__uname (&buf) < 0)
+    return -1;
+
+  if (*p >= '0' && *p <= '9')
+    kernel = (kernel * 10) + *p++ - '0';
+  if (*p >= '0' && *p <= '9')
+    kernel = (kernel * 10) + *p++ - '0';
+  if (*p != '.')
+    return -1;
+  p++;
+  if (*p >= '0' && *p <= '9')
+    major = (major * 10) + *p++ - '0';
+  if (*p >= '0' && *p <= '9')
+    major = (major * 10) + *p++ - '0';
+  if (*p >= '0' && *p <= '9')
+    major = (major * 10) + *p++ - '0';
+  if (*p != '.' && *p != '\0')
+    return -1;
+
+  return kernel * 1000 + major;
+}
+
 static inline void
 init_cpu_features (struct cpu_features *cpu_features)
 {
@@ -126,6 +162,10 @@ init_cpu_features (struct cpu_features *cpu_features)
   /* Check if SVE is supported.  */
   cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
 
+  /* Prefer using SVE in string ifuncs from Linux 6.2 onwards.  */
+  cpu_features->prefer_sve_ifuncs =
+    cpu_features->sve && kernel_version () >= 6002;
+
   /* Check if MOPS is supported.  */
   cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
 }