Message ID | 20240524173851.2483952-2-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v2,1/2] x86: Improve large memset perf with non-temporal stores [RHEL-29312] | expand |
On Fri, May 24, 2024 at 12:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The tuning for non-temporal stores for memset vs memcpy is not always > the same. This includes both the exact value and whether non-temporal > stores are profitable at all for a given arch. > > This patch add `x86_memset_non_temporal_threshold`. Currently we > disable non-temporal stores for non Intel vendors as the only > benchmarks showing its benefit have been on Intel hardware. > --- > manual/tunables.texi | 16 +++++++++++++++- > sysdeps/x86/cacheinfo.h | 8 +++++++- > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++ > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++ > sysdeps/x86/dl-tunables.list | 3 +++ > sysdeps/x86/include/cpu-features.h | 4 +++- > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++--- > 7 files changed, 49 insertions(+), 6 deletions(-) > > diff --git a/manual/tunables.texi b/manual/tunables.texi > index baaf751721..8dd02d8149 100644 > --- a/manual/tunables.texi > +++ b/manual/tunables.texi > @@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647) > glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff) > glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff) > glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) > +glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) > glibc.cpu.x86_shstk: > glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff) > glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff) > @@ -495,7 +496,8 @@ thread stack originally backup by Huge Pages to default pages. 
> @cindex shared_cache_size tunables > @cindex tunables, shared_cache_size > @cindex non_temporal_threshold tunables > -@cindex tunables, non_temporal_threshold > +@cindex memset_non_temporal_threshold tunables > +@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold > > @deftp {Tunable namespace} glibc.cpu > Behavior of @theglibc{} can be tuned to assume specific hardware capabilities > @@ -574,6 +576,18 @@ like memmove and memcpy. > This tunable is specific to i386 and x86-64. > @end deftp > > +@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold > +The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows > +the user to set threshold in bytes for non temporal store in > +memset. Non temporal stores give a hint to the hardware to move data > +directly to memory without displacing other data from the cache. This > +tunable is used by some platforms to determine when to use non > +temporal stores memset. > + > +This tunable is specific to i386 and x86-64. > +@end deftp > + > + > @deftp Tunable glibc.cpu.x86_rep_movsb_threshold > The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to > set threshold in bytes to start using "rep movsb". The value must be > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h > index ab73556772..83491607c7 100644 > --- a/sysdeps/x86/cacheinfo.h > +++ b/sysdeps/x86/cacheinfo.h > @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024; > long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; > long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; > > -/* Threshold to use non temporal store. */ > +/* Threshold to use non temporal store in memmove. */ > long int __x86_shared_non_temporal_threshold attribute_hidden; > > +/* Threshold to use non temporal store in memset. */ > +long int __x86_memset_non_temporal_threshold attribute_hidden; > + > /* Threshold to use Enhanced REP MOVSB. 
*/ > long int __x86_rep_movsb_threshold attribute_hidden = 2048; > > @@ -77,6 +80,9 @@ init_cacheinfo (void) > __x86_shared_non_temporal_threshold > = cpu_features->non_temporal_threshold; > > + __x86_memset_non_temporal_threshold > + = cpu_features->memset_non_temporal_threshold; > + > __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; > __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; > __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > index 5a98f70364..d375a7cba6 100644 > --- a/sysdeps/x86/dl-cacheinfo.h > +++ b/sysdeps/x86/dl-cacheinfo.h > @@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > rep_movsb_threshold = 2112; > > + /* Non-temporal stores in memset have only been tested on Intel hardware. > + Until we benchmark data on other x86 processor, disable non-temporal > + stores in memset. */ > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > + if (cpu_features->basic.kind == arch_kind_intel) > + memset_non_temporal_threshold = non_temporal_threshold; > + > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > cases slower than the vectorized path (and for some alignments, > it is really slow, check BZ #30994). 
*/ > @@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > && tunable_size <= maximum_non_temporal_threshold) > non_temporal_threshold = tunable_size; > > + tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); > + if (tunable_size > minimum_non_temporal_threshold > + && tunable_size <= maximum_non_temporal_threshold) > + memset_non_temporal_threshold = tunable_size; > + > tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); > if (tunable_size > minimum_rep_movsb_threshold) > rep_movsb_threshold = tunable_size; > @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > minimum_non_temporal_threshold, > maximum_non_temporal_threshold); > + TUNABLE_SET_WITH_BOUNDS ( > + x86_memset_non_temporal_threshold, memset_non_temporal_threshold, > + minimum_non_temporal_threshold, maximum_non_temporal_threshold); > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > minimum_rep_movsb_threshold, SIZE_MAX); > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > cpu_features->data_cache_size = data; > cpu_features->shared_cache_size = shared; > cpu_features->non_temporal_threshold = non_temporal_threshold; > + cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; > cpu_features->rep_movsb_threshold = rep_movsb_threshold; > cpu_features->rep_stosb_threshold = rep_stosb_threshold; > cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; > diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c > index ceafde9481..49eeb5f70a 100644 > --- a/sysdeps/x86/dl-diagnostics-cpu.c > +++ b/sysdeps/x86/dl-diagnostics-cpu.c > @@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void) > cpu_features->shared_cache_size); > print_cpu_features_value 
("non_temporal_threshold", > cpu_features->non_temporal_threshold); > + print_cpu_features_value ("memset_non_temporal_threshold", > + cpu_features->memset_non_temporal_threshold); > print_cpu_features_value ("rep_movsb_threshold", > cpu_features->rep_movsb_threshold); > print_cpu_features_value ("rep_movsb_stop_threshold", > diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list > index 7d82da0dec..a0a1299592 100644 > --- a/sysdeps/x86/dl-tunables.list > +++ b/sysdeps/x86/dl-tunables.list > @@ -30,6 +30,9 @@ glibc { > x86_non_temporal_threshold { > type: SIZE_T > } > + x86_memset_non_temporal_threshold { > + type: SIZE_T > + } > x86_rep_movsb_threshold { > type: SIZE_T > # Since there is overhead to set up REP MOVSB operation, REP > diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h > index cd7bd27cf3..aaae44f0e1 100644 > --- a/sysdeps/x86/include/cpu-features.h > +++ b/sysdeps/x86/include/cpu-features.h > @@ -944,8 +944,10 @@ struct cpu_features > /* Shared cache size for use in memory and string routines, typically > L2 or L3 size. */ > unsigned long int shared_cache_size; > - /* Threshold to use non temporal store. */ > + /* Threshold to use non temporal store in memmove. */ > unsigned long int non_temporal_threshold; > + /* Threshold to use non temporal store in memset. */ > + unsigned long int memset_non_temporal_threshold; > /* Threshold to use "rep movsb". */ > unsigned long int rep_movsb_threshold; > /* Threshold to stop using "rep movsb". */ > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > index 637caadb40..88bf08e4f4 100644 > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > @@ -24,9 +24,9 @@ > 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with > 4 VEC stores and store 4 * VEC at a time until done. > 6. 
On machines ERMS feature, if size is range > - [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) > + [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold) > then REP STOSB will be used. > - 7. If size >= __x86_shared_non_temporal_threshold, use a > + 7. If size >= __x86_memset_non_temporal_threshold, use a > non-temporal stores. */ > > #include <sysdep.h> > @@ -318,7 +318,7 @@ L(return_vzeroupper): > /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in > range for 2-byte jump encoding. */ > L(stosb_local): > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > + cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP > jae L(nt_memset) > movzbl %sil, %eax > mov %RDX_LP, %RCX_LP > -- > 2.34.1 > ping
On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The tuning for non-temporal stores for memset vs memcpy is not always > the same. This includes both the exact value and whether non-temporal > stores are profitable at all for a given arch. > > This patch add `x86_memset_non_temporal_threshold`. Currently we > disable non-temporal stores for non Intel vendors as the only > benchmarks showing its benefit have been on Intel hardware. > --- > manual/tunables.texi | 16 +++++++++++++++- > sysdeps/x86/cacheinfo.h | 8 +++++++- > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++ > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++ > sysdeps/x86/dl-tunables.list | 3 +++ > sysdeps/x86/include/cpu-features.h | 4 +++- > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++--- > 7 files changed, 49 insertions(+), 6 deletions(-) > > diff --git a/manual/tunables.texi b/manual/tunables.texi > index baaf751721..8dd02d8149 100644 > --- a/manual/tunables.texi > +++ b/manual/tunables.texi > @@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647) > glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff) > glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff) > glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) > +glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) > glibc.cpu.x86_shstk: > glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff) > glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff) > @@ -495,7 +496,8 @@ thread stack originally backup by Huge Pages to default pages. 
> @cindex shared_cache_size tunables > @cindex tunables, shared_cache_size > @cindex non_temporal_threshold tunables > -@cindex tunables, non_temporal_threshold > +@cindex memset_non_temporal_threshold tunables > +@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold > > @deftp {Tunable namespace} glibc.cpu > Behavior of @theglibc{} can be tuned to assume specific hardware capabilities > @@ -574,6 +576,18 @@ like memmove and memcpy. > This tunable is specific to i386 and x86-64. > @end deftp > > +@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold > +The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows > +the user to set threshold in bytes for non temporal store in > +memset. Non temporal stores give a hint to the hardware to move data > +directly to memory without displacing other data from the cache. This > +tunable is used by some platforms to determine when to use non > +temporal stores memset. > + > +This tunable is specific to i386 and x86-64. > +@end deftp > + > + > @deftp Tunable glibc.cpu.x86_rep_movsb_threshold > The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to > set threshold in bytes to start using "rep movsb". The value must be > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h > index ab73556772..83491607c7 100644 > --- a/sysdeps/x86/cacheinfo.h > +++ b/sysdeps/x86/cacheinfo.h > @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024; > long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; > long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; > > -/* Threshold to use non temporal store. */ > +/* Threshold to use non temporal store in memmove. */ > long int __x86_shared_non_temporal_threshold attribute_hidden; > > +/* Threshold to use non temporal store in memset. */ > +long int __x86_memset_non_temporal_threshold attribute_hidden; > + > /* Threshold to use Enhanced REP MOVSB. 
*/ > long int __x86_rep_movsb_threshold attribute_hidden = 2048; > > @@ -77,6 +80,9 @@ init_cacheinfo (void) > __x86_shared_non_temporal_threshold > = cpu_features->non_temporal_threshold; > > + __x86_memset_non_temporal_threshold > + = cpu_features->memset_non_temporal_threshold; > + > __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; > __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; > __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > index 5a98f70364..d375a7cba6 100644 > --- a/sysdeps/x86/dl-cacheinfo.h > +++ b/sysdeps/x86/dl-cacheinfo.h > @@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > rep_movsb_threshold = 2112; > > + /* Non-temporal stores in memset have only been tested on Intel hardware. > + Until we benchmark data on other x86 processor, disable non-temporal > + stores in memset. */ > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > + if (cpu_features->basic.kind == arch_kind_intel) > + memset_non_temporal_threshold = non_temporal_threshold; > + > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > cases slower than the vectorized path (and for some alignments, > it is really slow, check BZ #30994). 
*/ > @@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > && tunable_size <= maximum_non_temporal_threshold) > non_temporal_threshold = tunable_size; > > + tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); > + if (tunable_size > minimum_non_temporal_threshold > + && tunable_size <= maximum_non_temporal_threshold) > + memset_non_temporal_threshold = tunable_size; > + > tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); > if (tunable_size > minimum_rep_movsb_threshold) > rep_movsb_threshold = tunable_size; > @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > minimum_non_temporal_threshold, > maximum_non_temporal_threshold); > + TUNABLE_SET_WITH_BOUNDS ( > + x86_memset_non_temporal_threshold, memset_non_temporal_threshold, > + minimum_non_temporal_threshold, maximum_non_temporal_threshold); > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > minimum_rep_movsb_threshold, SIZE_MAX); > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > cpu_features->data_cache_size = data; > cpu_features->shared_cache_size = shared; > cpu_features->non_temporal_threshold = non_temporal_threshold; > + cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; > cpu_features->rep_movsb_threshold = rep_movsb_threshold; > cpu_features->rep_stosb_threshold = rep_stosb_threshold; > cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; > diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c > index ceafde9481..49eeb5f70a 100644 > --- a/sysdeps/x86/dl-diagnostics-cpu.c > +++ b/sysdeps/x86/dl-diagnostics-cpu.c > @@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void) > cpu_features->shared_cache_size); > print_cpu_features_value 
("non_temporal_threshold", > cpu_features->non_temporal_threshold); > + print_cpu_features_value ("memset_non_temporal_threshold", > + cpu_features->memset_non_temporal_threshold); > print_cpu_features_value ("rep_movsb_threshold", > cpu_features->rep_movsb_threshold); > print_cpu_features_value ("rep_movsb_stop_threshold", > diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list > index 7d82da0dec..a0a1299592 100644 > --- a/sysdeps/x86/dl-tunables.list > +++ b/sysdeps/x86/dl-tunables.list > @@ -30,6 +30,9 @@ glibc { > x86_non_temporal_threshold { > type: SIZE_T > } > + x86_memset_non_temporal_threshold { > + type: SIZE_T > + } > x86_rep_movsb_threshold { > type: SIZE_T > # Since there is overhead to set up REP MOVSB operation, REP > diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h > index cd7bd27cf3..aaae44f0e1 100644 > --- a/sysdeps/x86/include/cpu-features.h > +++ b/sysdeps/x86/include/cpu-features.h > @@ -944,8 +944,10 @@ struct cpu_features > /* Shared cache size for use in memory and string routines, typically > L2 or L3 size. */ > unsigned long int shared_cache_size; > - /* Threshold to use non temporal store. */ > + /* Threshold to use non temporal store in memmove. */ > unsigned long int non_temporal_threshold; > + /* Threshold to use non temporal store in memset. */ > + unsigned long int memset_non_temporal_threshold; > /* Threshold to use "rep movsb". */ > unsigned long int rep_movsb_threshold; > /* Threshold to stop using "rep movsb". */ > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > index 637caadb40..88bf08e4f4 100644 > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > @@ -24,9 +24,9 @@ > 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with > 4 VEC stores and store 4 * VEC at a time until done. > 6. 
On machines ERMS feature, if size is range > - [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) > + [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold) > then REP STOSB will be used. > - 7. If size >= __x86_shared_non_temporal_threshold, use a > + 7. If size >= __x86_memset_non_temporal_threshold, use a > non-temporal stores. */ > > #include <sysdep.h> > @@ -318,7 +318,7 @@ L(return_vzeroupper): > /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in > range for 2-byte jump encoding. */ > L(stosb_local): > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > + cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP > jae L(nt_memset) > movzbl %sil, %eax > mov %RDX_LP, %RCX_LP > -- > 2.34.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks.
Hi, I'm not subscribed to the glibc list - pls CC me directly on replies. On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote: > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The tuning for non-temporal stores for memset vs memcpy is not always > > the same. This includes both the exact value and whether non-temporal > > stores are profitable at all for a given arch. > > > > This patch add `x86_memset_non_temporal_threshold`. Currently we > > disable non-temporal stores for non Intel vendors as the only > > benchmarks showing its benefit have been on Intel hardware. > > --- > > manual/tunables.texi | 16 +++++++++++++++- > > sysdeps/x86/cacheinfo.h | 8 +++++++- > > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++ > > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++ > > sysdeps/x86/dl-tunables.list | 3 +++ > > sysdeps/x86/include/cpu-features.h | 4 +++- > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++--- > > 7 files changed, 49 insertions(+), 6 deletions(-) ... > > + /* Non-temporal stores in memset have only been tested on Intel hardware. > > + Until we benchmark data on other x86 processor, disable non-temporal > > + stores in memset. */ Well, something's fishy here: $ ./elf/ld.so --list-tunables | grep threshold glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff) glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff) glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff) glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) ^^^^^^^^^ on glibc-2.39.9000-300-g54c1efdac55b from git. That's on a AMD Zen1 so I'd expect that memset NT threshold to be 0xffffffffffffffff by default... Thx.
On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote: > > Hi, > > I'm not subscribed to the glibc list - pls CC me directly on replies. > > On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote: > > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > The tuning for non-temporal stores for memset vs memcpy is not always > > > the same. This includes both the exact value and whether non-temporal > > > stores are profitable at all for a given arch. > > > > > > This patch add `x86_memset_non_temporal_threshold`. Currently we > > > disable non-temporal stores for non Intel vendors as the only > > > benchmarks showing its benefit have been on Intel hardware. > > > --- > > > manual/tunables.texi | 16 +++++++++++++++- > > > sysdeps/x86/cacheinfo.h | 8 +++++++- > > > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++ > > > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++ > > > sysdeps/x86/dl-tunables.list | 3 +++ > > > sysdeps/x86/include/cpu-features.h | 4 +++- > > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++--- > > > 7 files changed, 49 insertions(+), 6 deletions(-) > > ... > > > > + /* Non-temporal stores in memset have only been tested on Intel hardware. > > > + Until we benchmark data on other x86 processor, disable non-temporal > > > + stores in memset. */ > > Well, something's fishy here: > > $ ./elf/ld.so --list-tunables | grep threshold > glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff) > glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff) > glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff) > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > ^^^^^^^^^ > > on glibc-2.39.9000-300-g54c1efdac55b from git.
> > That's on a AMD Zen1 so I'd expect that memset NT threshold to be > 0xffffffffffffffff by default... > > Thx. > Thanks for bringing this up, looking into it. > -- > Regards/Gruss, > Boris. > > https://people.kernel.org/tglx/notes-about-netiquette
On Fri, Jun 14, 2024 at 11:39:07AM -0500, Noah Goldstein wrote: > On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote: > > > > Hi, > > > > I'm not subscribed to the glibc list - pls CC me directly on replies. > > > > On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote: > > > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > The tuning for non-temporal stores for memset vs memcpy is not always > > > > the same. This includes both the exact value and whether non-temporal > > > > stores are profitable at all for a given arch. > > > > > > > > This patch add `x86_memset_non_temporal_threshold`. Currently we > > > > disable non-temporal stores for non Intel vendors as the only > > > > benchmarks showing its benefit have been on Intel hardware. > > > > --- > > > > manual/tunables.texi | 16 +++++++++++++++- > > > > sysdeps/x86/cacheinfo.h | 8 +++++++- > > > > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++ > > > > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++ > > > > sysdeps/x86/dl-tunables.list | 3 +++ > > > > sysdeps/x86/include/cpu-features.h | 4 +++- > > > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++--- > > > > 7 files changed, 49 insertions(+), 6 deletions(-) > > > > ... > > > > > > + /* Non-temporal stores in memset have only been tested on Intel hardware. > > > > + Until we benchmark data on other x86 processor, disable non-temporal > > > > + stores in memset.
*/ > > > > Well, something's fishy here: > > > > $ ./elf/ld.so --list-tunables | grep threshold > > glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > > glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff) > > glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff) > > glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > > glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff) > > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > > ^^^^^^^^^ > > > > on glibc-2.39.9000-300-g54c1efdac55b from git. > > > > That's on a AMD Zen1 so I'd expect that memset NT threshold to be > > 0xffffffffffffffff by default... > > > > Thx. > > > > Thanks for bringing this up, looking into it. Thx, so Michael did debug it yesterday to the ranges mismatching: diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c index 147cc4cf23f5..ecf3c1d3736e 100644 --- a/elf/dl-tunables.c +++ b/elf/dl-tunables.c @@ -110,8 +110,11 @@ do_tunable_update_val (tunable_t *cur, const tunable_val_t *valp, /* Bail out if the bounds are not valid. 
*/ if (tunable_val_lt (val, min, unsigned_cmp) - || tunable_val_lt (max, val, unsigned_cmp)) + || tunable_val_lt (max, val, unsigned_cmp)) { + _dl_printf("bail out due to: 0x%lx, min: 0x%lx, max: 0x%lx\n", + val, min, max); return; + } cur->val.numval = val; cur->type.min = min; $ ./elf/ld.so --list-tunables | grep -E "(threshold|bail)" dl_init_cacheinfo: memset_non_temporal_threshold: 0xffffffffffffffff dl_init_cacheinfo: memset_non_temporal_threshold, tunable_size: 0xffffffffffffffff bail out due to: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff ^^^^^^^ dl_init_cacheinfo: memset_non_temporal_threshold, tunable set: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) but you guys probably should do the right fix here. Thx.
On Fri, Jun 14, 2024 at 1:01 PM Borislav Petkov <bp@alien8.de> wrote: > > On Fri, Jun 14, 2024 at 11:39:07AM -0500, Noah Goldstein wrote: > > On Fri, Jun 14, 2024 at 5:41 AM Borislav Petkov <bp@alien8.de> wrote: > > > > > > Hi, > > > > > > I'm not subscribed to the glibc list - pls CC me directly on replies. > > > > > > On Wed, May 29, 2024 at 03:53:20PM -0700, H.J. Lu wrote: > > > > On Fri, May 24, 2024 at 10:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > The tuning for non-temporal stores for memset vs memcpy is not always > > > > > the same. This includes both the exact value and whether non-temporal > > > > > stores are profitable at all for a given arch. > > > > > > > > > > This patch add `x86_memset_non_temporal_threshold`. Currently we > > > > > disable non-temporal stores for non Intel vendors as the only > > > > > benchmarks showing its benefit have been on Intel hardware. > > > > > --- > > > > > manual/tunables.texi | 16 +++++++++++++++- > > > > > sysdeps/x86/cacheinfo.h | 8 +++++++- > > > > > sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++ > > > > > sysdeps/x86/dl-diagnostics-cpu.c | 2 ++ > > > > > sysdeps/x86/dl-tunables.list | 3 +++ > > > > > sysdeps/x86/include/cpu-features.h | 4 +++- > > > > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++--- > > > > > 7 files changed, 49 insertions(+), 6 deletions(-) > > > > > > ... > > > > > > > > + /* Non-temporal stores in memset have only been tested on Intel hardware. > > > > > + Until we benchmark data on other x86 processor, disable non-temporal > > > > > + stores in memset.
*/ > > > > > > Well, something's fishy here: > > > > > > $ ./elf/ld.so --list-tunables | grep threshold > > > glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > > > glibc.cpu.x86_rep_movsb_threshold: 0x600000 (min: 0x100, max: 0xffffffffffffffff) > > > glibc.cpu.x86_non_temporal_threshold: 0x600000 (min: 0x4040, max: 0xfffffffffffffff) > > > glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > > > glibc.cpu.x86_rep_stosb_threshold: 0xffffffffffffffff (min: 0x1, max: 0xffffffffffffffff) > > > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > > > ^^^^^^^^^ > > > > > > on glibc-2.39.9000-300-g54c1efdac55b from git. > > > > > > That's on a AMD Zen1 so I'd expect that memset NT threshold to be > > > 0xffffffffffffffff by default... > > > > > > Thx. > > > > > > > Thanks for bringing this up, looking into it. > > Thx, so Michael did debug it yesterday to the ranges mismatching: > > diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c > index 147cc4cf23f5..ecf3c1d3736e 100644 > --- a/elf/dl-tunables.c > +++ b/elf/dl-tunables.c > @@ -110,8 +110,11 @@ do_tunable_update_val (tunable_t *cur, const tunable_val_t *valp, > > /* Bail out if the bounds are not valid. 
*/ > if (tunable_val_lt (val, min, unsigned_cmp) > - || tunable_val_lt (max, val, unsigned_cmp)) > + || tunable_val_lt (max, val, unsigned_cmp)) { > + _dl_printf("bail out due to: 0x%lx, min: 0x%lx, max: 0x%lx\n", > + val, min, max); > return; > + } > > cur->val.numval = val; > cur->type.min = min; > > $ ./elf/ld.so --list-tunables | grep -E "(threshold|bail)" > dl_init_cacheinfo: memset_non_temporal_threshold: 0xffffffffffffffff > dl_init_cacheinfo: memset_non_temporal_threshold, tunable_size: 0xffffffffffffffff > bail out due to: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff > ^^^^^^^ > > dl_init_cacheinfo: memset_non_temporal_threshold, tunable set: 0xffffffffffffffff, min: 0x4040, max: 0xfffffffffffffff > glibc.cpu.x86_memset_non_temporal_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff) > > but you guys probably should do the right fix here. Just posted the fix, you should be CCd on it. > > Thx. > > -- > Regards/Gruss, > Boris. > > https://people.kernel.org/tglx/notes-about-netiquette
diff --git a/manual/tunables.texi b/manual/tunables.texi index baaf751721..8dd02d8149 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647) glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff) glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff) glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) +glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) glibc.cpu.x86_shstk: glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff) glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff) @@ -495,7 +496,8 @@ thread stack originally backup by Huge Pages to default pages. @cindex shared_cache_size tunables @cindex tunables, shared_cache_size @cindex non_temporal_threshold tunables -@cindex tunables, non_temporal_threshold +@cindex memset_non_temporal_threshold tunables +@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold @deftp {Tunable namespace} glibc.cpu Behavior of @theglibc{} can be tuned to assume specific hardware capabilities @@ -574,6 +576,18 @@ like memmove and memcpy. This tunable is specific to i386 and x86-64. @end deftp +@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold +The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows +the user to set threshold in bytes for non temporal store in +memset. Non temporal stores give a hint to the hardware to move data +directly to memory without displacing other data from the cache. This +tunable is used by some platforms to determine when to use non +temporal stores memset. + +This tunable is specific to i386 and x86-64. +@end deftp + + @deftp Tunable glibc.cpu.x86_rep_movsb_threshold The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to set threshold in bytes to start using "rep movsb". 
The value must be diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h index ab73556772..83491607c7 100644 --- a/sysdeps/x86/cacheinfo.h +++ b/sysdeps/x86/cacheinfo.h @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024; long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; -/* Threshold to use non temporal store. */ +/* Threshold to use non temporal store in memmove. */ long int __x86_shared_non_temporal_threshold attribute_hidden; +/* Threshold to use non temporal store in memset. */ +long int __x86_memset_non_temporal_threshold attribute_hidden; + /* Threshold to use Enhanced REP MOVSB. */ long int __x86_rep_movsb_threshold attribute_hidden = 2048; @@ -77,6 +80,9 @@ init_cacheinfo (void) __x86_shared_non_temporal_threshold = cpu_features->non_temporal_threshold; + __x86_memset_non_temporal_threshold + = cpu_features->memset_non_temporal_threshold; + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index 5a98f70364..d375a7cba6 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) rep_movsb_threshold = 2112; + /* Non-temporal stores in memset have only been tested on Intel hardware. + Until we benchmark data on other x86 processor, disable non-temporal + stores in memset. 
*/ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; + if (cpu_features->basic.kind == arch_kind_intel) + memset_non_temporal_threshold = non_temporal_threshold; + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of cases slower than the vectorized path (and for some alignments, it is really slow, check BZ #30994). */ @@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) && tunable_size <= maximum_non_temporal_threshold) non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); + if (tunable_size > minimum_non_temporal_threshold + && tunable_size <= maximum_non_temporal_threshold) + memset_non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); if (tunable_size > minimum_rep_movsb_threshold) rep_movsb_threshold = tunable_size; @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, minimum_non_temporal_threshold, maximum_non_temporal_threshold); + TUNABLE_SET_WITH_BOUNDS ( + x86_memset_non_temporal_threshold, memset_non_temporal_threshold, + minimum_non_temporal_threshold, maximum_non_temporal_threshold); TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, minimum_rep_movsb_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) cpu_features->data_cache_size = data; cpu_features->shared_cache_size = shared; cpu_features->non_temporal_threshold = non_temporal_threshold; + cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; cpu_features->rep_movsb_threshold = rep_movsb_threshold; cpu_features->rep_stosb_threshold = rep_stosb_threshold; cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; diff --git a/sysdeps/x86/dl-diagnostics-cpu.c 
b/sysdeps/x86/dl-diagnostics-cpu.c index ceafde9481..49eeb5f70a 100644 --- a/sysdeps/x86/dl-diagnostics-cpu.c +++ b/sysdeps/x86/dl-diagnostics-cpu.c @@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void) cpu_features->shared_cache_size); print_cpu_features_value ("non_temporal_threshold", cpu_features->non_temporal_threshold); + print_cpu_features_value ("memset_non_temporal_threshold", + cpu_features->memset_non_temporal_threshold); print_cpu_features_value ("rep_movsb_threshold", cpu_features->rep_movsb_threshold); print_cpu_features_value ("rep_movsb_stop_threshold", diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list index 7d82da0dec..a0a1299592 100644 --- a/sysdeps/x86/dl-tunables.list +++ b/sysdeps/x86/dl-tunables.list @@ -30,6 +30,9 @@ glibc { x86_non_temporal_threshold { type: SIZE_T } + x86_memset_non_temporal_threshold { + type: SIZE_T + } x86_rep_movsb_threshold { type: SIZE_T # Since there is overhead to set up REP MOVSB operation, REP diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h index cd7bd27cf3..aaae44f0e1 100644 --- a/sysdeps/x86/include/cpu-features.h +++ b/sysdeps/x86/include/cpu-features.h @@ -944,8 +944,10 @@ struct cpu_features /* Shared cache size for use in memory and string routines, typically L2 or L3 size. */ unsigned long int shared_cache_size; - /* Threshold to use non temporal store. */ + /* Threshold to use non temporal store in memmove. */ unsigned long int non_temporal_threshold; + /* Threshold to use non temporal store in memset. */ + unsigned long int memset_non_temporal_threshold; /* Threshold to use "rep movsb". */ unsigned long int rep_movsb_threshold; /* Threshold to stop using "rep movsb". 
*/ diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 637caadb40..88bf08e4f4 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -24,9 +24,9 @@ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with 4 VEC stores and store 4 * VEC at a time until done. 6. On machines ERMS feature, if size is range - [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) + [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold) then REP STOSB will be used. - 7. If size >= __x86_shared_non_temporal_threshold, use a + 7. If size >= __x86_memset_non_temporal_threshold, use a non-temporal stores. */ #include <sysdep.h> @@ -318,7 +318,7 @@ L(return_vzeroupper): /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in range for 2-byte jump encoding. */ L(stosb_local): - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP jae L(nt_memset) movzbl %sil, %eax mov %RDX_LP, %RCX_LP