Message ID | 20240723063821.3460385-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | x86: Increase default `rep stosb` threshold for SKX [BZ #32009] | expand |
On Mon, Jul 22, 2024 at 11:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Benchmarks indicate that `2048` (prior value) is far too low a > threshold for using `rep stosb`. Rather something around `1048576` is > preferable: > > https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing > > (See: SKX-1/SKX-1.1/SKX-2/SKX-2.2) > > The `1048576` theshold was tested on multiple SKX machines and > suprisingly seemed to hold regardless of cache hierarchy. > > Also note that in highly parallel settings, a smaller value is > preferable, but this difference does seem not so extreme to justify a > worse threshold with low/moderate parallelism. > > Tested new threshold using qemu on all x86 systems mutually supported > by GLIBC and qemu. > --- > sysdeps/x86/cpu-features.c | 11 +++++++++++ > sysdeps/x86/dl-cacheinfo.h | 21 +++++++++++---------- > sysdeps/x86/dl-diagnostics-cpu.c | 6 ++++-- > sysdeps/x86/include/cpu-features.h | 2 ++ > 4 files changed, 28 insertions(+), 12 deletions(-) > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c > index c096dd390a..c5429422c0 100644 > --- a/sysdeps/x86/cpu-features.c > +++ b/sysdeps/x86/cpu-features.c > @@ -757,6 +757,7 @@ init_cpu_features (struct cpu_features *cpu_features) > enum cpu_features_kind kind; > > cpu_features->cachesize_non_temporal_divisor = 4; > + cpu_features->machine_rep_stosb_threshold = 0; > #if !HAS_CPUID > if (__get_cpuid_max (0, 0) == 0) > { > @@ -879,6 +880,16 @@ init_cpu_features (struct cpu_features *cpu_features) > non-temporal on all Skylake servers. */ > cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] > |= bit_arch_Avoid_Non_Temporal_Memset; > + /* SKX prefers temporal stores for a while. Confusingly, tests > + across multiple SKX systems with different cache sizes all > + indicate the threshold for when `rep stosb` becomes preferable > + is 1048576 as opposed to some function of the size of the > + various components of the cache hierarchy. Worth noting, > + although not taken into account here, is that `rep stosb` is > + more preferable with higher degrees of parallelism, i.e if all > + cores are simultaneously setting memory a lower threshold > + would be preferable. */ > + cpu_features->machine_rep_stosb_threshold = 1048576; > case INTEL_BIGCORE_COMETLAKE: > case INTEL_BIGCORE_SKYLAKE: > case INTEL_BIGCORE_KABYLAKE: > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > index a1c03b8903..c646b57508 100644 > --- a/sysdeps/x86/dl-cacheinfo.h > +++ b/sysdeps/x86/dl-cacheinfo.h > @@ -1002,9 +1002,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > if (cpu_features->basic.kind == arch_kind_amd) > rep_movsb_threshold = non_temporal_threshold; > > - /* The default threshold to use Enhanced REP STOSB. */ > - unsigned long int rep_stosb_threshold = 2048; > - > long int tunable_size; > > tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL); > @@ -1034,13 +1031,17 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > /* NB: The default value of the x86_rep_stosb_threshold tunable is the > same as the default value of __x86_rep_stosb_threshold and the > minimum value is fixed. */ > - rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, > - long int, NULL); > - if (cpu_features->basic.kind == arch_kind_amd > - && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) > - /* For AMD Zen3+ architecture, the performance of the vectorized loop is > - slightly better than ERMS. */ > - rep_stosb_threshold = SIZE_MAX; > + unsigned long int rep_stosb_threshold > + = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL); > + if (!TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) > + { > + if (cpu_features->machine_rep_stosb_threshold != 0) > + rep_stosb_threshold = cpu_features->machine_rep_stosb_threshold; > + /* For AMD Zen3+ architecture, the performance of the vectorized loop > + is slightly better than ERMS. */ > + else if (cpu_features->basic.kind == arch_kind_amd) > + rep_stosb_threshold = SIZE_MAX; > + } > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c > index 49eeb5f70a..b84903a294 100644 > --- a/sysdeps/x86/dl-diagnostics-cpu.c > +++ b/sysdeps/x86/dl-diagnostics-cpu.c > @@ -128,9 +128,11 @@ _dl_diagnostics_cpu (void) > cpu_features->level4_cache_size); > print_cpu_features_value ("cachesize_non_temporal_divisor", > cpu_features->cachesize_non_temporal_divisor); > + print_cpu_features_value ("machine_rep_stosb_threshold", > + cpu_features->machine_rep_stosb_threshold); > _Static_assert ( > - offsetof (struct cpu_features, cachesize_non_temporal_divisor) > - + sizeof (cpu_features->cachesize_non_temporal_divisor) > + offsetof (struct cpu_features, machine_rep_stosb_threshold) > + + sizeof (cpu_features->machine_rep_stosb_threshold) > == sizeof (*cpu_features), > "last cpu_features field has been printed"); > > diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h > index aaae44f0e1..5f2b5b93ac 100644 > --- a/sysdeps/x86/include/cpu-features.h > +++ b/sysdeps/x86/include/cpu-features.h > @@ -981,6 +981,8 @@ struct cpu_features > /* When no user non_temporal_threshold is specified. We default to > cachesize / cachesize_non_temporal_divisor. */ > unsigned long int cachesize_non_temporal_divisor; > + /* Default rep stosb threshold (if 0, use default in dl-machine.h). */ > + unsigned long int machine_rep_stosb_threshold; > }; > > /* Get a pointer to the CPU features structure. */ > -- > 2.34.1 > Please rebase against master. Thanks.
On Mon, Aug 19, 2024 at 6:12 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Mon, Jul 22, 2024 at 11:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > Benchmarks indicate that `2048` (prior value) is far too low a > > threshold for using `rep stosb`. Rather something around `1048576` is > > preferable: > > > > https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing > > > > (See: SKX-1/SKX-1.1/SKX-2/SKX-2.2) > > > > The `1048576` theshold was tested on multiple SKX machines and > > suprisingly seemed to hold regardless of cache hierarchy. > > > > Also note that in highly parallel settings, a smaller value is > > preferable, but this difference does seem not so extreme to justify a > > worse threshold with low/moderate parallelism. > > > > Tested new threshold using qemu on all x86 systems mutually supported > > by GLIBC and qemu. > > --- > > sysdeps/x86/cpu-features.c | 11 +++++++++++ > > sysdeps/x86/dl-cacheinfo.h | 21 +++++++++++---------- > > sysdeps/x86/dl-diagnostics-cpu.c | 6 ++++-- > > sysdeps/x86/include/cpu-features.h | 2 ++ > > 4 files changed, 28 insertions(+), 12 deletions(-) > > > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c > > index c096dd390a..c5429422c0 100644 > > --- a/sysdeps/x86/cpu-features.c > > +++ b/sysdeps/x86/cpu-features.c > > @@ -757,6 +757,7 @@ init_cpu_features (struct cpu_features *cpu_features) > > enum cpu_features_kind kind; > > > > cpu_features->cachesize_non_temporal_divisor = 4; > > + cpu_features->machine_rep_stosb_threshold = 0; > > #if !HAS_CPUID > > if (__get_cpuid_max (0, 0) == 0) > > { > > @@ -879,6 +880,16 @@ init_cpu_features (struct cpu_features *cpu_features) > > non-temporal on all Skylake servers. */ > > cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] > > |= bit_arch_Avoid_Non_Temporal_Memset; > > + /* SKX prefers temporal stores for a while. Confusingly, tests > > + across multiple SKX systems with different cache sizes all > > + indicate the threshold for when `rep stosb` becomes preferable > > + is 1048576 as opposed to some function of the size of the > > + various components of the cache hierarchy. Worth noting, > > + although not taken into account here, is that `rep stosb` is > > + more preferable with higher degrees of parallelism, i.e if all > > + cores are simultaneously setting memory a lower threshold > > + would be preferable. */ > > + cpu_features->machine_rep_stosb_threshold = 1048576; > > case INTEL_BIGCORE_COMETLAKE: > > case INTEL_BIGCORE_SKYLAKE: > > case INTEL_BIGCORE_KABYLAKE: > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > > index a1c03b8903..c646b57508 100644 > > --- a/sysdeps/x86/dl-cacheinfo.h > > +++ b/sysdeps/x86/dl-cacheinfo.h > > @@ -1002,9 +1002,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > if (cpu_features->basic.kind == arch_kind_amd) > > rep_movsb_threshold = non_temporal_threshold; > > > > - /* The default threshold to use Enhanced REP STOSB. */ > > - unsigned long int rep_stosb_threshold = 2048; > > - > > long int tunable_size; > > > > tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL); > > @@ -1034,13 +1031,17 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > /* NB: The default value of the x86_rep_stosb_threshold tunable is the > > same as the default value of __x86_rep_stosb_threshold and the > > minimum value is fixed. */ > > - rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, > > - long int, NULL); > > - if (cpu_features->basic.kind == arch_kind_amd > > - && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) > > - /* For AMD Zen3+ architecture, the performance of the vectorized loop is > > - slightly better than ERMS. */ > > - rep_stosb_threshold = SIZE_MAX; > > + unsigned long int rep_stosb_threshold > > + = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL); > > + if (!TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) > > + { > > + if (cpu_features->machine_rep_stosb_threshold != 0) > > + rep_stosb_threshold = cpu_features->machine_rep_stosb_threshold; > > + /* For AMD Zen3+ architecture, the performance of the vectorized loop > > + is slightly better than ERMS. */ > > + else if (cpu_features->basic.kind == arch_kind_amd) > > + rep_stosb_threshold = SIZE_MAX; > > + } > > > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > > diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c > > index 49eeb5f70a..b84903a294 100644 > > --- a/sysdeps/x86/dl-diagnostics-cpu.c > > +++ b/sysdeps/x86/dl-diagnostics-cpu.c > > @@ -128,9 +128,11 @@ _dl_diagnostics_cpu (void) > > cpu_features->level4_cache_size); > > print_cpu_features_value ("cachesize_non_temporal_divisor", > > cpu_features->cachesize_non_temporal_divisor); > > + print_cpu_features_value ("machine_rep_stosb_threshold", > > + cpu_features->machine_rep_stosb_threshold); > > _Static_assert ( > > - offsetof (struct cpu_features, cachesize_non_temporal_divisor) > > - + sizeof (cpu_features->cachesize_non_temporal_divisor) > > + offsetof (struct cpu_features, machine_rep_stosb_threshold) > > + + sizeof (cpu_features->machine_rep_stosb_threshold) > > == sizeof (*cpu_features), > > "last cpu_features field has been printed"); > > > > diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h > > index aaae44f0e1..5f2b5b93ac 100644 > > --- a/sysdeps/x86/include/cpu-features.h > > +++ b/sysdeps/x86/include/cpu-features.h > > @@ -981,6 +981,8 @@ struct cpu_features > > /* When no user non_temporal_threshold is specified. We default to > > cachesize / cachesize_non_temporal_divisor. */ > > unsigned long int cachesize_non_temporal_divisor; > > + /* Default rep stosb threshold (if 0, use default in dl-machine.h). */ > > + unsigned long int machine_rep_stosb_threshold; > > }; > > > > /* Get a pointer to the CPU features structure. */ > > -- > > 2.34.1 > > > > Please rebase against master. > > Thanks. This is on hold at the moment. DJ and I are still trying to figure out the best fix. > > -- > H.J.
On Mon, Aug 19, 2024, 9:29 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Mon, Aug 19, 2024 at 6:12 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Mon, Jul 22, 2024 at 11:38 PM Noah Goldstein <goldstein.w.n@gmail.com> > wrote: > > > > > > Benchmarks indicate that `2048` (prior value) is far too low a > > > threshold for using `rep stosb`. Rather something around `1048576` is > > > preferable: > > > > > > > https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing > > > > > > (See: SKX-1/SKX-1.1/SKX-2/SKX-2.2) > > > > > > The `1048576` theshold was tested on multiple SKX machines and > > > suprisingly seemed to hold regardless of cache hierarchy. > > > > > > Also note that in highly parallel settings, a smaller value is > > > preferable, but this difference does seem not so extreme to justify a > > > worse threshold with low/moderate parallelism. > > > > > > Tested new threshold using qemu on all x86 systems mutually supported > > > by GLIBC and qemu. > > > --- > > > sysdeps/x86/cpu-features.c | 11 +++++++++++ > > > sysdeps/x86/dl-cacheinfo.h | 21 +++++++++++---------- > > > sysdeps/x86/dl-diagnostics-cpu.c | 6 ++++-- > > > sysdeps/x86/include/cpu-features.h | 2 ++ > > > 4 files changed, 28 insertions(+), 12 deletions(-) > > > > > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c > > > index c096dd390a..c5429422c0 100644 > > > --- a/sysdeps/x86/cpu-features.c > > > +++ b/sysdeps/x86/cpu-features.c > > > @@ -757,6 +757,7 @@ init_cpu_features (struct cpu_features > *cpu_features) > > > enum cpu_features_kind kind; > > > > > > cpu_features->cachesize_non_temporal_divisor = 4; > > > + cpu_features->machine_rep_stosb_threshold = 0; > > > #if !HAS_CPUID > > > if (__get_cpuid_max (0, 0) == 0) > > > { > > > @@ -879,6 +880,16 @@ init_cpu_features (struct cpu_features > *cpu_features) > > > non-temporal on all Skylake servers. */ > > > > cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] > > > |= bit_arch_Avoid_Non_Temporal_Memset; > > > + /* SKX prefers temporal stores for a while. Confusingly, > tests > > > + across multiple SKX systems with different cache > sizes all > > > + indicate the threshold for when `rep stosb` becomes > preferable > > > + is 1048576 as opposed to some function of the size of > the > > > + various components of the cache hierarchy. Worth > noting, > > > + although not taken into account here, is that `rep > stosb` is > > > + more preferable with higher degrees of parallelism, > i.e if all > > > + cores are simultaneously setting memory a lower > threshold > > > + would be preferable. */ > > > + cpu_features->machine_rep_stosb_threshold = 1048576; > > > case INTEL_BIGCORE_COMETLAKE: > > > case INTEL_BIGCORE_SKYLAKE: > > > case INTEL_BIGCORE_KABYLAKE: > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > > > index a1c03b8903..c646b57508 100644 > > > --- a/sysdeps/x86/dl-cacheinfo.h > > > +++ b/sysdeps/x86/dl-cacheinfo.h > > > @@ -1002,9 +1002,6 @@ dl_init_cacheinfo (struct cpu_features > *cpu_features) > > > if (cpu_features->basic.kind == arch_kind_amd) > > > rep_movsb_threshold = non_temporal_threshold; > > > > > > - /* The default threshold to use Enhanced REP STOSB. */ > > > - unsigned long int rep_stosb_threshold = 2048; > > > - > > > long int tunable_size; > > > > > > tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL); > > > @@ -1034,13 +1031,17 @@ dl_init_cacheinfo (struct cpu_features > *cpu_features) > > > /* NB: The default value of the x86_rep_stosb_threshold tunable is > the > > > same as the default value of __x86_rep_stosb_threshold and the > > > minimum value is fixed. */ > > > - rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, > > > - long int, NULL); > > > - if (cpu_features->basic.kind == arch_kind_amd > > > - && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) > > > - /* For AMD Zen3+ architecture, the performance of the vectorized > loop is > > > - slightly better than ERMS. */ > > > - rep_stosb_threshold = SIZE_MAX; > > > + unsigned long int rep_stosb_threshold > > > + = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL); > > > + if (!TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) > > > + { > > > + if (cpu_features->machine_rep_stosb_threshold != 0) > > > + rep_stosb_threshold = > cpu_features->machine_rep_stosb_threshold; > > > + /* For AMD Zen3+ architecture, the performance of the > vectorized loop > > > + is slightly better than ERMS. */ > > > + else if (cpu_features->basic.kind == arch_kind_amd) > > > + rep_stosb_threshold = SIZE_MAX; > > > + } > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, > SIZE_MAX); > > > diff --git a/sysdeps/x86/dl-diagnostics-cpu.c > b/sysdeps/x86/dl-diagnostics-cpu.c > > > index 49eeb5f70a..b84903a294 100644 > > > --- a/sysdeps/x86/dl-diagnostics-cpu.c > > > +++ b/sysdeps/x86/dl-diagnostics-cpu.c > > > @@ -128,9 +128,11 @@ _dl_diagnostics_cpu (void) > > > cpu_features->level4_cache_size); > > > print_cpu_features_value ("cachesize_non_temporal_divisor", > > > > cpu_features->cachesize_non_temporal_divisor); > > > + print_cpu_features_value ("machine_rep_stosb_threshold", > > > + cpu_features->machine_rep_stosb_threshold); > > > _Static_assert ( > > > - offsetof (struct cpu_features, cachesize_non_temporal_divisor) > > > - + sizeof (cpu_features->cachesize_non_temporal_divisor) > > > + offsetof (struct cpu_features, machine_rep_stosb_threshold) > > > + + sizeof (cpu_features->machine_rep_stosb_threshold) > > > == sizeof (*cpu_features), > > > "last cpu_features field has been printed"); > > > > > > diff --git a/sysdeps/x86/include/cpu-features.h > b/sysdeps/x86/include/cpu-features.h > > > index aaae44f0e1..5f2b5b93ac 100644 > > > --- a/sysdeps/x86/include/cpu-features.h > > > +++ b/sysdeps/x86/include/cpu-features.h > > > @@ -981,6 +981,8 @@ struct cpu_features > > > /* When no user non_temporal_threshold is specified. We default to > > > cachesize / cachesize_non_temporal_divisor. */ > > > unsigned long int cachesize_non_temporal_divisor; > > > + /* Default rep stosb threshold (if 0, use default in > dl-machine.h). */ > > > + unsigned long int machine_rep_stosb_threshold; > > > }; > > > > > > /* Get a pointer to the CPU features structure. */ > > > -- > > > 2.34.1 > > > > > > > Please rebase against master. > > > > Thanks. > > This is on hold at the moment. DJ and I are still trying to figure out > the best fix. > OK. > > > -- > > H.J. > >
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index c096dd390a..c5429422c0 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -757,6 +757,7 @@ init_cpu_features (struct cpu_features *cpu_features) enum cpu_features_kind kind; cpu_features->cachesize_non_temporal_divisor = 4; + cpu_features->machine_rep_stosb_threshold = 0; #if !HAS_CPUID if (__get_cpuid_max (0, 0) == 0) { @@ -879,6 +880,16 @@ init_cpu_features (struct cpu_features *cpu_features) non-temporal on all Skylake servers. */ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] |= bit_arch_Avoid_Non_Temporal_Memset; + /* SKX prefers temporal stores for a while. Confusingly, tests + across multiple SKX systems with different cache sizes all + indicate the threshold for when `rep stosb` becomes preferable + is 1048576 as opposed to some function of the size of the + various components of the cache hierarchy. Worth noting, + although not taken into account here, is that `rep stosb` is + more preferable with higher degrees of parallelism, i.e if all + cores are simultaneously setting memory a lower threshold + would be preferable. */ + cpu_features->machine_rep_stosb_threshold = 1048576; case INTEL_BIGCORE_COMETLAKE: case INTEL_BIGCORE_SKYLAKE: case INTEL_BIGCORE_KABYLAKE: diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index a1c03b8903..c646b57508 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -1002,9 +1002,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (cpu_features->basic.kind == arch_kind_amd) rep_movsb_threshold = non_temporal_threshold; - /* The default threshold to use Enhanced REP STOSB. */ - unsigned long int rep_stosb_threshold = 2048; - long int tunable_size; tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL); @@ -1034,13 +1031,17 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) /* NB: The default value of the x86_rep_stosb_threshold tunable is the same as the default value of __x86_rep_stosb_threshold and the minimum value is fixed. */ - rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, - long int, NULL); - if (cpu_features->basic.kind == arch_kind_amd - && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) - /* For AMD Zen3+ architecture, the performance of the vectorized loop is - slightly better than ERMS. */ - rep_stosb_threshold = SIZE_MAX; + unsigned long int rep_stosb_threshold + = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL); + if (!TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) + { + if (cpu_features->machine_rep_stosb_threshold != 0) + rep_stosb_threshold = cpu_features->machine_rep_stosb_threshold; + /* For AMD Zen3+ architecture, the performance of the vectorized loop + is slightly better than ERMS. */ + else if (cpu_features->basic.kind == arch_kind_amd) + rep_stosb_threshold = SIZE_MAX; + } TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c index 49eeb5f70a..b84903a294 100644 --- a/sysdeps/x86/dl-diagnostics-cpu.c +++ b/sysdeps/x86/dl-diagnostics-cpu.c @@ -128,9 +128,11 @@ _dl_diagnostics_cpu (void) cpu_features->level4_cache_size); print_cpu_features_value ("cachesize_non_temporal_divisor", cpu_features->cachesize_non_temporal_divisor); + print_cpu_features_value ("machine_rep_stosb_threshold", + cpu_features->machine_rep_stosb_threshold); _Static_assert ( - offsetof (struct cpu_features, cachesize_non_temporal_divisor) - + sizeof (cpu_features->cachesize_non_temporal_divisor) + offsetof (struct cpu_features, machine_rep_stosb_threshold) + + sizeof (cpu_features->machine_rep_stosb_threshold) == sizeof (*cpu_features), "last cpu_features field has been printed"); diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h index aaae44f0e1..5f2b5b93ac 100644 --- a/sysdeps/x86/include/cpu-features.h +++ b/sysdeps/x86/include/cpu-features.h @@ -981,6 +981,8 @@ struct cpu_features /* When no user non_temporal_threshold is specified. We default to cachesize / cachesize_non_temporal_divisor. */ unsigned long int cachesize_non_temporal_divisor; + /* Default rep stosb threshold (if 0, use default in dl-machine.h). */ + unsigned long int machine_rep_stosb_threshold; }; /* Get a pointer to the CPU features structure. */