Message ID | 20240814063731.3014055-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v3,1/2] x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path |
On Tue, Aug 13, 2024 at 11:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This is just a refactor and there should be no behavioral change from
> this commit.
>
> The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
> for controlling whether we use non-temporal memset rather than having
> extra logic based on vendor.
> ---
>  sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
>  sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
>  2 files changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 18ed008040..a4786d23c7 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -756,6 +756,12 @@ init_cpu_features (struct cpu_features *cpu_features)
>    unsigned int stepping = 0;
>    enum cpu_features_kind kind;
>
> +  /* Default is to avoid non-temporal memset on non-Intel/AMD hardware.  As
> +     of writing this, we only have benchmarks indicating its profitability
> +     on Intel/AMD.  */
> +  cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +      |= bit_arch_Avoid_Non_Temporal_Memset;
> +
>    cpu_features->cachesize_non_temporal_divisor = 4;
>  #if !HAS_CPUID
>    if (__get_cpuid_max (0, 0) == 0)
> @@ -781,6 +787,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>
>        update_active (cpu_features);
>
> +      /* Benchmarks indicate non-temporal memset can be profitable on Intel
> +         hardware.  */
> +      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +          &= ~bit_arch_Avoid_Non_Temporal_Memset;
> +
>        if (family == 0x06)
>          {
>            model += extended_model;
> @@ -992,6 +1003,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
>
>        ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
>
> +      /* Benchmarks indicate non-temporal memset can be profitable on AMD
> +         hardware.  */
> +      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +          &= ~bit_arch_Avoid_Non_Temporal_Memset;
> +
>        if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
>          {
>            /* Since the FMA4 bit is in CPUID_INDEX_80000001 and
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index a1c03b8903..3d0c8d43b8 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
>      rep_movsb_threshold = 2112;
>
> -  /* Non-temporal stores are more performant on Intel and AMD hardware above
> -     non_temporal_threshold. Enable this for both Intel and AMD hardware. */
> -  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> -  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> -      && (cpu_features->basic.kind == arch_kind_intel
> -          || cpu_features->basic.kind == arch_kind_amd))
> -    memset_non_temporal_threshold = non_temporal_threshold;
> -
>    /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
>       cases slower than the vectorized path (and for some alignments,
>       it is really slow, check BZ #30994). */
> @@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (tunable_size != 0)
>      shared = tunable_size;
>
> +  /* Non-temporal stores are more performant on some hardware above
> +     non_temporal_threshold.  Currently Prefer_Non_Temporal is set for both
> +     Intel and AMD hardware.  */
> +  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
> +    memset_non_temporal_threshold = non_temporal_threshold;
> +
>    tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
>    if (tunable_size > minimum_non_temporal_threshold
>        && tunable_size <= maximum_non_temporal_threshold)
> --
> 2.34.1

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
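[Editorial note, not part of the thread: below is a minimal, self-contained C sketch of the knob pattern the patch moves to. The names BIT_AVOID_NT_MEMSET, compute_preferred, and memset_nt_threshold are made up for illustration; the real code uses cpu_features->preferred[], bit_arch_Avoid_Non_Temporal_Memset, and dl_init_cacheinfo. The idea: the "avoid" bit defaults to set, is cleared only for vendors with supporting benchmarks, and the threshold computation keys off the bit alone, with SIZE_MAX meaning "never take the non-temporal path".]

#include <stdint.h>
#include <stdio.h>

/* Made-up stand-in for glibc's preferred-feature bit.  */
#define BIT_AVOID_NT_MEMSET (1u << 0)

enum vendor { VENDOR_INTEL, VENDOR_AMD, VENDOR_OTHER };

static unsigned int
compute_preferred (enum vendor v)
{
  /* Default: avoid non-temporal memset everywhere...  */
  unsigned int preferred = BIT_AVOID_NT_MEMSET;

  /* ...and clear the bit only where benchmarks show the path pays off.  */
  if (v == VENDOR_INTEL || v == VENDOR_AMD)
    preferred &= ~BIT_AVOID_NT_MEMSET;

  return preferred;
}

static unsigned long int
memset_nt_threshold (unsigned int preferred,
                     unsigned long int non_temporal_threshold)
{
  /* SIZE_MAX effectively disables the non-temporal memset path: no request
     size can ever exceed it.  */
  return (preferred & BIT_AVOID_NT_MEMSET) ? SIZE_MAX : non_temporal_threshold;
}

int
main (void)
{
  unsigned long int nt = 1UL << 20;  /* pretend cache-derived threshold */
  printf ("intel/amd: %lu\n",
          memset_nt_threshold (compute_preferred (VENDOR_INTEL), nt));
  printf ("other:     %lu\n",
          memset_nt_threshold (compute_preferred (VENDOR_OTHER), nt));
  return 0;
}

The point of the refactor is visible here: the vendor check lives in one place, where the bit is set or cleared, and the threshold logic no longer needs to know about vendors at all.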
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 18ed008040..a4786d23c7 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -756,6 +756,12 @@ init_cpu_features (struct cpu_features *cpu_features)
   unsigned int stepping = 0;
   enum cpu_features_kind kind;
 
+  /* Default is to avoid non-temporal memset on non-Intel/AMD hardware.  As
+     of writing this, we only have benchmarks indicating its profitability
+     on Intel/AMD.  */
+  cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+      |= bit_arch_Avoid_Non_Temporal_Memset;
+
   cpu_features->cachesize_non_temporal_divisor = 4;
 #if !HAS_CPUID
   if (__get_cpuid_max (0, 0) == 0)
@@ -781,6 +787,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 
       update_active (cpu_features);
 
+      /* Benchmarks indicate non-temporal memset can be profitable on Intel
+         hardware.  */
+      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+          &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
       if (family == 0x06)
         {
           model += extended_model;
@@ -992,6 +1003,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
 
       ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
 
+      /* Benchmarks indicate non-temporal memset can be profitable on AMD
+         hardware.  */
+      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+          &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
         {
           /* Since the FMA4 bit is in CPUID_INDEX_80000001 and
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index a1c03b8903..3d0c8d43b8 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
-  /* Non-temporal stores are more performant on Intel and AMD hardware above
-     non_temporal_threshold. Enable this for both Intel and AMD hardware. */
-  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
-      && (cpu_features->basic.kind == arch_kind_intel
-          || cpu_features->basic.kind == arch_kind_amd))
-    memset_non_temporal_threshold = non_temporal_threshold;
-
   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
      cases slower than the vectorized path (and for some alignments,
      it is really slow, check BZ #30994). */
@@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (tunable_size != 0)
     shared = tunable_size;
 
+  /* Non-temporal stores are more performant on some hardware above
+     non_temporal_threshold.  Currently Prefer_Non_Temporal is set for both
+     Intel and AMD hardware.  */
+  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+    memset_non_temporal_threshold = non_temporal_threshold;
+
   tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
   if (tunable_size > minimum_non_temporal_threshold
       && tunable_size <= maximum_non_temporal_threshold)
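[Editorial note, not part of the patch: as context for what memset_non_temporal_threshold ultimately gates, here is a deliberately simplified C sketch of a memset-style routine that switches to non-temporal SSE2 stores above a threshold. The real glibc implementation is hand-written assembly under sysdeps/x86_64/multiarch and handles ERMS, vector widths, and alignment far more carefully; nt_threshold and memset_nt_sketch below are illustrative stand-ins, not glibc symbols.]

#include <emmintrin.h>   /* SSE2: _mm_set1_epi8, _mm_stream_si128, _mm_sfence */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Placeholder for the threshold glibc computes at startup
   (memset_non_temporal_threshold in dl_init_cacheinfo).  */
static unsigned long int nt_threshold = 1UL << 20;

static void
memset_nt_sketch (void *dst, int c, size_t n)
{
  /* Below the threshold, use the normal cache-allocating path.  */
  if (n < nt_threshold)
    {
      memset (dst, c, n);
      return;
    }

  char *p = dst;
  /* Align the destination to 16 bytes for the streaming stores.  */
  size_t head = (-(uintptr_t) p) & 15;
  memset (p, c, head);
  p += head;
  n -= head;

  __m128i v = _mm_set1_epi8 ((char) c);
  size_t i;
  for (i = 0; i + 16 <= n; i += 16)
    _mm_stream_si128 ((__m128i *) (p + i), v);  /* bypass the cache */
  _mm_sfence ();                                /* order the NT stores */

  memset (p + i, c, n - i);                     /* tail */
}

int
main (void)
{
  static char buf[1 << 21];
  memset_nt_sketch (buf, 0xab, sizeof buf);
  return buf[0] == (char) 0xab ? 0 : 1;
}

Because the patch sets memset_non_temporal_threshold to SIZE_MAX whenever Avoid_Non_Temporal_Memset is set, the size check above simply never triggers on hardware where the path has not been shown to help, without any vendor test in the threshold logic itself.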