Message ID | 20240813135525.144529-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | x86: Add new cpu-flag `Prefer_Non_Temporal` | expand |
On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The goal of this flag is to allow targets which don't prefer/have ERMS > to still access the non-temporal memset implementation. > --- > sysdeps/x86/cpu-tunables.c | 4 ++ > sysdeps/x86/dl-cacheinfo.h | 46 ++++++++++++++----- > ...cpu-features-preferred_feature_index_1.def | 1 + > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- > 5 files changed, 58 insertions(+), 18 deletions(-) > > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c > index ccc6b64dc2..b01b703517 100644 > --- a/sysdeps/x86/cpu-tunables.c > +++ b/sysdeps/x86/cpu-tunables.c > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) > (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); > } > break; > + case 32: > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, > + Prefer_Non_Temporal_Without_ERMS, 32); > + break; > } > } > } > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > index a1c03b8903..600bd66a93 100644 > --- a/sysdeps/x86/dl-cacheinfo.h > +++ b/sysdeps/x86/dl-cacheinfo.h > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > rep_movsb_threshold = 2112; > > - /* Non-temporal stores are more performant on Intel and AMD hardware above > - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > - unsigned long int memset_non_temporal_threshold = SIZE_MAX; > - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > - && (cpu_features->basic.kind == arch_kind_intel > - || cpu_features->basic.kind == arch_kind_amd)) > - memset_non_temporal_threshold = non_temporal_threshold; > - > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > cases slower than the vectorized path (and for some alignments, > it is really slow, check BZ #30994). */ > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > if (tunable_size != 0) > shared = tunable_size; > > + /* Non-temporal stores are more performant on Intel and AMD hardware above > + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) > + || cpu_features->basic.kind == arch_kind_intel > + || cpu_features->basic.kind == arch_kind_amd)) Please add Prefer_Non_Temporal instead and enable it for Intel and AMD. > + memset_non_temporal_threshold = non_temporal_threshold; > + > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); > if (tunable_size > minimum_non_temporal_threshold > && tunable_size <= maximum_non_temporal_threshold) > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > slightly better than ERMS. */ > rep_stosb_threshold = SIZE_MAX; > > + /* > + For memset, the non-temporal implementation is only accessed through the > + stosb code. ie: > + ``` > + if (size >= rep_stosb_thresh) > + { > + if (size >= non_temporal_thresh) > + { > + do_non_temporal (); > + } > + do_stosb (); > + } > + do_normal_vec_loop (); > + ``` > + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` > + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, > + `rep stosb` will never be used. > + */ > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > + memset_non_temporal_threshold, > + minimum_non_temporal_threshold, SIZE_MAX); > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) Please add another bit, Avoid_STOSB, and check it instead. > + rep_stosb_threshold > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); Please move and reuse the existing TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL) call. > + > + > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > minimum_non_temporal_threshold, > maximum_non_temporal_threshold); > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > - memset_non_temporal_threshold, > - minimum_non_temporal_threshold, SIZE_MAX); Please leave it here. > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > minimum_rep_movsb_threshold, SIZE_MAX); > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > index 61bbbc2e89..60a3a382ff 100644 > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) > BIT (Prefer_FSRM) > BIT (Avoid_Short_Distance_REP_MOVSB) > BIT (Avoid_Non_Temporal_Memset) > +BIT (Prefer_Non_Temporal_Without_ERMS) > \ No newline at end of file Please add a newline. > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c > index 94307283d7..4495d795e4 100644 > --- a/sysdeps/x86/tst-hwcap-tunables.c > +++ b/sysdeps/x86/tst-hwcap-tunables.c > @@ -60,7 +60,8 @@ static const struct test_t > /* Disable everything. */ > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > + "-Prefer_Non_Temporal_Without_ERMS", > test_1, > array_length (test_1) > }, > @@ -68,7 +69,8 @@ static const struct test_t > /* Same as before, but with some empty suboptions. */ > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," > - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", > + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > + "-Prefer_Non_Temporal_Without_ERMS,-,", > test_1, > array_length (test_1) > } > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h > index 7a637ef7ca..3ca64bb853 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > { > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > + || CPU_FEATURES_ARCH_P (cpu_features, > + Prefer_Non_Temporal_Without_ERMS)) Please check only Prefer_Non_Temporal instead. > return OPTIMIZE (avx512_unaligned_erms); > > return OPTIMIZE (avx512_unaligned); > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > { > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > + || CPU_FEATURES_ARCH_P (cpu_features, > + Prefer_Non_Temporal_Without_ERMS)) > return OPTIMIZE (evex_unaligned_erms); Please check only Prefer_Non_Temporal instead. > > return OPTIMIZE (evex_unaligned); > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > { > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > + || CPU_FEATURES_ARCH_P (cpu_features, > + Prefer_Non_Temporal_Without_ERMS)) > return OPTIMIZE (avx2_unaligned_erms_rtm); Please check only Prefer_Non_Temporal instead. > > return OPTIMIZE (avx2_unaligned_rtm); > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > Prefer_No_VZEROUPPER, !)) > { > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > + || CPU_FEATURES_ARCH_P (cpu_features, > + Prefer_Non_Temporal_Without_ERMS)) > return OPTIMIZE (avx2_unaligned_erms); Please check only Prefer_Non_Temporal instead. > > return OPTIMIZE (avx2_unaligned); > } > } > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > return OPTIMIZE (sse2_unaligned_erms); Please check only Prefer_Non_Temporal instead. > > return OPTIMIZE (sse2_unaligned); > -- > 2.34.1 >
On Tue, Aug 13, 2024 at 7:44 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The goal of this flag is to allow targets which don't prefer/have ERMS > > to still access the non-temporal memset implementation. > > --- > > sysdeps/x86/cpu-tunables.c | 4 ++ > > sysdeps/x86/dl-cacheinfo.h | 46 ++++++++++++++----- > > ...cpu-features-preferred_feature_index_1.def | 1 + > > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- > > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- > > 5 files changed, 58 insertions(+), 18 deletions(-) > > > > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c > > index ccc6b64dc2..b01b703517 100644 > > --- a/sysdeps/x86/cpu-tunables.c > > +++ b/sysdeps/x86/cpu-tunables.c > > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) > > (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); > > } > > break; > > + case 32: > > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, > > + Prefer_Non_Temporal_Without_ERMS, 32); > > + break; > > } > > } > > } > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > > index a1c03b8903..600bd66a93 100644 > > --- a/sysdeps/x86/dl-cacheinfo.h > > +++ b/sysdeps/x86/dl-cacheinfo.h > > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > > rep_movsb_threshold = 2112; > > > > - /* Non-temporal stores are more performant on Intel and AMD hardware above > > - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > - unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > - && (cpu_features->basic.kind == arch_kind_intel > > - || cpu_features->basic.kind == arch_kind_amd)) > > - memset_non_temporal_threshold = non_temporal_threshold; > > - > > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > > cases slower than the vectorized path (and for some alignments, > > it is really slow, check BZ #30994). */ > > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > if (tunable_size != 0) > > shared = tunable_size; > > > > + /* Non-temporal stores are more performant on Intel and AMD hardware above > > + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) > > + || cpu_features->basic.kind == arch_kind_intel > > + || cpu_features->basic.kind == arch_kind_amd)) > > Please add Prefer_Non_Temporal instead and enable it for Intel and AMD. > > > + memset_non_temporal_threshold = non_temporal_threshold; > > + > > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); > > if (tunable_size > minimum_non_temporal_threshold > > && tunable_size <= maximum_non_temporal_threshold) > > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > slightly better than ERMS. */ > > rep_stosb_threshold = SIZE_MAX; > > > > + /* > > + For memset, the non-temporal implementation is only accessed through the > > + stosb code. ie: > > + ``` > > + if (size >= rep_stosb_thresh) > > + { > > + if (size >= non_temporal_thresh) > > + { > > + do_non_temporal (); > > + } > > + do_stosb (); > > + } > > + do_normal_vec_loop (); > > + ``` > > + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` > > + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, > > + `rep stosb` will never be used. > > + */ > > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > + memset_non_temporal_threshold, > > + minimum_non_temporal_threshold, SIZE_MAX); > > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > Please add another bit, Avoid_STOSB, and check it instead. > > > + rep_stosb_threshold > > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); > > Please move and reuse the existing TUNABLE_GET > (x86_memset_non_temporal_threshold, long int, NULL) > call. > > > + > > + > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > > minimum_non_temporal_threshold, > > maximum_non_temporal_threshold); > > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > - memset_non_temporal_threshold, > > - minimum_non_temporal_threshold, SIZE_MAX); > > Please leave it here. > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > > minimum_rep_movsb_threshold, SIZE_MAX); > > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > index 61bbbc2e89..60a3a382ff 100644 > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) > > BIT (Prefer_FSRM) > > BIT (Avoid_Short_Distance_REP_MOVSB) > > BIT (Avoid_Non_Temporal_Memset) > > +BIT (Prefer_Non_Temporal_Without_ERMS) Please also add some comments for Prefer_Non_Temporal and Avoid_STOSB. > > \ No newline at end of file > > Please add a newline. > > > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c > > index 94307283d7..4495d795e4 100644 > > --- a/sysdeps/x86/tst-hwcap-tunables.c > > +++ b/sysdeps/x86/tst-hwcap-tunables.c > > @@ -60,7 +60,8 @@ static const struct test_t > > /* Disable everything. */ > > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," > > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", > > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > + "-Prefer_Non_Temporal_Without_ERMS", > > test_1, > > array_length (test_1) > > }, > > @@ -68,7 +69,8 @@ static const struct test_t > > /* Same as before, but with some empty suboptions. */ > > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," > > - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", > > + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > + "-Prefer_Non_Temporal_Without_ERMS,-,", > > test_1, > > array_length (test_1) > > } > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h > > index 7a637ef7ca..3ca64bb853 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h > > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h > > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_Non_Temporal_Without_ERMS)) > > Please check only Prefer_Non_Temporal instead. > > > return OPTIMIZE (avx512_unaligned_erms); > > > > return OPTIMIZE (avx512_unaligned); > > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_Non_Temporal_Without_ERMS)) > > return OPTIMIZE (evex_unaligned_erms); > > Please check only Prefer_Non_Temporal instead. > > > > > return OPTIMIZE (evex_unaligned); > > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) > > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_Non_Temporal_Without_ERMS)) > > return OPTIMIZE (avx2_unaligned_erms_rtm); > > Please check only Prefer_Non_Temporal instead. > > > > > return OPTIMIZE (avx2_unaligned_rtm); > > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) > > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > Prefer_No_VZEROUPPER, !)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_Non_Temporal_Without_ERMS)) > > return OPTIMIZE (avx2_unaligned_erms); > > Please check only Prefer_Non_Temporal instead. > > > > > return OPTIMIZE (avx2_unaligned); > > } > > } > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > return OPTIMIZE (sse2_unaligned_erms); > > Please check only Prefer_Non_Temporal instead. > > > > > return OPTIMIZE (sse2_unaligned); > > -- > > 2.34.1 > > > > > -- > H.J.
On Tue, Aug 13, 2024 at 10:45 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > The goal of this flag is to allow targets which don't prefer/have ERMS > > to still access the non-temporal memset implementation. > > --- > > sysdeps/x86/cpu-tunables.c | 4 ++ > > sysdeps/x86/dl-cacheinfo.h | 46 ++++++++++++++----- > > ...cpu-features-preferred_feature_index_1.def | 1 + > > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- > > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- > > 5 files changed, 58 insertions(+), 18 deletions(-) > > > > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c > > index ccc6b64dc2..b01b703517 100644 > > --- a/sysdeps/x86/cpu-tunables.c > > +++ b/sysdeps/x86/cpu-tunables.c > > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) > > (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); > > } > > break; > > + case 32: > > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, > > + Prefer_Non_Temporal_Without_ERMS, 32); > > + break; > > } > > } > > } > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > > index a1c03b8903..600bd66a93 100644 > > --- a/sysdeps/x86/dl-cacheinfo.h > > +++ b/sysdeps/x86/dl-cacheinfo.h > > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > > rep_movsb_threshold = 2112; > > > > - /* Non-temporal stores are more performant on Intel and AMD hardware above > > - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > - unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > - && (cpu_features->basic.kind == arch_kind_intel > > - || cpu_features->basic.kind == arch_kind_amd)) > > - memset_non_temporal_threshold = non_temporal_threshold; > > - > > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > > cases slower than the vectorized path (and for some alignments, > > it is really slow, check BZ #30994). */ > > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > if (tunable_size != 0) > > shared = tunable_size; > > > > + /* Non-temporal stores are more performant on Intel and AMD hardware above > > + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) > > + || cpu_features->basic.kind == arch_kind_intel > > + || cpu_features->basic.kind == arch_kind_amd)) > > Please add Prefer_Non_Temporal instead and enable it for Intel and AMD. > > > + memset_non_temporal_threshold = non_temporal_threshold; > > + > > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); > > if (tunable_size > minimum_non_temporal_threshold > > && tunable_size <= maximum_non_temporal_threshold) > > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > slightly better than ERMS. */ > > rep_stosb_threshold = SIZE_MAX; > > > > + /* > > + For memset, the non-temporal implementation is only accessed through the > > + stosb code. ie: > > + ``` > > + if (size >= rep_stosb_thresh) > > + { > > + if (size >= non_temporal_thresh) > > + { > > + do_non_temporal (); > > + } > > + do_stosb (); > > + } > > + do_normal_vec_loop (); > > + ``` > > + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` > > + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, > > + `rep stosb` will never be used. > > + */ > > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > + memset_non_temporal_threshold, > > + minimum_non_temporal_threshold, SIZE_MAX); > > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > Please add another bit, Avoid_STOSB, and check it instead. > > > + rep_stosb_threshold > > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); > > Please move and reuse the existing TUNABLE_GET > (x86_memset_non_temporal_threshold, long int, NULL) > call. > > > + > > + > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > > minimum_non_temporal_threshold, > > maximum_non_temporal_threshold); > > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > - memset_non_temporal_threshold, > > - minimum_non_temporal_threshold, SIZE_MAX); > > Please leave it here. > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > > minimum_rep_movsb_threshold, SIZE_MAX); > > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > index 61bbbc2e89..60a3a382ff 100644 > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) > > BIT (Prefer_FSRM) > > BIT (Avoid_Short_Distance_REP_MOVSB) > > BIT (Avoid_Non_Temporal_Memset) > > +BIT (Prefer_Non_Temporal_Without_ERMS) > > \ No newline at end of file > > Please add a newline. > > > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c > > index 94307283d7..4495d795e4 100644 > > --- a/sysdeps/x86/tst-hwcap-tunables.c > > +++ b/sysdeps/x86/tst-hwcap-tunables.c > > @@ -60,7 +60,8 @@ static const struct test_t > > /* Disable everything. */ > > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," > > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", > > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > + "-Prefer_Non_Temporal_Without_ERMS", > > test_1, > > array_length (test_1) > > }, > > @@ -68,7 +69,8 @@ static const struct test_t > > /* Same as before, but with some empty suboptions. */ > > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," > > - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", > > + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > + "-Prefer_Non_Temporal_Without_ERMS,-,", > > test_1, > > array_length (test_1) > > } > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h > > index 7a637ef7ca..3ca64bb853 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h > > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h > > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_Non_Temporal_Without_ERMS)) > > Please check only Prefer_Non_Temporal instead. > > > return OPTIMIZE (avx512_unaligned_erms); > > > > return OPTIMIZE (avx512_unaligned); > > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_Non_Temporal_Without_ERMS)) > > return OPTIMIZE (evex_unaligned_erms); > > Please check only Prefer_Non_Temporal instead. > > > > > return OPTIMIZE (evex_unaligned); > > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) > > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_Non_Temporal_Without_ERMS)) > > return OPTIMIZE (avx2_unaligned_erms_rtm); > > Please check only Prefer_Non_Temporal instead. > > > > > return OPTIMIZE (avx2_unaligned_rtm); > > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) > > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > Prefer_No_VZEROUPPER, !)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_Non_Temporal_Without_ERMS)) > > return OPTIMIZE (avx2_unaligned_erms); > > Please check only Prefer_Non_Temporal instead. > > > > > return OPTIMIZE (avx2_unaligned); > > } > > } > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > return OPTIMIZE (sse2_unaligned_erms); > > Please check only Prefer_Non_Temporal instead. > Hmm? Why remove the ERMS check here? Shouldn't we want to select the `_erms` implemention if we want either non-temporal OR erms? Everything else sounds fine though. > > > > return OPTIMIZE (sse2_unaligned); > > -- > > 2.34.1 > > > > > -- > H.J.
On Tue, Aug 13, 2024 at 8:05 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Tue, Aug 13, 2024 at 10:45 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > The goal of this flag is to allow targets which don't prefer/have ERMS > > > to still access the non-temporal memset implementation. > > > --- > > > sysdeps/x86/cpu-tunables.c | 4 ++ > > > sysdeps/x86/dl-cacheinfo.h | 46 ++++++++++++++----- > > > ...cpu-features-preferred_feature_index_1.def | 1 + > > > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- > > > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- > > > 5 files changed, 58 insertions(+), 18 deletions(-) > > > > > > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c > > > index ccc6b64dc2..b01b703517 100644 > > > --- a/sysdeps/x86/cpu-tunables.c > > > +++ b/sysdeps/x86/cpu-tunables.c > > > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) > > > (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); > > > } > > > break; > > > + case 32: > > > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, > > > + Prefer_Non_Temporal_Without_ERMS, 32); > > > + break; > > > } > > > } > > > } > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > > > index a1c03b8903..600bd66a93 100644 > > > --- a/sysdeps/x86/dl-cacheinfo.h > > > +++ b/sysdeps/x86/dl-cacheinfo.h > > > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > > > rep_movsb_threshold = 2112; > > > > > > - /* Non-temporal stores are more performant on Intel and AMD hardware above > > > - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > - unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > - && (cpu_features->basic.kind == arch_kind_intel > > > - || cpu_features->basic.kind == arch_kind_amd)) > > > - memset_non_temporal_threshold = non_temporal_threshold; > > > - > > > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > > > cases slower than the vectorized path (and for some alignments, > > > it is really slow, check BZ #30994). */ > > > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > if (tunable_size != 0) > > > shared = tunable_size; > > > > > > + /* Non-temporal stores are more performant on Intel and AMD hardware above > > > + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) > > > + || cpu_features->basic.kind == arch_kind_intel > > > + || cpu_features->basic.kind == arch_kind_amd)) > > > > Please add Prefer_Non_Temporal instead and enable it for Intel and AMD. > > > > > + memset_non_temporal_threshold = non_temporal_threshold; > > > + > > > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); > > > if (tunable_size > minimum_non_temporal_threshold > > > && tunable_size <= maximum_non_temporal_threshold) > > > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > slightly better than ERMS. */ > > > rep_stosb_threshold = SIZE_MAX; > > > > > > + /* > > > + For memset, the non-temporal implementation is only accessed through the > > > + stosb code. ie: > > > + ``` > > > + if (size >= rep_stosb_thresh) > > > + { > > > + if (size >= non_temporal_thresh) > > > + { > > > + do_non_temporal (); > > > + } > > > + do_stosb (); > > > + } > > > + do_normal_vec_loop (); > > > + ``` > > > + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` > > > + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, > > > + `rep stosb` will never be used. > > > + */ > > > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > + memset_non_temporal_threshold, > > > + minimum_non_temporal_threshold, SIZE_MAX); > > > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > > Please add another bit, Avoid_STOSB, and check it instead. > > > > > + rep_stosb_threshold > > > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); > > > > Please move and reuse the existing TUNABLE_GET > > (x86_memset_non_temporal_threshold, long int, NULL) > > call. > > > > > + > > > + > > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > > > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > > > minimum_non_temporal_threshold, > > > maximum_non_temporal_threshold); > > > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > - memset_non_temporal_threshold, > > > - minimum_non_temporal_threshold, SIZE_MAX); > > > > Please leave it here. > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > > > minimum_rep_movsb_threshold, SIZE_MAX); > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > index 61bbbc2e89..60a3a382ff 100644 > > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) > > > BIT (Prefer_FSRM) > > > BIT (Avoid_Short_Distance_REP_MOVSB) > > > BIT (Avoid_Non_Temporal_Memset) > > > +BIT (Prefer_Non_Temporal_Without_ERMS) > > > \ No newline at end of file > > > > Please add a newline. > > > > > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c > > > index 94307283d7..4495d795e4 100644 > > > --- a/sysdeps/x86/tst-hwcap-tunables.c > > > +++ b/sysdeps/x86/tst-hwcap-tunables.c > > > @@ -60,7 +60,8 @@ static const struct test_t > > > /* Disable everything. */ > > > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," > > > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", > > > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > + "-Prefer_Non_Temporal_Without_ERMS", > > > test_1, > > > array_length (test_1) > > > }, > > > @@ -68,7 +69,8 @@ static const struct test_t > > > /* Same as before, but with some empty suboptions. */ > > > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," > > > - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", > > > + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > + "-Prefer_Non_Temporal_Without_ERMS,-,", > > > test_1, > > > array_length (test_1) > > > } > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > index 7a637ef7ca..3ca64bb853 100644 > > > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h > > > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > { > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > Please check only Prefer_Non_Temporal instead. > > > > > return OPTIMIZE (avx512_unaligned_erms); > > > > > > return OPTIMIZE (avx512_unaligned); > > > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > { > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > + Prefer_Non_Temporal_Without_ERMS)) > > > return OPTIMIZE (evex_unaligned_erms); > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > return OPTIMIZE (evex_unaligned); > > > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > > { > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > + Prefer_Non_Temporal_Without_ERMS)) > > > return OPTIMIZE (avx2_unaligned_erms_rtm); > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > return OPTIMIZE (avx2_unaligned_rtm); > > > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) > > > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > > Prefer_No_VZEROUPPER, !)) > > > { > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > + Prefer_Non_Temporal_Without_ERMS)) > > > return OPTIMIZE (avx2_unaligned_erms); > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > return OPTIMIZE (avx2_unaligned); > > > } > > > } > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > return OPTIMIZE (sse2_unaligned_erms); > > > > Please check only Prefer_Non_Temporal instead. > > > > Hmm? Why remove the ERMS check here? Shouldn't we want to select > the `_erms` implemention if we want either non-temporal OR erms? You said NT store must go through ERMS. You can do if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) Set Prefer_Non_Temporal for Intel and AMD. > > Everything else sounds fine though. > > > > > > > return OPTIMIZE (sse2_unaligned); > > > -- > > > 2.34.1 > > > > > > > > > -- > > H.J.
On Tue, Aug 13, 2024 at 11:12 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Tue, Aug 13, 2024 at 8:05 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Tue, Aug 13, 2024 at 10:45 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > The goal of this flag is to allow targets which don't prefer/have ERMS > > > > to still access the non-temporal memset implementation. > > > > --- > > > > sysdeps/x86/cpu-tunables.c | 4 ++ > > > > sysdeps/x86/dl-cacheinfo.h | 46 ++++++++++++++----- > > > > ...cpu-features-preferred_feature_index_1.def | 1 + > > > > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- > > > > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- > > > > 5 files changed, 58 insertions(+), 18 deletions(-) > > > > > > > > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c > > > > index ccc6b64dc2..b01b703517 100644 > > > > --- a/sysdeps/x86/cpu-tunables.c > > > > +++ b/sysdeps/x86/cpu-tunables.c > > > > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) > > > > (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); > > > > } > > > > break; > > > > + case 32: > > > > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, > > > > + Prefer_Non_Temporal_Without_ERMS, 32); > > > > + break; > > > > } > > > > } > > > > } > > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > > > > index a1c03b8903..600bd66a93 100644 > > > > --- a/sysdeps/x86/dl-cacheinfo.h > > > > +++ b/sysdeps/x86/dl-cacheinfo.h > > > > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > > > > rep_movsb_threshold = 2112; > > > > > > > > - /* Non-temporal stores are more performant on Intel and AMD hardware above > > > > - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > > - unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > > - && (cpu_features->basic.kind == arch_kind_intel > > > > - || cpu_features->basic.kind == arch_kind_amd)) > > > > - memset_non_temporal_threshold = non_temporal_threshold; > > > > - > > > > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > > > > cases slower than the vectorized path (and for some alignments, > > > > it is really slow, check BZ #30994). */ > > > > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > if (tunable_size != 0) > > > > shared = tunable_size; > > > > > > > > + /* Non-temporal stores are more performant on Intel and AMD hardware above > > > > + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > > + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) > > > > + || cpu_features->basic.kind == arch_kind_intel > > > > + || cpu_features->basic.kind == arch_kind_amd)) > > > > > > Please add Prefer_Non_Temporal instead and enable it for Intel and AMD. > > > > > > > + memset_non_temporal_threshold = non_temporal_threshold; > > > > + > > > > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); > > > > if (tunable_size > minimum_non_temporal_threshold > > > > && tunable_size <= maximum_non_temporal_threshold) > > > > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > slightly better than ERMS. */ > > > > rep_stosb_threshold = SIZE_MAX; > > > > > > > > + /* > > > > + For memset, the non-temporal implementation is only accessed through the > > > > + stosb code. ie: > > > > + ``` > > > > + if (size >= rep_stosb_thresh) > > > > + { > > > > + if (size >= non_temporal_thresh) > > > > + { > > > > + do_non_temporal (); > > > > + } > > > > + do_stosb (); > > > > + } > > > > + do_normal_vec_loop (); > > > > + ``` > > > > + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` > > > > + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, > > > > + `rep stosb` will never be used. > > > > + */ > > > > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > > + memset_non_temporal_threshold, > > > > + minimum_non_temporal_threshold, SIZE_MAX); > > > > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > > > > Please add another bit, Avoid_STOSB, and check it instead. > > > > > > > + rep_stosb_threshold > > > > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); > > > > > > Please move and reuse the existing TUNABLE_GET > > > (x86_memset_non_temporal_threshold, long int, NULL) > > > call. > > > > > > > + > > > > + > > > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > > > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > > > > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > > > > minimum_non_temporal_threshold, > > > > maximum_non_temporal_threshold); > > > > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > > - memset_non_temporal_threshold, > > > > - minimum_non_temporal_threshold, SIZE_MAX); > > > > > > Please leave it here. > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > > > > minimum_rep_movsb_threshold, SIZE_MAX); > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > > > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > index 61bbbc2e89..60a3a382ff 100644 > > > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) > > > > BIT (Prefer_FSRM) > > > > BIT (Avoid_Short_Distance_REP_MOVSB) > > > > BIT (Avoid_Non_Temporal_Memset) > > > > +BIT (Prefer_Non_Temporal_Without_ERMS) > > > > \ No newline at end of file > > > > > > Please add a newline. > > > > > > > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c > > > > index 94307283d7..4495d795e4 100644 > > > > --- a/sysdeps/x86/tst-hwcap-tunables.c > > > > +++ b/sysdeps/x86/tst-hwcap-tunables.c > > > > @@ -60,7 +60,8 @@ static const struct test_t > > > > /* Disable everything. */ > > > > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," > > > > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", > > > > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > + "-Prefer_Non_Temporal_Without_ERMS", > > > > test_1, > > > > array_length (test_1) > > > > }, > > > > @@ -68,7 +69,8 @@ static const struct test_t > > > > /* Same as before, but with some empty suboptions. */ > > > > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," > > > > - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", > > > > + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > + "-Prefer_Non_Temporal_Without_ERMS,-,", > > > > test_1, > > > > array_length (test_1) > > > > } > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > index 7a637ef7ca..3ca64bb853 100644 > > > > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > > { > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > return OPTIMIZE (avx512_unaligned_erms); > > > > > > > > return OPTIMIZE (avx512_unaligned); > > > > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > > { > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > return OPTIMIZE (evex_unaligned_erms); > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > return OPTIMIZE (evex_unaligned); > > > > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) > > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > > > { > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > return OPTIMIZE (avx2_unaligned_erms_rtm); > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > return OPTIMIZE (avx2_unaligned_rtm); > > > > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) > > > > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > > > Prefer_No_VZEROUPPER, !)) > > > > { > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > return OPTIMIZE (avx2_unaligned_erms); > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > return OPTIMIZE (avx2_unaligned); > > > > } > > > > } > > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > > return OPTIMIZE (sse2_unaligned_erms); > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > Hmm? Why remove the ERMS check here? Shouldn't we want to select > > the `_erms` implemention if we want either non-temporal OR erms? > > You said NT store must go through ERMS. You can do > > if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > Set Prefer_Non_Temporal > > for Intel and AMD. What about machines that want ERMS but not NT? > > > > > Everything else sounds fine though. > > > > > > > > > > return OPTIMIZE (sse2_unaligned); > > > > -- > > > > 2.34.1 > > > > > > > > > > > > > -- > > > H.J. > > > > -- > H.J.
On Tue, Aug 13, 2024 at 8:24 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Tue, Aug 13, 2024 at 11:12 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Tue, Aug 13, 2024 at 8:05 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Tue, Aug 13, 2024 at 10:45 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > The goal of this flag is to allow targets which don't prefer/have ERMS > > > > > to still access the non-temporal memset implementation. > > > > > --- > > > > > sysdeps/x86/cpu-tunables.c | 4 ++ > > > > > sysdeps/x86/dl-cacheinfo.h | 46 ++++++++++++++----- > > > > > ...cpu-features-preferred_feature_index_1.def | 1 + > > > > > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- > > > > > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- > > > > > 5 files changed, 58 insertions(+), 18 deletions(-) > > > > > > > > > > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c > > > > > index ccc6b64dc2..b01b703517 100644 > > > > > --- a/sysdeps/x86/cpu-tunables.c > > > > > +++ b/sysdeps/x86/cpu-tunables.c > > > > > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) > > > > > (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); > > > > > } > > > > > break; > > > > > + case 32: > > > > > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, > > > > > + Prefer_Non_Temporal_Without_ERMS, 32); > > > > > + break; > > > > > } > > > > > } > > > > > } > > > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > > > > > index a1c03b8903..600bd66a93 100644 > > > > > --- a/sysdeps/x86/dl-cacheinfo.h > > > > > +++ b/sysdeps/x86/dl-cacheinfo.h > > > > > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > > > > > rep_movsb_threshold = 2112; > > > > > > > > > > - /* Non-temporal stores are more performant on Intel and AMD hardware above > > > > > - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > > > - unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > > > - && (cpu_features->basic.kind == arch_kind_intel > > > > > - || cpu_features->basic.kind == arch_kind_amd)) > > > > > - memset_non_temporal_threshold = non_temporal_threshold; > > > > > - > > > > > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > > > > > cases slower than the vectorized path (and for some alignments, > > > > > it is really slow, check BZ #30994). */ > > > > > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > > if (tunable_size != 0) > > > > > shared = tunable_size; > > > > > > > > > > + /* Non-temporal stores are more performant on Intel and AMD hardware above > > > > > + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > > > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > > > + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) > > > > > + || cpu_features->basic.kind == arch_kind_intel > > > > > + || cpu_features->basic.kind == arch_kind_amd)) > > > > > > > > Please add Prefer_Non_Temporal instead and enable it for Intel and AMD. > > > > > > > > > + memset_non_temporal_threshold = non_temporal_threshold; > > > > > + > > > > > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); > > > > > if (tunable_size > minimum_non_temporal_threshold > > > > > && tunable_size <= maximum_non_temporal_threshold) > > > > > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > > slightly better than ERMS. */ > > > > > rep_stosb_threshold = SIZE_MAX; > > > > > > > > > > + /* > > > > > + For memset, the non-temporal implementation is only accessed through the > > > > > + stosb code. ie: > > > > > + ``` > > > > > + if (size >= rep_stosb_thresh) > > > > > + { > > > > > + if (size >= non_temporal_thresh) > > > > > + { > > > > > + do_non_temporal (); > > > > > + } > > > > > + do_stosb (); > > > > > + } > > > > > + do_normal_vec_loop (); > > > > > + ``` > > > > > + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` > > > > > + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, > > > > > + `rep stosb` will never be used. > > > > > + */ > > > > > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > > > + memset_non_temporal_threshold, > > > > > + minimum_non_temporal_threshold, SIZE_MAX); > > > > > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > Please add another bit, Avoid_STOSB, and check it instead. > > > > > > > > > + rep_stosb_threshold > > > > > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); > > > > > > > > Please move and reuse the existing TUNABLE_GET > > > > (x86_memset_non_temporal_threshold, long int, NULL) > > > > call. > > > > > > > > > + > > > > > + > > > > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > > > > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > > > > > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > > > > > minimum_non_temporal_threshold, > > > > > maximum_non_temporal_threshold); > > > > > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > > > - memset_non_temporal_threshold, > > > > > - minimum_non_temporal_threshold, SIZE_MAX); > > > > > > > > Please leave it here. > > > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > > > > > minimum_rep_movsb_threshold, SIZE_MAX); > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > > > > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > index 61bbbc2e89..60a3a382ff 100644 > > > > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) > > > > > BIT (Prefer_FSRM) > > > > > BIT (Avoid_Short_Distance_REP_MOVSB) > > > > > BIT (Avoid_Non_Temporal_Memset) > > > > > +BIT (Prefer_Non_Temporal_Without_ERMS) > > > > > \ No newline at end of file > > > > > > > > Please add a newline. > > > > > > > > > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c > > > > > index 94307283d7..4495d795e4 100644 > > > > > --- a/sysdeps/x86/tst-hwcap-tunables.c > > > > > +++ b/sysdeps/x86/tst-hwcap-tunables.c > > > > > @@ -60,7 +60,8 @@ static const struct test_t > > > > > /* Disable everything. */ > > > > > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," > > > > > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", > > > > > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > > + "-Prefer_Non_Temporal_Without_ERMS", > > > > > test_1, > > > > > array_length (test_1) > > > > > }, > > > > > @@ -68,7 +69,8 @@ static const struct test_t > > > > > /* Same as before, but with some empty suboptions. */ > > > > > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," > > > > > - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", > > > > > + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > > + "-Prefer_Non_Temporal_Without_ERMS,-,", > > > > > test_1, > > > > > array_length (test_1) > > > > > } > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > index 7a637ef7ca..3ca64bb853 100644 > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > > > { > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > return OPTIMIZE (avx512_unaligned_erms); > > > > > > > > > > return OPTIMIZE (avx512_unaligned); > > > > > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > > > { > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > return OPTIMIZE (evex_unaligned_erms); > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > return OPTIMIZE (evex_unaligned); > > > > > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) > > > > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > > > > { > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > return OPTIMIZE (avx2_unaligned_erms_rtm); > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > return OPTIMIZE (avx2_unaligned_rtm); > > > > > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) > > > > > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > > > > Prefer_No_VZEROUPPER, !)) > > > > > { > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > return OPTIMIZE (avx2_unaligned_erms); > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > return OPTIMIZE (avx2_unaligned); > > > > > } > > > > > } > > > > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > > > return OPTIMIZE (sse2_unaligned_erms); > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > Hmm? Why remove the ERMS check here? Shouldn't we want to select > > > the `_erms` implemention if we want either non-temporal OR erms? > > > > You said NT store must go through ERMS. You can do > > > > if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > Set Prefer_Non_Temporal > > > > for Intel and AMD. > > What about machines that want ERMS but not NT? We can check CPUID to turn off Prefer_Non_Temporal. > > > > > > > > Everything else sounds fine though. > > > > > > > > > > > > > return OPTIMIZE (sse2_unaligned); > > > > > -- > > > > > 2.34.1 > > > > > > > > > > > > > > > > > -- > > > > H.J. > > > > > > > > -- > > H.J.
On Tue, Aug 13, 2024 at 11:33 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Tue, Aug 13, 2024 at 8:24 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Tue, Aug 13, 2024 at 11:12 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Tue, Aug 13, 2024 at 8:05 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > On Tue, Aug 13, 2024 at 10:45 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > > > On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > > > The goal of this flag is to allow targets which don't prefer/have ERMS > > > > > > to still access the non-temporal memset implementation. > > > > > > --- > > > > > > sysdeps/x86/cpu-tunables.c | 4 ++ > > > > > > sysdeps/x86/dl-cacheinfo.h | 46 ++++++++++++++----- > > > > > > ...cpu-features-preferred_feature_index_1.def | 1 + > > > > > > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- > > > > > > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- > > > > > > 5 files changed, 58 insertions(+), 18 deletions(-) > > > > > > > > > > > > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c > > > > > > index ccc6b64dc2..b01b703517 100644 > > > > > > --- a/sysdeps/x86/cpu-tunables.c > > > > > > +++ b/sysdeps/x86/cpu-tunables.c > > > > > > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) > > > > > > (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); > > > > > > } > > > > > > break; > > > > > > + case 32: > > > > > > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, > > > > > > + Prefer_Non_Temporal_Without_ERMS, 32); > > > > > > + break; > > > > > > } > > > > > > } > > > > > > } > > > > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > > > > > > index a1c03b8903..600bd66a93 100644 > > > > > > --- a/sysdeps/x86/dl-cacheinfo.h > > > > > > +++ b/sysdeps/x86/dl-cacheinfo.h > > > > > > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > > > > > > rep_movsb_threshold = 2112; > > > > > > > > > > > > - /* Non-temporal stores are more performant on Intel and AMD hardware above > > > > > > - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > > > > - unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > > > > - && (cpu_features->basic.kind == arch_kind_intel > > > > > > - || cpu_features->basic.kind == arch_kind_amd)) > > > > > > - memset_non_temporal_threshold = non_temporal_threshold; > > > > > > - > > > > > > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > > > > > > cases slower than the vectorized path (and for some alignments, > > > > > > it is really slow, check BZ #30994). */ > > > > > > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > > > if (tunable_size != 0) > > > > > > shared = tunable_size; > > > > > > > > > > > > + /* Non-temporal stores are more performant on Intel and AMD hardware above > > > > > > + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > > > > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > > > > + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) > > > > > > + || cpu_features->basic.kind == arch_kind_intel > > > > > > + || cpu_features->basic.kind == arch_kind_amd)) > > > > > > > > > > Please add Prefer_Non_Temporal instead and enable it for Intel and AMD. > > > > > > > > > > > + memset_non_temporal_threshold = non_temporal_threshold; > > > > > > + > > > > > > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); > > > > > > if (tunable_size > minimum_non_temporal_threshold > > > > > > && tunable_size <= maximum_non_temporal_threshold) > > > > > > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > > > slightly better than ERMS. */ > > > > > > rep_stosb_threshold = SIZE_MAX; > > > > > > > > > > > > + /* > > > > > > + For memset, the non-temporal implementation is only accessed through the > > > > > > + stosb code. ie: > > > > > > + ``` > > > > > > + if (size >= rep_stosb_thresh) > > > > > > + { > > > > > > + if (size >= non_temporal_thresh) > > > > > > + { > > > > > > + do_non_temporal (); > > > > > > + } > > > > > > + do_stosb (); > > > > > > + } > > > > > > + do_normal_vec_loop (); > > > > > > + ``` > > > > > > + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` > > > > > > + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, > > > > > > + `rep stosb` will never be used. > > > > > > + */ > > > > > > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > > > > + memset_non_temporal_threshold, > > > > > > + minimum_non_temporal_threshold, SIZE_MAX); > > > > > > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > > > Please add another bit, Avoid_STOSB, and check it instead. > > > > > > > > > > > + rep_stosb_threshold > > > > > > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); > > > > > > > > > > Please move and reuse the existing TUNABLE_GET > > > > > (x86_memset_non_temporal_threshold, long int, NULL) > > > > > call. Would prefer to leave as is. We need the final values to be equal and it seems less bug prone to use the final value as opposed to having a seperate bounds check here. > > > > > > > > > > > + > > > > > > + > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > > > > > > minimum_non_temporal_threshold, > > > > > > maximum_non_temporal_threshold); > > > > > > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > > > > - memset_non_temporal_threshold, > > > > > > - minimum_non_temporal_threshold, SIZE_MAX); > > > > > > > > > > Please leave it here. > > > > > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > > > > > > minimum_rep_movsb_threshold, SIZE_MAX); > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > > > > > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > > index 61bbbc2e89..60a3a382ff 100644 > > > > > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) > > > > > > BIT (Prefer_FSRM) > > > > > > BIT (Avoid_Short_Distance_REP_MOVSB) > > > > > > BIT (Avoid_Non_Temporal_Memset) > > > > > > +BIT (Prefer_Non_Temporal_Without_ERMS) > > > > > > \ No newline at end of file > > > > > > > > > > Please add a newline. > > > > > > > > > > > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c > > > > > > index 94307283d7..4495d795e4 100644 > > > > > > --- a/sysdeps/x86/tst-hwcap-tunables.c > > > > > > +++ b/sysdeps/x86/tst-hwcap-tunables.c > > > > > > @@ -60,7 +60,8 @@ static const struct test_t > > > > > > /* Disable everything. */ > > > > > > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," > > > > > > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", > > > > > > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > > > + "-Prefer_Non_Temporal_Without_ERMS", > > > > > > test_1, > > > > > > array_length (test_1) > > > > > > }, > > > > > > @@ -68,7 +69,8 @@ static const struct test_t > > > > > > /* Same as before, but with some empty suboptions. */ > > > > > > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," > > > > > > - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", > > > > > > + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > > > + "-Prefer_Non_Temporal_Without_ERMS,-,", > > > > > > test_1, > > > > > > array_length (test_1) > > > > > > } > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > > index 7a637ef7ca..3ca64bb853 100644 > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > > > > { > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > return OPTIMIZE (avx512_unaligned_erms); > > > > > > > > > > > > return OPTIMIZE (avx512_unaligned); > > > > > > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > > > > { > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > return OPTIMIZE (evex_unaligned_erms); > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > return OPTIMIZE (evex_unaligned); > > > > > > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) > > > > > > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > > > > > { > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > return OPTIMIZE (avx2_unaligned_erms_rtm); > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > return OPTIMIZE (avx2_unaligned_rtm); > > > > > > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) > > > > > > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > > > > > Prefer_No_VZEROUPPER, !)) > > > > > > { > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > return OPTIMIZE (avx2_unaligned_erms); > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > return OPTIMIZE (avx2_unaligned); > > > > > > } > > > > > > } > > > > > > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > > > > return OPTIMIZE (sse2_unaligned_erms); > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > Hmm? Why remove the ERMS check here? Shouldn't we want to select > > > > the `_erms` implemention if we want either non-temporal OR erms? > > > > > > You said NT store must go through ERMS. You can do > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > Set Prefer_Non_Temporal > > > > > > for Intel and AMD. > > > > What about machines that want ERMS but not NT? > > We can check CPUID to turn off Prefer_Non_Temporal. This seems overly complicated. We have a code path that goes to ERMS or NT. The natural check for that seems to be `ERMS || NT`. > > > > > > > > > > > > Everything else sounds fine though. > > > > > > > > > > > > > > > > return OPTIMIZE (sse2_unaligned); > > > > > > -- > > > > > > 2.34.1 > > > > > > > > > > > > > > > > > > > > > -- > > > > > H.J. > > > > > > > > > > > > -- > > > H.J. > > > > -- > H.J.
On Tue, Aug 13, 2024 at 11:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Tue, Aug 13, 2024 at 11:33 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Tue, Aug 13, 2024 at 8:24 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Tue, Aug 13, 2024 at 11:12 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > On Tue, Aug 13, 2024 at 8:05 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > On Tue, Aug 13, 2024 at 10:45 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > > > > > On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > > > > > The goal of this flag is to allow targets which don't prefer/have ERMS > > > > > > > to still access the non-temporal memset implementation. > > > > > > > --- > > > > > > > sysdeps/x86/cpu-tunables.c | 4 ++ > > > > > > > sysdeps/x86/dl-cacheinfo.h | 46 ++++++++++++++----- > > > > > > > ...cpu-features-preferred_feature_index_1.def | 1 + > > > > > > > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- > > > > > > > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- > > > > > > > 5 files changed, 58 insertions(+), 18 deletions(-) > > > > > > > > > > > > > > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c > > > > > > > index ccc6b64dc2..b01b703517 100644 > > > > > > > --- a/sysdeps/x86/cpu-tunables.c > > > > > > > +++ b/sysdeps/x86/cpu-tunables.c > > > > > > > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) > > > > > > > (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); > > > > > > > } > > > > > > > break; > > > > > > > + case 32: > > > > > > > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, > > > > > > > + Prefer_Non_Temporal_Without_ERMS, 32); > > > > > > > + break; > > > > > > > } > > > > > > > } > > > > > > > } > > > > > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > > > > > > > index a1c03b8903..600bd66a93 100644 > > > > > > > --- a/sysdeps/x86/dl-cacheinfo.h > > > > > > > +++ b/sysdeps/x86/dl-cacheinfo.h > > > > > > > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > > > > > > > rep_movsb_threshold = 2112; > > > > > > > > > > > > > > - /* Non-temporal stores are more performant on Intel and AMD hardware above > > > > > > > - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > > > > > - unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > > > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > > > > > - && (cpu_features->basic.kind == arch_kind_intel > > > > > > > - || cpu_features->basic.kind == arch_kind_amd)) > > > > > > > - memset_non_temporal_threshold = non_temporal_threshold; > > > > > > > - > > > > > > > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of > > > > > > > cases slower than the vectorized path (and for some alignments, > > > > > > > it is really slow, check BZ #30994). */ > > > > > > > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > > > > if (tunable_size != 0) > > > > > > > shared = tunable_size; > > > > > > > > > > > > > > + /* Non-temporal stores are more performant on Intel and AMD hardware above > > > > > > > + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ > > > > > > > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; > > > > > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) > > > > > > > + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) > > > > > > > + || cpu_features->basic.kind == arch_kind_intel > > > > > > > + || cpu_features->basic.kind == arch_kind_amd)) > > > > > > > > > > > > Please add Prefer_Non_Temporal instead and enable it for Intel and AMD. > > > > > > > > > > > > > + memset_non_temporal_threshold = non_temporal_threshold; > > > > > > > + > > > > > > > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); > > > > > > > if (tunable_size > minimum_non_temporal_threshold > > > > > > > && tunable_size <= maximum_non_temporal_threshold) > > > > > > > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > > > > > > > slightly better than ERMS. */ > > > > > > > rep_stosb_threshold = SIZE_MAX; > > > > > > > > > > > > > > + /* > > > > > > > + For memset, the non-temporal implementation is only accessed through the > > > > > > > + stosb code. ie: > > > > > > > + ``` > > > > > > > + if (size >= rep_stosb_thresh) > > > > > > > + { > > > > > > > + if (size >= non_temporal_thresh) > > > > > > > + { > > > > > > > + do_non_temporal (); > > > > > > > + } > > > > > > > + do_stosb (); > > > > > > > + } > > > > > > > + do_normal_vec_loop (); > > > > > > > + ``` > > > > > > > + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` > > > > > > > + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, > > > > > > > + `rep stosb` will never be used. > > > > > > > + */ > > > > > > > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > > > > > + memset_non_temporal_threshold, > > > > > > > + minimum_non_temporal_threshold, SIZE_MAX); > > > > > > > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > > > > > Please add another bit, Avoid_STOSB, and check it instead. > > > > > > > > > > > > > + rep_stosb_threshold > > > > > > > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); > > > > > > > > > > > > Please move and reuse the existing TUNABLE_GET > > > > > > (x86_memset_non_temporal_threshold, long int, NULL) > > > > > > call. > > Would prefer to leave as is. We need the final values to be equal > and it seems less bug prone to use the final value as opposed to > having a seperate bounds check here. > > > > > > > > > > > > > + > > > > > > > + > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, > > > > > > > minimum_non_temporal_threshold, > > > > > > > maximum_non_temporal_threshold); > > > > > > > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, > > > > > > > - memset_non_temporal_threshold, > > > > > > > - minimum_non_temporal_threshold, SIZE_MAX); > > > > > > > > > > > > Please leave it here. > > > > > > > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, > > > > > > > minimum_rep_movsb_threshold, SIZE_MAX); > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, > > > > > > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > > > index 61bbbc2e89..60a3a382ff 100644 > > > > > > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > > > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) > > > > > > > BIT (Prefer_FSRM) > > > > > > > BIT (Avoid_Short_Distance_REP_MOVSB) > > > > > > > BIT (Avoid_Non_Temporal_Memset) > > > > > > > +BIT (Prefer_Non_Temporal_Without_ERMS) > > > > > > > \ No newline at end of file > > > > > > > > > > > > Please add a newline. > > > > > > > > > > > > > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c > > > > > > > index 94307283d7..4495d795e4 100644 > > > > > > > --- a/sysdeps/x86/tst-hwcap-tunables.c > > > > > > > +++ b/sysdeps/x86/tst-hwcap-tunables.c > > > > > > > @@ -60,7 +60,8 @@ static const struct test_t > > > > > > > /* Disable everything. */ > > > > > > > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," > > > > > > > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", > > > > > > > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > > > > + "-Prefer_Non_Temporal_Without_ERMS", > > > > > > > test_1, > > > > > > > array_length (test_1) > > > > > > > }, > > > > > > > @@ -68,7 +69,8 @@ static const struct test_t > > > > > > > /* Same as before, but with some empty suboptions. */ > > > > > > > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," > > > > > > > - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", > > > > > > > + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > > > > + "-Prefer_Non_Temporal_Without_ERMS,-,", > > > > > > > test_1, > > > > > > > array_length (test_1) > > > > > > > } > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > > > index 7a637ef7ca..3ca64bb853 100644 > > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > > > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > > > > > { > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > return OPTIMIZE (avx512_unaligned_erms); > > > > > > > > > > > > > > return OPTIMIZE (avx512_unaligned); > > > > > > > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > > > > > > > { > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > > return OPTIMIZE (evex_unaligned_erms); > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > > > > return OPTIMIZE (evex_unaligned); > > > > > > > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) > > > > > > > > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > > > > > > { > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > > return OPTIMIZE (avx2_unaligned_erms_rtm); > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > > > > return OPTIMIZE (avx2_unaligned_rtm); > > > > > > > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) > > > > > > > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > Prefer_No_VZEROUPPER, !)) > > > > > > > { > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > + Prefer_Non_Temporal_Without_ERMS)) > > > > > > > return OPTIMIZE (avx2_unaligned_erms); > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > > > > return OPTIMIZE (avx2_unaligned); > > > > > > > } > > > > > > > } > > > > > > > > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) > > > > > > > return OPTIMIZE (sse2_unaligned_erms); > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > Hmm? Why remove the ERMS check here? Shouldn't we want to select > > > > > the `_erms` implemention if we want either non-temporal OR erms? > > > > > > > > You said NT store must go through ERMS. You can do > > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > Set Prefer_Non_Temporal > > > > > > > > for Intel and AMD. > > > > > > What about machines that want ERMS but not NT? > > > > We can check CPUID to turn off Prefer_Non_Temporal. > > This seems overly complicated. We have a code path that goes > to ERMS or NT. The natural check for that seems to be `ERMS || NT`. > > > > > > > > > > > > > > > > Everything else sounds fine though. > > > > > > > > > > > > > > > > > > > return OPTIMIZE (sse2_unaligned); > > > > > > > -- > > > > > > > 2.34.1 > > > > > > > > > > > > > > > > > > > > > > > > > -- > > > > > > H.J. > > > > > > > > > > > > > > > > -- > > > > H.J. > > > > > > > > -- > > H.J. Okay looking over what we have, my preference would be to scrap the whole `Prefer_Non_Temporal` and use the existing `Avoid_Non_Temporal_Memset` along with new `Avoid_STOSB`. And implement the following: 1) Default `Avoid_Non_Temporal_Memset = 1` except for AMD + Intel. 2) Continue using `Avoid_Non_Temporal_Memset` as is now (i.e Intel SKX). 3) Drop the explicit `Intel || AMD` check in `dl-cacheinfo.h` -- 1 & 2 & 3 should match current behavior. I will make them a seperate commit in the series. 4) Add `Avoid_STOSB`. 5) If `!Avoid_Non_Temporal_Memset || ERMS` -> `*_erms` memset impl 6) if `Avoid_STOSB` then `x86_stosb_thresh = x86_memset_non_temporal_thresh` Does this sound reasonable?
On Tue, Aug 13, 2024, 9:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Tue, Aug 13, 2024 at 11:42 PM Noah Goldstein <goldstein.w.n@gmail.com> > wrote: > > > > On Tue, Aug 13, 2024 at 11:33 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Tue, Aug 13, 2024 at 8:24 AM Noah Goldstein < > goldstein.w.n@gmail.com> wrote: > > > > > > > > On Tue, Aug 13, 2024 at 11:12 PM H.J. Lu <hjl.tools@gmail.com> > wrote: > > > > > > > > > > On Tue, Aug 13, 2024 at 8:05 AM Noah Goldstein < > goldstein.w.n@gmail.com> wrote: > > > > > > > > > > > > On Tue, Aug 13, 2024 at 10:45 PM H.J. Lu <hjl.tools@gmail.com> > wrote: > > > > > > > > > > > > > > On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein < > goldstein.w.n@gmail.com> wrote: > > > > > > > > > > > > > > > > The goal of this flag is to allow targets which don't > prefer/have ERMS > > > > > > > > to still access the non-temporal memset implementation. > > > > > > > > --- > > > > > > > > sysdeps/x86/cpu-tunables.c | 4 ++ > > > > > > > > sysdeps/x86/dl-cacheinfo.h | 46 > ++++++++++++++----- > > > > > > > > ...cpu-features-preferred_feature_index_1.def | 1 + > > > > > > > > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- > > > > > > > > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- > > > > > > > > 5 files changed, 58 insertions(+), 18 deletions(-) > > > > > > > > > > > > > > > > diff --git a/sysdeps/x86/cpu-tunables.c > b/sysdeps/x86/cpu-tunables.c > > > > > > > > index ccc6b64dc2..b01b703517 100644 > > > > > > > > --- a/sysdeps/x86/cpu-tunables.c > > > > > > > > +++ b/sysdeps/x86/cpu-tunables.c > > > > > > > > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) > (tunable_val_t *valp) > > > > > > > > (n, cpu_features, > Prefer_PMINUB_for_stringop, SSE2, 26); > > > > > > > > } > > > > > > > > break; > > > > > > > > + case 32: > > > > > > > > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, > > > > > > > > + > Prefer_Non_Temporal_Without_ERMS, 32); > > > > > > > > + break; > > > > > > > > } > > > > > > > > } > > > > > > > > } > > > > > > > > diff --git a/sysdeps/x86/dl-cacheinfo.h > b/sysdeps/x86/dl-cacheinfo.h > > > > > > > > index a1c03b8903..600bd66a93 100644 > > > > > > > > --- a/sysdeps/x86/dl-cacheinfo.h > > > > > > > > +++ b/sysdeps/x86/dl-cacheinfo.h > > > > > > > > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features > *cpu_features) > > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > > > > > > > > rep_movsb_threshold = 2112; > > > > > > > > > > > > > > > > - /* Non-temporal stores are more performant on Intel and > AMD hardware above > > > > > > > > - non_temporal_threshold. Enable this for both Intel and > AMD hardware. */ > > > > > > > > - unsigned long int memset_non_temporal_threshold = > SIZE_MAX; > > > > > > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, > Avoid_Non_Temporal_Memset) > > > > > > > > - && (cpu_features->basic.kind == arch_kind_intel > > > > > > > > - || cpu_features->basic.kind == arch_kind_amd)) > > > > > > > > - memset_non_temporal_threshold = non_temporal_threshold; > > > > > > > > - > > > > > > > > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is > in a lot of > > > > > > > > cases slower than the vectorized path (and for some > alignments, > > > > > > > > it is really slow, check BZ #30994). */ > > > > > > > > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct > cpu_features *cpu_features) > > > > > > > > if (tunable_size != 0) > > > > > > > > shared = tunable_size; > > > > > > > > > > > > > > > > + /* Non-temporal stores are more performant on Intel and > AMD hardware above > > > > > > > > + non_temporal_threshold. Enable this for both Intel and > AMD hardware. */ > > > > > > > > + unsigned long int memset_non_temporal_threshold = > SIZE_MAX; > > > > > > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, > Avoid_Non_Temporal_Memset) > > > > > > > > + && (CPU_FEATURES_ARCH_P (cpu_features, > Prefer_Non_Temporal_Without_ERMS) > > > > > > > > + || cpu_features->basic.kind == arch_kind_intel > > > > > > > > + || cpu_features->basic.kind == arch_kind_amd)) > > > > > > > > > > > > > > Please add Prefer_Non_Temporal instead and enable it for Intel > and AMD. > > > > > > > > > > > > > > > + memset_non_temporal_threshold = non_temporal_threshold; > > > > > > > > + > > > > > > > > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, > long int, NULL); > > > > > > > > if (tunable_size > minimum_non_temporal_threshold > > > > > > > > && tunable_size <= maximum_non_temporal_threshold) > > > > > > > > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct > cpu_features *cpu_features) > > > > > > > > slightly better than ERMS. */ > > > > > > > > rep_stosb_threshold = SIZE_MAX; > > > > > > > > > > > > > > > > + /* > > > > > > > > + For memset, the non-temporal implementation is only > accessed through the > > > > > > > > + stosb code. ie: > > > > > > > > + ``` > > > > > > > > + if (size >= rep_stosb_thresh) > > > > > > > > + { > > > > > > > > + if (size >= non_temporal_thresh) > > > > > > > > + { > > > > > > > > + do_non_temporal (); > > > > > > > > + } > > > > > > > > + do_stosb (); > > > > > > > > + } > > > > > > > > + do_normal_vec_loop (); > > > > > > > > + ``` > > > > > > > > + So if we prefer non-temporal, set `rep_stosb_thresh = > non_temporal_thresh` > > > > > > > > + to enable the implementation. If `rep_stosb_thresh = > non_temporal_thresh`, > > > > > > > > + `rep stosb` will never be used. > > > > > > > > + */ > > > > > > > > + TUNABLE_SET_WITH_BOUNDS > (x86_memset_non_temporal_threshold, > > > > > > > > + memset_non_temporal_threshold, > > > > > > > > + minimum_non_temporal_threshold, > SIZE_MAX); > > > > > > > > + if (CPU_FEATURES_ARCH_P (cpu_features, > Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > > > > > > > Please add another bit, Avoid_STOSB, and check it instead. > > > > > > > > > > > > > > > + rep_stosb_threshold > > > > > > > > + = TUNABLE_GET (x86_memset_non_temporal_threshold, > long int, NULL); > > > > > > > > > > > > > > Please move and reuse the existing TUNABLE_GET > > > > > > > (x86_memset_non_temporal_threshold, long int, NULL) > > > > > > > call. > > > > Would prefer to leave as is. We need the final values to be equal > > and it seems less bug prone to use the final value as opposed to > > having a seperate bounds check here. > > > > > > > > > > > > > > > + > > > > > > > > + > > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, > SIZE_MAX); > > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, > 0, SIZE_MAX); > > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, > non_temporal_threshold, > > > > > > > > minimum_non_temporal_threshold, > > > > > > > > maximum_non_temporal_threshold); > > > > > > > > - TUNABLE_SET_WITH_BOUNDS > (x86_memset_non_temporal_threshold, > > > > > > > > - memset_non_temporal_threshold, > > > > > > > > - minimum_non_temporal_threshold, > SIZE_MAX); > > > > > > > > > > > > > > Please leave it here. > > > > > > > > > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, > rep_movsb_threshold, > > > > > > > > minimum_rep_movsb_threshold, > SIZE_MAX); > > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, > rep_stosb_threshold, 1, > > > > > > > > diff --git > a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > > > > index 61bbbc2e89..60a3a382ff 100644 > > > > > > > > --- > a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > > > > +++ > b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def > > > > > > > > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) > > > > > > > > BIT (Prefer_FSRM) > > > > > > > > BIT (Avoid_Short_Distance_REP_MOVSB) > > > > > > > > BIT (Avoid_Non_Temporal_Memset) > > > > > > > > +BIT (Prefer_Non_Temporal_Without_ERMS) > > > > > > > > \ No newline at end of file > > > > > > > > > > > > > > Please add a newline. > > > > > > > > > > > > > > > diff --git a/sysdeps/x86/tst-hwcap-tunables.c > b/sysdeps/x86/tst-hwcap-tunables.c > > > > > > > > index 94307283d7..4495d795e4 100644 > > > > > > > > --- a/sysdeps/x86/tst-hwcap-tunables.c > > > > > > > > +++ b/sysdeps/x86/tst-hwcap-tunables.c > > > > > > > > @@ -60,7 +60,8 @@ static const struct test_t > > > > > > > > /* Disable everything. */ > > > > > > > > > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," > > > > > > > > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", > > > > > > > > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > > > > > + "-Prefer_Non_Temporal_Without_ERMS", > > > > > > > > test_1, > > > > > > > > array_length (test_1) > > > > > > > > }, > > > > > > > > @@ -68,7 +69,8 @@ static const struct test_t > > > > > > > > /* Same as before, but with some empty suboptions. */ > > > > > > > > > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," > > > > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," > > > > > > > > - > "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", > > > > > > > > + > "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," > > > > > > > > + "-Prefer_Non_Temporal_Without_ERMS,-,", > > > > > > > > test_1, > > > > > > > > array_length (test_1) > > > > > > > > } > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h > b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > > > > index 7a637ef7ca..3ca64bb853 100644 > > > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h > > > > > > > > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) > > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, > AVX512BW) > > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, > BMI2)) > > > > > > > > { > > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > > + > Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > return OPTIMIZE (avx512_unaligned_erms); > > > > > > > > > > > > > > > > return OPTIMIZE (avx512_unaligned); > > > > > > > > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) > > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, > AVX512BW) > > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, > BMI2)) > > > > > > > > { > > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > > + > Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > return OPTIMIZE (evex_unaligned_erms); > > > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > > > > > > > return OPTIMIZE (evex_unaligned); > > > > > > > > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) > > > > > > > > > > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > > > > > > > { > > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > > + > Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > return OPTIMIZE (avx2_unaligned_erms_rtm); > > > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > > > > > > > return OPTIMIZE (avx2_unaligned_rtm); > > > > > > > > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) > > > > > > > > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > > Prefer_No_VZEROUPPER, > !)) > > > > > > > > { > > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > > > > > > > > + > Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > return OPTIMIZE (avx2_unaligned_erms); > > > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > > > > > > > return OPTIMIZE (avx2_unaligned); > > > > > > > > } > > > > > > > > } > > > > > > > > > > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) > > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, > Prefer_Non_Temporal_Without_ERMS)) > > > > > > > > return OPTIMIZE (sse2_unaligned_erms); > > > > > > > > > > > > > > Please check only Prefer_Non_Temporal instead. > > > > > > > > > > > > > > > > > > > Hmm? Why remove the ERMS check here? Shouldn't we want to select > > > > > > the `_erms` implemention if we want either non-temporal OR erms? > > > > > > > > > > You said NT store must go through ERMS. You can do > > > > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > > > > > Set Prefer_Non_Temporal > > > > > > > > > > for Intel and AMD. > > > > > > > > What about machines that want ERMS but not NT? > > > > > > We can check CPUID to turn off Prefer_Non_Temporal. > > > > This seems overly complicated. We have a code path that goes > > to ERMS or NT. The natural check for that seems to be `ERMS || NT`. > > > > > > > > > > > > > > > > > > > > Everything else sounds fine though. > > > > > > > > > > > > > > > > > > > > > > return OPTIMIZE (sse2_unaligned); > > > > > > > > -- > > > > > > > > 2.34.1 > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -- > > > > > > > H.J. > > > > > > > > > > > > > > > > > > > > -- > > > > > H.J. > > > > > > > > > > > > -- > > > H.J. > > Okay looking over what we have, my preference would be to scrap the > whole `Prefer_Non_Temporal` and use the existing > `Avoid_Non_Temporal_Memset` along with new `Avoid_STOSB`. > And implement the following: > > 1) Default `Avoid_Non_Temporal_Memset = 1` except for AMD + Intel. > 2) Continue using `Avoid_Non_Temporal_Memset` as is now (i.e Intel SKX). > 3) Drop the explicit `Intel || AMD` check in `dl-cacheinfo.h` > -- 1 & 2 & 3 should match current behavior. I will make them a seperate > commit > in the series. > 4) Add `Avoid_STOSB`. > 5) If `!Avoid_Non_Temporal_Memset || ERMS` -> `*_erms` memset impl > 6) if `Avoid_STOSB` then `x86_stosb_thresh = > x86_memset_non_temporal_thresh` > > Does this sound reasonable? > It sounds reasonable as long as we check only one bit when making a decision. H.J.
On Wed, Aug 14, 2024 at 1:29 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Tue, Aug 13, 2024, 9:39 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Tue, Aug 13, 2024 at 11:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> > >> > On Tue, Aug 13, 2024 at 11:33 PM H.J. Lu <hjl.tools@gmail.com> wrote: >> > > >> > > On Tue, Aug 13, 2024 at 8:24 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> > > > >> > > > On Tue, Aug 13, 2024 at 11:12 PM H.J. Lu <hjl.tools@gmail.com> wrote: >> > > > > >> > > > > On Tue, Aug 13, 2024 at 8:05 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> > > > > > >> > > > > > On Tue, Aug 13, 2024 at 10:45 PM H.J. Lu <hjl.tools@gmail.com> wrote: >> > > > > > > >> > > > > > > On Tue, Aug 13, 2024 at 6:55 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> > > > > > > > >> > > > > > > > The goal of this flag is to allow targets which don't prefer/have ERMS >> > > > > > > > to still access the non-temporal memset implementation. >> > > > > > > > --- >> > > > > > > > sysdeps/x86/cpu-tunables.c | 4 ++ >> > > > > > > > sysdeps/x86/dl-cacheinfo.h | 46 ++++++++++++++----- >> > > > > > > > ...cpu-features-preferred_feature_index_1.def | 1 + >> > > > > > > > sysdeps/x86/tst-hwcap-tunables.c | 6 ++- >> > > > > > > > sysdeps/x86_64/multiarch/ifunc-memset.h | 19 ++++++-- >> > > > > > > > 5 files changed, 58 insertions(+), 18 deletions(-) >> > > > > > > > >> > > > > > > > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c >> > > > > > > > index ccc6b64dc2..b01b703517 100644 >> > > > > > > > --- a/sysdeps/x86/cpu-tunables.c >> > > > > > > > +++ b/sysdeps/x86/cpu-tunables.c >> > > > > > > > @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) >> > > > > > > > (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); >> > > > > > > > } >> > > > > > > > break; >> > > > > > > > + case 32: >> > > > > > > > + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, >> > > > > > > > + Prefer_Non_Temporal_Without_ERMS, 32); >> > > > > > > > + break; >> > > > > > > > } >> > > > > > > > } >> > > > > > > > } >> > > > > > > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h >> > > > > > > > index a1c03b8903..600bd66a93 100644 >> > > > > > > > --- a/sysdeps/x86/dl-cacheinfo.h >> > > > > > > > +++ b/sysdeps/x86/dl-cacheinfo.h >> > > > > > > > @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) >> > > > > > > > rep_movsb_threshold = 2112; >> > > > > > > > >> > > > > > > > - /* Non-temporal stores are more performant on Intel and AMD hardware above >> > > > > > > > - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ >> > > > > > > > - unsigned long int memset_non_temporal_threshold = SIZE_MAX; >> > > > > > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) >> > > > > > > > - && (cpu_features->basic.kind == arch_kind_intel >> > > > > > > > - || cpu_features->basic.kind == arch_kind_amd)) >> > > > > > > > - memset_non_temporal_threshold = non_temporal_threshold; >> > > > > > > > - >> > > > > > > > /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of >> > > > > > > > cases slower than the vectorized path (and for some alignments, >> > > > > > > > it is really slow, check BZ #30994). */ >> > > > > > > > @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> > > > > > > > if (tunable_size != 0) >> > > > > > > > shared = tunable_size; >> > > > > > > > >> > > > > > > > + /* Non-temporal stores are more performant on Intel and AMD hardware above >> > > > > > > > + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ >> > > > > > > > + unsigned long int memset_non_temporal_threshold = SIZE_MAX; >> > > > > > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) >> > > > > > > > + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) >> > > > > > > > + || cpu_features->basic.kind == arch_kind_intel >> > > > > > > > + || cpu_features->basic.kind == arch_kind_amd)) >> > > > > > > >> > > > > > > Please add Prefer_Non_Temporal instead and enable it for Intel and AMD. >> > > > > > > >> > > > > > > > + memset_non_temporal_threshold = non_temporal_threshold; >> > > > > > > > + >> > > > > > > > tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); >> > > > > > > > if (tunable_size > minimum_non_temporal_threshold >> > > > > > > > && tunable_size <= maximum_non_temporal_threshold) >> > > > > > > > @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) >> > > > > > > > slightly better than ERMS. */ >> > > > > > > > rep_stosb_threshold = SIZE_MAX; >> > > > > > > > >> > > > > > > > + /* >> > > > > > > > + For memset, the non-temporal implementation is only accessed through the >> > > > > > > > + stosb code. ie: >> > > > > > > > + ``` >> > > > > > > > + if (size >= rep_stosb_thresh) >> > > > > > > > + { >> > > > > > > > + if (size >= non_temporal_thresh) >> > > > > > > > + { >> > > > > > > > + do_non_temporal (); >> > > > > > > > + } >> > > > > > > > + do_stosb (); >> > > > > > > > + } >> > > > > > > > + do_normal_vec_loop (); >> > > > > > > > + ``` >> > > > > > > > + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` >> > > > > > > > + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, >> > > > > > > > + `rep stosb` will never be used. >> > > > > > > > + */ >> > > > > > > > + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, >> > > > > > > > + memset_non_temporal_threshold, >> > > > > > > > + minimum_non_temporal_threshold, SIZE_MAX); >> > > > > > > > + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) >> > > > > > > >> > > > > > > Please add another bit, Avoid_STOSB, and check it instead. >> > > > > > > >> > > > > > > > + rep_stosb_threshold >> > > > > > > > + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); >> > > > > > > >> > > > > > > Please move and reuse the existing TUNABLE_GET >> > > > > > > (x86_memset_non_temporal_threshold, long int, NULL) >> > > > > > > call. >> > >> > Would prefer to leave as is. We need the final values to be equal >> > and it seems less bug prone to use the final value as opposed to >> > having a seperate bounds check here. >> > > > > > > >> > > > > > > > + >> > > > > > > > + >> > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); >> > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); >> > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, >> > > > > > > > minimum_non_temporal_threshold, >> > > > > > > > maximum_non_temporal_threshold); >> > > > > > > > - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, >> > > > > > > > - memset_non_temporal_threshold, >> > > > > > > > - minimum_non_temporal_threshold, SIZE_MAX); >> > > > > > > >> > > > > > > Please leave it here. >> > > > > > > >> > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, >> > > > > > > > minimum_rep_movsb_threshold, SIZE_MAX); >> > > > > > > > TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, >> > > > > > > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def >> > > > > > > > index 61bbbc2e89..60a3a382ff 100644 >> > > > > > > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def >> > > > > > > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def >> > > > > > > > @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) >> > > > > > > > BIT (Prefer_FSRM) >> > > > > > > > BIT (Avoid_Short_Distance_REP_MOVSB) >> > > > > > > > BIT (Avoid_Non_Temporal_Memset) >> > > > > > > > +BIT (Prefer_Non_Temporal_Without_ERMS) >> > > > > > > > \ No newline at end of file >> > > > > > > >> > > > > > > Please add a newline. >> > > > > > > >> > > > > > > > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c >> > > > > > > > index 94307283d7..4495d795e4 100644 >> > > > > > > > --- a/sysdeps/x86/tst-hwcap-tunables.c >> > > > > > > > +++ b/sysdeps/x86/tst-hwcap-tunables.c >> > > > > > > > @@ -60,7 +60,8 @@ static const struct test_t >> > > > > > > > /* Disable everything. */ >> > > > > > > > "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," >> > > > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," >> > > > > > > > - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", >> > > > > > > > + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," >> > > > > > > > + "-Prefer_Non_Temporal_Without_ERMS", >> > > > > > > > test_1, >> > > > > > > > array_length (test_1) >> > > > > > > > }, >> > > > > > > > @@ -68,7 +69,8 @@ static const struct test_t >> > > > > > > > /* Same as before, but with some empty suboptions. */ >> > > > > > > > ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," >> > > > > > > > "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," >> > > > > > > > - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", >> > > > > > > > + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," >> > > > > > > > + "-Prefer_Non_Temporal_Without_ERMS,-,", >> > > > > > > > test_1, >> > > > > > > > array_length (test_1) >> > > > > > > > } >> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h >> > > > > > > > index 7a637ef7ca..3ca64bb853 100644 >> > > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h >> > > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h >> > > > > > > > @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) >> > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) >> > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) >> > > > > > > > { >> > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) >> > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) >> > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, >> > > > > > > > + Prefer_Non_Temporal_Without_ERMS)) >> > > > > > > >> > > > > > > Please check only Prefer_Non_Temporal instead. >> > > > > > > >> > > > > > > > return OPTIMIZE (avx512_unaligned_erms); >> > > > > > > > >> > > > > > > > return OPTIMIZE (avx512_unaligned); >> > > > > > > > @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) >> > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) >> > > > > > > > && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) >> > > > > > > > { >> > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) >> > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) >> > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, >> > > > > > > > + Prefer_Non_Temporal_Without_ERMS)) >> > > > > > > > return OPTIMIZE (evex_unaligned_erms); >> > > > > > > >> > > > > > > Please check only Prefer_Non_Temporal instead. >> > > > > > > >> > > > > > > > >> > > > > > > > return OPTIMIZE (evex_unaligned); >> > > > > > > > @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) >> > > > > > > > >> > > > > > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) >> > > > > > > > { >> > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) >> > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) >> > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, >> > > > > > > > + Prefer_Non_Temporal_Without_ERMS)) >> > > > > > > > return OPTIMIZE (avx2_unaligned_erms_rtm); >> > > > > > > >> > > > > > > Please check only Prefer_Non_Temporal instead. >> > > > > > > >> > > > > > > > >> > > > > > > > return OPTIMIZE (avx2_unaligned_rtm); >> > > > > > > > @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) >> > > > > > > > if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, >> > > > > > > > Prefer_No_VZEROUPPER, !)) >> > > > > > > > { >> > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) >> > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) >> > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, >> > > > > > > > + Prefer_Non_Temporal_Without_ERMS)) >> > > > > > > > return OPTIMIZE (avx2_unaligned_erms); >> > > > > > > >> > > > > > > Please check only Prefer_Non_Temporal instead. >> > > > > > > >> > > > > > > > >> > > > > > > > return OPTIMIZE (avx2_unaligned); >> > > > > > > > } >> > > > > > > > } >> > > > > > > > >> > > > > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) >> > > > > > > > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) >> > > > > > > > + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) >> > > > > > > > return OPTIMIZE (sse2_unaligned_erms); >> > > > > > > >> > > > > > > Please check only Prefer_Non_Temporal instead. >> > > > > > > >> > > > > > >> > > > > > Hmm? Why remove the ERMS check here? Shouldn't we want to select >> > > > > > the `_erms` implemention if we want either non-temporal OR erms? >> > > > > >> > > > > You said NT store must go through ERMS. You can do >> > > > > >> > > > > if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) >> > > > > Set Prefer_Non_Temporal >> > > > > >> > > > > for Intel and AMD. >> > > > >> > > > What about machines that want ERMS but not NT? >> > > >> > > We can check CPUID to turn off Prefer_Non_Temporal. >> > >> > This seems overly complicated. We have a code path that goes >> > to ERMS or NT. The natural check for that seems to be `ERMS || NT`. >> > > >> > > > > >> > > > > > >> > > > > > Everything else sounds fine though. >> > > > > > >> > > > > > > > >> > > > > > > > return OPTIMIZE (sse2_unaligned); >> > > > > > > > -- >> > > > > > > > 2.34.1 >> > > > > > > > >> > > > > > > >> > > > > > > >> > > > > > > -- >> > > > > > > H.J. >> > > > > >> > > > > >> > > > > >> > > > > -- >> > > > > H.J. >> > > >> > > >> > > >> > > -- >> > > H.J. >> >> Okay looking over what we have, my preference would be to scrap the >> whole `Prefer_Non_Temporal` and use the existing >> `Avoid_Non_Temporal_Memset` along with new `Avoid_STOSB`. >> And implement the following: >> >> 1) Default `Avoid_Non_Temporal_Memset = 1` except for AMD + Intel. >> 2) Continue using `Avoid_Non_Temporal_Memset` as is now (i.e Intel SKX). >> 3) Drop the explicit `Intel || AMD` check in `dl-cacheinfo.h` >> -- 1 & 2 & 3 should match current behavior. I will make them a seperate commit >> in the series. >> 4) Add `Avoid_STOSB`. >> 5) If `!Avoid_Non_Temporal_Memset || ERMS` -> `*_erms` memset impl >> 6) if `Avoid_STOSB` then `x86_stosb_thresh = x86_memset_non_temporal_thresh` >> >> Does this sound reasonable? > > > It sounds reasonable as long as we > check only one bit when making a > decision. In the ifunc? What bit do you propose? I really don't like the idea of setting the ERMS / Avoid_Non_Temporal_Memset bit to indicate our ifunc preference here. Whether we have ERMS or not is irrelevant if we are going to use just the non-temporal impl and whether we are avoiding non-temporal memset is irrelevant if we have ERMS. > > H.J. > >
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c index ccc6b64dc2..b01b703517 100644 --- a/sysdeps/x86/cpu-tunables.c +++ b/sysdeps/x86/cpu-tunables.c @@ -255,6 +255,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) (n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26); } break; + case 32: + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Prefer_Non_Temporal_Without_ERMS, 32); + break; } } } diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index a1c03b8903..600bd66a93 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) rep_movsb_threshold = 2112; - /* Non-temporal stores are more performant on Intel and AMD hardware above - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ - unsigned long int memset_non_temporal_threshold = SIZE_MAX; - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) - && (cpu_features->basic.kind == arch_kind_intel - || cpu_features->basic.kind == arch_kind_amd)) - memset_non_temporal_threshold = non_temporal_threshold; - /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of cases slower than the vectorized path (and for some alignments, it is really slow, check BZ #30994). */ @@ -1017,6 +1009,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (tunable_size != 0) shared = tunable_size; + /* Non-temporal stores are more performant on Intel and AMD hardware above + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) + && (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS) + || cpu_features->basic.kind == arch_kind_intel + || cpu_features->basic.kind == arch_kind_amd)) + memset_non_temporal_threshold = non_temporal_threshold; + tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); if (tunable_size > minimum_non_temporal_threshold && tunable_size <= maximum_non_temporal_threshold) @@ -1042,14 +1043,37 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) slightly better than ERMS. */ rep_stosb_threshold = SIZE_MAX; + /* + For memset, the non-temporal implementation is only accessed through the + stosb code. ie: + ``` + if (size >= rep_stosb_thresh) + { + if (size >= non_temporal_thresh) + { + do_non_temporal (); + } + do_stosb (); + } + do_normal_vec_loop (); + ``` + So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` + to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, + `rep stosb` will never be used. + */ + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, + memset_non_temporal_threshold, + minimum_non_temporal_threshold, SIZE_MAX); + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) + rep_stosb_threshold + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); + + TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, minimum_non_temporal_threshold, maximum_non_temporal_threshold); - TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, - memset_non_temporal_threshold, - minimum_non_temporal_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, minimum_rep_movsb_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def index 61bbbc2e89..60a3a382ff 100644 --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def @@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) BIT (Prefer_FSRM) BIT (Avoid_Short_Distance_REP_MOVSB) BIT (Avoid_Non_Temporal_Memset) +BIT (Prefer_Non_Temporal_Without_ERMS) \ No newline at end of file diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c index 94307283d7..4495d795e4 100644 --- a/sysdeps/x86/tst-hwcap-tunables.c +++ b/sysdeps/x86/tst-hwcap-tunables.c @@ -60,7 +60,8 @@ static const struct test_t /* Disable everything. */ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," - "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," + "-Prefer_Non_Temporal_Without_ERMS", test_1, array_length (test_1) }, @@ -68,7 +69,8 @@ static const struct test_t /* Same as before, but with some empty suboptions. */ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," - "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," + "-Prefer_Non_Temporal_Without_ERMS,-,", test_1, array_length (test_1) } diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h index 7a637ef7ca..3ca64bb853 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h @@ -61,7 +61,9 @@ IFUNC_SELECTOR (void) && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) + || CPU_FEATURES_ARCH_P (cpu_features, + Prefer_Non_Temporal_Without_ERMS)) return OPTIMIZE (avx512_unaligned_erms); return OPTIMIZE (avx512_unaligned); @@ -76,7 +78,9 @@ IFUNC_SELECTOR (void) && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) + || CPU_FEATURES_ARCH_P (cpu_features, + Prefer_Non_Temporal_Without_ERMS)) return OPTIMIZE (evex_unaligned_erms); return OPTIMIZE (evex_unaligned); @@ -84,7 +88,9 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) + || CPU_FEATURES_ARCH_P (cpu_features, + Prefer_Non_Temporal_Without_ERMS)) return OPTIMIZE (avx2_unaligned_erms_rtm); return OPTIMIZE (avx2_unaligned_rtm); @@ -93,14 +99,17 @@ IFUNC_SELECTOR (void) if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) + || CPU_FEATURES_ARCH_P (cpu_features, + Prefer_Non_Temporal_Without_ERMS)) return OPTIMIZE (avx2_unaligned_erms); return OPTIMIZE (avx2_unaligned); } } - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_Non_Temporal_Without_ERMS)) return OPTIMIZE (sse2_unaligned_erms); return OPTIMIZE (sse2_unaligned);