Message ID | 20220614175737.2616508-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1] x86: Fix incorrect tunable logic for memmove-vec-unaligned-erms. | expand |
On Tue, Jun 14, 2022 at 10:57 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Move the setting of `rep_movsb_stop_threshold` to after the tunables > have been collected so that the `rep_movsb_stop_threshold` (which > is used to redirect control flow to the non_temporal case) will > use any user value for `non_temporal_threshold` (from > glibc.cpu.x86_non_temporal_threshold) > > As well, use the proper bound in non_temporal case in > 'memmove-vec-unaligned-erms' when entering from size being above > `rep_movsb_stop_threshold`. > --- > sysdeps/x86/dl-cacheinfo.h | 24 +++++++++---------- > .../multiarch/memmove-vec-unaligned-erms.S | 2 +- > 2 files changed, 13 insertions(+), 13 deletions(-) > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h > index f64a2fb0ba..cc3b840f9c 100644 > --- a/sysdeps/x86/dl-cacheinfo.h > +++ b/sysdeps/x86/dl-cacheinfo.h > @@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) > rep_movsb_threshold = 2112; > > - unsigned long int rep_movsb_stop_threshold; > - /* ERMS feature is implemented from AMD Zen3 architecture and it is > - performing poorly for data above L2 cache size. Henceforth, adding > - an upper bound threshold parameter to limit the usage of Enhanced > - REP MOVSB operations and setting its value to L2 cache size. */ > - if (cpu_features->basic.kind == arch_kind_amd) > - rep_movsb_stop_threshold = core; > - /* Setting the upper bound of ERMS to the computed value of > - non-temporal threshold for architectures other than AMD. */ > - else > - rep_movsb_stop_threshold = non_temporal_threshold; > - > /* The default threshold to use Enhanced REP STOSB. 
*/ > unsigned long int rep_stosb_threshold = 2048; > > @@ -951,6 +939,18 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) > SIZE_MAX); > #endif > > + unsigned long int rep_movsb_stop_threshold; > + /* ERMS feature is implemented from AMD Zen3 architecture and it is > + performing poorly for data above L2 cache size. Henceforth, adding > + an upper bound threshold parameter to limit the usage of Enhanced > + REP MOVSB operations and setting its value to L2 cache size. */ > + if (cpu_features->basic.kind == arch_kind_amd) > + rep_movsb_stop_threshold = core; > + /* Setting the upper bound of ERMS to the computed value of > + non-temporal threshold for architectures other than AMD. */ > + else > + rep_movsb_stop_threshold = non_temporal_threshold; > + > cpu_features->data_cache_size = data; > cpu_features->shared_cache_size = shared; > cpu_features->non_temporal_threshold = non_temporal_threshold; This is OK. > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > index af51177d5d..6d93b7c690 100644 > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > @@ -724,7 +724,7 @@ L(skip_short_movsb_check): > .p2align 4,, 10 > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > L(large_memcpy_2x_check): > - cmp __x86_rep_movsb_threshold(%rip), %RDX_LP > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > jb L(more_8x_vec_check) > L(large_memcpy_2x): > /* To reach this point it is impossible for dst > src and > -- > 2.34.1 > This should be in a separate patch. A few lines below, there are movq %rdx, %r10 shrq $LOG_4X_MEMCPY_THRESH, %r10 cmp __x86_shared_non_temporal_threshold(%rip), %r10 jae L(large_memcpy_4x) __x86_shared_non_temporal_threshold is reused for a different threshold.
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index f64a2fb0ba..cc3b840f9c 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) rep_movsb_threshold = 2112; - unsigned long int rep_movsb_stop_threshold; - /* ERMS feature is implemented from AMD Zen3 architecture and it is - performing poorly for data above L2 cache size. Henceforth, adding - an upper bound threshold parameter to limit the usage of Enhanced - REP MOVSB operations and setting its value to L2 cache size. */ - if (cpu_features->basic.kind == arch_kind_amd) - rep_movsb_stop_threshold = core; - /* Setting the upper bound of ERMS to the computed value of - non-temporal threshold for architectures other than AMD. */ - else - rep_movsb_stop_threshold = non_temporal_threshold; - /* The default threshold to use Enhanced REP STOSB. */ unsigned long int rep_stosb_threshold = 2048; @@ -951,6 +939,18 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) SIZE_MAX); #endif + unsigned long int rep_movsb_stop_threshold; + /* ERMS feature is implemented from AMD Zen3 architecture and it is + performing poorly for data above L2 cache size. Henceforth, adding + an upper bound threshold parameter to limit the usage of Enhanced + REP MOVSB operations and setting its value to L2 cache size. */ + if (cpu_features->basic.kind == arch_kind_amd) + rep_movsb_stop_threshold = core; + /* Setting the upper bound of ERMS to the computed value of + non-temporal threshold for architectures other than AMD. 
*/ + else + rep_movsb_stop_threshold = non_temporal_threshold; + cpu_features->data_cache_size = data; cpu_features->shared_cache_size = shared; cpu_features->non_temporal_threshold = non_temporal_threshold; diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index af51177d5d..6d93b7c690 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -724,7 +724,7 @@ L(skip_short_movsb_check): .p2align 4,, 10 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) L(large_memcpy_2x_check): - cmp __x86_rep_movsb_threshold(%rip), %RDX_LP + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP jb L(more_8x_vec_check) L(large_memcpy_2x): /* To reach this point it is impossible for dst > src and