
[v1] x86: Fix incorrect tunable logic for memmove-vec-unaligned-erms.

Message ID 20220614175737.2616508-1-goldstein.w.n@gmail.com
State New
Series [v1] x86: Fix incorrect tunable logic for memmove-vec-unaligned-erms.

Commit Message

Noah Goldstein June 14, 2022, 5:57 p.m. UTC
Move the setting of `rep_movsb_stop_threshold` to after the tunables
have been collected so that `rep_movsb_stop_threshold` (which is used
to redirect control flow to the non_temporal case) will use any user
value for `non_temporal_threshold` (from
glibc.cpu.x86_non_temporal_threshold).

As well, use the proper bound in the non_temporal case in
'memmove-vec-unaligned-erms' when entering from the size being above
`rep_movsb_stop_threshold`.
---
 sysdeps/x86/dl-cacheinfo.h                    | 24 +++++++++----------
 .../multiarch/memmove-vec-unaligned-erms.S    |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)
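
The bug is purely one of ordering in dl_init_cacheinfo:
rep_movsb_stop_threshold is derived from non_temporal_threshold, but it
was computed before the tunables were read, so a user-supplied
glibc.cpu.x86_non_temporal_threshold never reached it.  A minimal
standalone C sketch of the before/after behavior (the helper names and
values here are made up for illustration, not glibc's):

    #include <stdio.h>

    /* Hypothetical stand-ins for the cache detection and tunables
       machinery in sysdeps/x86/dl-cacheinfo.h.  */
    static unsigned long
    default_nt_threshold (void)
    {
      return 0x600000;	/* Pretend the computed default is 6 MiB.  */
    }

    static unsigned long
    read_nt_tunable (unsigned long dflt)
    {
      /* Pretend the user set glibc.cpu.x86_non_temporal_threshold to
         1 MiB; a real implementation would fall back to DFLT.  */
      (void) dflt;
      return 0x100000;
    }

    int
    main (void)
    {
      unsigned long non_temporal_threshold = default_nt_threshold ();

      /* Buggy order: derived before the tunable is read, so a user
         override of non_temporal_threshold never reaches it.  */
      unsigned long stop_buggy = non_temporal_threshold;

      non_temporal_threshold = read_nt_tunable (non_temporal_threshold);

      /* Fixed order: derived after the tunable is read.  */
      unsigned long stop_fixed = non_temporal_threshold;

      printf ("buggy rep_movsb_stop_threshold: %#lx\n", stop_buggy);
      printf ("fixed rep_movsb_stop_threshold: %#lx\n", stop_fixed);
      return 0;
    }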

Comments

H.J. Lu June 14, 2022, 7:41 p.m. UTC | #1
On Tue, Jun 14, 2022 at 10:57 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Move the setting of `rep_movsb_stop_threshold` to after the tunables
> have been collected so that `rep_movsb_stop_threshold` (which is used
> to redirect control flow to the non_temporal case) will use any user
> value for `non_temporal_threshold` (from
> glibc.cpu.x86_non_temporal_threshold).
>
> As well, use the proper bound in the non_temporal case in
> 'memmove-vec-unaligned-erms' when entering from the size being above
> `rep_movsb_stop_threshold`.
> ---
>  sysdeps/x86/dl-cacheinfo.h                    | 24 +++++++++----------
>  .../multiarch/memmove-vec-unaligned-erms.S    |  2 +-
>  2 files changed, 13 insertions(+), 13 deletions(-)
>
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index f64a2fb0ba..cc3b840f9c 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
>      rep_movsb_threshold = 2112;
>
> -  unsigned long int rep_movsb_stop_threshold;
> -  /* ERMS feature is implemented from AMD Zen3 architecture and it is
> -     performing poorly for data above L2 cache size. Henceforth, adding
> -     an upper bound threshold parameter to limit the usage of Enhanced
> -     REP MOVSB operations and setting its value to L2 cache size.  */
> -  if (cpu_features->basic.kind == arch_kind_amd)
> -    rep_movsb_stop_threshold = core;
> -  /* Setting the upper bound of ERMS to the computed value of
> -     non-temporal threshold for architectures other than AMD.  */
> -  else
> -    rep_movsb_stop_threshold = non_temporal_threshold;
> -
>    /* The default threshold to use Enhanced REP STOSB.  */
>    unsigned long int rep_stosb_threshold = 2048;
>
> @@ -951,6 +939,18 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>                            SIZE_MAX);
>  #endif
>
> +  unsigned long int rep_movsb_stop_threshold;
> +  /* ERMS feature is implemented from AMD Zen3 architecture and it is
> +     performing poorly for data above L2 cache size. Henceforth, adding
> +     an upper bound threshold parameter to limit the usage of Enhanced
> +     REP MOVSB operations and setting its value to L2 cache size.  */
> +  if (cpu_features->basic.kind == arch_kind_amd)
> +    rep_movsb_stop_threshold = core;
> +  /* Setting the upper bound of ERMS to the computed value of
> +     non-temporal threshold for architectures other than AMD.  */
> +  else
> +    rep_movsb_stop_threshold = non_temporal_threshold;
> +
>    cpu_features->data_cache_size = data;
>    cpu_features->shared_cache_size = shared;
>    cpu_features->non_temporal_threshold = non_temporal_threshold;

This is OK.

> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index af51177d5d..6d93b7c690 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -724,7 +724,7 @@ L(skip_short_movsb_check):
>         .p2align 4,, 10
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
>  L(large_memcpy_2x_check):
> -       cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
> +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
>         jb      L(more_8x_vec_check)
>  L(large_memcpy_2x):
>         /* To reach this point it is impossible for dst > src and
> --
> 2.34.1
>

This should be in a separate patch.  A few lines below, there are:

        movq    %rdx, %r10
        shrq    $LOG_4X_MEMCPY_THRESH, %r10
        cmp     __x86_shared_non_temporal_threshold(%rip), %r10
        jae     L(large_memcpy_4x)

__x86_shared_non_temporal_threshold is reused for a different
threshold.
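
Rendered as C, the concern is that one exported symbol now gates two
different decisions at two different scales.  A hedged sketch follows;
the constant's value and the enum framing are assumptions here, since
in glibc these are an assembler macro and branch labels:

    #include <stddef.h>

    /* Stand-in for glibc's exported tunable variable.  */
    static unsigned long __x86_shared_non_temporal_threshold = 0x100000;
    #define LOG_4X_MEMCPY_THRESH 4	/* Assumed value for the sketch.  */

    enum strategy { MORE_8X_VEC, LARGE_MEMCPY_2X, LARGE_MEMCPY_4X };

    static enum strategy
    pick_strategy (size_t size)
    {
      /* L(large_memcpy_2x_check): size itself vs. the symbol.  */
      if (size < __x86_shared_non_temporal_threshold)
        return MORE_8X_VEC;
      /* The later check compares size >> LOG_4X_MEMCPY_THRESH against
         the *same* symbol, i.e. it encodes a second, implicit threshold
         of __x86_shared_non_temporal_threshold << LOG_4X_MEMCPY_THRESH.  */
      if ((size >> LOG_4X_MEMCPY_THRESH)
          >= __x86_shared_non_temporal_threshold)
        return LARGE_MEMCPY_4X;
      return LARGE_MEMCPY_2X;
    }

    int
    main (void)
    {
      /* With a 1 MiB threshold and shift of 4, sizes in [1 MiB, 16 MiB)
         take the 2x path and sizes >= 16 MiB the 4x path.  */
      return pick_strategy (1UL << 21) == LARGE_MEMCPY_2X ? 0 : 1;
    }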

Patch

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index f64a2fb0ba..cc3b840f9c 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -898,18 +898,6 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
-  unsigned long int rep_movsb_stop_threshold;
-  /* ERMS feature is implemented from AMD Zen3 architecture and it is
-     performing poorly for data above L2 cache size. Henceforth, adding
-     an upper bound threshold parameter to limit the usage of Enhanced
-     REP MOVSB operations and setting its value to L2 cache size.  */
-  if (cpu_features->basic.kind == arch_kind_amd)
-    rep_movsb_stop_threshold = core;
-  /* Setting the upper bound of ERMS to the computed value of
-     non-temporal threshold for architectures other than AMD.  */
-  else
-    rep_movsb_stop_threshold = non_temporal_threshold;
-
   /* The default threshold to use Enhanced REP STOSB.  */
   unsigned long int rep_stosb_threshold = 2048;
 
@@ -951,6 +939,18 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
 			   SIZE_MAX);
 #endif
 
+  unsigned long int rep_movsb_stop_threshold;
+  /* ERMS feature is implemented from AMD Zen3 architecture and it is
+     performing poorly for data above L2 cache size. Henceforth, adding
+     an upper bound threshold parameter to limit the usage of Enhanced
+     REP MOVSB operations and setting its value to L2 cache size.  */
+  if (cpu_features->basic.kind == arch_kind_amd)
+    rep_movsb_stop_threshold = core;
+  /* Setting the upper bound of ERMS to the computed value of
+     non-temporal threshold for architectures other than AMD.  */
+  else
+    rep_movsb_stop_threshold = non_temporal_threshold;
+
   cpu_features->data_cache_size = data;
   cpu_features->shared_cache_size = shared;
   cpu_features->non_temporal_threshold = non_temporal_threshold;
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index af51177d5d..6d93b7c690 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -724,7 +724,7 @@  L(skip_short_movsb_check):
 	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 L(large_memcpy_2x_check):
-	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	jb	L(more_8x_vec_check)
 L(large_memcpy_2x):
 	/* To reach this point it is impossible for dst > src and