diff mbox series

[v1,1/3] x86: Fix misordered logic for setting `rep_movsb_stop_threshold`

Message ID 20220615002533.1741934-1-goldstein.w.n@gmail.com
State New
Headers show
Series [v1,1/3] x86: Fix misordered logic for setting `rep_movsb_stop_threshold` | expand

Commit Message

Noah Goldstein June 15, 2022, 12:25 a.m. UTC
Move the setting of `rep_movsb_stop_threshold` to after the tunables
have been collected, so that `rep_movsb_stop_threshold` (which is
used to redirect control flow to the non-temporal case) picks up any
user-supplied value for `non_temporal_threshold` (set using the
glibc.cpu.x86_non_temporal_threshold tunable).
---
 sysdeps/x86/dl-cacheinfo.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

Comments

H.J. Lu June 15, 2022, 1:02 a.m. UTC | #1
On Tue, Jun 14, 2022 at 5:25 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Move the setting of `rep_movsb_stop_threshold` to after the tunables
> have been collected so that the `rep_movsb_stop_threshold` (which
> is used to redirect control flow to the non_temporal case) will
> use any user value for `non_temporal_threshold` (set using
> glibc.cpu.x86_non_temporal_threshold)
> ---
>  sysdeps/x86/dl-cacheinfo.h | 24 ++++++++++++------------
>  1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index f64a2fb0ba..cc3b840f9c 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
>      rep_movsb_threshold = 2112;
>
> -  unsigned long int rep_movsb_stop_threshold;
> -  /* ERMS feature is implemented from AMD Zen3 architecture and it is
> -     performing poorly for data above L2 cache size. Henceforth, adding
> -     an upper bound threshold parameter to limit the usage of Enhanced
> -     REP MOVSB operations and setting its value to L2 cache size.  */
> -  if (cpu_features->basic.kind == arch_kind_amd)
> -    rep_movsb_stop_threshold = core;
> -  /* Setting the upper bound of ERMS to the computed value of
> -     non-temporal threshold for architectures other than AMD.  */
> -  else
> -    rep_movsb_stop_threshold = non_temporal_threshold;
> -
>    /* The default threshold to use Enhanced REP STOSB.  */
>    unsigned long int rep_stosb_threshold = 2048;
>
> @@ -951,6 +939,18 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>                            SIZE_MAX);
>  #endif
>
> +  unsigned long int rep_movsb_stop_threshold;
> +  /* ERMS feature is implemented from AMD Zen3 architecture and it is
> +     performing poorly for data above L2 cache size. Henceforth, adding
> +     an upper bound threshold parameter to limit the usage of Enhanced
> +     REP MOVSB operations and setting its value to L2 cache size.  */
> +  if (cpu_features->basic.kind == arch_kind_amd)
> +    rep_movsb_stop_threshold = core;
> +  /* Setting the upper bound of ERMS to the computed value of
> +     non-temporal threshold for architectures other than AMD.  */
> +  else
> +    rep_movsb_stop_threshold = non_temporal_threshold;
> +
>    cpu_features->data_cache_size = data;
>    cpu_features->shared_cache_size = shared;
>    cpu_features->non_temporal_threshold = non_temporal_threshold;
> --
> 2.34.1
>

LGTM.

Thanks.
Sunil Pandey July 14, 2022, 2:53 a.m. UTC | #2
On Tue, Jun 14, 2022 at 6:03 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Tue, Jun 14, 2022 at 5:25 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Move the setting of `rep_movsb_stop_threshold` to after the tunables
> > have been collected so that the `rep_movsb_stop_threshold` (which
> > is used to redirect control flow to the non_temporal case) will
> > use any user value for `non_temporal_threshold` (set using
> > glibc.cpu.x86_non_temporal_threshold)
> > ---
> >  sysdeps/x86/dl-cacheinfo.h | 24 ++++++++++++------------
> >  1 file changed, 12 insertions(+), 12 deletions(-)
> >
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index f64a2fb0ba..cc3b840f9c 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> >      rep_movsb_threshold = 2112;
> >
> > -  unsigned long int rep_movsb_stop_threshold;
> > -  /* ERMS feature is implemented from AMD Zen3 architecture and it is
> > -     performing poorly for data above L2 cache size. Henceforth, adding
> > -     an upper bound threshold parameter to limit the usage of Enhanced
> > -     REP MOVSB operations and setting its value to L2 cache size.  */
> > -  if (cpu_features->basic.kind == arch_kind_amd)
> > -    rep_movsb_stop_threshold = core;
> > -  /* Setting the upper bound of ERMS to the computed value of
> > -     non-temporal threshold for architectures other than AMD.  */
> > -  else
> > -    rep_movsb_stop_threshold = non_temporal_threshold;
> > -
> >    /* The default threshold to use Enhanced REP STOSB.  */
> >    unsigned long int rep_stosb_threshold = 2048;
> >
> > @@ -951,6 +939,18 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >                            SIZE_MAX);
> >  #endif
> >
> > +  unsigned long int rep_movsb_stop_threshold;
> > +  /* ERMS feature is implemented from AMD Zen3 architecture and it is
> > +     performing poorly for data above L2 cache size. Henceforth, adding
> > +     an upper bound threshold parameter to limit the usage of Enhanced
> > +     REP MOVSB operations and setting its value to L2 cache size.  */
> > +  if (cpu_features->basic.kind == arch_kind_amd)
> > +    rep_movsb_stop_threshold = core;
> > +  /* Setting the upper bound of ERMS to the computed value of
> > +     non-temporal threshold for architectures other than AMD.  */
> > +  else
> > +    rep_movsb_stop_threshold = non_temporal_threshold;
> > +
> >    cpu_features->data_cache_size = data;
> >    cpu_features->shared_cache_size = shared;
> >    cpu_features->non_temporal_threshold = non_temporal_threshold;
> > --
> > 2.34.1
> >
>
> LGTM.
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
diff mbox series

Patch

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index f64a2fb0ba..cc3b840f9c 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -898,18 +898,6 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
-  unsigned long int rep_movsb_stop_threshold;
-  /* ERMS feature is implemented from AMD Zen3 architecture and it is
-     performing poorly for data above L2 cache size. Henceforth, adding
-     an upper bound threshold parameter to limit the usage of Enhanced
-     REP MOVSB operations and setting its value to L2 cache size.  */
-  if (cpu_features->basic.kind == arch_kind_amd)
-    rep_movsb_stop_threshold = core;
-  /* Setting the upper bound of ERMS to the computed value of
-     non-temporal threshold for architectures other than AMD.  */
-  else
-    rep_movsb_stop_threshold = non_temporal_threshold;
-
   /* The default threshold to use Enhanced REP STOSB.  */
   unsigned long int rep_stosb_threshold = 2048;
 
@@ -951,6 +939,18 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
 			   SIZE_MAX);
 #endif
 
+  unsigned long int rep_movsb_stop_threshold;
+  /* ERMS feature is implemented from AMD Zen3 architecture and it is
+     performing poorly for data above L2 cache size. Henceforth, adding
+     an upper bound threshold parameter to limit the usage of Enhanced
+     REP MOVSB operations and setting its value to L2 cache size.  */
+  if (cpu_features->basic.kind == arch_kind_amd)
+    rep_movsb_stop_threshold = core;
+  /* Setting the upper bound of ERMS to the computed value of
+     non-temporal threshold for architectures other than AMD.  */
+  else
+    rep_movsb_stop_threshold = non_temporal_threshold;
+
   cpu_features->data_cache_size = data;
   cpu_features->shared_cache_size = shared;
   cpu_features->non_temporal_threshold = non_temporal_threshold;