diff mbox series

x86: Increase default `rep stosb` threshold for SKX [BZ #32009]

Message ID 20240723063821.3460385-1-goldstein.w.n@gmail.com
State New
Headers show
Series x86: Increase default `rep stosb` threshold for SKX [BZ #32009] | expand

Commit Message

Noah Goldstein July 23, 2024, 6:38 a.m. UTC
Benchmarks indicate that `2048` (prior value) is far too low a
threshold for using `rep stosb`. Rather something around `1048576` is
preferable:

https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing

(See: SKX-1/SKX-1.1/SKX-2/SKX-2.2)

The `1048576` theshold was tested on multiple SKX machines and
suprisingly seemed to hold regardless of cache hierarchy.

Also note that in highly parallel settings, a smaller value is
preferable, but this difference does seem not so extreme to justify a
worse threshold with low/moderate parallelism.

Tested new threshold using qemu on all x86 systems mutually supported
by GLIBC and qemu.
---
 sysdeps/x86/cpu-features.c         | 11 +++++++++++
 sysdeps/x86/dl-cacheinfo.h         | 21 +++++++++++----------
 sysdeps/x86/dl-diagnostics-cpu.c   |  6 ++++--
 sysdeps/x86/include/cpu-features.h |  2 ++
 4 files changed, 28 insertions(+), 12 deletions(-)

Comments

H.J. Lu Aug. 19, 2024, 1:12 p.m. UTC | #1
On Mon, Jul 22, 2024 at 11:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Benchmarks indicate that `2048` (prior value) is far too low a
> threshold for using `rep stosb`. Rather something around `1048576` is
> preferable:
>
> https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
>
> (See: SKX-1/SKX-1.1/SKX-2/SKX-2.2)
>
> The `1048576` theshold was tested on multiple SKX machines and
> suprisingly seemed to hold regardless of cache hierarchy.
>
> Also note that in highly parallel settings, a smaller value is
> preferable, but this difference does seem not so extreme to justify a
> worse threshold with low/moderate parallelism.
>
> Tested new threshold using qemu on all x86 systems mutually supported
> by GLIBC and qemu.
> ---
>  sysdeps/x86/cpu-features.c         | 11 +++++++++++
>  sysdeps/x86/dl-cacheinfo.h         | 21 +++++++++++----------
>  sysdeps/x86/dl-diagnostics-cpu.c   |  6 ++++--
>  sysdeps/x86/include/cpu-features.h |  2 ++
>  4 files changed, 28 insertions(+), 12 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index c096dd390a..c5429422c0 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -757,6 +757,7 @@ init_cpu_features (struct cpu_features *cpu_features)
>    enum cpu_features_kind kind;
>
>    cpu_features->cachesize_non_temporal_divisor = 4;
> +  cpu_features->machine_rep_stosb_threshold = 0;
>  #if !HAS_CPUID
>    if (__get_cpuid_max (0, 0) == 0)
>      {
> @@ -879,6 +880,16 @@ init_cpu_features (struct cpu_features *cpu_features)
>                      non-temporal on all Skylake servers. */
>               cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
>                   |= bit_arch_Avoid_Non_Temporal_Memset;
> +             /* SKX prefers temporal stores for a while. Confusingly, tests
> +                across multiple SKX systems with different cache sizes all
> +                indicate the threshold for when `rep stosb` becomes preferable
> +                is 1048576 as opposed to some function of the size of the
> +                various components of the cache hierarchy. Worth noting,
> +                although not taken into account here, is that `rep stosb` is
> +                more preferable with higher degrees of parallelism, i.e if all
> +                cores are simultaneously setting memory a lower threshold
> +                would be preferable.  */
> +             cpu_features->machine_rep_stosb_threshold = 1048576;
>             case INTEL_BIGCORE_COMETLAKE:
>             case INTEL_BIGCORE_SKYLAKE:
>             case INTEL_BIGCORE_KABYLAKE:
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index a1c03b8903..c646b57508 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1002,9 +1002,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (cpu_features->basic.kind == arch_kind_amd)
>      rep_movsb_threshold = non_temporal_threshold;
>
> -  /* The default threshold to use Enhanced REP STOSB.  */
> -  unsigned long int rep_stosb_threshold = 2048;
> -
>    long int tunable_size;
>
>    tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
> @@ -1034,13 +1031,17 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    /* NB: The default value of the x86_rep_stosb_threshold tunable is the
>       same as the default value of __x86_rep_stosb_threshold and the
>       minimum value is fixed.  */
> -  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
> -                                    long int, NULL);
> -  if (cpu_features->basic.kind == arch_kind_amd
> -      && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
> -    /* For AMD Zen3+ architecture, the performance of the vectorized loop is
> -       slightly better than ERMS.  */
> -    rep_stosb_threshold = SIZE_MAX;
> +  unsigned long int rep_stosb_threshold
> +      = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL);
> +  if (!TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
> +    {
> +      if (cpu_features->machine_rep_stosb_threshold != 0)
> +       rep_stosb_threshold = cpu_features->machine_rep_stosb_threshold;
> +      /* For AMD Zen3+ architecture, the performance of the vectorized loop
> +        is slightly better than ERMS.  */
> +      else if (cpu_features->basic.kind == arch_kind_amd)
> +       rep_stosb_threshold = SIZE_MAX;
> +    }
>
>    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
>    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
> diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
> index 49eeb5f70a..b84903a294 100644
> --- a/sysdeps/x86/dl-diagnostics-cpu.c
> +++ b/sysdeps/x86/dl-diagnostics-cpu.c
> @@ -128,9 +128,11 @@ _dl_diagnostics_cpu (void)
>                              cpu_features->level4_cache_size);
>    print_cpu_features_value ("cachesize_non_temporal_divisor",
>                             cpu_features->cachesize_non_temporal_divisor);
> +  print_cpu_features_value ("machine_rep_stosb_threshold",
> +                           cpu_features->machine_rep_stosb_threshold);
>    _Static_assert (
> -      offsetof (struct cpu_features, cachesize_non_temporal_divisor)
> -             + sizeof (cpu_features->cachesize_non_temporal_divisor)
> +      offsetof (struct cpu_features, machine_rep_stosb_threshold)
> +             + sizeof (cpu_features->machine_rep_stosb_threshold)
>           == sizeof (*cpu_features),
>        "last cpu_features field has been printed");
>
> diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
> index aaae44f0e1..5f2b5b93ac 100644
> --- a/sysdeps/x86/include/cpu-features.h
> +++ b/sysdeps/x86/include/cpu-features.h
> @@ -981,6 +981,8 @@ struct cpu_features
>    /* When no user non_temporal_threshold is specified. We default to
>       cachesize / cachesize_non_temporal_divisor.  */
>    unsigned long int cachesize_non_temporal_divisor;
> +  /* Default rep stosb threshold (if 0, use default in dl-machine.h).  */
> +  unsigned long int machine_rep_stosb_threshold;
>  };
>
>  /* Get a pointer to the CPU features structure.  */
> --
> 2.34.1
>

Please rebase against master.

Thanks.
Noah Goldstein Aug. 19, 2024, 4:29 p.m. UTC | #2
On Mon, Aug 19, 2024 at 6:12 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jul 22, 2024 at 11:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Benchmarks indicate that `2048` (prior value) is far too low a
> > threshold for using `rep stosb`. Rather something around `1048576` is
> > preferable:
> >
> > https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
> >
> > (See: SKX-1/SKX-1.1/SKX-2/SKX-2.2)
> >
> > The `1048576` theshold was tested on multiple SKX machines and
> > suprisingly seemed to hold regardless of cache hierarchy.
> >
> > Also note that in highly parallel settings, a smaller value is
> > preferable, but this difference does seem not so extreme to justify a
> > worse threshold with low/moderate parallelism.
> >
> > Tested new threshold using qemu on all x86 systems mutually supported
> > by GLIBC and qemu.
> > ---
> >  sysdeps/x86/cpu-features.c         | 11 +++++++++++
> >  sysdeps/x86/dl-cacheinfo.h         | 21 +++++++++++----------
> >  sysdeps/x86/dl-diagnostics-cpu.c   |  6 ++++--
> >  sysdeps/x86/include/cpu-features.h |  2 ++
> >  4 files changed, 28 insertions(+), 12 deletions(-)
> >
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index c096dd390a..c5429422c0 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -757,6 +757,7 @@ init_cpu_features (struct cpu_features *cpu_features)
> >    enum cpu_features_kind kind;
> >
> >    cpu_features->cachesize_non_temporal_divisor = 4;
> > +  cpu_features->machine_rep_stosb_threshold = 0;
> >  #if !HAS_CPUID
> >    if (__get_cpuid_max (0, 0) == 0)
> >      {
> > @@ -879,6 +880,16 @@ init_cpu_features (struct cpu_features *cpu_features)
> >                      non-temporal on all Skylake servers. */
> >               cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> >                   |= bit_arch_Avoid_Non_Temporal_Memset;
> > +             /* SKX prefers temporal stores for a while. Confusingly, tests
> > +                across multiple SKX systems with different cache sizes all
> > +                indicate the threshold for when `rep stosb` becomes preferable
> > +                is 1048576 as opposed to some function of the size of the
> > +                various components of the cache hierarchy. Worth noting,
> > +                although not taken into account here, is that `rep stosb` is
> > +                more preferable with higher degrees of parallelism, i.e if all
> > +                cores are simultaneously setting memory a lower threshold
> > +                would be preferable.  */
> > +             cpu_features->machine_rep_stosb_threshold = 1048576;
> >             case INTEL_BIGCORE_COMETLAKE:
> >             case INTEL_BIGCORE_SKYLAKE:
> >             case INTEL_BIGCORE_KABYLAKE:
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index a1c03b8903..c646b57508 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -1002,9 +1002,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    if (cpu_features->basic.kind == arch_kind_amd)
> >      rep_movsb_threshold = non_temporal_threshold;
> >
> > -  /* The default threshold to use Enhanced REP STOSB.  */
> > -  unsigned long int rep_stosb_threshold = 2048;
> > -
> >    long int tunable_size;
> >
> >    tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
> > @@ -1034,13 +1031,17 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    /* NB: The default value of the x86_rep_stosb_threshold tunable is the
> >       same as the default value of __x86_rep_stosb_threshold and the
> >       minimum value is fixed.  */
> > -  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
> > -                                    long int, NULL);
> > -  if (cpu_features->basic.kind == arch_kind_amd
> > -      && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
> > -    /* For AMD Zen3+ architecture, the performance of the vectorized loop is
> > -       slightly better than ERMS.  */
> > -    rep_stosb_threshold = SIZE_MAX;
> > +  unsigned long int rep_stosb_threshold
> > +      = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL);
> > +  if (!TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
> > +    {
> > +      if (cpu_features->machine_rep_stosb_threshold != 0)
> > +       rep_stosb_threshold = cpu_features->machine_rep_stosb_threshold;
> > +      /* For AMD Zen3+ architecture, the performance of the vectorized loop
> > +        is slightly better than ERMS.  */
> > +      else if (cpu_features->basic.kind == arch_kind_amd)
> > +       rep_stosb_threshold = SIZE_MAX;
> > +    }
> >
> >    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
> >    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
> > diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
> > index 49eeb5f70a..b84903a294 100644
> > --- a/sysdeps/x86/dl-diagnostics-cpu.c
> > +++ b/sysdeps/x86/dl-diagnostics-cpu.c
> > @@ -128,9 +128,11 @@ _dl_diagnostics_cpu (void)
> >                              cpu_features->level4_cache_size);
> >    print_cpu_features_value ("cachesize_non_temporal_divisor",
> >                             cpu_features->cachesize_non_temporal_divisor);
> > +  print_cpu_features_value ("machine_rep_stosb_threshold",
> > +                           cpu_features->machine_rep_stosb_threshold);
> >    _Static_assert (
> > -      offsetof (struct cpu_features, cachesize_non_temporal_divisor)
> > -             + sizeof (cpu_features->cachesize_non_temporal_divisor)
> > +      offsetof (struct cpu_features, machine_rep_stosb_threshold)
> > +             + sizeof (cpu_features->machine_rep_stosb_threshold)
> >           == sizeof (*cpu_features),
> >        "last cpu_features field has been printed");
> >
> > diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
> > index aaae44f0e1..5f2b5b93ac 100644
> > --- a/sysdeps/x86/include/cpu-features.h
> > +++ b/sysdeps/x86/include/cpu-features.h
> > @@ -981,6 +981,8 @@ struct cpu_features
> >    /* When no user non_temporal_threshold is specified. We default to
> >       cachesize / cachesize_non_temporal_divisor.  */
> >    unsigned long int cachesize_non_temporal_divisor;
> > +  /* Default rep stosb threshold (if 0, use default in dl-machine.h).  */
> > +  unsigned long int machine_rep_stosb_threshold;
> >  };
> >
> >  /* Get a pointer to the CPU features structure.  */
> > --
> > 2.34.1
> >
>
> Please rebase against master.
>
> Thanks.

This is on hold at the moment. DJ and I are still trying to figure out
the best fix.
>
> --
> H.J.
H.J. Lu Aug. 19, 2024, 4:35 p.m. UTC | #3
On Mon, Aug 19, 2024, 9:29 AM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> On Mon, Aug 19, 2024 at 6:12 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jul 22, 2024 at 11:38 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> > >
> > > Benchmarks indicate that `2048` (prior value) is far too low a
> > > threshold for using `rep stosb`. Rather something around `1048576` is
> > > preferable:
> > >
> > >
> https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
> > >
> > > (See: SKX-1/SKX-1.1/SKX-2/SKX-2.2)
> > >
> > > The `1048576` theshold was tested on multiple SKX machines and
> > > suprisingly seemed to hold regardless of cache hierarchy.
> > >
> > > Also note that in highly parallel settings, a smaller value is
> > > preferable, but this difference does seem not so extreme to justify a
> > > worse threshold with low/moderate parallelism.
> > >
> > > Tested new threshold using qemu on all x86 systems mutually supported
> > > by GLIBC and qemu.
> > > ---
> > >  sysdeps/x86/cpu-features.c         | 11 +++++++++++
> > >  sysdeps/x86/dl-cacheinfo.h         | 21 +++++++++++----------
> > >  sysdeps/x86/dl-diagnostics-cpu.c   |  6 ++++--
> > >  sysdeps/x86/include/cpu-features.h |  2 ++
> > >  4 files changed, 28 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > > index c096dd390a..c5429422c0 100644
> > > --- a/sysdeps/x86/cpu-features.c
> > > +++ b/sysdeps/x86/cpu-features.c
> > > @@ -757,6 +757,7 @@ init_cpu_features (struct cpu_features
> *cpu_features)
> > >    enum cpu_features_kind kind;
> > >
> > >    cpu_features->cachesize_non_temporal_divisor = 4;
> > > +  cpu_features->machine_rep_stosb_threshold = 0;
> > >  #if !HAS_CPUID
> > >    if (__get_cpuid_max (0, 0) == 0)
> > >      {
> > > @@ -879,6 +880,16 @@ init_cpu_features (struct cpu_features
> *cpu_features)
> > >                      non-temporal on all Skylake servers. */
> > >
>  cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > >                   |= bit_arch_Avoid_Non_Temporal_Memset;
> > > +             /* SKX prefers temporal stores for a while. Confusingly,
> tests
> > > +                across multiple SKX systems with different cache
> sizes all
> > > +                indicate the threshold for when `rep stosb` becomes
> preferable
> > > +                is 1048576 as opposed to some function of the size of
> the
> > > +                various components of the cache hierarchy. Worth
> noting,
> > > +                although not taken into account here, is that `rep
> stosb` is
> > > +                more preferable with higher degrees of parallelism,
> i.e if all
> > > +                cores are simultaneously setting memory a lower
> threshold
> > > +                would be preferable.  */
> > > +             cpu_features->machine_rep_stosb_threshold = 1048576;
> > >             case INTEL_BIGCORE_COMETLAKE:
> > >             case INTEL_BIGCORE_SKYLAKE:
> > >             case INTEL_BIGCORE_KABYLAKE:
> > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > > index a1c03b8903..c646b57508 100644
> > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > @@ -1002,9 +1002,6 @@ dl_init_cacheinfo (struct cpu_features
> *cpu_features)
> > >    if (cpu_features->basic.kind == arch_kind_amd)
> > >      rep_movsb_threshold = non_temporal_threshold;
> > >
> > > -  /* The default threshold to use Enhanced REP STOSB.  */
> > > -  unsigned long int rep_stosb_threshold = 2048;
> > > -
> > >    long int tunable_size;
> > >
> > >    tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
> > > @@ -1034,13 +1031,17 @@ dl_init_cacheinfo (struct cpu_features
> *cpu_features)
> > >    /* NB: The default value of the x86_rep_stosb_threshold tunable is
> the
> > >       same as the default value of __x86_rep_stosb_threshold and the
> > >       minimum value is fixed.  */
> > > -  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
> > > -                                    long int, NULL);
> > > -  if (cpu_features->basic.kind == arch_kind_amd
> > > -      && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
> > > -    /* For AMD Zen3+ architecture, the performance of the vectorized
> loop is
> > > -       slightly better than ERMS.  */
> > > -    rep_stosb_threshold = SIZE_MAX;
> > > +  unsigned long int rep_stosb_threshold
> > > +      = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL);
> > > +  if (!TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
> > > +    {
> > > +      if (cpu_features->machine_rep_stosb_threshold != 0)
> > > +       rep_stosb_threshold =
> cpu_features->machine_rep_stosb_threshold;
> > > +      /* For AMD Zen3+ architecture, the performance of the
> vectorized loop
> > > +        is slightly better than ERMS.  */
> > > +      else if (cpu_features->basic.kind == arch_kind_amd)
> > > +       rep_stosb_threshold = SIZE_MAX;
> > > +    }
> > >
> > >    TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
> > >    TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0,
> SIZE_MAX);
> > > diff --git a/sysdeps/x86/dl-diagnostics-cpu.c
> b/sysdeps/x86/dl-diagnostics-cpu.c
> > > index 49eeb5f70a..b84903a294 100644
> > > --- a/sysdeps/x86/dl-diagnostics-cpu.c
> > > +++ b/sysdeps/x86/dl-diagnostics-cpu.c
> > > @@ -128,9 +128,11 @@ _dl_diagnostics_cpu (void)
> > >                              cpu_features->level4_cache_size);
> > >    print_cpu_features_value ("cachesize_non_temporal_divisor",
> > >
>  cpu_features->cachesize_non_temporal_divisor);
> > > +  print_cpu_features_value ("machine_rep_stosb_threshold",
> > > +                           cpu_features->machine_rep_stosb_threshold);
> > >    _Static_assert (
> > > -      offsetof (struct cpu_features, cachesize_non_temporal_divisor)
> > > -             + sizeof (cpu_features->cachesize_non_temporal_divisor)
> > > +      offsetof (struct cpu_features, machine_rep_stosb_threshold)
> > > +             + sizeof (cpu_features->machine_rep_stosb_threshold)
> > >           == sizeof (*cpu_features),
> > >        "last cpu_features field has been printed");
> > >
> > > diff --git a/sysdeps/x86/include/cpu-features.h
> b/sysdeps/x86/include/cpu-features.h
> > > index aaae44f0e1..5f2b5b93ac 100644
> > > --- a/sysdeps/x86/include/cpu-features.h
> > > +++ b/sysdeps/x86/include/cpu-features.h
> > > @@ -981,6 +981,8 @@ struct cpu_features
> > >    /* When no user non_temporal_threshold is specified. We default to
> > >       cachesize / cachesize_non_temporal_divisor.  */
> > >    unsigned long int cachesize_non_temporal_divisor;
> > > +  /* Default rep stosb threshold (if 0, use default in
> dl-machine.h).  */
> > > +  unsigned long int machine_rep_stosb_threshold;
> > >  };
> > >
> > >  /* Get a pointer to the CPU features structure.  */
> > > --
> > > 2.34.1
> > >
> >
> > Please rebase against master.
> >
> > Thanks.
>
> This is on hold at the moment. DJ and I are still trying to figure out
> the best fix.
>

OK.

>
> > --
> > H.J.
>
>
diff mbox series

Patch

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c096dd390a..c5429422c0 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -757,6 +757,7 @@  init_cpu_features (struct cpu_features *cpu_features)
   enum cpu_features_kind kind;
 
   cpu_features->cachesize_non_temporal_divisor = 4;
+  cpu_features->machine_rep_stosb_threshold = 0;
 #if !HAS_CPUID
   if (__get_cpuid_max (0, 0) == 0)
     {
@@ -879,6 +880,16 @@  init_cpu_features (struct cpu_features *cpu_features)
 		     non-temporal on all Skylake servers. */
 	      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
 		  |= bit_arch_Avoid_Non_Temporal_Memset;
+	      /* SKX prefers temporal stores for a while. Confusingly, tests
+	         across multiple SKX systems with different cache sizes all
+	         indicate the threshold for when `rep stosb` becomes preferable
+	         is 1048576 as opposed to some function of the size of the
+	         various components of the cache hierarchy. Worth noting,
+	         although not taken into account here, is that `rep stosb` is
+	         more preferable with higher degrees of parallelism, i.e if all
+	         cores are simultaneously setting memory a lower threshold
+	         would be preferable.  */
+	      cpu_features->machine_rep_stosb_threshold = 1048576;
 	    case INTEL_BIGCORE_COMETLAKE:
 	    case INTEL_BIGCORE_SKYLAKE:
 	    case INTEL_BIGCORE_KABYLAKE:
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index a1c03b8903..c646b57508 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1002,9 +1002,6 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (cpu_features->basic.kind == arch_kind_amd)
     rep_movsb_threshold = non_temporal_threshold;
 
-  /* The default threshold to use Enhanced REP STOSB.  */
-  unsigned long int rep_stosb_threshold = 2048;
-
   long int tunable_size;
 
   tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
@@ -1034,13 +1031,17 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The default value of the x86_rep_stosb_threshold tunable is the
      same as the default value of __x86_rep_stosb_threshold and the
      minimum value is fixed.  */
-  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
-				     long int, NULL);
-  if (cpu_features->basic.kind == arch_kind_amd
-      && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
-    /* For AMD Zen3+ architecture, the performance of the vectorized loop is
-       slightly better than ERMS.  */
-    rep_stosb_threshold = SIZE_MAX;
+  unsigned long int rep_stosb_threshold
+      = TUNABLE_GET (x86_rep_stosb_threshold, long int, NULL);
+  if (!TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
+    {
+      if (cpu_features->machine_rep_stosb_threshold != 0)
+	rep_stosb_threshold = cpu_features->machine_rep_stosb_threshold;
+      /* For AMD Zen3+ architecture, the performance of the vectorized loop
+	 is slightly better than ERMS.  */
+      else if (cpu_features->basic.kind == arch_kind_amd)
+	rep_stosb_threshold = SIZE_MAX;
+    }
 
   TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index 49eeb5f70a..b84903a294 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -128,9 +128,11 @@  _dl_diagnostics_cpu (void)
                             cpu_features->level4_cache_size);
   print_cpu_features_value ("cachesize_non_temporal_divisor",
 			    cpu_features->cachesize_non_temporal_divisor);
+  print_cpu_features_value ("machine_rep_stosb_threshold",
+			    cpu_features->machine_rep_stosb_threshold);
   _Static_assert (
-      offsetof (struct cpu_features, cachesize_non_temporal_divisor)
-	      + sizeof (cpu_features->cachesize_non_temporal_divisor)
+      offsetof (struct cpu_features, machine_rep_stosb_threshold)
+	      + sizeof (cpu_features->machine_rep_stosb_threshold)
 	  == sizeof (*cpu_features),
       "last cpu_features field has been printed");
 
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index aaae44f0e1..5f2b5b93ac 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -981,6 +981,8 @@  struct cpu_features
   /* When no user non_temporal_threshold is specified. We default to
      cachesize / cachesize_non_temporal_divisor.  */
   unsigned long int cachesize_non_temporal_divisor;
+  /* Default rep stosb threshold (if 0, use default in dl-machine.h).  */
+  unsigned long int machine_rep_stosb_threshold;
 };
 
 /* Get a pointer to the CPU features structure.  */