diff mbox series

[v1] x86: Disable non-temporal memset on Skylake Server

Message ID 20240715082228.715048-1-goldstein.w.n@gmail.com
State New
Headers show
Series [v1] x86: Disable non-temporal memset on Skylake Server | expand

Commit Message

Noah Goldstein July 15, 2024, 8:22 a.m. UTC
The original commit enabling non-temporal memset on Skylake Server had
erroneous benchmarks (actually done on ICX).

Further benchmarks indicate non-temporal stores may in fact by a
regression on Skylake Server.

This commit may be over-cautious in some cases, but should avoid any
regressions for 2.40.

Tested using qemu on all x86_64 cpu arch supported by both qemu +
GLIBC.
---
 sysdeps/x86/cpu-features.c                        | 11 +++++++++--
 sysdeps/x86/cpu-tunables.c                        |  5 +++++
 sysdeps/x86/dl-cacheinfo.h                        | 15 ++++++++-------
 .../cpu-features-preferred_feature_index_1.def    |  1 +
 sysdeps/x86/tst-hwcap-tunables.c                  |  4 ++--
 5 files changed, 25 insertions(+), 11 deletions(-)

Comments

Andreas K. Huettel July 15, 2024, 11:51 a.m. UTC | #1
Am Montag, 15. Juli 2024, 10:22:28 CEST schrieb Noah Goldstein:
> The original commit enabling non-temporal memset on Skylake Server had
> erroneous benchmarks (actually done on ICX).
> 
> Further benchmarks indicate non-temporal stores may in fact by a
> regression on Skylake Server.
> 
> This commit may be over-cautious in some cases, but should avoid any
> regressions for 2.40.

OK with someone's R-B

> 
> Tested using qemu on all x86_64 cpu arch supported by both qemu +
> GLIBC.
> ---
>  sysdeps/x86/cpu-features.c                        | 11 +++++++++--
>  sysdeps/x86/cpu-tunables.c                        |  5 +++++
>  sysdeps/x86/dl-cacheinfo.h                        | 15 ++++++++-------
>  .../cpu-features-preferred_feature_index_1.def    |  1 +
>  sysdeps/x86/tst-hwcap-tunables.c                  |  4 ++--
>  5 files changed, 25 insertions(+), 11 deletions(-)
> 
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index e501e084ef..f473fba216 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -870,11 +870,18 @@ init_cpu_features (struct cpu_features *cpu_features)
>  
>  	      /* Newer Bigcore microarch (larger non-temporal store
>  		 threshold).  */
> -	    case INTEL_BIGCORE_SKYLAKE:
> -	    case INTEL_BIGCORE_KABYLAKE:
>  	    case INTEL_BIGCORE_COMETLAKE:
>  	    case INTEL_BIGCORE_SKYLAKE_AVX512:
>  	    case INTEL_BIGCORE_CANNONLAKE:
> +	      /* Benchmarks indicate non-temporal memset is not
> +		     necessarily profitable on SKX (and in some cases much
> +		     worse). This is likely unique to SKX due its it unique
> +		     mesh interconnect (not present on ICX or BWD). Disable
> +		     non-temporal on all Skylake servers. */
> +	      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +		  |= bit_arch_Avoid_Non_Temporal_Memset;
> +	    case INTEL_BIGCORE_SKYLAKE:
> +	    case INTEL_BIGCORE_KABYLAKE:
>  	    case INTEL_BIGCORE_ICELAKE:
>  	    case INTEL_BIGCORE_TIGERLAKE:
>  	    case INTEL_BIGCORE_ROCKETLAKE:
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> index 89da7a03da..ae9dcd6180 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -243,6 +243,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
>  		(n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
>  	    }
>  	  break;
> +	case 25:
> +	  {
> +	    CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> +					      Avoid_Non_Temporal_Memset, 25);
> +	  }
>  	case 26:
>  	    {
>  	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5e77345a6e..a1c03b8903 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -991,13 +991,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    /* Non-temporal stores are more performant on Intel and AMD hardware above
>       non_temporal_threshold. Enable this for both Intel and AMD hardware. */
>    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> -  if (cpu_features->basic.kind == arch_kind_intel
> -      || cpu_features->basic.kind == arch_kind_amd)
> -      memset_non_temporal_threshold = non_temporal_threshold;
> -
> -   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> -      cases slower than the vectorized path (and for some alignments,
> -      it is really slow, check BZ #30994).  */
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> +      && (cpu_features->basic.kind == arch_kind_intel
> +	  || cpu_features->basic.kind == arch_kind_amd))
> +    memset_non_temporal_threshold = non_temporal_threshold;
> +
> +  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> +     cases slower than the vectorized path (and for some alignments,
> +     it is really slow, check BZ #30994).  */
>    if (cpu_features->basic.kind == arch_kind_amd)
>      rep_movsb_threshold = non_temporal_threshold;
>  
> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..61bbbc2e89 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>  BIT (MathVec_Prefer_No_AVX512)
>  BIT (Prefer_FSRM)
>  BIT (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Avoid_Non_Temporal_Memset)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..94307283d7 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
>      /* Disable everything.  */
>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> -    "-AVX_Fast_Unaligned_Load",
> +    "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
>      test_1,
>      array_length (test_1)
>    },
> @@ -68,7 +68,7 @@ static const struct test_t
>      /* Same as before, but with some empty suboptions.  */
>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
>      test_1,
>      array_length (test_1)
>    }
>
H.J. Lu July 16, 2024, 5:07 a.m. UTC | #2
On Tue, Jul 16, 2024, 5:01 AM DJ Delorie <dj@redhat.com> wrote:

>
> Other than "I don't know what the effect of this patch is"
> (benchmark-wise) it looks OK to me.  It looks like the only affected
> microarches are COMETLAKE, SKYLAKE_AVX512, and CANNONLAKE.  The system I
> tested on is a Xeon 4216, I don't know which microarch that falls into.
> Was this your intent?  The description implies otherwise.
>
> This change should be overridable by tunables anyway (I hope?), and
> can't actually *break* anything, so as long as the above was your
> intent...
>
> LGTM.
> Reviewed-by: DJ Delorie <dj@redhat.com>
>
> Noah Goldstein <goldstein.w.n@gmail.com> writes:
> > The original commit enabling non-temporal memset on Skylake Server had
> > erroneous benchmarks (actually done on ICX).
> >
> > Further benchmarks indicate non-temporal stores may in fact by a
> > regression on Skylake Server.
> >
> > This commit may be over-cautious in some cases, but should avoid any
> > regressions for 2.40.
> >
> > Tested using qemu on all x86_64 cpu arch supported by both qemu +
> > GLIBC.
>
> Reviewing the code itself; the system I tested on didn't act like I
> expected so I havn't spot-checked it for functionality.
>
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index e501e084ef..f473fba216 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -870,11 +870,18 @@ init_cpu_features (struct cpu_features
> *cpu_features)
> >
> >             /* Newer Bigcore microarch (larger non-temporal store
> >                threshold).  */
> > -         case INTEL_BIGCORE_SKYLAKE:
> > -         case INTEL_BIGCORE_KABYLAKE:
> >           case INTEL_BIGCORE_COMETLAKE:
>

Comet Lake is the same as Skylake client. Please also move it.


>           case INTEL_BIGCORE_SKYLAKE_AVX512:
> >           case INTEL_BIGCORE_CANNONLAKE:
> > +           /* Benchmarks indicate non-temporal memset is not
> > +                  necessarily profitable on SKX (and in some cases much
> > +                  worse). This is likely unique to SKX due its it unique
> > +                  mesh interconnect (not present on ICX or BWD). Disable
> > +                  non-temporal on all Skylake servers. */
> > +           cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > +               |= bit_arch_Avoid_Non_Temporal_Memset;
> > +         case INTEL_BIGCORE_SKYLAKE:
> > +         case INTEL_BIGCORE_KABYLAKE:
> >           case INTEL_BIGCORE_ICELAKE:
> >           case INTEL_BIGCORE_TIGERLAKE:
> >           case INTEL_BIGCORE_ROCKETLAKE:
>
> So for three microarches, we prefer to avoid non-temporal.  This implies
> that for skylake and kabylake we continue to prefere NT.  OK.
>
> > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> > index 89da7a03da..ae9dcd6180 100644
> > --- a/sysdeps/x86/cpu-tunables.c
> > +++ b/sysdeps/x86/cpu-tunables.c
> > @@ -243,6 +243,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> >               (n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
> >           }
> >         break;
> > +     case 25:
> > +       {
> > +         CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> > +                                           Avoid_Non_Temporal_Memset,
> 25);
> > +       }
> >       case 26:
> >           {
> >             CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
>
> Ok, 25 was missing anyway ;-)
>
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index 5e77345a6e..a1c03b8903 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -991,13 +991,14 @@ dl_init_cacheinfo (struct cpu_features
> *cpu_features)
> >    /* Non-temporal stores are more performant on Intel and AMD hardware
> above
> >       non_temporal_threshold. Enable this for both Intel and AMD
> hardware. */
> >    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
>
> > -  if (cpu_features->basic.kind == arch_kind_intel
> > -      || cpu_features->basic.kind == arch_kind_amd)
> > -      memset_non_temporal_threshold = non_temporal_threshold;
> > -
> > -   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> > -      cases slower than the vectorized path (and for some alignments,
> > -      it is really slow, check BZ #30994).  */
>
> > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> > +      && (cpu_features->basic.kind == arch_kind_intel
> > +       || cpu_features->basic.kind == arch_kind_amd))
> > +    memset_non_temporal_threshold = non_temporal_threshold;
> > +
> > +  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> > +     cases slower than the vectorized path (and for some alignments,
> > +     it is really slow, check BZ #30994).  */
>
> >    if (cpu_features->basic.kind == arch_kind_amd)
> >      rep_movsb_threshold = non_temporal_threshold;
>
> Ok.
>
> > diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > index 85e7f54ec8..61bbbc2e89 100644
> > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> >  BIT (MathVec_Prefer_No_AVX512)
> >  BIT (Prefer_FSRM)
> >  BIT (Avoid_Short_Distance_REP_MOVSB)
> > +BIT (Avoid_Non_Temporal_Memset)
> > diff --git a/sysdeps/x86/tst-hwcap-tunables.c
> b/sysdeps/x86/tst-hwcap-tunables.c
> > index 8589a9fd66..94307283d7 100644
> > --- a/sysdeps/x86/tst-hwcap-tunables.c
> > +++ b/sysdeps/x86/tst-hwcap-tunables.c
> > @@ -60,7 +60,7 @@ static const struct test_t
> >      /* Disable everything.  */
> >      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> > -    "-AVX_Fast_Unaligned_Load",
> > +    "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
> >      test_1,
> >      array_length (test_1)
> >    },
> > @@ -68,7 +68,7 @@ static const struct test_t
> >      /* Same as before, but with some empty suboptions.  */
> >      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> > -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> > +    "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
> >      test_1,
> >      array_length (test_1)
> >    }
>
> Ok.
>
>
Noah Goldstein July 16, 2024, 5:38 a.m. UTC | #3
On Tue, Jul 16, 2024 at 1:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
>
>
> On Tue, Jul 16, 2024, 5:01 AM DJ Delorie <dj@redhat.com> wrote:
>>
>>
>> Other than "I don't know what the effect of this patch is"
>> (benchmark-wise) it looks OK to me.  It looks like the only affected
>> microarches are COMETLAKE, SKYLAKE_AVX512, and CANNONLAKE.  The system I
>> tested on is a Xeon 4216, I don't know which microarch that falls into.
>> Was this your intent?  The description implies otherwise.
>>
>> This change should be overridable by tunables anyway (I hope?), and
>> can't actually *break* anything, so as long as the above was your
>> intent...
>>
>> LGTM.
>> Reviewed-by: DJ Delorie <dj@redhat.com>
>>
>> Noah Goldstein <goldstein.w.n@gmail.com> writes:
>> > The original commit enabling non-temporal memset on Skylake Server had
>> > erroneous benchmarks (actually done on ICX).
>> >
>> > Further benchmarks indicate non-temporal stores may in fact by a
>> > regression on Skylake Server.
>> >
>> > This commit may be over-cautious in some cases, but should avoid any
>> > regressions for 2.40.
>> >
>> > Tested using qemu on all x86_64 cpu arch supported by both qemu +
>> > GLIBC.
>>
>> Reviewing the code itself; the system I tested on didn't act like I
>> expected so I havn't spot-checked it for functionality.
>>
>> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
>> > index e501e084ef..f473fba216 100644
>> > --- a/sysdeps/x86/cpu-features.c
>> > +++ b/sysdeps/x86/cpu-features.c
>> > @@ -870,11 +870,18 @@ init_cpu_features (struct cpu_features *cpu_features)
>> >
>> >             /* Newer Bigcore microarch (larger non-temporal store
>> >                threshold).  */
>> > -         case INTEL_BIGCORE_SKYLAKE:
>> > -         case INTEL_BIGCORE_KABYLAKE:
>> >           case INTEL_BIGCORE_COMETLAKE:
>
>
> Comet Lake is the same as Skylake client. Please also move it.

Done
>
>
>> >           case INTEL_BIGCORE_SKYLAKE_AVX512:
>> >           case INTEL_BIGCORE_CANNONLAKE:
>> > +           /* Benchmarks indicate non-temporal memset is not
>> > +                  necessarily profitable on SKX (and in some cases much
>> > +                  worse). This is likely unique to SKX due its it unique
>> > +                  mesh interconnect (not present on ICX or BWD). Disable
>> > +                  non-temporal on all Skylake servers. */
>> > +           cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
>> > +               |= bit_arch_Avoid_Non_Temporal_Memset;
>> > +         case INTEL_BIGCORE_SKYLAKE:
>> > +         case INTEL_BIGCORE_KABYLAKE:
>> >           case INTEL_BIGCORE_ICELAKE:
>> >           case INTEL_BIGCORE_TIGERLAKE:
>> >           case INTEL_BIGCORE_ROCKETLAKE:
>>
>> So for three microarches, we prefer to avoid non-temporal.  This implies
>> that for skylake and kabylake we continue to prefere NT.  OK.
>>
>> > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
>> > index 89da7a03da..ae9dcd6180 100644
>> > --- a/sysdeps/x86/cpu-tunables.c
>> > +++ b/sysdeps/x86/cpu-tunables.c
>> > @@ -243,6 +243,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
>> >               (n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
>> >           }
>> >         break;
>> > +     case 25:
>> > +       {
>> > +         CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
>> > +                                           Avoid_Non_Temporal_Memset, 25);
>> > +       }
>> >       case 26:
>> >           {
>> >             CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
>>
>> Ok, 25 was missing anyway ;-)
>>
>> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
>> > index 5e77345a6e..a1c03b8903 100644
>> > --- a/sysdeps/x86/dl-cacheinfo.h
>> > +++ b/sysdeps/x86/dl-cacheinfo.h
>> > @@ -991,13 +991,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>> >    /* Non-temporal stores are more performant on Intel and AMD hardware above
>> >       non_temporal_threshold. Enable this for both Intel and AMD hardware. */
>> >    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
>>
>> > -  if (cpu_features->basic.kind == arch_kind_intel
>> > -      || cpu_features->basic.kind == arch_kind_amd)
>> > -      memset_non_temporal_threshold = non_temporal_threshold;
>> > -
>> > -   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
>> > -      cases slower than the vectorized path (and for some alignments,
>> > -      it is really slow, check BZ #30994).  */
>>
>> > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
>> > +      && (cpu_features->basic.kind == arch_kind_intel
>> > +       || cpu_features->basic.kind == arch_kind_amd))
>> > +    memset_non_temporal_threshold = non_temporal_threshold;
>> > +
>> > +  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
>> > +     cases slower than the vectorized path (and for some alignments,
>> > +     it is really slow, check BZ #30994).  */
>>
>> >    if (cpu_features->basic.kind == arch_kind_amd)
>> >      rep_movsb_threshold = non_temporal_threshold;
>>
>> Ok.
>>
>> > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> > index 85e7f54ec8..61bbbc2e89 100644
>> > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>> >  BIT (MathVec_Prefer_No_AVX512)
>> >  BIT (Prefer_FSRM)
>> >  BIT (Avoid_Short_Distance_REP_MOVSB)
>> > +BIT (Avoid_Non_Temporal_Memset)
>> > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
>> > index 8589a9fd66..94307283d7 100644
>> > --- a/sysdeps/x86/tst-hwcap-tunables.c
>> > +++ b/sysdeps/x86/tst-hwcap-tunables.c
>> > @@ -60,7 +60,7 @@ static const struct test_t
>> >      /* Disable everything.  */
>> >      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
>> > -    "-AVX_Fast_Unaligned_Load",
>> > +    "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
>> >      test_1,
>> >      array_length (test_1)
>> >    },
>> > @@ -68,7 +68,7 @@ static const struct test_t
>> >      /* Same as before, but with some empty suboptions.  */
>> >      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
>> > -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
>> > +    "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
>> >      test_1,
>> >      array_length (test_1)
>> >    }
>>
>> Ok.
>>
Noah Goldstein July 16, 2024, 5:52 a.m. UTC | #4
On Tue, Jul 16, 2024 at 5:00 AM DJ Delorie <dj@redhat.com> wrote:
>
>
> Other than "I don't know what the effect of this patch is"
> (benchmark-wise) it looks OK to me.  It looks like the only affected
> microarches are COMETLAKE, SKYLAKE_AVX512, and CANNONLAKE.  The system I
> tested on is a Xeon 4216, I don't know which microarch that falls into.
> Was this your intent?  The description implies otherwise.
>
Xeon 4216 will be affected (Cascade Lake):
https://ark.intel.com/content/www/us/en/ark/products/193394/intel-xeon-silver-4216-processor-22m-cache-2-10-ghz.html

Note, this is different from the ERMS issue. This is correcting a regression
I introduced because of incorrect benchmarks.

With this patch, there should be no notable changes to memset perf on
Xeon 4216.

The results of the patch on using all x86_64 arch supported by qemu:
```
Same: 486
Same: 486-v1
Same: Broadwell
Same: Broadwell-IBRS
Same: Broadwell-noTSX
Same: Broadwell-noTSX-IBRS
Same: Broadwell-v1
Same: Broadwell-v2
Same: Broadwell-v3
Same: Broadwell-v4
Diff: Cascadelake-Server
27c27
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Cascadelake-Server-noTSX
30c30
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Cascadelake-Server-v1
27c27
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Cascadelake-Server-v2
32c32
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Cascadelake-Server-v3
30c30
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Cascadelake-Server-v4
30c30
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Cascadelake-Server-v5
31c31
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Same: Conroe
Same: Conroe-v1
Diff: Cooperlake
36c36
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Cooperlake-v1
36c36
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Cooperlake-v2
37c37
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Same: Denverton
Same: Denverton-v1
Same: Denverton-v2
Same: Denverton-v3
Same: Dhyana
Same: Dhyana-v1
Same: Dhyana-v2
Same: EPYC
Same: EPYC-IBPB
Same: EPYC-Milan
Same: EPYC-Milan-v1
Same: EPYC-Rome
Same: EPYC-Rome-v1
Same: EPYC-Rome-v2
Same: EPYC-v1
Same: EPYC-v2
Same: EPYC-v3
Same: Haswell
Same: Haswell-IBRS
Same: Haswell-noTSX
Same: Haswell-noTSX-IBRS
Same: Haswell-v1
Same: Haswell-v2
Same: Haswell-v3
Same: Haswell-v4
Same: Icelake-Client
Same: Icelake-Client-noTSX
Same: Icelake-Client-v1
Same: Icelake-Client-v2
Same: Icelake-Client-v3
Same: Icelake-Server
Same: Icelake-Server-noTSX
Same: Icelake-Server-v1
Same: Icelake-Server-v2
Same: Icelake-Server-v3
Same: Icelake-Server-v4
Same: Icelake-Server-v5
Same: IvyBridge
Same: IvyBridge-IBRS
Same: IvyBridge-v1
Same: IvyBridge-v2
Same: KnightsMill
Same: KnightsMill-v1
Same: Nehalem
Same: Nehalem-IBRS
Same: Nehalem-v1
Same: Nehalem-v2
Same: Opteron_G1
Same: Opteron_G1-v1
Same: Opteron_G2
Same: Opteron_G2-v1
Same: Opteron_G3
Same: Opteron_G3-v1
Same: Opteron_G4
Same: Opteron_G4-v1
Same: Opteron_G5
Same: Opteron_G5-v1
Same: Penryn
Same: Penryn-v1
Same: SandyBridge
Same: SandyBridge-IBRS
Same: SandyBridge-v1
Same: SandyBridge-v2
Same: Skylake-Client
Same: Skylake-Client-IBRS
Same: Skylake-Client-noTSX-IBRS
Same: Skylake-Client-v1
Same: Skylake-Client-v2
Same: Skylake-Client-v3
Same: Skylake-Client-v4
Diff: Skylake-Server
24c24
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Skylake-Server-IBRS
25c25
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Skylake-Server-noTSX-IBRS
23c23
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Skylake-Server-v1
24c24
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Skylake-Server-v2
25c25
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Skylake-Server-v3
23c23
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Skylake-Server-v4
23c23
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Diff: Skylake-Server-v5
24c24
< glibc.cpu.x86_memset_non_temporal_threshold: 0xc00000 (min: 0x4040,
max: 0xffffffffffffffff)
---
> glibc.cpu.x86_memset_non_temporal_threshold: 0xffffffffffffffff (min: 0x4040, max: 0xffffffffffffffff)
Same: Snowridge
Same: Snowridge-v1
Same: Snowridge-v2
Same: Snowridge-v3
Same: Snowridge-v4
Same: Westmere
Same: Westmere-IBRS
Same: Westmere-v1
Same: Westmere-v2
Same: athlon
Same: athlon-v1
Same: core2duo
Same: core2duo-v1
Same: coreduo
Same: coreduo-v1
Same: kvm32
Same: kvm32-v1
Same: kvm64
Same: kvm64-v1
Same: n270
Same: n270-v1
Same: pentium
Same: pentium-v1
Same: pentium2
Same: pentium2-v1
Same: pentium3
Same: pentium3-v1
Same: phenom
Same: phenom-v1
Same: qemu32
Same: qemu32-v1
Same: qemu64
Same: qemu64-v1
Same: base
Same: max
```

Which is exactly what we are looking for.

> This change should be overridable by tunables anyway (I hope?), and
> can't actually *break* anything, so as long as the above was your
> intent...

Correct.

>
> LGTM.
> Reviewed-by: DJ Delorie <dj@redhat.com>
>
> Noah Goldstein <goldstein.w.n@gmail.com> writes:
> > The original commit enabling non-temporal memset on Skylake Server had
> > erroneous benchmarks (actually done on ICX).
> >
> > Further benchmarks indicate non-temporal stores may in fact by a
> > regression on Skylake Server.
> >
> > This commit may be over-cautious in some cases, but should avoid any
> > regressions for 2.40.
> >
> > Tested using qemu on all x86_64 cpu arch supported by both qemu +
> > GLIBC.
>
> Reviewing the code itself; the system I tested on didn't act like I
> expected so I havn't spot-checked it for functionality.
>
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index e501e084ef..f473fba216 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -870,11 +870,18 @@ init_cpu_features (struct cpu_features *cpu_features)
> >
> >             /* Newer Bigcore microarch (larger non-temporal store
> >                threshold).  */
> > -         case INTEL_BIGCORE_SKYLAKE:
> > -         case INTEL_BIGCORE_KABYLAKE:
> >           case INTEL_BIGCORE_COMETLAKE:
> >           case INTEL_BIGCORE_SKYLAKE_AVX512:
> >           case INTEL_BIGCORE_CANNONLAKE:
> > +           /* Benchmarks indicate non-temporal memset is not
> > +                  necessarily profitable on SKX (and in some cases much
> > +                  worse). This is likely unique to SKX due its it unique
> > +                  mesh interconnect (not present on ICX or BWD). Disable
> > +                  non-temporal on all Skylake servers. */
> > +           cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > +               |= bit_arch_Avoid_Non_Temporal_Memset;
> > +         case INTEL_BIGCORE_SKYLAKE:
> > +         case INTEL_BIGCORE_KABYLAKE:
> >           case INTEL_BIGCORE_ICELAKE:
> >           case INTEL_BIGCORE_TIGERLAKE:
> >           case INTEL_BIGCORE_ROCKETLAKE:
>
> So for three microarches, we prefer to avoid non-temporal.  This implies
> that for skylake and kabylake we continue to prefere NT.  OK.
>
> > diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> > index 89da7a03da..ae9dcd6180 100644
> > --- a/sysdeps/x86/cpu-tunables.c
> > +++ b/sysdeps/x86/cpu-tunables.c
> > @@ -243,6 +243,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> >               (n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
> >           }
> >         break;
> > +     case 25:
> > +       {
> > +         CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> > +                                           Avoid_Non_Temporal_Memset, 25);
> > +       }
> >       case 26:
> >           {
> >             CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
>
> Ok, 25 was missing anyway ;-)
>
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index 5e77345a6e..a1c03b8903 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -991,13 +991,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> >    /* Non-temporal stores are more performant on Intel and AMD hardware above
> >       non_temporal_threshold. Enable this for both Intel and AMD hardware. */
> >    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
>
> > -  if (cpu_features->basic.kind == arch_kind_intel
> > -      || cpu_features->basic.kind == arch_kind_amd)
> > -      memset_non_temporal_threshold = non_temporal_threshold;
> > -
> > -   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> > -      cases slower than the vectorized path (and for some alignments,
> > -      it is really slow, check BZ #30994).  */
>
> > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> > +      && (cpu_features->basic.kind == arch_kind_intel
> > +       || cpu_features->basic.kind == arch_kind_amd))
> > +    memset_non_temporal_threshold = non_temporal_threshold;
> > +
> > +  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> > +     cases slower than the vectorized path (and for some alignments,
> > +     it is really slow, check BZ #30994).  */
>
> >    if (cpu_features->basic.kind == arch_kind_amd)
> >      rep_movsb_threshold = non_temporal_threshold;
>
> Ok.
>
> > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > index 85e7f54ec8..61bbbc2e89 100644
> > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> >  BIT (MathVec_Prefer_No_AVX512)
> >  BIT (Prefer_FSRM)
> >  BIT (Avoid_Short_Distance_REP_MOVSB)
> > +BIT (Avoid_Non_Temporal_Memset)
> > diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
> > index 8589a9fd66..94307283d7 100644
> > --- a/sysdeps/x86/tst-hwcap-tunables.c
> > +++ b/sysdeps/x86/tst-hwcap-tunables.c
> > @@ -60,7 +60,7 @@ static const struct test_t
> >      /* Disable everything.  */
> >      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> > -    "-AVX_Fast_Unaligned_Load",
> > +    "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
> >      test_1,
> >      array_length (test_1)
> >    },
> > @@ -68,7 +68,7 @@ static const struct test_t
> >      /* Same as before, but with some empty suboptions.  */
> >      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> > -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> > +    "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
> >      test_1,
> >      array_length (test_1)
> >    }
>
> Ok.
>
diff mbox series

Patch

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index e501e084ef..f473fba216 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -870,11 +870,18 @@  init_cpu_features (struct cpu_features *cpu_features)
 
 	      /* Newer Bigcore microarch (larger non-temporal store
 		 threshold).  */
-	    case INTEL_BIGCORE_SKYLAKE:
-	    case INTEL_BIGCORE_KABYLAKE:
 	    case INTEL_BIGCORE_COMETLAKE:
 	    case INTEL_BIGCORE_SKYLAKE_AVX512:
 	    case INTEL_BIGCORE_CANNONLAKE:
+	      /* Benchmarks indicate non-temporal memset is not
+		     necessarily profitable on SKX (and in some cases much
+		     worse). This is likely unique to SKX due its it unique
+		     mesh interconnect (not present on ICX or BWD). Disable
+		     non-temporal on all Skylake servers. */
+	      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+		  |= bit_arch_Avoid_Non_Temporal_Memset;
+	    case INTEL_BIGCORE_SKYLAKE:
+	    case INTEL_BIGCORE_KABYLAKE:
 	    case INTEL_BIGCORE_ICELAKE:
 	    case INTEL_BIGCORE_TIGERLAKE:
 	    case INTEL_BIGCORE_ROCKETLAKE:
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 89da7a03da..ae9dcd6180 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -243,6 +243,11 @@  TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 		(n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
 	    }
 	  break;
+	case 25:
+	  {
+	    CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+					      Avoid_Non_Temporal_Memset, 25);
+	  }
 	case 26:
 	    {
 	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 5e77345a6e..a1c03b8903 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -991,13 +991,14 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* Non-temporal stores are more performant on Intel and AMD hardware above
      non_temporal_threshold. Enable this for both Intel and AMD hardware. */
   unsigned long int memset_non_temporal_threshold = SIZE_MAX;
-  if (cpu_features->basic.kind == arch_kind_intel
-      || cpu_features->basic.kind == arch_kind_amd)
-      memset_non_temporal_threshold = non_temporal_threshold;
-
-   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
-      cases slower than the vectorized path (and for some alignments,
-      it is really slow, check BZ #30994).  */
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
+      && (cpu_features->basic.kind == arch_kind_intel
+	  || cpu_features->basic.kind == arch_kind_amd))
+    memset_non_temporal_threshold = non_temporal_threshold;
+
+  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+     cases slower than the vectorized path (and for some alignments,
+     it is really slow, check BZ #30994).  */
   if (cpu_features->basic.kind == arch_kind_amd)
     rep_movsb_threshold = non_temporal_threshold;
 
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 85e7f54ec8..61bbbc2e89 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@  BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_FSRM)
 BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Avoid_Non_Temporal_Memset)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
index 8589a9fd66..94307283d7 100644
--- a/sysdeps/x86/tst-hwcap-tunables.c
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -60,7 +60,7 @@  static const struct test_t
     /* Disable everything.  */
     "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
-    "-AVX_Fast_Unaligned_Load",
+    "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
     test_1,
     array_length (test_1)
   },
@@ -68,7 +68,7 @@  static const struct test_t
     /* Same as before, but with some empty suboptions.  */
     ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
-    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
+    "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
     test_1,
     array_length (test_1)
   }