diff mbox series

x86: Disable non-temporal memset on Skylake Server

Message ID 20240716053740.2483193-1-goldstein.w.n@gmail.com
State New
Headers show
Series x86: Disable non-temporal memset on Skylake Server | expand

Commit Message

Noah Goldstein July 16, 2024, 5:37 a.m. UTC
The original commit enabling non-temporal memset on Skylake Server had
erroneous benchmarks (actually done on ICX).

Further benchmarks indicate non-temporal stores may in fact be a
regression on Skylake Server.

This commit may be over-cautious in some cases, but should avoid any
regressions for 2.40.

Tested using qemu on all x86_64 cpu arch supported by both qemu +
GLIBC.
---
 sysdeps/x86/cpu-features.c                        | 13 ++++++++++---
 sysdeps/x86/cpu-tunables.c                        |  5 +++++
 sysdeps/x86/dl-cacheinfo.h                        | 15 ++++++++-------
 .../cpu-features-preferred_feature_index_1.def    |  1 +
 sysdeps/x86/tst-hwcap-tunables.c                  |  4 ++--
 5 files changed, 26 insertions(+), 12 deletions(-)

Comments

H.J. Lu July 16, 2024, 5:59 a.m. UTC | #1
On Tue, Jul 16, 2024, 1:39 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> The original commit enabling non-temporal memset on Skylake Server had
> erroneous benchmarks (actually done on ICX).
>
> Further benchmarks indicate non-temporal stores may in fact by a
> regression on Skylake Server.
>
> This commit may be over-cautious in some cases, but should avoid any
> regressions for 2.40.
>
> Tested using qemu on all x86_64 cpu arch supported by both qemu +
> GLIBC.
> ---
>  sysdeps/x86/cpu-features.c                        | 13 ++++++++++---
>  sysdeps/x86/cpu-tunables.c                        |  5 +++++
>  sysdeps/x86/dl-cacheinfo.h                        | 15 ++++++++-------
>  .../cpu-features-preferred_feature_index_1.def    |  1 +
>  sysdeps/x86/tst-hwcap-tunables.c                  |  4 ++--
>  5 files changed, 26 insertions(+), 12 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index e501e084ef..c096dd390a 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -870,11 +870,18 @@ init_cpu_features (struct cpu_features *cpu_features)
>
>               /* Newer Bigcore microarch (larger non-temporal store
>                  threshold).  */
> -           case INTEL_BIGCORE_SKYLAKE:
> -           case INTEL_BIGCORE_KABYLAKE:
> -           case INTEL_BIGCORE_COMETLAKE:
>             case INTEL_BIGCORE_SKYLAKE_AVX512:
>             case INTEL_BIGCORE_CANNONLAKE:
> +             /* Benchmarks indicate non-temporal memset is not
> +                    necessarily profitable on SKX (and in some cases much
> +                    worse). This is likely unique to SKX due its it unique
> +                    mesh interconnect (not present on ICX or BWD). Disable
> +                    non-temporal on all Skylake servers. */
> +             cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +                 |= bit_arch_Avoid_Non_Temporal_Memset;
> +           case INTEL_BIGCORE_COMETLAKE:
> +           case INTEL_BIGCORE_SKYLAKE:
> +           case INTEL_BIGCORE_KABYLAKE:
>             case INTEL_BIGCORE_ICELAKE:
>             case INTEL_BIGCORE_TIGERLAKE:
>             case INTEL_BIGCORE_ROCKETLAKE:
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> index 89da7a03da..ae9dcd6180 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -243,6 +243,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
>                 (n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
>             }
>           break;
> +       case 25:
> +         {
> +           CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> +                                             Avoid_Non_Temporal_Memset,
> 25);
> +         }
>         case 26:
>             {
>               CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 5e77345a6e..a1c03b8903 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -991,13 +991,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    /* Non-temporal stores are more performant on Intel and AMD hardware
> above
>       non_temporal_threshold. Enable this for both Intel and AMD hardware.
> */
>    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> -  if (cpu_features->basic.kind == arch_kind_intel
> -      || cpu_features->basic.kind == arch_kind_amd)
> -      memset_non_temporal_threshold = non_temporal_threshold;
> -
> -   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> -      cases slower than the vectorized path (and for some alignments,
> -      it is really slow, check BZ #30994).  */
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> +      && (cpu_features->basic.kind == arch_kind_intel
> +         || cpu_features->basic.kind == arch_kind_amd))
> +    memset_non_temporal_threshold = non_temporal_threshold;
> +
> +  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> +     cases slower than the vectorized path (and for some alignments,
> +     it is really slow, check BZ #30994).  */
>    if (cpu_features->basic.kind == arch_kind_amd)
>      rep_movsb_threshold = non_temporal_threshold;
>
> diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 85e7f54ec8..61bbbc2e89 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>  BIT (MathVec_Prefer_No_AVX512)
>  BIT (Prefer_FSRM)
>  BIT (Avoid_Short_Distance_REP_MOVSB)
> +BIT (Avoid_Non_Temporal_Memset)
> diff --git a/sysdeps/x86/tst-hwcap-tunables.c
> b/sysdeps/x86/tst-hwcap-tunables.c
> index 8589a9fd66..94307283d7 100644
> --- a/sysdeps/x86/tst-hwcap-tunables.c
> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> @@ -60,7 +60,7 @@ static const struct test_t
>      /* Disable everything.  */
>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> -    "-AVX_Fast_Unaligned_Load",
> +    "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
>      test_1,
>      array_length (test_1)
>    },
> @@ -68,7 +68,7 @@ static const struct test_t
>      /* Same as before, but with some empty suboptions.  */
>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
>      test_1,
>      array_length (test_1)
>    }
> --
> 2.34.1
>

LGTM.

Thanks.

H.J.

>
>
>
Noah Goldstein July 16, 2024, 6:13 a.m. UTC | #2
On Tue, Jul 16, 2024 at 1:59 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jul 16, 2024, 1:39 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> The original commit enabling non-temporal memset on Skylake Server had
>> erroneous benchmarks (actually done on ICX).
>>
>> Further benchmarks indicate non-temporal stores may in fact by a
>> regression on Skylake Server.
>>
>> This commit may be over-cautious in some cases, but should avoid any
>> regressions for 2.40.
>>
>> Tested using qemu on all x86_64 cpu arch supported by both qemu +
>> GLIBC.
>> ---
>>  sysdeps/x86/cpu-features.c                        | 13 ++++++++++---
>>  sysdeps/x86/cpu-tunables.c                        |  5 +++++
>>  sysdeps/x86/dl-cacheinfo.h                        | 15 ++++++++-------
>>  .../cpu-features-preferred_feature_index_1.def    |  1 +
>>  sysdeps/x86/tst-hwcap-tunables.c                  |  4 ++--
>>  5 files changed, 26 insertions(+), 12 deletions(-)
>>
>> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
>> index e501e084ef..c096dd390a 100644
>> --- a/sysdeps/x86/cpu-features.c
>> +++ b/sysdeps/x86/cpu-features.c
>> @@ -870,11 +870,18 @@ init_cpu_features (struct cpu_features *cpu_features)
>>
>>               /* Newer Bigcore microarch (larger non-temporal store
>>                  threshold).  */
>> -           case INTEL_BIGCORE_SKYLAKE:
>> -           case INTEL_BIGCORE_KABYLAKE:
>> -           case INTEL_BIGCORE_COMETLAKE:
>>             case INTEL_BIGCORE_SKYLAKE_AVX512:
>>             case INTEL_BIGCORE_CANNONLAKE:
>> +             /* Benchmarks indicate non-temporal memset is not
>> +                    necessarily profitable on SKX (and in some cases much
>> +                    worse). This is likely unique to SKX due its it unique
>> +                    mesh interconnect (not present on ICX or BWD). Disable
>> +                    non-temporal on all Skylake servers. */
>> +             cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
>> +                 |= bit_arch_Avoid_Non_Temporal_Memset;
>> +           case INTEL_BIGCORE_COMETLAKE:
>> +           case INTEL_BIGCORE_SKYLAKE:
>> +           case INTEL_BIGCORE_KABYLAKE:
>>             case INTEL_BIGCORE_ICELAKE:
>>             case INTEL_BIGCORE_TIGERLAKE:
>>             case INTEL_BIGCORE_ROCKETLAKE:
>> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
>> index 89da7a03da..ae9dcd6180 100644
>> --- a/sysdeps/x86/cpu-tunables.c
>> +++ b/sysdeps/x86/cpu-tunables.c
>> @@ -243,6 +243,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
>>                 (n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
>>             }
>>           break;
>> +       case 25:
>> +         {
>> +           CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
>> +                                             Avoid_Non_Temporal_Memset, 25);
>> +         }
>>         case 26:
>>             {
>>               CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
>> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
>> index 5e77345a6e..a1c03b8903 100644
>> --- a/sysdeps/x86/dl-cacheinfo.h
>> +++ b/sysdeps/x86/dl-cacheinfo.h
>> @@ -991,13 +991,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>>    /* Non-temporal stores are more performant on Intel and AMD hardware above
>>       non_temporal_threshold. Enable this for both Intel and AMD hardware. */
>>    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
>> -  if (cpu_features->basic.kind == arch_kind_intel
>> -      || cpu_features->basic.kind == arch_kind_amd)
>> -      memset_non_temporal_threshold = non_temporal_threshold;
>> -
>> -   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
>> -      cases slower than the vectorized path (and for some alignments,
>> -      it is really slow, check BZ #30994).  */
>> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
>> +      && (cpu_features->basic.kind == arch_kind_intel
>> +         || cpu_features->basic.kind == arch_kind_amd))
>> +    memset_non_temporal_threshold = non_temporal_threshold;
>> +
>> +  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
>> +     cases slower than the vectorized path (and for some alignments,
>> +     it is really slow, check BZ #30994).  */
>>    if (cpu_features->basic.kind == arch_kind_amd)
>>      rep_movsb_threshold = non_temporal_threshold;
>>
>> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> index 85e7f54ec8..61bbbc2e89 100644
>> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>>  BIT (MathVec_Prefer_No_AVX512)
>>  BIT (Prefer_FSRM)
>>  BIT (Avoid_Short_Distance_REP_MOVSB)
>> +BIT (Avoid_Non_Temporal_Memset)
>> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
>> index 8589a9fd66..94307283d7 100644
>> --- a/sysdeps/x86/tst-hwcap-tunables.c
>> +++ b/sysdeps/x86/tst-hwcap-tunables.c
>> @@ -60,7 +60,7 @@ static const struct test_t
>>      /* Disable everything.  */
>>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
>> -    "-AVX_Fast_Unaligned_Load",
>> +    "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
>>      test_1,
>>      array_length (test_1)
>>    },
>> @@ -68,7 +68,7 @@ static const struct test_t
>>      /* Same as before, but with some empty suboptions.  */
>>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
>>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
>> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
>> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
>>      test_1,
>>      array_length (test_1)
>>    }
>> --
>> 2.34.1
>
>
> LGTM.
>
> Thanks.
>
> H.J.

Can I add your Reviewed-by tag?
>>
>>
>>
H.J. Lu July 16, 2024, 6:22 a.m. UTC | #3
On Tue, Jul 16, 2024, 2:13 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> On Tue, Jul 16, 2024 at 1:59 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Jul 16, 2024, 1:39 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >>
> >> The original commit enabling non-temporal memset on Skylake Server had
> >> erroneous benchmarks (actually done on ICX).
> >>
> >> Further benchmarks indicate non-temporal stores may in fact by a
> >> regression on Skylake Server.
> >>
> >> This commit may be over-cautious in some cases, but should avoid any
> >> regressions for 2.40.
> >>
> >> Tested using qemu on all x86_64 cpu arch supported by both qemu +
> >> GLIBC.
> >> ---
> >>  sysdeps/x86/cpu-features.c                        | 13 ++++++++++---
> >>  sysdeps/x86/cpu-tunables.c                        |  5 +++++
> >>  sysdeps/x86/dl-cacheinfo.h                        | 15 ++++++++-------
> >>  .../cpu-features-preferred_feature_index_1.def    |  1 +
> >>  sysdeps/x86/tst-hwcap-tunables.c                  |  4 ++--
> >>  5 files changed, 26 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> >> index e501e084ef..c096dd390a 100644
> >> --- a/sysdeps/x86/cpu-features.c
> >> +++ b/sysdeps/x86/cpu-features.c
> >> @@ -870,11 +870,18 @@ init_cpu_features (struct cpu_features
> *cpu_features)
> >>
> >>               /* Newer Bigcore microarch (larger non-temporal store
> >>                  threshold).  */
> >> -           case INTEL_BIGCORE_SKYLAKE:
> >> -           case INTEL_BIGCORE_KABYLAKE:
> >> -           case INTEL_BIGCORE_COMETLAKE:
> >>             case INTEL_BIGCORE_SKYLAKE_AVX512:
> >>             case INTEL_BIGCORE_CANNONLAKE:
> >> +             /* Benchmarks indicate non-temporal memset is not
> >> +                    necessarily profitable on SKX (and in some cases
> much
> >> +                    worse). This is likely unique to SKX due its it
> unique
> >> +                    mesh interconnect (not present on ICX or BWD).
> Disable
> >> +                    non-temporal on all Skylake servers. */
> >> +
>  cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> >> +                 |= bit_arch_Avoid_Non_Temporal_Memset;
> >> +           case INTEL_BIGCORE_COMETLAKE:
> >> +           case INTEL_BIGCORE_SKYLAKE:
> >> +           case INTEL_BIGCORE_KABYLAKE:
> >>             case INTEL_BIGCORE_ICELAKE:
> >>             case INTEL_BIGCORE_TIGERLAKE:
> >>             case INTEL_BIGCORE_ROCKETLAKE:
> >> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> >> index 89da7a03da..ae9dcd6180 100644
> >> --- a/sysdeps/x86/cpu-tunables.c
> >> +++ b/sysdeps/x86/cpu-tunables.c
> >> @@ -243,6 +243,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> >>                 (n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F,
> 24);
> >>             }
> >>           break;
> >> +       case 25:
> >> +         {
> >> +           CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
> >> +
>  Avoid_Non_Temporal_Memset, 25);
> >> +         }
> >>         case 26:
> >>             {
> >>               CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
> >> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> >> index 5e77345a6e..a1c03b8903 100644
> >> --- a/sysdeps/x86/dl-cacheinfo.h
> >> +++ b/sysdeps/x86/dl-cacheinfo.h
> >> @@ -991,13 +991,14 @@ dl_init_cacheinfo (struct cpu_features
> *cpu_features)
> >>    /* Non-temporal stores are more performant on Intel and AMD hardware
> above
> >>       non_temporal_threshold. Enable this for both Intel and AMD
> hardware. */
> >>    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> >> -  if (cpu_features->basic.kind == arch_kind_intel
> >> -      || cpu_features->basic.kind == arch_kind_amd)
> >> -      memset_non_temporal_threshold = non_temporal_threshold;
> >> -
> >> -   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> >> -      cases slower than the vectorized path (and for some alignments,
> >> -      it is really slow, check BZ #30994).  */
> >> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> >> +      && (cpu_features->basic.kind == arch_kind_intel
> >> +         || cpu_features->basic.kind == arch_kind_amd))
> >> +    memset_non_temporal_threshold = non_temporal_threshold;
> >> +
> >> +  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> >> +     cases slower than the vectorized path (and for some alignments,
> >> +     it is really slow, check BZ #30994).  */
> >>    if (cpu_features->basic.kind == arch_kind_amd)
> >>      rep_movsb_threshold = non_temporal_threshold;
> >>
> >> diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> >> index 85e7f54ec8..61bbbc2e89 100644
> >> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> >> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> >> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> >>  BIT (MathVec_Prefer_No_AVX512)
> >>  BIT (Prefer_FSRM)
> >>  BIT (Avoid_Short_Distance_REP_MOVSB)
> >> +BIT (Avoid_Non_Temporal_Memset)
> >> diff --git a/sysdeps/x86/tst-hwcap-tunables.c
> b/sysdeps/x86/tst-hwcap-tunables.c
> >> index 8589a9fd66..94307283d7 100644
> >> --- a/sysdeps/x86/tst-hwcap-tunables.c
> >> +++ b/sysdeps/x86/tst-hwcap-tunables.c
> >> @@ -60,7 +60,7 @@ static const struct test_t
> >>      /* Disable everything.  */
> >>      "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
> >> -    "-AVX_Fast_Unaligned_Load",
> >> +    "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
> >>      test_1,
> >>      array_length (test_1)
> >>    },
> >> @@ -68,7 +68,7 @@ static const struct test_t
> >>      /* Same as before, but with some empty suboptions.  */
> >>      ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
> >>      "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
> >> -    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
> >> +    "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
> >>      test_1,
> >>      array_length (test_1)
> >>    }
> >> --
> >> 2.34.1
> >
> >
> > LGTM.
> >
> > Thanks.
> >
> > H.J.
>
> Can I add you reviewed by tag?
>

Yes.

>>
> >>
> >>
>
>
diff mbox series

Patch

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index e501e084ef..c096dd390a 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -870,11 +870,18 @@  init_cpu_features (struct cpu_features *cpu_features)
 
 	      /* Newer Bigcore microarch (larger non-temporal store
 		 threshold).  */
-	    case INTEL_BIGCORE_SKYLAKE:
-	    case INTEL_BIGCORE_KABYLAKE:
-	    case INTEL_BIGCORE_COMETLAKE:
 	    case INTEL_BIGCORE_SKYLAKE_AVX512:
 	    case INTEL_BIGCORE_CANNONLAKE:
+	      /* Benchmarks indicate non-temporal memset is not
+		     necessarily profitable on SKX (and in some cases much
+		     worse). This is likely unique to SKX due to its unique
+		     mesh interconnect (not present on ICX or BWD). Disable
+		     non-temporal on all Skylake servers. */
+	      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+		  |= bit_arch_Avoid_Non_Temporal_Memset;
+	    case INTEL_BIGCORE_COMETLAKE:
+	    case INTEL_BIGCORE_SKYLAKE:
+	    case INTEL_BIGCORE_KABYLAKE:
 	    case INTEL_BIGCORE_ICELAKE:
 	    case INTEL_BIGCORE_TIGERLAKE:
 	    case INTEL_BIGCORE_ROCKETLAKE:
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 89da7a03da..ae9dcd6180 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -243,6 +243,11 @@  TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 		(n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
 	    }
 	  break;
+	case 25:
+	  {
+	    CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+					      Avoid_Non_Temporal_Memset, 25);
+	  }
 	case 26:
 	    {
 	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 5e77345a6e..a1c03b8903 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -991,13 +991,14 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* Non-temporal stores are more performant on Intel and AMD hardware above
      non_temporal_threshold. Enable this for both Intel and AMD hardware. */
   unsigned long int memset_non_temporal_threshold = SIZE_MAX;
-  if (cpu_features->basic.kind == arch_kind_intel
-      || cpu_features->basic.kind == arch_kind_amd)
-      memset_non_temporal_threshold = non_temporal_threshold;
-
-   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
-      cases slower than the vectorized path (and for some alignments,
-      it is really slow, check BZ #30994).  */
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
+      && (cpu_features->basic.kind == arch_kind_intel
+	  || cpu_features->basic.kind == arch_kind_amd))
+    memset_non_temporal_threshold = non_temporal_threshold;
+
+  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+     cases slower than the vectorized path (and for some alignments,
+     it is really slow, check BZ #30994).  */
   if (cpu_features->basic.kind == arch_kind_amd)
     rep_movsb_threshold = non_temporal_threshold;
 
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 85e7f54ec8..61bbbc2e89 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@  BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_FSRM)
 BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Avoid_Non_Temporal_Memset)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
index 8589a9fd66..94307283d7 100644
--- a/sysdeps/x86/tst-hwcap-tunables.c
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -60,7 +60,7 @@  static const struct test_t
     /* Disable everything.  */
     "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
-    "-AVX_Fast_Unaligned_Load",
+    "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
     test_1,
     array_length (test_1)
   },
@@ -68,7 +68,7 @@  static const struct test_t
     /* Same as before, but with some empty suboptions.  */
     ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
-    "-ERMS,-AVX_Fast_Unaligned_Load,-,",
+    "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
     test_1,
     array_length (test_1)
   }