diff mbox series

[v3,1/2] x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path

Message ID 20240814063731.3014055-1-goldstein.w.n@gmail.com
State New
Headers show
Series [v3,1/2] x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path | expand

Commit Message

Noah Goldstein Aug. 14, 2024, 6:37 a.m. UTC
This is just a refactor and there should be no behavioral change from
this commit.

The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
for controlling whether we use non-temporal memset rather than having
extra logic based on vendor.
---
 sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
 sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
 2 files changed, 23 insertions(+), 8 deletions(-)

Comments

H.J. Lu Aug. 14, 2024, 11:13 a.m. UTC | #1
On Tue, Aug 13, 2024 at 11:37 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This is just a refactor and there should be no behavioral change from
> this commit.
>
> The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
> for controlling whether we use non-temporal memset rather than having
> extra logic based on vendor.
> ---
>  sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
>  sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
>  2 files changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 18ed008040..a4786d23c7 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -756,6 +756,12 @@ init_cpu_features (struct cpu_features *cpu_features)
>    unsigned int stepping = 0;
>    enum cpu_features_kind kind;
>
> +  /* Default is to avoid non-temporal memset on non-Intel/AMD hardware.  As of
> +     writing this, we only have benchmarks indicating its profitability on
> +     Intel/AMD.  */
> +  cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +      |= bit_arch_Avoid_Non_Temporal_Memset;
> +
>    cpu_features->cachesize_non_temporal_divisor = 4;
>  #if !HAS_CPUID
>    if (__get_cpuid_max (0, 0) == 0)
> @@ -781,6 +787,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>
>        update_active (cpu_features);
>
> +      /* Benchmarks indicate non-temporal memset can be profitable on Intel
> +       hardware.  */
> +      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +         &= ~bit_arch_Avoid_Non_Temporal_Memset;
> +
>        if (family == 0x06)
>         {
>           model += extended_model;
> @@ -992,6 +1003,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
>
>        ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
>
> +      /* Benchmarks indicate non-temporal memset can be profitable on AMD
> +       hardware.  */
> +      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +         &= ~bit_arch_Avoid_Non_Temporal_Memset;
> +
>        if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
>         {
>           /* Since the FMA4 bit is in CPUID_INDEX_80000001 and
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index a1c03b8903..3d0c8d43b8 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
>      rep_movsb_threshold = 2112;
>
> -  /* Non-temporal stores are more performant on Intel and AMD hardware above
> -     non_temporal_threshold. Enable this for both Intel and AMD hardware. */
> -  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> -  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
> -      && (cpu_features->basic.kind == arch_kind_intel
> -         || cpu_features->basic.kind == arch_kind_amd))
> -    memset_non_temporal_threshold = non_temporal_threshold;
> -
>    /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
>       cases slower than the vectorized path (and for some alignments,
>       it is really slow, check BZ #30994).  */
> @@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (tunable_size != 0)
>      shared = tunable_size;
>
> +  /* Non-temporal stores are more performant on some hardware above
> +     non_temporal_threshold.  Currently Avoid_Non_Temporal_Memset is cleared
> +     by default for both Intel and AMD hardware.  */
> +  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
> +    memset_non_temporal_threshold = non_temporal_threshold;
> +
>    tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
>    if (tunable_size > minimum_non_temporal_threshold
>        && tunable_size <= maximum_non_temporal_threshold)
> --
> 2.34.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
diff mbox series

Patch

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 18ed008040..a4786d23c7 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -756,6 +756,12 @@  init_cpu_features (struct cpu_features *cpu_features)
   unsigned int stepping = 0;
   enum cpu_features_kind kind;
 
+  /* Default is to avoid non-temporal memset on non-Intel/AMD hardware.  As of
+     writing this, we only have benchmarks indicating its profitability on
+     Intel/AMD.  */
+  cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+      |= bit_arch_Avoid_Non_Temporal_Memset;
+
   cpu_features->cachesize_non_temporal_divisor = 4;
 #if !HAS_CPUID
   if (__get_cpuid_max (0, 0) == 0)
@@ -781,6 +787,11 @@  init_cpu_features (struct cpu_features *cpu_features)
 
       update_active (cpu_features);
 
+      /* Benchmarks indicate non-temporal memset can be profitable on Intel
+	hardware.  */
+      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+	  &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
       if (family == 0x06)
 	{
 	  model += extended_model;
@@ -992,6 +1003,11 @@  https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
 
       ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
 
+      /* Benchmarks indicate non-temporal memset can be profitable on AMD
+	hardware.  */
+      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+	  &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
 	{
 	  /* Since the FMA4 bit is in CPUID_INDEX_80000001 and
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index a1c03b8903..3d0c8d43b8 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -988,14 +988,6 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
-  /* Non-temporal stores are more performant on Intel and AMD hardware above
-     non_temporal_threshold. Enable this for both Intel and AMD hardware. */
-  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
-      && (cpu_features->basic.kind == arch_kind_intel
-	  || cpu_features->basic.kind == arch_kind_amd))
-    memset_non_temporal_threshold = non_temporal_threshold;
-
   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
      cases slower than the vectorized path (and for some alignments,
      it is really slow, check BZ #30994).  */
@@ -1017,6 +1009,13 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (tunable_size != 0)
     shared = tunable_size;
 
+  /* Non-temporal stores are more performant on some hardware above
+     non_temporal_threshold.  Currently Avoid_Non_Temporal_Memset is cleared
+     by default for both Intel and AMD hardware.  */
+  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+    memset_non_temporal_threshold = non_temporal_threshold;
+
   tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
   if (tunable_size > minimum_non_temporal_threshold
       && tunable_size <= maximum_non_temporal_threshold)