[v2,1/3] x86: Set preferred CPU features on the KH-40000 and KX-7000 Zhaoxin processors

Message ID 20240629035828.4145216-1-MayShao-oc@zhaoxin.com
State New
Series [v2,1/3] x86: Set preferred CPU features on the KH-40000 and KX-7000 Zhaoxin processors

Commit Message

Mayshao-oc June 29, 2024, 3:58 a.m. UTC
Fix code formatting under the Zhaoxin branch and add comments for
different Zhaoxin models.

Unaligned AVX loads are slower on the KH-40000 and KX-7000, so disable
AVX_Fast_Unaligned_Load on those models.

Enable the Prefer_No_VZEROUPPER and Fast_Unaligned_Load features so that
the sse2_unaligned versions of memset, strcpy and strcat are used.
---
 sysdeps/x86/cpu-features.c | 51 ++++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 16 deletions(-)
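
For readers less familiar with this machinery, a minimal sketch of how the
preferred bits above are consumed follows. It is illustrative only: the real
selectors live in sysdeps/x86_64/multiarch/ifunc-*.h and are more involved,
and the variant names below are placeholders, but the bit tests use the same
index_arch_*/bit_arch_* pattern as the patch.

#include <cpu-features.h>  /* glibc-internal header; provides struct
                              cpu_features, index_arch_* and bit_arch_*.  */

/* Illustrative sketch, not the actual glibc resolver: how an ifunc
   selector might consult the preferred bits this patch adjusts.  The
   returned strings are placeholder variant names.  */
static const char *
pick_memset_variant (const struct cpu_features *cpu_features)
{
  /* Cleared above for KH-40000 and KX-7000, so the AVX unaligned
     variants are skipped on those CPUs.  */
  if (cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
      & bit_arch_AVX_Fast_Unaligned_Load)
    return "avx2_unaligned";

  /* Set above (together with Prefer_No_VZEROUPPER) so the sse2_unaligned
     versions of memset, strcpy and strcat are chosen.  */
  if (cpu_features->preferred[index_arch_Fast_Unaligned_Load]
      & bit_arch_Fast_Unaligned_Load)
    return "sse2_unaligned";

  return "sse2";
}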

Comments

Noah Goldstein June 30, 2024, 2:06 a.m. UTC | #1
On Saturday, June 29, 2024, MayShao-oc <MayShao-oc@zhaoxin.com> wrote:

> Fix code formatting under the Zhaoxin branch and add comments for
> different Zhaoxin models.
>
> Unaligned AVX loads are slower on the KH-40000 and KX-7000, so disable
> AVX_Fast_Unaligned_Load on those models.
>
> Enable the Prefer_No_VZEROUPPER and Fast_Unaligned_Load features so that
> the sse2_unaligned versions of memset, strcpy and strcat are used.
> ---
>  sysdeps/x86/cpu-features.c | 51 ++++++++++++++++++++++++++------------
>  1 file changed, 35 insertions(+), 16 deletions(-)
>

LGTM.


Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Mayshao-oc June 30, 2024, 1:40 p.m. UTC | #2
Hi Florian, Ruoyao, H.J. and Carlos,

Could anyone help push this patch series to master? Tomorrow is the freeze
date, but I do not have commit access, and Noah cannot commit it right now
either.

Thank you in advance, everyone.

Best Regards,
May Shao



Patch

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 3d7c2819d7..1927f65699 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -1023,39 +1023,58 @@  https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
 
       model += extended_model;
       if (family == 0x6)
-        {
-          if (model == 0xf || model == 0x19)
-            {
+	{
+	  /* Tuning for older Zhaoxin processors.  */
+	  if (model == 0xf || model == 0x19)
+	    {
 	      CPU_FEATURE_UNSET (cpu_features, AVX);
 	      CPU_FEATURE_UNSET (cpu_features, AVX2);
 
-              cpu_features->preferred[index_arch_Slow_SSE4_2]
-                |= bit_arch_Slow_SSE4_2;
+	      cpu_features->preferred[index_arch_Slow_SSE4_2]
+		  |= bit_arch_Slow_SSE4_2;
 
+	      /*  Unaligned AVX loads are slower.  */
 	      cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
-		&= ~bit_arch_AVX_Fast_Unaligned_Load;
-            }
-        }
+		  &= ~bit_arch_AVX_Fast_Unaligned_Load;
+	    }
+	}
       else if (family == 0x7)
-        {
-	  if (model == 0x1b)
+	{
+	  switch (model)
 	    {
+	      /* Wudaokou microarch tuning.  */
+	    case 0x1b:
 	      CPU_FEATURE_UNSET (cpu_features, AVX);
 	      CPU_FEATURE_UNSET (cpu_features, AVX2);
 
 	      cpu_features->preferred[index_arch_Slow_SSE4_2]
-		|= bit_arch_Slow_SSE4_2;
+		  |= bit_arch_Slow_SSE4_2;
 
 	      cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
-		&= ~bit_arch_AVX_Fast_Unaligned_Load;
-	    }
-	  else if (model == 0x3b)
-	    {
+		  &= ~bit_arch_AVX_Fast_Unaligned_Load;
+	      break;
+
+	      /* Lujiazui microarch tuning.  */
+	    case 0x3b:
 	      CPU_FEATURE_UNSET (cpu_features, AVX);
 	      CPU_FEATURE_UNSET (cpu_features, AVX2);
 
 	      cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
-		&= ~bit_arch_AVX_Fast_Unaligned_Load;
+		  &= ~bit_arch_AVX_Fast_Unaligned_Load;
+	      break;
+
+	      /* Yongfeng and Shijidadao microarch tuning.  */
+	    case 0x5b:
+	    case 0x6b:
+	      cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
+		  &= ~bit_arch_AVX_Fast_Unaligned_Load;
+
+	      /* To use sse2_unaligned versions of memset, strcpy and strcat.
+	       */
+	      cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+		  |= (bit_arch_Prefer_No_VZEROUPPER
+		      | bit_arch_Fast_Unaligned_Load);
+	      break;
 	    }
 	}
     }
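
A note on the Yongfeng and Shijidadao case above: Prefer_No_VZEROUPPER and
Fast_Unaligned_Load are set with a single OR into
preferred[index_arch_Prefer_No_VZEROUPPER]. Assuming, as the patch relies on,
that both features are defined in the same preferred-feature word (so
index_arch_Prefer_No_VZEROUPPER and index_arch_Fast_Unaligned_Load name the
same array slot), the combined statement is equivalent to this more explicit
sketch:

  /* Equivalent, more explicit form of the combined OR in the patch;
     valid only if both bits live in the same preferred[] word.  */
  cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
    |= bit_arch_Prefer_No_VZEROUPPER;
  cpu_features->preferred[index_arch_Fast_Unaligned_Load]
    |= bit_arch_Fast_Unaligned_Load;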