
[v3,2/2] x86: Continue building memmove-ssse3.S as ISA level V3

Message ID 20220718132530.1847929-2-goldstein.w.n@gmail.com
State New
Series [v1] x86: Fix type of `Slow_SSE4_2` def in isa-level.h

Commit Message

Noah Goldstein July 18, 2022, 1:25 p.m. UTC
Some V3 processors still strongly prefer memmove-ssse3.S because it is
heavily optimized to avoid unaligned memory accesses.

Tested builds for x86-64 v1, v2, v3, and v4 with and without
multiarch.
---
 sysdeps/x86/isa-level.h                    | 15 +++++++++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 30 +++++++++++++---------
 sysdeps/x86_64/multiarch/ifunc-memmove.h   | 14 +++++-----
 sysdeps/x86_64/multiarch/memmove-ssse3.S   |  4 ++-
 4 files changed, 44 insertions(+), 19 deletions(-)
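As a rough, stand-alone model of what the build gate is doing (this is not
the real ISA_SHOULD_BUILD from sysdeps/x86/isa-level.h; ISA_SHOULD_BUILD_MODEL
and the default MINIMUM_X86_ISA_LEVEL value below are invented for the demo),
the sketch shows why bumping the guard in memmove-ssse3.S from 2 to 3 keeps
the file in a V3 build while a V4 build still drops it:

/* Illustration only: simplified stand-in for the ISA-level build gate.  */
#include <stdio.h>

/* Stand-in for the build-time minimum ISA level glibc derives from the
   compiler flags (e.g. -march=x86-64-v3).  */
#ifndef MINIMUM_X86_ISA_LEVEL
# define MINIMUM_X86_ISA_LEVEL 3
#endif

/* An implementation guarded at level N is kept as long as the build's
   minimum ISA level does not exceed N.  */
#define ISA_SHOULD_BUILD_MODEL(n) ((n) >= MINIMUM_X86_ISA_LEVEL)

int
main (void)
{
  printf ("V%d build, guard 2: %s\n", MINIMUM_X86_ISA_LEVEL,
          ISA_SHOULD_BUILD_MODEL (2) ? "built" : "skipped");
  printf ("V%d build, guard 3: %s\n", MINIMUM_X86_ISA_LEVEL,
          ISA_SHOULD_BUILD_MODEL (3) ? "built" : "skipped");
  return 0;
}

Rebuilding with -DMINIMUM_X86_ISA_LEVEL=4 flips the guard-3 case back to
"skipped", which is why the SSSE3 file still disappears from V4 builds.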

Patch

diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index fe56af7e2b..f49336acf3 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -90,6 +90,14 @@ 
 
 /* For X86_ISA_CPU_FEATURES_ARCH_P.  */
 
+
+/* NB: This is just an alias to `AVX_Fast_Unaligned_Load` that keeps
+   the runtime check until the build ISA level is >= 4.  This is for
+   some Zhaoxin CPUs which build at ISA level V3 but still have a
+   strong preference for avoiding unaligned `ymm` loads.  */
+#define V4_AVX_Fast_Unaligned_Load_X86_ISA_LEVEL 4
+#define V4_AVX_Fast_Unaligned_Load AVX_Fast_Unaligned_Load
+
 /* NB: This feature is enabled when ISA level >= 3, which was disabled
    for the following CPUs:
         - AMD Excavator
@@ -106,6 +114,13 @@ 
    this feature don't run on glibc built with ISA level >= 3.  */
 #define Slow_SSE4_2_X86_ISA_LEVEL 3
 
+/* NB: This is just an alias to `Fast_Unaligned_Copy` that keeps
+   the runtime check until the build ISA level is >= 3.  This is for
+   some Zhaoxin CPUs which build at ISA level V3 but still have a
+   strong preference for avoiding unaligned copies.  */
+#define V3_Fast_Unaligned_Copy_X86_ISA_LEVEL 3
+#define V3_Fast_Unaligned_Copy Fast_Unaligned_Copy
+
 /* Feature(s) enabled when ISA level >= 2.  */
 #define Fast_Unaligned_Load_X86_ISA_LEVEL 2
 
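A similar sketch may help with the two new aliases.  This is not the real
X86_ISA_CPU_FEATURES_ARCH_P; MODEL_ARCH_P, the *_LEVEL constants and the demo
CPU bits are invented to illustrate the assumed behavior, namely that below
the alias's ISA level the (optionally negated) runtime bit is consulted,
while at or above it the check becomes statically true:

/* Illustration only: simplified stand-in for the ISA-gated feature check.  */
#include <stdio.h>

#ifndef MINIMUM_X86_ISA_LEVEL
# define MINIMUM_X86_ISA_LEVEL 2   /* Pretend this is a V2 build.  */
#endif

#define V4_AVX_Fast_Unaligned_Load_LEVEL 4
#define V3_Fast_Unaligned_Copy_LEVEL     3

#define MODEL_ARCH_P(runtime_bit, feature_level, negate)        \
  ((feature_level) <= MINIMUM_X86_ISA_LEVEL                     \
   ? 1 : ((negate) ? !(runtime_bit) : (runtime_bit)))

int
main (void)
{
  /* Hypothetical CPU that dislikes unaligned ymm loads.  */
  int avx_fast_unaligned_load = 0;
  int fast_unaligned_copy = 0;

  /* The V4_ alias keeps the AVX preference a runtime check on V1-V3
     builds; only a V4 build folds it to 1.  */
  printf ("take AVX path:    %d\n",
          MODEL_ARCH_P (avx_fast_unaligned_load,
                        V4_AVX_Fast_Unaligned_Load_LEVEL, 0));

  /* The V3_ alias keeps the negated copy check a runtime test on V1/V2
     builds and folds it to 1 on V3+ builds, keeping the SSSE3 path
     reachable there.  */
  printf ("allow SSSE3 path: %d\n",
          MODEL_ARCH_P (fast_unaligned_copy,
                        V3_Fast_Unaligned_Copy_LEVEL, 1));
  return 0;
}

Under that assumption, on a V3 build the AVX preference is still decided at
runtime (because of the V4_ alias) while the negated Fast_Unaligned_Copy
check folds away, which is what lets a V3-built Zhaoxin CPU reach
__memmove_ssse3.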
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a71444eccb..427f127427 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -143,8 +143,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_chk_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads, so keep the SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memmove_chk_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -190,8 +191,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, memmove,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads, so keep the SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memmove_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1004,8 +1006,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_chk_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads, so keep the SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memcpy_chk_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1051,8 +1054,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads, so keep the SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __memcpy_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1098,8 +1102,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_chk_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads, so keep the SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __mempcpy_chk_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1145,8 +1150,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_avx_unaligned_erms_rtm)
-	      /* By V3 we assume fast aligned copy.  */
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy,
+	      /* Some V3 implementations still heavily prefer aligned
+	         loads, so keep the SSSE3 implementation around.  */
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __mempcpy_ssse3)
 	      /* ISA V2 wrapper for SSE2 implementation because the SSE2
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 1643d32887..be0c758783 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -72,7 +72,7 @@  IFUNC_SELECTOR (void)
     }
 
   if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
-				   AVX_Fast_Unaligned_Load, ))
+				   V4_AVX_Fast_Unaligned_Load, ))
     {
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 	{
@@ -101,11 +101,13 @@  IFUNC_SELECTOR (void)
     }
 
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
-      /* Leave this as runtime check.  The SSSE3 is optimized almost
-         exclusively for avoiding unaligned memory access during the
-         copy and by and large is not better than the sse2
-         implementation as a general purpose memmove.  */
-      && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+      /* Leave this as a runtime check for V2.  For V3 and above the
+         check is assumed to pass.  The SSSE3 version is optimized
+         almost exclusively for avoiding unaligned memory accesses
+         during the copy and by and large is not better than the SSE2
+         implementation as a general purpose memmove.  */
+      && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				      V3_Fast_Unaligned_Copy, !))
     {
       return OPTIMIZE (ssse3);
     }
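Putting the pieces together, a hedged sketch of how the selector above is
expected to resolve on a V3 build (plain C with invented booleans rather than
the real ifunc selector; pick_memmove_v3_build is made up for the illustration
and the ERMS/RTM/AVX512 sub-cases are collapsed into a single branch):

/* Illustration only: models the shape of the V3-build selection.  */
#include <stdio.h>

static const char *
pick_memmove_v3_build (int avx_fast_unaligned_load, int ssse3_usable)
{
  /* With the V4_AVX_Fast_Unaligned_Load alias the preference bit is
     still consulted at runtime on a V3 build.  */
  if (avx_fast_unaligned_load)
    return "__memmove_avx_unaligned_erms (or an AVX512VL/EVEX variant)";

  /* The negated V3_Fast_Unaligned_Copy check folds to true on a V3
     build, so only the SSSE3 usable bit matters here.  */
  if (ssse3_usable)
    return "__memmove_ssse3";

  return "__memmove_sse2_unaligned_erms";
}

int
main (void)
{
  /* Zhaoxin-like V3 CPU from the commit message: prefers aligned copies.  */
  printf ("%s\n", pick_memmove_v3_build (0, 1));
  /* Typical V3 CPU with fast unaligned ymm loads.  */
  printf ("%s\n", pick_memmove_v3_build (1, 1));
  return 0;
}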
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 57599752c7..15cafee766 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -20,7 +20,9 @@ 
 
 #include <isa-level.h>
 
-#if ISA_SHOULD_BUILD (2)
+/* Continue building up to ISA level V3, as some V3 CPUs strongly
+   prefer this implementation.  */
+#if ISA_SHOULD_BUILD (3)
 
 # include <sysdep.h>
 # ifndef MEMMOVE