@@ -90,6 +90,14 @@
/* For X86_ISA_CPU_FEATURES_ARCH_P. */
+
+/* NB: This is just an alias of `AVX_Fast_Unaligned_Load` that keeps
+ doing the runtime check until the build's minimum ISA level
+ reaches 4. This is for some Zhaoxin CPUs which support ISA level
+ V3 but still have a strong preference for avoiding unaligned
+ `ymm` loads. */
+#define V4_AVX_Fast_Unaligned_Load_X86_ISA_LEVEL 4
+#define V4_AVX_Fast_Unaligned_Load AVX_Fast_Unaligned_Load
+
/* NB: This feature is enabled when ISA level >= 3, which was disabled
for the following CPUs:
- AMD Excavator
@@ -106,6 +114,13 @@
this feature don't run on glibc built with ISA level >= 3. */
#define Slow_SSE4_2_X86_ISA_LEVEL 3
+/* NB: This is just an alias of `Fast_Unaligned_Copy` that keeps
+ doing the runtime check until the build's minimum ISA level
+ reaches 3. This is for some Zhaoxin CPUs which support ISA level
+ V3 but still have a strong preference for avoiding unaligned
+ loads during copies. */
+#define V3_Fast_Unaligned_Copy_X86_ISA_LEVEL 3
+#define V3_Fast_Unaligned_Copy Fast_Unaligned_Copy
+
/* Feature(s) enabled when ISA level >= 2. */
#define Fast_Unaligned_Load_X86_ISA_LEVEL 2
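
To make the effect of the two new aliases concrete, here is a minimal standalone model of the gate. It is a sketch under assumptions: it mimics, rather than copies, how a feature check folds to a compile-time result once the build's minimum ISA level reaches the feature's *_X86_ISA_LEVEL, and every MODEL_* name is invented for the illustration.

    /* Hypothetical standalone model -- not glibc code.  A gate folds to a
       compile-time "assumed" result once the build's minimum ISA level
       reaches the feature's level, and stays a runtime check below that.  */
    #include <stdio.h>

    #define MODEL_MINIMUM_X86_ISA_LEVEL 3   /* Pretend glibc is built for x86-64-v3.  */

    #define MODEL_AVX_Fast_Unaligned_Load_LEVEL    3
    #define MODEL_V4_AVX_Fast_Unaligned_Load_LEVEL 4

    /* True if the feature is assumed at this build level or the runtime
       bit is set.  */
    #define MODEL_ARCH_P(level, runtime_bit) \
      ((level) <= MODEL_MINIMUM_X86_ISA_LEVEL || (runtime_bit))

    int
    main (void)
    {
      /* E.g. a Zhaoxin V3 CPU that leaves AVX_Fast_Unaligned_Load clear.  */
      int avx_fast_unaligned_load = 0;

      /* Plain level-3 gate: a v3 build assumes the feature (prints 1).  */
      printf ("plain gate: %d\n",
              MODEL_ARCH_P (MODEL_AVX_Fast_Unaligned_Load_LEVEL,
                            avx_fast_unaligned_load));

      /* Level-4 alias: the same v3 build keeps the runtime check, so the
         CPU's preference is honored (prints 0).  */
      printf ("V4_ alias:  %d\n",
              MODEL_ARCH_P (MODEL_V4_AVX_Fast_Unaligned_Load_LEVEL,
                            avx_fast_unaligned_load));
      return 0;
    }

With the model level set to 3 (an x86-64-v3 build), the plain level-3 gate prints 1 (feature assumed) while the level-4 alias prints 0, so the runtime bit of a Zhaoxin-like CPU is still honored.
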
@@ -143,8 +143,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -190,8 +191,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, memmove,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1004,8 +1006,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1051,8 +1054,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1098,8 +1102,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1145,8 +1150,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
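
For reference, a simplified model of what retagging the SSSE3 entries from X86_IFUNC_IMPL_ADD_V2 to X86_IFUNC_IMPL_ADD_V3 buys. This assumes (it is not glibc's actual macro definition) that an entry tagged Vn is only emitted into __libc_ifunc_impl_list when the build's minimum ISA level is at most n, i.e. when the matching implementation is compiled at all; the MODEL_* names are invented.

    /* Hypothetical model -- not the glibc macros.  */
    #include <stdio.h>

    #define MODEL_MINIMUM_X86_ISA_LEVEL 3   /* Pretend: an x86-64-v3 build.  */

    #if MODEL_MINIMUM_X86_ISA_LEVEL <= 2
    # define MODEL_IMPL_ADD_V2(name) puts ("listed: " name)
    #else
    # define MODEL_IMPL_ADD_V2(name) ((void) 0)   /* Dropped from the list.  */
    #endif

    #if MODEL_MINIMUM_X86_ISA_LEVEL <= 3
    # define MODEL_IMPL_ADD_V3(name) puts ("listed: " name)
    #else
    # define MODEL_IMPL_ADD_V3(name) ((void) 0)
    #endif

    int
    main (void)
    {
      MODEL_IMPL_ADD_V2 ("__memmove_ssse3 (old V2 tag)");   /* Silent at v3.  */
      MODEL_IMPL_ADD_V3 ("__memmove_ssse3 (new V3 tag)");   /* Printed at v3.  */
      return 0;
    }
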
@@ -72,7 +72,7 @@ IFUNC_SELECTOR (void)
}
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
- AVX_Fast_Unaligned_Load, ))
+ V4_AVX_Fast_Unaligned_Load, ))
{
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
{
@@ -101,11 +101,13 @@ IFUNC_SELECTOR (void)
}
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- /* Leave this as runtime check. The SSSE3 is optimized almost
- exclusively for avoiding unaligned memory access during the
- copy and by and large is not better than the sse2
- implementation as a general purpose memmove. */
- && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+ /* Leave this as a runtime check for V2 builds. At V3 and above
+ the check is skipped, so the SSSE3 version stays selectable.
+ The SSSE3 implementation is optimized almost exclusively for
+ avoiding unaligned memory access during the copy and by and
+ large is not better than the SSE2 implementation as a general
+ purpose memmove. */
+ && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+ V3_Fast_Unaligned_Copy, !))
{
return OPTIMIZE (ssse3);
}
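
Putting the two gates together, this is roughly the selection order an x86-64-v3 build ends up with after the change. It is a hypothetical, heavily simplified sketch: the EVEX/AVX512 and RTM branches are omitted, the SSSE3 CPUID test is kept as a plain runtime bit, and every identifier below is invented for the model rather than taken from glibc.

    /* Hypothetical sketch of the v3-build selection order -- not glibc code.  */
    #include <stdio.h>

    enum memmove_impl { IMPL_AVX_UNALIGNED, IMPL_SSSE3, IMPL_SSE2_UNALIGNED };

    struct cpu_bits
    {
      int avx_fast_unaligned_load;   /* Runtime preference bit.  */
      int ssse3;                     /* CPUID feature bit.  */
    };

    static enum memmove_impl
    select_memmove_v3_build (const struct cpu_bits *cpu)
    {
      /* V4_AVX_Fast_Unaligned_Load has ISA level 4, so in a v3 build this
         stays a real runtime check instead of folding to "true".  */
      if (cpu->avx_fast_unaligned_load)
        return IMPL_AVX_UNALIGNED;

      /* V3_Fast_Unaligned_Copy has ISA level 3, so in a v3 build the
         negated Fast_Unaligned_Copy check is skipped and SSSE3 stays
         reachable for CPUs that prefer the aligned copy loop.  */
      if (cpu->ssse3)
        return IMPL_SSSE3;

      return IMPL_SSE2_UNALIGNED;
    }

    int
    main (void)
    {
      struct cpu_bits zhaoxin_like = { .avx_fast_unaligned_load = 0, .ssse3 = 1 };
      struct cpu_bits generic_v3   = { .avx_fast_unaligned_load = 1, .ssse3 = 1 };

      printf ("zhaoxin-like v3 CPU -> %d (SSSE3 = %d)\n",
              select_memmove_v3_build (&zhaoxin_like), IMPL_SSSE3);
      printf ("generic v3 CPU      -> %d (AVX = %d)\n",
              select_memmove_v3_build (&generic_v3), IMPL_AVX_UNALIGNED);
      return 0;
    }

A Zhaoxin-like V3 CPU that leaves AVX_Fast_Unaligned_Load clear now falls through to SSSE3, while CPUs that set the bit keep taking the unaligned AVX path exactly as before.
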
@@ -20,7 +20,9 @@
#include <isa-level.h>
-#if ISA_SHOULD_BUILD (2)
+/* Continue building up to ISA level V3 as some V3 CPUs strongly
+ prefer this implementation. */
+#if ISA_SHOULD_BUILD (3)
# include <sysdep.h>
# ifndef MEMMOVE
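
Finally, the build-gate side of the change, as a standalone compile-time check. The semantics of ISA_SHOULD_BUILD are assumed from the surrounding diff (a file guarded at level N stays in the build when the minimum ISA level is at most N); the MODEL_* names are invented.

    /* Hypothetical illustration -- not glibc's macro.  Raising the guard
       from 2 to 3 keeps the SSSE3 memmove in an x86-64-v3 build.  */
    #define MODEL_MINIMUM_X86_ISA_LEVEL 3
    #define MODEL_ISA_SHOULD_BUILD(n) ((n) >= MODEL_MINIMUM_X86_ISA_LEVEL)

    _Static_assert (!MODEL_ISA_SHOULD_BUILD (2),
                    "old guard (2): file dropped from a v3 build");
    _Static_assert (MODEL_ISA_SHOULD_BUILD (3),
                    "new guard (3): file kept in a v3 build");
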