@@ -90,6 +90,14 @@
/* For X86_ISA_CPU_FEATURES_ARCH_P. */
+
+/* NB: This is just an alias of `AVX_Fast_Unaligned_Load` that keeps
+ doing the runtime check until the build's minimum ISA level
+ reaches 4. This is for some Zhaoxin CPUs which support ISA level
+ V3 but still have a strong preference for avoiding unaligned
+ `ymm` loads. */
+#define V4_AVX_Fast_Unaligned_Load_X86_ISA_LEVEL 4
+#define V4_AVX_Fast_Unaligned_Load AVX_Fast_Unaligned_Load
+
/* NB: This feature is enabled when ISA level >= 3, which was disabled
for the following CPUs:
- AMD Excavator
@@ -106,6 +114,13 @@
this feature don't run on glibc built with ISA level >= 3. */
#define Slow_SSE4_2_X86_ISA_LEVEL 3
+/* NB: This is just an alias of `Fast_Unaligned_Copy` that keeps
+ doing the runtime check until the build's minimum ISA level
+ reaches 3. This is for some Zhaoxin CPUs which support ISA level
+ V3 but still have a strong preference for avoiding unaligned
+ loads during copies. */
+#define V3_Fast_Unaligned_Copy_X86_ISA_LEVEL 3
+#define V3_Fast_Unaligned_Copy Fast_Unaligned_Copy
+
/* Feature(s) enabled when ISA level >= 2. */
#define Fast_Unaligned_Load_X86_ISA_LEVEL 2
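
To make the effect of the two new aliases concrete, here is a minimal standalone model of the gate. It is a sketch under assumptions: it mimics, rather than copies, how a feature check folds to a compile-time result once the build's minimum ISA level reaches the feature's *_X86_ISA_LEVEL, and every MODEL_* name is invented for the illustration.

    /* Hypothetical standalone model -- not glibc code.  A gate folds to a
       compile-time "assumed" result once the build's minimum ISA level
       reaches the feature's level, and stays a runtime check below that.  */
    #include <stdio.h>

    #define MODEL_MINIMUM_X86_ISA_LEVEL 3   /* Pretend glibc is built for x86-64-v3.  */

    #define MODEL_AVX_Fast_Unaligned_Load_LEVEL    3
    #define MODEL_V4_AVX_Fast_Unaligned_Load_LEVEL 4

    /* True if the feature is assumed at this build level or the runtime
       bit is set.  */
    #define MODEL_ARCH_P(level, runtime_bit) \
      ((level) <= MODEL_MINIMUM_X86_ISA_LEVEL || (runtime_bit))

    int
    main (void)
    {
      /* E.g. a Zhaoxin V3 CPU that leaves AVX_Fast_Unaligned_Load clear.  */
      int avx_fast_unaligned_load = 0;

      /* Plain level-3 gate: a v3 build assumes the feature (prints 1).  */
      printf ("plain gate: %d\n",
              MODEL_ARCH_P (MODEL_AVX_Fast_Unaligned_Load_LEVEL,
                            avx_fast_unaligned_load));

      /* Level-4 alias: the same v3 build keeps the runtime check, so the
         CPU's preference is honored (prints 0).  */
      printf ("V4_ alias:  %d\n",
              MODEL_ARCH_P (MODEL_V4_AVX_Fast_Unaligned_Load_LEVEL,
                            avx_fast_unaligned_load));
      return 0;
    }

With the model level set to 3 (an x86-64-v3 build), the plain level-3 gate prints 1 (feature assumed) while the level-4 alias prints 0, so the runtime bit of a Zhaoxin-like CPU is still honored.
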
@@ -143,8 +143,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -190,8 +191,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, memmove,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1004,8 +1006,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1051,8 +1054,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1098,8 +1102,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -1145,8 +1150,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_erms_rtm)
- /* By V3 we assume fast aligned copy. */
- X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy,
+ /* Some V3 CPUs still heavily prefer aligned loads, so keep the
+ SSSE3 implementation around. */
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
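
For reference, a simplified model of what retagging the SSSE3 entries from X86_IFUNC_IMPL_ADD_V2 to X86_IFUNC_IMPL_ADD_V3 buys. This assumes (it is not glibc's actual macro definition) that an entry tagged Vn is only emitted into __libc_ifunc_impl_list when the build's minimum ISA level is at most n, i.e. when the matching implementation is compiled at all; the MODEL_* names are invented.

    /* Hypothetical model -- not the glibc macros.  */
    #include <stdio.h>

    #define MODEL_MINIMUM_X86_ISA_LEVEL 3   /* Pretend: an x86-64-v3 build.  */

    #if MODEL_MINIMUM_X86_ISA_LEVEL <= 2
    # define MODEL_IMPL_ADD_V2(name) puts ("listed: " name)
    #else
    # define MODEL_IMPL_ADD_V2(name) ((void) 0)   /* Dropped from the list.  */
    #endif

    #if MODEL_MINIMUM_X86_ISA_LEVEL <= 3
    # define MODEL_IMPL_ADD_V3(name) puts ("listed: " name)
    #else
    # define MODEL_IMPL_ADD_V3(name) ((void) 0)
    #endif

    int
    main (void)
    {
      MODEL_IMPL_ADD_V2 ("__memmove_ssse3 (old V2 tag)");   /* Silent at v3.  */
      MODEL_IMPL_ADD_V3 ("__memmove_ssse3 (new V3 tag)");   /* Printed at v3.  */
      return 0;
    }
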
@@ -72,7 +72,7 @@ IFUNC_SELECTOR (void)
}
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
- AVX_Fast_Unaligned_Load, ))
+ V4_AVX_Fast_Unaligned_Load, ))
{
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
{
@@ -101,11 +101,13 @@ IFUNC_SELECTOR (void)
}
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- /* Leave this as runtime check. The SSSE3 is optimized almost
- exclusively for avoiding unaligned memory access during the
- copy and by and large is not better than the sse2
- implementation as a general purpose memmove. */
- && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+ /* Leave this as a runtime check for V2 builds. At V3 and above
+ the check is skipped, so the SSSE3 version stays selectable.
+ The SSSE3 implementation is optimized almost exclusively for
+ avoiding unaligned memory access during the copy and by and
+ large is not better than the SSE2 implementation as a general
+ purpose memmove. */
+ && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+ V3_Fast_Unaligned_Copy, !))
{
return OPTIMIZE (ssse3);
}
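
Putting the two gates together, this is roughly the selection order an x86-64-v3 build ends up with after the change. It is a hypothetical, heavily simplified sketch: the EVEX/AVX512 and RTM branches are omitted, the SSSE3 CPUID test is kept as a plain runtime bit, and every identifier below is invented for the model rather than taken from glibc.

    /* Hypothetical sketch of the v3-build selection order -- not glibc code.  */
    #include <stdio.h>

    enum memmove_impl { IMPL_AVX_UNALIGNED, IMPL_SSSE3, IMPL_SSE2_UNALIGNED };

    struct cpu_bits
    {
      int avx_fast_unaligned_load;   /* Runtime preference bit.  */
      int ssse3;                     /* CPUID feature bit.  */
    };

    static enum memmove_impl
    select_memmove_v3_build (const struct cpu_bits *cpu)
    {
      /* V4_AVX_Fast_Unaligned_Load has ISA level 4, so in a v3 build this
         stays a real runtime check instead of folding to "true".  */
      if (cpu->avx_fast_unaligned_load)
        return IMPL_AVX_UNALIGNED;

      /* V3_Fast_Unaligned_Copy has ISA level 3, so in a v3 build the
         negated Fast_Unaligned_Copy check is skipped and SSSE3 stays
         reachable for CPUs that prefer the aligned copy loop.  */
      if (cpu->ssse3)
        return IMPL_SSSE3;

      return IMPL_SSE2_UNALIGNED;
    }

    int
    main (void)
    {
      struct cpu_bits zhaoxin_like = { .avx_fast_unaligned_load = 0, .ssse3 = 1 };
      struct cpu_bits generic_v3   = { .avx_fast_unaligned_load = 1, .ssse3 = 1 };

      printf ("zhaoxin-like v3 CPU -> %d (SSSE3 = %d)\n",
              select_memmove_v3_build (&zhaoxin_like), IMPL_SSSE3);
      printf ("generic v3 CPU      -> %d (AVX = %d)\n",
              select_memmove_v3_build (&generic_v3), IMPL_AVX_UNALIGNED);
      return 0;
    }

A Zhaoxin-like V3 CPU that leaves AVX_Fast_Unaligned_Load clear now falls through to SSSE3, while CPUs that set the bit keep taking the unaligned AVX path exactly as before.
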
@@ -20,7 +20,9 @@
#include <isa-level.h>
-#if ISA_SHOULD_BUILD (2)
+/* Continue building up to ISA level V3 as some V3 CPUs strongly
+ prefer this implementation. */
+#if ISA_SHOULD_BUILD (3)
# include <sysdep.h>
# ifndef MEMMOVE
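
Finally, the build-gate side of the change, as a standalone compile-time check. The semantics of ISA_SHOULD_BUILD are assumed from the surrounding diff (a file guarded at level N stays in the build when the minimum ISA level is at most N); the MODEL_* names are invented.

    /* Hypothetical illustration -- not glibc's macro.  Raising the guard
       from 2 to 3 keeps the SSSE3 memmove in an x86-64-v3 build.  */
    #define MODEL_MINIMUM_X86_ISA_LEVEL 3
    #define MODEL_ISA_SHOULD_BUILD(n) ((n) >= MODEL_MINIMUM_X86_ISA_LEVEL)

    _Static_assert (!MODEL_ISA_SHOULD_BUILD (2),
                    "old guard (2): file dropped from a v3 build");
    _Static_assert (MODEL_ISA_SHOULD_BUILD (3),
                    "new guard (3): file kept in a v3 build");
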