@@ -16,17 +16,6 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-
-#define VEC_SIZE 16
-#define VEC(i) xmm##i
-#define PREFETCHNT prefetchnta
-#define VMOVNT movntdq
-/* Use movups and movaps for smaller code sizes. */
-#define VMOVU movups
-#define VMOVA movaps
-#define MOV_SIZE 3
-#define SECTION(p) p
#ifdef USE_MULTIARCH
# if !IS_IN (libc)
@@ -42,12 +31,20 @@
#if !defined USE_MULTIARCH || !IS_IN (libc)
# define MEMPCPY_SYMBOL(p,s) __mempcpy
#endif
-#ifndef MEMMOVE_SYMBOL
-# define MEMMOVE_CHK_SYMBOL(p,s) p
-# define MEMMOVE_SYMBOL(p,s) memmove
-#endif
-#include "multiarch/memmove-vec-unaligned-erms.S"
+#define MEMMOVE_CHK_SYMBOL(p,s) p
+#define MEMMOVE_SYMBOL(p,s) memmove
+
+
+#define DEFAULT_IMPL_V1 "multiarch/memmove-sse2-unaligned-erms.S"
+#define DEFAULT_IMPL_V3 "multiarch/memmove-avx-unaligned-erms.S"
+#define DEFAULT_IMPL_V4 "multiarch/memmove-evex-unaligned-erms.S"
+
+#include "isa-default-impl.h"
+
+#if defined USE_MULTIARCH && !IS_IN (libc)
+weak_alias (__mempcpy, mempcpy)
+#endif
#ifndef USE_MULTIARCH
libc_hidden_builtin_def (memmove)
@@ -59,13 +56,10 @@ libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)
+
# if defined SHARED && IS_IN (libc)
# undef memcpy
# include <shlib-compat.h>
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
-
-# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
-compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
-# endif
# endif
#endif
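
The DEFAULT_IMPL_V{1,3,4} defines above feed <isa-default-impl.h>, which picks the single implementation that becomes the non-multiarch memmove for the ISA level the library is configured for. A minimal sketch of that selection, assuming only that MINIMUM_X86_ISA_LEVEL holds the configured level (the real header also diagnoses missing defines):

/* Illustrative only, not the actual isa-default-impl.h.  Choose the
   highest DEFAULT_IMPL_V* the build's minimum ISA level permits and
   include it as the default implementation.  */
#if MINIMUM_X86_ISA_LEVEL >= 4 && defined DEFAULT_IMPL_V4
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
#elif MINIMUM_X86_ISA_LEVEL >= 3 && defined DEFAULT_IMPL_V3
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
#else
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
#endif
#include ISA_DEFAULT_IMPL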
@@ -18,6 +18,7 @@ sysdep_routines += \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
memmove-avx512-unaligned-erms \
+ memmove-erms \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
memmove-ssse3 \
@@ -109,83 +109,93 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memmove_chk.c. */
IFUNC_IMPL (i, name, __memmove_chk,
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512F),
__memmove_chk_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_rtm)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_erms_rtm)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512VL),
- __memmove_chk_evex_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512VL),
- __memmove_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ /* By V3 we assume fast aligned copy. */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3)
- IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ /* Can be lowered to V1 if a general V2 implementation is
+ added (ssse3 is too optimized around aligned copy to be
+ better as general purpose memmove). */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
- __memmove_chk_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
- __memmove_chk_erms))
+ X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk, 1,
+ __memmove_chk_sse2_unaligned_erms))
#endif
/* Support sysdeps/x86_64/multiarch/memmove.c. */
IFUNC_IMPL (i, name, memmove,
- IFUNC_IMPL_ADD (array, i, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, 1,
+ __memmove_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512F),
+ __memmove_avx512_no_vzeroupper)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_avx512_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_avx512_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_rtm)
- IFUNC_IMPL_ADD (array, i, memmove,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_erms_rtm)
- IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512VL),
- __memmove_evex_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512VL),
- __memmove_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512F),
- __memmove_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512VL),
- __memmove_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512VL),
- __memmove_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
+ /* By V3 we assume fast aligned copy. */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, memmove,
+ CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3)
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
- IFUNC_IMPL_ADD (array, i, memmove, 1,
+ /* Can be lowered to V1 if a general V2 implementation is
+ added (ssse3 is too optimized around aligned copy to be
+ better as general purpose memmove). */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, memmove, 1,
__memmove_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove, 1,
+ X86_IFUNC_IMPL_ADD_V2 (array, i, memmove, 1,
__memmove_sse2_unaligned_erms))
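
The X86_IFUNC_IMPL_ADD_V{2,3,4} wrappers used in these tables register an ifunc candidate only when the matching implementation is actually built for the configured minimum ISA level; below that level the entry compiles away. A rough sketch of the intent, assuming MINIMUM_X86_ISA_LEVEL is the build's configured level (the exact macro layering lives in the x86 ISA-level ifunc headers):

/* Illustrative sketch only.  If the minimum ISA level is above N,
   the V<N> implementations are never compiled, so their ifunc
   entries must not be emitted either.  */
#if MINIMUM_X86_ISA_LEVEL <= 2
# define X86_IFUNC_IMPL_ADD_V2(...) IFUNC_IMPL_ADD (__VA_ARGS__)
#else
# define X86_IFUNC_IMPL_ADD_V2(...)
#endif
/* Likewise for _V3 (level <= 3) and _V4 (level <= 4).  */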
/* Support sysdeps/x86_64/multiarch/memrchr.c. */
@@ -847,165 +857,186 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.c. */
IFUNC_IMPL (i, name, __memcpy_chk,
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_chk_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_rtm)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_erms_rtm)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512VL),
- __memcpy_chk_evex_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512VL),
- __memcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ /* By V3 we assume fast aligned copy. */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ /* Can be lowered to V1 if a general V2 implementation is
+ added (ssse3 is too optimized around aligned copy to be
+ better as general purpose memmove). */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
- __memcpy_chk_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
- __memcpy_chk_erms))
+ X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned_erms))
#endif
/* Support sysdeps/x86_64/multiarch/memcpy.c. */
IFUNC_IMPL (i, name, memcpy,
- IFUNC_IMPL_ADD (array, i, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512F),
+ __memcpy_avx512_no_vzeroupper)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_avx512_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_avx512_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_rtm)
- IFUNC_IMPL_ADD (array, i, memcpy,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_erms_rtm)
- IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512VL),
- __memcpy_evex_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512VL),
- __memcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
+ /* By V3 we assume fast aligned copy. */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy,
+ CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512F),
- __memcpy_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512VL),
- __memcpy_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512VL),
- __memcpy_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy, 1,
- __memcpy_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms))
+ /* Can be lowered to V1 if a general V2 implementation is
+ added (ssse3 is too optimized around aligned copy to be
+ better as general purpose memmove). */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned)
+ X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_erms))
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.c. */
IFUNC_IMPL (i, name, __mempcpy_chk,
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_chk_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_rtm)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_erms_rtm)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512VL),
- __mempcpy_chk_evex_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512VL),
- __mempcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ /* By V3 we assume fast aligned copy. */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ /* Can be lowered to V1 if a general V2 implementation is
+ added (ssse3 is too optimized around aligned copy to be
+ better as general purpose memmove). */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
- __mempcpy_chk_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
- __mempcpy_chk_erms))
+ X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned_erms))
#endif
/* Support sysdeps/x86_64/multiarch/mempcpy.c. */
IFUNC_IMPL (i, name, mempcpy,
- IFUNC_IMPL_ADD (array, i, mempcpy,
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, mempcpy,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy,
+ X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_rtm)
- IFUNC_IMPL_ADD (array, i, mempcpy,
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_erms_rtm)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512VL),
- __mempcpy_evex_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512VL),
- __mempcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
+ /* By V3 we assume fast aligned copy. */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy,
+ CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ /* Can be lowered to V1 if a general V2 implementation is
+ added (ssse3 is too optimized around aligned copy to be
+ better as general purpose memmove). */
+ X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy, 1,
- __mempcpy_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms))
+ X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned_erms))
/* Support sysdeps/x86_64/multiarch/strncmp.c. */
IFUNC_IMPL (i, name, strncmp,
@@ -20,11 +20,19 @@
#include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (erms) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_no_vzeroupper)
+ attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
+
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
@@ -32,30 +40,27 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_no_vzeroupper)
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
- const struct cpu_features* cpu_features = __get_cpu_features ();
+ const struct cpu_features *cpu_features = __get_cpu_features ();
if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_ERMS)
|| CPU_FEATURES_ARCH_P (cpu_features, Prefer_FSRM))
return OPTIMIZE (erms);
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+ if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
return OPTIMIZE (avx512_unaligned_erms);
@@ -66,9 +71,10 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx512_no_vzeroupper);
}
- if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+ AVX_Fast_Unaligned_Load, ))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
return OPTIMIZE (evex_unaligned_erms);
@@ -84,7 +90,8 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx_unaligned_rtm);
}
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+ Prefer_No_VZEROUPPER, !))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
return OPTIMIZE (avx_unaligned_erms);
@@ -93,7 +100,11 @@ IFUNC_SELECTOR (void)
}
}
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+ if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+ /* Leave this as runtime check. The SSSE3 is optimized almost
+ exclusively for avoiding unaligned memory access during the
+ copy and by and large is not better than the sse2
+ implementation as a general purpose memmove. */
&& !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
{
return OPTIMIZE (ssse3);
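
The X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P checks in this selector fold to a compile-time constant whenever the queried feature is already implied by the build's minimum ISA level, letting the compiler drop the unreachable fallback branches (the trailing `, )` and `, !` arguments of the ARCH_P form pass an optional negation through to the runtime test). A hedged sketch of the feature check, with FEATURE_MIN_LEVEL standing in for however the per-feature minimum is spelled in the real headers:

/* Illustrative sketch only; the real definition is in the x86
   isa-level headers.  When the feature is guaranteed by the build's
   minimum ISA level the test is constant-true, otherwise it falls
   back to the usual runtime check.  */
#define X86_ISA_CPU_FEATURE_USABLE_P_SKETCH(ptr, name)	\
  (FEATURE_MIN_LEVEL (name) <= MINIMUM_X86_ISA_LEVEL	\
   || CPU_FEATURE_USABLE_P (ptr, name))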
@@ -1,12 +1,23 @@
-#if IS_IN (libc)
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
# define VEC_SIZE 32
# define VEC(i) ymm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
# define MOV_SIZE 4
+
# define SECTION(p) p##.avx
-# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
+
+# ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
+# endif
# include "memmove-vec-unaligned-erms.S"
+
+# if MINIMUM_X86_ISA_LEVEL == 3
+# include "memmove-shlib-compat.h"
+# endif
#endif
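
ISA_SHOULD_BUILD (N), which now gates each per-ISA source file, is true when an implementation written against ISA level N still needs to be compiled for the configured minimum level, and false once a higher-level default already covers it. Roughly, under the assumption that MINIMUM_X86_ISA_LEVEL carries the configured level:

/* Illustrative sketch only; the real <isa-level.h> also accounts for
   the file being pulled in as the ISA default implementation.  */
#define ISA_SHOULD_BUILD_SKETCH(isa_build_level)	\
  ((isa_build_level) >= MINIMUM_X86_ISA_LEVEL)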
@@ -17,8 +17,9 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
+#include <isa-level.h>
-#if IS_IN (libc)
+#if ISA_SHOULD_BUILD (4)
# include "asm-syntax.h"
@@ -1,4 +1,7 @@
-#if IS_IN (libc)
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
# define VEC_SIZE 64
# define XMM0 xmm16
# define XMM1 xmm17
@@ -26,8 +29,12 @@
# define VMOVA vmovdqa64
# define VZEROUPPER
# define MOV_SIZE 6
+
# define SECTION(p) p##.evex512
-# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
+
+# ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
+# endif
# include "memmove-vec-unaligned-erms.S"
#endif
@@ -1,4 +1,7 @@
-#if IS_IN (libc)
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
# define VEC_SIZE 32
# define XMM0 xmm16
# define XMM1 xmm17
@@ -26,8 +29,17 @@
# define VMOVA vmovdqa64
# define VZEROUPPER
# define MOV_SIZE 6
+
# define SECTION(p) p##.evex
-# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
+
+# ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
+# endif
# include "memmove-vec-unaligned-erms.S"
+
+
+# if MINIMUM_X86_ISA_LEVEL == 4
+# include "memmove-shlib-compat.h"
+# endif
#endif
new file mode 100644
@@ -0,0 +1,26 @@
+/* Copyright (C) 2016-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if defined SHARED && IS_IN (libc)
+# include <shlib-compat.h>
+# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+/* Use __memmove_{isa_level}_unaligned to support overlapping
+ addresses. */
+compat_symbol (libc, MEMMOVE_SYMBOL (__memmove, unaligned), memcpy,
+ GLIBC_2_2_5);
+# endif
+#endif
@@ -16,18 +16,32 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
-# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
-#else
-weak_alias (__mempcpy, mempcpy)
-#endif
+#include <isa-level.h>
+
+/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
+ so we need this to build for ISA V2 builds. */
+#if ISA_SHOULD_BUILD (2)
+
+# include <sysdep.h>
+
+# define VEC_SIZE 16
+# define VEC(i) xmm##i
+# define PREFETCHNT prefetchnta
+# define VMOVNT movntdq
+/* Use movups and movaps for smaller code sizes. */
+# define VMOVU movups
+# define VMOVA movaps
+# define MOV_SIZE 3
+
+# define SECTION(p) p
+
+# ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
+# endif
-#include <sysdeps/x86_64/memmove.S>
+# include "multiarch/memmove-vec-unaligned-erms.S"
-#if defined SHARED && IS_IN (libc)
-# include <shlib-compat.h>
-# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
-/* Use __memmove_sse2_unaligned to support overlapping addresses. */
-compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
+# if MINIMUM_X86_ISA_LEVEL <= 2
+# include "memmove-shlib-compat.h"
# endif
#endif
@@ -1,13 +1,17 @@
-#include <sysdep.h>
-
-#ifndef MEMMOVE
-# define MEMMOVE __memmove_ssse3
-# define MEMMOVE_CHK __memmove_chk_ssse3
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3
-# define MEMPCPY_CHK __mempcpy_chk_ssse3
-#endif
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (2)
+
+# include <sysdep.h>
+
+# ifndef MEMMOVE
+# define MEMMOVE __memmove_ssse3
+# define MEMMOVE_CHK __memmove_chk_ssse3
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
+# endif
.section .text.ssse3, "ax", @progbits
ENTRY(MEMPCPY_CHK)
@@ -27,10 +31,10 @@ ENTRY(MEMMOVE_CHK)
END(MEMMOVE_CHK)
ENTRY_P2ALIGN(MEMMOVE, 6)
-# ifdef __ILP32__
+# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
-# endif
+# endif
movq %rdi, %rax
L(start):
cmpq $16, %rdx
@@ -124,11 +128,11 @@ L(more_2x_vec):
loop. */
movups %xmm0, (%rdi)
-#ifdef SHARED_CACHE_SIZE_HALF
+# ifdef SHARED_CACHE_SIZE_HALF
cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP
-#else
+# else
cmp __x86_shared_cache_size_half(%rip), %rdx
-#endif
+# endif
ja L(large_memcpy)
leaq -64(%rdi, %rdx), %r8
@@ -206,7 +210,7 @@ L(end_loop_fwd):
/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
60 bytes otherwise. */
-#define ALIGNED_LOOP_FWD(align_by); \
+# define ALIGNED_LOOP_FWD(align_by); \
.p2align 6; \
L(loop_fwd_ ## align_by): \
movaps 16(%rsi), %xmm0; \
@@ -275,7 +279,7 @@ L(end_large_loop_fwd):
/* Size > 64 bytes and <= 96 bytes. 32-byte align between ensure
96-byte spacing between each. */
-#define ALIGNED_LARGE_LOOP_FWD(align_by); \
+# define ALIGNED_LARGE_LOOP_FWD(align_by); \
.p2align 5; \
L(large_loop_fwd_ ## align_by): \
movaps 16(%rsi), %xmm0; \
@@ -343,7 +347,7 @@ L(end_loop_bkwd):
/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
60 bytes otherwise. */
-#define ALIGNED_LOOP_BKWD(align_by); \
+# define ALIGNED_LOOP_BKWD(align_by); \
.p2align 6; \
L(loop_bkwd_ ## align_by): \
movaps 32(%rsi), %xmm1; \
@@ -382,3 +386,4 @@ END(MEMMOVE)
strong_alias (MEMMOVE, MEMCPY)
strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
+#endif
new file mode 100644
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../memmove.S"