| Message ID | AS4PR08MB79015B70DC5F74EB452222B183229@AS4PR08MB7901.eurprd08.prod.outlook.com |
| --- | --- |
| State | New |
| Series | aarch64: Use memcpy_simd as the default memcpy |
On Wed, Oct 12, 2022 at 8:20 AM Wilco Dijkstra via Libc-alpha <libc-alpha@sourceware.org> wrote:
>
> Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default
> if SVE is not available.
>
> Passes regress, OK for commit?
>
> ---
[...]
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
[...]
> @@ -40,9 +39,6 @@ select_memcpy_ifunc (void)
>  {
>    INIT_ARCH ();
>
> -  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
> -    return __memcpy_simd;
> -
>    if (sve && HAVE_AARCH64_SVE_ASM)
>      {

This changes how neoverse-n2 is handled, is that expected?
That is neoverse-n2 was returning __memcpy_simd before and now will be
returning __memcpy_sve as n2 has SVE.

Thanks,
Andrew Pinski
Hi Andrew,

> This changes how neoverse-n2 is handled, is that expected?
> That is neoverse-n2 was returning __memcpy_simd before and now will be
> returning __memcpy_sve as n2 has SVE.

Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
as a general rule is better than special casing every CPU.

Cheers,
Wilco
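To make the selection order concrete: after this change, any SVE-capable core (including Neoverse N2) picks an SVE memcpy, and cores without SVE fall through to the generic memcpy, which the patch turns into the former Advanced SIMD implementation. The following is only an illustrative C sketch of that order, reconstructed from the hunks quoted above rather than the literal glibc selector; the ThunderX checks and the exact branch shape are assumptions.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch of the memcpy ifunc choice implied by the patch
   (not the actual glibc code).  The Neoverse N1/N2 special case is gone:
   SVE cores get an SVE memcpy, everything else gets the generic memcpy,
   which is now the Advanced SIMD version.  */
static const char *
select_memcpy_sketch (bool sve, bool is_a64fx, bool is_thunderx, bool is_thunderx2)
{
  if (sve)
    return is_a64fx ? "__memcpy_a64fx" : "__memcpy_sve";
  if (is_thunderx)       /* CPU-specific variants kept by the patch.  */
    return "__memcpy_thunderx";
  if (is_thunderx2)
    return "__memcpy_thunderx2";
  return "memcpy (generic, Advanced SIMD)";
}

int
main (void)
{
  /* Neoverse N2 has SVE, so it now selects the SVE memcpy ...  */
  printf ("Neoverse N2 -> %s\n", select_memcpy_sketch (true, false, false, false));
  /* ... while Neoverse N1 (no SVE) now gets the Advanced SIMD default.  */
  printf ("Neoverse N1 -> %s\n", select_memcpy_sketch (false, false, false, false));
  return 0;
}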
On 13/10/22 09:28, Wilco Dijkstra via Libc-alpha wrote:
> Hi Andrew,
>
>> This changes how neoverse-n2 is handled, is that expected?
>> That is neoverse-n2 was returning __memcpy_simd before and now will be
>> returning __memcpy_sve as n2 has SVE.
>
> Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
> as a general rule is better than special casing every CPU.
>
> Cheers,
> Wilco

Maybe move this change to a different patch?
Hi Adhemerval,

On 13/10/22 09:28, Wilco Dijkstra via Libc-alpha wrote:
> Hi Andrew,
>
>> This changes how neoverse-n2 is handled, is that expected?
>> That is neoverse-n2 was returning __memcpy_simd before and now will be
>> returning __memcpy_sve as n2 has SVE.
>
> Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
> as a general rule is better than special casing every CPU.
> Maybe move this change to a different patch?

That if statement made no sense after the change, so I removed it altogether.
Either way, it doesn't seem large or important enough to warrant a separate
patch. I could add a note in the commit log, eg:

  Since __memcpy_simd is the fastest memcpy on almost all cores, use it by
  default. If SVE is available, a SVE memcpy will be used by default
  (including Neoverse N2).

Cheers,
Wilco
The 10/19/2022 12:31, Wilco Dijkstra via Libc-alpha wrote:
> Hi Adhemerval,
>
> On 13/10/22 09:28, Wilco Dijkstra via Libc-alpha wrote:
> > Hi Andrew,
> >
> >> This changes how neoverse-n2 is handled, is that expected?
> >> That is neoverse-n2 was returning __memcpy_simd before and now will be
> >> returning __memcpy_sve as n2 has SVE.
> >
> > Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
> > as a general rule is better than special casing every CPU.
> >
> > Maybe move this change to a different patch?
>
> That if statement made no sense after the change, so I removed it altogether. Either
> way, it doesn't seem large or important enough to warrant a separate patch. I could
> add a note in the commit log, eg:
>
>   Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default.
>   If SVE is available, a SVE memcpy will be used by default (including Neoverse N2).

the patch is OK to commit with this note. thanks.
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index 98d4e2c0e202eca13e1fd19ad8046cf61ad280ff..7b396b202fabf01b6ff2adc71a1038148e0b1054 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -1,4 +1,5 @@ -/* Copyright (C) 2012-2022 Free Software Foundation, Inc. +/* Generic optimized memcpy using SIMD. + Copyright (C) 2012-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -20,7 +21,7 @@ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ @@ -36,21 +37,18 @@ #define B_l x8 #define B_lw w8 #define B_h x9 -#define C_l x10 #define C_lw w10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l x14 -#define E_h x15 -#define F_l x16 -#define F_h x17 -#define G_l count -#define G_h dst -#define H_l src -#define H_h srcend #define tmp1 x14 +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + #ifndef MEMMOVE # define MEMMOVE memmove #endif @@ -69,10 +67,9 @@ Large copies use a software pipelined loop processing 64 bytes per iteration. The destination pointer is 16-byte aligned to minimize unaligned accesses. The loop tail is handled by always copying 64 bytes - from the end. -*/ + from the end. */ -ENTRY_ALIGN (MEMCPY, 6) +ENTRY (MEMCPY) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) @@ -87,10 +84,10 @@ ENTRY_ALIGN (MEMCPY, 6) /* Small copies: 0..32 bytes. */ cmp count, 16 b.lo L(copy16) - ldp A_l, A_h, [src] - ldp D_l, D_h, [srcend, -16] - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] ret /* Copy 8-15 bytes. */ @@ -102,7 +99,6 @@ L(copy16): str A_h, [dstend, -8] ret - .p2align 3 /* Copy 4-7 bytes. */ L(copy8): tbz count, 2, L(copy4) @@ -128,87 +124,69 @@ L(copy0): .p2align 4 /* Medium copies: 33..128 bytes. */ L(copy32_128): - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - ldp D_l, D_h, [srcend, -16] + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] cmp count, 64 b.hi L(copy128) - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] - stp D_l, D_h, [dstend, -16] + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] ret .p2align 4 /* Copy 65..128 bytes. */ L(copy128): - ldp E_l, E_h, [src, 32] - ldp F_l, F_h, [src, 48] + ldp E_q, F_q, [src, 32] cmp count, 96 b.ls L(copy96) - ldp G_l, G_h, [srcend, -64] - ldp H_l, H_h, [srcend, -48] - stp G_l, G_h, [dstend, -64] - stp H_l, H_h, [dstend, -48] + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] L(copy96): - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] - stp E_l, E_h, [dstin, 32] - stp F_l, F_h, [dstin, 48] - stp C_l, C_h, [dstend, -32] - stp D_l, D_h, [dstend, -16] + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] ret - .p2align 4 + /* Align loop64 below to 16 bytes. */ + nop + /* Copy more than 128 bytes. */ L(copy_long): - /* Copy 16 bytes and then align dst to 16-byte alignment. */ - ldp D_l, D_h, [src] - and tmp1, dstin, 15 - bic dst, dstin, 15 - sub src, src, tmp1 + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 add count, count, tmp1 /* Count is now 16 too large. */ - ldp A_l, A_h, [src, 16] - stp D_l, D_h, [dstin] - ldp B_l, B_h, [src, 32] - ldp C_l, C_h, [src, 48] - ldp D_l, D_h, [src, 64]! 
+ ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] subs count, count, 128 + 16 /* Test and readjust count. */ b.ls L(copy64_from_end) - L(loop64): - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [src, 32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [src, 48] - stp D_l, D_h, [dst, 64]! - ldp D_l, D_h, [src, 64]! + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 subs count, count, 64 b.hi L(loop64) /* Write the last iteration and copy 64 bytes from the end. */ L(copy64_from_end): - ldp E_l, E_h, [srcend, -64] - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [srcend, -48] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [srcend, -16] - stp D_l, D_h, [dst, 64] - stp E_l, E_h, [dstend, -64] - stp A_l, A_h, [dstend, -48] - stp B_l, B_h, [dstend, -32] - stp C_l, C_h, [dstend, -16] + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] ret END (MEMCPY) libc_hidden_builtin_def (MEMCPY) -ENTRY_ALIGN (MEMMOVE, 4) + +ENTRY (MEMMOVE) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) @@ -220,64 +198,56 @@ ENTRY_ALIGN (MEMMOVE, 4) cmp count, 32 b.hi L(copy32_128) - /* Small copies: 0..32 bytes. */ + /* Small moves: 0..32 bytes. */ cmp count, 16 b.lo L(copy16) - ldp A_l, A_h, [src] - ldp D_l, D_h, [srcend, -16] - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] ret - .p2align 4 L(move_long): /* Only use backward copy if there is an overlap. */ sub tmp1, dstin, src - cbz tmp1, L(copy0) + cbz tmp1, L(move0) cmp tmp1, count b.hs L(copy_long) /* Large backwards copy for overlapping copies. - Copy 16 bytes and then align dst to 16-byte alignment. */ - ldp D_l, D_h, [srcend, -16] - and tmp1, dstend, 15 - sub srcend, srcend, tmp1 + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 sub count, count, tmp1 - ldp A_l, A_h, [srcend, -16] - stp D_l, D_h, [dstend, -16] - ldp B_l, B_h, [srcend, -32] - ldp C_l, C_h, [srcend, -48] - ldp D_l, D_h, [srcend, -64]! + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] sub dstend, dstend, tmp1 subs count, count, 128 b.ls L(copy64_from_start) L(loop64_backwards): - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [srcend, -16] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [srcend, -48] - stp D_l, D_h, [dstend, -64]! - ldp D_l, D_h, [srcend, -64]! + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 subs count, count, 64 b.hi L(loop64_backwards) /* Write the last iteration and copy 64 bytes from the start. 
*/ L(copy64_from_start): - ldp G_l, G_h, [src, 48] - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [src, 32] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [src, 16] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [src] - stp D_l, D_h, [dstend, -64] - stp G_l, G_h, [dstin, 48] - stp A_l, A_h, [dstin, 32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin] + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(move0): ret END (MEMMOVE) diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index bc5cde8add07b908178fb0271decc27f728f7a2e..7f2d85b0e5acc0a694e91b17fbccc0dba0ea339d 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -3,7 +3,6 @@ sysdep_routines += \ memchr_generic \ memchr_nosimd \ memcpy_a64fx \ - memcpy_advsimd \ memcpy_generic \ memcpy_sve \ memcpy_thunderx \ diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index 9c2542de38fb109b7c6f1db4aacee3a6b544fa3f..e7c4dcc0ed5a68ecd8dacc06256d0749b76912cb 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -36,7 +36,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, memcpy, IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx) IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2) - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd) #if HAVE_AARCH64_SVE_ASM IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx) IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve) @@ -45,7 +44,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, memmove, IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2) - IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd) #if HAVE_AARCH64_SVE_ASM IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx) IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve) diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c index 5006b0594a476bcc149f2ae022bea50379d04908..1e08ce852e68409fd0eeb975edab77ebe8da8635 100644 --- a/sysdeps/aarch64/multiarch/memcpy.c +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -29,7 +29,6 @@ extern __typeof (__redirect_memcpy) __libc_memcpy; extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden; -extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden; @@ -40,9 +39,6 @@ select_memcpy_ifunc (void) { INIT_ARCH (); - if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)) - return __memcpy_simd; - if (sve && HAVE_AARCH64_SVE_ASM) { if (IS_A64FX (midr)) diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S deleted file mode 100644 index fe9beaf5ead47268867bee98acad3b17c554656a..0000000000000000000000000000000000000000 --- a/sysdeps/aarch64/multiarch/memcpy_advsimd.S +++ /dev/null @@ -1,248 +0,0 @@ -/* Generic optimized memcpy using SIMD. - Copyright (C) 2020-2022 Free Software Foundation, Inc. - - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* Assumptions: - * - * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. - * - */ - -#define dstin x0 -#define src x1 -#define count x2 -#define dst x3 -#define srcend x4 -#define dstend x5 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define B_l x8 -#define B_lw w8 -#define B_h x9 -#define C_lw w10 -#define tmp1 x14 - -#define A_q q0 -#define B_q q1 -#define C_q q2 -#define D_q q3 -#define E_q q4 -#define F_q q5 -#define G_q q6 -#define H_q q7 - - -/* This implementation supports both memcpy and memmove and shares most code. - It uses unaligned accesses and branchless sequences to keep the code small, - simple and improve performance. - - Copies are split into 3 main cases: small copies of up to 32 bytes, medium - copies of up to 128 bytes, and large copies. The overhead of the overlap - check in memmove is negligible since it is only required for large copies. - - Large copies use a software pipelined loop processing 64 bytes per - iteration. The destination pointer is 16-byte aligned to minimize - unaligned accesses. The loop tail is handled by always copying 64 bytes - from the end. */ - -ENTRY (__memcpy_simd) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - add srcend, src, count - add dstend, dstin, count - cmp count, 128 - b.hi L(copy_long) - cmp count, 32 - b.hi L(copy32_128) - - /* Small copies: 0..32 bytes. */ - cmp count, 16 - b.lo L(copy16) - ldr A_q, [src] - ldr B_q, [srcend, -16] - str A_q, [dstin] - str B_q, [dstend, -16] - ret - - /* Copy 8-15 bytes. */ -L(copy16): - tbz count, 3, L(copy8) - ldr A_l, [src] - ldr A_h, [srcend, -8] - str A_l, [dstin] - str A_h, [dstend, -8] - ret - - /* Copy 4-7 bytes. */ -L(copy8): - tbz count, 2, L(copy4) - ldr A_lw, [src] - ldr B_lw, [srcend, -4] - str A_lw, [dstin] - str B_lw, [dstend, -4] - ret - - /* Copy 0..3 bytes using a branchless sequence. */ -L(copy4): - cbz count, L(copy0) - lsr tmp1, count, 1 - ldrb A_lw, [src] - ldrb C_lw, [srcend, -1] - ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] - strb B_lw, [dstin, tmp1] - strb C_lw, [dstend, -1] -L(copy0): - ret - - .p2align 4 - /* Medium copies: 33..128 bytes. */ -L(copy32_128): - ldp A_q, B_q, [src] - ldp C_q, D_q, [srcend, -32] - cmp count, 64 - b.hi L(copy128) - stp A_q, B_q, [dstin] - stp C_q, D_q, [dstend, -32] - ret - - .p2align 4 - /* Copy 65..128 bytes. */ -L(copy128): - ldp E_q, F_q, [src, 32] - cmp count, 96 - b.ls L(copy96) - ldp G_q, H_q, [srcend, -64] - stp G_q, H_q, [dstend, -64] -L(copy96): - stp A_q, B_q, [dstin] - stp E_q, F_q, [dstin, 32] - stp C_q, D_q, [dstend, -32] - ret - - /* Align loop64 below to 16 bytes. */ - nop - - /* Copy more than 128 bytes. */ -L(copy_long): - /* Copy 16 bytes and then align src to 16-byte alignment. */ - ldr D_q, [src] - and tmp1, src, 15 - bic src, src, 15 - sub dst, dstin, tmp1 - add count, count, tmp1 /* Count is now 16 too large. 
*/ - ldp A_q, B_q, [src, 16] - str D_q, [dstin] - ldp C_q, D_q, [src, 48] - subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls L(copy64_from_end) -L(loop64): - stp A_q, B_q, [dst, 16] - ldp A_q, B_q, [src, 80] - stp C_q, D_q, [dst, 48] - ldp C_q, D_q, [src, 112] - add src, src, 64 - add dst, dst, 64 - subs count, count, 64 - b.hi L(loop64) - - /* Write the last iteration and copy 64 bytes from the end. */ -L(copy64_from_end): - ldp E_q, F_q, [srcend, -64] - stp A_q, B_q, [dst, 16] - ldp A_q, B_q, [srcend, -32] - stp C_q, D_q, [dst, 48] - stp E_q, F_q, [dstend, -64] - stp A_q, B_q, [dstend, -32] - ret - -END (__memcpy_simd) -libc_hidden_builtin_def (__memcpy_simd) - - -ENTRY (__memmove_simd) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - add srcend, src, count - add dstend, dstin, count - cmp count, 128 - b.hi L(move_long) - cmp count, 32 - b.hi L(copy32_128) - - /* Small moves: 0..32 bytes. */ - cmp count, 16 - b.lo L(copy16) - ldr A_q, [src] - ldr B_q, [srcend, -16] - str A_q, [dstin] - str B_q, [dstend, -16] - ret - -L(move_long): - /* Only use backward copy if there is an overlap. */ - sub tmp1, dstin, src - cbz tmp1, L(move0) - cmp tmp1, count - b.hs L(copy_long) - - /* Large backwards copy for overlapping copies. - Copy 16 bytes and then align srcend to 16-byte alignment. */ -L(copy_long_backwards): - ldr D_q, [srcend, -16] - and tmp1, srcend, 15 - bic srcend, srcend, 15 - sub count, count, tmp1 - ldp A_q, B_q, [srcend, -32] - str D_q, [dstend, -16] - ldp C_q, D_q, [srcend, -64] - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls L(copy64_from_start) - -L(loop64_backwards): - str B_q, [dstend, -16] - str A_q, [dstend, -32] - ldp A_q, B_q, [srcend, -96] - str D_q, [dstend, -48] - str C_q, [dstend, -64]! - ldp C_q, D_q, [srcend, -128] - sub srcend, srcend, 64 - subs count, count, 64 - b.hi L(loop64_backwards) - - /* Write the last iteration and copy 64 bytes from the start. */ -L(copy64_from_start): - ldp E_q, F_q, [src, 32] - stp A_q, B_q, [dstend, -32] - ldp A_q, B_q, [src] - stp C_q, D_q, [dstend, -64] - stp E_q, F_q, [dstin, 32] - stp A_q, B_q, [dstin] -L(move0): - ret - -END (__memmove_simd) -libc_hidden_builtin_def (__memmove_simd) diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c index 7dae8b7c956f9083d0896cc771cae79f4901581d..dbf1536525e614f72d3d74bb193015b303618357 100644 --- a/sysdeps/aarch64/multiarch/memmove.c +++ b/sysdeps/aarch64/multiarch/memmove.c @@ -29,7 +29,6 @@ extern __typeof (__redirect_memmove) __libc_memmove; extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden; -extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden; extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden; extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden; extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden; @@ -40,9 +39,6 @@ select_memmove_ifunc (void) { INIT_ARCH (); - if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)) - return __memmove_simd; - if (sve && HAVE_AARCH64_SVE_ASM) { if (IS_A64FX (midr))
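The header comment in the new memcpy.S sums up the strategy: copies are split into small (0..32 bytes), medium (33..128 bytes) and large cases; small cases use two possibly overlapping accesses from the start and the end of the buffer, 0..3 bytes are handled branchlessly, and the large-copy loop tail is handled by always copying the last bytes from the end. Below is a minimal C sketch of that idea; it is illustrative only, uses 32-byte blocks instead of the 64-byte SIMD iterations, and omits the alignment and medium-size special cases.

#include <stddef.h>
#include <string.h>

/* Illustrative C sketch of the strategy described in the patch's header
   comment; this is not the assembly above.  */
void *
memcpy_sketch (void *dstin, const void *srcin, size_t count)
{
  unsigned char *dst = dstin;
  const unsigned char *src = srcin;

  if (count <= 32)
    {
      if (count >= 16)
        {
          /* Two 16-byte copies from the start and the end; they may
             overlap in the middle, which is harmless.  */
          memcpy (dst, src, 16);
          memcpy (dst + count - 16, src + count - 16, 16);
        }
      else if (count >= 8)
        {
          memcpy (dst, src, 8);
          memcpy (dst + count - 8, src + count - 8, 8);
        }
      else if (count >= 4)
        {
          memcpy (dst, src, 4);
          memcpy (dst + count - 4, src + count - 4, 4);
        }
      else if (count > 0)
        {
          /* 0..3 byte copy without further branches: first byte,
             middle byte, last byte.  */
          dst[0] = src[0];
          dst[count >> 1] = src[count >> 1];
          dst[count - 1] = src[count - 1];
        }
      return dstin;
    }

  /* Large copies: run a block loop over the buffer, then handle the tail
     by unconditionally copying the last block from the end (it may
     overlap the final loop iteration, which is fine).  */
  size_t i = 0;
  while (i < count - 32)
    {
      memcpy (dst + i, src + i, 32);
      i += 32;
    }
  memcpy (dst + count - 32, src + count - 32, 32);
  return dstin;
}

The memmove entry point in the patch differs only for large sizes: if the destination overlaps the source within count bytes, the same kind of loop runs backwards from the end of the buffers.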