diff mbox series

aarch64: Use memcpy_simd as the default memcpy

Message ID AS4PR08MB79015B70DC5F74EB452222B183229@AS4PR08MB7901.eurprd08.prod.outlook.com
State New
Headers show
Series aarch64: Use memcpy_simd as the default memcpy | expand

Commit Message

Wilco Dijkstra Oct. 12, 2022, 3:19 p.m. UTC
Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default
if SVE is not available.

Passes regress, OK for commit?

---

Comments

Andrew Pinski Oct. 12, 2022, 7:12 p.m. UTC | #1
On Wed, Oct 12, 2022 at 8:20 AM Wilco Dijkstra via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default
> if SVE is not available.
>
> Passes regress, OK for commit?
>
> ---
> diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
> index 98d4e2c0e202eca13e1fd19ad8046cf61ad280ff..7b396b202fabf01b6ff2adc71a1038148e0b1054 100644
> --- a/sysdeps/aarch64/memcpy.S
> +++ b/sysdeps/aarch64/memcpy.S
> @@ -1,4 +1,5 @@
> -/* Copyright (C) 2012-2022 Free Software Foundation, Inc.
> +/* Generic optimized memcpy using SIMD.
> +   Copyright (C) 2012-2022 Free Software Foundation, Inc.
>
>     This file is part of the GNU C Library.
>
> @@ -20,7 +21,7 @@
>
>  /* Assumptions:
>   *
> - * ARMv8-a, AArch64, unaligned accesses.
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
>   *
>   */
>
> @@ -36,21 +37,18 @@
>  #define B_l    x8
>  #define B_lw   w8
>  #define B_h    x9
> -#define C_l    x10
>  #define C_lw   w10
> -#define C_h    x11
> -#define D_l    x12
> -#define D_h    x13
> -#define E_l    x14
> -#define E_h    x15
> -#define F_l    x16
> -#define F_h    x17
> -#define G_l    count
> -#define G_h    dst
> -#define H_l    src
> -#define H_h    srcend
>  #define tmp1   x14
>
> +#define A_q    q0
> +#define B_q    q1
> +#define C_q    q2
> +#define D_q    q3
> +#define E_q    q4
> +#define F_q    q5
> +#define G_q    q6
> +#define H_q    q7
> +
>  #ifndef MEMMOVE
>  # define MEMMOVE memmove
>  #endif
> @@ -69,10 +67,9 @@
>     Large copies use a software pipelined loop processing 64 bytes per
>     iteration.  The destination pointer is 16-byte aligned to minimize
>     unaligned accesses.  The loop tail is handled by always copying 64 bytes
> -   from the end.
> -*/
> +   from the end.  */
>
> -ENTRY_ALIGN (MEMCPY, 6)
> +ENTRY (MEMCPY)
>         PTR_ARG (0)
>         PTR_ARG (1)
>         SIZE_ARG (2)
> @@ -87,10 +84,10 @@ ENTRY_ALIGN (MEMCPY, 6)
>         /* Small copies: 0..32 bytes.  */
>         cmp     count, 16
>         b.lo    L(copy16)
> -       ldp     A_l, A_h, [src]
> -       ldp     D_l, D_h, [srcend, -16]
> -       stp     A_l, A_h, [dstin]
> -       stp     D_l, D_h, [dstend, -16]
> +       ldr     A_q, [src]
> +       ldr     B_q, [srcend, -16]
> +       str     A_q, [dstin]
> +       str     B_q, [dstend, -16]
>         ret
>
>         /* Copy 8-15 bytes.  */
> @@ -102,7 +99,6 @@ L(copy16):
>         str     A_h, [dstend, -8]
>         ret
>
> -       .p2align 3
>         /* Copy 4-7 bytes.  */
>  L(copy8):
>         tbz     count, 2, L(copy4)
> @@ -128,87 +124,69 @@ L(copy0):
>         .p2align 4
>         /* Medium copies: 33..128 bytes.  */
>  L(copy32_128):
> -       ldp     A_l, A_h, [src]
> -       ldp     B_l, B_h, [src, 16]
> -       ldp     C_l, C_h, [srcend, -32]
> -       ldp     D_l, D_h, [srcend, -16]
> +       ldp     A_q, B_q, [src]
> +       ldp     C_q, D_q, [srcend, -32]
>         cmp     count, 64
>         b.hi    L(copy128)
> -       stp     A_l, A_h, [dstin]
> -       stp     B_l, B_h, [dstin, 16]
> -       stp     C_l, C_h, [dstend, -32]
> -       stp     D_l, D_h, [dstend, -16]
> +       stp     A_q, B_q, [dstin]
> +       stp     C_q, D_q, [dstend, -32]
>         ret
>
>         .p2align 4
>         /* Copy 65..128 bytes.  */
>  L(copy128):
> -       ldp     E_l, E_h, [src, 32]
> -       ldp     F_l, F_h, [src, 48]
> +       ldp     E_q, F_q, [src, 32]
>         cmp     count, 96
>         b.ls    L(copy96)
> -       ldp     G_l, G_h, [srcend, -64]
> -       ldp     H_l, H_h, [srcend, -48]
> -       stp     G_l, G_h, [dstend, -64]
> -       stp     H_l, H_h, [dstend, -48]
> +       ldp     G_q, H_q, [srcend, -64]
> +       stp     G_q, H_q, [dstend, -64]
>  L(copy96):
> -       stp     A_l, A_h, [dstin]
> -       stp     B_l, B_h, [dstin, 16]
> -       stp     E_l, E_h, [dstin, 32]
> -       stp     F_l, F_h, [dstin, 48]
> -       stp     C_l, C_h, [dstend, -32]
> -       stp     D_l, D_h, [dstend, -16]
> +       stp     A_q, B_q, [dstin]
> +       stp     E_q, F_q, [dstin, 32]
> +       stp     C_q, D_q, [dstend, -32]
>         ret
>
> -       .p2align 4
> +       /* Align loop64 below to 16 bytes.  */
> +       nop
> +
>         /* Copy more than 128 bytes.  */
>  L(copy_long):
> -       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
> -       ldp     D_l, D_h, [src]
> -       and     tmp1, dstin, 15
> -       bic     dst, dstin, 15
> -       sub     src, src, tmp1
> +       /* Copy 16 bytes and then align src to 16-byte alignment.  */
> +       ldr     D_q, [src]
> +       and     tmp1, src, 15
> +       bic     src, src, 15
> +       sub     dst, dstin, tmp1
>         add     count, count, tmp1      /* Count is now 16 too large.  */
> -       ldp     A_l, A_h, [src, 16]
> -       stp     D_l, D_h, [dstin]
> -       ldp     B_l, B_h, [src, 32]
> -       ldp     C_l, C_h, [src, 48]
> -       ldp     D_l, D_h, [src, 64]!
> +       ldp     A_q, B_q, [src, 16]
> +       str     D_q, [dstin]
> +       ldp     C_q, D_q, [src, 48]
>         subs    count, count, 128 + 16  /* Test and readjust count.  */
>         b.ls    L(copy64_from_end)
> -
>  L(loop64):
> -       stp     A_l, A_h, [dst, 16]
> -       ldp     A_l, A_h, [src, 16]
> -       stp     B_l, B_h, [dst, 32]
> -       ldp     B_l, B_h, [src, 32]
> -       stp     C_l, C_h, [dst, 48]
> -       ldp     C_l, C_h, [src, 48]
> -       stp     D_l, D_h, [dst, 64]!
> -       ldp     D_l, D_h, [src, 64]!
> +       stp     A_q, B_q, [dst, 16]
> +       ldp     A_q, B_q, [src, 80]
> +       stp     C_q, D_q, [dst, 48]
> +       ldp     C_q, D_q, [src, 112]
> +       add     src, src, 64
> +       add     dst, dst, 64
>         subs    count, count, 64
>         b.hi    L(loop64)
>
>         /* Write the last iteration and copy 64 bytes from the end.  */
>  L(copy64_from_end):
> -       ldp     E_l, E_h, [srcend, -64]
> -       stp     A_l, A_h, [dst, 16]
> -       ldp     A_l, A_h, [srcend, -48]
> -       stp     B_l, B_h, [dst, 32]
> -       ldp     B_l, B_h, [srcend, -32]
> -       stp     C_l, C_h, [dst, 48]
> -       ldp     C_l, C_h, [srcend, -16]
> -       stp     D_l, D_h, [dst, 64]
> -       stp     E_l, E_h, [dstend, -64]
> -       stp     A_l, A_h, [dstend, -48]
> -       stp     B_l, B_h, [dstend, -32]
> -       stp     C_l, C_h, [dstend, -16]
> +       ldp     E_q, F_q, [srcend, -64]
> +       stp     A_q, B_q, [dst, 16]
> +       ldp     A_q, B_q, [srcend, -32]
> +       stp     C_q, D_q, [dst, 48]
> +       stp     E_q, F_q, [dstend, -64]
> +       stp     A_q, B_q, [dstend, -32]
>         ret
>
>  END (MEMCPY)
>  libc_hidden_builtin_def (MEMCPY)
>
> -ENTRY_ALIGN (MEMMOVE, 4)
> +
> +ENTRY (MEMMOVE)
>         PTR_ARG (0)
>         PTR_ARG (1)
>         SIZE_ARG (2)
> @@ -220,64 +198,56 @@ ENTRY_ALIGN (MEMMOVE, 4)
>         cmp     count, 32
>         b.hi    L(copy32_128)
>
> -       /* Small copies: 0..32 bytes.  */
> +       /* Small moves: 0..32 bytes.  */
>         cmp     count, 16
>         b.lo    L(copy16)
> -       ldp     A_l, A_h, [src]
> -       ldp     D_l, D_h, [srcend, -16]
> -       stp     A_l, A_h, [dstin]
> -       stp     D_l, D_h, [dstend, -16]
> +       ldr     A_q, [src]
> +       ldr     B_q, [srcend, -16]
> +       str     A_q, [dstin]
> +       str     B_q, [dstend, -16]
>         ret
>
> -       .p2align 4
>  L(move_long):
>         /* Only use backward copy if there is an overlap.  */
>         sub     tmp1, dstin, src
> -       cbz     tmp1, L(copy0)
> +       cbz     tmp1, L(move0)
>         cmp     tmp1, count
>         b.hs    L(copy_long)
>
>         /* Large backwards copy for overlapping copies.
> -          Copy 16 bytes and then align dst to 16-byte alignment.  */
> -       ldp     D_l, D_h, [srcend, -16]
> -       and     tmp1, dstend, 15
> -       sub     srcend, srcend, tmp1
> +          Copy 16 bytes and then align srcend to 16-byte alignment.  */
> +L(copy_long_backwards):
> +       ldr     D_q, [srcend, -16]
> +       and     tmp1, srcend, 15
> +       bic     srcend, srcend, 15
>         sub     count, count, tmp1
> -       ldp     A_l, A_h, [srcend, -16]
> -       stp     D_l, D_h, [dstend, -16]
> -       ldp     B_l, B_h, [srcend, -32]
> -       ldp     C_l, C_h, [srcend, -48]
> -       ldp     D_l, D_h, [srcend, -64]!
> +       ldp     A_q, B_q, [srcend, -32]
> +       str     D_q, [dstend, -16]
> +       ldp     C_q, D_q, [srcend, -64]
>         sub     dstend, dstend, tmp1
>         subs    count, count, 128
>         b.ls    L(copy64_from_start)
>
>  L(loop64_backwards):
> -       stp     A_l, A_h, [dstend, -16]
> -       ldp     A_l, A_h, [srcend, -16]
> -       stp     B_l, B_h, [dstend, -32]
> -       ldp     B_l, B_h, [srcend, -32]
> -       stp     C_l, C_h, [dstend, -48]
> -       ldp     C_l, C_h, [srcend, -48]
> -       stp     D_l, D_h, [dstend, -64]!
> -       ldp     D_l, D_h, [srcend, -64]!
> +       str     B_q, [dstend, -16]
> +       str     A_q, [dstend, -32]
> +       ldp     A_q, B_q, [srcend, -96]
> +       str     D_q, [dstend, -48]
> +       str     C_q, [dstend, -64]!
> +       ldp     C_q, D_q, [srcend, -128]
> +       sub     srcend, srcend, 64
>         subs    count, count, 64
>         b.hi    L(loop64_backwards)
>
>         /* Write the last iteration and copy 64 bytes from the start.  */
>  L(copy64_from_start):
> -       ldp     G_l, G_h, [src, 48]
> -       stp     A_l, A_h, [dstend, -16]
> -       ldp     A_l, A_h, [src, 32]
> -       stp     B_l, B_h, [dstend, -32]
> -       ldp     B_l, B_h, [src, 16]
> -       stp     C_l, C_h, [dstend, -48]
> -       ldp     C_l, C_h, [src]
> -       stp     D_l, D_h, [dstend, -64]
> -       stp     G_l, G_h, [dstin, 48]
> -       stp     A_l, A_h, [dstin, 32]
> -       stp     B_l, B_h, [dstin, 16]
> -       stp     C_l, C_h, [dstin]
> +       ldp     E_q, F_q, [src, 32]
> +       stp     A_q, B_q, [dstend, -32]
> +       ldp     A_q, B_q, [src]
> +       stp     C_q, D_q, [dstend, -64]
> +       stp     E_q, F_q, [dstin, 32]
> +       stp     A_q, B_q, [dstin]
> +L(move0):
>         ret
>
>  END (MEMMOVE)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index bc5cde8add07b908178fb0271decc27f728f7a2e..7f2d85b0e5acc0a694e91b17fbccc0dba0ea339d 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -3,7 +3,6 @@ sysdep_routines += \
>    memchr_generic \
>    memchr_nosimd \
>    memcpy_a64fx \
> -  memcpy_advsimd \
>    memcpy_generic \
>    memcpy_sve \
>    memcpy_thunderx \
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 9c2542de38fb109b7c6f1db4aacee3a6b544fa3f..e7c4dcc0ed5a68ecd8dacc06256d0749b76912cb 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -36,7 +36,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    IFUNC_IMPL (i, name, memcpy,
>               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
>               IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
> -             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
>  #if HAVE_AARCH64_SVE_ASM
>               IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
>               IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
> @@ -45,7 +44,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    IFUNC_IMPL (i, name, memmove,
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
>               IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
> -             IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
>  #if HAVE_AARCH64_SVE_ASM
>               IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
>               IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index 5006b0594a476bcc149f2ae022bea50379d04908..1e08ce852e68409fd0eeb975edab77ebe8da8635 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -29,7 +29,6 @@
>  extern __typeof (__redirect_memcpy) __libc_memcpy;
>
>  extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
> -extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
> @@ -40,9 +39,6 @@ select_memcpy_ifunc (void)
>  {
>    INIT_ARCH ();
>
> -  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
> -    return __memcpy_simd;
> -
>    if (sve && HAVE_AARCH64_SVE_ASM)
>      {


This changes how neoverse-n2 is handled; is that expected?
That is, neoverse-n2 was returning __memcpy_simd before and now will be
returning __memcpy_sve, as n2 has SVE.

Thanks,
Andrew Pinski


>        if (IS_A64FX (midr))
> diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
> deleted file mode 100644
> index fe9beaf5ead47268867bee98acad3b17c554656a..0000000000000000000000000000000000000000
> --- a/sysdeps/aarch64/multiarch/memcpy_advsimd.S
> +++ /dev/null
> @@ -1,248 +0,0 @@
> -/* Generic optimized memcpy using SIMD.
> -   Copyright (C) 2020-2022 Free Software Foundation, Inc.
> -
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library.  If not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -
> -/* Assumptions:
> - *
> - * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> - *
> - */
> -
> -#define dstin  x0
> -#define src    x1
> -#define count  x2
> -#define dst    x3
> -#define srcend x4
> -#define dstend x5
> -#define A_l    x6
> -#define A_lw   w6
> -#define A_h    x7
> -#define B_l    x8
> -#define B_lw   w8
> -#define B_h    x9
> -#define C_lw   w10
> -#define tmp1   x14
> -
> -#define A_q    q0
> -#define B_q    q1
> -#define C_q    q2
> -#define D_q    q3
> -#define E_q    q4
> -#define F_q    q5
> -#define G_q    q6
> -#define H_q    q7
> -
> -
> -/* This implementation supports both memcpy and memmove and shares most code.
> -   It uses unaligned accesses and branchless sequences to keep the code small,
> -   simple and improve performance.
> -
> -   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> -   copies of up to 128 bytes, and large copies.  The overhead of the overlap
> -   check in memmove is negligible since it is only required for large copies.
> -
> -   Large copies use a software pipelined loop processing 64 bytes per
> -   iteration.  The destination pointer is 16-byte aligned to minimize
> -   unaligned accesses.  The loop tail is handled by always copying 64 bytes
> -   from the end.  */
> -
> -ENTRY (__memcpy_simd)
> -       PTR_ARG (0)
> -       PTR_ARG (1)
> -       SIZE_ARG (2)
> -
> -       add     srcend, src, count
> -       add     dstend, dstin, count
> -       cmp     count, 128
> -       b.hi    L(copy_long)
> -       cmp     count, 32
> -       b.hi    L(copy32_128)
> -
> -       /* Small copies: 0..32 bytes.  */
> -       cmp     count, 16
> -       b.lo    L(copy16)
> -       ldr     A_q, [src]
> -       ldr     B_q, [srcend, -16]
> -       str     A_q, [dstin]
> -       str     B_q, [dstend, -16]
> -       ret
> -
> -       /* Copy 8-15 bytes.  */
> -L(copy16):
> -       tbz     count, 3, L(copy8)
> -       ldr     A_l, [src]
> -       ldr     A_h, [srcend, -8]
> -       str     A_l, [dstin]
> -       str     A_h, [dstend, -8]
> -       ret
> -
> -       /* Copy 4-7 bytes.  */
> -L(copy8):
> -       tbz     count, 2, L(copy4)
> -       ldr     A_lw, [src]
> -       ldr     B_lw, [srcend, -4]
> -       str     A_lw, [dstin]
> -       str     B_lw, [dstend, -4]
> -       ret
> -
> -       /* Copy 0..3 bytes using a branchless sequence.  */
> -L(copy4):
> -       cbz     count, L(copy0)
> -       lsr     tmp1, count, 1
> -       ldrb    A_lw, [src]
> -       ldrb    C_lw, [srcend, -1]
> -       ldrb    B_lw, [src, tmp1]
> -       strb    A_lw, [dstin]
> -       strb    B_lw, [dstin, tmp1]
> -       strb    C_lw, [dstend, -1]
> -L(copy0):
> -       ret
> -
> -       .p2align 4
> -       /* Medium copies: 33..128 bytes.  */
> -L(copy32_128):
> -       ldp     A_q, B_q, [src]
> -       ldp     C_q, D_q, [srcend, -32]
> -       cmp     count, 64
> -       b.hi    L(copy128)
> -       stp     A_q, B_q, [dstin]
> -       stp     C_q, D_q, [dstend, -32]
> -       ret
> -
> -       .p2align 4
> -       /* Copy 65..128 bytes.  */
> -L(copy128):
> -       ldp     E_q, F_q, [src, 32]
> -       cmp     count, 96
> -       b.ls    L(copy96)
> -       ldp     G_q, H_q, [srcend, -64]
> -       stp     G_q, H_q, [dstend, -64]
> -L(copy96):
> -       stp     A_q, B_q, [dstin]
> -       stp     E_q, F_q, [dstin, 32]
> -       stp     C_q, D_q, [dstend, -32]
> -       ret
> -
> -       /* Align loop64 below to 16 bytes.  */
> -       nop
> -
> -       /* Copy more than 128 bytes.  */
> -L(copy_long):
> -       /* Copy 16 bytes and then align src to 16-byte alignment.  */
> -       ldr     D_q, [src]
> -       and     tmp1, src, 15
> -       bic     src, src, 15
> -       sub     dst, dstin, tmp1
> -       add     count, count, tmp1      /* Count is now 16 too large.  */
> -       ldp     A_q, B_q, [src, 16]
> -       str     D_q, [dstin]
> -       ldp     C_q, D_q, [src, 48]
> -       subs    count, count, 128 + 16  /* Test and readjust count.  */
> -       b.ls    L(copy64_from_end)
> -L(loop64):
> -       stp     A_q, B_q, [dst, 16]
> -       ldp     A_q, B_q, [src, 80]
> -       stp     C_q, D_q, [dst, 48]
> -       ldp     C_q, D_q, [src, 112]
> -       add     src, src, 64
> -       add     dst, dst, 64
> -       subs    count, count, 64
> -       b.hi    L(loop64)
> -
> -       /* Write the last iteration and copy 64 bytes from the end.  */
> -L(copy64_from_end):
> -       ldp     E_q, F_q, [srcend, -64]
> -       stp     A_q, B_q, [dst, 16]
> -       ldp     A_q, B_q, [srcend, -32]
> -       stp     C_q, D_q, [dst, 48]
> -       stp     E_q, F_q, [dstend, -64]
> -       stp     A_q, B_q, [dstend, -32]
> -       ret
> -
> -END (__memcpy_simd)
> -libc_hidden_builtin_def (__memcpy_simd)
> -
> -
> -ENTRY (__memmove_simd)
> -       PTR_ARG (0)
> -       PTR_ARG (1)
> -       SIZE_ARG (2)
> -
> -       add     srcend, src, count
> -       add     dstend, dstin, count
> -       cmp     count, 128
> -       b.hi    L(move_long)
> -       cmp     count, 32
> -       b.hi    L(copy32_128)
> -
> -       /* Small moves: 0..32 bytes.  */
> -       cmp     count, 16
> -       b.lo    L(copy16)
> -       ldr     A_q, [src]
> -       ldr     B_q, [srcend, -16]
> -       str     A_q, [dstin]
> -       str     B_q, [dstend, -16]
> -       ret
> -
> -L(move_long):
> -       /* Only use backward copy if there is an overlap.  */
> -       sub     tmp1, dstin, src
> -       cbz     tmp1, L(move0)
> -       cmp     tmp1, count
> -       b.hs    L(copy_long)
> -
> -       /* Large backwards copy for overlapping copies.
> -          Copy 16 bytes and then align srcend to 16-byte alignment.  */
> -L(copy_long_backwards):
> -       ldr     D_q, [srcend, -16]
> -       and     tmp1, srcend, 15
> -       bic     srcend, srcend, 15
> -       sub     count, count, tmp1
> -       ldp     A_q, B_q, [srcend, -32]
> -       str     D_q, [dstend, -16]
> -       ldp     C_q, D_q, [srcend, -64]
> -       sub     dstend, dstend, tmp1
> -       subs    count, count, 128
> -       b.ls    L(copy64_from_start)
> -
> -L(loop64_backwards):
> -       str     B_q, [dstend, -16]
> -       str     A_q, [dstend, -32]
> -       ldp     A_q, B_q, [srcend, -96]
> -       str     D_q, [dstend, -48]
> -       str     C_q, [dstend, -64]!
> -       ldp     C_q, D_q, [srcend, -128]
> -       sub     srcend, srcend, 64
> -       subs    count, count, 64
> -       b.hi    L(loop64_backwards)
> -
> -       /* Write the last iteration and copy 64 bytes from the start.  */
> -L(copy64_from_start):
> -       ldp     E_q, F_q, [src, 32]
> -       stp     A_q, B_q, [dstend, -32]
> -       ldp     A_q, B_q, [src]
> -       stp     C_q, D_q, [dstend, -64]
> -       stp     E_q, F_q, [dstin, 32]
> -       stp     A_q, B_q, [dstin]
> -L(move0):
> -       ret
> -
> -END (__memmove_simd)
> -libc_hidden_builtin_def (__memmove_simd)
> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> index 7dae8b7c956f9083d0896cc771cae79f4901581d..dbf1536525e614f72d3d74bb193015b303618357 100644
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -29,7 +29,6 @@
>  extern __typeof (__redirect_memmove) __libc_memmove;
>
>  extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
> -extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
> @@ -40,9 +39,6 @@ select_memmove_ifunc (void)
>  {
>    INIT_ARCH ();
>
> -  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
> -    return __memmove_simd;
> -
>    if (sve && HAVE_AARCH64_SVE_ASM)
>      {
>        if (IS_A64FX (midr))
>
Wilco Dijkstra Oct. 13, 2022, 12:28 p.m. UTC | #2
Hi Andrew,

> This changes how neoverse-n2 is handled, is that expected?
> That is neoverse-n2 was returning __memcpy_simd before and now will be
> returning __memcpy_sve as n2 has SVE.

Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
as a general rule is better than special-casing every CPU.

Cheers,
Wilco
Adhemerval Zanella Netto Oct. 13, 2022, 12:30 p.m. UTC | #3
On 13/10/22 09:28, Wilco Dijkstra via Libc-alpha wrote:
> Hi Andrew,
> 
>> This changes how neoverse-n2 is handled, is that expected?
>> That is neoverse-n2 was returning __memcpy_simd before and now will be
>> returning __memcpy_sve as n2 has SVE.
> 
> Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
> as a general rule is better than special casing every CPU.
> 
> Cheers,
> Wilco


Maybe move this change to a different patch?
Wilco Dijkstra Oct. 19, 2022, 12:31 p.m. UTC | #4
Hi Adhemerval,

On 13/10/22 09:28, Wilco Dijkstra via Libc-alpha wrote:
> Hi Andrew,
> 
>> This changes how neoverse-n2 is handled, is that expected?
>> That is neoverse-n2 was returning __memcpy_simd before and now will be
>> returning __memcpy_sve as n2 has SVE.
> 
> Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
> as a general rule is better than special casing every CPU.

> Maybe move this change to a different patch?

That if statement made no sense after the change, so I removed it altogether. Either
way, it doesn't seem large or important enough to warrant a separate patch. I could
add a note in the commit log, eg:

Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default.
If SVE is available, a SVE memcpy will be used by default (including Neoverse N2).

Cheers,
Wilco
Szabolcs Nagy Oct. 25, 2022, 12:55 p.m. UTC | #5
The 10/19/2022 12:31, Wilco Dijkstra via Libc-alpha wrote:
> Hi Adhemerval,
> 
> On 13/10/22 09:28, Wilco Dijkstra via Libc-alpha wrote:
> > Hi Andrew,
> > 
> >> This changes how neoverse-n2 is handled, is that expected?
> >> That is neoverse-n2 was returning __memcpy_simd before and now will be
> >> returning __memcpy_sve as n2 has SVE.
> > 
> > Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
> > as a general rule is better than special casing every CPU.
> 
> > Maybe move this change to a different patch?
> 
> That if statement made no sense after the change, so I removed it altogether. Either
> way, it doesn't seem large or important enough to warrant a separate patch. I could
> add a note in the commit log, eg:
> 
> Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default.
> If SVE is available, a SVE memcpy will be used by default (including Neoverse N2).

the patch is OK to commit with this note.
thanks.
diff mbox series

Patch

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 98d4e2c0e202eca13e1fd19ad8046cf61ad280ff..7b396b202fabf01b6ff2adc71a1038148e0b1054 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -1,4 +1,5 @@ 
-/* Copyright (C) 2012-2022 Free Software Foundation, Inc.
+/* Generic optimized memcpy using SIMD.
+   Copyright (C) 2012-2022 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
 
@@ -20,7 +21,7 @@ 
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  *
  */
 
@@ -36,21 +37,18 @@ 
 #define B_l	x8
 #define B_lw	w8
 #define B_h	x9
-#define C_l	x10
 #define C_lw	w10
-#define C_h	x11
-#define D_l	x12
-#define D_h	x13
-#define E_l	x14
-#define E_h	x15
-#define F_l	x16
-#define F_h	x17
-#define G_l	count
-#define G_h	dst
-#define H_l	src
-#define H_h	srcend
 #define tmp1	x14
 
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
+#define H_q	q7
+
 #ifndef MEMMOVE
 # define MEMMOVE memmove
 #endif
@@ -69,10 +67,9 @@ 
    Large copies use a software pipelined loop processing 64 bytes per
    iteration.  The destination pointer is 16-byte aligned to minimize
    unaligned accesses.  The loop tail is handled by always copying 64 bytes
-   from the end.
-*/
+   from the end.  */
 
-ENTRY_ALIGN (MEMCPY, 6)
+ENTRY (MEMCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
@@ -87,10 +84,10 @@  ENTRY_ALIGN (MEMCPY, 6)
 	/* Small copies: 0..32 bytes.  */
 	cmp	count, 16
 	b.lo	L(copy16)
-	ldp	A_l, A_h, [src]
-	ldp	D_l, D_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
 	ret
 
 	/* Copy 8-15 bytes.  */
@@ -102,7 +99,6 @@  L(copy16):
 	str	A_h, [dstend, -8]
 	ret
 
-	.p2align 3
 	/* Copy 4-7 bytes.  */
 L(copy8):
 	tbz	count, 2, L(copy4)
@@ -128,87 +124,69 @@  L(copy0):
 	.p2align 4
 	/* Medium copies: 33..128 bytes.  */
 L(copy32_128):
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	ldp	D_l, D_h, [srcend, -16]
+	ldp	A_q, B_q, [src]
+	ldp	C_q, D_q, [srcend, -32]
 	cmp	count, 64
 	b.hi	L(copy128)
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-	stp	D_l, D_h, [dstend, -16]
+	stp	A_q, B_q, [dstin]
+	stp	C_q, D_q, [dstend, -32]
 	ret
 
 	.p2align 4
 	/* Copy 65..128 bytes.  */
 L(copy128):
-	ldp	E_l, E_h, [src, 32]
-	ldp	F_l, F_h, [src, 48]
+	ldp	E_q, F_q, [src, 32]
 	cmp	count, 96
 	b.ls	L(copy96)
-	ldp	G_l, G_h, [srcend, -64]
-	ldp	H_l, H_h, [srcend, -48]
-	stp	G_l, G_h, [dstend, -64]
-	stp	H_l, H_h, [dstend, -48]
+	ldp	G_q, H_q, [srcend, -64]
+	stp	G_q, H_q, [dstend, -64]
 L(copy96):
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	E_l, E_h, [dstin, 32]
-	stp	F_l, F_h, [dstin, 48]
-	stp	C_l, C_h, [dstend, -32]
-	stp	D_l, D_h, [dstend, -16]
+	stp	A_q, B_q, [dstin]
+	stp	E_q, F_q, [dstin, 32]
+	stp	C_q, D_q, [dstend, -32]
 	ret
 
-	.p2align 4
+	/* Align loop64 below to 16 bytes.  */
+	nop
+
 	/* Copy more than 128 bytes.  */
 L(copy_long):
-	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
-	ldp	D_l, D_h, [src]
-	and	tmp1, dstin, 15
-	bic	dst, dstin, 15
-	sub	src, src, tmp1
+	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+	ldr	D_q, [src]
+	and	tmp1, src, 15
+	bic	src, src, 15
+	sub	dst, dstin, tmp1
 	add	count, count, tmp1	/* Count is now 16 too large.  */
-	ldp	A_l, A_h, [src, 16]
-	stp	D_l, D_h, [dstin]
-	ldp	B_l, B_h, [src, 32]
-	ldp	C_l, C_h, [src, 48]
-	ldp	D_l, D_h, [src, 64]!
+	ldp	A_q, B_q, [src, 16]
+	str	D_q, [dstin]
+	ldp	C_q, D_q, [src, 48]
 	subs	count, count, 128 + 16	/* Test and readjust count.  */
 	b.ls	L(copy64_from_end)
-
 L(loop64):
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [src, 16]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [src, 32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [src, 48]
-	stp	D_l, D_h, [dst, 64]!
-	ldp	D_l, D_h, [src, 64]!
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [src, 80]
+	stp	C_q, D_q, [dst, 48]
+	ldp	C_q, D_q, [src, 112]
+	add	src, src, 64
+	add	dst, dst, 64
 	subs	count, count, 64
 	b.hi	L(loop64)
 
 	/* Write the last iteration and copy 64 bytes from the end.  */
 L(copy64_from_end):
-	ldp	E_l, E_h, [srcend, -64]
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [srcend, -48]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [srcend, -16]
-	stp	D_l, D_h, [dst, 64]
-	stp	E_l, E_h, [dstend, -64]
-	stp	A_l, A_h, [dstend, -48]
-	stp	B_l, B_h, [dstend, -32]
-	stp	C_l, C_h, [dstend, -16]
+	ldp	E_q, F_q, [srcend, -64]
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [srcend, -32]
+	stp	C_q, D_q, [dst, 48]
+	stp	E_q, F_q, [dstend, -64]
+	stp	A_q, B_q, [dstend, -32]
 	ret
 
 END (MEMCPY)
 libc_hidden_builtin_def (MEMCPY)
 
-ENTRY_ALIGN (MEMMOVE, 4)
+
+ENTRY (MEMMOVE)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
@@ -220,64 +198,56 @@  ENTRY_ALIGN (MEMMOVE, 4)
 	cmp	count, 32
 	b.hi	L(copy32_128)
 
-	/* Small copies: 0..32 bytes.  */
+	/* Small moves: 0..32 bytes.  */
 	cmp	count, 16
 	b.lo	L(copy16)
-	ldp	A_l, A_h, [src]
-	ldp	D_l, D_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
 	ret
 
-	.p2align 4
 L(move_long):
 	/* Only use backward copy if there is an overlap.  */
 	sub	tmp1, dstin, src
-	cbz	tmp1, L(copy0)
+	cbz	tmp1, L(move0)
 	cmp	tmp1, count
 	b.hs	L(copy_long)
 
 	/* Large backwards copy for overlapping copies.
-	   Copy 16 bytes and then align dst to 16-byte alignment.  */
-	ldp	D_l, D_h, [srcend, -16]
-	and	tmp1, dstend, 15
-	sub	srcend, srcend, tmp1
+	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+	ldr	D_q, [srcend, -16]
+	and	tmp1, srcend, 15
+	bic	srcend, srcend, 15
 	sub	count, count, tmp1
-	ldp	A_l, A_h, [srcend, -16]
-	stp	D_l, D_h, [dstend, -16]
-	ldp	B_l, B_h, [srcend, -32]
-	ldp	C_l, C_h, [srcend, -48]
-	ldp	D_l, D_h, [srcend, -64]!
+	ldp	A_q, B_q, [srcend, -32]
+	str	D_q, [dstend, -16]
+	ldp	C_q, D_q, [srcend, -64]
 	sub	dstend, dstend, tmp1
 	subs	count, count, 128
 	b.ls	L(copy64_from_start)
 
 L(loop64_backwards):
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [srcend, -48]
-	stp	D_l, D_h, [dstend, -64]!
-	ldp	D_l, D_h, [srcend, -64]!
+	str	B_q, [dstend, -16]
+	str	A_q, [dstend, -32]
+	ldp	A_q, B_q, [srcend, -96]
+	str	D_q, [dstend, -48]
+	str	C_q, [dstend, -64]!
+	ldp	C_q, D_q, [srcend, -128]
+	sub	srcend, srcend, 64
 	subs	count, count, 64
 	b.hi	L(loop64_backwards)
 
 	/* Write the last iteration and copy 64 bytes from the start.  */
 L(copy64_from_start):
-	ldp	G_l, G_h, [src, 48]
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [src, 32]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [src, 16]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [src]
-	stp	D_l, D_h, [dstend, -64]
-	stp	G_l, G_h, [dstin, 48]
-	stp	A_l, A_h, [dstin, 32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstin]
+	ldp	E_q, F_q, [src, 32]
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [src]
+	stp	C_q, D_q, [dstend, -64]
+	stp	E_q, F_q, [dstin, 32]
+	stp	A_q, B_q, [dstin]
+L(move0):
 	ret
 
 END (MEMMOVE)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index bc5cde8add07b908178fb0271decc27f728f7a2e..7f2d85b0e5acc0a694e91b17fbccc0dba0ea339d 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -3,7 +3,6 @@  sysdep_routines += \
   memchr_generic \
   memchr_nosimd \
   memcpy_a64fx \
-  memcpy_advsimd \
   memcpy_generic \
   memcpy_sve \
   memcpy_thunderx \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 9c2542de38fb109b7c6f1db4aacee3a6b544fa3f..e7c4dcc0ed5a68ecd8dacc06256d0749b76912cb 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -36,7 +36,6 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memcpy,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
@@ -45,7 +44,6 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
-	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
 	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 5006b0594a476bcc149f2ae022bea50379d04908..1e08ce852e68409fd0eeb975edab77ebe8da8635 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,7 +29,6 @@ 
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
@@ -40,9 +39,6 @@  select_memcpy_ifunc (void)
 {
   INIT_ARCH ();
 
-  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
-    return __memcpy_simd;
-
   if (sve && HAVE_AARCH64_SVE_ASM)
     {
       if (IS_A64FX (midr))
diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
deleted file mode 100644
index fe9beaf5ead47268867bee98acad3b17c554656a..0000000000000000000000000000000000000000
--- a/sysdeps/aarch64/multiarch/memcpy_advsimd.S
+++ /dev/null
@@ -1,248 +0,0 @@ 
-/* Generic optimized memcpy using SIMD.
-   Copyright (C) 2020-2022 Free Software Foundation, Inc.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
- *
- */
-
-#define dstin	x0
-#define src	x1
-#define count	x2
-#define dst	x3
-#define srcend	x4
-#define dstend	x5
-#define A_l	x6
-#define A_lw	w6
-#define A_h	x7
-#define B_l	x8
-#define B_lw	w8
-#define B_h	x9
-#define C_lw	w10
-#define tmp1	x14
-
-#define A_q	q0
-#define B_q	q1
-#define C_q	q2
-#define D_q	q3
-#define E_q	q4
-#define F_q	q5
-#define G_q	q6
-#define H_q	q7
-
-
-/* This implementation supports both memcpy and memmove and shares most code.
-   It uses unaligned accesses and branchless sequences to keep the code small,
-   simple and improve performance.
-
-   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
-   copies of up to 128 bytes, and large copies.  The overhead of the overlap
-   check in memmove is negligible since it is only required for large copies.
-
-   Large copies use a software pipelined loop processing 64 bytes per
-   iteration.  The destination pointer is 16-byte aligned to minimize
-   unaligned accesses.  The loop tail is handled by always copying 64 bytes
-   from the end.  */
-
-ENTRY (__memcpy_simd)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
-	add	srcend, src, count
-	add	dstend, dstin, count
-	cmp	count, 128
-	b.hi	L(copy_long)
-	cmp	count, 32
-	b.hi	L(copy32_128)
-
-	/* Small copies: 0..32 bytes.  */
-	cmp	count, 16
-	b.lo	L(copy16)
-	ldr	A_q, [src]
-	ldr	B_q, [srcend, -16]
-	str	A_q, [dstin]
-	str	B_q, [dstend, -16]
-	ret
-
-	/* Copy 8-15 bytes.  */
-L(copy16):
-	tbz	count, 3, L(copy8)
-	ldr	A_l, [src]
-	ldr	A_h, [srcend, -8]
-	str	A_l, [dstin]
-	str	A_h, [dstend, -8]
-	ret
-
-	/* Copy 4-7 bytes.  */
-L(copy8):
-	tbz	count, 2, L(copy4)
-	ldr	A_lw, [src]
-	ldr	B_lw, [srcend, -4]
-	str	A_lw, [dstin]
-	str	B_lw, [dstend, -4]
-	ret
-
-	/* Copy 0..3 bytes using a branchless sequence.  */
-L(copy4):
-	cbz	count, L(copy0)
-	lsr	tmp1, count, 1
-	ldrb	A_lw, [src]
-	ldrb	C_lw, [srcend, -1]
-	ldrb	B_lw, [src, tmp1]
-	strb	A_lw, [dstin]
-	strb	B_lw, [dstin, tmp1]
-	strb	C_lw, [dstend, -1]
-L(copy0):
-	ret
-
-	.p2align 4
-	/* Medium copies: 33..128 bytes.  */
-L(copy32_128):
-	ldp	A_q, B_q, [src]
-	ldp	C_q, D_q, [srcend, -32]
-	cmp	count, 64
-	b.hi	L(copy128)
-	stp	A_q, B_q, [dstin]
-	stp	C_q, D_q, [dstend, -32]
-	ret
-
-	.p2align 4
-	/* Copy 65..128 bytes.  */
-L(copy128):
-	ldp	E_q, F_q, [src, 32]
-	cmp	count, 96
-	b.ls	L(copy96)
-	ldp	G_q, H_q, [srcend, -64]
-	stp	G_q, H_q, [dstend, -64]
-L(copy96):
-	stp	A_q, B_q, [dstin]
-	stp	E_q, F_q, [dstin, 32]
-	stp	C_q, D_q, [dstend, -32]
-	ret
-
-	/* Align loop64 below to 16 bytes.  */
-	nop
-
-	/* Copy more than 128 bytes.  */
-L(copy_long):
-	/* Copy 16 bytes and then align src to 16-byte alignment.  */
-	ldr	D_q, [src]
-	and	tmp1, src, 15
-	bic	src, src, 15
-	sub	dst, dstin, tmp1
-	add	count, count, tmp1	/* Count is now 16 too large.  */
-	ldp	A_q, B_q, [src, 16]
-	str	D_q, [dstin]
-	ldp	C_q, D_q, [src, 48]
-	subs	count, count, 128 + 16	/* Test and readjust count.  */
-	b.ls	L(copy64_from_end)
-L(loop64):
-	stp	A_q, B_q, [dst, 16]
-	ldp	A_q, B_q, [src, 80]
-	stp	C_q, D_q, [dst, 48]
-	ldp	C_q, D_q, [src, 112]
-	add	src, src, 64
-	add	dst, dst, 64
-	subs	count, count, 64
-	b.hi	L(loop64)
-
-	/* Write the last iteration and copy 64 bytes from the end.  */
-L(copy64_from_end):
-	ldp	E_q, F_q, [srcend, -64]
-	stp	A_q, B_q, [dst, 16]
-	ldp	A_q, B_q, [srcend, -32]
-	stp	C_q, D_q, [dst, 48]
-	stp	E_q, F_q, [dstend, -64]
-	stp	A_q, B_q, [dstend, -32]
-	ret
-
-END (__memcpy_simd)
-libc_hidden_builtin_def (__memcpy_simd)
-
-
-ENTRY (__memmove_simd)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
-	add	srcend, src, count
-	add	dstend, dstin, count
-	cmp	count, 128
-	b.hi	L(move_long)
-	cmp	count, 32
-	b.hi	L(copy32_128)
-
-	/* Small moves: 0..32 bytes.  */
-	cmp	count, 16
-	b.lo	L(copy16)
-	ldr	A_q, [src]
-	ldr	B_q, [srcend, -16]
-	str	A_q, [dstin]
-	str	B_q, [dstend, -16]
-	ret
-
-L(move_long):
-	/* Only use backward copy if there is an overlap.  */
-	sub	tmp1, dstin, src
-	cbz	tmp1, L(move0)
-	cmp	tmp1, count
-	b.hs	L(copy_long)
-
-	/* Large backwards copy for overlapping copies.
-	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
-L(copy_long_backwards):
-	ldr	D_q, [srcend, -16]
-	and	tmp1, srcend, 15
-	bic	srcend, srcend, 15
-	sub	count, count, tmp1
-	ldp	A_q, B_q, [srcend, -32]
-	str	D_q, [dstend, -16]
-	ldp	C_q, D_q, [srcend, -64]
-	sub	dstend, dstend, tmp1
-	subs	count, count, 128
-	b.ls	L(copy64_from_start)
-
-L(loop64_backwards):
-	str	B_q, [dstend, -16]
-	str	A_q, [dstend, -32]
-	ldp	A_q, B_q, [srcend, -96]
-	str	D_q, [dstend, -48]
-	str	C_q, [dstend, -64]!
-	ldp	C_q, D_q, [srcend, -128]
-	sub	srcend, srcend, 64
-	subs	count, count, 64
-	b.hi	L(loop64_backwards)
-
-	/* Write the last iteration and copy 64 bytes from the start.  */
-L(copy64_from_start):
-	ldp	E_q, F_q, [src, 32]
-	stp	A_q, B_q, [dstend, -32]
-	ldp	A_q, B_q, [src]
-	stp	C_q, D_q, [dstend, -64]
-	stp	E_q, F_q, [dstin, 32]
-	stp	A_q, B_q, [dstin]
-L(move0):
-	ret
-
-END (__memmove_simd)
-libc_hidden_builtin_def (__memmove_simd)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 7dae8b7c956f9083d0896cc771cae79f4901581d..dbf1536525e614f72d3d74bb193015b303618357 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,7 +29,6 @@ 
 extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
@@ -40,9 +39,6 @@  select_memmove_ifunc (void)
 {
   INIT_ARCH ();
 
-  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
-    return __memmove_simd;
-
   if (sve && HAVE_AARCH64_SVE_ASM)
     {
       if (IS_A64FX (midr))