[AArch64] Optimized memcpy/memmove

Message ID	002901d0f794$66138480$323a8d80$@com
State	New
Headers	show Return-Path: <libc-alpha-return-63479-incoming=patchwork.ozlabs.org@sourceware.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:subject:date:message-id:mime-version :content-type; q=dns; s=default; b=d2iLIx/6urPH073qWWwTLo5Au0NWw m0a/a8blpV1sXBFhdC2bUeHXPBftk+25o6xTQxlwn5BGEJKWd8mUL4Y3KekHAIAy qWSRQMfSItpCKtHPCHCtz/9x/lDe8ZFwNZC6Hl651XzXL/HiwduT71EfOg6WYqU+ RS58I/sgiqe+Tg= Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org From: "Wilco Dijkstra" <wdijkstr@arm.com> To: "'GNU C Library'" <libc-alpha@sourceware.org> Subject: [PATCH][AArch64] Optimized memcpy/memmove Date: Fri, 25 Sep 2015 14:16:33 +0100 Message-ID: <002901d0f794$66138480$323a8d80$@com> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="----=_NextPart_000_002A_01D0F79C.C7D7EC80"

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index b3d550e..51e7268 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -9,168 +9,236 @@ The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library. If not, see <http://www.gnu.org/licenses/>. */ +#include <sysdep.h> + /* Assumptions: * - * ARMv8-a, AArch64 - * Unaligned accesses + * ARMv8-a, AArch64, unaligned accesses. * */ #define dstin x0 #define src x1 #define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define G_l count +#define G_h dst +#define tmp1 x14 -#include <sysdep.h> +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use an unrolled loop + processing 64 bytes per iteration. + In order to share code with memmove, small and medium copies read all + data before writing, allowing any kind of overlap. So small, medium + and large backwards memmoves are handled by falling through into memcpy. + Overlapping large forward memmoves use a loop that copies backwards. +*/ -ENTRY_ALIGN (memcpy, 6) - - mov dst, dstin - cmp count, #64 - b.ge L(cpy_not_short) - cmp count, #15 - b.le L(tail15tiny) - - /* Deal with small copies quickly by dropping straight into the - * exit block. */ -L(tail63): - /* Copy up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15) - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] - -L(tail15): - ands count, count, #15 - beq 1f - add src, src, count - ldp A_l, A_h, [src, #-16] - add dst, dst, count - stp A_l, A_h, [dst, #-16] -1: - RET - -L(tail15tiny): - /* Copy up to 15 bytes of data. Does not assume additional data - being copied. */ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 +ENTRY_ALIGN (memmove, 6) + + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + /* Common case falls through into memcpy. */ +END (memmove) +libc_hidden_builtin_def (memmove) +ENTRY (memcpy) + + add srcend, src, count + add dstend, dstin, count + cmp count, 96 + b.hi L(copy_long) + cmp count, 16 + b.hs L(copy_medium) + + /* Small copies: 0..16 bytes. */ +L(copy16): + tbz count, 3, 1f + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret 1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 + tbz count, 2, 1f + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + .p2align 4 1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 + cbz count, 2f + ldrb A_lw, [src] + tbz count, 1, 1f + ldrh A_hw, [srcend, -2] + strh A_hw, [dstend, -2] +1: strb A_lw, [dstin] +2: ret + + .p2align 4 + /* Medium copies: 17..96 bytes. */ +L(copy_medium): + ldp A_l, A_h, [src] + tbnz count, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz count, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] 1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [src, 32] + ldp D_l, D_h, [src, 48] + ldp E_l, E_h, [srcend, -32] + ldp F_l, F_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin, 32] + stp D_l, D_h, [dstin, 48] + stp E_l, E_h, [dstend, -32] + stp F_l, F_h, [dstend, -16] + ret + + /* Align DST to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls 2f 1: - RET - -L(cpy_not_short): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Copy more data than needed; it's faster than jumping - * around copying sub-Quadword quantities. We know that - * it can't overrun. */ - ldp A_l, A_h, [src] - add src, src, tmp2 - stp A_l, A_h, [dst] - add dst, dst, tmp2 - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63) + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. */ 2: - subs count, count, #128 - b.ge L(cpy_body_large) - /* Less than 128 bytes to copy, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne L(tail63) - RET - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(cpy_body_large): - /* There are at least 128 bytes to copy. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 +L(move_long): + cbz tmp1, 3f + + add srcend, src, count + add dstend, dstin, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + + nop 1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne L(tail63) - RET + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ +2: + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + END (memcpy) libc_hidden_builtin_def (memcpy) diff --git a/sysdeps/aarch64/memmove.S b/sysdeps/aarch64/memmove.S index 8d0b328..e531b14 100644 --- a/sysdeps/aarch64/memmove.S +++ b/sysdeps/aarch64/memmove.S @@ -1,312 +1,3 @@ -/* Copyright (C) 2012-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. +/* memmove is part of memcpy.S. */ - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* Assumptions: - * - * ARMv8-a, AArch64 - * Unaligned accesses - */ - -/* Parameters and result. */ -#define dstin x0 -#define src x1 -#define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 - -ENTRY_ALIGN (memmove, 6) - - cmp dstin, src - b.lo L(downwards) - add tmp1, src, count - cmp dstin, tmp1 - b.hs memcpy /* No overlap. */ - - /* Upwards move with potential overlap. - * Need to move from the tail backwards. SRC and DST point one - * byte beyond the remaining data to move. */ - add dst, dstin, count - add src, src, count - cmp count, #64 - b.ge L(mov_not_short_up) - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -L(tail63up): - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15up) - sub dst, dst, tmp1 - sub src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #32] - stp A_l, A_h, [dst, #32] -1: - ldp A_l, A_h, [src, #16] - stp A_l, A_h, [dst, #16] -2: - ldp A_l, A_h, [src] - stp A_l, A_h, [dst] -L(tail15up): - /* Move up to 15 bytes of data. Does not assume additional data - * being moved. */ - tbz count, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz count, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz count, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz count, #0, 1f - ldrb tmp1w, [src, #-1] - strb tmp1w, [dst, #-1] -1: - RET - -L(mov_not_short_up): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - ands tmp2, src, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. */ - tbz tmp2, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src, #-1]! - strb tmp1w, [dst, #-1]! -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63up) -2: - subs count, count, #128 - b.ge L(mov_body_large_up) - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src, #-64]! - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst, #-64]! - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - b.ne L(tail63up) - RET - - /* Critical loop. Start at a new Icache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(mov_body_large_up): - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - ldp D_l, D_h, [src, #-64]! -1: - stp A_l, A_h, [dst, #-16] - ldp A_l, A_h, [src, #-16] - stp B_l, B_h, [dst, #-32] - ldp B_l, B_h, [src, #-32] - stp C_l, C_h, [dst, #-48] - ldp C_l, C_h, [src, #-48] - stp D_l, D_h, [dst, #-64]! - ldp D_l, D_h, [src, #-64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #-16] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - stp D_l, D_h, [dst, #-64]! - tst count, #0x3f - b.ne L(tail63up) - RET - -L(downwards): - /* For a downwards move we can safely use memcpy provided that - * DST is more than 16 bytes away from SRC. */ - sub tmp1, src, #16 - cmp dstin, tmp1 - b.ls memcpy /* May overlap, but not critically. */ - - mov dst, dstin /* Preserve DSTIN for return value. */ - cmp count, #64 - b.ge L(mov_not_short_down) - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -L(tail63down): - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15down) - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] -L(tail15down): - /* Move up to 15 bytes of data. Does not assume additional data - being moved. */ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] -1: - RET - -L(mov_not_short_down): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. */ - tbz tmp2, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src], #1 - strb tmp1w, [dst], #1 -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63down) -2: - subs count, count, #128 - b.ge L(mov_body_large_down) - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne L(tail63down) - RET - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(mov_body_large_down): - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ -1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne L(tail63down) - RET -END (memmove) - -libc_hidden_builtin_def (memmove)

[AArch64] Optimized memcpy/memmove

Commit Message

Comments

Patch