@@ -1,3 +1,3 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor
endif
@@ -25,7 +25,7 @@
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 2
+#define MAX_IFUNC 3
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -40,6 +40,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
IFUNC_IMPL (i, name, memcpy,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
@@ -30,9 +30,14 @@ extern __typeof (__redirect_memcpy) __libc_memcpy;
extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
libc_ifunc (__libc_memcpy,
- IS_THUNDERX (midr) ? __memcpy_thunderx : __memcpy_generic);
+ (IS_THUNDERX (midr)
+ ? __memcpy_thunderx
+ : (IS_FALKOR (midr)
+ ? __memcpy_falkor
+ : __memcpy_generic)));
# undef memcpy
strong_alias (__libc_memcpy, memcpy);
new file mode 100644
@@ -0,0 +1,294 @@
+/* An optimized memcpy implementation for the Qualcomm Falkor processor (AArch64).
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define tmp1 x14
+#define res x15
+
+#include <sysdep.h>
+
+/* Store the 64 bytes most recently loaded into A-D, load the next 64
+ bytes (the final pair uses pre-index writeback, advancing DST and SRC
+ by 64), subtract 64 from COUNT and branch to LABEL on COND. */
+.macro copy_line_and_branch cond, label
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.\cond \label
+.endm
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+ medium copies of 17..96 bytes which are fully unrolled, and large
+ copies of more than 96 bytes which align the destination and use an
+ unrolled loop processing 64 bytes per iteration.
+ Small and medium copies read all of the data before writing, allowing
+ any kind of overlap, which is why memmove tailcalls memcpy for these
+ cases as well as for non-overlapping copies. */
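+/* For illustration (values chosen arbitrarily): count == 8 takes the
+ small-copy path below, count == 40 the fully unrolled medium path,
+ and count == 4096 the aligned copy_long loop. */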
+
+ENTRY_ALIGN (__memcpy_falkor, 6)
+
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
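+ /* For instance (illustrative): with count == 2, tmp1 == 1, so the
+ first byte is stored at dstin, the second byte at dstin + tmp1, and
+ the second byte again at dstend - 1, which is the same address. */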
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 65..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
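+ /* Illustration with hypothetical values: if dstin == 0x1009 then
+ tmp1 == 9 and dst == 0x1000; the unaligned stp covers
+ 0x1009..0x1018 and the loop's first store at dst + 16 == 0x1010 is
+ 16-byte aligned, rewriting the overlapping bytes with identical
+ data since src is moved back by tmp1 as well. */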
+
+ .p2align 4
+L(copy_long):
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls 5f
+
+ /* Unroll the copy for 512 bytes. We do this so that smaller copies
+ don't get penalized by the extra checks we do for larger sizes
+ further down. */
+ copy_line_and_branch ls, 5f
+ copy_line_and_branch ls, 5f
+ copy_line_and_branch ls, 5f
+ copy_line_and_branch ls, 5f
+ copy_line_and_branch ls, 5f
+ copy_line_and_branch ls, 5f
+ copy_line_and_branch ls, 5f
+ copy_line_and_branch ls, 5f
+
+ /* If 2048 bytes or less remain, fall through to the final loop and
+ finish off the copy. Otherwise, hold back the last 2048 bytes for
+ the final loop and try a couple of prefetching loops to optimize
+ cache usage. */
+ subs count, count, 2048
+ b.hi 3f
+
+6:
+ /* 2048 bytes or less remaining, adjust COUNT and copy 64 bytes at a
+ time. */
+ add count, count, 2048
+4:
+ copy_line_and_branch hi, 4b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+5:
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ /* Find an offset within the source that operates on a memory bank
+ other than the one used by the source and destination pointers. If
+ we find one within the 1K to 4K range, then we can prefetch at two
+ offsets to stream more data in from the source. This computation
+ formula is provided by Qualcomm. */
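+ /* Illustrative example with hypothetical values: if src - dst is
+ 0x1800 then res == (0x1800 - 1024) & 0x7ff == 1024, and after the
+ 2048 adjustment at label 7 the second prefetch below runs at
+ src + 3072 on each iteration. */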
+3:
+ sub res, src, dst
+ sub res, res, 1024
+ and res, res, 0x7ff
+ subs tmp1, count, res
+ b.hi 7f
+ /* We still have more than 2K bytes remaining, so copy 128 bytes at a
+ time, prefetching at 2K-128 on every iteration until 2K or less is
+ left. That way we make future data available in L1 while limiting
+ the prefetch to within the source data. */
+2:
+ prfm PLDL1STRM, [src, 1920]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]
+ ldp D_l, D_h, [src, 64]
+ stp A_l, A_h, [dst, 80]
+ ldp A_l, A_h, [src, 80]
+ stp B_l, B_h, [dst, 96]
+ ldp B_l, B_h, [src, 96]
+ stp C_l, C_h, [dst, 112]
+ ldp C_l, C_h, [src, 112]
+ stp D_l, D_h, [dst, 128]!
+ ldp D_l, D_h, [src, 128]!
+ subs count, count, 128
+ b.hi 2b
+ b 6b
+
+7:
+ add res, res, 2048
+
+ /* We found an appropriate offset. Copy 128 bytes at a time,
+ prefetching at 2K and the computed offset while the computed offset
+ is within the source data. */
+1:
+ prfm PLDL1STRM, [src, 2048]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]
+ ldp D_l, D_h, [src, 64]
+ prfm PLDL1STRM, [src, res]
+ stp A_l, A_h, [dst, 80]
+ ldp A_l, A_h, [src, 80]
+ stp B_l, B_h, [dst, 96]
+ ldp B_l, B_h, [src, 96]
+ stp C_l, C_h, [dst, 112]
+ ldp C_l, C_h, [src, 112]
+ stp D_l, D_h, [dst, 128]!
+ ldp D_l, D_h, [src, 128]!
+ subs tmp1, tmp1, 128
+ b.hi 1b
+
+ /* Update COUNT once the loop is done. First subtract the 2048 that
+ was added to RES for the second prefetch offset, restoring its
+ original value. The loop above will have copied
+
+ (COUNT - RES) - TMP1
+
+ bytes, so the remaining size is COUNT - (COUNT - RES - TMP1),
+
+ i.e. RES + TMP1. This keeps COUNT 2048 smaller than the number of
+ bytes actually left, matching the earlier adjustment. If the new
+ COUNT is non-positive then 2048 bytes or less remain and we branch
+ back to the final 64-byte loop via label 6. Otherwise we fall back
+ to the single-prefetch loop at label 2 before finishing in the
+ final loop. */
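+ /* Worked example (illustrative): COUNT == 3000 and RES == 1024 give
+ an initial TMP1 of 1976; the loop above runs 16 times, copying 2048
+ bytes and leaving TMP1 == -72, so the new COUNT is 952 and
+ 952 + 2048 bytes remain. */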
+ sub res, res, 2048
+ adds count, res, tmp1
+ b.ls 6b
+ b 2b
+
+END (__memcpy_falkor)
+libc_hidden_builtin_def (__memcpy_falkor)
@@ -41,6 +41,9 @@
#define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
&& MIDR_PARTNUM(midr) == 0x0a1)
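+/* MIDR values for the Qualcomm Falkor core: implementer code 'Q'
+ (0x51) and part number 0xc00. */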
+#define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q' \
+ && MIDR_PARTNUM(midr) == 0xc00)
+
struct cpu_features
{
uint64_t midr_el1;