@@ -78,15 +78,18 @@ enum cf_protection_level
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
/* Define an entry point visible from C. */
-#define ENTRY(name) \
- .globl C_SYMBOL_NAME(name); \
- .type C_SYMBOL_NAME(name),@function; \
- .align ALIGNARG(4); \
+#define P2ALIGN_ENTRY(name, alignment) \
+ .globl C_SYMBOL_NAME(name); \
+ .type C_SYMBOL_NAME(name),@function; \
+ .align ALIGNARG(alignment); \
C_LABEL(name) \
cfi_startproc; \
- _CET_ENDBR; \
+ _CET_ENDBR; \
CALL_MCOUNT
+#define ENTRY(name) P2ALIGN_ENTRY(name, 4)
+
+
#undef END
#define END(name) \
cfi_endproc; \
@@ -165,6 +165,32 @@
# error Invalid LARGE_LOAD_SIZE
#endif
+/* Whether to align before movsb. Ultimately we want 64 byte alignment
+   and it is not worth loading 4x VEC for VEC_SIZE == 16. */
+#define ALIGN_MOVSB (VEC_SIZE > 16)
+
+/* Number of VECs to align movsb to. */
+#if VEC_SIZE == 64
+# define MOVSB_ALIGN_TO (VEC_SIZE)
+#else
+# define MOVSB_ALIGN_TO (VEC_SIZE * 2)
+#endif
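+/* When ALIGN_MOVSB this is always 64 bytes, which the 1x or 2x VEC
+   loads stored after rep movsb can fully cover. */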
+
+/* Macro for copying an inclusive power-of-2 size range [len, 2 * len]
+   with two register loads and two stores. */
+#define COPY_BLOCK(mov_inst, src_reg, dst_reg, size_reg, len, tmp_reg0, tmp_reg1) \
+ mov_inst (%src_reg), %tmp_reg0; \
+ mov_inst -(len)(%src_reg, %size_reg), %tmp_reg1; \
+ mov_inst %tmp_reg0, (%dst_reg); \
+ mov_inst %tmp_reg1, -(len)(%dst_reg, %size_reg);
+
+/* Define all copies used by L(less_vec) for VEC_SIZE of 16, 32, or
+ 64. */
+#define COPY_4_8 COPY_BLOCK(movl, rsi, rdi, rdx, 4, ecx, esi)
+#define COPY_8_16 COPY_BLOCK(movq, rsi, rdi, rdx, 8, rcx, rsi)
+#define COPY_16_32 COPY_BLOCK(vmovdqu, rsi, rdi, rdx, 16, xmm0, xmm1)
+#define COPY_32_64 COPY_BLOCK(vmovdqu64, rsi, rdi, rdx, 32, ymm16, ymm17)
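+
+/* For reference, COPY_4_8 expands to roughly:
+	movl	(%rsi), %ecx
+	movl	-4(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, -4(%rdi, %rdx)
+   copying any length in [4, 8]; the two halves overlap when the
+   length is below 8. */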
+
#ifndef SECTION
# error SECTION is not defined!
#endif
@@ -198,7 +224,13 @@ L(start):
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
+	/* Based on the SPEC2017 distribution, memcpy calls of size 16 and
+	   size 32 are both really hot, so we want them to take the same
+	   branch path. */
+#if VEC_SIZE > 16
+ jbe L(less_vec)
+#else
jb L(less_vec)
+#endif
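+	/* (For VEC_SIZE == 32 this means a copy of exactly 32 bytes goes
+	   through L(less_vec) and falls through to COPY_16_32, the same
+	   path a 16 byte copy takes.) */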
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
@@ -206,15 +238,10 @@ L(last_2x_vec):
#endif
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
- ret
-#else
+ VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
VZEROUPPER_RETURN
-#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
@@ -289,7 +316,9 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+/* Cache align the entry so that the branch heavy L(less_vec) maintains
+   good alignment. */
+P2ALIGN_ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
movq %rdi, %rax
L(start_erms):
# ifdef __ILP32__
@@ -297,123 +326,217 @@ L(start_erms):
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
+	/* Based on the SPEC2017 distribution, memcpy calls of size 16 and
+	   size 32 are both really hot, so we want them to take the same
+	   branch path. */
+# if VEC_SIZE > 16
+ jbe L(less_vec)
+# else
jb L(less_vec)
+# endif
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
L(last_2x_vec):
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
ret
+# endif
#endif
+#if VEC_SIZE == 64
+L(copy_8_15):
+ COPY_8_16
+ ret
-L(movsb):
- cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
- jae L(more_8x_vec)
- cmpq %rsi, %rdi
- jb 1f
- /* Source == destination is less common. */
- je L(nop)
- leaq (%rsi,%rdx), %r9
- cmpq %r9, %rdi
- /* Avoid slow backward REP MOVSB. */
- jb L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
- andl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
- jz 3f
- movq %rdi, %rcx
- subq %rsi, %rcx
- jmp 2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
- andl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
- jz 3f
- movq %rsi, %rcx
- subq %rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
- is N*4GB + [1..63] with N >= 0. */
- cmpl $63, %ecx
- jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
-3:
-# endif
- mov %RDX_LP, %RCX_LP
- rep movsb
-L(nop):
+L(copy_33_63):
+ COPY_32_64
ret
#endif
-
+	/* Only worth aligning if we are near the end of a 16 byte block and
+	   the first branch would otherwise not be in the first decode block
+	   after the jump. */
+ .p2align 4,, 6
L(less_vec):
- /* Less than 1 VEC. */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
-#if VEC_SIZE > 32
- cmpb $32, %dl
- jae L(between_32_63)
+ /* Second set of branches for smallest copies. */
+ cmpl $(VEC_SIZE / 4), %edx
+ jb L(less_quarter_vec)
+
+ cmpl $(VEC_SIZE / 2), %edx
+#if VEC_SIZE == 64
+	/* Branch to [33, 63] instead of [16, 32] so that the hotter
+	   [16, 32] case gets the fall through path. */
+ ja L(copy_33_63)
+ COPY_16_32
+#elif VEC_SIZE == 32
+ /* Branch to [8, 15]. Fall through to [16, 32]. */
+ jb L(copy_8_15)
+ COPY_16_32
+#else
+ /* Branch to [4, 7]. Fall through to [8, 15]. */
+ jb L(copy_4_7)
+ COPY_8_16
#endif
-#if VEC_SIZE > 16
- cmpb $16, %dl
- jae L(between_16_31)
-#endif
- cmpb $8, %dl
- jae L(between_8_15)
- cmpb $4, %dl
- jae L(between_4_7)
- cmpb $1, %dl
- ja L(between_2_3)
- jb 1f
+ ret
+	/* Align if it won't cost too many bytes. */
+ .p2align 4,, 6
+L(copy_4_7):
+ COPY_4_8
+ ret
+
+ /* Cold target. No need to align. */
+L(copy_1):
movzbl (%rsi), %ecx
movb %cl, (%rdi)
-1:
ret
+
+ /* Colder copy case for [0, VEC_SIZE / 4 - 1]. */
+L(less_quarter_vec):
#if VEC_SIZE > 32
-L(between_32_63):
- /* From 32 to 63. No branch when size == 32. */
- VMOVU (%rsi), %YMM0
- VMOVU -32(%rsi,%rdx), %YMM1
- VMOVU %YMM0, (%rdi)
- VMOVU %YMM1, -32(%rdi,%rdx)
- VZEROUPPER_RETURN
+ cmpl $8, %edx
+ jae L(copy_8_15)
#endif
#if VEC_SIZE > 16
- /* From 16 to 31. No branch when size == 16. */
-L(between_16_31):
- VMOVU (%rsi), %XMM0
- VMOVU -16(%rsi,%rdx), %XMM1
- VMOVU %XMM0, (%rdi)
- VMOVU %XMM1, -16(%rdi,%rdx)
- VZEROUPPER_RETURN
-#endif
-L(between_8_15):
- /* From 8 to 15. No branch when size == 8. */
- movq -8(%rsi,%rdx), %rcx
- movq (%rsi), %rsi
- movq %rcx, -8(%rdi,%rdx)
- movq %rsi, (%rdi)
- ret
-L(between_4_7):
- /* From 4 to 7. No branch when size == 4. */
- movl -4(%rsi,%rdx), %ecx
- movl (%rsi), %esi
- movl %ecx, -4(%rdi,%rdx)
- movl %esi, (%rdi)
+ cmpl $4, %edx
+ jae L(copy_4_7)
+#endif
+ cmpl $1, %edx
+ je L(copy_1)
+ jb L(copy_0)
+	/* Fall through into the [2, 3] byte copy as it is more common than
+	   [0, 1]. */
+ movzwl (%rsi), %ecx
+ movzbl -1(%rsi, %rdx), %esi
+ movw %cx, (%rdi)
+ movb %sil, -1(%rdi, %rdx)
+L(copy_0):
ret
-L(between_2_3):
- /* From 2 to 3. No branch when size == 2. */
- movzwl -2(%rsi,%rdx), %ecx
- movzwl (%rsi), %esi
- movw %cx, -2(%rdi,%rdx)
- movw %si, (%rdi)
+
+ .p2align 4
+#if VEC_SIZE == 32
+L(copy_8_15):
+ COPY_8_16
ret
+	/* COPY_8_16 plus the ret is exactly 17 bytes, so we don't want to
+	   p2align afterwards as it would waste 15 bytes of code and being
+	   1 byte off is fine. */
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+L(movsb):
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+	/* Go to the backwards temporal copy whenever there is overlap, no
+	   matter what, as backward movsb is slow. */
+ cmpq %rdx, %rcx
+ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */
+ jb L(more_8x_vec_backward_check_nop)
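+	/* The unsigned comparison above is true exactly when dst lies in
+	   [src, src + len), i.e. when a forward rep movsb would overwrite
+	   source bytes that have not been read yet. */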
+	/* If above __x86_rep_movsb_stop_threshold the copy is most likely a
+	   candidate for NT moves as well. */
+ cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+ jae L(large_memcpy_2x_check)
+# if ALIGN_MOVSB
+ VMOVU (%rsi), %VEC(0)
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+# endif
+# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+# error Unsupported MOVSB_ALIGN_TO
+# endif
+ /* Store dst for use after rep movsb. */
+ movq %rdi, %r8
+# endif
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ /* Only avoid short movsb if CPU has FSRM. */
+ testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+ jz L(skip_short_movsb_check)
+ /* Avoid "rep movsb" if RCX, the distance between source and
+ destination, is N*4GB + [1..63] with N >= 0. */
+
+	/* ecx contains dst - src. The earlier check for the backward copy
+	   conditions means the only remaining slow movsb case, src = dst +
+	   [0, 63], has ecx in [-63, 0]. Use an unsigned comparison against
+	   -64 to check for that case. */
+ cmpl $-64, %ecx
+ ja L(more_8x_vec_forward)
+# endif
+# if ALIGN_MOVSB
+	/* Fall through means the CPU has FSRM. In that case exclusively
+	   align the destination. */
+
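+	/* Any bytes of dst below the aligned start that rep movsb skips are
+	   covered by the VEC(0) (and VEC(1)) stores to (%r8) after the
+	   rep movsb below. */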
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rdi, %rsi
+ /* Add dst to len. Subtract back after dst aligned. */
+ leaq (%rdi, %rdx), %rcx
+ /* Exclusively align dst to MOVSB_ALIGN_TO (64). */
+ addq $(MOVSB_ALIGN_TO - 1), %rdi
+ andq $-(MOVSB_ALIGN_TO), %rdi
+ /* Restore src and len adjusted with new values for aligned dst. */
+ addq %rdi, %rsi
+ subq %rdi, %rcx
+ rep movsb
+ VMOVU %VEC(0), (%r8)
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU %VEC(1), VEC_SIZE(%r8)
+# endif
+ VZEROUPPER_RETURN
+L(movsb_align_dst):
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rdi, %rsi
+ /* Add dst to len. Subtract back after dst aligned. -1 because dst
+ is initially aligned to MOVSB_ALIGN_TO - 1. */
+ leaq -(1)(%rdi, %rdx), %rcx
+ /* Inclusively align dst to MOVSB_ALIGN_TO - 1. */
+ orq $(MOVSB_ALIGN_TO - 1), %rdi
+ leaq 1(%rdi, %rsi), %rsi
+ /* Restore src and len adjusted with new values for aligned dst. */
+ subq %rdi, %rcx
+ /* Finish aligning dst. */
+ incq %rdi
+ rep movsb
+ VMOVU %VEC(0), (%r8)
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU %VEC(1), VEC_SIZE(%r8)
+# endif
+ VZEROUPPER_RETURN
+
+L(skip_short_movsb_check):
+	/* If the CPU does not have FSRM there are two options for aligning:
+	   align src if dst and src 4k alias, otherwise align dst. */
+ testl $(PAGE_SIZE - 512), %ecx
+ jnz L(movsb_align_dst)
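+	/* Fall through: the test is zero when (dst - src) % PAGE_SIZE is
+	   below 512, i.e. the low bits of dst and src are close enough to
+	   4k alias, so align src instead of dst. */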
+ /* rcx already has dst - src. */
+ movq %rcx, %r9
+ /* Add src to len. Subtract back after src aligned. -1 because src
+ is initially aligned to MOVSB_ALIGN_TO - 1. */
+ leaq -(1)(%rsi, %rdx), %rcx
+ /* Inclusively align src to MOVSB_ALIGN_TO - 1. */
+ orq $(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned src. */
+ leaq 1(%rsi, %r9), %rdi
+ subq %rsi, %rcx
+ /* Finish aligning src. */
+ incq %rsi
+ rep movsb
+ VMOVU %VEC(0), (%r8)
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU %VEC(1), VEC_SIZE(%r8)
+# endif
+ VZEROUPPER_RETURN
+# else
+	/* Not aligning for rep movsb, so just copy. */
+ mov %RDX_LP, %RCX_LP
+ rep movsb
+ ret
+# endif
+#endif
+	/* Align if it doesn't cost too many bytes. */
+ .p2align 4,, 6
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
@@ -426,50 +549,60 @@ L(more_2x_vec):
ja L(more_8x_vec)
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_4x_vec)
- /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
- VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+ VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)
+ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
VZEROUPPER_RETURN
+	/* Align if it doesn't cost too much code size. Limit the padding to
+	   6 bytes so that after a jump to the target a full mov instruction
+	   can always be fetched. */
+ .p2align 4,, 6
L(last_4x_vec):
- /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+ /* Keep nop target close to jmp for 2-byte encoding. */
+L(nop):
VZEROUPPER_RETURN
-
+	/* Align if it doesn't cost too much code size. */
+ .p2align 4,, 10
L(more_8x_vec):
/* Check if non-temporal move candidate. */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_memcpy_2x)
#endif
- /* Entry if rdx is greater than non-temporal threshold but there
- is overlap. */
+ /* Entry if rdx is greater than non-temporal threshold but there is
+ overlap. */
L(more_8x_vec_check):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
/* Source == destination is less common. */
je L(nop)
+	/* Entry if rdx is greater than the movsb or stop movsb threshold
+	   but the copy should still be done forward with temporal stores. */
+L(more_8x_vec_forward):
/* Load the first VEC and last 4 * VEC to support overlapping
addresses. */
VMOVU (%rsi), %VEC(4)
@@ -477,22 +610,18 @@ L(more_8x_vec_check):
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
- /* Save start and stop of the destination buffer. */
- movq %rdi, %r11
- leaq -VEC_SIZE(%rdi, %rdx), %rcx
- /* Align destination for aligned stores in the loop. Compute
- how much destination is misaligned. */
- movq %rdi, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Get the negative of offset for alignment. */
- subq $VEC_SIZE, %r8
- /* Adjust source. */
- subq %r8, %rsi
- /* Adjust destination which should be aligned now. */
- subq %r8, %rdi
- /* Adjust length. */
- addq %r8, %rdx
-
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rdi, %rsi
+ /* Store end of buffer minus tail in rdx. */
+ leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx
+	/* Save beginning of dst. */
+ movq %rdi, %rcx
+ /* Align dst to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
+ /* Restore src adjusted with new value for aligned dst. */
+ leaq 1(%rdi, %rsi), %rsi
+ /* Finish aligning dst. */
+ incq %rdi
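+	/* Loop while rdi is below rdx (4 * VEC before the end of dst); the
+	   unaligned head and tail are stored from VEC(4) and VEC(5)-VEC(8)
+	   after the loop. */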
.p2align 4
L(loop_4x_vec_forward):
/* Copy 4 * VEC a time forward. */
@@ -501,23 +630,27 @@ L(loop_4x_vec_forward):
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
subq $-(VEC_SIZE * 4), %rsi
- addq $-(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
- cmpq $(VEC_SIZE * 4), %rdx
+ cmpq %rdi, %rdx
ja L(loop_4x_vec_forward)
/* Store the last 4 * VEC. */
- VMOVU %VEC(5), (%rcx)
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+ VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)
+ VMOVU %VEC(7), VEC_SIZE(%rdx)
+ VMOVU %VEC(8), (%rdx)
/* Store the first VEC. */
- VMOVU %VEC(4), (%r11)
+ VMOVU %VEC(4), (%rcx)
+ /* Keep nop target close to jmp for 2-byte encoding. */
+L(nop2):
VZEROUPPER_RETURN
-
+	/* Entry from the failed movsb path. Still need to test whether
+	   dst - src == 0. */
+L(more_8x_vec_backward_check_nop):
+ testq %rcx, %rcx
+ jz L(nop2)
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
addresses. */
@@ -525,49 +658,50 @@ L(more_8x_vec_backward):
VMOVU VEC_SIZE(%rsi), %VEC(5)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
- /* Save stop of the destination buffer. */
- leaq -VEC_SIZE(%rdi, %rdx), %r11
- /* Align destination end for aligned stores in the loop. Compute
- how much destination end is misaligned. */
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
- movq %r11, %r9
- movq %r11, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Adjust source. */
- subq %r8, %rcx
- /* Adjust the end of destination which should be aligned now. */
- subq %r8, %r9
- /* Adjust length. */
- subq %r8, %rdx
-
- .p2align 4
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rdi, %rsi
+	/* Save beginning of buffer. */
+ movq %rdi, %rcx
+	/* Set dst to beginning of region to copy. -1 for inclusive
+	   alignment. */
+ leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rdi
+ /* Align dst. */
+ andq $-(VEC_SIZE), %rdi
+ /* Restore src. */
+ addq %rdi, %rsi
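+	/* Loop while rdi is above rcx (the saved beginning of dst); the
+	   unaligned head and tail are stored from VEC(4)-VEC(7) and VEC(8)
+	   after the loop. */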
+ /* Don't use multi-byte nop to align. */
+ .p2align 4,, 11
L(loop_4x_vec_backward):
/* Copy 4 * VEC a time backward. */
- VMOVU (%rcx), %VEC(0)
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- addq $-(VEC_SIZE * 4), %rcx
- addq $-(VEC_SIZE * 4), %rdx
- VMOVA %VEC(0), (%r9)
- VMOVA %VEC(1), -VEC_SIZE(%r9)
- VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
- addq $-(VEC_SIZE * 4), %r9
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec_backward)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(0)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 1)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 0)(%rsi), %VEC(3)
+ addq $(VEC_SIZE * -4), %rsi
+ VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi)
+ VMOVA %VEC(1), (VEC_SIZE * 2)(%rdi)
+ VMOVA %VEC(2), (VEC_SIZE * 1)(%rdi)
+ VMOVA %VEC(3), (VEC_SIZE * 0)(%rdi)
+ addq $(VEC_SIZE * -4), %rdi
+ cmpq %rdi, %rcx
+ jb L(loop_4x_vec_backward)
/* Store the first 4 * VEC. */
- VMOVU %VEC(4), (%rdi)
- VMOVU %VEC(5), VEC_SIZE(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(4), (%rcx)
+ VMOVU %VEC(5), VEC_SIZE(%rcx)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rcx)
+ VMOVU %VEC(7), (VEC_SIZE * 3)(%rcx)
/* Store the last VEC. */
- VMOVU %VEC(8), (%r11)
+ VMOVU %VEC(8), -VEC_SIZE(%rdx, %rcx)
VZEROUPPER_RETURN
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
.p2align 4
+	/* Entry if rdx is above the stop movsb threshold (usually set to
+	   the non-temporal threshold). */
+L(large_memcpy_2x_check):
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ jb L(more_8x_vec_forward)
L(large_memcpy_2x):
/* Compute absolute value of difference between source and
destination. */