
[v1,5/5] X86-64: Optimize memmove-vec-unaligned-erms.S

Message ID 20210824193227.3474346-5-goldstein.w.n@gmail.com
State New
Series [v1,1/5] string: Make tests birdirectional test-memcpy.c

Commit Message

Noah Goldstein Aug. 24, 2021, 7:32 p.m. UTC
No bug. This commit optimizes memmove-vec-unaligned-erms.S.

The optimizations are, in descending order of importance, to
L(less_vec), L(movsb), the 8x forward/backward loops, and various
target alignments that have minimal code-size impact.

The L(less_vec) optimizations are to:

    1. Readjust the branch order to either give hotter paths a
    fall-through case or put fewer branches in their way.
    2. Moderately change the size classes to make hot branches hotter
    and thus increase predictability. Each size class is copied with a
    pair of overlapping loads and stores (see the sketch after this
    list).
    3. Try to minimize branch aliasing to avoid misses caused by BPU
    thrashing.
    4. 64 byte align the prior function entry. This is to avoid cases
    where seemingly unrelated changes end up having severe negative
    performance impacts.
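
Each of these size classes is copied with a pair of overlapping
register-width loads followed by a pair of stores, which is what the
new COPY_BLOCK macro in the patch expands to. Below is a minimal C
sketch of the idea for the [8, 16] byte class; the function name is
illustrative and memcpy stands in for the register moves, so this is a
sketch rather than the glibc code itself:

    #include <stdint.h>
    #include <string.h>

    /* Copy SIZE bytes, 8 <= SIZE <= 16, with two 8 byte loads followed
       by two 8 byte stores.  The head and tail ranges may overlap; both
       loads happen before either store, so that is fine.  */
    void
    copy_8_16 (char *dst, const char *src, size_t size)
    {
      uint64_t head, tail;
      memcpy (&head, src, 8);
      memcpy (&tail, src + size - 8, 8);
      memcpy (dst, &head, 8);
      memcpy (dst + size - 8, &tail, 8);
    }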

The L(movsb) optimizations are to:

    1. Reduce the number of taken branches needed to determine if
    movsb should be used.
    2. 64 byte align dst if the CPU has FSRM or if dst and src do not
    4k alias.
    3. 64 byte align src if the CPU does not have FSRM and dst and src
    do 4k alias (a sketch of the 4k-alias check follows this list).
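
The 4k-alias check mentioned in item 3 looks only at the low
page-offset bits of dst - src (the patch tests them with
"testl $(PAGE_SIZE - 512), %ecx"). Below is a minimal C sketch of that
heuristic; the function name is illustrative and PAGE_SIZE == 4096 is
assumed:

    #include <stdint.h>

    #define PAGE_SIZE 4096

    /* Return nonzero if the page offsets of dst and src differ by less
       than 512 bytes (mod 4k), in which case src rather than dst is
       64 byte aligned before rep movsb on CPUs without FSRM.  */
    int
    should_align_src (const void *dst, const void *src)
    {
      uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
      /* Bits 9-11 of the difference all being zero means
         diff % PAGE_SIZE < 512.  */
      return (diff & (PAGE_SIZE - 512)) == 0;
    }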

The 8x forward/backward loop optimizations are to:

    1. Reduce instructions needed for aligning to VEC_SIZE.
    2. Reduce uops and code size of the loops (see the loop-bound
    sketch after this list).
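
The loop change in item 2 replaces the remaining-length counter that
was decremented every iteration with an end pointer computed once
before the loop, so the loop body has one fewer update and the bound
check is a plain pointer compare. Below is a minimal C sketch of the
forward direction; memcpy stands in for the 4x VEC loads/stores, and
the destination alignment and first-VEC handling of the real code are
omitted:

    #include <stddef.h>
    #include <string.h>

    enum { VEC_SIZE = 32 };   /* Illustrative; depends on the ISA level.  */

    /* Forward copy for LEN >= 4 * VEC_SIZE between non-overlapping
       buffers.  */
    void
    copy_forward_4x (char *dst, const char *src, size_t len)
    {
      /* End of the region minus the 4 * VEC tail, computed once.  */
      char *end = dst + len - 4 * VEC_SIZE;
      const char *tail_src = src + len - 4 * VEC_SIZE;
      do
        {
          memcpy (dst, src, 4 * VEC_SIZE);
          src += 4 * VEC_SIZE;
          dst += 4 * VEC_SIZE;
        }
      while (end > dst);   /* Pointer compare; no length update.  */
      /* Copy the possibly overlapping last 4 * VEC.  */
      memcpy (end, tail_src, 4 * VEC_SIZE);
    }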

All tests in string/ passing.
---
 sysdeps/x86/sysdep.h                          |  13 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 484 +++++++++++-------
 2 files changed, 317 insertions(+), 180 deletions(-)

Patch

diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index cac1d762fb..9226d2c6c9 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -78,15 +78,18 @@  enum cf_protection_level
 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
 
 /* Define an entry point visible from C.  */
-#define	ENTRY(name)							      \
-  .globl C_SYMBOL_NAME(name);						      \
-  .type C_SYMBOL_NAME(name),@function;					      \
-  .align ALIGNARG(4);							      \
+#define	P2ALIGN_ENTRY(name, alignment)							      \
+  .globl C_SYMBOL_NAME(name);							      \
+  .type C_SYMBOL_NAME(name),@function;							      \
+  .align ALIGNARG(alignment);							      \
   C_LABEL(name)								      \
   cfi_startproc;							      \
-  _CET_ENDBR;								      \
+  _CET_ENDBR;							      \
   CALL_MCOUNT
 
+#define	ENTRY(name) P2ALIGN_ENTRY(name, 4)
+
+
 #undef	END
 #define END(name)							      \
   cfi_endproc;								      \
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 9f02624375..75b6efe969 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -165,6 +165,32 @@ 
 # error Invalid LARGE_LOAD_SIZE
 #endif
 
+/* Whether to align before movsb. Ultimately we want 64 byte alignment
+   and it is not worth loading 4x VEC to get it for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE	>	16)
+
+/* Number of VECs to align movsb to.  */
+#if VEC_SIZE == 64
+# define MOVSB_ALIGN_TO	(VEC_SIZE)
+#else
+# define MOVSB_ALIGN_TO	(VEC_SIZE	*	2)
+#endif
+
+/* Macro for copying an inclusive power-of-2 range with two register
+   loads and two stores.  */
+#define COPY_BLOCK(mov_inst, src_reg, dst_reg, size_reg, len, tmp_reg0, tmp_reg1)	\
+	mov_inst (%src_reg), %tmp_reg0; \
+	mov_inst -(len)(%src_reg, %size_reg), %tmp_reg1; \
+	mov_inst %tmp_reg0, (%dst_reg); \
+	mov_inst %tmp_reg1, -(len)(%dst_reg, %size_reg);
+
+/* Define all copies used by L(less_vec) for VEC_SIZE of 16, 32, or
+   64.  */
+#define COPY_4_8	COPY_BLOCK(movl, rsi, rdi, rdx, 4, ecx, esi)
+#define COPY_8_16	COPY_BLOCK(movq, rsi, rdi, rdx, 8, rcx, rsi)
+#define COPY_16_32	COPY_BLOCK(vmovdqu, rsi, rdi, rdx, 16, xmm0, xmm1)
+#define COPY_32_64	COPY_BLOCK(vmovdqu64, rsi, rdi, rdx, 32, ymm16, ymm17)
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -198,7 +224,13 @@  L(start):
 	movl	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
+	/* Based on the SPEC2017 distribution, memcpy calls of size 16 and 32
+	   are really hot, so we want them to take the same branch path.  */
+#if VEC_SIZE > 16
+	jbe	L(less_vec)
+#else
 	jb	L(less_vec)
+#endif
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 #if !defined USE_MULTIARCH || !IS_IN (libc)
@@ -206,15 +238,10 @@  L(last_2x_vec):
 #endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
-#else
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
-#endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
 
@@ -289,7 +316,9 @@  ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+/* Cache align the entry so that the branch-heavy L(less_vec) maintains
+   good alignment.  */
+P2ALIGN_ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -297,123 +326,217 @@  L(start_erms):
 	movl	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
+	/* Based on the SPEC2017 distribution, memcpy calls of size 16 and 32
+	   are really hot, so we want them to take the same branch path.  */
+# if VEC_SIZE > 16
+	jbe	L(less_vec)
+# else
 	jb	L(less_vec)
+# endif
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
 	ret
+# endif
 #endif
+#if VEC_SIZE == 64
+L(copy_8_15):
+	COPY_8_16
+	ret
 
-L(movsb):
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
-3:
-# endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+L(copy_33_63):
+	COPY_32_64
 	ret
 #endif
-
+	/* Only worth aligning if near end of 16 byte block and won't get
+	   first branch in first decode after jump.  */
+	.p2align 4,, 6
 L(less_vec):
-	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
-#if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
+	/* Second set of branches for smallest copies.  */
+	cmpl	$(VEC_SIZE / 4), %edx
+	jb	L(less_quarter_vec)
+
+	cmpl	$(VEC_SIZE / 2), %edx
+#if VEC_SIZE == 64
+	/* We branch to [33, 63] instead of [16, 32] to give [16, 32] fall
+	   through path as [16, 32] is hotter.  */
+	ja	L(copy_33_63)
+	COPY_16_32
+#elif VEC_SIZE == 32
+	/* Branch to [8, 15]. Fall through to [16, 32].  */
+	jb	L(copy_8_15)
+	COPY_16_32
+#else
+	/* Branch to [4, 7]. Fall through to [8, 15].  */
+	jb	L(copy_4_7)
+	COPY_8_16
 #endif
-#if VEC_SIZE > 16
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-#endif
-	cmpb	$8, %dl
-	jae	L(between_8_15)
-	cmpb	$4, %dl
-	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
+	ret
+	/* Align if won't cost too many bytes.  */
+	.p2align 4,, 6
+L(copy_4_7):
+	COPY_4_8
+	ret
+
+	/* Cold target. No need to align.  */
+L(copy_1):
 	movzbl	(%rsi), %ecx
 	movb	%cl, (%rdi)
-1:
 	ret
+
+	/* Colder copy case for [0, VEC_SIZE / 4 - 1].  */
+L(less_quarter_vec):
 #if VEC_SIZE > 32
-L(between_32_63):
-	/* From 32 to 63.  No branch when size == 32.  */
-	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi,%rdx), %YMM1
-	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
 #endif
 #if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	VMOVU	(%rsi), %XMM0
-	VMOVU	-16(%rsi,%rdx), %XMM1
-	VMOVU	%XMM0, (%rdi)
-	VMOVU	%XMM1, -16(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-L(between_8_15):
-	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
-	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
-	movq	%rsi, (%rdi)
-	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+#endif
+	cmpl	$1, %edx
+	je	L(copy_1)
+	jb	L(copy_0)
+	/* Fall through into copy [2, 3] as it is more common than [0, 1].
+	 */
+	movzwl	(%rsi), %ecx
+	movzbl	-1(%rsi, %rdx), %esi
+	movw	%cx, (%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+L(copy_0):
 	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
+
+	.p2align 4
+#if VEC_SIZE == 32
+L(copy_8_15):
+	COPY_8_16
 	ret
+	/* COPY_8_16 is exactly 17 bytes so don't want to p2align after as
+	   it wastes 15 bytes of code and 1 byte off is fine.  */
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Always go to the backward temporal copy when there is overlap, as
+	   backward movsb is slow.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+	/* If above __x86_rep_movsb_stop_threshold it is most likely a
+	   candidate for NT moves as well.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if ALIGN_MOVSB
+	VMOVU	(%rsi), %VEC(0)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Store dst for use after rep movsb.  */
+	movq	%rdi, %r8
+# endif
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src. The early backward-copy check means the
+	   only slow movsb case, src = dst + [0, 63], is ecx in [-63, 0].
+	   Use an unsigned comparison with -64 to check for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+# endif
+# if ALIGN_MOVSB
+	/* Fall through means the CPU has FSRM. In that case exclusively
+	   align the destination.  */
+
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%rdi, %rdx), %rcx
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.  */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
 
+	rep	movsb
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Add dst to len. Subtract back after dst aligned. -1 because dst
+	   is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-(1)(%rdi, %rdx), %rcx
+	/* Inclusively align dst to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rdi
+	leaq	1(%rdi, %rsi), %rsi
+	/* Restore src and len adjusted with new values for aligned dst.  */
+	subq	%rdi, %rcx
+	/* Finish aligning dst.  */
+	incq	%rdi
+	rep	movsb
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+
+L(skip_short_movsb_check):
+	/* If the CPU does not have FSRM there are two options for aligning:
+	   align src if dst and src 4k alias, otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because src
+	   is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-(1)(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned src.  */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+	rep	movsb
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else
+	/* Not aligning rep movsb, so just copy.  */
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
+	/* Align if doesn't cost too many bytes.  */
+	.p2align 4,, 6
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
@@ -426,50 +549,60 @@  L(more_2x_vec):
 	ja	L(more_8x_vec)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
+	/* Align if it doesn't cost too much code size. Cap the padding at 6
+	   bytes so that after a jump to the target a full mov instruction
+	   can always be fetched.  */
+	.p2align 4,, 6
 L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	/* Keep nop target close to jmp for 2-byte encoding.  */
+L(nop):
 	VZEROUPPER_RETURN
-
+	/* Align if doesn't cost too much code size.  */
+	.p2align 4,, 10
 L(more_8x_vec):
 	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-       is overlap.  */
+	/* Entry if rdx is greater than non-temporal threshold but there is
+	   overlap.  */
 L(more_8x_vec_check):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
 	je	L(nop)
+	/* Entry if rdx is greater than movsb or stop movsb threshold but
+	   there is overlap with dst > src.  */
+L(more_8x_vec_forward):
 	/* Load the first VEC and last 4 * VEC to support overlapping
 	   addresses.  */
 	VMOVU	(%rsi), %VEC(4)
@@ -477,22 +610,18 @@  L(more_8x_vec_check):
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
-
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rdi, %rdx), %rdx
+	/* Save beginning of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	leaq	1(%rdi, %rsi), %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
 	.p2align 4
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
@@ -501,23 +630,27 @@  L(loop_4x_vec_forward):
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 	subq	$-(VEC_SIZE * 4), %rsi
-	addq	$-(VEC_SIZE * 4), %rdx
 	VMOVA	%VEC(0), (%rdi)
 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(4), (%rcx)
+	/* Keep nop target close to jmp for 2-byte encoding.  */
+L(nop2):
 	VZEROUPPER_RETURN
-
+	/* Entry from failed movsb. Still need to test if dst - src == 0.  */
+L(more_8x_vec_backward_check_nop):
+	testq	%rcx, %rcx
+	jz	L(nop2)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
@@ -525,49 +658,50 @@  L(more_8x_vec_backward):
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-
-	.p2align 4
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Save beginning of buffer.  */
+	movq	%rdi, %rcx
+	/* Set dst to beginning of region to copy. -1 for inclusive
+	   alignment.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rdi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rdi
+	/* Restore src.  */
+	addq	%rdi, %rsi
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq	$-(VEC_SIZE * 4), %rcx
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq	$-(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(0)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(3)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 0)(%rdi)
+	addq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rdi, %rcx
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), (%rcx)
+	VMOVU	%VEC(5), VEC_SIZE(%rcx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rcx)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rcx)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rcx)
 	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	.p2align 4
+	/* Entry if len is above the stop movsb threshold (usually set to the
+	   non-temporal threshold).  */
+L(large_memcpy_2x_check):
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_forward)
 L(large_memcpy_2x):
 	/* Compute absolute value of difference between source and
 	   destination.  */