Message ID | 20211106183322.3129442-4-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v4,1/5] string: Make tests birdirectional test-memcpy.c |
On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > No bug. > > The optimizations are as follows: > > 1) Always align entry to 64 bytes. This makes behavior more > predictable and makes other frontend optimizations easier. > > 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have > significant benefits in the case that: > 0 < (dst - src) < [256, 512] > > 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%] > improvement and for FSRM [-10%, 25%]. > > In addition to these primary changes there is general cleanup > throughout to optimize the aligning routines and control flow logic. > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > --- > sysdeps/x86_64/memmove.S | 2 +- > .../memmove-avx-unaligned-erms-rtm.S | 2 +- > .../multiarch/memmove-avx-unaligned-erms.S | 2 +- > .../multiarch/memmove-avx512-unaligned-erms.S | 2 +- > .../multiarch/memmove-evex-unaligned-erms.S | 2 +- > .../multiarch/memmove-vec-unaligned-erms.S | 595 +++++++++++------- > 6 files changed, 381 insertions(+), 224 deletions(-) > > diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S > index db106a7a1f..b2b3180848 100644 > --- a/sysdeps/x86_64/memmove.S > +++ b/sysdeps/x86_64/memmove.S > @@ -25,7 +25,7 @@ > /* Use movups and movaps for smaller code sizes. */ > #define VMOVU movups > #define VMOVA movaps > - > +#define MOV_SIZE 3 > #define SECTION(p) p > > #ifdef USE_MULTIARCH > diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S > index 1ec1962e86..67a55f0c85 100644 > --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S > +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S > @@ -4,7 +4,7 @@ > # define VMOVNT vmovntdq > # define VMOVU vmovdqu > # define VMOVA vmovdqa > - > +# define MOV_SIZE 4 > # define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S > index e195e93f15..975ae6c051 100644 > --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S > @@ -4,7 +4,7 @@ > # define VMOVNT vmovntdq > # define VMOVU vmovdqu > # define VMOVA vmovdqa > - > +# define MOV_SIZE 4 > # define SECTION(p) p##.avx > # define MEMMOVE_SYMBOL(p,s) p##_avx_##s > > diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S > index 848848ab39..0fa7126830 100644 > --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S > @@ -25,7 +25,7 @@ > # define VMOVU vmovdqu64 > # define VMOVA vmovdqa64 > # define VZEROUPPER > - > +# define MOV_SIZE 6 > # define SECTION(p) p##.evex512 > # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s > > diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S > index 0cbce8f944..88715441fe 100644 > --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S > @@ -25,7 +25,7 @@ > # define VMOVU vmovdqu64 > # define VMOVA vmovdqa64 > # define VZEROUPPER > - > +# define MOV_SIZE 6 > # define SECTION(p) p##.evex > # define MEMMOVE_SYMBOL(p,s) p##_evex_##s > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > index 
abde8438d4..7b27cbdda5 100644 > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > @@ -76,6 +76,25 @@ > # endif > #endif > > +/* Whether to align before movsb. Ultimately we want 64 byte > + align and not worth it to load 4x VEC for VEC_SIZE == 16. */ > +#define ALIGN_MOVSB (VEC_SIZE > 16) > +/* Number of bytes to align movsb to. */ > +#define MOVSB_ALIGN_TO 64 > + > +#define SMALL_MOV_SIZE (MOV_SIZE <= 4) > +#define LARGE_MOV_SIZE (MOV_SIZE > 4) > + > +#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1 > +# error MOV_SIZE Unknown > +#endif > + > +#if LARGE_MOV_SIZE > +# define SMALL_SIZE_OFFSET (4) > +#else > +# define SMALL_SIZE_OFFSET (0) > +#endif > + > #ifndef PAGE_SIZE > # define PAGE_SIZE 4096 > #endif > @@ -199,25 +218,21 @@ L(start): > # endif > cmp $VEC_SIZE, %RDX_LP > jb L(less_vec) > + /* Load regardless. */ > + VMOVU (%rsi), %VEC(0) > cmp $(VEC_SIZE * 2), %RDX_LP > ja L(more_2x_vec) > -#if !defined USE_MULTIARCH || !IS_IN (libc) > -L(last_2x_vec): > -#endif > /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ > - VMOVU (%rsi), %VEC(0) > VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) > VMOVU %VEC(0), (%rdi) > VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) > -#if !defined USE_MULTIARCH || !IS_IN (libc) > -L(nop): > - ret > +#if !(defined USE_MULTIARCH && IS_IN (libc)) > + ZERO_UPPER_VEC_REGISTERS_RETURN > #else > VZEROUPPER_RETURN > #endif > #if defined USE_MULTIARCH && IS_IN (libc) > END (MEMMOVE_SYMBOL (__memmove, unaligned)) > - > # if VEC_SIZE == 16 > ENTRY (__mempcpy_chk_erms) > cmp %RDX_LP, %RCX_LP > @@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) > END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) > # endif > > -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) > +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6) > movq %rdi, %rax > L(start_erms): > # ifdef __ILP32__ > @@ -298,310 +313,448 @@ L(start_erms): > # endif > cmp $VEC_SIZE, %RDX_LP > jb L(less_vec) > + /* Load regardless. */ > + VMOVU (%rsi), %VEC(0) > cmp $(VEC_SIZE * 2), %RDX_LP > ja L(movsb_more_2x_vec) > -L(last_2x_vec): > - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ > - VMOVU (%rsi), %VEC(0) > - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) > + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. > + */ > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1) > VMOVU %VEC(0), (%rdi) > - VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) > + VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx) > L(return): > -#if VEC_SIZE > 16 > +# if VEC_SIZE > 16 > ZERO_UPPER_VEC_REGISTERS_RETURN > -#else > +# else > ret > +# endif > #endif > > -L(movsb): > - cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP > - jae L(more_8x_vec) > - cmpq %rsi, %rdi > - jb 1f > - /* Source == destination is less common. */ > - je L(nop) > - leaq (%rsi,%rdx), %r9 > - cmpq %r9, %rdi > - /* Avoid slow backward REP MOVSB. */ > - jb L(more_8x_vec_backward) > -# if AVOID_SHORT_DISTANCE_REP_MOVSB > - testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) > - jz 3f > - movq %rdi, %rcx > - subq %rsi, %rcx > - jmp 2f > -# endif > -1: > -# if AVOID_SHORT_DISTANCE_REP_MOVSB > - testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) > - jz 3f > - movq %rsi, %rcx > - subq %rdi, %rcx > -2: > -/* Avoid "rep movsb" if RCX, the distance between source and destination, > - is N*4GB + [1..63] with N >= 0. */ > - cmpl $63, %ecx > - jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. 
*/ > -3: > -# endif > - mov %RDX_LP, %RCX_LP > - rep movsb > -L(nop): > +#if LARGE_MOV_SIZE > + /* If LARGE_MOV_SIZE this fits in the aligning bytes between the > + ENTRY block and L(less_vec). */ > + .p2align 4,, 8 > +L(between_4_7): > + /* From 4 to 7. No branch when size == 4. */ > + movl (%rsi), %ecx > + movl (%rsi, %rdx), %esi > + movl %ecx, (%rdi) > + movl %esi, (%rdi, %rdx) > ret > #endif > > + .p2align 4 > L(less_vec): > /* Less than 1 VEC. */ > #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 > # error Unsupported VEC_SIZE! > #endif > #if VEC_SIZE > 32 > - cmpb $32, %dl > + cmpl $32, %edx > jae L(between_32_63) > #endif > #if VEC_SIZE > 16 > - cmpb $16, %dl > + cmpl $16, %edx > jae L(between_16_31) > #endif > - cmpb $8, %dl > + cmpl $8, %edx > jae L(between_8_15) > - cmpb $4, %dl > +#if SMALL_MOV_SIZE > + cmpl $4, %edx > +#else > + subq $4, %rdx > +#endif > jae L(between_4_7) > - cmpb $1, %dl > - ja L(between_2_3) > - jb 1f > - movzbl (%rsi), %ecx > + cmpl $(1 - SMALL_SIZE_OFFSET), %edx > + jl L(copy_0) > + movb (%rsi), %cl > + je L(copy_1) > + movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi > + movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx) > +L(copy_1): > movb %cl, (%rdi) > -1: > +L(copy_0): > ret > + > +#if SMALL_MOV_SIZE > + .p2align 4,, 8 > +L(between_4_7): > + /* From 4 to 7. No branch when size == 4. */ > + movl -4(%rsi, %rdx), %ecx > + movl (%rsi), %esi > + movl %ecx, -4(%rdi, %rdx) > + movl %esi, (%rdi) > + ret > +#endif > + > +#if VEC_SIZE > 16 > + /* From 16 to 31. No branch when size == 16. */ > + .p2align 4,, 8 > +L(between_16_31): > + vmovdqu (%rsi), %xmm0 > + vmovdqu -16(%rsi, %rdx), %xmm1 > + vmovdqu %xmm0, (%rdi) > + vmovdqu %xmm1, -16(%rdi, %rdx) > + /* No ymm registers have been touched. */ > + ret > +#endif > + > #if VEC_SIZE > 32 > + .p2align 4,, 10 > L(between_32_63): > /* From 32 to 63. No branch when size == 32. */ > VMOVU (%rsi), %YMM0 > - VMOVU -32(%rsi,%rdx), %YMM1 > + VMOVU -32(%rsi, %rdx), %YMM1 > VMOVU %YMM0, (%rdi) > - VMOVU %YMM1, -32(%rdi,%rdx) > - VZEROUPPER_RETURN > -#endif > -#if VEC_SIZE > 16 > - /* From 16 to 31. No branch when size == 16. */ > -L(between_16_31): > - VMOVU (%rsi), %XMM0 > - VMOVU -16(%rsi,%rdx), %XMM1 > - VMOVU %XMM0, (%rdi) > - VMOVU %XMM1, -16(%rdi,%rdx) > + VMOVU %YMM1, -32(%rdi, %rdx) > VZEROUPPER_RETURN > #endif > + > + .p2align 4,, 10 > L(between_8_15): > /* From 8 to 15. No branch when size == 8. */ > - movq -8(%rsi,%rdx), %rcx > + movq -8(%rsi, %rdx), %rcx > movq (%rsi), %rsi > - movq %rcx, -8(%rdi,%rdx) > movq %rsi, (%rdi) > + movq %rcx, -8(%rdi, %rdx) > ret > -L(between_4_7): > - /* From 4 to 7. No branch when size == 4. */ > - movl -4(%rsi,%rdx), %ecx > - movl (%rsi), %esi > - movl %ecx, -4(%rdi,%rdx) > - movl %esi, (%rdi) > - ret > -L(between_2_3): > - /* From 2 to 3. No branch when size == 2. */ > - movzwl -2(%rsi,%rdx), %ecx > - movzwl (%rsi), %esi > - movw %cx, -2(%rdi,%rdx) > - movw %si, (%rdi) > - ret > > + .p2align 4,, 10 > +L(last_4x_vec): > + /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ > + > + /* VEC(0) and VEC(1) have already been loaded. 
*/ > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2) > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3) > + VMOVU %VEC(0), (%rdi) > + VMOVU %VEC(1), VEC_SIZE(%rdi) > + VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx) > + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx) > + VZEROUPPER_RETURN > + > + .p2align 4 > #if defined USE_MULTIARCH && IS_IN (libc) > L(movsb_more_2x_vec): > cmp __x86_rep_movsb_threshold(%rip), %RDX_LP > ja L(movsb) > #endif > L(more_2x_vec): > - /* More than 2 * VEC and there may be overlap between destination > - and source. */ > + /* More than 2 * VEC and there may be overlap between > + destination and source. */ > cmpq $(VEC_SIZE * 8), %rdx > ja L(more_8x_vec) > + /* Load VEC(1) regardless. VEC(0) has already been loaded. */ > + VMOVU VEC_SIZE(%rsi), %VEC(1) > cmpq $(VEC_SIZE * 4), %rdx > jbe L(last_4x_vec) > - /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ > - VMOVU (%rsi), %VEC(0) > - VMOVU VEC_SIZE(%rsi), %VEC(1) > + /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) > - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) > - VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) > - VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4) > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5) > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6) > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7) > VMOVU %VEC(0), (%rdi) > VMOVU %VEC(1), VEC_SIZE(%rdi) > VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) > VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) > - VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) > - VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) > - VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) > - VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) > - VZEROUPPER_RETURN > -L(last_4x_vec): > - /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ > - VMOVU (%rsi), %VEC(0) > - VMOVU VEC_SIZE(%rsi), %VEC(1) > - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) > - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) > - VMOVU %VEC(0), (%rdi) > - VMOVU %VEC(1), VEC_SIZE(%rdi) > - VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) > - VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) > + VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx) > + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx) > + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx) > + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx) > VZEROUPPER_RETURN > > + .p2align 4,, 4 > L(more_8x_vec): > + movq %rdi, %rcx > + subq %rsi, %rcx > + /* Go to backwards temporal copy if overlap no matter what as > + backward REP MOVSB is slow and we don't want to use NT stores if > + there is overlap. */ > + cmpq %rdx, %rcx > + /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ > + jb L(more_8x_vec_backward_check_nop) > /* Check if non-temporal move candidate. */ > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > /* Check non-temporal store threshold. */ > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > ja L(large_memcpy_2x) > #endif > - /* Entry if rdx is greater than non-temporal threshold but there > - is overlap. */ > + /* To reach this point there cannot be overlap and dst > src. So > + check for overlap and src > dst in which case correctness > + requires forward copy. Otherwise decide between backward/forward > + copy depending on address aliasing. */ > + > + /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold > + but less than __x86_shared_non_temporal_threshold. 
*/ > L(more_8x_vec_check): > - cmpq %rsi, %rdi > - ja L(more_8x_vec_backward) > - /* Source == destination is less common. */ > - je L(nop) > - /* Load the first VEC and last 4 * VEC to support overlapping > - addresses. */ > - VMOVU (%rsi), %VEC(4) > + /* rcx contains dst - src. Add back length (rdx). */ > + leaq (%rcx, %rdx), %r8 > + /* If r8 has different sign than rcx then there is overlap so we > + must do forward copy. */ > + xorq %rcx, %r8 > + /* Isolate just sign bit of r8. */ > + shrq $63, %r8 > + /* Get 4k difference dst - src. */ > + andl $(PAGE_SIZE - 256), %ecx > + /* If r8 is non-zero must do foward for correctness. Otherwise > + if ecx is non-zero there is 4k False Alaising so do backward > + copy. */ > + addl %r8d, %ecx > + jz L(more_8x_vec_backward) > + > + /* if rdx is greater than __x86_shared_non_temporal_threshold > + but there is overlap, or from short distance movsb. */ > +L(more_8x_vec_forward): > + /* Load first and last 4 * VEC to support overlapping addresses. > + */ > + > + /* First vec was already loaded into VEC(0). */ > VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) > VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) > + /* Save begining of dst. */ > + movq %rdi, %rcx > + /* Align dst to VEC_SIZE - 1. */ > + orq $(VEC_SIZE - 1), %rdi > VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) > VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) > - /* Save start and stop of the destination buffer. */ > - movq %rdi, %r11 > - leaq -VEC_SIZE(%rdi, %rdx), %rcx > - /* Align destination for aligned stores in the loop. Compute > - how much destination is misaligned. */ > - movq %rdi, %r8 > - andq $(VEC_SIZE - 1), %r8 > - /* Get the negative of offset for alignment. */ > - subq $VEC_SIZE, %r8 > - /* Adjust source. */ > - subq %r8, %rsi > - /* Adjust destination which should be aligned now. */ > - subq %r8, %rdi > - /* Adjust length. */ > - addq %r8, %rdx > > - .p2align 4 > + /* Subtract dst from src. Add back after dst aligned. */ > + subq %rcx, %rsi > + /* Finish aligning dst. */ > + incq %rdi > + /* Restore src adjusted with new value for aligned dst. */ > + addq %rdi, %rsi > + /* Store end of buffer minus tail in rdx. */ > + leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx > + > + /* Dont use multi-byte nop to align. */ > + .p2align 4,, 11 > L(loop_4x_vec_forward): > /* Copy 4 * VEC a time forward. */ > - VMOVU (%rsi), %VEC(0) > - VMOVU VEC_SIZE(%rsi), %VEC(1) > - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > + VMOVU (%rsi), %VEC(1) > + VMOVU VEC_SIZE(%rsi), %VEC(2) > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3) > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4) > subq $-(VEC_SIZE * 4), %rsi > - addq $-(VEC_SIZE * 4), %rdx > - VMOVA %VEC(0), (%rdi) > - VMOVA %VEC(1), VEC_SIZE(%rdi) > - VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > - VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > + VMOVA %VEC(1), (%rdi) > + VMOVA %VEC(2), VEC_SIZE(%rdi) > + VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi) > + VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi) > subq $-(VEC_SIZE * 4), %rdi > - cmpq $(VEC_SIZE * 4), %rdx > + cmpq %rdi, %rdx > ja L(loop_4x_vec_forward) > /* Store the last 4 * VEC. */ > - VMOVU %VEC(5), (%rcx) > - VMOVU %VEC(6), -VEC_SIZE(%rcx) > - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) > - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) > + VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx) > + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx) > + VMOVU %VEC(7), VEC_SIZE(%rdx) > + VMOVU %VEC(8), (%rdx) > /* Store the first VEC. */ > - VMOVU %VEC(4), (%r11) > + VMOVU %VEC(0), (%rcx) > + /* Keep L(nop_backward) target close to jmp for 2-byte encoding. 
> + */ > +L(nop_backward): > VZEROUPPER_RETURN > > + .p2align 4,, 8 > +L(more_8x_vec_backward_check_nop): > + /* rcx contains dst - src. Test for dst == src to skip all of > + memmove. */ > + testq %rcx, %rcx > + jz L(nop_backward) > L(more_8x_vec_backward): > /* Load the first 4 * VEC and last VEC to support overlapping > addresses. */ > - VMOVU (%rsi), %VEC(4) > + > + /* First vec was also loaded into VEC(0). */ > VMOVU VEC_SIZE(%rsi), %VEC(5) > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) > + /* Begining of region for 4x backward copy stored in rcx. */ > + leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) > - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) > - /* Save stop of the destination buffer. */ > - leaq -VEC_SIZE(%rdi, %rdx), %r11 > - /* Align destination end for aligned stores in the loop. Compute > - how much destination end is misaligned. */ > - leaq -VEC_SIZE(%rsi, %rdx), %rcx > - movq %r11, %r9 > - movq %r11, %r8 > - andq $(VEC_SIZE - 1), %r8 > - /* Adjust source. */ > - subq %r8, %rcx > - /* Adjust the end of destination which should be aligned now. */ > - subq %r8, %r9 > - /* Adjust length. */ > - subq %r8, %rdx > - > - .p2align 4 > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8) > + /* Subtract dst from src. Add back after dst aligned. */ > + subq %rdi, %rsi > + /* Align dst. */ > + andq $-(VEC_SIZE), %rcx > + /* Restore src. */ > + addq %rcx, %rsi > + > + /* Don't use multi-byte nop to align. */ > + .p2align 4,, 11 > L(loop_4x_vec_backward): > /* Copy 4 * VEC a time backward. */ > - VMOVU (%rcx), %VEC(0) > - VMOVU -VEC_SIZE(%rcx), %VEC(1) > - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > - addq $-(VEC_SIZE * 4), %rcx > - addq $-(VEC_SIZE * 4), %rdx > - VMOVA %VEC(0), (%r9) > - VMOVA %VEC(1), -VEC_SIZE(%r9) > - VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) > - VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) > - addq $-(VEC_SIZE * 4), %r9 > - cmpq $(VEC_SIZE * 4), %rdx > - ja L(loop_4x_vec_backward) > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1) > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > + VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3) > + VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4) > + addq $(VEC_SIZE * -4), %rsi > + VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx) > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx) > + VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx) > + VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx) > + addq $(VEC_SIZE * -4), %rcx > + cmpq %rcx, %rdi > + jb L(loop_4x_vec_backward) > /* Store the first 4 * VEC. */ > - VMOVU %VEC(4), (%rdi) > + VMOVU %VEC(0), (%rdi) > VMOVU %VEC(5), VEC_SIZE(%rdi) > VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) > VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) > /* Store the last VEC. */ > - VMOVU %VEC(8), (%r11) > + VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi) > + VZEROUPPER_RETURN > + > +#if defined USE_MULTIARCH && IS_IN (libc) > + /* L(skip_short_movsb_check) is only used with ERMS. Not for > + FSRM. */ > + .p2align 5,, 16 > +# if ALIGN_MOVSB > +L(skip_short_movsb_check): > +# if MOVSB_ALIGN_TO > VEC_SIZE > + VMOVU VEC_SIZE(%rsi), %VEC(1) > +# endif > +# if MOVSB_ALIGN_TO > (VEC_SIZE * 2) > +# error Unsupported MOVSB_ALIGN_TO > +# endif > + /* If CPU does not have FSRM two options for aligning. Align src > + if dst and src 4k alias. Otherwise align dst. */ > + testl $(PAGE_SIZE - 512), %ecx > + jnz L(movsb_align_dst) > + /* Fall through. dst and src 4k alias. It's better to align src > + here because the bottleneck will be loads dues to the false > + dependency on dst. */ > + > + /* rcx already has dst - src. */ > + movq %rcx, %r9 > + /* Add src to len. Subtract back after src aligned. 
-1 because > + src is initially aligned to MOVSB_ALIGN_TO - 1. */ > + leaq -1(%rsi, %rdx), %rcx > + /* Inclusively align src to MOVSB_ALIGN_TO - 1. */ > + orq $(MOVSB_ALIGN_TO - 1), %rsi > + /* Restore dst and len adjusted with new values for aligned dst. > + */ > + leaq 1(%rsi, %r9), %rdi > + subq %rsi, %rcx > + /* Finish aligning src. */ > + incq %rsi > + > + rep movsb > + > + VMOVU %VEC(0), (%r8) > +# if MOVSB_ALIGN_TO > VEC_SIZE > + VMOVU %VEC(1), VEC_SIZE(%r8) > +# endif > VZEROUPPER_RETURN > +# endif > + > + .p2align 4,, 12 > +L(movsb): > + movq %rdi, %rcx > + subq %rsi, %rcx > + /* Go to backwards temporal copy if overlap no matter what as > + backward REP MOVSB is slow and we don't want to use NT stores if > + there is overlap. */ > + cmpq %rdx, %rcx > + /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ > + jb L(more_8x_vec_backward_check_nop) > +# if ALIGN_MOVSB > + /* Save dest for storing aligning VECs later. */ > + movq %rdi, %r8 > +# endif > + /* If above __x86_rep_movsb_stop_threshold most likely is > + candidate for NT moves aswell. */ > + cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP > + jae L(large_memcpy_2x_check) > +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB > + /* Only avoid short movsb if CPU has FSRM. */ > + testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) > + jz L(skip_short_movsb_check) > +# if AVOID_SHORT_DISTANCE_REP_MOVSB > + /* Avoid "rep movsb" if RCX, the distance between source and > + destination, is N*4GB + [1..63] with N >= 0. */ > + > + /* ecx contains dst - src. Early check for backward copy > + conditions means only case of slow movsb with src = dst + [0, > + 63] is ecx in [-63, 0]. Use unsigned comparison with -64 check > + for that case. */ > + cmpl $-64, %ecx > + ja L(more_8x_vec_forward) > +# endif > +# endif > +# if ALIGN_MOVSB > +# if MOVSB_ALIGN_TO > VEC_SIZE > + VMOVU VEC_SIZE(%rsi), %VEC(1) > +# endif > +# if MOVSB_ALIGN_TO > (VEC_SIZE * 2) > +# error Unsupported MOVSB_ALIGN_TO > +# endif > + /* Fall through means cpu has FSRM. In that case exclusively > + align destination. */ > +L(movsb_align_dst): > + /* Subtract dst from src. Add back after dst aligned. */ > + subq %rdi, %rsi > + /* Exclusively align dst to MOVSB_ALIGN_TO (64). */ > + addq $(MOVSB_ALIGN_TO - 1), %rdi > + /* Add dst to len. Subtract back after dst aligned. */ > + leaq (%r8, %rdx), %rcx > + /* Finish aligning dst. */ > + andq $-(MOVSB_ALIGN_TO), %rdi > + /* Restore src and len adjusted with new values for aligned dst. > + */ > + addq %rdi, %rsi > + subq %rdi, %rcx > + > + rep movsb > + > + /* Store VECs loaded for aligning. */ > + VMOVU %VEC(0), (%r8) > +# if MOVSB_ALIGN_TO > VEC_SIZE > + VMOVU %VEC(1), VEC_SIZE(%r8) > +# endif > + VZEROUPPER_RETURN > +# else /* !ALIGN_MOVSB. */ > +L(skip_short_movsb_check): > + mov %RDX_LP, %RCX_LP > + rep movsb > + ret > +# endif > +#endif > > + .p2align 4,, 10 > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > - .p2align 4 > +L(large_memcpy_2x_check): > + cmp __x86_rep_movsb_threshold(%rip), %RDX_LP > + jb L(more_8x_vec_check) > L(large_memcpy_2x): > - /* Compute absolute value of difference between source and > - destination. */ > - movq %rdi, %r9 > - subq %rsi, %r9 > - movq %r9, %r8 > - leaq -1(%r9), %rcx > - sarq $63, %r8 > - xorq %r8, %r9 > - subq %r8, %r9 > - /* Don't use non-temporal store if there is overlap between > - destination and source since destination may be in cache when > - source is loaded. 
*/ > - cmpq %r9, %rdx > - ja L(more_8x_vec_check) > + /* To reach this point it is impossible for dst > src and > + overlap. Remaining to check is src > dst and overlap. rcx > + already contains dst - src. Negate rcx to get src - dst. If > + length > rcx then there is overlap and forward copy is best. */ > + negq %rcx > + cmpq %rcx, %rdx > + ja L(more_8x_vec_forward) > > /* Cache align destination. First store the first 64 bytes then > adjust alignments. */ > - VMOVU (%rsi), %VEC(8) > -#if VEC_SIZE < 64 > - VMOVU VEC_SIZE(%rsi), %VEC(9) > -#if VEC_SIZE < 32 > - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) > - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) > -#endif > -#endif > - VMOVU %VEC(8), (%rdi) > -#if VEC_SIZE < 64 > - VMOVU %VEC(9), VEC_SIZE(%rdi) > -#if VEC_SIZE < 32 > - VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) > - VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) > -#endif > -#endif > + > + /* First vec was also loaded into VEC(0). */ > +# if VEC_SIZE < 64 > + VMOVU VEC_SIZE(%rsi), %VEC(1) > +# if VEC_SIZE < 32 > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > +# endif > +# endif > + VMOVU %VEC(0), (%rdi) > +# if VEC_SIZE < 64 > + VMOVU %VEC(1), VEC_SIZE(%rdi) > +# if VEC_SIZE < 32 > + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) > + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) > +# endif > +# endif > + > /* Adjust source, destination, and size. */ > movq %rdi, %r8 > andq $63, %r8 > @@ -614,9 +767,13 @@ L(large_memcpy_2x): > /* Adjust length. */ > addq %r8, %rdx > > - /* Test if source and destination addresses will alias. If they do > - the larger pipeline in large_memcpy_4x alleviated the > + /* Test if source and destination addresses will alias. If they > + do the larger pipeline in large_memcpy_4x alleviated the > performance drop. */ > + > + /* ecx contains -(dst - src). not ecx will return dst - src - 1 > + which works for testing aliasing. */ > + notl %ecx > testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx > jz L(large_memcpy_4x) > > @@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer): > /* ecx stores inner loop counter. */ > movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > L(loop_large_memcpy_4x_inner): > - /* Only one prefetch set per page as doing 4 pages give more time > - for prefetcher to keep up. */ > + /* Only one prefetch set per page as doing 4 pages give more > + time for prefetcher to keep up. */ > PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks.
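The heart of the change being approved here is the new dispatch at L(more_8x_vec), which chooses between the forward and backward 4x-VEC loops from a single dst - src value kept in %rcx. The following is a rough C model of that decision, not the patch itself: the function name, signature, and explicit branches are illustrative only, the non-temporal (large_memcpy) branch is omitted, and the assembly derives the same result with branch-light arithmetic in %rcx and %r8.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

/* Illustrative model of the L(more_8x_vec) dispatch: returns 1 when
   the backward 4x-VEC loop should be used, 0 for the forward loop.
   (The asm additionally short-circuits src == dst to a plain return
   via L(more_8x_vec_backward_check_nop).)  */
static int
more_8x_vec_use_backward (const char *dst, const char *src, size_t len)
{
  uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;

  /* dst lands inside [src, src + len): copying forward would clobber
     unread source bytes, and backward REP MOVSB / NT stores must be
     avoided, so take the backward temporal loop.  */
  if (diff < len)
    return 1;

  /* src lands inside [dst, dst + len): correctness requires the
     forward loop.  This corresponds to the sign test the asm does
     with lea/xor/shr $63 on (dst - src) and (dst - src) + len.  */
  if ((uintptr_t) src - (uintptr_t) dst < len)
    return 0;

  /* No overlap.  Prefer backward only when (dst - src) mod 4096 is
     below 256, i.e. forward stores to dst would 4k-alias the loads
     from src trailing just behind them -- the 0 < (dst - src) <
     [256, 512] case called out in the commit message.  This mirrors
     the andl $(PAGE_SIZE - 256), %ecx / jz pair.  */
  return (diff & (PAGE_SIZE - 256)) == 0;
}

In the aliasing case the backward loop wins simply because its stores run ahead of (rather than just in front of) the loads at the same page offset, so the false store-to-load conflicts disappear.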
On Sat, Nov 6, 2021 at 12:12 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>
> On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > [...]
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.

Any comments or objections?

--Sunil
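Optimization (3) from the commit message, aligning the destination before `rep movsb`, works by loading the first 64 bytes up front, rounding the destination up to a 64-byte boundary while shifting src and the count by the same amount, running movsb on the aligned remainder, and storing the head bytes afterwards (L(movsb_align_dst) above). The C sketch below is only a model under stated assumptions: copy_with_aligned_movsb is a made-up name, the buffers are assumed not to overlap and len is assumed to be at least 64 (both hold by the time the asm reaches this path), and memcpy stands in for the VMOVU head load/store the real code keeps in VEC(0)/VEC(1).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define MOVSB_ALIGN_TO 64

/* Illustrative C version of the aligned-movsb ERMS path: assumes
   len >= 64 and no overlap between dst and src.  */
static void
copy_with_aligned_movsb (char *dst, const char *src, size_t len)
{
  unsigned char head[MOVSB_ALIGN_TO];

  /* Load the first 64 bytes up front; the asm keeps them live in
     VEC(0)/VEC(1) instead of spilling them.  */
  memcpy (head, src, MOVSB_ALIGN_TO);

  /* Round dst up to the next 64-byte boundary and shift src and the
     count by the same amount (the addq $63 / andq $-64 sequence).  */
  char *adst = (char *) (((uintptr_t) dst + (MOVSB_ALIGN_TO - 1))
			 & ~(uintptr_t) (MOVSB_ALIGN_TO - 1));
  size_t skip = (size_t) (adst - dst);
  const char *asrc = src + skip;
  size_t alen = len - skip;

  /* rep movsb on a 64-byte-aligned destination.  */
  __asm__ volatile ("rep movsb"
		    : "+D" (adst), "+S" (asrc), "+c" (alen)
		    :
		    : "memory");

  /* Store the head bytes last, covering whatever the alignment bump
     skipped (the VMOVU %VEC(0), (%r8) after the movsb).  */
  memcpy (dst, head, MOVSB_ALIGN_TO);
}

The aligned destination is what buys the [0, 30%] ERMS improvement quoted in the commit message; the trailing head store simply patches up the bytes the alignment bump skipped over, which is cheaper than branching on the initial misalignment.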
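
The third sketch corresponds to L(movsb_align_dst), the FSRM side of point 3): the first 64 bytes are loaded before any store, `rep movsb` then runs on a 64-byte-aligned destination, and the saved head is stored afterwards over the bytes the alignment skipped.  (The ERMS-without-FSRM side, which may align the source instead when source and destination 4k-alias, is not shown.)

#include <stdint.h>
#include <string.h>

#define MOVSB_ALIGN_TO 64

/* Rough equivalent of the align-dst-then-movsb path.  len here is at
   least the rep movsb threshold, so well above 64 bytes, and the
   overlap cases have already branched away.  */
static void
movsb_align_dst_sketch (char *dst, const char *src, size_t len)
{
  /* Stand-in for the VEC(0)/VEC(1) registers: grab the head before
     any store is issued.  */
  unsigned char head[MOVSB_ALIGN_TO];
  memcpy (head, src, MOVSB_ALIGN_TO);

  /* Round dst up to the next 64-byte boundary and advance src and
     shrink len by the same amount, as the sub/add/and sequence on
     %rdi/%rsi/%rcx does.  */
  uintptr_t d = (uintptr_t) dst;
  uintptr_t aligned =
    (d + MOVSB_ALIGN_TO - 1) & ~(uintptr_t) (MOVSB_ALIGN_TO - 1);
  size_t skip = aligned - d;

  /* rep movsb equivalent on the aligned remainder.  */
  memcpy ((char *) aligned, src + skip, len - skip);

  /* Store the saved head, covering the up-to-63 skipped bytes.  */
  memcpy (dst, head, MOVSB_ALIGN_TO);
}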
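
Finally, the reworked L(between_4_7) (and likewise the 8..15 and 16..31 cases) uses the usual branchless first-word/last-word trick; loading both words before storing keeps it safe for overlapping memmove arguments.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Copy 4 <= len <= 7 bytes: first and last 4 bytes overlap in the
   middle, and coincide exactly when len == 4.  */
static void
copy_4_to_7 (void *dst, const void *src, size_t len)
{
  uint32_t head, tail;
  memcpy (&head, src, 4);
  memcpy (&tail, (const char *) src + len - 4, 4);
  memcpy (dst, &head, 4);
  memcpy ((char *) dst + len - 4, &tail, 4);
}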