Message ID | 20210330001217.2430903-1-goldstein.w.n@gmail.com
---|---
State | New
Series | [v3,1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
On Mon, Mar 29, 2021 at 5:12 PM noah <goldstein.w.n@gmail.com> wrote: > > No Bug. This commit updates the large memcpy case (no overlap). The > update is to perform memcpy on either 2 or 4 contiguous pages at > once. This 1) helps to alleviate the affects of false memory aliasing > when destination and source have a close 4k alignment and 2) In most > cases and for most DRAM units is a modestly more efficient access > pattern. These changes are a clear performance improvement for > VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > pass. > > Signed-off-by: noah <goldstein.w.n@gmail.com> > --- > .../multiarch/memmove-vec-unaligned-erms.S | 323 ++++++++++++++---- > 1 file changed, 253 insertions(+), 70 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > index 897a3d9762..6d22e62a43 100644 > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > @@ -67,6 +67,35 @@ > # endif > #endif Please update comments at the beginning which describes the algorithm. > +#ifndef PAGE_SIZE > +# define PAGE_SIZE 4096 > +#endif > + > +#if PAGE_SIZE != 4096 > +# error Unsupported PAGE_SIZE > +#endif > + > +#ifndef LOG_PAGE_SIZE > +# define LOG_PAGE_SIZE 12 > +#endif > + > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) > +# error Invalid LOG_PAGE_SIZE > +#endif > + > +/* Byte per page for large_memcpy inner loop. */ > +#if VEC_SIZE == 64 > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2) > +#else > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4) > +#endif > + > + > +/* Amount to shift rdx by to compare for memcpy_large_4x. */ > +#ifndef LOG_4X_MEMCPY_THRESH > +# define LOG_4X_MEMCPY_THRESH 4 > +#endif > + > /* Avoid short distance rep movsb only with non-SSE vector. */ > #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB > # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) > @@ -106,6 +135,28 @@ > # error Unsupported PREFETCH_SIZE! > #endif > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ > + VMOVU (offset)base, vec0; \ > + VMOVU ((offset) + VEC_SIZE)base, vec1; > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ > + VMOVNT vec0, (offset)base; \ > + VMOVNT vec1, ((offset) + VEC_SIZE)base; > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > + VMOVU (offset)base, vec0; \ > + VMOVU ((offset) + VEC_SIZE)base, vec1; \ > + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ > + VMOVU ((offset) + VEC_SIZE * 3)base, vec3; > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > + VMOVNT vec0, (offset)base; \ > + VMOVNT vec1, ((offset) + VEC_SIZE)base; \ > + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ > + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; > +#else > +# error Invalid LARGE_LOAD_SIZE > +#endif > + > #ifndef SECTION > # error SECTION is not defined! > #endif > @@ -393,6 +444,15 @@ L(last_4x_vec): > VZEROUPPER_RETURN > > L(more_8x_vec): > + /* Check if non-temporal move candidate. */ > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > + /* Check non-temporal store threshold. */ > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > + ja L(large_memcpy_2x) > +#endif > + /* Entry if rdx is greater than non-temporal threshold but there > + is overlap. 
*/ > +L(more_8x_vec_check): > cmpq %rsi, %rdi > ja L(more_8x_vec_backward) > /* Source == destination is less common. */ > @@ -419,11 +479,6 @@ L(more_8x_vec): > subq %r8, %rdi > /* Adjust length. */ > addq %r8, %rdx > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > - /* Check non-temporal store threshold. */ > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > - ja L(large_forward) > -#endif > L(loop_4x_vec_forward): > /* Copy 4 * VEC a time forward. */ > VMOVU (%rsi), %VEC(0) > @@ -470,11 +525,6 @@ L(more_8x_vec_backward): > subq %r8, %r9 > /* Adjust length. */ > subq %r8, %rdx > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > - /* Check non-temporal store threshold. */ > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > - ja L(large_backward) > -#endif > L(loop_4x_vec_backward): > /* Copy 4 * VEC a time backward. */ > VMOVU (%rcx), %VEC(0) > @@ -500,72 +550,205 @@ L(loop_4x_vec_backward): > VZEROUPPER_RETURN > > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > -L(large_forward): > - /* Don't use non-temporal store if there is overlap between > - destination and source since destination may be in cache > - when source is loaded. */ > - leaq (%rdi, %rdx), %r10 > - cmpq %r10, %rsi > - jb L(loop_4x_vec_forward) > -L(loop_large_forward): > - /* Copy 4 * VEC a time forward with non-temporal stores. */ > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) > +L(large_memcpy_2x): > + /* Compute absolute value of difference between source and > + destination. */ > + movq %rdi, %r9 > + subq %rsi, %r9 > + movq %r9, %r8 > + leaq -1(%r9), %rcx > + sarq $63, %r8 > + xorq %r8, %r9 > + subq %r8, %r9 > + /* Don't use non-temporal store if there is overlap between > + destination and source since destination may be in cache when > + source is loaded. */ > + cmpq %r9, %rdx > + ja L(more_8x_vec_check) > + > + /* Cache align destination. First store the first 64 bytes then > + adjust alignments. */ > + VMOVU (%rsi), %VEC(8) > +#if VEC_SIZE < 64 > + VMOVU VEC_SIZE(%rsi), %VEC(9) > +#if VEC_SIZE < 32 > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) > +#endif > +#endif > + VMOVU %VEC(8), (%rdi) > +#if VEC_SIZE < 64 > + VMOVU %VEC(9), VEC_SIZE(%rdi) > +#if VEC_SIZE < 32 > + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) > + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) > +#endif > +#endif > + /* Adjust source, destination, and size. */ > + MOVQ %rdi, %r8 > + andq $63, %r8 > + /* Get the negative of offset for alignment. */ > + subq $64, %r8 > + /* Adjust source. */ > + subq %r8, %rsi > + /* Adjust destination which should be aligned now. */ > + subq %r8, %rdi > + /* Adjust length. */ > + addq %r8, %rdx > + > + /* Test if source and destination addresses will alias. If they do > + the larger pipeline in large_memcpy_4x alleviated the > + performance drop. */ > + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx > + jz L(large_memcpy_4x) > + > + movq %rdx, %r10 > + shrq $LOG_4X_MEMCPY_THRESH, %r10 > + cmp __x86_shared_non_temporal_threshold(%rip), %r10 > + jae L(large_memcpy_4x) > + > + /* edx will store remainder size for copying tail. */ > + andl $(PAGE_SIZE * 2 - 1), %edx > + /* r10 stores outer loop counter. */ > + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 > + /* Copy 4x VEC at a time from 2 pages. */ > + .p2align 5 Is alignment really needed here? > +L(loop_large_memcpy_2x_outer): > + /* ecx stores inner loop counter. 
*/ > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > +L(loop_large_memcpy_2x_inner): > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) > + /* Load vectors from rsi. */ > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > + addq $LARGE_LOAD_SIZE, %rsi > + /* Non-temporal store vectors to rdi. */ > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > + addq $LARGE_LOAD_SIZE, %rdi > + decl %ecx > + jnz L(loop_large_memcpy_2x_inner) > + addq $PAGE_SIZE, %rdi > + addq $PAGE_SIZE, %rsi > + decq %r10 > + jne L(loop_large_memcpy_2x_outer) > + sfence > + > + /* Check if only last 4 loads are needed. */ > + cmpl $(VEC_SIZE * 4), %edx > + jbe L(large_memcpy_2x_end) > + > + /* Handle the last 2 * PAGE_SIZE bytes. Use temporal stores > + here. The region will fit in cache and it should fit user > + expectations for the tail of the memcpy region to be hot. */ > + .p2align 4 Is alignment really needed here? > +L(loop_large_memcpy_2x_tail): > + /* Copy 4 * VEC a time forward with temporal stores. */ > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > VMOVU (%rsi), %VEC(0) > VMOVU VEC_SIZE(%rsi), %VEC(1) > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > - addq $PREFETCHED_LOAD_SIZE, %rsi > - subq $PREFETCHED_LOAD_SIZE, %rdx > - VMOVNT %VEC(0), (%rdi) > - VMOVNT %VEC(1), VEC_SIZE(%rdi) > - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) > - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) > - addq $PREFETCHED_LOAD_SIZE, %rdi > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > - ja L(loop_large_forward) > - sfence > + addq $(VEC_SIZE * 4), %rsi > + subl $(VEC_SIZE * 4), %edx There should be a tab before instruction. > + VMOVA %VEC(0), (%rdi) > + VMOVA %VEC(1), VEC_SIZE(%rdi) > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > + addq $(VEC_SIZE * 4), %rdi > + cmpl $(VEC_SIZE * 4), %edx There should be a tab before instruction. > + ja L(loop_large_memcpy_2x_tail) > + > +L(large_memcpy_2x_end): > /* Store the last 4 * VEC. */ > - VMOVU %VEC(5), (%rcx) > - VMOVU %VEC(6), -VEC_SIZE(%rcx) > - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) > - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) > - /* Store the first VEC. */ > - VMOVU %VEC(4), (%r11) > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > + > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) There should be a tab before instruction. > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > VZEROUPPER_RETURN > - > -L(large_backward): > - /* Don't use non-temporal store if there is overlap between > - destination and source since destination may be in cache > - when source is loaded. */ > - leaq (%rcx, %rdx), %r10 > - cmpq %r10, %r9 > - jb L(loop_4x_vec_backward) > -L(loop_large_backward): > - /* Copy 4 * VEC a time backward with non-temporal stores. 
*/ > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) > - VMOVU (%rcx), %VEC(0) > - VMOVU -VEC_SIZE(%rcx), %VEC(1) > - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > - subq $PREFETCHED_LOAD_SIZE, %rcx > - subq $PREFETCHED_LOAD_SIZE, %rdx > - VMOVNT %VEC(0), (%r9) > - VMOVNT %VEC(1), -VEC_SIZE(%r9) > - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) > - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) > - subq $PREFETCHED_LOAD_SIZE, %r9 > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > - ja L(loop_large_backward) > - sfence > - /* Store the first 4 * VEC. */ > - VMOVU %VEC(4), (%rdi) > - VMOVU %VEC(5), VEC_SIZE(%rdi) > - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) > - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) > - /* Store the last VEC. */ > - VMOVU %VEC(8), (%r11) > + > +L(large_memcpy_4x): > + movq %rdx, %r10 > + /* edx will store remainder size for copying tail. */ > + andl $(PAGE_SIZE * 4 - 1), %edx > + /* r10 stores outer loop counter. */ > + shrq $(LOG_PAGE_SIZE + 2), %r10 > + /* Copy 4x VEC at a time from 4 pages. */ > + .p2align 5 Is alignment really needed here? > +L(loop_large_memcpy_4x_outer): > + /* ecx stores inner loop counter. */ > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > +L(loop_large_memcpy_4x_inner): > + /* Only one prefetch set per page as doing 4 pages give more time > + for prefetcher to keep up. */ > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) > + /* Load vectors from rsi. */ > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > + addq $LARGE_LOAD_SIZE, %rsi > + /* Non-temporal store vectors to rdi. */ > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > + addq $LARGE_LOAD_SIZE, %rdi > + decl %ecx > + jnz L(loop_large_memcpy_4x_inner) > + addq $(PAGE_SIZE * 3), %rdi > + addq $(PAGE_SIZE * 3), %rsi > + decq %r10 > + jne L(loop_large_memcpy_4x_outer) > + sfence > + > + /* Check if only last 4 loads are needed. */ > + cmpl $(VEC_SIZE * 4), %edx > + jbe L(large_memcpy_4x_end) > + > + /* Handle the last 4 * PAGE_SIZE bytes. */ > + .p2align 4 Is alignment really needed here? > +L(loop_large_memcpy_4x_tail): > + /* Copy 4 * VEC a time forward with temporal stores. */ > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > + VMOVU (%rsi), %VEC(0) > + VMOVU VEC_SIZE(%rsi), %VEC(1) > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > + addq $(VEC_SIZE * 4), %rsi > + subl $(VEC_SIZE * 4), %edx Tab. > + VMOVA %VEC(0), (%rdi) > + VMOVA %VEC(1), VEC_SIZE(%rdi) > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > + addq $(VEC_SIZE * 4), %rdi > + cmpl $(VEC_SIZE * 4), %edx > + ja L(loop_large_memcpy_4x_tail) > + > +L(large_memcpy_4x_end): > + /* Store the last 4 * VEC. 
*/ > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > + > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) Tab. > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > VZEROUPPER_RETURN > #endif > END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) > -- > 2.29.2 >
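For readers skimming the quoted assembly, the access pattern that L(loop_large_memcpy_2x_inner) implements can be sketched in C with AVX2 intrinsics. This is illustrative only: the function name is invented, it assumes VEC_SIZE == 32 (so LARGE_LOAD_SIZE == 128), 4 KiB pages, a destination that is already 64-byte aligned (the real code aligns it first), a length that is a multiple of 2 * PAGE_SIZE, and it omits the prefetches and tail handling of the actual patch.

```c
#include <immintrin.h>
#include <stddef.h>

/* Rough sketch of the L(loop_large_memcpy_2x) access pattern: fill two
   consecutive 4 KiB pages per outer iteration, loading one 128-byte set
   from each page and storing both sets with non-temporal stores, as
   LOAD_ONE_SET/STORE_ONE_SET do for VEC_SIZE == 32.  */
static void
large_memcpy_2x_sketch (char *dst, const char *src, size_t bytes)
{
  enum { PAGE = 4096, VEC = 32, LOAD = 4 * VEC };	/* LARGE_LOAD_SIZE.  */
  __m256i a[4], b[4];

  for (size_t outer = 0; outer < bytes / (2 * PAGE); outer++)
    {
      for (size_t off = 0; off < PAGE; off += LOAD)
	{
	  /* LOAD_ONE_SET from page 0 and from page 1.  */
	  for (int v = 0; v < 4; v++)
	    {
	      a[v] = _mm256_loadu_si256 ((const __m256i *) (src + off + v * VEC));
	      b[v] = _mm256_loadu_si256 ((const __m256i *) (src + PAGE + off + v * VEC));
	    }
	  /* STORE_ONE_SET to page 0 and page 1 with streaming stores
	     (what VMOVNT expands to).  */
	  for (int v = 0; v < 4; v++)
	    {
	      _mm256_stream_si256 ((__m256i *) (dst + off + v * VEC), a[v]);
	      _mm256_stream_si256 ((__m256i *) (dst + PAGE + off + v * VEC), b[v]);
	    }
	}
      src += 2 * PAGE;
      dst += 2 * PAGE;
    }
  _mm_sfence ();	/* Order the streaming stores, like the sfence after the loop.  */
}
```

The 4x variant follows the same shape with four pages per outer iteration and sixteen vector registers in flight, which is what the patch comments mean by the "larger pipeline" in large_memcpy_4x.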
On Tue, Mar 30, 2021 at 3:39 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Mon, Mar 29, 2021 at 5:12 PM noah <goldstein.w.n@gmail.com> wrote: > > > > No Bug. This commit updates the large memcpy case (no overlap). The > > update is to perform memcpy on either 2 or 4 contiguous pages at > > once. This 1) helps to alleviate the affects of false memory aliasing > > when destination and source have a close 4k alignment and 2) In most > > cases and for most DRAM units is a modestly more efficient access > > pattern. These changes are a clear performance improvement for > > VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all > > pass. > > > > Signed-off-by: noah <goldstein.w.n@gmail.com> > > --- > > .../multiarch/memmove-vec-unaligned-erms.S | 323 ++++++++++++++---- > > 1 file changed, 253 insertions(+), 70 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > index 897a3d9762..6d22e62a43 100644 > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S > > @@ -67,6 +67,35 @@ > > # endif > > #endif > > Please update comments at the beginning which describes the algorithm. done. > > > +#ifndef PAGE_SIZE > > +# define PAGE_SIZE 4096 > > +#endif > > + > > +#if PAGE_SIZE != 4096 > > +# error Unsupported PAGE_SIZE > > +#endif > > + > > +#ifndef LOG_PAGE_SIZE > > +# define LOG_PAGE_SIZE 12 > > +#endif > > + > > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) > > +# error Invalid LOG_PAGE_SIZE > > +#endif > > + > > +/* Byte per page for large_memcpy inner loop. */ > > +#if VEC_SIZE == 64 > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2) > > +#else > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4) > > +#endif > > + > > + > > +/* Amount to shift rdx by to compare for memcpy_large_4x. */ > > +#ifndef LOG_4X_MEMCPY_THRESH > > +# define LOG_4X_MEMCPY_THRESH 4 > > +#endif > > + > > /* Avoid short distance rep movsb only with non-SSE vector. */ > > #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB > > # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) > > @@ -106,6 +135,28 @@ > > # error Unsupported PREFETCH_SIZE! > > #endif > > > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ > > + VMOVU (offset)base, vec0; \ > > + VMOVU ((offset) + VEC_SIZE)base, vec1; > > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ > > + VMOVNT vec0, (offset)base; \ > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; > > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > + VMOVU (offset)base, vec0; \ > > + VMOVU ((offset) + VEC_SIZE)base, vec1; \ > > + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ > > + VMOVU ((offset) + VEC_SIZE * 3)base, vec3; > > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ > > + VMOVNT vec0, (offset)base; \ > > + VMOVNT vec1, ((offset) + VEC_SIZE)base; \ > > + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ > > + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; > > +#else > > +# error Invalid LARGE_LOAD_SIZE > > +#endif > > + > > #ifndef SECTION > > # error SECTION is not defined! > > #endif > > @@ -393,6 +444,15 @@ L(last_4x_vec): > > VZEROUPPER_RETURN > > > > L(more_8x_vec): > > + /* Check if non-temporal move candidate. */ > > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > + /* Check non-temporal store threshold. 
*/ > > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > + ja L(large_memcpy_2x) > > +#endif > > + /* Entry if rdx is greater than non-temporal threshold but there > > + is overlap. */ > > +L(more_8x_vec_check): > > cmpq %rsi, %rdi > > ja L(more_8x_vec_backward) > > /* Source == destination is less common. */ > > @@ -419,11 +479,6 @@ L(more_8x_vec): > > subq %r8, %rdi > > /* Adjust length. */ > > addq %r8, %rdx > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > - /* Check non-temporal store threshold. */ > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > - ja L(large_forward) > > -#endif > > L(loop_4x_vec_forward): > > /* Copy 4 * VEC a time forward. */ > > VMOVU (%rsi), %VEC(0) > > @@ -470,11 +525,6 @@ L(more_8x_vec_backward): > > subq %r8, %r9 > > /* Adjust length. */ > > subq %r8, %rdx > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > - /* Check non-temporal store threshold. */ > > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP > > - ja L(large_backward) > > -#endif > > L(loop_4x_vec_backward): > > /* Copy 4 * VEC a time backward. */ > > VMOVU (%rcx), %VEC(0) > > @@ -500,72 +550,205 @@ L(loop_4x_vec_backward): > > VZEROUPPER_RETURN > > > > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) > > -L(large_forward): > > - /* Don't use non-temporal store if there is overlap between > > - destination and source since destination may be in cache > > - when source is loaded. */ > > - leaq (%rdi, %rdx), %r10 > > - cmpq %r10, %rsi > > - jb L(loop_4x_vec_forward) > > -L(loop_large_forward): > > - /* Copy 4 * VEC a time forward with non-temporal stores. */ > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) > > +L(large_memcpy_2x): > > + /* Compute absolute value of difference between source and > > + destination. */ > > + movq %rdi, %r9 > > + subq %rsi, %r9 > > + movq %r9, %r8 > > + leaq -1(%r9), %rcx > > + sarq $63, %r8 > > + xorq %r8, %r9 > > + subq %r8, %r9 > > + /* Don't use non-temporal store if there is overlap between > > + destination and source since destination may be in cache when > > + source is loaded. */ > > + cmpq %r9, %rdx > > + ja L(more_8x_vec_check) > > + > > + /* Cache align destination. First store the first 64 bytes then > > + adjust alignments. */ > > + VMOVU (%rsi), %VEC(8) > > +#if VEC_SIZE < 64 > > + VMOVU VEC_SIZE(%rsi), %VEC(9) > > +#if VEC_SIZE < 32 > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) > > +#endif > > +#endif > > + VMOVU %VEC(8), (%rdi) > > +#if VEC_SIZE < 64 > > + VMOVU %VEC(9), VEC_SIZE(%rdi) > > +#if VEC_SIZE < 32 > > + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) > > + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) > > +#endif > > +#endif > > + /* Adjust source, destination, and size. */ > > + MOVQ %rdi, %r8 > > + andq $63, %r8 > > + /* Get the negative of offset for alignment. */ > > + subq $64, %r8 > > + /* Adjust source. */ > > + subq %r8, %rsi > > + /* Adjust destination which should be aligned now. */ > > + subq %r8, %rdi > > + /* Adjust length. */ > > + addq %r8, %rdx > > + > > + /* Test if source and destination addresses will alias. If they do > > + the larger pipeline in large_memcpy_4x alleviated the > > + performance drop. 
*/ > > + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx > > + jz L(large_memcpy_4x) > > + > > + movq %rdx, %r10 > > + shrq $LOG_4X_MEMCPY_THRESH, %r10 > > + cmp __x86_shared_non_temporal_threshold(%rip), %r10 > > + jae L(large_memcpy_4x) > > + > > + /* edx will store remainder size for copying tail. */ > > + andl $(PAGE_SIZE * 2 - 1), %edx > > + /* r10 stores outer loop counter. */ > > + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 > > + /* Copy 4x VEC at a time from 2 pages. */ > > + .p2align 5 > > Is alignment really needed here? Adjusted to .p2align 4. I think it's worth it for the following reasons. 1) We know this loop (and the inner loop) are going to run many times so the cost of the nops to enter the loop will be negligible overhead. 2) This loop body (as well as the 4x one) is too large to the LSD to its going to be running out of the uop cache where alignment can matter a great deal. > > > +L(loop_large_memcpy_2x_outer): > > + /* ecx stores inner loop counter. */ > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > +L(loop_large_memcpy_2x_inner): > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) > > + /* Load vectors from rsi. */ > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > + addq $LARGE_LOAD_SIZE, %rsi > > + /* Non-temporal store vectors to rdi. */ > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > + addq $LARGE_LOAD_SIZE, %rdi > > + decl %ecx > > + jnz L(loop_large_memcpy_2x_inner) > > + addq $PAGE_SIZE, %rdi > > + addq $PAGE_SIZE, %rsi > > + decq %r10 > > + jne L(loop_large_memcpy_2x_outer) > > + sfence > > + > > + /* Check if only last 4 loads are needed. */ > > + cmpl $(VEC_SIZE * 4), %edx > > + jbe L(large_memcpy_2x_end) > > + > > + /* Handle the last 2 * PAGE_SIZE bytes. Use temporal stores > > + here. The region will fit in cache and it should fit user > > + expectations for the tail of the memcpy region to be hot. */ > > + .p2align 4 > > Is alignment really needed here? no, removed. > > > +L(loop_large_memcpy_2x_tail): > > + /* Copy 4 * VEC a time forward with temporal stores. */ > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > VMOVU (%rsi), %VEC(0) > > VMOVU VEC_SIZE(%rsi), %VEC(1) > > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > - addq $PREFETCHED_LOAD_SIZE, %rsi > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > - VMOVNT %VEC(0), (%rdi) > > - VMOVNT %VEC(1), VEC_SIZE(%rdi) > > - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) > > - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) > > - addq $PREFETCHED_LOAD_SIZE, %rdi > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > - ja L(loop_large_forward) > > - sfence > > + addq $(VEC_SIZE * 4), %rsi > > + subl $(VEC_SIZE * 4), %edx > > There should be a tab before instruction. done and fixed all tab issues AFAICT. > > > + VMOVA %VEC(0), (%rdi) > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > + addq $(VEC_SIZE * 4), %rdi > > + cmpl $(VEC_SIZE * 4), %edx > > There should be a tab before instruction. done. 
> > > + ja L(loop_large_memcpy_2x_tail) > > + > > +L(large_memcpy_2x_end): > > /* Store the last 4 * VEC. */ > > - VMOVU %VEC(5), (%rcx) > > - VMOVU %VEC(6), -VEC_SIZE(%rcx) > > - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) > > - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) > > - /* Store the first VEC. */ > > - VMOVU %VEC(4), (%r11) > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > + > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > There should be a tab before instruction. done. > > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > VZEROUPPER_RETURN > > - > > -L(large_backward): > > - /* Don't use non-temporal store if there is overlap between > > - destination and source since destination may be in cache > > - when source is loaded. */ > > - leaq (%rcx, %rdx), %r10 > > - cmpq %r10, %r9 > > - jb L(loop_4x_vec_backward) > > -L(loop_large_backward): > > - /* Copy 4 * VEC a time backward with non-temporal stores. */ > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) > > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) > > - VMOVU (%rcx), %VEC(0) > > - VMOVU -VEC_SIZE(%rcx), %VEC(1) > > - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) > > - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) > > - subq $PREFETCHED_LOAD_SIZE, %rcx > > - subq $PREFETCHED_LOAD_SIZE, %rdx > > - VMOVNT %VEC(0), (%r9) > > - VMOVNT %VEC(1), -VEC_SIZE(%r9) > > - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) > > - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) > > - subq $PREFETCHED_LOAD_SIZE, %r9 > > - cmpq $PREFETCHED_LOAD_SIZE, %rdx > > - ja L(loop_large_backward) > > - sfence > > - /* Store the first 4 * VEC. */ > > - VMOVU %VEC(4), (%rdi) > > - VMOVU %VEC(5), VEC_SIZE(%rdi) > > - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) > > - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) > > - /* Store the last VEC. */ > > - VMOVU %VEC(8), (%r11) > > + > > +L(large_memcpy_4x): > > + movq %rdx, %r10 > > + /* edx will store remainder size for copying tail. */ > > + andl $(PAGE_SIZE * 4 - 1), %edx > > + /* r10 stores outer loop counter. */ > > + shrq $(LOG_PAGE_SIZE + 2), %r10 > > + /* Copy 4x VEC at a time from 4 pages. */ > > + .p2align 5 > > Is alignment really needed here? Moved to .p2align 4 but think its worth it for same reason as above. > > > +L(loop_large_memcpy_4x_outer): > > + /* ecx stores inner loop counter. */ > > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx > > +L(loop_large_memcpy_4x_inner): > > + /* Only one prefetch set per page as doing 4 pages give more time > > + for prefetcher to keep up. */ > > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) > > + /* Load vectors from rsi. */ > > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > + addq $LARGE_LOAD_SIZE, %rsi > > + /* Non-temporal store vectors to rdi. 
*/ > > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) > > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) > > + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) > > + addq $LARGE_LOAD_SIZE, %rdi > > + decl %ecx > > + jnz L(loop_large_memcpy_4x_inner) > > + addq $(PAGE_SIZE * 3), %rdi > > + addq $(PAGE_SIZE * 3), %rsi > > + decq %r10 > > + jne L(loop_large_memcpy_4x_outer) > > + sfence > > + > > + /* Check if only last 4 loads are needed. */ > > + cmpl $(VEC_SIZE * 4), %edx > > + jbe L(large_memcpy_4x_end) > > + > > + /* Handle the last 4 * PAGE_SIZE bytes. */ > > + .p2align 4 > > Is alignment really needed here? no, removed. > > > +L(loop_large_memcpy_4x_tail): > > + /* Copy 4 * VEC a time forward with temporal stores. */ > > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) > > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) > > + VMOVU (%rsi), %VEC(0) > > + VMOVU VEC_SIZE(%rsi), %VEC(1) > > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) > > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) > > + addq $(VEC_SIZE * 4), %rsi > > + subl $(VEC_SIZE * 4), %edx > > Tab. done. > > > + VMOVA %VEC(0), (%rdi) > > + VMOVA %VEC(1), VEC_SIZE(%rdi) > > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) > > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) > > + addq $(VEC_SIZE * 4), %rdi > > + cmpl $(VEC_SIZE * 4), %edx > > + ja L(loop_large_memcpy_4x_tail) > > + > > +L(large_memcpy_4x_end): > > + /* Store the last 4 * VEC. */ > > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) > > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) > > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) > > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) > > + > > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) > > Tab. done. > > > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) > > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) > > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) > > VZEROUPPER_RETURN > > #endif > > END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) > > -- > > 2.29.2 > > > > > -- > H.J.
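Setting the review nits aside, the control flow added at L(more_8x_vec)/L(large_memcpy_2x) can be restated in C. Again a hedged sketch: choose_large_path and non_temporal_threshold are invented names (the latter standing in for __x86_shared_non_temporal_threshold), the constants mirror the VEC_SIZE == 32 build, and a 64-bit arithmetic right shift is assumed for the sign mask (the assembly uses sarq).

```c
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32			/* AVX2 build.  */
#define LOG_4X_MEMCPY_THRESH 4

/* Returns 0 for the regular overlapping path (more_8x_vec_check),
   2 for large_memcpy_2x and 4 for large_memcpy_4x.  */
static int
choose_large_path (uintptr_t dst, uintptr_t src, size_t len,
		   size_t non_temporal_threshold)
{
  if (len <= non_temporal_threshold)
    return 0;				/* Not a non-temporal candidate.  */

  /* Branchless absolute difference, mirroring the sub/sar/xor/sub
     sequence at L(large_memcpy_2x).  */
  intptr_t diff = (intptr_t) (dst - src);
  intptr_t mask = diff >> 63;		/* 0 or -1, like sarq $63.  */
  uintptr_t absdiff = (uintptr_t) ((diff ^ mask) - mask);
  if (len > absdiff)
    return 0;				/* Overlap: no non-temporal stores.  */

  /* Close 4k alignment of src and dst: the false-aliasing case the
     commit message describes; take the wider 4-page pipeline.  */
  if ((((dst - src) - 1) & (PAGE_SIZE - VEC_SIZE * 8)) == 0)
    return 4;

  /* Very large copies (roughly 16x the non-temporal threshold) also
     take the 4-page pipeline.  */
  if ((len >> LOG_4X_MEMCPY_THRESH) >= non_temporal_threshold)
    return 4;

  return 2;
}
```

The two return-0 cases fall back to the existing forward/backward 4x-VEC loops, which is why the patch moves the non-temporal threshold check up to L(more_8x_vec) and adds the L(more_8x_vec_check) entry point.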
No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the effects of false memory aliasing
when destination and source have a close 4k alignment and 2) is, in
most cases and for most DRAM units, a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.

Signed-off-by: noah <goldstein.w.n@gmail.com>
---
 .../multiarch/memmove-vec-unaligned-erms.S | 323 ++++++++++++++----
 1 file changed, 253 insertions(+), 70 deletions(-)
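For a quick sense of scale behind the "2 or 4 contiguous pages" wording, the numbers fall directly out of the constants in the patch, assuming the AVX2 build (VEC_SIZE = 32, hence LARGE_LOAD_SIZE = 128); the enum names below are illustrative, not from the source:

```c
/* Bytes moved per loop level, VEC_SIZE == 32 (illustrative names).  */
enum
{
  LARGE_LOAD_SIZE_32 = 4 * 32,			/* One LOAD/STORE_ONE_SET: 128 B.  */
  MEMCPY_2X_INNER = 2 * LARGE_LOAD_SIZE_32,	/* One set from each of 2 pages: 256 B, 8 vector regs.  */
  MEMCPY_2X_OUTER = 2 * 4096,			/* Two full pages: 8 KiB.  */
  MEMCPY_4X_INNER = 4 * LARGE_LOAD_SIZE_32,	/* One set from each of 4 pages: 512 B, 16 vector regs.  */
  MEMCPY_4X_OUTER = 4 * 4096,			/* Four full pages: 16 KiB.  */
};
```

Each inner loop runs PAGE_SIZE / LARGE_LOAD_SIZE = 32 times per outer iteration, so the streaming stores walk all 2 (or 4) pages before the pointers advance to the next group of pages.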