Message ID | 20210330213807.492074-1-goldstein.w.n@gmail.com
State      | New
Series     | [v4,1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
On Tue, Mar 30, 2021 at 2:38 PM noah <goldstein.w.n@gmail.com> wrote:
>
> No Bug. This commit updates the large memcpy case (no overlap). The
> update is to perform memcpy on either 2 or 4 contiguous pages at
> once.
[...]
>  .../multiarch/memmove-vec-unaligned-erms.S | 326 ++++++++++++++----
>  1 file changed, 258 insertions(+), 68 deletions(-)

1. There are many trailing whitespaces.

2. Replace "jccSPACElabel" with "jccTABlabel".

3. Replace "insnSPACEoperand" with "insnTABoperand" if needed.

[...]
> +	/* Adjust source, destination, and size.  */
> +	MOVQ %rdi, %r8

movq

[...]
> +	/* Copy 4x VEC at a time from 2 pages.  */
> +	.p2align 4

If you drop .p2align, will it show up on the glibc benchtest?

[...]
> +	/* Copy 4x VEC at a time from 4 pages.  */
> +	.p2align 4

If you drop .p2align, will it show up on the glibc benchtest?
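For readers skimming the review: the MOVQ/andq/subq sequence flagged above is the step that cache-aligns the destination before the non-temporal loops — the first 64 bytes are stored with unaligned moves, then dst is rounded up to the next 64-byte boundary and src and the length are adjusted by the same amount. The following is only a rough C sketch of that adjustment, not the committed glibc code; the function and variable names are made up for illustration.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the destination-alignment step (the MOVQ / andq $63 /
   subq $64 sequence): copy the first 64 bytes unaligned, then bump dst
   to the next 64-byte boundary, moving src and shrinking len to match.  */
static void align_dst_to_64(char **dst, const char **src, size_t *len)
{
    memcpy(*dst, *src, 64);                     /* VMOVU of the first 64 bytes */
    intptr_t r8 = ((uintptr_t)*dst & 63) - 64;  /* negative offset to the boundary */
    *src -= r8;                                 /* subq %r8, %rsi */
    *dst -= r8;                                 /* subq %r8, %rdi: dst now 64B aligned */
    *len += (size_t)r8;                         /* addq %r8, %rdx */
}

If dst is already 64-byte aligned this still skips a full 64 bytes, which is safe because those bytes were just copied unconditionally.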
On Thu, Apr 1, 2021 at 9:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Mar 30, 2021 at 2:38 PM noah <goldstein.w.n@gmail.com> wrote:
> >
> > No Bug. This commit updates the large memcpy case (no overlap). The
> > update is to perform memcpy on either 2 or 4 contiguous pages at
> > once.
[...]
> 1. There are many trailing whitespaces.

done.

> 2. Replace "jccSPACElabel" with "jccTABlabel".

done.

> 3. Replace "insnSPACEoperand" with "insnTABoperand" if needed.

done.

[...]
> > +	MOVQ %rdi, %r8
>
> movq

done.

[...]
> > +	/* Copy 4x VEC at a time from 2 pages.  */
> > +	.p2align 4
>
> If you drop .p2align, will it show up on the glibc benchtest?

No. Dropped it.

[...]
> > +	/* Copy 4x VEC at a time from 4 pages.  */
> > +	.p2align 4
>
> If you drop .p2align, will it show up on the glibc benchtest?

No. Dropped it.

[...]
> --
> H.J.
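One detail of the patch under discussion that is easy to miss in the quoted diff: L(large_memcpy_2x) only takes the non-temporal path when the length is no larger than the distance between destination and source, and it computes that distance branchlessly (the sarq/xorq/subq triple). A hedged C sketch of the same check, with invented names:

#include <stddef.h>
#include <stdint.h>

/* Sketch of the overlap test at L(large_memcpy_2x): non-temporal stores
   are only used when len <= |dst - src|; otherwise control goes back to
   the ordinary forward/backward copy loops.  */
static int nt_copy_is_safe(const char *dst, const char *src, size_t len)
{
    int64_t diff = (int64_t)((intptr_t)dst - (intptr_t)src); /* movq; subq */
    int64_t sign = diff >> 63;                               /* sarq $63: 0 or -1 */
    uint64_t dist = (uint64_t)((diff ^ sign) - sign);        /* xorq; subq = |diff| */
    return len <= dist;                  /* cmpq %r9, %rdx; ja -> overlap path */
}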
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 897a3d9762..dae3e2bac5 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -35,7 +35,16 @@
    __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
    7. If size >= __x86_shared_non_temporal_threshold and there is no
       overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - sources' page
+      alignment is less than 8 * VEC_SIZE.
+   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias copy from 4 pages at once using
+      non-temporal stores.  */
 
 #include <sysdep.h>
 
@@ -67,6 +76,35 @@
 # endif
 #endif
 
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Byte per page for large_memcpy inner loop.  */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+
+/* Amount to shift rdx by to compare for memcpy_large_4x.  */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
 /* Avoid short distance rep movsb only with non-SSE vector.  */
 #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -106,6 +144,28 @@
 # error Unsupported PREFETCH_SIZE!
 #endif
 
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVU (offset)base, vec0; \
+	VMOVU ((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVNT vec0, (offset)base; \
+	VMOVNT vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVU (offset)base, vec0; \
+	VMOVU ((offset) + VEC_SIZE)base, vec1; \
+	VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
+	VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVNT vec0, (offset)base; \
+	VMOVNT vec1, ((offset) + VEC_SIZE)base; \
+	VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
+	VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -393,6 +453,15 @@ L(last_4x_vec):
 	VZEROUPPER_RETURN
 
 L(more_8x_vec):
+	/* Check if non-temporal move candidate.  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	ja L(large_memcpy_2x)
+#endif
+	/* Entry if rdx is greater than non-temporal threshold but there
+	   is overlap.  */
+L(more_8x_vec_check):
 	cmpq %rsi, %rdi
 	ja L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
@@ -419,11 +488,6 @@ L(more_8x_vec):
 	subq %r8, %rdi
 	/* Adjust length.  */
 	addq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja L(large_forward)
-#endif
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
 	VMOVU (%rsi), %VEC(0)
@@ -470,11 +534,6 @@ L(more_8x_vec_backward):
 	subq %r8, %r9
 	/* Adjust length.  */
 	subq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja L(large_backward)
-#endif
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
 	VMOVU (%rcx), %VEC(0)
@@ -500,72 +559,203 @@ L(loop_4x_vec_backward):
 	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
+L(large_memcpy_2x):
+	/* Compute absolute value of difference between source and
+	   destination.  */
+	movq %rdi, %r9
+	subq %rsi, %r9
+	movq %r9, %r8
+	leaq -1(%r9), %rcx
+	sarq $63, %r8
+	xorq %r8, %r9
+	subq %r8, %r9
 	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq (%rdi, %rdx), %r10
-	cmpq %r10, %rsi
-	jb L(loop_4x_vec_forward)
-L(loop_large_forward):
-	/* Copy 4 * VEC a time forward with non-temporal stores.  */
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	   destination and source since destination may be in cache when
+	   source is loaded.  */
+	cmpq %r9, %rdx
+	ja L(more_8x_vec_check)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+	VMOVU (%rsi), %VEC(8)
+#if VEC_SIZE < 64
+	VMOVU VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
+	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+	VMOVU %VEC(8), (%rdi)
+#if VEC_SIZE < 64
+	VMOVU %VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+	VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
+	VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+	/* Adjust source, destination, and size.  */
+	MOVQ %rdi, %r8
+	andq $63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq $64, %r8
+	/* Adjust source.  */
+	subq %r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq %r8, %rdi
+	/* Adjust length.  */
+	addq %r8, %rdx
+
+	/* Test if source and destination addresses will alias. If they do
+	   the larger pipeline in large_memcpy_4x alleviated the
+	   performance drop.  */
+	testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz L(large_memcpy_4x)
+
+	movq %rdx, %r10
+	shrq $LOG_4X_MEMCPY_THRESH, %r10
+	cmp __x86_shared_non_temporal_threshold(%rip), %r10
+	jae L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl $(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	/* Copy 4x VEC at a time from 2 pages.  */
+	.p2align 4
+L(loop_large_memcpy_2x_outer):
+	/* ecx stores inner loop counter.  */
+	movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	addq $LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	addq $LARGE_LOAD_SIZE, %rdi
+	decl %ecx
+	jnz L(loop_large_memcpy_2x_inner)
+	addq $PAGE_SIZE, %rdi
+	addq $PAGE_SIZE, %rsi
+	decq %r10
+	jne L(loop_large_memcpy_2x_outer)
+	sfence
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl $(VEC_SIZE * 4), %edx
+	jbe L(large_memcpy_2x_end)
+
+	/* Handle the last 2 * PAGE_SIZE bytes. Use temporal stores
+	   here. The region will fit in cache and it should fit user
+	   expectations for the tail of the memcpy region to be hot.  */
+L(loop_large_memcpy_2x_tail):
+	/* Copy 4 * VEC a time forward with temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 	VMOVU (%rsi), %VEC(0)
 	VMOVU VEC_SIZE(%rsi), %VEC(1)
 	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq $PREFETCHED_LOAD_SIZE, %rsi
-	subq $PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT %VEC(0), (%rdi)
-	VMOVNT %VEC(1), VEC_SIZE(%rdi)
-	VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq $PREFETCHED_LOAD_SIZE, %rdi
-	cmpq $PREFETCHED_LOAD_SIZE, %rdx
-	ja L(loop_large_forward)
-	sfence
+	addq $(VEC_SIZE * 4), %rsi
+	subl $(VEC_SIZE * 4), %edx
+	VMOVA %VEC(0), (%rdi)
+	VMOVA %VEC(1), VEC_SIZE(%rdi)
+	VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq $(VEC_SIZE * 4), %rdi
+	cmpl $(VEC_SIZE * 4), %edx
+	ja L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU %VEC(5), (%rcx)
-	VMOVU %VEC(6), -VEC_SIZE(%rcx)
-	VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU %VEC(4), (%r11)
+	VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
-L(large_backward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq (%rcx, %rdx), %r10
-	cmpq %r10, %r9
-	jb L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
-	VMOVU (%rcx), %VEC(0)
-	VMOVU -VEC_SIZE(%rcx), %VEC(1)
-	VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq $PREFETCHED_LOAD_SIZE, %rcx
-	subq $PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT %VEC(0), (%r9)
-	VMOVNT %VEC(1), -VEC_SIZE(%r9)
-	VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq $PREFETCHED_LOAD_SIZE, %r9
-	cmpq $PREFETCHED_LOAD_SIZE, %rdx
-	ja L(loop_large_backward)
+L(large_memcpy_4x):
+	movq %rdx, %r10
+	/* edx will store remainder size for copying tail.  */
+	andl $(PAGE_SIZE * 4 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq $(LOG_PAGE_SIZE + 2), %r10
+	/* Copy 4x VEC at a time from 4 pages.  */
+	.p2align 4
+L(loop_large_memcpy_4x_outer):
+	/* ecx stores inner loop counter.  */
+	movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+	/* Only one prefetch set per page as doing 4 pages give more time
+	   for prefetcher to keep up.  */
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	addq $LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	addq $LARGE_LOAD_SIZE, %rdi
+	decl %ecx
+	jnz L(loop_large_memcpy_4x_inner)
+	addq $(PAGE_SIZE * 3), %rdi
+	addq $(PAGE_SIZE * 3), %rsi
+	decq %r10
+	jne L(loop_large_memcpy_4x_outer)
 	sfence
-	/* Store the first 4 * VEC.  */
-	VMOVU %VEC(4), (%rdi)
-	VMOVU %VEC(5), VEC_SIZE(%rdi)
-	VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU %VEC(8), (%r11)
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl $(VEC_SIZE * 4), %edx
+	jbe L(large_memcpy_4x_end)
+
+	/* Handle the last 4 * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_4x_tail):
+	/* Copy 4 * VEC a time forward with temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+	VMOVU (%rsi), %VEC(0)
+	VMOVU VEC_SIZE(%rsi), %VEC(1)
+	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+	addq $(VEC_SIZE * 4), %rsi
+	subl $(VEC_SIZE * 4), %edx
+	VMOVA %VEC(0), (%rdi)
+	VMOVA %VEC(1), VEC_SIZE(%rdi)
+	VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq $(VEC_SIZE * 4), %rdi
+	cmpl $(VEC_SIZE * 4), %edx
+	ja L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
+	/* Store the last 4 * VEC.  */
+	VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
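To summarize the control flow the diff above introduces, here is a hedged C paraphrase of how the code picks between the 2-page and 4-page loops. The names are invented for illustration, nt_threshold stands in for the __x86_shared_non_temporal_threshold tunable, and VEC_SIZE is pinned at 32 as in an AVX2 build; the authoritative logic is the assembly itself.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32   /* e.g. an AVX2 build */

enum large_path { LARGE_MEMCPY_2X, LARGE_MEMCPY_4X };

/* Sketch of the 2x-vs-4x dispatch (points 8 and 9 of the updated file
   comment).  nt_threshold plays the role of the glibc tunable.  */
static enum large_path
pick_large_path(const char *dst, const char *src, size_t len, size_t nt_threshold)
{
    uint64_t diff = (uint64_t)((intptr_t)dst - (intptr_t)src);

    /* Page aliasing: dst and src land at nearly the same page offset.
       The asm tests (diff - 1) against PAGE_SIZE - VEC_SIZE * 8 and
       takes the 4-page loop when those bits are all zero.  */
    if (((diff - 1) & (PAGE_SIZE - VEC_SIZE * 8)) == 0)
        return LARGE_MEMCPY_4X;

    /* Very large copies (len >> LOG_4X_MEMCPY_THRESH, i.e. len / 16,
       at or above the non-temporal threshold) also use 4 pages.  */
    if ((len >> 4) >= nt_threshold)
        return LARGE_MEMCPY_4X;

    return LARGE_MEMCPY_2X;
}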
No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the effects of false memory aliasing
when destination and source have a close 4k alignment and 2) in most
cases and for most DRAM units is a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.

Signed-off-by: noah <goldstein.w.n@gmail.com>
---
 .../multiarch/memmove-vec-unaligned-erms.S | 326 ++++++++++++++----
 1 file changed, 258 insertions(+), 68 deletions(-)
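To make the access pattern described above concrete: the 2-page variant walks one page worth of offsets, but at every step it also touches the matching offset one page later, so a source and destination that share a close 4k alignment no longer produce long runs of loads and stores to the same page offset. The following is only a hedged illustration of that idea in C with SSE/SSE2 intrinsics (invented function name, simplified head/tail handling); the committed implementation is the assembly in the patch.

#include <emmintrin.h>   /* _mm_loadu_si128, _mm_stream_si128, _mm_sfence */
#include <stddef.h>

#define PAGE_SIZE 4096

/* Sketch of the 2-page interleaved non-temporal copy.  Assumes dst is
   16-byte aligned and len is a multiple of 2 * PAGE_SIZE; the real
   routine copies the head and tail with unaligned vector moves.  */
static void copy_2pages_nt(char *dst, const char *src, size_t len)
{
    for (size_t block = 0; block < len; block += 2 * PAGE_SIZE) {
        for (size_t off = 0; off < PAGE_SIZE; off += sizeof(__m128i)) {
            /* Load from both pages of the block, then stream both out,
               mirroring LOAD_ONE_SET / STORE_ONE_SET in the patch.  */
            __m128i lo = _mm_loadu_si128((const __m128i *)(src + block + off));
            __m128i hi = _mm_loadu_si128((const __m128i *)(src + block + off + PAGE_SIZE));
            _mm_stream_si128((__m128i *)(dst + block + off), lo);
            _mm_stream_si128((__m128i *)(dst + block + off + PAGE_SIZE), hi);
        }
    }
    _mm_sfence();   /* order the non-temporal stores before returning */
}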