Message ID | 20220205224201.1166468-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1] x86: Improve vec generation in memset-vec-unaligned-erms.S | expand |
On Sat, Feb 5, 2022 at 2:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > No bug. > > Split vec generation into multiple steps. This allows the > broadcast in AVX2 to use 'xmm' registers for the L(less_vec) > case. This saves an expensive lane-cross instruction and removes > the need for 'vzeroupper'. > > For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for > byte broadcast. I am working on the same codes. Please give me a few hours. > Results for memset-avx2 small (geomean of N = 20 benchset runs). > > size, New Time, Old Time, New / Old > 1, 5.074, 4.399, 0.867 > 2, 4.433, 4.411, 0.995 > 4, 4.487, 4.415, 0.984 > 8, 4.454, 4.396, 0.987 > 16, 4.502, 4.443, 0.987 > > All relevant string/wcsmbs tests are passing. > --- > sysdeps/x86_64/memset.S | 21 ++- > .../multiarch/memset-avx2-unaligned-erms.S | 18 +- > .../multiarch/memset-avx512-unaligned-erms.S | 18 +- > .../multiarch/memset-evex-unaligned-erms.S | 18 +- > .../multiarch/memset-vec-unaligned-erms.S | 163 +++++++++++------- > 5 files changed, 151 insertions(+), 87 deletions(-) > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S > index 65c09bd0ac..ccf036be53 100644 > --- a/sysdeps/x86_64/memset.S > +++ b/sysdeps/x86_64/memset.S > @@ -28,17 +28,22 @@ > #define VMOVU movups > #define VMOVA movaps > > -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > movd d, %xmm0; \ > - movq r, %rax; \ > - punpcklbw %xmm0, %xmm0; \ > - punpcklwd %xmm0, %xmm0; \ > - pshufd $0, %xmm0, %xmm0 > + pxor %xmm1, %xmm1; \ > + pshufb %xmm1, %xmm0; \ > + movq r, %rax > > -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > movd d, %xmm0; \ > - movq r, %rax; \ > - pshufd $0, %xmm0, %xmm0 > + pshufd $0, %xmm0, %xmm0; \ > + movq r, %rax > + > +# define MEMSET_VDUP_TO_VEC0_HIGH() > +# define MEMSET_VDUP_TO_VEC0_LOW() > + > +# define WMEMSET_VDUP_TO_VEC0_HIGH() > +# define WMEMSET_VDUP_TO_VEC0_LOW() > > #define SECTION(p) p > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > index 1af668af0a..c0bf2875d0 100644 > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > @@ -10,15 +10,18 @@ > # define VMOVU vmovdqu > # define VMOVA vmovdqa > > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > vmovd d, %xmm0; \ > - movq r, %rax; \ > - vpbroadcastb %xmm0, %ymm0 > + movq r, %rax; > > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > - vmovd d, %xmm0; \ > - movq r, %rax; \ > - vpbroadcastd %xmm0, %ymm0 > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) > + > +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 > +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 > + > +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 > +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 > > # ifndef SECTION > # define SECTION(p) p##.avx > @@ -30,5 +33,6 @@ > # define WMEMSET_SYMBOL(p,s) p##_avx2_##s > # endif > > +# define USE_XMM_LESS_VEC > # include "memset-vec-unaligned-erms.S" > #endif > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > index f14d6f8493..5241216a77 100644 > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > @@ -15,13 +15,19 @@ > > # define VZEROUPPER > > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > - movq r, %rax; \ > - vpbroadcastb d, %VEC0 > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > + vpbroadcastb d, %VEC0; \ > + movq r, %rax > > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > - movq r, %rax; \ > - vpbroadcastd d, %VEC0 > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > + vpbroadcastd d, %VEC0; \ > + movq r, %rax > + > +# define MEMSET_VDUP_TO_VEC0_HIGH() > +# define MEMSET_VDUP_TO_VEC0_LOW() > + > +# define WMEMSET_VDUP_TO_VEC0_HIGH() > +# define WMEMSET_VDUP_TO_VEC0_LOW() > > # define SECTION(p) p##.evex512 > # define MEMSET_SYMBOL(p,s) p##_avx512_##s > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > index 64b09e77cc..6370021506 100644 > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > @@ -15,13 +15,19 @@ > > # define VZEROUPPER > > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > - movq r, %rax; \ > - vpbroadcastb d, %VEC0 > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > + vpbroadcastb d, %VEC0; \ > + movq r, %rax > > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > - movq r, %rax; \ > - vpbroadcastd d, %VEC0 > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > + vpbroadcastd d, %VEC0; \ > + movq r, %rax > + > +# define MEMSET_VDUP_TO_VEC0_HIGH() > +# define MEMSET_VDUP_TO_VEC0_LOW() > + > +# define WMEMSET_VDUP_TO_VEC0_HIGH() > +# define WMEMSET_VDUP_TO_VEC0_LOW() > > # define SECTION(p) p##.evex > # define MEMSET_SYMBOL(p,s) p##_evex_##s > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > index 1e0511c79a..fb9053f4d6 100644 > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > @@ -58,8 +58,10 @@ > #ifndef MOVQ > # if VEC_SIZE > 16 > # define MOVQ vmovq > +# define MOVD vmovd > # else > # define MOVQ movq > +# define MOVD movd > # endif > #endif > > @@ -72,9 +74,17 @@ > #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 > # define END_REG rcx > # define LOOP_REG rdi > +# define LESS_VEC_REG rax > #else > # define END_REG rdi > # define LOOP_REG rdx > +# define LESS_VEC_REG rdi > +#endif > + > +#ifdef USE_XMM_LESS_VEC > +# define XMM_SMALL 1 > +#else > +# define XMM_SMALL 0 > #endif > > #define PAGE_SIZE 4096 > @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) > > ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) > shl $2, %RDX_LP > - WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) > - jmp L(entry_from_bzero) > + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) > + WMEMSET_VDUP_TO_VEC0_LOW() > + cmpq $VEC_SIZE, %rdx > + jb L(less_vec_no_vdup) > + WMEMSET_VDUP_TO_VEC0_HIGH() > + jmp L(entry_from_wmemset) > END (WMEMSET_SYMBOL (__wmemset, unaligned)) > #endif > > @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) > #endif > > ENTRY (MEMSET_SYMBOL (__memset, unaligned)) > - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) > + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) > # ifdef __ILP32__ > /* Clear the upper 32 bits. */ > mov %edx, %edx > @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) > L(entry_from_bzero): > cmpq $VEC_SIZE, %rdx > jb L(less_vec) > + MEMSET_VDUP_TO_VEC0_HIGH() > +L(entry_from_wmemset): > cmpq $(VEC_SIZE * 2), %rdx > ja L(more_2x_vec) > /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ > @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) > # endif > > ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) > - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) > + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) > # ifdef __ILP32__ > /* Clear the upper 32 bits. */ > mov %edx, %edx > # endif > cmp $VEC_SIZE, %RDX_LP > jb L(less_vec) > + MEMSET_VDUP_TO_VEC0_HIGH () > cmp $(VEC_SIZE * 2), %RDX_LP > ja L(stosb_more_2x_vec) > - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. > - */ > - VMOVU %VEC(0), (%rax) > - VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) > + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ > + VMOVU %VEC(0), (%rdi) > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) > VZEROUPPER_RETURN > #endif > > - .p2align 4,, 10 > + .p2align 4,, 4 > L(last_2x_vec): > #ifdef USE_LESS_VEC_MASK_STORE > - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) > - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) > + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) > #else > VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) > VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) > @@ -212,6 +228,7 @@ L(last_2x_vec): > #ifdef USE_LESS_VEC_MASK_STORE > .p2align 4,, 10 > L(less_vec): > +L(less_vec_no_vdup): > /* Less than 1 VEC. */ > # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 > # error Unsupported VEC_SIZE! > @@ -262,28 +279,18 @@ L(stosb_more_2x_vec): > /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] > and (4x, 8x] jump to target. */ > L(more_2x_vec): > - > - /* Two different methods of setting up pointers / compare. The > - two methods are based on the fact that EVEX/AVX512 mov > - instructions take more bytes then AVX2/SSE2 mov instructions. As > - well that EVEX/AVX512 machines also have fast LEA_BID. Both > - setup and END_REG to avoid complex address mode. For EVEX/AVX512 > - this saves code size and keeps a few targets in one fetch block. > - For AVX2/SSE2 this helps prevent AGU bottlenecks. */ > -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 > - /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + > - LOOP_4X_OFFSET) with LEA_BID. */ > - > - /* END_REG is rcx for EVEX/AVX512. */ > - leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG > -#endif > - > - /* Stores to first 2x VEC before cmp as any path forward will > - require it. */ > - VMOVU %VEC(0), (%rax) > - VMOVU %VEC(0), VEC_SIZE(%rax) > + /* Store next 2x vec regardless. */ > + VMOVU %VEC(0), (%rdi) > + VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) > > > + /* Two different methods of setting up pointers / compare. The two > + methods are based on the fact that EVEX/AVX512 mov instructions take > + more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512 > + machines also have fast LEA_BID. Both setup and END_REG to avoid complex > + address mode. For EVEX/AVX512 this saves code size and keeps a few > + targets in one fetch block. For AVX2/SSE2 this helps prevent AGU > + bottlenecks. */ > #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) > /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ > addq %rdx, %END_REG > @@ -292,6 +299,15 @@ L(more_2x_vec): > cmpq $(VEC_SIZE * 4), %rdx > jbe L(last_2x_vec) > > + > +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 > + /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with > + LEA_BID. */ > + > + /* END_REG is rcx for EVEX/AVX512. */ > + leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG > +#endif > + > /* Store next 2x vec regardless. */ > VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) > VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) > @@ -355,65 +371,92 @@ L(stosb_local): > /* Define L(less_vec) only if not otherwise defined. */ > .p2align 4 > L(less_vec): > + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to > + xmm). This is only does anything for AVX2. */ > + MEMSET_VDUP_TO_VEC0_LOW () > +L(less_vec_no_vdup): > #endif > L(cross_page): > #if VEC_SIZE > 32 > cmpl $32, %edx > - jae L(between_32_63) > + jge L(between_32_63) > #endif > #if VEC_SIZE > 16 > cmpl $16, %edx > - jae L(between_16_31) > + jge L(between_16_31) > +#endif > +#ifndef USE_XMM_LESS_VEC > + MOVQ %XMM0, %rcx > #endif > - MOVQ %XMM0, %rdi > cmpl $8, %edx > - jae L(between_8_15) > + jge L(between_8_15) > cmpl $4, %edx > - jae L(between_4_7) > + jge L(between_4_7) > cmpl $1, %edx > - ja L(between_2_3) > - jb L(return) > - movb %sil, (%rax) > - VZEROUPPER_RETURN > + jg L(between_2_3) > + jl L(return) > + movb %sil, (%LESS_VEC_REG) > + ret > > - /* Align small targets only if not doing so would cross a fetch > - line. */ > + /* Align small targets only if not doing so would cross a fetch line. > + */ > #if VEC_SIZE > 32 > .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) > /* From 32 to 63. No branch when size == 32. */ > L(between_32_63): > - VMOVU %YMM0, (%rax) > - VMOVU %YMM0, -32(%rax, %rdx) > + VMOVU %YMM0, (%LESS_VEC_REG) > + VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) > VZEROUPPER_RETURN > #endif > > #if VEC_SIZE >= 32 > - .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) > + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) > L(between_16_31): > /* From 16 to 31. No branch when size == 16. */ > - VMOVU %XMM0, (%rax) > - VMOVU %XMM0, -16(%rax, %rdx) > - VZEROUPPER_RETURN > + VMOVU %XMM0, (%LESS_VEC_REG) > + VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) > + ret > #endif > > - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) > + /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. > + */ > + .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) > L(between_8_15): > /* From 8 to 15. No branch when size == 8. */ > - movq %rdi, (%rax) > - movq %rdi, -8(%rax, %rdx) > - VZEROUPPER_RETURN > +#ifdef USE_XMM_LESS_VEC > + MOVQ %XMM0, (%rdi) > + MOVQ %XMM0, -8(%rdi, %rdx) > +#else > + movq %rcx, (%LESS_VEC_REG) > + movq %rcx, -8(%LESS_VEC_REG, %rdx) > +#endif > + ret > > - .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) > + /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. > + */ > + .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) > L(between_4_7): > /* From 4 to 7. No branch when size == 4. */ > - movl %edi, (%rax) > - movl %edi, -4(%rax, %rdx) > - VZEROUPPER_RETURN > +#ifdef USE_XMM_LESS_VEC > + MOVD %XMM0, (%rdi) > + MOVD %XMM0, -4(%rdi, %rdx) > +#else > + movl %ecx, (%LESS_VEC_REG) > + movl %ecx, -4(%LESS_VEC_REG, %rdx) > +#endif > + ret > > - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) > + /* 4 * XMM_SMALL for the third mov for AVX2. */ > + .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) > L(between_2_3): > /* From 2 to 3. No branch when size == 2. */ > - movw %di, (%rax) > - movb %dil, -1(%rax, %rdx) > - VZEROUPPER_RETURN > +#ifdef USE_XMM_LESS_VEC > + movb %sil, (%rdi) > + movb %sil, 1(%rdi) > + movb %sil, -1(%rdi, %rdx) > +#else > + movw %cx, (%LESS_VEC_REG) > + movb %sil, -1(%LESS_VEC_REG, %rdx) > +#endif > + ret > END (MEMSET_SYMBOL (__memset, unaligned_erms)) > -- > 2.25.1 >
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S index 65c09bd0ac..ccf036be53 100644 --- a/sysdeps/x86_64/memset.S +++ b/sysdeps/x86_64/memset.S @@ -28,17 +28,22 @@ #define VMOVU movups #define VMOVA movaps -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ movd d, %xmm0; \ - movq r, %rax; \ - punpcklbw %xmm0, %xmm0; \ - punpcklwd %xmm0, %xmm0; \ - pshufd $0, %xmm0, %xmm0 + pxor %xmm1, %xmm1; \ + pshufb %xmm1, %xmm0; \ + movq r, %rax -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ movd d, %xmm0; \ - movq r, %rax; \ - pshufd $0, %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0; \ + movq r, %rax + +# define MEMSET_VDUP_TO_VEC0_HIGH() +# define MEMSET_VDUP_TO_VEC0_LOW() + +# define WMEMSET_VDUP_TO_VEC0_HIGH() +# define WMEMSET_VDUP_TO_VEC0_LOW() #define SECTION(p) p diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 1af668af0a..c0bf2875d0 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -10,15 +10,18 @@ # define VMOVU vmovdqu # define VMOVA vmovdqa -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ vmovd d, %xmm0; \ - movq r, %rax; \ - vpbroadcastb %xmm0, %ymm0 + movq r, %rax; -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - vmovd d, %xmm0; \ - movq r, %rax; \ - vpbroadcastd %xmm0, %ymm0 +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 + +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 # ifndef SECTION # define SECTION(p) p##.avx @@ -30,5 +33,6 @@ # define WMEMSET_SYMBOL(p,s) p##_avx2_##s # endif +# define USE_XMM_LESS_VEC # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S index f14d6f8493..5241216a77 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -15,13 +15,19 @@ # define VZEROUPPER -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - movq r, %rax; \ - vpbroadcastb d, %VEC0 +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastb d, %VEC0; \ + movq r, %rax -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - movq r, %rax; \ - vpbroadcastd d, %VEC0 +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax + +# define MEMSET_VDUP_TO_VEC0_HIGH() +# define MEMSET_VDUP_TO_VEC0_LOW() + +# define WMEMSET_VDUP_TO_VEC0_HIGH() +# define WMEMSET_VDUP_TO_VEC0_LOW() # define SECTION(p) p##.evex512 # define MEMSET_SYMBOL(p,s) p##_avx512_##s diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S index 64b09e77cc..6370021506 100644 --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S @@ -15,13 +15,19 @@ # define VZEROUPPER -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - movq r, %rax; \ - vpbroadcastb d, %VEC0 +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastb d, %VEC0; \ + movq r, %rax -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - movq r, %rax; \ - vpbroadcastd d, %VEC0 +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax + +# define MEMSET_VDUP_TO_VEC0_HIGH() +# define MEMSET_VDUP_TO_VEC0_LOW() + +# define WMEMSET_VDUP_TO_VEC0_HIGH() +# define WMEMSET_VDUP_TO_VEC0_LOW() # define SECTION(p) p##.evex # define MEMSET_SYMBOL(p,s) p##_evex_##s diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 1e0511c79a..fb9053f4d6 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -58,8 +58,10 @@ #ifndef MOVQ # if VEC_SIZE > 16 # define MOVQ vmovq +# define MOVD vmovd # else # define MOVQ movq +# define MOVD movd # endif #endif @@ -72,9 +74,17 @@ #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 # define END_REG rcx # define LOOP_REG rdi +# define LESS_VEC_REG rax #else # define END_REG rdi # define LOOP_REG rdx +# define LESS_VEC_REG rdi +#endif + +#ifdef USE_XMM_LESS_VEC +# define XMM_SMALL 1 +#else +# define XMM_SMALL 0 #endif #define PAGE_SIZE 4096 @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) shl $2, %RDX_LP - WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) - jmp L(entry_from_bzero) + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + WMEMSET_VDUP_TO_VEC0_LOW() + cmpq $VEC_SIZE, %rdx + jb L(less_vec_no_vdup) + WMEMSET_VDUP_TO_VEC0_HIGH() + jmp L(entry_from_wmemset) END (WMEMSET_SYMBOL (__wmemset, unaligned)) #endif @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) #endif ENTRY (MEMSET_SYMBOL (__memset, unaligned)) - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) # ifdef __ILP32__ /* Clear the upper 32 bits. */ mov %edx, %edx @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) L(entry_from_bzero): cmpq $VEC_SIZE, %rdx jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH() +L(entry_from_wmemset): cmpq $(VEC_SIZE * 2), %rdx ja L(more_2x_vec) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) # endif ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) # ifdef __ILP32__ /* Clear the upper 32 bits. */ mov %edx, %edx # endif cmp $VEC_SIZE, %RDX_LP jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH () cmp $(VEC_SIZE * 2), %RDX_LP ja L(stosb_more_2x_vec) - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. - */ - VMOVU %VEC(0), (%rax) - VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) VZEROUPPER_RETURN #endif - .p2align 4,, 10 + .p2align 4,, 4 L(last_2x_vec): #ifdef USE_LESS_VEC_MASK_STORE - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) #else VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) @@ -212,6 +228,7 @@ L(last_2x_vec): #ifdef USE_LESS_VEC_MASK_STORE .p2align 4,, 10 L(less_vec): +L(less_vec_no_vdup): /* Less than 1 VEC. */ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 # error Unsupported VEC_SIZE! @@ -262,28 +279,18 @@ L(stosb_more_2x_vec): /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] and (4x, 8x] jump to target. */ L(more_2x_vec): - - /* Two different methods of setting up pointers / compare. The - two methods are based on the fact that EVEX/AVX512 mov - instructions take more bytes then AVX2/SSE2 mov instructions. As - well that EVEX/AVX512 machines also have fast LEA_BID. Both - setup and END_REG to avoid complex address mode. For EVEX/AVX512 - this saves code size and keeps a few targets in one fetch block. - For AVX2/SSE2 this helps prevent AGU bottlenecks. */ -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 - /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + - LOOP_4X_OFFSET) with LEA_BID. */ - - /* END_REG is rcx for EVEX/AVX512. */ - leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG -#endif - - /* Stores to first 2x VEC before cmp as any path forward will - require it. */ - VMOVU %VEC(0), (%rax) - VMOVU %VEC(0), VEC_SIZE(%rax) + /* Store next 2x vec regardless. */ + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + /* Two different methods of setting up pointers / compare. The two + methods are based on the fact that EVEX/AVX512 mov instructions take + more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512 + machines also have fast LEA_BID. Both setup and END_REG to avoid complex + address mode. For EVEX/AVX512 this saves code size and keeps a few + targets in one fetch block. For AVX2/SSE2 this helps prevent AGU + bottlenecks. */ #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ addq %rdx, %END_REG @@ -292,6 +299,15 @@ L(more_2x_vec): cmpq $(VEC_SIZE * 4), %rdx jbe L(last_2x_vec) + +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 + /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with + LEA_BID. */ + + /* END_REG is rcx for EVEX/AVX512. */ + leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG +#endif + /* Store next 2x vec regardless. */ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) @@ -355,65 +371,92 @@ L(stosb_local): /* Define L(less_vec) only if not otherwise defined. */ .p2align 4 L(less_vec): + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to + xmm). This is only does anything for AVX2. */ + MEMSET_VDUP_TO_VEC0_LOW () +L(less_vec_no_vdup): #endif L(cross_page): #if VEC_SIZE > 32 cmpl $32, %edx - jae L(between_32_63) + jge L(between_32_63) #endif #if VEC_SIZE > 16 cmpl $16, %edx - jae L(between_16_31) + jge L(between_16_31) +#endif +#ifndef USE_XMM_LESS_VEC + MOVQ %XMM0, %rcx #endif - MOVQ %XMM0, %rdi cmpl $8, %edx - jae L(between_8_15) + jge L(between_8_15) cmpl $4, %edx - jae L(between_4_7) + jge L(between_4_7) cmpl $1, %edx - ja L(between_2_3) - jb L(return) - movb %sil, (%rax) - VZEROUPPER_RETURN + jg L(between_2_3) + jl L(return) + movb %sil, (%LESS_VEC_REG) + ret - /* Align small targets only if not doing so would cross a fetch - line. */ + /* Align small targets only if not doing so would cross a fetch line. + */ #if VEC_SIZE > 32 .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) /* From 32 to 63. No branch when size == 32. */ L(between_32_63): - VMOVU %YMM0, (%rax) - VMOVU %YMM0, -32(%rax, %rdx) + VMOVU %YMM0, (%LESS_VEC_REG) + VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) VZEROUPPER_RETURN #endif #if VEC_SIZE >= 32 - .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) L(between_16_31): /* From 16 to 31. No branch when size == 16. */ - VMOVU %XMM0, (%rax) - VMOVU %XMM0, -16(%rax, %rdx) - VZEROUPPER_RETURN + VMOVU %XMM0, (%LESS_VEC_REG) + VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) + ret #endif - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. + */ + .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) L(between_8_15): /* From 8 to 15. No branch when size == 8. */ - movq %rdi, (%rax) - movq %rdi, -8(%rax, %rdx) - VZEROUPPER_RETURN +#ifdef USE_XMM_LESS_VEC + MOVQ %XMM0, (%rdi) + MOVQ %XMM0, -8(%rdi, %rdx) +#else + movq %rcx, (%LESS_VEC_REG) + movq %rcx, -8(%LESS_VEC_REG, %rdx) +#endif + ret - .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) + /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. + */ + .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) L(between_4_7): /* From 4 to 7. No branch when size == 4. */ - movl %edi, (%rax) - movl %edi, -4(%rax, %rdx) - VZEROUPPER_RETURN +#ifdef USE_XMM_LESS_VEC + MOVD %XMM0, (%rdi) + MOVD %XMM0, -4(%rdi, %rdx) +#else + movl %ecx, (%LESS_VEC_REG) + movl %ecx, -4(%LESS_VEC_REG, %rdx) +#endif + ret - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + /* 4 * XMM_SMALL for the third mov for AVX2. */ + .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) L(between_2_3): /* From 2 to 3. No branch when size == 2. */ - movw %di, (%rax) - movb %dil, -1(%rax, %rdx) - VZEROUPPER_RETURN +#ifdef USE_XMM_LESS_VEC + movb %sil, (%rdi) + movb %sil, 1(%rdi) + movb %sil, -1(%rdi, %rdx) +#else + movw %cx, (%LESS_VEC_REG) + movb %sil, -1(%LESS_VEC_REG, %rdx) +#endif + ret END (MEMSET_SYMBOL (__memset, unaligned_erms))