Message ID | 20221015002100.129511-4-goldstein.w.n@gmail.com
---|---
State | New
Series | [v9,1/6] x86: Update VEC macros to complete API for evex/evex512 impls
On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Replace %VEC(n) -> %VMM(n)
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  .../memset-avx2-unaligned-erms-rtm.S          |  8 +--
>  .../multiarch/memset-avx2-unaligned-erms.S    | 14 +---
>  .../multiarch/memset-avx512-unaligned-erms.S  | 20 +-----
>  .../multiarch/memset-evex-unaligned-erms.S    | 20 +-----
>  .../multiarch/memset-sse2-unaligned-erms.S    | 10 +--
>  .../multiarch/memset-vec-unaligned-erms.S     | 70 ++++++++-----------
>  6 files changed, 43 insertions(+), 99 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> index 8ac3e479bb..bc8605faf3 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> @@ -1,10 +1,6 @@
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +#include "x86-avx-rtm-vecs.h"
>
> -#define VZEROUPPER_RETURN jmp L(return)
> -
> -#define SECTION(p) p##.avx.rtm
> #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
> #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
>
> -#include "memset-avx2-unaligned-erms.S"
> +# include "memset-avx2-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index a9054a9122..47cf5072a4 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -4,14 +4,9 @@
>
> # define USE_WITH_AVX2 1
>
> -# define VEC_SIZE 32
> -# define MOV_SIZE 4
> -# define RET_SIZE 4
> -
> -# define VEC(i) ymm##i
> -
> -# define VMOVU vmovdqu
> -# define VMOVA vmovdqa
> +# ifndef VEC_SIZE
> +# include "x86-avx-vecs.h"
> +# endif
>
> # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> vmovd d, %xmm0; \
> @@ -26,9 +21,6 @@
> # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
> # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
>
> -# ifndef SECTION
> -# define SECTION(p) p##.avx
> -# endif
> # ifndef MEMSET_SYMBOL
> # define MEMSET_SYMBOL(p,s) p##_avx2_##s
> # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index 47623b8ee8..84145b6c27 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -4,26 +4,14 @@
>
> # define USE_WITH_AVX512 1
>
> -# define VEC_SIZE 64
> -# define MOV_SIZE 6
> -# define RET_SIZE 1
> -
> -# define XMM0 xmm16
> -# define YMM0 ymm16
> -# define VEC0 zmm16
> -# define VEC(i) VEC##i
> -
> -# define VMOVU vmovdqu64
> -# define VMOVA vmovdqa64
> -
> -# define VZEROUPPER
> +# include "x86-evex512-vecs.h"
>
> # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> - vpbroadcastb d, %VEC0; \
> + vpbroadcastb d, %VMM(0); \
> movq r, %rax
>
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> - vpbroadcastd d, %VEC0; \
> + vpbroadcastd d, %VMM(0); \
> movq r, %rax
>
> # define MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -32,8 +20,6 @@
> # define WMEMSET_VDUP_TO_VEC0_HIGH()
> # define WMEMSET_VDUP_TO_VEC0_LOW()
>
> -# define SECTION(p) p##.evex512
> -
> #ifndef MEMSET_SYMBOL
> # define MEMSET_SYMBOL(p,s) p##_avx512_##s
> #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index ac4b2d2d50..1f03b26bf8 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -4,26 +4,14 @@
>
> # define USE_WITH_EVEX 1
>
> -# define VEC_SIZE 32
> -# define MOV_SIZE 6
> -# define RET_SIZE 1
> -
> -# define XMM0 xmm16
> -# define YMM0 ymm16
> -# define VEC0 ymm16
> -# define VEC(i) VEC##i
> -
> -# define VMOVU vmovdqu64
> -# define VMOVA vmovdqa64
> -
> -# define VZEROUPPER
> +# include "x86-evex256-vecs.h"
>
> # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> - vpbroadcastb d, %VEC0; \
> + vpbroadcastb d, %VMM(0); \
> movq r, %rax
>
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> - vpbroadcastd d, %VEC0; \
> + vpbroadcastd d, %VMM(0); \
> movq r, %rax
>
> # define MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -32,8 +20,6 @@
> # define WMEMSET_VDUP_TO_VEC0_HIGH()
> # define WMEMSET_VDUP_TO_VEC0_LOW()
>
> -# define SECTION(p) p##.evex
> -
> #ifndef MEMSET_SYMBOL
> # define MEMSET_SYMBOL(p,s) p##_evex_##s
> #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> index 44f9b8888b..34b245d8ca 100644
> --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -26,13 +26,7 @@
> # include <sysdep.h>
> # define USE_WITH_SSE2 1
>
> -# define VEC_SIZE 16
> -# define MOV_SIZE 3
> -# define RET_SIZE 1
> -
> -# define VEC(i) xmm##i
> -# define VMOVU movups
> -# define VMOVA movaps
> +# include "x86-sse2-vecs.h"
>
> # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> movd d, %xmm0; \
> @@ -52,8 +46,6 @@
> # define WMEMSET_VDUP_TO_VEC0_HIGH()
> # define WMEMSET_VDUP_TO_VEC0_LOW()
>
> -# define SECTION(p) p
> -
> # ifndef MEMSET_SYMBOL
> # define MEMSET_SYMBOL(p,s) p##_sse2_##s
> # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 905d0fa464..03de0ab907 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -34,14 +34,6 @@
> # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
> #endif
>
> -#ifndef XMM0
> -# define XMM0 xmm0
> -#endif
> -
> -#ifndef YMM0
> -# define YMM0 ymm0
> -#endif
> -
> #ifndef VZEROUPPER
> # if VEC_SIZE > 16
> # define VZEROUPPER vzeroupper
> @@ -150,8 +142,8 @@ L(entry_from_wmemset):
> cmpq $(VEC_SIZE * 2), %rdx
> ja L(more_2x_vec)
> /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> - VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
> - VMOVU %VEC(0), (%rdi)
> + VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx)
> + VMOVU %VMM(0), (%rdi)
> VZEROUPPER_RETURN
> #if defined USE_MULTIARCH && IS_IN (libc)
> END (MEMSET_SYMBOL (__memset, unaligned))
> @@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> cmp $(VEC_SIZE * 2), %RDX_LP
> ja L(stosb_more_2x_vec)
> /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> - VMOVU %VEC(0), (%rdi)
> - VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> + VMOVU %VMM(0), (%rdi)
> + VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
> VZEROUPPER_RETURN
> #endif
>
> .p2align 4,, 4
> L(last_2x_vec):
> #ifdef USE_LESS_VEC_MASK_STORE
> - VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> - VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> + VMOVU %VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
> + VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
> #else
> - VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
> - VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
> + VMOVU %VMM(0), (VEC_SIZE * -2)(%rdi)
> + VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi)
> #endif
> VZEROUPPER_RETURN
>
> @@ -221,7 +213,7 @@ L(less_vec_from_wmemset):
> bzhil %edx, %ecx, %ecx
> kmovd %ecx, %k1
> # endif
> - vmovdqu8 %VEC(0), (%rax){%k1}
> + vmovdqu8 %VMM(0), (%rax){%k1}
> VZEROUPPER_RETURN
>
> # if defined USE_MULTIARCH && IS_IN (libc)
> @@ -249,8 +241,8 @@ L(stosb_more_2x_vec):
> and (4x, 8x] jump to target. */
> L(more_2x_vec):
> /* Store next 2x vec regardless. */
> - VMOVU %VEC(0), (%rdi)
> - VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
> + VMOVU %VMM(0), (%rdi)
> + VMOVU %VMM(0), (VEC_SIZE * 1)(%rdi)
>
>
> /* Two different methods of setting up pointers / compare. The two
> @@ -278,8 +270,8 @@ L(more_2x_vec):
> #endif
>
> /* Store next 2x vec regardless. */
> - VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
> - VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
> + VMOVU %VMM(0), (VEC_SIZE * 2)(%rax)
> + VMOVU %VMM(0), (VEC_SIZE * 3)(%rax)
>
>
> #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> @@ -304,20 +296,20 @@ L(more_2x_vec):
> andq $(VEC_SIZE * -2), %LOOP_REG
> .p2align 4
> L(loop):
> - VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
> - VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
> - VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
> - VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
> + VMOVA %VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
> + VMOVA %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
> + VMOVA %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
> + VMOVA %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
> subq $-(VEC_SIZE * 4), %LOOP_REG
> cmpq %END_REG, %LOOP_REG
> jb L(loop)
> .p2align 4,, MOV_SIZE
> L(last_4x_vec):
> - VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG)
> - VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
> - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
> - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
> -L(return):
> + VMOVU %VMM(0), LOOP_4X_OFFSET(%END_REG)
> + VMOVU %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
> + VMOVU %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
> + VMOVU %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
> +L(return_vzeroupper):
> #if VEC_SIZE > 16
> ZERO_UPPER_VEC_REGISTERS_RETURN
> #else
> @@ -355,7 +347,7 @@ L(cross_page):
> jge L(between_16_31)
> #endif
> #ifndef USE_XMM_LESS_VEC
> - MOVQ %XMM0, %SET_REG64
> + MOVQ %VMM_128(0), %SET_REG64
> #endif
> cmpl $8, %edx
> jge L(between_8_15)
> @@ -374,8 +366,8 @@ L(between_0_0):
> .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> /* From 32 to 63. No branch when size == 32. */
> L(between_32_63):
> - VMOVU %YMM0, (%LESS_VEC_REG)
> - VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
> + VMOVU %VMM_256(0), (%LESS_VEC_REG)
> + VMOVU %VMM_256(0), -32(%LESS_VEC_REG, %rdx)
> VZEROUPPER_RETURN
> #endif
>
> @@ -383,8 +375,8 @@ L(between_32_63):
> .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
> L(between_16_31):
> /* From 16 to 31. No branch when size == 16. */
> - VMOVU %XMM0, (%LESS_VEC_REG)
> - VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
> + VMOVU %VMM_128(0), (%LESS_VEC_REG)
> + VMOVU %VMM_128(0), -16(%LESS_VEC_REG, %rdx)
> ret
> #endif
>
> @@ -394,8 +386,8 @@ L(between_16_31):
> L(between_8_15):
> /* From 8 to 15. No branch when size == 8. */
> #ifdef USE_XMM_LESS_VEC
> - MOVQ %XMM0, (%rdi)
> - MOVQ %XMM0, -8(%rdi, %rdx)
> + MOVQ %VMM_128(0), (%rdi)
> + MOVQ %VMM_128(0), -8(%rdi, %rdx)
> #else
> movq %SET_REG64, (%LESS_VEC_REG)
> movq %SET_REG64, -8(%LESS_VEC_REG, %rdx)
> @@ -408,8 +400,8 @@ L(between_8_15):
> L(between_4_7):
> /* From 4 to 7. No branch when size == 4. */
> #ifdef USE_XMM_LESS_VEC
> - MOVD %XMM0, (%rdi)
> - MOVD %XMM0, -4(%rdi, %rdx)
> + MOVD %VMM_128(0), (%rdi)
> + MOVD %VMM_128(0), -4(%rdi, %rdx)
> #else
> movl %SET_REG32, (%LESS_VEC_REG)
> movl %SET_REG32, -4(%LESS_VEC_REG, %rdx)
> --
> 2.34.1
>

LGTM. Thanks.
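As a rough mental model of the API this change moves memset to: each per-ISA file now pulls in a single x86-*-vecs.h header, and that header supplies VEC_SIZE plus the VMM(n), VMM_128(n), and VMM_256(n) register names and the VMOVU/VMOVA move macros used in the shared body. The sketch below is only an illustration inferred from the macro uses in this patch and from the per-file definitions it deletes; it is not a copy of the actual headers introduced in patch 1/6 of the series.

```c
/* Illustrative sketch only -- inferred from this diff, not taken from the
   real x86-*-vecs.h headers earlier in the series.  */

/* AVX2 flavour (compare the defines deleted from
   memset-avx2-unaligned-erms.S): 32-byte vectors in the ymm0-ymm15 bank.  */
#define VEC_SIZE	32
#define VMM(i)		ymm##i		/* %VMM(0)     -> %ymm0  */
#define VMM_128(i)	xmm##i		/* %VMM_128(0) -> %xmm0  */
#define VMM_256(i)	ymm##i		/* %VMM_256(0) -> %ymm0  */
#define VMOVU		vmovdqu
#define VMOVA		vmovdqa

/* The evex/evex512 headers play the same role but select the ymm16+/zmm16+
   banks and the vmovdqu64/vmovdqa64 encodings (compare the deleted
   "VEC0 ymm16" / "VEC0 zmm16" lines), which is why the per-file
   XMM0/YMM0/VEC0, VMOVU/VMOVA, and SECTION definitions can all be dropped.  */
```

Under that scheme, a shared store such as `VMOVU %VMM(0), (%rdi)` assembles to a vmovdqu of %ymm0 for the AVX2 build and to an EVEX-encoded store of a zmm16-bank register for the AVX-512 build, with no per-file register plumbing.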