Message ID | 20221015030030.204172-2-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v10,1/6] x86: Update VEC macros to complete API for evex/evex512 impls |
On Fri, Oct 14, 2022 at 8:01 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Replace %VEC(n) -> %VMM(n)
>
> This commit does not change libc.so
>
> Tested build on x86-64
> ---
>  sysdeps/x86_64/multiarch/memrchr-evex.S | 42 ++++++++++++-------------
>  1 file changed, 21 insertions(+), 21 deletions(-)

LGTM

--Sunil
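For context, the header swap in the diff below (evex256-vecs.h -> x86-evex256-vecs.h) pulls in the VMM(n) register API that patch 1/6 of this series completes for the evex/evex512 implementations. The following is a minimal sketch of the naming scheme, not the verbatim glibc headers, assuming the evex256 configuration maps VMM(n) onto the EVEX-only high ymm registers (ymm16 and up), which legacy SSE code never touches, so no VZEROUPPER is needed:

/* Illustrative sketch only; the real x86-evex256-vecs.h and its
   common helpers differ in detail.  Mapping VMM(n) onto the
   EVEX-only registers ymm16..ymm31 means the 256-bit EVEX code
   never writes ymm0-ymm15.  */
#define VEC_SIZE 32		/* evex256: 32-byte vectors.  */

#define VMM_ymm0 ymm16
#define VMM_ymm1 ymm17
#define VMM_ymm2 ymm18
#define VMM_ymm3 ymm19
/* ...ymm20 through ymm31 follow the same pattern.  */

/* Two-level expansion so VMM(0) pastes to VMM_ymm0, which then
   expands to ymm16.  */
#define PRIMITIVE_VMM(vec, num) vec##num
#define VMM(num) PRIMITIVE_VMM(VMM_ymm, num)

/* Usage in .S files (run through the C preprocessor):
     vpbroadcastb %esi, %VMM(0)   ->   vpbroadcastb %esi, %ymm16  */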
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index ea3a0a0a60..550b328c5a 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -21,7 +21,7 @@
 #if ISA_SHOULD_BUILD (4)
 
 # include <sysdep.h>
-# include "evex256-vecs.h"
+# include "x86-evex256-vecs.h"
 # if VEC_SIZE != 32
 #  error "VEC_SIZE != 32 unimplemented"
 # endif
@@ -31,7 +31,7 @@
 # endif
 
 # define PAGE_SIZE	4096
-# define VECMATCH	VEC(0)
+# define VMMMATCH	VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -47,7 +47,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 	   correct page cross check and 2) it correctly sets up end ptr to be
 	   subtract by lzcnt aligned.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VECMATCH
+	vpbroadcastb %esi, %VMMMATCH
 
 	/* Check if we can load 1x VEC without cross a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -55,7 +55,7 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
@@ -96,7 +96,7 @@ L(more_1x_vec):
 	movq	%rax, %rdx
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	%rdi, %rdx
@@ -115,7 +115,7 @@ L(last_2x_vec):
 
 	/* Don't use rax for pointer here because EVEX has better encoding with
 	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
 	lzcntq	%rcx, %rcx
@@ -131,7 +131,7 @@ L(last_2x_vec):
 L(page_cross):
 	movq	%rax, %rsi
 	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
 	kmovd	%k0, %r8d
 	/* Shift out negative alignment (because we are starting from endptr and
 	   working backwards).  */
@@ -165,13 +165,13 @@ L(more_2x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	subq	$(VEC_SIZE * 4), %rdx
@@ -185,7 +185,7 @@ L(last_vec):
 
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 	lzcntl	%ecx, %ecx
 	subq	$(VEC_SIZE * 3 + 1), %rax
@@ -220,7 +220,7 @@ L(more_4x_vec):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
@@ -243,17 +243,17 @@ L(more_4x_vec):
 L(loop_4x_vec):
 	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
 	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
 
 	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
 	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
-	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
-	vptestnmb %VEC(3), %VEC(3), %k2
+	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	vptestnmb %VMM(3), %VMM(3), %k2
 
 	/* Any 1s and we found CHAR.  */
 	kortestd %k2, %k4
@@ -270,7 +270,7 @@ L(loop_4x_vec):
 L(last_4x_vec):
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 2), %edx
@@ -280,14 +280,14 @@ L(last_4x_vec):
 	jnz	L(ret_vec_x0_dec)
 
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1)
 
 	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
 	kmovd	%k0, %ecx
 
 	cmpl	$(VEC_SIZE * 3), %edx
@@ -309,7 +309,7 @@ L(loop_end):
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x0_end)
 
-	vptestnmb %VEC(2), %VEC(2), %k0
+	vptestnmb %VMM(2), %VMM(2), %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(ret_vec_x1_end)
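The register rename is mechanical; the loop's match-combining trick is unchanged. As the comments in L(loop_4x_vec) describe, one vector is compared with vpcmpb $4 (not-equal) into k1, two are XORed against the broadcast CHAR so matching bytes become zero, and a zero-masked vpminub folds them together so a single vptestnmb finds a zero byte wherever CHAR occurred; the fourth vector gets its own mask (k4), and kortestd checks both masks at once. A scalar C model of that trick, with one byte standing in for one vector lane (an illustrative sketch with made-up names, not glibc code):

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the masked-min combine in L(loop_4x_vec).  v1 is
   the vector tested with vpcmpb $4 (not-equal) into k1; v2/v3 are the
   vectors XORed against the broadcast CHAR.  The fourth vector of the
   real loop is compared separately into k4 and OR-tested via
   kortestd.  */
static int
any_char_in_three_vecs (const uint8_t *v1, const uint8_t *v2,
			const uint8_t *v3, uint8_t ch, size_t n)
{
  for (size_t i = 0; i < n; i++)
    {
      int k1 = v1[i] != ch;	/* vpcmpb $4: mask bit 1 iff not-equal.  */
      uint8_t x2 = v2[i] ^ ch;	/* vpxorq: zero byte iff v2[i] == ch.  */
      uint8_t x3 = v3[i] ^ ch;	/* vpxorq: zero byte iff v3[i] == ch.  */
      /* vpminub ...{%k1}{z}: zero-masking forces the lane to 0 where
	 k1 == 0 (v1 matched); otherwise the unsigned min is 0 iff v2
	 or v3 matched in this lane.  */
      uint8_t res = k1 ? (x2 < x3 ? x2 : x3) : 0;
      if (res == 0)		/* vptestnmb: mask bit set iff lane == 0.  */
	return 1;		/* CHAR present in v1, v2, or v3.  */
    }
  return 0;
}

Folding three of the four vectors into one zero-test is what lets the hot loop take a single conditional branch per 4 * VEC_SIZE bytes.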