Message ID | 20221020172646.3453468-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v1] x86: Remove AVX512-VBMI2 instruction from strrchr-evex.S |
On Thu, Oct 20, 2022 at 10:26 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> commit b412213eee0afa3b51dfe92b736dfc7c981309f5
> Author: Noah Goldstein <goldstein.w.n@gmail.com>
> Date: Tue Oct 18 17:44:07 2022 -0700
>
>     x86: Optimize strrchr-evex.S and implement with VMM headers
>
> Added `vpcompress{b|d}` to the page-cross logic, which is an
> AVX512-VBMI2 instruction. This is not supported on SKX. Since the
> page-cross logic is relatively cold and the benefit is minimal,
> revert the page-cross case back to the old logic which is supported
> on SKX.

Sorry I didn't catch it.  Since the IFUNC selector wasn't updated,
strrchr-evex will fail on SKX.

LGTM.

Thanks.

> Tested on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex.S | 69 +++++++++++--------------
>  1 file changed, 29 insertions(+), 40 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> index 45487dc87a..26b3457875 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> @@ -29,9 +29,7 @@
>  # include "x86-evex256-vecs.h"
>
>  # ifdef USE_AS_WCSRCHR
> -# define RCX_M cl
> -# define SHIFT_REG rcx
> -# define VPCOMPRESS vpcompressd
> +# define SHIFT_REG rsi
>  # define kunpck_2x kunpckbw
>  # define kmov_2x kmovd
>  # define maskz_2x ecx
> @@ -46,9 +44,7 @@
>
>  # define USE_WIDE_CHAR
>  # else
> -# define RCX_M ecx
>  # define SHIFT_REG rdi
> -# define VPCOMPRESS vpcompressb
>  # define kunpck_2x kunpckdq
>  # define kmov_2x kmovq
>  # define maskz_2x rcx
> @@ -78,7 +74,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>  	andl $(PAGE_SIZE - 1), %eax
>  	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
>  	jg L(cross_page_boundary)
> -
> +L(page_cross_continue):
>  	VMOVU (%rdi), %VMM(1)
>  	/* k0 has a 1 for each zero CHAR in VEC(1). */
>  	VPTESTN %VMM(1), %VMM(1), %k0
> @@ -86,7 +82,6 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>  	test %VRSI, %VRSI
>  	jz L(aligned_more)
>  	/* fallthrough: zero CHAR in first VEC. */
> -L(page_cross_return):
>  	/* K1 has a 1 for each search CHAR match in VEC(1). */
>  	VPCMPEQ %VMATCH, %VMM(1), %k1
>  	KMOV %k1, %VRAX
> @@ -197,7 +192,6 @@ L(first_vec_x2):
>
>  	.p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>  	/* Need to keep original pointer incase VEC(1) has last match.
>  	   */
>  	movq %rdi, %rsi
> @@ -353,53 +347,48 @@ L(return_new_match):
>  	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  	ret
>
> -	.p2align 4,, 4
>  L(cross_page_boundary):
> +	/* eax contains all the page offset bits of src (rdi). `xor rdi,
> +	   rax` sets pointer will all page offset bits cleared so
> +	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +	   before page cross (guranteed to be safe to read). Doing this
> +	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +	   a bit of code size. */
>  	xorq %rdi, %rax
> -	mov $-1, %VRDX
> -	VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -	VPTESTN %VMM(6), %VMM(6), %k0
> -	KMOV %k0, %VRSI
> -
> -# ifdef USE_AS_WCSRCHR
> -	movl %edi, %ecx
> -	and $(VEC_SIZE - 1), %ecx
> -	shrl $2, %ecx
> -# endif
> -	shlx %VGPR(SHIFT_REG), %VRDX, %VRDX
> +	VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +	VPTESTN %VMM(1), %VMM(1), %k0
> +	KMOV %k0, %VRCX
>
> +	/* Shift out zero CHAR matches that are before the begining of
> +	   src (rdi). */
>  # ifdef USE_AS_WCSRCHR
> -	kmovb %edx, %k1
> -# else
> -	KMOV %VRDX, %k1
> +	movl %edi, %esi
> +	andl $(VEC_SIZE - 1), %esi
> +	shrl $2, %esi
>  # endif
> +	shrx %VGPR(SHIFT_REG), %VRCX, %VRCX
>
> -	/* Need to adjust result to VEC(1) so it can be re-used by
> -	   L(return_vec_x0_test). The alternative is to collect VEC(1)
> -	   will a page cross load which is far more expensive. */
> -	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> -
> -	/* We could technically just jmp back after the vpcompress but
> -	   it doesn't save any 16-byte blocks. */
> -	shrx %VGPR(SHIFT_REG), %VRSI, %VRSI
> -	test %VRSI, %VRSI
> +	test %VRCX, %VRCX
>  	jz L(page_cross_continue)
>
> -	/* Duplicate of return logic from ENTRY. Doesn't cause spill to
> -	   next cache line so might as well copy it here. */
> -	VPCMPEQ %VMATCH, %VMM(1), %k1
> +	/* Found zero CHAR so need to test for search CHAR. */
> +	VPCMP $0, %VMATCH, %VMM(1), %k1
>  	KMOV %k1, %VRAX
> -	blsmsk %VRSI, %VRSI
> -	and %VRSI, %VRAX
> -	jz L(ret_page_cross)
> +	/* Shift out search CHAR matches that are before the begining of
> +	   src (rdi). */
> +	shrx %VGPR(SHIFT_REG), %VRAX, %VRAX
> +
> +	/* Check if any search CHAR match in range. */
> +	blsmsk %VRCX, %VRCX
> +	and %VRCX, %VRAX
> +	jz L(ret3)
>  	bsr %VRAX, %VRAX
>  # ifdef USE_AS_WCSRCHR
>  	leaq (%rdi, %rax, CHAR_SIZE), %rax
>  # else
>  	addq %rdi, %rax
>  # endif
> -L(ret_page_cross):
> +L(ret3):
>  	ret
> -	/* 1 byte till next cache line. */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
>
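For context: `vpcompressb` requires AVX512-VBMI2, which SKX (Skylake-AVX512) does not implement even though it has AVX512F/CD/BW/DQ/VL, so an EVEX strrchr that keeps the instruction would fault there unless the IFUNC selector also gates on the VBMI2 feature bit. Below is a minimal, standalone sketch of that kind of runtime check; it is not glibc's actual selector (which goes through its own cpu-features machinery) and simply assumes GCC/Clang's `<cpuid.h>`, checking CPUID leaf 7, subleaf 0, ECX bit 6 (the documented AVX512_VBMI2 flag).

```c
/* Hypothetical standalone check, not glibc code: report whether the CPU
   advertises AVX512-VBMI2 (needed for vpcompressb/vpcompressw).  */
#include <cpuid.h>
#include <stdio.h>

static int
has_avx512_vbmi2 (void)
{
  unsigned int eax, ebx, ecx, edx;

  /* __get_cpuid_count returns 0 when leaf 7 is not available.  */
  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
    return 0;

  /* CPUID.(EAX=7, ECX=0):ECX[bit 6] is the AVX512_VBMI2 flag.  A full
     "usable" check (as opposed to "supported") would also confirm via
     XGETBV that the OS enabled the AVX-512 register state.  */
  return (ecx >> 6) & 1;
}

int
main (void)
{
  /* Prints 0 on SKX, 1 on Ice Lake and later parts.  */
  printf ("AVX512-VBMI2 supported: %d\n", has_avx512_vbmi2 ());
  return 0;
}
```

With the patch below, the EVEX page-cross path no longer needs such a gate at all, which is the simpler fix given how cold that path is.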
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 45487dc87a..26b3457875 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -29,9 +29,7 @@
 # include "x86-evex256-vecs.h"

 # ifdef USE_AS_WCSRCHR
-# define RCX_M cl
-# define SHIFT_REG rcx
-# define VPCOMPRESS vpcompressd
+# define SHIFT_REG rsi
 # define kunpck_2x kunpckbw
 # define kmov_2x kmovd
 # define maskz_2x ecx
@@ -46,9 +44,7 @@

 # define USE_WIDE_CHAR
 # else
-# define RCX_M ecx
 # define SHIFT_REG rdi
-# define VPCOMPRESS vpcompressb
 # define kunpck_2x kunpckdq
 # define kmov_2x kmovq
 # define maskz_2x rcx
@@ -78,7 +74,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	andl $(PAGE_SIZE - 1), %eax
 	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
 	jg L(cross_page_boundary)
-
+L(page_cross_continue):
 	VMOVU (%rdi), %VMM(1)
 	/* k0 has a 1 for each zero CHAR in VEC(1). */
 	VPTESTN %VMM(1), %VMM(1), %k0
@@ -86,7 +82,6 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	test %VRSI, %VRSI
 	jz L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC. */
-L(page_cross_return):
 	/* K1 has a 1 for each search CHAR match in VEC(1). */
 	VPCMPEQ %VMATCH, %VMM(1), %k1
 	KMOV %k1, %VRAX
@@ -197,7 +192,6 @@ L(first_vec_x2):

 	.p2align 4,, 12
 L(aligned_more):
-L(page_cross_continue):
 	/* Need to keep original pointer incase VEC(1) has last match.
 	   */
 	movq %rdi, %rsi
@@ -353,53 +347,48 @@ L(return_new_match):
 	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret

-	.p2align 4,, 4
 L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size. */
 	xorq %rdi, %rax
-	mov $-1, %VRDX
-	VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN %VMM(6), %VMM(6), %k0
-	KMOV %k0, %VRSI
-
-# ifdef USE_AS_WCSRCHR
-	movl %edi, %ecx
-	and $(VEC_SIZE - 1), %ecx
-	shrl $2, %ecx
-# endif
-	shlx %VGPR(SHIFT_REG), %VRDX, %VRDX
+	VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN %VMM(1), %VMM(1), %k0
+	KMOV %k0, %VRCX

+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi). */
 # ifdef USE_AS_WCSRCHR
-	kmovb %edx, %k1
-# else
-	KMOV %VRDX, %k1
+	movl %edi, %esi
+	andl $(VEC_SIZE - 1), %esi
+	shrl $2, %esi
 # endif
+	shrx %VGPR(SHIFT_REG), %VRCX, %VRCX

-	/* Need to adjust result to VEC(1) so it can be re-used by
-	   L(return_vec_x0_test). The alternative is to collect VEC(1)
-	   will a page cross load which is far more expensive. */
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks. */
-	shrx %VGPR(SHIFT_REG), %VRSI, %VRSI
-	test %VRSI, %VRSI
+	test %VRCX, %VRCX
 	jz L(page_cross_continue)

-	/* Duplicate of return logic from ENTRY. Doesn't cause spill to
-	   next cache line so might as well copy it here. */
-	VPCMPEQ %VMATCH, %VMM(1), %k1
+	/* Found zero CHAR so need to test for search CHAR. */
+	VPCMP $0, %VMATCH, %VMM(1), %k1
 	KMOV %k1, %VRAX
-	blsmsk %VRSI, %VRSI
-	and %VRSI, %VRAX
-	jz L(ret_page_cross)
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi). */
+	shrx %VGPR(SHIFT_REG), %VRAX, %VRAX
+
+	/* Check if any search CHAR match in range. */
+	blsmsk %VRCX, %VRCX
+	and %VRCX, %VRAX
+	jz L(ret3)
 	bsr %VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq (%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq %rdi, %rax
 # endif
-L(ret_page_cross):
+L(ret3):
 	ret
-	/* 1 byte till next cache line. */
 END(STRRCHR)
 #endif
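To make the reverted page-cross logic easier to follow, here is a hedged scalar sketch in C of what the sequence above computes for the byte (strrchr) case: an aligned load of the last full vector that ends at the page boundary, a `shrx`-style shift that discards lanes before the true start of the string, a `blsmsk`-style mask that keeps only lanes at or before the first zero terminator, and a `bsr` to pick the last match. It assumes a 4 KiB page and 32-byte vectors, emulates the EVEX mask registers with a plain `uint32_t`, and the helper name `page_cross_strrchr_block` is purely illustrative, not a glibc symbol.

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Illustrative scalar model of the page-cross block above (not glibc code).
   Returns the last occurrence of C at or before the first '\0' found in the
   partial vector, or NULL if there is no terminator (main loop continues)
   or no match before the terminator (strrchr result is NULL).  */
static const char *
page_cross_strrchr_block (const char *s, char c)
{
  /* Last VEC_SIZE-aligned block ending at the page boundary; loading it is
     safe even though S itself is within VEC_SIZE bytes of the page end.
     This mirrors `xorq %rdi, %rax` plus the (PAGE_SIZE - VEC_SIZE) offset.  */
  uintptr_t page = (uintptr_t) s & ~(uintptr_t) (PAGE_SIZE - 1);
  const char *block = (const char *) (page + PAGE_SIZE - VEC_SIZE);

  uint32_t zero_mask = 0, match_mask = 0;
  for (int i = 0; i < VEC_SIZE; i++)	/* stands in for VPTESTN / VPCMP  */
    {
      zero_mask  |= (uint32_t) (block[i] == '\0') << i;
      match_mask |= (uint32_t) (block[i] == c) << i;
    }

  /* Drop lanes that lie before S (the shrx step; SHIFT_REG holds the byte
     offset of S inside the block).  */
  unsigned int shift = (unsigned int) (s - block);
  zero_mask >>= shift;
  match_mask >>= shift;

  if (zero_mask == 0)
    return NULL;	/* no terminator here: jz L(page_cross_continue)  */

  /* blsmsk: all bits up to and including the first zero CHAR.  */
  uint32_t in_range = zero_mask ^ (zero_mask - 1);
  match_mask &= in_range;
  if (match_mask == 0)
    return NULL;	/* terminator seen, no match: jz L(ret3)  */

  /* bsr: highest set bit is the last match before the terminator.  */
  int last = 31 - __builtin_clz (match_mask);
  return s + last;
}

int
main (void)
{
  /* Demo: put a short string in the last few bytes of a page-aligned
     buffer so the load really is the block straddling the "page" end.  */
  char *buf = aligned_alloc (PAGE_SIZE, PAGE_SIZE);
  if (buf == NULL)
    return 1;
  memset (buf, 'x', PAGE_SIZE);
  char *s = buf + PAGE_SIZE - 8;
  memcpy (s, "abcba", 6);

  const char *r = page_cross_strrchr_block (s, 'b');
  printf ("last 'b' at offset %td\n", r ? r - s : (ptrdiff_t) -1);  /* 3 */

  free (buf);
  return 0;
}
```

In the sketch both "fall back to the main loop" and "no match before the terminator" collapse into a NULL return; in the assembly the former is the jump back to L(page_cross_continue) while the latter returns NULL through L(ret3).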