Message ID | 20231101221657.311121-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S |
On Wed, Nov 1, 2023 at 5:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but > was missing the CPU_FEATURE checks for VBMI2 in the > ifunc/ifunc-impl-list. > > The fix is either to add those checks or change the logic to not use > `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex > implementation is usable on SKX. > > New implementation is a bit slower, but this is in a cold path so its > probably okay. > --- > sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++------- > 1 file changed, 43 insertions(+), 21 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > index cd6a0a870a..b174d43208 100644 > --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) > andl $(PAGE_SIZE - 1), %eax > cmpl $(PAGE_SIZE - VEC_SIZE), %eax > jg L(cross_page_boundary) > - > +L(page_cross_continue): > VMOVU (%rdi), %VMM(1) > /* k0 has a 1 for each zero CHAR in YMM1. */ > VPTESTN %VMM(1), %VMM(1), %k0 > @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) > test %VGPR(rsi), %VGPR(rsi) > jz L(aligned_more) > /* fallthrough: zero CHAR in first VEC. */ > -L(page_cross_return): > + > /* K1 has a 1 for each search CHAR match in VEC(1). */ > VPCMPEQ %VMATCH, %VMM(1), %k1 > KMOV %k1, %VGPR(rax) > +L(page_cross_return): > /* Build mask up until first zero CHAR (used to mask of > potential search CHAR matches past the end of the string). */ > blsmsk %VGPR(rsi), %VGPR(rsi) > @@ -167,7 +168,6 @@ L(first_vec_x1_return): > > .p2align 4,, 12 > L(aligned_more): > -L(page_cross_continue): > /* Need to keep original pointer incase VEC(1) has last match. */ > movq %rdi, %rsi > andq $-VEC_SIZE, %rdi > @@ -340,34 +340,56 @@ L(return_new_match_ret): > leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > ret > > - .p2align 4,, 4 > L(cross_page_boundary): > + /* eax contains all the page offset bits of src (rdi). `xor rdi, > + rax` sets pointer will all page offset bits cleared so > + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC > + before page cross (guaranteed to be safe to read). Doing this > + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves > + a bit of code size. */ > xorq %rdi, %rax > - mov $-1, %VRDX > - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) > - VPTESTN %VMM(6), %VMM(6), %k0 > + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) > + VPTESTN %VMM(1), %VMM(1), %k0 > KMOV %k0, %VRSI > > + /* Shift out zero CHAR matches that are before the beginning of > + src (rdi). */ > # ifdef USE_AS_WCSRCHR > movl %edi, %ecx > - and $(VEC_SIZE - 1), %ecx > + andl $(VEC_SIZE - 1), %ecx > shrl $2, %ecx > # endif > - shlx %SHIFT_REG, %VRDX, %VRDX > + shrx %SHIFT_REG, %VRSI, %VRSI > > -# ifdef USE_AS_WCSRCHR > - kmovw %edx, %k1 > + test %VRSI, %VRSI > + jz L(page_cross_continue) > + > + /* Found zero CHAR so need to test for search CHAR. */ > + VPCMP $0, %VMATCH, %VMM(1), %k1 > + KMOV %k1, %VRAX > + /* Shift out search CHAR matches that are before the beginning of > + src (rdi). */ > + shrx %SHIFT_REG, %VRAX, %VRAX > + /* For VEC_SIZE == 64 we are just at the end of a cacheline here > + so to save code-size just re-use return logic for first VEC. > + This is relatively cold code (page cross). */ > +# if VEC_SIZE == 64 > + jmp L(page_cross_return) > + /* 6 bytes from cache-line. */ > # else > - KMOV %VRDX, %k1 > + /* Check if any search CHAR match in range. 
*/ > + blsmsk %VRSI, %VRSI > + and %VRSI, %VRAX > + jz L(ret2) > + bsr %VRAX, %VRAX > +# ifdef USE_AS_WCSRCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + addq %rdi, %rax > +# endif > +L(ret2): > + ret > + /* 3 bytes from cache-line. */ > # endif > - > - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} > - /* We could technically just jmp back after the vpcompress but > - it doesn't save any 16-byte blocks. */ > - shrx %SHIFT_REG, %VRSI, %VRSI > - test %VRSI, %VRSI > - jnz L(page_cross_return) > - jmp L(page_cross_continue) > - /* 1-byte from cache line. */ > END(STRRCHR) > #endif > -- > 2.34.1 > ping
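As a reading aid, here is a rough C model of what the rewritten page-cross path computes for the byte variant with VEC_SIZE == 32 (32-bit masks). It is only a sketch of the shrx/blsmsk/bsr logic from the patch, not glibc code: the function names are invented, the vector compares are replaced by a scalar loop, and __builtin_clz stands in for bsr.

```
#include <stddef.h>
#include <stdint.h>

/* Bit i is set when chunk[i] == c; stand-in for VPCMPEQ/VPTESTN + KMOV.  */
static uint32_t
byte_eq_mask (const unsigned char *chunk, unsigned char c)
{
  uint32_t m = 0;
  for (int i = 0; i < 32; i++)
    if (chunk[i] == c)
      m |= (uint32_t) 1 << i;
  return m;
}

/* Model of L(cross_page_boundary) for VEC_SIZE == 32, CHAR_SIZE == 1.
   Returns the last match of C at or before the first NUL in the VEC that
   ends at the page boundary, NULL if a NUL was found but no match, or
   SRC itself when no NUL was seen (the assembly jumps back to
   L(page_cross_continue) in that case).  */
const char *
page_cross_model (const char *src, unsigned char c)
{
  enum { PAGE_SIZE = 4096, VEC_SIZE = 32 };

  /* Last aligned VEC of the page: always safe to read even though SRC is
     within VEC_SIZE bytes of the page end.  */
  const unsigned char *aligned
    = (const unsigned char *) (((uintptr_t) src & -(uintptr_t) PAGE_SIZE)
			       + (PAGE_SIZE - VEC_SIZE));
  unsigned int shift = (uintptr_t) src & (VEC_SIZE - 1);

  /* shrx: drop zero-CHAR matches that sit before SRC.  */
  uint32_t zero_mask = byte_eq_mask (aligned, 0) >> shift;
  if (zero_mask == 0)
    return src;			/* no NUL yet: continue the main loop.  */

  /* shrx again for the search-CHAR matches.  */
  uint32_t match_mask = byte_eq_mask (aligned, c) >> shift;

  uint32_t up_to_nul = zero_mask ^ (zero_mask - 1);	/* blsmsk  */
  match_mask &= up_to_nul;
  if (match_mask == 0)
    return NULL;					/* L(ret2) */

  int last = 31 - __builtin_clz (match_mask);		/* bsr     */
  return src + last;
}
```

The VEC_SIZE == 64 variants compute the same thing with 64-bit masks, which is what lets the patch jump back to the shared first-VEC return path there instead of duplicating this tail.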
* Noah Goldstein: > On Wed, Nov 1, 2023 at 5:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but >> was missing the CPU_FEATURE checks for VBMI2 in the >> ifunc/ifunc-impl-list. >> >> The fix is either to add those checks or change the logic to not use >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex >> implementation is usable on SKX. >> >> New implementation is a bit slower, but this is in a cold path so its >> probably okay. >> --- >> sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++------- >> 1 file changed, 43 insertions(+), 21 deletions(-) >> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> index cd6a0a870a..b174d43208 100644 >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) >> andl $(PAGE_SIZE - 1), %eax >> cmpl $(PAGE_SIZE - VEC_SIZE), %eax >> jg L(cross_page_boundary) >> - >> +L(page_cross_continue): >> VMOVU (%rdi), %VMM(1) >> /* k0 has a 1 for each zero CHAR in YMM1. */ >> VPTESTN %VMM(1), %VMM(1), %k0 >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) >> test %VGPR(rsi), %VGPR(rsi) >> jz L(aligned_more) >> /* fallthrough: zero CHAR in first VEC. */ >> -L(page_cross_return): >> + >> /* K1 has a 1 for each search CHAR match in VEC(1). */ >> VPCMPEQ %VMATCH, %VMM(1), %k1 >> KMOV %k1, %VGPR(rax) >> +L(page_cross_return): >> /* Build mask up until first zero CHAR (used to mask of >> potential search CHAR matches past the end of the string). */ >> blsmsk %VGPR(rsi), %VGPR(rsi) >> @@ -167,7 +168,6 @@ L(first_vec_x1_return): >> >> .p2align 4,, 12 >> L(aligned_more): >> -L(page_cross_continue): >> /* Need to keep original pointer incase VEC(1) has last match. */ >> movq %rdi, %rsi >> andq $-VEC_SIZE, %rdi >> @@ -340,34 +340,56 @@ L(return_new_match_ret): >> leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax >> ret >> >> - .p2align 4,, 4 >> L(cross_page_boundary): >> + /* eax contains all the page offset bits of src (rdi). `xor rdi, >> + rax` sets pointer will all page offset bits cleared so >> + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC >> + before page cross (guaranteed to be safe to read). Doing this >> + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves >> + a bit of code size. */ >> xorq %rdi, %rax >> - mov $-1, %VRDX >> - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) >> - VPTESTN %VMM(6), %VMM(6), %k0 >> + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) >> + VPTESTN %VMM(1), %VMM(1), %k0 >> KMOV %k0, %VRSI >> >> + /* Shift out zero CHAR matches that are before the beginning of >> + src (rdi). */ >> # ifdef USE_AS_WCSRCHR >> movl %edi, %ecx >> - and $(VEC_SIZE - 1), %ecx >> + andl $(VEC_SIZE - 1), %ecx >> shrl $2, %ecx >> # endif >> - shlx %SHIFT_REG, %VRDX, %VRDX >> + shrx %SHIFT_REG, %VRSI, %VRSI >> >> -# ifdef USE_AS_WCSRCHR >> - kmovw %edx, %k1 >> + test %VRSI, %VRSI >> + jz L(page_cross_continue) >> + >> + /* Found zero CHAR so need to test for search CHAR. */ >> + VPCMP $0, %VMATCH, %VMM(1), %k1 >> + KMOV %k1, %VRAX >> + /* Shift out search CHAR matches that are before the beginning of >> + src (rdi). */ >> + shrx %SHIFT_REG, %VRAX, %VRAX >> + /* For VEC_SIZE == 64 we are just at the end of a cacheline here >> + so to save code-size just re-use return logic for first VEC. >> + This is relatively cold code (page cross). 
*/ >> +# if VEC_SIZE == 64 >> + jmp L(page_cross_return) >> + /* 6 bytes from cache-line. */ >> # else >> - KMOV %VRDX, %k1 >> + /* Check if any search CHAR match in range. */ >> + blsmsk %VRSI, %VRSI >> + and %VRSI, %VRAX >> + jz L(ret2) >> + bsr %VRAX, %VRAX >> +# ifdef USE_AS_WCSRCHR >> + leaq (%rdi, %rax, CHAR_SIZE), %rax >> +# else >> + addq %rdi, %rax >> +# endif >> +L(ret2): >> + ret >> + /* 3 bytes from cache-line. */ >> # endif >> - >> - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} >> - /* We could technically just jmp back after the vpcompress but >> - it doesn't save any 16-byte blocks. */ >> - shrx %SHIFT_REG, %VRSI, %VRSI >> - test %VRSI, %VRSI >> - jnz L(page_cross_return) >> - jmp L(page_cross_continue) >> - /* 1-byte from cache line. */ >> END(STRRCHR) >> #endif >> -- >> 2.34.1 >> > ping Sunil, could you review this? Thanks, Florian
On Tue, Nov 7, 2023 at 10:25 AM Florian Weimer <fweimer@redhat.com> wrote: > * Noah Goldstein: > > > On Wed, Nov 1, 2023 at 5:17 PM Noah Goldstein <goldstein.w.n@gmail.com> > wrote: > >> > >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but > >> was missing the CPU_FEATURE checks for VBMI2 in the > >> ifunc/ifunc-impl-list. > >> > >> The fix is either to add those checks or change the logic to not use > >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex > >> implementation is usable on SKX. > >> > >> New implementation is a bit slower, but this is in a cold path so its > >> probably okay. > >> --- > >> sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++------- > >> 1 file changed, 43 insertions(+), 21 deletions(-) > >> > >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S > b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > >> index cd6a0a870a..b174d43208 100644 > >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S > >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) > >> andl $(PAGE_SIZE - 1), %eax > >> cmpl $(PAGE_SIZE - VEC_SIZE), %eax > >> jg L(cross_page_boundary) > >> - > >> +L(page_cross_continue): > >> VMOVU (%rdi), %VMM(1) > >> /* k0 has a 1 for each zero CHAR in YMM1. */ > >> VPTESTN %VMM(1), %VMM(1), %k0 > >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) > >> test %VGPR(rsi), %VGPR(rsi) > >> jz L(aligned_more) > >> /* fallthrough: zero CHAR in first VEC. */ > >> -L(page_cross_return): > >> + > >> /* K1 has a 1 for each search CHAR match in VEC(1). */ > >> VPCMPEQ %VMATCH, %VMM(1), %k1 > >> KMOV %k1, %VGPR(rax) > >> +L(page_cross_return): > >> /* Build mask up until first zero CHAR (used to mask of > >> potential search CHAR matches past the end of the string). > */ > >> blsmsk %VGPR(rsi), %VGPR(rsi) > >> @@ -167,7 +168,6 @@ L(first_vec_x1_return): > >> > >> .p2align 4,, 12 > >> L(aligned_more): > >> -L(page_cross_continue): > >> /* Need to keep original pointer incase VEC(1) has last match. > */ > >> movq %rdi, %rsi > >> andq $-VEC_SIZE, %rdi > >> @@ -340,34 +340,56 @@ L(return_new_match_ret): > >> leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > >> ret > >> > >> - .p2align 4,, 4 > >> L(cross_page_boundary): > >> + /* eax contains all the page offset bits of src (rdi). `xor rdi, > >> + rax` sets pointer will all page offset bits cleared so > >> + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC > >> + before page cross (guaranteed to be safe to read). Doing this > >> + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves > >> + a bit of code size. */ > >> xorq %rdi, %rax > >> - mov $-1, %VRDX > >> - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) > >> - VPTESTN %VMM(6), %VMM(6), %k0 > >> + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) > >> + VPTESTN %VMM(1), %VMM(1), %k0 > >> KMOV %k0, %VRSI > >> > >> + /* Shift out zero CHAR matches that are before the beginning of > >> + src (rdi). */ > >> # ifdef USE_AS_WCSRCHR > >> movl %edi, %ecx > >> - and $(VEC_SIZE - 1), %ecx > >> + andl $(VEC_SIZE - 1), %ecx > >> shrl $2, %ecx > >> # endif > >> - shlx %SHIFT_REG, %VRDX, %VRDX > >> + shrx %SHIFT_REG, %VRSI, %VRSI > >> > >> -# ifdef USE_AS_WCSRCHR > >> - kmovw %edx, %k1 > >> + test %VRSI, %VRSI > >> + jz L(page_cross_continue) > >> + > >> + /* Found zero CHAR so need to test for search CHAR. */ > >> + VPCMP $0, %VMATCH, %VMM(1), %k1 > >> + KMOV %k1, %VRAX > >> + /* Shift out search CHAR matches that are before the beginning > of > >> + src (rdi). 
*/ > >> + shrx %SHIFT_REG, %VRAX, %VRAX > >> + /* For VEC_SIZE == 64 we are just at the end of a cacheline here > >> + so to save code-size just re-use return logic for first VEC. > >> + This is relatively cold code (page cross). */ > >> +# if VEC_SIZE == 64 > >> + jmp L(page_cross_return) > >> + /* 6 bytes from cache-line. */ > >> # else > >> - KMOV %VRDX, %k1 > >> + /* Check if any search CHAR match in range. */ > >> + blsmsk %VRSI, %VRSI > >> + and %VRSI, %VRAX > >> + jz L(ret2) > >> + bsr %VRAX, %VRAX > >> +# ifdef USE_AS_WCSRCHR > >> + leaq (%rdi, %rax, CHAR_SIZE), %rax > >> +# else > >> + addq %rdi, %rax > >> +# endif > >> +L(ret2): > >> + ret > >> + /* 3 bytes from cache-line. */ > >> # endif > >> - > >> - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} > >> - /* We could technically just jmp back after the vpcompress but > >> - it doesn't save any 16-byte blocks. */ > >> - shrx %SHIFT_REG, %VRSI, %VRSI > >> - test %VRSI, %VRSI > >> - jnz L(page_cross_return) > >> - jmp L(page_cross_continue) > >> - /* 1-byte from cache line. */ > >> END(STRRCHR) > >> #endif > >> -- > >> 2.34.1 > >> > > ping > > Sunil, could you review this? > Sure, I'm looking into it. > > Thanks, > Florian > >
On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but > was missing the CPU_FEATURE checks for VBMI2 in the > ifunc/ifunc-impl-list. > > The fix is either to add those checks or change the logic to not use > `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex > implementation is usable on SKX. > > New implementation is a bit slower, but this is in a cold path so its > probably okay. > --- > sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++------- > 1 file changed, 43 insertions(+), 21 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S > b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > index cd6a0a870a..b174d43208 100644 > --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) > andl $(PAGE_SIZE - 1), %eax > cmpl $(PAGE_SIZE - VEC_SIZE), %eax > jg L(cross_page_boundary) > - > +L(page_cross_continue): > VMOVU (%rdi), %VMM(1) > /* k0 has a 1 for each zero CHAR in YMM1. */ > VPTESTN %VMM(1), %VMM(1), %k0 > @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) > test %VGPR(rsi), %VGPR(rsi) > jz L(aligned_more) > /* fallthrough: zero CHAR in first VEC. */ > -L(page_cross_return): > + > /* K1 has a 1 for each search CHAR match in VEC(1). */ > VPCMPEQ %VMATCH, %VMM(1), %k1 > KMOV %k1, %VGPR(rax) > +L(page_cross_return): > /* Build mask up until first zero CHAR (used to mask of > potential search CHAR matches past the end of the string). */ > blsmsk %VGPR(rsi), %VGPR(rsi) > @@ -167,7 +168,6 @@ L(first_vec_x1_return): > > .p2align 4,, 12 > L(aligned_more): > -L(page_cross_continue): > /* Need to keep original pointer incase VEC(1) has last match. */ > movq %rdi, %rsi > andq $-VEC_SIZE, %rdi > @@ -340,34 +340,56 @@ L(return_new_match_ret): > leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > ret > > - .p2align 4,, 4 > L(cross_page_boundary): > + /* eax contains all the page offset bits of src (rdi). `xor rdi, > + rax` sets pointer will all page offset bits cleared so > + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC > + before page cross (guaranteed to be safe to read). Doing this > + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves > + a bit of code size. */ > xorq %rdi, %rax > - mov $-1, %VRDX > - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) > - VPTESTN %VMM(6), %VMM(6), %k0 > + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) > + VPTESTN %VMM(1), %VMM(1), %k0 > KMOV %k0, %VRSI > > + /* Shift out zero CHAR matches that are before the beginning of > + src (rdi). */ > # ifdef USE_AS_WCSRCHR > movl %edi, %ecx > - and $(VEC_SIZE - 1), %ecx > + andl $(VEC_SIZE - 1), %ecx > shrl $2, %ecx > # endif > - shlx %SHIFT_REG, %VRDX, %VRDX > + shrx %SHIFT_REG, %VRSI, %VRSI > > -# ifdef USE_AS_WCSRCHR > - kmovw %edx, %k1 > + test %VRSI, %VRSI > + jz L(page_cross_continue) > + > + /* Found zero CHAR so need to test for search CHAR. */ > + VPCMP $0, %VMATCH, %VMM(1), %k1 > + KMOV %k1, %VRAX > + /* Shift out search CHAR matches that are before the beginning of > + src (rdi). */ > + shrx %SHIFT_REG, %VRAX, %VRAX > + /* For VEC_SIZE == 64 we are just at the end of a cacheline here > + so to save code-size just re-use return logic for first VEC. > + This is relatively cold code (page cross). */ > +# if VEC_SIZE == 64 > + jmp L(page_cross_return) > Can you please remove unconditional jump. It's simply confusing and doesn't help across the board. 
$ tail -7 /tmp/wcsrchr-evex.out 301: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax 306: c4 e2 48 f3 d6 blsmsk %esi,%esi 30b: 21 f0 and %esi,%eax 30d: 74 07 je 316 <__wcsrchr_evex+0x316> 30f: 0f bd c0 bsr %eax,%eax 312: 48 8d 04 87 lea (%rdi,%rax,4),%rax 316: c3 ret $ tail -7 /tmp/wcsrchr-evex512.out 300: c4 e2 73 f7 f6 shrx %ecx,%esi,%esi 305: 85 f6 test %esi,%esi 307: 0f 84 0b fd ff ff je 18 <__wcsrchr_evex512+0x18> 30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1 314: c5 fb 93 c1 kmovd %k1,%eax 318: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax 31d: e9 18 fd ff ff jmp 3a <__wcsrchr_evex512+0x3a> $ tail -7 /tmp/strrchr-evex.out 2e8: c4 e2 43 f7 c0 shrx %edi,%eax,%eax 2ed: c4 e2 48 f3 d6 blsmsk %esi,%esi 2f2: 21 f0 and %esi,%eax 2f4: 74 06 je 2fc <__strrchr_evex+0x2fc> 2f6: 0f bd c0 bsr %eax,%eax 2f9: 48 01 f8 add %rdi,%rax 2fc: c3 ret $ tail -7 /tmp/strrchr-evex512.out 315: c4 e2 c3 f7 f6 shrx %rdi,%rsi,%rsi 31a: 48 85 f6 test %rsi,%rsi 31d: 0f 84 f5 fc ff ff je 18 <__strrchr_evex512+0x18> 323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1 32a: c4 e1 fb 93 c1 kmovq %k1,%rax 32f: c4 e2 c3 f7 c0 shrx %rdi,%rax,%rax 334: e9 04 fd ff ff jmp 3d <__strrchr_evex512+0x3d> > + /* 6 bytes from cache-line. */ > # else > - KMOV %VRDX, %k1 > + /* Check if any search CHAR match in range. */ > + blsmsk %VRSI, %VRSI > + and %VRSI, %VRAX > + jz L(ret2) > + bsr %VRAX, %VRAX > +# ifdef USE_AS_WCSRCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + addq %rdi, %rax > +# endif > +L(ret2): > + ret > + /* 3 bytes from cache-line. */ > # endif > - > - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} > Please remove the VCOMPRESS definition as it's not used. > - /* We could technically just jmp back after the vpcompress but > - it doesn't save any 16-byte blocks. */ > - shrx %SHIFT_REG, %VRSI, %VRSI > - test %VRSI, %VRSI > - jnz L(page_cross_return) > - jmp L(page_cross_continue) > - /* 1-byte from cache line. */ > END(STRRCHR) > #endif > -- > 2.34.1 > >
On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but >> was missing the CPU_FEATURE checks for VBMI2 in the >> ifunc/ifunc-impl-list. >> >> The fix is either to add those checks or change the logic to not use >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex >> implementation is usable on SKX. >> >> New implementation is a bit slower, but this is in a cold path so its >> probably okay. >> --- >> sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++------- >> 1 file changed, 43 insertions(+), 21 deletions(-) >> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> index cd6a0a870a..b174d43208 100644 >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) >> andl $(PAGE_SIZE - 1), %eax >> cmpl $(PAGE_SIZE - VEC_SIZE), %eax >> jg L(cross_page_boundary) >> - >> +L(page_cross_continue): >> VMOVU (%rdi), %VMM(1) >> /* k0 has a 1 for each zero CHAR in YMM1. */ >> VPTESTN %VMM(1), %VMM(1), %k0 >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) >> test %VGPR(rsi), %VGPR(rsi) >> jz L(aligned_more) >> /* fallthrough: zero CHAR in first VEC. */ >> -L(page_cross_return): >> + >> /* K1 has a 1 for each search CHAR match in VEC(1). */ >> VPCMPEQ %VMATCH, %VMM(1), %k1 >> KMOV %k1, %VGPR(rax) >> +L(page_cross_return): >> /* Build mask up until first zero CHAR (used to mask of >> potential search CHAR matches past the end of the string). */ >> blsmsk %VGPR(rsi), %VGPR(rsi) >> @@ -167,7 +168,6 @@ L(first_vec_x1_return): >> >> .p2align 4,, 12 >> L(aligned_more): >> -L(page_cross_continue): >> /* Need to keep original pointer incase VEC(1) has last match. */ >> movq %rdi, %rsi >> andq $-VEC_SIZE, %rdi >> @@ -340,34 +340,56 @@ L(return_new_match_ret): >> leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax >> ret >> >> - .p2align 4,, 4 >> L(cross_page_boundary): >> + /* eax contains all the page offset bits of src (rdi). `xor rdi, >> + rax` sets pointer will all page offset bits cleared so >> + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC >> + before page cross (guaranteed to be safe to read). Doing this >> + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves >> + a bit of code size. */ >> xorq %rdi, %rax >> - mov $-1, %VRDX >> - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) >> - VPTESTN %VMM(6), %VMM(6), %k0 >> + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) >> + VPTESTN %VMM(1), %VMM(1), %k0 >> KMOV %k0, %VRSI >> >> + /* Shift out zero CHAR matches that are before the beginning of >> + src (rdi). */ >> # ifdef USE_AS_WCSRCHR >> movl %edi, %ecx >> - and $(VEC_SIZE - 1), %ecx >> + andl $(VEC_SIZE - 1), %ecx >> shrl $2, %ecx >> # endif >> - shlx %SHIFT_REG, %VRDX, %VRDX >> + shrx %SHIFT_REG, %VRSI, %VRSI >> >> -# ifdef USE_AS_WCSRCHR >> - kmovw %edx, %k1 >> + test %VRSI, %VRSI >> + jz L(page_cross_continue) >> + >> + /* Found zero CHAR so need to test for search CHAR. */ >> + VPCMP $0, %VMATCH, %VMM(1), %k1 >> + KMOV %k1, %VRAX >> + /* Shift out search CHAR matches that are before the beginning of >> + src (rdi). */ >> + shrx %SHIFT_REG, %VRAX, %VRAX >> + /* For VEC_SIZE == 64 we are just at the end of a cacheline here >> + so to save code-size just re-use return logic for first VEC. >> + This is relatively cold code (page cross). 
*/ >> +# if VEC_SIZE == 64 >> + jmp L(page_cross_return) > > > Can you please remove unconditional jump. It's simply confusing and doesn't help across the board. > > $ tail -7 /tmp/wcsrchr-evex.out > 301: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax > 306: c4 e2 48 f3 d6 blsmsk %esi,%esi > 30b: 21 f0 and %esi,%eax > 30d: 74 07 je 316 <__wcsrchr_evex+0x316> > 30f: 0f bd c0 bsr %eax,%eax > 312: 48 8d 04 87 lea (%rdi,%rax,4),%rax > 316: c3 ret > > $ tail -7 /tmp/wcsrchr-evex512.out > 300: c4 e2 73 f7 f6 shrx %ecx,%esi,%esi > 305: 85 f6 test %esi,%esi > 307: 0f 84 0b fd ff ff je 18 <__wcsrchr_evex512+0x18> > 30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1 > 314: c5 fb 93 c1 kmovd %k1,%eax > 318: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax > 31d: e9 18 fd ff ff jmp 3a <__wcsrchr_evex512+0x3a> > > $ tail -7 /tmp/strrchr-evex.out > 2e8: c4 e2 43 f7 c0 shrx %edi,%eax,%eax > 2ed: c4 e2 48 f3 d6 blsmsk %esi,%esi > 2f2: 21 f0 and %esi,%eax > 2f4: 74 06 je 2fc <__strrchr_evex+0x2fc> > 2f6: 0f bd c0 bsr %eax,%eax > 2f9: 48 01 f8 add %rdi,%rax > 2fc: c3 ret > > $ tail -7 /tmp/strrchr-evex512.out > 315: c4 e2 c3 f7 f6 shrx %rdi,%rsi,%rsi > 31a: 48 85 f6 test %rsi,%rsi > 31d: 0f 84 f5 fc ff ff je 18 <__strrchr_evex512+0x18> > 323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1 > 32a: c4 e1 fb 93 c1 kmovq %k1,%rax > 32f: c4 e2 c3 f7 c0 shrx %rdi,%rax,%rax > 334: e9 04 fd ff ff jmp 3d <__strrchr_evex512+0x3d> It helps here (or at least saves a cache line). But not super invested either way. > > >> >> + /* 6 bytes from cache-line. */ >> # else >> - KMOV %VRDX, %k1 >> + /* Check if any search CHAR match in range. */ >> + blsmsk %VRSI, %VRSI >> + and %VRSI, %VRAX >> + jz L(ret2) >> + bsr %VRAX, %VRAX >> +# ifdef USE_AS_WCSRCHR >> + leaq (%rdi, %rax, CHAR_SIZE), %rax >> +# else >> + addq %rdi, %rax >> +# endif >> +L(ret2): >> + ret >> + /* 3 bytes from cache-line. */ >> # endif >> - >> - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} > > > Please remove the VCOMPRESS definition as it's not used. > >> >> - /* We could technically just jmp back after the vpcompress but >> - it doesn't save any 16-byte blocks. */ >> - shrx %SHIFT_REG, %VRSI, %VRSI >> - test %VRSI, %VRSI >> - jnz L(page_cross_return) >> - jmp L(page_cross_continue) >> - /* 1-byte from cache line. */ >> END(STRRCHR) >> #endif >> -- >> 2.34.1 >>
On Wed, Nov 8, 2023 at 11:22 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> > wrote: > >> > >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but > >> was missing the CPU_FEATURE checks for VBMI2 in the > >> ifunc/ifunc-impl-list. > >> > >> The fix is either to add those checks or change the logic to not use > >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex > >> implementation is usable on SKX. > >> > >> New implementation is a bit slower, but this is in a cold path so its > >> probably okay. > >> --- > >> sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++------- > >> 1 file changed, 43 insertions(+), 21 deletions(-) > >> > >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S > b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > >> index cd6a0a870a..b174d43208 100644 > >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S > >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) > >> andl $(PAGE_SIZE - 1), %eax > >> cmpl $(PAGE_SIZE - VEC_SIZE), %eax > >> jg L(cross_page_boundary) > >> - > >> +L(page_cross_continue): > >> VMOVU (%rdi), %VMM(1) > >> /* k0 has a 1 for each zero CHAR in YMM1. */ > >> VPTESTN %VMM(1), %VMM(1), %k0 > >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) > >> test %VGPR(rsi), %VGPR(rsi) > >> jz L(aligned_more) > >> /* fallthrough: zero CHAR in first VEC. */ > >> -L(page_cross_return): > >> + > >> /* K1 has a 1 for each search CHAR match in VEC(1). */ > >> VPCMPEQ %VMATCH, %VMM(1), %k1 > >> KMOV %k1, %VGPR(rax) > >> +L(page_cross_return): > >> /* Build mask up until first zero CHAR (used to mask of > >> potential search CHAR matches past the end of the string). > */ > >> blsmsk %VGPR(rsi), %VGPR(rsi) > >> @@ -167,7 +168,6 @@ L(first_vec_x1_return): > >> > >> .p2align 4,, 12 > >> L(aligned_more): > >> -L(page_cross_continue): > >> /* Need to keep original pointer incase VEC(1) has last match. > */ > >> movq %rdi, %rsi > >> andq $-VEC_SIZE, %rdi > >> @@ -340,34 +340,56 @@ L(return_new_match_ret): > >> leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > >> ret > >> > >> - .p2align 4,, 4 > >> L(cross_page_boundary): > >> + /* eax contains all the page offset bits of src (rdi). `xor rdi, > >> + rax` sets pointer will all page offset bits cleared so > >> + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC > >> + before page cross (guaranteed to be safe to read). Doing this > >> + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves > >> + a bit of code size. */ > >> xorq %rdi, %rax > >> - mov $-1, %VRDX > >> - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) > >> - VPTESTN %VMM(6), %VMM(6), %k0 > >> + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) > >> + VPTESTN %VMM(1), %VMM(1), %k0 > >> KMOV %k0, %VRSI > >> > >> + /* Shift out zero CHAR matches that are before the beginning of > >> + src (rdi). */ > >> # ifdef USE_AS_WCSRCHR > >> movl %edi, %ecx > >> - and $(VEC_SIZE - 1), %ecx > >> + andl $(VEC_SIZE - 1), %ecx > >> shrl $2, %ecx > >> # endif > >> - shlx %SHIFT_REG, %VRDX, %VRDX > >> + shrx %SHIFT_REG, %VRSI, %VRSI > >> > >> -# ifdef USE_AS_WCSRCHR > >> - kmovw %edx, %k1 > >> + test %VRSI, %VRSI > >> + jz L(page_cross_continue) > >> + > >> + /* Found zero CHAR so need to test for search CHAR. 
*/ > >> + VPCMP $0, %VMATCH, %VMM(1), %k1 > >> + KMOV %k1, %VRAX > >> + /* Shift out search CHAR matches that are before the beginning > of > >> + src (rdi). */ > >> + shrx %SHIFT_REG, %VRAX, %VRAX > >> + /* For VEC_SIZE == 64 we are just at the end of a cacheline here > >> + so to save code-size just re-use return logic for first VEC. > >> + This is relatively cold code (page cross). */ > >> +# if VEC_SIZE == 64 > >> + jmp L(page_cross_return) > > > > > > Can you please remove unconditional jump. It's simply confusing and > doesn't help across the board. > > > > $ tail -7 /tmp/wcsrchr-evex.out > > 301: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax > > 306: c4 e2 48 f3 d6 blsmsk %esi,%esi > > 30b: 21 f0 and %esi,%eax > > 30d: 74 07 je 316 <__wcsrchr_evex+0x316> > > 30f: 0f bd c0 bsr %eax,%eax > > 312: 48 8d 04 87 lea (%rdi,%rax,4),%rax > > 316: c3 ret > > > > $ tail -7 /tmp/wcsrchr-evex512.out > > 300: c4 e2 73 f7 f6 shrx %ecx,%esi,%esi > > 305: 85 f6 test %esi,%esi > > 307: 0f 84 0b fd ff ff je 18 <__wcsrchr_evex512+0x18> > > 30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1 > > 314: c5 fb 93 c1 kmovd %k1,%eax > > 318: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax > > 31d: e9 18 fd ff ff jmp 3a <__wcsrchr_evex512+0x3a> > > > Unnecessary unconditional jump here, as the entire code will fit in the same cache line. Also it will slow down execution. In general, I'm not in favor of unconditional jump unless it can provide significant perf boost. > $ tail -7 /tmp/strrchr-evex.out > > 2e8: c4 e2 43 f7 c0 shrx %edi,%eax,%eax > > 2ed: c4 e2 48 f3 d6 blsmsk %esi,%esi > > 2f2: 21 f0 and %esi,%eax > > 2f4: 74 06 je 2fc <__strrchr_evex+0x2fc> > > 2f6: 0f bd c0 bsr %eax,%eax > > 2f9: 48 01 f8 add %rdi,%rax > > 2fc: c3 ret > > > > $ tail -7 /tmp/strrchr-evex512.out > > 315: c4 e2 c3 f7 f6 shrx %rdi,%rsi,%rsi > > 31a: 48 85 f6 test %rsi,%rsi > > 31d: 0f 84 f5 fc ff ff je 18 <__strrchr_evex512+0x18> > > 323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1 > > 32a: c4 e1 fb 93 c1 kmovq %k1,%rax > > 32f: c4 e2 c3 f7 c0 shrx %rdi,%rax,%rax > > 334: e9 04 fd ff ff jmp 3d <__strrchr_evex512+0x3d> > It helps here (or at least saves a cache line). But not super invested > either way. > > > > > >> > >> + /* 6 bytes from cache-line. */ > >> # else > >> - KMOV %VRDX, %k1 > >> + /* Check if any search CHAR match in range. */ > >> + blsmsk %VRSI, %VRSI > >> + and %VRSI, %VRAX > >> + jz L(ret2) > >> + bsr %VRAX, %VRAX > >> +# ifdef USE_AS_WCSRCHR > >> + leaq (%rdi, %rax, CHAR_SIZE), %rax > >> +# else > >> + addq %rdi, %rax > >> +# endif > >> +L(ret2): > >> + ret > >> + /* 3 bytes from cache-line. */ > >> # endif > >> - > >> - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} > > > > > > Please remove the VCOMPRESS definition as it's not used. > > > >> > >> - /* We could technically just jmp back after the vpcompress but > >> - it doesn't save any 16-byte blocks. */ > >> - shrx %SHIFT_REG, %VRSI, %VRSI > >> - test %VRSI, %VRSI > >> - jnz L(page_cross_return) > >> - jmp L(page_cross_continue) > >> - /* 1-byte from cache line. */ > >> END(STRRCHR) > >> #endif > >> -- > >> 2.34.1 > >> >
On Wed, Nov 8, 2023 at 2:08 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Wed, Nov 8, 2023 at 11:22 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote: >> > >> > >> > >> > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> >> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but >> >> was missing the CPU_FEATURE checks for VBMI2 in the >> >> ifunc/ifunc-impl-list. >> >> >> >> The fix is either to add those checks or change the logic to not use >> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex >> >> implementation is usable on SKX. >> >> >> >> New implementation is a bit slower, but this is in a cold path so its >> >> probably okay. >> >> --- >> >> sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++------- >> >> 1 file changed, 43 insertions(+), 21 deletions(-) >> >> >> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> >> index cd6a0a870a..b174d43208 100644 >> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) >> >> andl $(PAGE_SIZE - 1), %eax >> >> cmpl $(PAGE_SIZE - VEC_SIZE), %eax >> >> jg L(cross_page_boundary) >> >> - >> >> +L(page_cross_continue): >> >> VMOVU (%rdi), %VMM(1) >> >> /* k0 has a 1 for each zero CHAR in YMM1. */ >> >> VPTESTN %VMM(1), %VMM(1), %k0 >> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) >> >> test %VGPR(rsi), %VGPR(rsi) >> >> jz L(aligned_more) >> >> /* fallthrough: zero CHAR in first VEC. */ >> >> -L(page_cross_return): >> >> + >> >> /* K1 has a 1 for each search CHAR match in VEC(1). */ >> >> VPCMPEQ %VMATCH, %VMM(1), %k1 >> >> KMOV %k1, %VGPR(rax) >> >> +L(page_cross_return): >> >> /* Build mask up until first zero CHAR (used to mask of >> >> potential search CHAR matches past the end of the string). */ >> >> blsmsk %VGPR(rsi), %VGPR(rsi) >> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return): >> >> >> >> .p2align 4,, 12 >> >> L(aligned_more): >> >> -L(page_cross_continue): >> >> /* Need to keep original pointer incase VEC(1) has last match. */ >> >> movq %rdi, %rsi >> >> andq $-VEC_SIZE, %rdi >> >> @@ -340,34 +340,56 @@ L(return_new_match_ret): >> >> leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax >> >> ret >> >> >> >> - .p2align 4,, 4 >> >> L(cross_page_boundary): >> >> + /* eax contains all the page offset bits of src (rdi). `xor rdi, >> >> + rax` sets pointer will all page offset bits cleared so >> >> + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC >> >> + before page cross (guaranteed to be safe to read). Doing this >> >> + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves >> >> + a bit of code size. */ >> >> xorq %rdi, %rax >> >> - mov $-1, %VRDX >> >> - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) >> >> - VPTESTN %VMM(6), %VMM(6), %k0 >> >> + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) >> >> + VPTESTN %VMM(1), %VMM(1), %k0 >> >> KMOV %k0, %VRSI >> >> >> >> + /* Shift out zero CHAR matches that are before the beginning of >> >> + src (rdi). 
*/ >> >> # ifdef USE_AS_WCSRCHR >> >> movl %edi, %ecx >> >> - and $(VEC_SIZE - 1), %ecx >> >> + andl $(VEC_SIZE - 1), %ecx >> >> shrl $2, %ecx >> >> # endif >> >> - shlx %SHIFT_REG, %VRDX, %VRDX >> >> + shrx %SHIFT_REG, %VRSI, %VRSI >> >> >> >> -# ifdef USE_AS_WCSRCHR >> >> - kmovw %edx, %k1 >> >> + test %VRSI, %VRSI >> >> + jz L(page_cross_continue) >> >> + >> >> + /* Found zero CHAR so need to test for search CHAR. */ >> >> + VPCMP $0, %VMATCH, %VMM(1), %k1 >> >> + KMOV %k1, %VRAX >> >> + /* Shift out search CHAR matches that are before the beginning of >> >> + src (rdi). */ >> >> + shrx %SHIFT_REG, %VRAX, %VRAX >> >> + /* For VEC_SIZE == 64 we are just at the end of a cacheline here >> >> + so to save code-size just re-use return logic for first VEC. >> >> + This is relatively cold code (page cross). */ >> >> +# if VEC_SIZE == 64 >> >> + jmp L(page_cross_return) >> > >> > >> > Can you please remove unconditional jump. It's simply confusing and doesn't help across the board. >> > >> > $ tail -7 /tmp/wcsrchr-evex.out >> > 301: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax >> > 306: c4 e2 48 f3 d6 blsmsk %esi,%esi >> > 30b: 21 f0 and %esi,%eax >> > 30d: 74 07 je 316 <__wcsrchr_evex+0x316> >> > 30f: 0f bd c0 bsr %eax,%eax >> > 312: 48 8d 04 87 lea (%rdi,%rax,4),%rax >> > 316: c3 ret >> > >> > $ tail -7 /tmp/wcsrchr-evex512.out >> > 300: c4 e2 73 f7 f6 shrx %ecx,%esi,%esi >> > 305: 85 f6 test %esi,%esi >> > 307: 0f 84 0b fd ff ff je 18 <__wcsrchr_evex512+0x18> >> > 30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1 >> > 314: c5 fb 93 c1 kmovd %k1,%eax >> > 318: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax >> > 31d: e9 18 fd ff ff jmp 3a <__wcsrchr_evex512+0x3a> >> > > > > Unnecessary unconditional jump here, as the entire code will fit in the same cache line. > Also it will slow down execution. > > In general, I'm not in favor of unconditional jump unless it can provide significant perf boost. > The repalcement code: ``` blsmsk %rsi,%rsi : +5 and %rsi,%rax : +3 je_imm32 : +2 bsr %rax,%rax : +4 add %rsi, %rax : +3 ret : +1 : + 18 Starting at 0x334, that will spill to the next line. >> > $ tail -7 /tmp/strrchr-evex.out >> > 2e8: c4 e2 43 f7 c0 shrx %edi,%eax,%eax >> > 2ed: c4 e2 48 f3 d6 blsmsk %esi,%esi >> > 2f2: 21 f0 and %esi,%eax >> > 2f4: 74 06 je 2fc <__strrchr_evex+0x2fc> >> > 2f6: 0f bd c0 bsr %eax,%eax >> > 2f9: 48 01 f8 add %rdi,%rax >> > 2fc: c3 ret >> > >> > $ tail -7 /tmp/strrchr-evex512.out >> > 315: c4 e2 c3 f7 f6 shrx %rdi,%rsi,%rsi >> > 31a: 48 85 f6 test %rsi,%rsi >> > 31d: 0f 84 f5 fc ff ff je 18 <__strrchr_evex512+0x18> >> > 323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1 >> > 32a: c4 e1 fb 93 c1 kmovq %k1,%rax >> > 32f: c4 e2 c3 f7 c0 shrx %rdi,%rax,%rax >> > 334: e9 04 fd ff ff jmp 3d <__strrchr_evex512+0x3d> >> It helps here (or at least saves a cache line). But not super invested >> either way. >> > >> > >> >> >> >> + /* 6 bytes from cache-line. */ >> >> # else >> >> - KMOV %VRDX, %k1 >> >> + /* Check if any search CHAR match in range. */ >> >> + blsmsk %VRSI, %VRSI >> >> + and %VRSI, %VRAX >> >> + jz L(ret2) >> >> + bsr %VRAX, %VRAX >> >> +# ifdef USE_AS_WCSRCHR >> >> + leaq (%rdi, %rax, CHAR_SIZE), %rax >> >> +# else >> >> + addq %rdi, %rax >> >> +# endif >> >> +L(ret2): >> >> + ret >> >> + /* 3 bytes from cache-line. */ >> >> # endif >> >> - >> >> - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} >> > >> > >> > Please remove the VCOMPRESS definition as it's not used. >> > >> >> >> >> - /* We could technically just jmp back after the vpcompress but >> >> - it doesn't save any 16-byte blocks. 
*/ >> >> - shrx %SHIFT_REG, %VRSI, %VRSI >> >> - test %VRSI, %VRSI >> >> - jnz L(page_cross_return) >> >> - jmp L(page_cross_continue) >> >> - /* 1-byte from cache line. */ >> >> END(STRRCHR) >> >> #endif >> >> -- >> >> 2.34.1 >> >>
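Spelling out the cache-line arithmetic behind this exchange (assuming 64-byte lines, so the line that holds 0x334 spans 0x300-0x33f; the ~17-byte estimate for the wcsrchr inline return is taken from the __wcsrchr_evex tail quoted above):

```
__strrchr_evex512:  inline return  starts 0x334, 18 bytes, ends 0x345   -> spills past 0x33f
                    jmp rel32      starts 0x334,  5 bytes, ends 0x338   -> stays in 0x300-0x33f
__wcsrchr_evex512:  inline return  starts 0x31d, ~17 bytes, ends ~0x32d -> fits in 0x300-0x33f
```

So the unconditional jump only pays for itself in the byte variant, which is what the thread settles on below.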
On Wed, Nov 8, 2023 at 3:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Nov 8, 2023 at 2:08 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > On Wed, Nov 8, 2023 at 11:22 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > >> > >> On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > >> > > >> > > >> > > >> > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > >> >> > >> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but > >> >> was missing the CPU_FEATURE checks for VBMI2 in the > >> >> ifunc/ifunc-impl-list. > >> >> > >> >> The fix is either to add those checks or change the logic to not use > >> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex > >> >> implementation is usable on SKX. > >> >> > >> >> New implementation is a bit slower, but this is in a cold path so its > >> >> probably okay. > >> >> --- > >> >> sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++------- > >> >> 1 file changed, 43 insertions(+), 21 deletions(-) > >> >> > >> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > >> >> index cd6a0a870a..b174d43208 100644 > >> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S > >> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > >> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) > >> >> andl $(PAGE_SIZE - 1), %eax > >> >> cmpl $(PAGE_SIZE - VEC_SIZE), %eax > >> >> jg L(cross_page_boundary) > >> >> - > >> >> +L(page_cross_continue): > >> >> VMOVU (%rdi), %VMM(1) > >> >> /* k0 has a 1 for each zero CHAR in YMM1. */ > >> >> VPTESTN %VMM(1), %VMM(1), %k0 > >> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) > >> >> test %VGPR(rsi), %VGPR(rsi) > >> >> jz L(aligned_more) > >> >> /* fallthrough: zero CHAR in first VEC. */ > >> >> -L(page_cross_return): > >> >> + > >> >> /* K1 has a 1 for each search CHAR match in VEC(1). */ > >> >> VPCMPEQ %VMATCH, %VMM(1), %k1 > >> >> KMOV %k1, %VGPR(rax) > >> >> +L(page_cross_return): > >> >> /* Build mask up until first zero CHAR (used to mask of > >> >> potential search CHAR matches past the end of the string). */ > >> >> blsmsk %VGPR(rsi), %VGPR(rsi) > >> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return): > >> >> > >> >> .p2align 4,, 12 > >> >> L(aligned_more): > >> >> -L(page_cross_continue): > >> >> /* Need to keep original pointer incase VEC(1) has last match. */ > >> >> movq %rdi, %rsi > >> >> andq $-VEC_SIZE, %rdi > >> >> @@ -340,34 +340,56 @@ L(return_new_match_ret): > >> >> leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > >> >> ret > >> >> > >> >> - .p2align 4,, 4 > >> >> L(cross_page_boundary): > >> >> + /* eax contains all the page offset bits of src (rdi). `xor rdi, > >> >> + rax` sets pointer will all page offset bits cleared so > >> >> + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC > >> >> + before page cross (guaranteed to be safe to read). Doing this > >> >> + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves > >> >> + a bit of code size. */ > >> >> xorq %rdi, %rax > >> >> - mov $-1, %VRDX > >> >> - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) > >> >> - VPTESTN %VMM(6), %VMM(6), %k0 > >> >> + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) > >> >> + VPTESTN %VMM(1), %VMM(1), %k0 > >> >> KMOV %k0, %VRSI > >> >> > >> >> + /* Shift out zero CHAR matches that are before the beginning of > >> >> + src (rdi). 
*/ > >> >> # ifdef USE_AS_WCSRCHR > >> >> movl %edi, %ecx > >> >> - and $(VEC_SIZE - 1), %ecx > >> >> + andl $(VEC_SIZE - 1), %ecx > >> >> shrl $2, %ecx > >> >> # endif > >> >> - shlx %SHIFT_REG, %VRDX, %VRDX > >> >> + shrx %SHIFT_REG, %VRSI, %VRSI > >> >> > >> >> -# ifdef USE_AS_WCSRCHR > >> >> - kmovw %edx, %k1 > >> >> + test %VRSI, %VRSI > >> >> + jz L(page_cross_continue) > >> >> + > >> >> + /* Found zero CHAR so need to test for search CHAR. */ > >> >> + VPCMP $0, %VMATCH, %VMM(1), %k1 > >> >> + KMOV %k1, %VRAX > >> >> + /* Shift out search CHAR matches that are before the beginning of > >> >> + src (rdi). */ > >> >> + shrx %SHIFT_REG, %VRAX, %VRAX > >> >> + /* For VEC_SIZE == 64 we are just at the end of a cacheline here > >> >> + so to save code-size just re-use return logic for first VEC. > >> >> + This is relatively cold code (page cross). */ > >> >> +# if VEC_SIZE == 64 > >> >> + jmp L(page_cross_return) > >> > > >> > > >> > Can you please remove unconditional jump. It's simply confusing and doesn't help across the board. > >> > > >> > $ tail -7 /tmp/wcsrchr-evex.out > >> > 301: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax > >> > 306: c4 e2 48 f3 d6 blsmsk %esi,%esi > >> > 30b: 21 f0 and %esi,%eax > >> > 30d: 74 07 je 316 <__wcsrchr_evex+0x316> > >> > 30f: 0f bd c0 bsr %eax,%eax > >> > 312: 48 8d 04 87 lea (%rdi,%rax,4),%rax > >> > 316: c3 ret > >> > > >> > $ tail -7 /tmp/wcsrchr-evex512.out > >> > 300: c4 e2 73 f7 f6 shrx %ecx,%esi,%esi > >> > 305: 85 f6 test %esi,%esi > >> > 307: 0f 84 0b fd ff ff je 18 <__wcsrchr_evex512+0x18> > >> > 30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1 > >> > 314: c5 fb 93 c1 kmovd %k1,%eax > >> > 318: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax > >> > 31d: e9 18 fd ff ff jmp 3a <__wcsrchr_evex512+0x3a> > >> > > > > > > > Unnecessary unconditional jump here, as the entire code will fit in the same cache line. > > Also it will slow down execution. > > > > In general, I'm not in favor of unconditional jump unless it can provide significant perf boost. Oh you mean make the conditional only for strrchr-evex512, but exclude wcsrchr-evx512. Sure. > > > The repalcement code: > ``` > blsmsk %rsi,%rsi : +5 > and %rsi,%rax : +3 > je_imm32 : +2 > bsr %rax,%rax : +4 > add %rsi, %rax : +3 > ret : +1 > : + 18 > Starting at 0x334, that will spill to the next line. > > >> > $ tail -7 /tmp/strrchr-evex.out > >> > 2e8: c4 e2 43 f7 c0 shrx %edi,%eax,%eax > >> > 2ed: c4 e2 48 f3 d6 blsmsk %esi,%esi > >> > 2f2: 21 f0 and %esi,%eax > >> > 2f4: 74 06 je 2fc <__strrchr_evex+0x2fc> > >> > 2f6: 0f bd c0 bsr %eax,%eax > >> > 2f9: 48 01 f8 add %rdi,%rax > >> > 2fc: c3 ret > >> > > >> > $ tail -7 /tmp/strrchr-evex512.out > >> > 315: c4 e2 c3 f7 f6 shrx %rdi,%rsi,%rsi > >> > 31a: 48 85 f6 test %rsi,%rsi > >> > 31d: 0f 84 f5 fc ff ff je 18 <__strrchr_evex512+0x18> > >> > 323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1 > >> > 32a: c4 e1 fb 93 c1 kmovq %k1,%rax > >> > 32f: c4 e2 c3 f7 c0 shrx %rdi,%rax,%rax > >> > 334: e9 04 fd ff ff jmp 3d <__strrchr_evex512+0x3d> > >> It helps here (or at least saves a cache line). But not super invested > >> either way. > >> > > >> > > >> >> > >> >> + /* 6 bytes from cache-line. */ > >> >> # else > >> >> - KMOV %VRDX, %k1 > >> >> + /* Check if any search CHAR match in range. 
*/ > >> >> + blsmsk %VRSI, %VRSI > >> >> + and %VRSI, %VRAX > >> >> + jz L(ret2) > >> >> + bsr %VRAX, %VRAX > >> >> +# ifdef USE_AS_WCSRCHR > >> >> + leaq (%rdi, %rax, CHAR_SIZE), %rax > >> >> +# else > >> >> + addq %rdi, %rax > >> >> +# endif > >> >> +L(ret2): > >> >> + ret > >> >> + /* 3 bytes from cache-line. */ > >> >> # endif > >> >> - > >> >> - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} > >> > > >> > > >> > Please remove the VCOMPRESS definition as it's not used. > >> > > >> >> > >> >> - /* We could technically just jmp back after the vpcompress but > >> >> - it doesn't save any 16-byte blocks. */ > >> >> - shrx %SHIFT_REG, %VRSI, %VRSI > >> >> - test %VRSI, %VRSI > >> >> - jnz L(page_cross_return) > >> >> - jmp L(page_cross_continue) > >> >> - /* 1-byte from cache line. */ > >> >> END(STRRCHR) > >> >> #endif > >> >> -- > >> >> 2.34.1 > >> >>
On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but >> was missing the CPU_FEATURE checks for VBMI2 in the >> ifunc/ifunc-impl-list. >> >> The fix is either to add those checks or change the logic to not use >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex >> implementation is usable on SKX. >> >> New implementation is a bit slower, but this is in a cold path so its >> probably okay. >> --- >> sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++------- >> 1 file changed, 43 insertions(+), 21 deletions(-) >> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> index cd6a0a870a..b174d43208 100644 >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) >> andl $(PAGE_SIZE - 1), %eax >> cmpl $(PAGE_SIZE - VEC_SIZE), %eax >> jg L(cross_page_boundary) >> - >> +L(page_cross_continue): >> VMOVU (%rdi), %VMM(1) >> /* k0 has a 1 for each zero CHAR in YMM1. */ >> VPTESTN %VMM(1), %VMM(1), %k0 >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) >> test %VGPR(rsi), %VGPR(rsi) >> jz L(aligned_more) >> /* fallthrough: zero CHAR in first VEC. */ >> -L(page_cross_return): >> + >> /* K1 has a 1 for each search CHAR match in VEC(1). */ >> VPCMPEQ %VMATCH, %VMM(1), %k1 >> KMOV %k1, %VGPR(rax) >> +L(page_cross_return): >> /* Build mask up until first zero CHAR (used to mask of >> potential search CHAR matches past the end of the string). */ >> blsmsk %VGPR(rsi), %VGPR(rsi) >> @@ -167,7 +168,6 @@ L(first_vec_x1_return): >> >> .p2align 4,, 12 >> L(aligned_more): >> -L(page_cross_continue): >> /* Need to keep original pointer incase VEC(1) has last match. */ >> movq %rdi, %rsi >> andq $-VEC_SIZE, %rdi >> @@ -340,34 +340,56 @@ L(return_new_match_ret): >> leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax >> ret >> >> - .p2align 4,, 4 >> L(cross_page_boundary): >> + /* eax contains all the page offset bits of src (rdi). `xor rdi, >> + rax` sets pointer will all page offset bits cleared so >> + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC >> + before page cross (guaranteed to be safe to read). Doing this >> + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves >> + a bit of code size. */ >> xorq %rdi, %rax >> - mov $-1, %VRDX >> - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) >> - VPTESTN %VMM(6), %VMM(6), %k0 >> + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) >> + VPTESTN %VMM(1), %VMM(1), %k0 >> KMOV %k0, %VRSI >> >> + /* Shift out zero CHAR matches that are before the beginning of >> + src (rdi). */ >> # ifdef USE_AS_WCSRCHR >> movl %edi, %ecx >> - and $(VEC_SIZE - 1), %ecx >> + andl $(VEC_SIZE - 1), %ecx >> shrl $2, %ecx >> # endif >> - shlx %SHIFT_REG, %VRDX, %VRDX >> + shrx %SHIFT_REG, %VRSI, %VRSI >> >> -# ifdef USE_AS_WCSRCHR >> - kmovw %edx, %k1 >> + test %VRSI, %VRSI >> + jz L(page_cross_continue) >> + >> + /* Found zero CHAR so need to test for search CHAR. */ >> + VPCMP $0, %VMATCH, %VMM(1), %k1 >> + KMOV %k1, %VRAX >> + /* Shift out search CHAR matches that are before the beginning of >> + src (rdi). */ >> + shrx %SHIFT_REG, %VRAX, %VRAX >> + /* For VEC_SIZE == 64 we are just at the end of a cacheline here >> + so to save code-size just re-use return logic for first VEC. >> + This is relatively cold code (page cross). 
*/ >> +# if VEC_SIZE == 64 >> + jmp L(page_cross_return) > > > Can you please remove unconditional jump. It's simply confusing and doesn't help across the board. done for wcsrchr-evex512 > > $ tail -7 /tmp/wcsrchr-evex.out > 301: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax > 306: c4 e2 48 f3 d6 blsmsk %esi,%esi > 30b: 21 f0 and %esi,%eax > 30d: 74 07 je 316 <__wcsrchr_evex+0x316> > 30f: 0f bd c0 bsr %eax,%eax > 312: 48 8d 04 87 lea (%rdi,%rax,4),%rax > 316: c3 ret > > $ tail -7 /tmp/wcsrchr-evex512.out > 300: c4 e2 73 f7 f6 shrx %ecx,%esi,%esi > 305: 85 f6 test %esi,%esi > 307: 0f 84 0b fd ff ff je 18 <__wcsrchr_evex512+0x18> > 30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1 > 314: c5 fb 93 c1 kmovd %k1,%eax > 318: c4 e2 73 f7 c0 shrx %ecx,%eax,%eax > 31d: e9 18 fd ff ff jmp 3a <__wcsrchr_evex512+0x3a> > > $ tail -7 /tmp/strrchr-evex.out > 2e8: c4 e2 43 f7 c0 shrx %edi,%eax,%eax > 2ed: c4 e2 48 f3 d6 blsmsk %esi,%esi > 2f2: 21 f0 and %esi,%eax > 2f4: 74 06 je 2fc <__strrchr_evex+0x2fc> > 2f6: 0f bd c0 bsr %eax,%eax > 2f9: 48 01 f8 add %rdi,%rax > 2fc: c3 ret > > $ tail -7 /tmp/strrchr-evex512.out > 315: c4 e2 c3 f7 f6 shrx %rdi,%rsi,%rsi > 31a: 48 85 f6 test %rsi,%rsi > 31d: 0f 84 f5 fc ff ff je 18 <__strrchr_evex512+0x18> > 323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1 > 32a: c4 e1 fb 93 c1 kmovq %k1,%rax > 32f: c4 e2 c3 f7 c0 shrx %rdi,%rax,%rax > 334: e9 04 fd ff ff jmp 3d <__strrchr_evex512+0x3d> > > >> >> + /* 6 bytes from cache-line. */ >> # else >> - KMOV %VRDX, %k1 >> + /* Check if any search CHAR match in range. */ >> + blsmsk %VRSI, %VRSI >> + and %VRSI, %VRAX >> + jz L(ret2) >> + bsr %VRAX, %VRAX >> +# ifdef USE_AS_WCSRCHR >> + leaq (%rdi, %rax, CHAR_SIZE), %rax >> +# else >> + addq %rdi, %rax >> +# endif >> +L(ret2): >> + ret >> + /* 3 bytes from cache-line. */ >> # endif >> - >> - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} > > > Please remove the VCOMPRESS definition as it's not used. done. > >> >> - /* We could technically just jmp back after the vpcompress but >> - it doesn't save any 16-byte blocks. */ >> - shrx %SHIFT_REG, %VRSI, %VRSI >> - test %VRSI, %VRSI >> - jnz L(page_cross_return) >> - jmp L(page_cross_continue) >> - /* 1-byte from cache line. */ >> END(STRRCHR) >> #endif >> -- >> 2.34.1 >>
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S index cd6a0a870a..b174d43208 100644 --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) andl $(PAGE_SIZE - 1), %eax cmpl $(PAGE_SIZE - VEC_SIZE), %eax jg L(cross_page_boundary) - +L(page_cross_continue): VMOVU (%rdi), %VMM(1) /* k0 has a 1 for each zero CHAR in YMM1. */ VPTESTN %VMM(1), %VMM(1), %k0 @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6) test %VGPR(rsi), %VGPR(rsi) jz L(aligned_more) /* fallthrough: zero CHAR in first VEC. */ -L(page_cross_return): + /* K1 has a 1 for each search CHAR match in VEC(1). */ VPCMPEQ %VMATCH, %VMM(1), %k1 KMOV %k1, %VGPR(rax) +L(page_cross_return): /* Build mask up until first zero CHAR (used to mask of potential search CHAR matches past the end of the string). */ blsmsk %VGPR(rsi), %VGPR(rsi) @@ -167,7 +168,6 @@ L(first_vec_x1_return): .p2align 4,, 12 L(aligned_more): -L(page_cross_continue): /* Need to keep original pointer incase VEC(1) has last match. */ movq %rdi, %rsi andq $-VEC_SIZE, %rdi @@ -340,34 +340,56 @@ L(return_new_match_ret): leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ret - .p2align 4,, 4 L(cross_page_boundary): + /* eax contains all the page offset bits of src (rdi). `xor rdi, + rax` sets pointer will all page offset bits cleared so + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC + before page cross (guaranteed to be safe to read). Doing this + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves + a bit of code size. */ xorq %rdi, %rax - mov $-1, %VRDX - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) - VPTESTN %VMM(6), %VMM(6), %k0 + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) + VPTESTN %VMM(1), %VMM(1), %k0 KMOV %k0, %VRSI + /* Shift out zero CHAR matches that are before the beginning of + src (rdi). */ # ifdef USE_AS_WCSRCHR movl %edi, %ecx - and $(VEC_SIZE - 1), %ecx + andl $(VEC_SIZE - 1), %ecx shrl $2, %ecx # endif - shlx %SHIFT_REG, %VRDX, %VRDX + shrx %SHIFT_REG, %VRSI, %VRSI -# ifdef USE_AS_WCSRCHR - kmovw %edx, %k1 + test %VRSI, %VRSI + jz L(page_cross_continue) + + /* Found zero CHAR so need to test for search CHAR. */ + VPCMP $0, %VMATCH, %VMM(1), %k1 + KMOV %k1, %VRAX + /* Shift out search CHAR matches that are before the beginning of + src (rdi). */ + shrx %SHIFT_REG, %VRAX, %VRAX + /* For VEC_SIZE == 64 we are just at the end of a cacheline here + so to save code-size just re-use return logic for first VEC. + This is relatively cold code (page cross). */ +# if VEC_SIZE == 64 + jmp L(page_cross_return) + /* 6 bytes from cache-line. */ # else - KMOV %VRDX, %k1 + /* Check if any search CHAR match in range. */ + blsmsk %VRSI, %VRSI + and %VRSI, %VRAX + jz L(ret2) + bsr %VRAX, %VRAX +# ifdef USE_AS_WCSRCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif +L(ret2): + ret + /* 3 bytes from cache-line. */ # endif - - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} - /* We could technically just jmp back after the vpcompress but - it doesn't save any 16-byte blocks. */ - shrx %SHIFT_REG, %VRSI, %VRSI - test %VRSI, %VRSI - jnz L(page_cross_return) - jmp L(page_cross_continue) - /* 1-byte from cache line. */ END(STRRCHR) #endif
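For context, the alternative fix mentioned in the commit message but not taken would have been to keep vpcompress{b|d} and instead require VBMI2 before selecting the EVEX implementation. A hypothetical sketch of what that would look like in ifunc-impl-list.c is below; IFUNC_IMPL_ADD and CPU_FEATURE_USABLE follow glibc's usual idiom, but the exact entry, the other feature conditions, and the AVX512_VBMI2 spelling are assumptions rather than a real diff.

```
  /* Hypothetical: only advertise __strrchr_evex when VBMI2 is usable, so
     the vpcompressb in the page-cross path is always legal.  The posted
     patch goes the other way and drops the vpcompress use, which keeps
     the EVEX version selectable on SKX (no VBMI2).  */
  IFUNC_IMPL_ADD (array, i, strrchr,
		  (CPU_FEATURE_USABLE (AVX512VL)
		   && CPU_FEATURE_USABLE (AVX512BW)
		   && CPU_FEATURE_USABLE (AVX512_VBMI2)),
		  __strrchr_evex)
```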