Message ID | 20231109224918.2036632-1-goldstein.w.n@gmail.com
---|---
State | New
Series | [v3] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
On Thu, Nov 9, 2023 at 2:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> was missing the CPU_FEATURE checks for VBMI2 in the
> ifunc/ifunc-impl-list.
>
> The fix is either to add those checks or change the logic to not use
> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> implementation is usable on SKX.
>
> New implementation is a bit slower, but this is in a cold path so its
> probably okay.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 61 ++++++++++++--------
>  1 file changed, 37 insertions(+), 24 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index cd6a0a870a..eb5f1f855a 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -35,7 +35,6 @@
>  # define CHAR_SIZE 4
>  # define VPCMP vpcmpd
>  # define VPMIN vpminud
> -# define VPCOMPRESS vpcompressd
>  # define VPTESTN vptestnmd
>  # define VPTEST vptestmd
>  # define VPBROADCAST vpbroadcastd
> @@ -46,7 +45,6 @@
>  # define CHAR_SIZE 1
>  # define VPCMP vpcmpb
>  # define VPMIN vpminub
> -# define VPCOMPRESS vpcompressb
>  # define VPTESTN vptestnmb
>  # define VPTEST vptestmb
>  # define VPBROADCAST vpbroadcastb
> @@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>  	andl	$(PAGE_SIZE - 1), %eax
>  	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
>  	jg	L(cross_page_boundary)
> -
> +L(page_cross_continue):
>  	VMOVU	(%rdi), %VMM(1)
>  	/* k0 has a 1 for each zero CHAR in YMM1. */
>  	VPTESTN	%VMM(1), %VMM(1), %k0
> @@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>  	test	%VGPR(rsi), %VGPR(rsi)
>  	jz	L(aligned_more)
>  	/* fallthrough: zero CHAR in first VEC. */
> -L(page_cross_return):
> +
>  	/* K1 has a 1 for each search CHAR match in VEC(1). */
>  	VPCMPEQ	%VMATCH, %VMM(1), %k1
>  	KMOV	%k1, %VGPR(rax)
> +L(page_cross_return):

Unused label.

>  	/* Build mask up until first zero CHAR (used to mask of
>  	   potential search CHAR matches past the end of the string). */
>  	blsmsk	%VGPR(rsi), %VGPR(rsi)
> @@ -167,7 +166,6 @@ L(first_vec_x1_return):
>
>  	.p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>  	/* Need to keep original pointer incase VEC(1) has last match. */
>  	movq	%rdi, %rsi
>  	andq	$-VEC_SIZE, %rdi
> @@ -340,34 +338,49 @@ L(return_new_match_ret):
>  	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  	ret
>
> -	.p2align 4,, 4
>  L(cross_page_boundary):
> +	/* eax contains all the page offset bits of src (rdi). `xor rdi,
> +	   rax` sets pointer will all page offset bits cleared so
> +	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +	   before page cross (guaranteed to be safe to read). Doing this
> +	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +	   a bit of code size. */
>  	xorq	%rdi, %rax
> -	mov	$-1, %VRDX
> -	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -	VPTESTN	%VMM(6), %VMM(6), %k0
> +	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
>  	KMOV	%k0, %VRSI
>
> +	/* Shift out zero CHAR matches that are before the beginning of
> +	   src (rdi). */
>  # ifdef USE_AS_WCSRCHR
>  	movl	%edi, %ecx
> -	and	$(VEC_SIZE - 1), %ecx
> +	andl	$(VEC_SIZE - 1), %ecx
>  	shrl	$2, %ecx
>  # endif
> -	shlx	%SHIFT_REG, %VRDX, %VRDX
> -
> -# ifdef USE_AS_WCSRCHR
> -	kmovw	%edx, %k1
> -# else
> -	KMOV	%VRDX, %k1
> -# endif
> -
> -	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> -	/* We could technically just jmp back after the vpcompress but
> -	   it doesn't save any 16-byte blocks. */
>  	shrx	%SHIFT_REG, %VRSI, %VRSI
> +
>  	test	%VRSI, %VRSI
> -	jnz	L(page_cross_return)
> -	jmp	L(page_cross_continue)
> -	/* 1-byte from cache line. */
> +	jz	L(page_cross_continue)
> +
> +	/* Found zero CHAR so need to test for search CHAR. */
> +	VPCMP	$0, %VMATCH, %VMM(1), %k1
> +	KMOV	%k1, %VRAX
> +	/* Shift out search CHAR matches that are before the beginning of
> +	   src (rdi). */
> +	shrx	%SHIFT_REG, %VRAX, %VRAX
> +	/* Check if any search CHAR match in range. */
> +	blsmsk	%VRSI, %VRSI
> +	and	%VRSI, %VRAX
> +	jz	L(ret2)
> +	bsr	%VRAX, %VRAX
> +# ifdef USE_AS_WCSRCHR
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	addq	%rdi, %rax
> +# endif
> +L(ret2):
> +	ret

Modified instructions to fit page crossing logic in cache line for strrchr-evex512.

 300:	48 31 f8             	xor    %rdi,%rax
 303:	62 e1 fe 48 6f 48 3f 	vmovdqu64 0xfc0(%rax),%zmm17
 30a:	62 b2 76 40 26 c1    	vptestnmb %zmm17,%zmm17,%k0
 310:	c4 e1 fb 93 f0       	kmovq  %k0,%rsi
 315:	89 f9                	mov    %edi,%ecx
 317:	48 d3 ee             	shr    %cl,%rsi
 31a:	0f 84 f8 fc ff ff    	je     18 <__strrchr_evex512+0x18>
 320:	62 b1 75 40 74 c8    	vpcmpeqb %zmm16,%zmm17,%k1
 326:	c4 e1 fb 93 c1       	kmovq  %k1,%rax
 32b:	48 d3 e8             	shr    %cl,%rax
 32e:	c4 e2 c8 f3 d6       	blsmsk %rsi,%rsi
 333:	48 21 f0             	and    %rsi,%rax
 336:	74 07                	je     33f <__strrchr_evex512+0x33f>
 338:	48 0f bd c0          	bsr    %rax,%rax
 33c:	48 01 f8             	add    %rdi,%rax
 33f:	c3                   	ret

> +	/* 3 bytes from cache-line for evex.
> +	   Crosses cache line for evex512. */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
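The unchecked dependency being reviewed above comes from `vpcompressb`: the dword form `vpcompressd` only needs AVX512F, but the byte form needs AVX512-VBMI2, which SKX does not have. A small intrinsics sketch (illustrative only, not glibc code; the function names are made up) of the two forms and the targets they require:

```c
/* Illustrative only.  vpcompressd is an AVX512F instruction, but the
   byte-granular vpcompressb used by the old page-cross logic is
   AVX512-VBMI2, which SKX lacks, so emitting it without a CPU_FEATURE
   (VBMI2) check can hit an illegal instruction there.  */
#include <immintrin.h>

__attribute__ ((target ("avx512f")))
__m512i
compress_dwords (__mmask16 keep, __m512i v)
{
  return _mm512_maskz_compress_epi32 (keep, v);	/* vpcompressd: AVX512F */
}

__attribute__ ((target ("avx512vbmi2")))
__m512i
compress_bytes (__mmask64 keep, __m512i v)
{
  return _mm512_maskz_compress_epi8 (keep, v);	/* vpcompressb: VBMI2 */
}
```

That is why the choice in the commit message is either to add the VBMI2 checks to the ifunc selector or, as the patch does, to stop using the compress instructions in this path.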
On Thu, Nov 9, 2023 at 9:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Modified instructions to fit page crossing logic in cache line for
> strrchr-evex512.
>
Can update if you want, but it's a lot of ifdefs, LMK.
On Fri, Nov 10, 2023 at 7:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> On Thu, Nov 9, 2023 at 9:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Modified instructions to fit page crossing logic in cache line for
> > strrchr-evex512.
> >
> Can update if you want, but it's a lot of ifdefs, LMK.

I'm not sure why you need to use lots of ifdefs. I just modified 4 lines on
top of your latest patch.

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index 9cab21c09c..3f4540a041 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -351,22 +351,20 @@ L(cross_page_boundary):
 
 	/* Shift out zero CHAR matches that are before the beginning of
 	   src (rdi). */
-# ifdef USE_AS_WCSRCHR
 	movl	%edi, %ecx
+# ifdef USE_AS_WCSRCHR
 	andl	$(VEC_SIZE - 1), %ecx
 	shrl	$2, %ecx
 # endif
-	shrx	%SHIFT_REG, %VRSI, %VRSI
-
-	test	%VRSI, %VRSI
+	shr	%ecx, %VRSI
 	jz	L(page_cross_continue)
 
 	/* Found zero CHAR so need to test for search CHAR. */
-	VPCMP	$0, %VMATCH, %VMM(1), %k1
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VRAX
 	/* Shift out search CHAR matches that are before the beginning of
 	   src (rdi). */
-	shrx	%SHIFT_REG, %VRAX, %VRAX
+	shr	%ecx, %VRAX
 	/* Check if any search CHAR match in range. */
 	blsmsk	%VRSI, %VRSI
 	and	%VRSI, %VRAX

At least all string/wcsmbs tests passed.
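To make the control flow under discussion easier to follow, here is a hedged C model of what the page-cross sequence computes for the byte variant with 64-byte vectors (the result is the same whether the masks are shifted with `shrx` or `shr`). The names `zero_mask`, `match_mask`, and `page_cross_model` are invented for the sketch; nothing here is the actual glibc code:

```c
#include <stddef.h>
#include <stdint.h>

/* Model of the page-cross sequence, byte variant, 64-byte vectors.
   zero_mask/match_mask stand in for the k-mask results of VPTESTN and
   the VMATCH compare on the last aligned vector before the page end.  */
static const char *
page_cross_model (const char *src, uint64_t zero_mask, uint64_t match_mask,
                  int *no_nul_in_vec)
{
  unsigned shift = (uintptr_t) src & 63;  /* src's offset into that vector */

  zero_mask >>= shift;                    /* drop NUL bits before src */
  if (zero_mask == 0)
    {
      /* Real code: jz L(page_cross_continue), back to the aligned loop.  */
      *no_nul_in_vec = 1;
      return NULL;
    }
  *no_nul_in_vec = 0;

  match_mask >>= shift;                   /* drop CHAR bits before src */

  /* blsmsk: all bits up to and including the first NUL.  */
  uint64_t in_range = zero_mask ^ (zero_mask - 1);
  match_mask &= in_range;
  if (match_mask == 0)
    return NULL;                          /* string ends here, no match */

  /* bsr: highest remaining match is the last occurrence.  */
  unsigned last = 63 - (unsigned) __builtin_clzll (match_mask);
  return src + last;
}
```

The `zero_mask ^ (zero_mask - 1)` expression mirrors `blsmsk`, and taking the highest remaining bit mirrors `bsr`; when no terminator is found in the vector, the real code simply continues with the aligned loop.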
On Sun, Nov 12, 2023 at 11:57 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> I'm not sure why you need to use lots of ifdefs. I just modified 4 lines
> on top of your latest patch.
>
> At least all string/wcsmbs tests passed.
>
There is a cost for everything except the avx512 version. I'll update though.
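One detail behind the `movl %edi, %ecx` / `shrl $2` lines being shuffled in this exchange: the k-mask from the dword compares has one bit per 4-byte CHAR, so for wcsrchr the shift count has to be a CHAR index rather than a byte offset. A minimal sketch of that computation (hypothetical helper, not from glibc), assuming `vec_size` is the vector width in bytes:

```c
#include <stdint.h>

/* Shift count used to discard mask bits that lie before src: a byte
   offset for strrchr, a 4-byte CHAR index for wcsrchr (this is what
   the `andl $(VEC_SIZE - 1)' and `shrl $2' compute).  */
static inline unsigned
page_cross_shift_count (uintptr_t src, unsigned vec_size, int wide_chars)
{
  unsigned byte_off = (unsigned) src & (vec_size - 1);
  return wide_chars ? byte_off >> 2 : byte_off;
}
```

In the byte variants the original code can feed the shift count straight from the source pointer register, since the hardware masks the count to the operand width, which is presumably the cost referred to above for the non-512-bit versions once the `movl` becomes unconditional.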
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index cd6a0a870a..eb5f1f855a 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -35,7 +35,6 @@
 # define CHAR_SIZE 4
 # define VPCMP vpcmpd
 # define VPMIN vpminud
-# define VPCOMPRESS vpcompressd
 # define VPTESTN vptestnmd
 # define VPTEST vptestmd
 # define VPBROADCAST vpbroadcastd
@@ -46,7 +45,6 @@
 # define CHAR_SIZE 1
 # define VPCMP vpcmpb
 # define VPMIN vpminub
-# define VPCOMPRESS vpcompressb
 # define VPTESTN vptestnmb
 # define VPTEST vptestmb
 # define VPBROADCAST vpbroadcastb
@@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
-
+L(page_cross_continue):
 	VMOVU	(%rdi), %VMM(1)
 	/* k0 has a 1 for each zero CHAR in YMM1. */
 	VPTESTN	%VMM(1), %VMM(1), %k0
@@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	test	%VGPR(rsi), %VGPR(rsi)
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC. */
-L(page_cross_return):
+
 	/* K1 has a 1 for each search CHAR match in VEC(1). */
 	VPCMPEQ	%VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VGPR(rax)
+L(page_cross_return):
 	/* Build mask up until first zero CHAR (used to mask of
 	   potential search CHAR matches past the end of the string). */
 	blsmsk	%VGPR(rsi), %VGPR(rsi)
@@ -167,7 +166,6 @@ L(first_vec_x1_return):
 
 	.p2align 4,, 12
 L(aligned_more):
-L(page_cross_continue):
 	/* Need to keep original pointer incase VEC(1) has last match. */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
@@ -340,34 +338,49 @@ L(return_new_match_ret):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 4
 L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size. */
 	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
 	KMOV	%k0, %VRSI
 
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi). */
 # ifdef USE_AS_WCSRCHR
 	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
+	andl	$(VEC_SIZE - 1), %ecx
 	shrl	$2, %ecx
 # endif
-	shlx	%SHIFT_REG, %VRDX, %VRDX
-
-# ifdef USE_AS_WCSRCHR
-	kmovw	%edx, %k1
-# else
-	KMOV	%VRDX, %k1
-# endif
-
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks. */
 	shrx	%SHIFT_REG, %VRSI, %VRSI
+
 	test	%VRSI, %VRSI
-	jnz	L(page_cross_return)
-	jmp	L(page_cross_continue)
-	/* 1-byte from cache line. */
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR. */
+	VPCMP	$0, %VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi). */
+	shrx	%SHIFT_REG, %VRAX, %VRAX
+	/* Check if any search CHAR match in range. */
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret2)
+	bsr	%VRAX, %VRAX
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+L(ret2):
+	ret
+	/* 3 bytes from cache-line for evex.
+	   Crosses cache line for evex512. */
 END(STRRCHR)
 #endif
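Since the rewritten block only executes when the first unaligned load would cross a page, one way to exercise it is to place short strings immediately below an unmapped guard page. A self-contained sketch of such a check (assumed scaffolding, not glibc's test driver):

```c
#include <assert.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main (void)
{
  long page = sysconf (_SC_PAGESIZE);

  /* Map two pages and make the second inaccessible: any read past the
     page boundary now faults instead of silently succeeding.  */
  char *buf = mmap (NULL, 2 * page, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  assert (buf != MAP_FAILED);
  assert (mprotect (buf + page, page, PROT_NONE) == 0);

  /* Strings that end exactly at the boundary: for short lengths the
     first vector load from s would cross the page, so strrchr has to
     take the L(cross_page_boundary) path.  */
  for (size_t len = 1; len < 64; len++)
    {
      char *s = buf + page - (len + 1);
      memset (s, 'a', len);
      s[len] = '\0';
      s[len / 2] = 'b';                 /* single match in the middle */
      assert (strrchr (s, 'b') == s + len / 2);
      assert (strrchr (s, 'z') == NULL);
    }

  munmap (buf, 2 * page);
  return 0;
}
```

With the guard page mapped PROT_NONE, a stray read past the boundary faults; on SKX the pre-fix code would instead die with SIGILL in this path, because `vpcompressb` is not implemented there.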