Message ID | 20231108213319.1526816-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v2] x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S |
On Wed, Nov 8, 2023 at 2:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> was missing the CPU_FEATURE checks for VBMI2 in the
> ifunc/ifunc-impl-list.
>
> The fix is either to add those checks or change the logic to not use
> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> implementation is usable on SKX.
>
> New implementation is a bit slower, but this is in a cold path so it's
> probably okay.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 66 +++++++++++++-------
>  1 file changed, 43 insertions(+), 23 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index cd6a0a870a..7fb0a6a543 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -35,7 +35,6 @@
>  # define CHAR_SIZE	4
>  # define VPCMP		vpcmpd
>  # define VPMIN		vpminud
> -# define VPCOMPRESS	vpcompressd
>  # define VPTESTN	vptestnmd
>  # define VPTEST		vptestmd
>  # define VPBROADCAST	vpbroadcastd
> @@ -46,7 +45,6 @@
>  # define CHAR_SIZE	1
>  # define VPCMP		vpcmpb
>  # define VPMIN		vpminub
> -# define VPCOMPRESS	vpcompressb
>  # define VPTESTN	vptestnmb
>  # define VPTEST		vptestmb
>  # define VPBROADCAST	vpbroadcastb
> @@ -71,7 +69,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>  	andl	$(PAGE_SIZE - 1), %eax
>  	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
>  	jg	L(cross_page_boundary)
> -
> +L(page_cross_continue):
>  	VMOVU	(%rdi), %VMM(1)
>  	/* k0 has a 1 for each zero CHAR in YMM1.  */
>  	VPTESTN	%VMM(1), %VMM(1), %k0
> @@ -79,10 +77,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>  	test	%VGPR(rsi), %VGPR(rsi)
>  	jz	L(aligned_more)
>  	/* fallthrough: zero CHAR in first VEC.  */
> -L(page_cross_return):
> +
>  	/* K1 has a 1 for each search CHAR match in VEC(1).  */
>  	VPCMPEQ	%VMATCH, %VMM(1), %k1
>  	KMOV	%k1, %VGPR(rax)
> +L(page_cross_return):
>  	/* Build mask up until first zero CHAR (used to mask of
>  	   potential search CHAR matches past the end of the string).  */
>  	blsmsk	%VGPR(rsi), %VGPR(rsi)
> @@ -167,7 +166,6 @@ L(first_vec_x1_return):
>
>  	.p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>  	/* Need to keep original pointer incase VEC(1) has last match.  */
>  	movq	%rdi, %rsi
>  	andq	$-VEC_SIZE, %rdi
> @@ -340,34 +338,56 @@ L(return_new_match_ret):
>  	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  	ret
>
> -	.p2align 4,, 4
>  L(cross_page_boundary):
> +	/* eax contains all the page offset bits of src (rdi). `xor rdi,
> +	   rax` sets pointer will all page offset bits cleared so
> +	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +	   before page cross (guaranteed to be safe to read). Doing this
> +	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +	   a bit of code size. */
>  	xorq	%rdi, %rax
> -	mov	$-1, %VRDX
> -	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -	VPTESTN	%VMM(6), %VMM(6), %k0
> +	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +	VPTESTN	%VMM(1), %VMM(1), %k0
>  	KMOV	%k0, %VRSI
>
> +	/* Shift out zero CHAR matches that are before the beginning of
> +	   src (rdi). */
>  # ifdef USE_AS_WCSRCHR
>  	movl	%edi, %ecx
> -	and	$(VEC_SIZE - 1), %ecx
> +	andl	$(VEC_SIZE - 1), %ecx
>  	shrl	$2, %ecx
>  # endif
> -	shlx	%SHIFT_REG, %VRDX, %VRDX
> +	shrx	%SHIFT_REG, %VRSI, %VRSI
>
> -# ifdef USE_AS_WCSRCHR
> -	kmovw	%edx, %k1
> +	test	%VRSI, %VRSI
> +	jz	L(page_cross_continue)
> +
> +	/* Found zero CHAR so need to test for search CHAR. */
> +	VPCMP	$0, %VMATCH, %VMM(1), %k1
> +	KMOV	%k1, %VRAX
> +	/* Shift out search CHAR matches that are before the beginning of
> +	   src (rdi). */
> +	shrx	%SHIFT_REG, %VRAX, %VRAX
> +	/* For strrchr VEC_SIZE == 64 we are just at the end of a cacheline
> +	   here so to save code-size just re-use return logic for first
> +	   VEC. This is relatively cold code (page cross). */
> +# if VEC_SIZE == 64 && !(defined USE_AS_WCSRCHR)
> +	jmp	L(page_cross_return)

Still, an unconditional jump is hard to read and understand unless someone
builds and analyzes the memory layout.

> +	/* 6 bytes from cache-line. */
>  # else
> -	KMOV	%VRDX, %k1
> +	/* Check if any search CHAR match in range. */
> +	blsmsk	%VRSI, %VRSI
> +	and	%VRSI, %VRAX
> +	jz	L(ret2)
> +	bsr	%VRAX, %VRAX
> +# ifdef USE_AS_WCSRCHR
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	addq	%rdi, %rax
> +# endif
> +L(ret2):
> +	ret
> +	/* 3 bytes from cache-line. */
>  # endif
> -
> -	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> -	/* We could technically just jmp back after the vpcompress but
> -	   it doesn't save any 16-byte blocks. */
> -	shrx	%SHIFT_REG, %VRSI, %VRSI
> -	test	%VRSI, %VRSI
> -	jnz	L(page_cross_return)
> -	jmp	L(page_cross_continue)
> -	/* 1-byte from cache line. */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
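For context on the feature gate being discussed: the byte-granularity `vpcompressb` used in the old page-cross path is an AVX512-VBMI2 instruction (the dword form `vpcompressd` only needs AVX512F/VL), and SKX has AVX512F/BW/VL but not VBMI2, so an unchecked `vpcompressb` can fault there. Below is a minimal, self-contained probe showing how to check for VBMI2 on the running CPU; it is only a sketch using GCC/Clang's `<cpuid.h>`, not glibc's CPU_FEATURE machinery, and the helper name is illustrative.

```c
#include <cpuid.h>
#include <stdio.h>

/* AVX512-VBMI2 is reported in CPUID.(EAX=7,ECX=0):ECX bit 6.
   __get_cpuid_count returns 0 if leaf 7 is not supported at all.  */
static int
has_avx512_vbmi2 (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
    return 0;
  return (ecx >> 6) & 1;
}

int
main (void)
{
  printf ("AVX512-VBMI2: %s\n", has_avx512_vbmi2 () ? "yes" : "no");
  return 0;
}
```

The v2 patch takes the other route mentioned in the commit message: it drops the `vpcompress{b|d}` dependency entirely, so no extra gate is needed for the page-cross path.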
On Wed, Nov 8, 2023 at 6:10 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Wed, Nov 8, 2023 at 2:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> was missing the CPU_FEATURE checks for VBMI2 in the
>> ifunc/ifunc-impl-list.
>>
>> [...]
>>
>> +	/* For strrchr VEC_SIZE == 64 we are just at the end of a cacheline
>> +	   here so to save code-size just re-use return logic for first
>> +	   VEC. This is relatively cold code (page cross). */
>> +# if VEC_SIZE == 64 && !(defined USE_AS_WCSRCHR)
>> +	jmp	L(page_cross_return)
>
> Still, an unconditional jump is hard to read and understand unless someone
> builds and analyzes the memory layout.
>

That's why we have the comment.
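To follow the rewritten page-cross flow without building and inspecting the memory layout, here is a minimal scalar C model of it. This is an illustration only, not glibc code: VEC_SIZE = 64, byte-sized CHARs, and the helper and demo names are assumptions. It mirrors the `xorq %rdi, %rax` page-base trick, the `shrx` that discards lanes before src, and the `blsmsk`/`and`/`bsr` return sequence.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  64    /* assumed: modelling the 512-bit, byte-CHAR variant */

/* Scalar stand-in for the rewritten L(cross_page_boundary) path.  Returns a
   pointer to the last match of WANTED before the terminator if both lie in
   the last aligned VEC of SRC's page, and NULL otherwise.  NULL also covers
   the "no terminator in this VEC" case where the real code jumps back to
   L(page_cross_continue) and keeps scanning.  */
static const char *
page_cross_model (const char *src, char wanted)
{
  uintptr_t off = (uintptr_t) src & (PAGE_SIZE - 1);     /* %rax on entry */
  /* xorq %rdi, %rax: clear the page-offset bits, then address the last
     aligned VEC of the page, which is always safe to read.  */
  const char *vec = (const char *) ((uintptr_t) src ^ off)
                    + (PAGE_SIZE - VEC_SIZE);

  uint64_t zero_mask = 0, match_mask = 0;
  for (int i = 0; i < VEC_SIZE; i++)                     /* VPTESTN / VPCMP $0 */
    {
      zero_mask  |= (uint64_t) (vec[i] == '\0') << i;
      match_mask |= (uint64_t) (vec[i] == wanted) << i;
    }

  unsigned int shift = (uintptr_t) src & (VEC_SIZE - 1);
  zero_mask >>= shift;                  /* shrx: drop lanes before src */
  if (zero_mask == 0)
    return NULL;                        /* jz L(page_cross_continue) */

  match_mask >>= shift;                 /* shrx on the match mask */
  uint64_t in_range = zero_mask ^ (zero_mask - 1);       /* blsmsk */
  match_mask &= in_range;               /* and %VRSI, %VRAX */
  if (match_mask == 0)
    return NULL;                        /* jz L(ret2) */
  return src + (63 - __builtin_clzll (match_mask));      /* bsr + addq %rdi */
}

int
main (void)
{
  /* Put a short string just before a page boundary so the whole last VEC of
     the page stays inside the allocation.  */
  char *page = aligned_alloc (PAGE_SIZE, PAGE_SIZE);
  if (page == NULL)
    return 1;
  memset (page, 'a', PAGE_SIZE);
  char *s = page + PAGE_SIZE - 10;
  memcpy (s, "abcXbc", 7);              /* terminator 6 bytes into the string */
  const char *r = page_cross_model (s, 'b');
  printf ("last 'b' at offset %ld\n", r ? (long) (r - s) : -1L);
  free (page);
  return 0;
}
```

Compiled and run, the demo prints offset 4, the last 'b' before the terminator, which is what the `bsr`-based return in the new #else branch computes.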