
x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S

Message ID 20231101221657.311121-1-goldstein.w.n@gmail.com
State New
Series x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S

Commit Message

Noah Goldstein Nov. 1, 2023, 10:16 p.m. UTC
strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
was missing the CPU_FEATURE checks for VBMI2 in the
ifunc/ifunc-impl-list.

The fix is either to add those checks or change the logic to not use
`vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
implementation is usable on SKX.

New implementation is a bit slower, but this is in a cold path so it's
probably okay.
---
 sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
 1 file changed, 43 insertions(+), 21 deletions(-)
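
For reference, the first option mentioned above (adding the missing checks)
would have meant gating the EVEX entries on VBMI2 in the ifunc selectors. A
minimal sketch of such a gate, assuming glibc's usual IFUNC_IMPL_ADD /
CPU_FEATURE_USABLE macros in sysdeps/x86_64/multiarch/ifunc-impl-list.c;
this is illustrative only and not part of the patch:

    /* Hypothetical sketch: require VBMI2 before selecting the EVEX
       version of strrchr.  */
    IFUNC_IMPL_ADD (array, i, strrchr,
                    (CPU_FEATURE_USABLE (AVX512VL)
                     && CPU_FEATURE_USABLE (AVX512BW)
                     && CPU_FEATURE_USABLE (AVX512VBMI2)),
                    __strrchr_evex)

The patch takes the second option instead and drops the VBMI2 dependency
from the page-cross path, so no new feature checks are needed.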

Comments

Noah Goldstein Nov. 7, 2023, 5:51 p.m. UTC | #1
On Wed, Nov 1, 2023 at 5:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> was missing the CPU_FEATURE checks for VBMI2 in the
> ifunc/ifunc-impl-list.
>
> The fix is either to add those checks or change the logic to not use
> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> implementation is usable on SKX.
>
> New implementation is a bit slower, but this is in a cold path so its
> probably okay.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>  1 file changed, 43 insertions(+), 21 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index cd6a0a870a..b174d43208 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         jg      L(cross_page_boundary)
> -
> +L(page_cross_continue):
>         VMOVU   (%rdi), %VMM(1)
>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>         VPTESTN %VMM(1), %VMM(1), %k0
> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         test    %VGPR(rsi), %VGPR(rsi)
>         jz      L(aligned_more)
>         /* fallthrough: zero CHAR in first VEC.  */
> -L(page_cross_return):
> +
>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>         VPCMPEQ %VMATCH, %VMM(1), %k1
>         KMOV    %k1, %VGPR(rax)
> +L(page_cross_return):
>         /* Build mask up until first zero CHAR (used to mask of
>            potential search CHAR matches past the end of the string).  */
>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>
>         .p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>         /* Need to keep original pointer incase VEC(1) has last match.  */
>         movq    %rdi, %rsi
>         andq    $-VEC_SIZE, %rdi
> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 4
>  L(cross_page_boundary):
> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> +          rax` sets pointer will all page offset bits cleared so
> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +          before page cross (guaranteed to be safe to read). Doing this
> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +          a bit of code size.  */
>         xorq    %rdi, %rax
> -       mov     $-1, %VRDX
> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -       VPTESTN %VMM(6), %VMM(6), %k0
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
>         KMOV    %k0, %VRSI
>
> +       /* Shift out zero CHAR matches that are before the beginning of
> +          src (rdi).  */
>  # ifdef USE_AS_WCSRCHR
>         movl    %edi, %ecx
> -       and     $(VEC_SIZE - 1), %ecx
> +       andl    $(VEC_SIZE - 1), %ecx
>         shrl    $2, %ecx
>  # endif
> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>
> -# ifdef USE_AS_WCSRCHR
> -       kmovw   %edx, %k1
> +       test    %VRSI, %VRSI
> +       jz      L(page_cross_continue)
> +
> +       /* Found zero CHAR so need to test for search CHAR.  */
> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       /* Shift out search CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> +          so to save code-size just re-use return logic for first VEC.
> +          This is relatively cold code (page cross).  */
> +# if VEC_SIZE == 64
> +       jmp     L(page_cross_return)
> +    /* 6 bytes from cache-line.  */
>  # else
> -       KMOV    %VRDX, %k1
> +       /* Check if any search CHAR match in range.  */
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
> +       jz      L(ret2)
> +       bsr     %VRAX, %VRAX
> +#  ifdef USE_AS_WCSRCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       addq    %rdi, %rax
> +#  endif
> +L(ret2):
> +       ret
> +       /* 3 bytes from cache-line.  */
>  # endif
> -
> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> -       /* We could technically just jmp back after the vpcompress but
> -          it doesn't save any 16-byte blocks.  */
> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> -       test    %VRSI, %VRSI
> -       jnz     L(page_cross_return)
> -       jmp     L(page_cross_continue)
> -       /* 1-byte from cache line.  */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
>
ping
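
The L(cross_page_boundary) comment in the hunk above describes the pointer
trick in prose; a worked example in C may make it concrete. The constants
and the sample pointer below are made up for illustration (PAGE_SIZE and
VEC_SIZE match the evex/ymm build):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32   /* 64 for the evex512 build */

    int
    main (void)
    {
      /* Hypothetical source pointer near the end of a page.  */
      uintptr_t rdi  = 0x7f1234567fe5UL;
      uintptr_t rax  = rdi & (PAGE_SIZE - 1);  /* page-offset bits: 0xfe5 */
      uintptr_t base = rdi ^ rax;              /* offset bits cleared: ...567000 */
      /* Address of the last VEC_SIZE-aligned load that stays inside the
         page, i.e. what (PAGE_SIZE - VEC_SIZE)(%rax) computes.  */
      uintptr_t last_vec = base + (PAGE_SIZE - VEC_SIZE);
      printf ("%#lx\n", (unsigned long) last_vec);  /* 0x7f1234567fe0 */
      return 0;
    }

On this path rdi's page offset is greater than PAGE_SIZE - VEC_SIZE, so the
result matches what `movq %rdi, %rax; andq $-VEC_SIZE, %rax` would produce,
just with slightly smaller code, as the patch comment notes.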
Florian Weimer Nov. 7, 2023, 6:25 p.m. UTC | #2
* Noah Goldstein:

> On Wed, Nov 1, 2023 at 5:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> was missing the CPU_FEATURE checks for VBMI2 in the
>> ifunc/ifunc-impl-list.
>>
>> The fix is either to add those checks or change the logic to not use
>> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> implementation is usable on SKX.
>>
>> New implementation is a bit slower, but this is in a cold path so its
>> probably okay.
>> ---
>>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>>  1 file changed, 43 insertions(+), 21 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> index cd6a0a870a..b174d43208 100644
>> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         andl    $(PAGE_SIZE - 1), %eax
>>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>>         jg      L(cross_page_boundary)
>> -
>> +L(page_cross_continue):
>>         VMOVU   (%rdi), %VMM(1)
>>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>>         VPTESTN %VMM(1), %VMM(1), %k0
>> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         test    %VGPR(rsi), %VGPR(rsi)
>>         jz      L(aligned_more)
>>         /* fallthrough: zero CHAR in first VEC.  */
>> -L(page_cross_return):
>> +
>>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>>         VPCMPEQ %VMATCH, %VMM(1), %k1
>>         KMOV    %k1, %VGPR(rax)
>> +L(page_cross_return):
>>         /* Build mask up until first zero CHAR (used to mask of
>>            potential search CHAR matches past the end of the string).  */
>>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>>
>>         .p2align 4,, 12
>>  L(aligned_more):
>> -L(page_cross_continue):
>>         /* Need to keep original pointer incase VEC(1) has last match.  */
>>         movq    %rdi, %rsi
>>         andq    $-VEC_SIZE, %rdi
>> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>>         ret
>>
>> -       .p2align 4,, 4
>>  L(cross_page_boundary):
>> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> +          rax` sets pointer will all page offset bits cleared so
>> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> +          before page cross (guaranteed to be safe to read). Doing this
>> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> +          a bit of code size.  */
>>         xorq    %rdi, %rax
>> -       mov     $-1, %VRDX
>> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> -       VPTESTN %VMM(6), %VMM(6), %k0
>> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> +       VPTESTN %VMM(1), %VMM(1), %k0
>>         KMOV    %k0, %VRSI
>>
>> +       /* Shift out zero CHAR matches that are before the beginning of
>> +          src (rdi).  */
>>  # ifdef USE_AS_WCSRCHR
>>         movl    %edi, %ecx
>> -       and     $(VEC_SIZE - 1), %ecx
>> +       andl    $(VEC_SIZE - 1), %ecx
>>         shrl    $2, %ecx
>>  # endif
>> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>>
>> -# ifdef USE_AS_WCSRCHR
>> -       kmovw   %edx, %k1
>> +       test    %VRSI, %VRSI
>> +       jz      L(page_cross_continue)
>> +
>> +       /* Found zero CHAR so need to test for search CHAR.  */
>> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> +       KMOV    %k1, %VRAX
>> +       /* Shift out search CHAR matches that are before the beginning of
>> +          src (rdi).  */
>> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
>> +          so to save code-size just re-use return logic for first VEC.
>> +          This is relatively cold code (page cross).  */
>> +# if VEC_SIZE == 64
>> +       jmp     L(page_cross_return)
>> +    /* 6 bytes from cache-line.  */
>>  # else
>> -       KMOV    %VRDX, %k1
>> +       /* Check if any search CHAR match in range.  */
>> +       blsmsk  %VRSI, %VRSI
>> +       and     %VRSI, %VRAX
>> +       jz      L(ret2)
>> +       bsr     %VRAX, %VRAX
>> +#  ifdef USE_AS_WCSRCHR
>> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> +#  else
>> +       addq    %rdi, %rax
>> +#  endif
>> +L(ret2):
>> +       ret
>> +       /* 3 bytes from cache-line.  */
>>  # endif
>> -
>> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>> -       /* We could technically just jmp back after the vpcompress but
>> -          it doesn't save any 16-byte blocks.  */
>> -       shrx    %SHIFT_REG, %VRSI, %VRSI
>> -       test    %VRSI, %VRSI
>> -       jnz     L(page_cross_return)
>> -       jmp     L(page_cross_continue)
>> -       /* 1-byte from cache line.  */
>>  END(STRRCHR)
>>  #endif
>> --
>> 2.34.1
>>
> ping

Sunil, could you review this?

Thanks,
Florian
Sunil Pandey Nov. 8, 2023, 5:22 a.m. UTC | #3
On Tue, Nov 7, 2023 at 10:25 AM Florian Weimer <fweimer@redhat.com> wrote:

> * Noah Goldstein:
>
> > On Wed, Nov 1, 2023 at 5:17 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >>
> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> >> was missing the CPU_FEATURE checks for VBMI2 in the
> >> ifunc/ifunc-impl-list.
> >>
> >> The fix is either to add those checks or change the logic to not use
> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> >> implementation is usable on SKX.
> >>
> >> New implementation is a bit slower, but this is in a cold path so its
> >> probably okay.
> >> ---
> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
> >>  1 file changed, 43 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> index cd6a0a870a..b174d43208 100644
> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         andl    $(PAGE_SIZE - 1), %eax
> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >>         jg      L(cross_page_boundary)
> >> -
> >> +L(page_cross_continue):
> >>         VMOVU   (%rdi), %VMM(1)
> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
> >>         VPTESTN %VMM(1), %VMM(1), %k0
> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         test    %VGPR(rsi), %VGPR(rsi)
> >>         jz      L(aligned_more)
> >>         /* fallthrough: zero CHAR in first VEC.  */
> >> -L(page_cross_return):
> >> +
> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
> >>         KMOV    %k1, %VGPR(rax)
> >> +L(page_cross_return):
> >>         /* Build mask up until first zero CHAR (used to mask of
> >>            potential search CHAR matches past the end of the string).
> */
> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
> >>
> >>         .p2align 4,, 12
> >>  L(aligned_more):
> >> -L(page_cross_continue):
> >>         /* Need to keep original pointer incase VEC(1) has last match.
> */
> >>         movq    %rdi, %rsi
> >>         andq    $-VEC_SIZE, %rdi
> >> @@ -340,34 +340,56 @@ L(return_new_match_ret):
> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >>         ret
> >>
> >> -       .p2align 4,, 4
> >>  L(cross_page_boundary):
> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> >> +          rax` sets pointer will all page offset bits cleared so
> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> >> +          before page cross (guaranteed to be safe to read). Doing this
> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> >> +          a bit of code size.  */
> >>         xorq    %rdi, %rax
> >> -       mov     $-1, %VRDX
> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> >> -       VPTESTN %VMM(6), %VMM(6), %k0
> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> >> +       VPTESTN %VMM(1), %VMM(1), %k0
> >>         KMOV    %k0, %VRSI
> >>
> >> +       /* Shift out zero CHAR matches that are before the beginning of
> >> +          src (rdi).  */
> >>  # ifdef USE_AS_WCSRCHR
> >>         movl    %edi, %ecx
> >> -       and     $(VEC_SIZE - 1), %ecx
> >> +       andl    $(VEC_SIZE - 1), %ecx
> >>         shrl    $2, %ecx
> >>  # endif
> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> >> +       shrx    %SHIFT_REG, %VRSI, %VRSI
> >>
> >> -# ifdef USE_AS_WCSRCHR
> >> -       kmovw   %edx, %k1
> >> +       test    %VRSI, %VRSI
> >> +       jz      L(page_cross_continue)
> >> +
> >> +       /* Found zero CHAR so need to test for search CHAR.  */
> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> >> +       KMOV    %k1, %VRAX
> >> +       /* Shift out search CHAR matches that are before the beginning
> of
> >> +          src (rdi).  */
> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> >> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> >> +          so to save code-size just re-use return logic for first VEC.
> >> +          This is relatively cold code (page cross).  */
> >> +# if VEC_SIZE == 64
> >> +       jmp     L(page_cross_return)
> >> +    /* 6 bytes from cache-line.  */
> >>  # else
> >> -       KMOV    %VRDX, %k1
> >> +       /* Check if any search CHAR match in range.  */
> >> +       blsmsk  %VRSI, %VRSI
> >> +       and     %VRSI, %VRAX
> >> +       jz      L(ret2)
> >> +       bsr     %VRAX, %VRAX
> >> +#  ifdef USE_AS_WCSRCHR
> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >> +#  else
> >> +       addq    %rdi, %rax
> >> +#  endif
> >> +L(ret2):
> >> +       ret
> >> +       /* 3 bytes from cache-line.  */
> >>  # endif
> >> -
> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> >> -       /* We could technically just jmp back after the vpcompress but
> >> -          it doesn't save any 16-byte blocks.  */
> >> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> >> -       test    %VRSI, %VRSI
> >> -       jnz     L(page_cross_return)
> >> -       jmp     L(page_cross_continue)
> >> -       /* 1-byte from cache line.  */
> >>  END(STRRCHR)
> >>  #endif
> >> --
> >> 2.34.1
> >>
> > ping
>
> Sunil, could you review this?
>

Sure, I'm looking into it.


>
> Thanks,
> Florian
>
>
Sunil Pandey Nov. 8, 2023, 7:04 p.m. UTC | #4
On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> was missing the CPU_FEATURE checks for VBMI2 in the
> ifunc/ifunc-impl-list.
>
> The fix is either to add those checks or change the logic to not use
> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> implementation is usable on SKX.
>
> New implementation is a bit slower, but this is in a cold path so its
> probably okay.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>  1 file changed, 43 insertions(+), 21 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> index cd6a0a870a..b174d43208 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         jg      L(cross_page_boundary)
> -
> +L(page_cross_continue):
>         VMOVU   (%rdi), %VMM(1)
>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>         VPTESTN %VMM(1), %VMM(1), %k0
> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>         test    %VGPR(rsi), %VGPR(rsi)
>         jz      L(aligned_more)
>         /* fallthrough: zero CHAR in first VEC.  */
> -L(page_cross_return):
> +
>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>         VPCMPEQ %VMATCH, %VMM(1), %k1
>         KMOV    %k1, %VGPR(rax)
> +L(page_cross_return):
>         /* Build mask up until first zero CHAR (used to mask of
>            potential search CHAR matches past the end of the string).  */
>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>
>         .p2align 4,, 12
>  L(aligned_more):
> -L(page_cross_continue):
>         /* Need to keep original pointer incase VEC(1) has last match.  */
>         movq    %rdi, %rsi
>         andq    $-VEC_SIZE, %rdi
> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 4
>  L(cross_page_boundary):
> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> +          rax` sets pointer will all page offset bits cleared so
> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> +          before page cross (guaranteed to be safe to read). Doing this
> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> +          a bit of code size.  */
>         xorq    %rdi, %rax
> -       mov     $-1, %VRDX
> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> -       VPTESTN %VMM(6), %VMM(6), %k0
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +       VPTESTN %VMM(1), %VMM(1), %k0
>         KMOV    %k0, %VRSI
>
> +       /* Shift out zero CHAR matches that are before the beginning of
> +          src (rdi).  */
>  # ifdef USE_AS_WCSRCHR
>         movl    %edi, %ecx
> -       and     $(VEC_SIZE - 1), %ecx
> +       andl    $(VEC_SIZE - 1), %ecx
>         shrl    $2, %ecx
>  # endif
> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>
> -# ifdef USE_AS_WCSRCHR
> -       kmovw   %edx, %k1
> +       test    %VRSI, %VRSI
> +       jz      L(page_cross_continue)
> +
> +       /* Found zero CHAR so need to test for search CHAR.  */
> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       /* Shift out search CHAR matches that are before the beginning of
> +          src (rdi).  */
> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> +          so to save code-size just re-use return logic for first VEC.
> +          This is relatively cold code (page cross).  */
> +# if VEC_SIZE == 64
> +       jmp     L(page_cross_return)
>

Can you please remove the unconditional jump? It's simply confusing and
doesn't help across the board.

$ tail -7 /tmp/wcsrchr-evex.out
 301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
 306: c4 e2 48 f3 d6       blsmsk %esi,%esi
 30b: 21 f0                 and    %esi,%eax
 30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
 30f: 0f bd c0             bsr    %eax,%eax
 312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
 316: c3                   ret

$ tail -7 /tmp/wcsrchr-evex512.out
 300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
 305: 85 f6                 test   %esi,%esi
 307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
 30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
 314: c5 fb 93 c1           kmovd  %k1,%eax
 318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
 31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>

$ tail -7 /tmp/strrchr-evex.out
 2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
 2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
 2f2: 21 f0                 and    %esi,%eax
 2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
 2f6: 0f bd c0             bsr    %eax,%eax
 2f9: 48 01 f8             add    %rdi,%rax
 2fc: c3                   ret

 $ tail -7 /tmp/strrchr-evex512.out
 315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
 31a: 48 85 f6             test   %rsi,%rsi
 31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
 323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
 32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
 32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
 334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>



> +    /* 6 bytes from cache-line.  */
>  # else
> -       KMOV    %VRDX, %k1
> +       /* Check if any search CHAR match in range.  */
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
> +       jz      L(ret2)
> +       bsr     %VRAX, %VRAX
> +#  ifdef USE_AS_WCSRCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
> +       addq    %rdi, %rax
> +#  endif
> +L(ret2):
> +       ret
> +       /* 3 bytes from cache-line.  */
>  # endif
> -
> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>

Please remove the VPCOMPRESS definition as it's not used.


> -       /* We could technically just jmp back after the vpcompress but
> -          it doesn't save any 16-byte blocks.  */
> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> -       test    %VRSI, %VRSI
> -       jnz     L(page_cross_return)
> -       jmp     L(page_cross_continue)
> -       /* 1-byte from cache line.  */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
>
>
Noah Goldstein Nov. 8, 2023, 7:22 p.m. UTC | #5
On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> was missing the CPU_FEATURE checks for VBMI2 in the
>> ifunc/ifunc-impl-list.
>>
>> The fix is either to add those checks or change the logic to not use
>> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> implementation is usable on SKX.
>>
>> New implementation is a bit slower, but this is in a cold path so its
>> probably okay.
>> ---
>>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>>  1 file changed, 43 insertions(+), 21 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> index cd6a0a870a..b174d43208 100644
>> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         andl    $(PAGE_SIZE - 1), %eax
>>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>>         jg      L(cross_page_boundary)
>> -
>> +L(page_cross_continue):
>>         VMOVU   (%rdi), %VMM(1)
>>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>>         VPTESTN %VMM(1), %VMM(1), %k0
>> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         test    %VGPR(rsi), %VGPR(rsi)
>>         jz      L(aligned_more)
>>         /* fallthrough: zero CHAR in first VEC.  */
>> -L(page_cross_return):
>> +
>>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>>         VPCMPEQ %VMATCH, %VMM(1), %k1
>>         KMOV    %k1, %VGPR(rax)
>> +L(page_cross_return):
>>         /* Build mask up until first zero CHAR (used to mask of
>>            potential search CHAR matches past the end of the string).  */
>>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>>
>>         .p2align 4,, 12
>>  L(aligned_more):
>> -L(page_cross_continue):
>>         /* Need to keep original pointer incase VEC(1) has last match.  */
>>         movq    %rdi, %rsi
>>         andq    $-VEC_SIZE, %rdi
>> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>>         ret
>>
>> -       .p2align 4,, 4
>>  L(cross_page_boundary):
>> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> +          rax` sets pointer will all page offset bits cleared so
>> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> +          before page cross (guaranteed to be safe to read). Doing this
>> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> +          a bit of code size.  */
>>         xorq    %rdi, %rax
>> -       mov     $-1, %VRDX
>> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> -       VPTESTN %VMM(6), %VMM(6), %k0
>> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> +       VPTESTN %VMM(1), %VMM(1), %k0
>>         KMOV    %k0, %VRSI
>>
>> +       /* Shift out zero CHAR matches that are before the beginning of
>> +          src (rdi).  */
>>  # ifdef USE_AS_WCSRCHR
>>         movl    %edi, %ecx
>> -       and     $(VEC_SIZE - 1), %ecx
>> +       andl    $(VEC_SIZE - 1), %ecx
>>         shrl    $2, %ecx
>>  # endif
>> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>>
>> -# ifdef USE_AS_WCSRCHR
>> -       kmovw   %edx, %k1
>> +       test    %VRSI, %VRSI
>> +       jz      L(page_cross_continue)
>> +
>> +       /* Found zero CHAR so need to test for search CHAR.  */
>> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> +       KMOV    %k1, %VRAX
>> +       /* Shift out search CHAR matches that are before the beginning of
>> +          src (rdi).  */
>> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
>> +          so to save code-size just re-use return logic for first VEC.
>> +          This is relatively cold code (page cross).  */
>> +# if VEC_SIZE == 64
>> +       jmp     L(page_cross_return)
>
>
> Can you please remove unconditional jump. It's simply confusing and doesn't help across the board.
>
> $ tail -7 /tmp/wcsrchr-evex.out
>  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
>  30b: 21 f0                 and    %esi,%eax
>  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
>  30f: 0f bd c0             bsr    %eax,%eax
>  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
>  316: c3                   ret
>
> $ tail -7 /tmp/wcsrchr-evex512.out
>  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
>  305: 85 f6                 test   %esi,%esi
>  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
>  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
>  314: c5 fb 93 c1           kmovd  %k1,%eax
>  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
>
> $ tail -7 /tmp/strrchr-evex.out
>  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
>  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
>  2f2: 21 f0                 and    %esi,%eax
>  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
>  2f6: 0f bd c0             bsr    %eax,%eax
>  2f9: 48 01 f8             add    %rdi,%rax
>  2fc: c3                   ret
>
>  $ tail -7 /tmp/strrchr-evex512.out
>  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
>  31a: 48 85 f6             test   %rsi,%rsi
>  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
>  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
>  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
>  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
>  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
It helps here (or at least saves a cache line). But I'm not super invested
either way.
>
>
>>
>> +    /* 6 bytes from cache-line.  */
>>  # else
>> -       KMOV    %VRDX, %k1
>> +       /* Check if any search CHAR match in range.  */
>> +       blsmsk  %VRSI, %VRSI
>> +       and     %VRSI, %VRAX
>> +       jz      L(ret2)
>> +       bsr     %VRAX, %VRAX
>> +#  ifdef USE_AS_WCSRCHR
>> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> +#  else
>> +       addq    %rdi, %rax
>> +#  endif
>> +L(ret2):
>> +       ret
>> +       /* 3 bytes from cache-line.  */
>>  # endif
>> -
>> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>
>
> Please remove the VCOMPRESS definition as it's not used.
>
>>
>> -       /* We could technically just jmp back after the vpcompress but
>> -          it doesn't save any 16-byte blocks.  */
>> -       shrx    %SHIFT_REG, %VRSI, %VRSI
>> -       test    %VRSI, %VRSI
>> -       jnz     L(page_cross_return)
>> -       jmp     L(page_cross_continue)
>> -       /* 1-byte from cache line.  */
>>  END(STRRCHR)
>>  #endif
>> --
>> 2.34.1
>>
Sunil Pandey Nov. 8, 2023, 8:08 p.m. UTC | #6
On Wed, Nov 8, 2023 at 11:22 AM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> >
> >
> > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >>
> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> >> was missing the CPU_FEATURE checks for VBMI2 in the
> >> ifunc/ifunc-impl-list.
> >>
> >> The fix is either to add those checks or change the logic to not use
> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> >> implementation is usable on SKX.
> >>
> >> New implementation is a bit slower, but this is in a cold path so its
> >> probably okay.
> >> ---
> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
> >>  1 file changed, 43 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> index cd6a0a870a..b174d43208 100644
> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         andl    $(PAGE_SIZE - 1), %eax
> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >>         jg      L(cross_page_boundary)
> >> -
> >> +L(page_cross_continue):
> >>         VMOVU   (%rdi), %VMM(1)
> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
> >>         VPTESTN %VMM(1), %VMM(1), %k0
> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >>         test    %VGPR(rsi), %VGPR(rsi)
> >>         jz      L(aligned_more)
> >>         /* fallthrough: zero CHAR in first VEC.  */
> >> -L(page_cross_return):
> >> +
> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
> >>         KMOV    %k1, %VGPR(rax)
> >> +L(page_cross_return):
> >>         /* Build mask up until first zero CHAR (used to mask of
> >>            potential search CHAR matches past the end of the string).
> */
> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
> >>
> >>         .p2align 4,, 12
> >>  L(aligned_more):
> >> -L(page_cross_continue):
> >>         /* Need to keep original pointer incase VEC(1) has last match.
> */
> >>         movq    %rdi, %rsi
> >>         andq    $-VEC_SIZE, %rdi
> >> @@ -340,34 +340,56 @@ L(return_new_match_ret):
> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >>         ret
> >>
> >> -       .p2align 4,, 4
> >>  L(cross_page_boundary):
> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> >> +          rax` sets pointer will all page offset bits cleared so
> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> >> +          before page cross (guaranteed to be safe to read). Doing this
> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> >> +          a bit of code size.  */
> >>         xorq    %rdi, %rax
> >> -       mov     $-1, %VRDX
> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> >> -       VPTESTN %VMM(6), %VMM(6), %k0
> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> >> +       VPTESTN %VMM(1), %VMM(1), %k0
> >>         KMOV    %k0, %VRSI
> >>
> >> +       /* Shift out zero CHAR matches that are before the beginning of
> >> +          src (rdi).  */
> >>  # ifdef USE_AS_WCSRCHR
> >>         movl    %edi, %ecx
> >> -       and     $(VEC_SIZE - 1), %ecx
> >> +       andl    $(VEC_SIZE - 1), %ecx
> >>         shrl    $2, %ecx
> >>  # endif
> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> >> +       shrx    %SHIFT_REG, %VRSI, %VRSI
> >>
> >> -# ifdef USE_AS_WCSRCHR
> >> -       kmovw   %edx, %k1
> >> +       test    %VRSI, %VRSI
> >> +       jz      L(page_cross_continue)
> >> +
> >> +       /* Found zero CHAR so need to test for search CHAR.  */
> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> >> +       KMOV    %k1, %VRAX
> >> +       /* Shift out search CHAR matches that are before the beginning
> of
> >> +          src (rdi).  */
> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> >> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> >> +          so to save code-size just re-use return logic for first VEC.
> >> +          This is relatively cold code (page cross).  */
> >> +# if VEC_SIZE == 64
> >> +       jmp     L(page_cross_return)
> >
> >
> > Can you please remove unconditional jump. It's simply confusing and
> doesn't help across the board.
> >
> > $ tail -7 /tmp/wcsrchr-evex.out
> >  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
> >  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
> >  30b: 21 f0                 and    %esi,%eax
> >  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
> >  30f: 0f bd c0             bsr    %eax,%eax
> >  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
> >  316: c3                   ret
> >
> > $ tail -7 /tmp/wcsrchr-evex512.out
> >  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
> >  305: 85 f6                 test   %esi,%esi
> >  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
> >  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
> >  314: c5 fb 93 c1           kmovd  %k1,%eax
> >  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
> >  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
> >
>

Unnecessary unconditional jump here, as the entire code will fit in the
same cache line. Also, it will slow down execution.

In general, I'm not in favor of an unconditional jump unless it can provide
a significant perf boost.

> $ tail -7 /tmp/strrchr-evex.out
> >  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
> >  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
> >  2f2: 21 f0                 and    %esi,%eax
> >  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
> >  2f6: 0f bd c0             bsr    %eax,%eax
> >  2f9: 48 01 f8             add    %rdi,%rax
> >  2fc: c3                   ret
> >
> >  $ tail -7 /tmp/strrchr-evex512.out
> >  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
> >  31a: 48 85 f6             test   %rsi,%rsi
> >  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
> >  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
> >  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
> >  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
> >  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
> It helps here (or at least saves a cache line). But not super invested
> either way.
> >
> >
> >>
> >> +    /* 6 bytes from cache-line.  */
> >>  # else
> >> -       KMOV    %VRDX, %k1
> >> +       /* Check if any search CHAR match in range.  */
> >> +       blsmsk  %VRSI, %VRSI
> >> +       and     %VRSI, %VRAX
> >> +       jz      L(ret2)
> >> +       bsr     %VRAX, %VRAX
> >> +#  ifdef USE_AS_WCSRCHR
> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >> +#  else
> >> +       addq    %rdi, %rax
> >> +#  endif
> >> +L(ret2):
> >> +       ret
> >> +       /* 3 bytes from cache-line.  */
> >>  # endif
> >> -
> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> >
> >
> > Please remove the VCOMPRESS definition as it's not used.
> >
> >>
> >> -       /* We could technically just jmp back after the vpcompress but
> >> -          it doesn't save any 16-byte blocks.  */
> >> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> >> -       test    %VRSI, %VRSI
> >> -       jnz     L(page_cross_return)
> >> -       jmp     L(page_cross_continue)
> >> -       /* 1-byte from cache line.  */
> >>  END(STRRCHR)
> >>  #endif
> >> --
> >> 2.34.1
> >>
>
Noah Goldstein Nov. 8, 2023, 9:21 p.m. UTC | #7
On Wed, Nov 8, 2023 at 2:08 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Wed, Nov 8, 2023 at 11:22 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>> >
>> >
>> >
>> > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >>
>> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> >> was missing the CPU_FEATURE checks for VBMI2 in the
>> >> ifunc/ifunc-impl-list.
>> >>
>> >> The fix is either to add those checks or change the logic to not use
>> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> >> implementation is usable on SKX.
>> >>
>> >> New implementation is a bit slower, but this is in a cold path so its
>> >> probably okay.
>> >> ---
>> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>> >>  1 file changed, 43 insertions(+), 21 deletions(-)
>> >>
>> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> >> index cd6a0a870a..b174d43208 100644
>> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>> >>         andl    $(PAGE_SIZE - 1), %eax
>> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>> >>         jg      L(cross_page_boundary)
>> >> -
>> >> +L(page_cross_continue):
>> >>         VMOVU   (%rdi), %VMM(1)
>> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>> >>         VPTESTN %VMM(1), %VMM(1), %k0
>> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>> >>         test    %VGPR(rsi), %VGPR(rsi)
>> >>         jz      L(aligned_more)
>> >>         /* fallthrough: zero CHAR in first VEC.  */
>> >> -L(page_cross_return):
>> >> +
>> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
>> >>         KMOV    %k1, %VGPR(rax)
>> >> +L(page_cross_return):
>> >>         /* Build mask up until first zero CHAR (used to mask of
>> >>            potential search CHAR matches past the end of the string).  */
>> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>> >>
>> >>         .p2align 4,, 12
>> >>  L(aligned_more):
>> >> -L(page_cross_continue):
>> >>         /* Need to keep original pointer incase VEC(1) has last match.  */
>> >>         movq    %rdi, %rsi
>> >>         andq    $-VEC_SIZE, %rdi
>> >> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>> >>         ret
>> >>
>> >> -       .p2align 4,, 4
>> >>  L(cross_page_boundary):
>> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> >> +          rax` sets pointer will all page offset bits cleared so
>> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> >> +          before page cross (guaranteed to be safe to read). Doing this
>> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> >> +          a bit of code size.  */
>> >>         xorq    %rdi, %rax
>> >> -       mov     $-1, %VRDX
>> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> >> -       VPTESTN %VMM(6), %VMM(6), %k0
>> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> >> +       VPTESTN %VMM(1), %VMM(1), %k0
>> >>         KMOV    %k0, %VRSI
>> >>
>> >> +       /* Shift out zero CHAR matches that are before the beginning of
>> >> +          src (rdi).  */
>> >>  # ifdef USE_AS_WCSRCHR
>> >>         movl    %edi, %ecx
>> >> -       and     $(VEC_SIZE - 1), %ecx
>> >> +       andl    $(VEC_SIZE - 1), %ecx
>> >>         shrl    $2, %ecx
>> >>  # endif
>> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> >> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>> >>
>> >> -# ifdef USE_AS_WCSRCHR
>> >> -       kmovw   %edx, %k1
>> >> +       test    %VRSI, %VRSI
>> >> +       jz      L(page_cross_continue)
>> >> +
>> >> +       /* Found zero CHAR so need to test for search CHAR.  */
>> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> >> +       KMOV    %k1, %VRAX
>> >> +       /* Shift out search CHAR matches that are before the beginning of
>> >> +          src (rdi).  */
>> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> >> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
>> >> +          so to save code-size just re-use return logic for first VEC.
>> >> +          This is relatively cold code (page cross).  */
>> >> +# if VEC_SIZE == 64
>> >> +       jmp     L(page_cross_return)
>> >
>> >
>> > Can you please remove unconditional jump. It's simply confusing and doesn't help across the board.
>> >
>> > $ tail -7 /tmp/wcsrchr-evex.out
>> >  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>> >  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
>> >  30b: 21 f0                 and    %esi,%eax
>> >  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
>> >  30f: 0f bd c0             bsr    %eax,%eax
>> >  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
>> >  316: c3                   ret
>> >
>> > $ tail -7 /tmp/wcsrchr-evex512.out
>> >  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
>> >  305: 85 f6                 test   %esi,%esi
>> >  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
>> >  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
>> >  314: c5 fb 93 c1           kmovd  %k1,%eax
>> >  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>> >  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
>> >
>
>
> Unnecessary unconditional jump here, as the entire code will fit in the same cache line.
> Also it will slow down execution.
>
> In general, I'm not in favor of unconditional jump unless it can provide significant perf boost.
>
The replacement code:
```
blsmsk %rsi,%rsi   : +5
and    %rsi,%rax   : +3
je_imm8            : +2
bsr    %rax,%rax   : +4
add    %rdi,%rax   : +3
ret                : +1
total              : 18
```
Starting at 0x334, that will spill to the next line.
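(Summing those estimates: 5 + 3 + 2 + 4 + 3 + 1 = 18 bytes; 0x334 + 18 =
0x346, which is past the 64-byte cache-line boundary at 0x340, hence the
spill into the next cache line for strrchr-evex512.)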

>> > $ tail -7 /tmp/strrchr-evex.out
>> >  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
>> >  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
>> >  2f2: 21 f0                 and    %esi,%eax
>> >  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
>> >  2f6: 0f bd c0             bsr    %eax,%eax
>> >  2f9: 48 01 f8             add    %rdi,%rax
>> >  2fc: c3                   ret
>> >
>> >  $ tail -7 /tmp/strrchr-evex512.out
>> >  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
>> >  31a: 48 85 f6             test   %rsi,%rsi
>> >  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
>> >  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
>> >  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
>> >  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
>> >  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
>> It helps here (or at least saves a cache line). But not super invested
>> either way.
>> >
>> >
>> >>
>> >> +    /* 6 bytes from cache-line.  */
>> >>  # else
>> >> -       KMOV    %VRDX, %k1
>> >> +       /* Check if any search CHAR match in range.  */
>> >> +       blsmsk  %VRSI, %VRSI
>> >> +       and     %VRSI, %VRAX
>> >> +       jz      L(ret2)
>> >> +       bsr     %VRAX, %VRAX
>> >> +#  ifdef USE_AS_WCSRCHR
>> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> >> +#  else
>> >> +       addq    %rdi, %rax
>> >> +#  endif
>> >> +L(ret2):
>> >> +       ret
>> >> +       /* 3 bytes from cache-line.  */
>> >>  # endif
>> >> -
>> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>> >
>> >
>> > Please remove the VCOMPRESS definition as it's not used.
>> >
>> >>
>> >> -       /* We could technically just jmp back after the vpcompress but
>> >> -          it doesn't save any 16-byte blocks.  */
>> >> -       shrx    %SHIFT_REG, %VRSI, %VRSI
>> >> -       test    %VRSI, %VRSI
>> >> -       jnz     L(page_cross_return)
>> >> -       jmp     L(page_cross_continue)
>> >> -       /* 1-byte from cache line.  */
>> >>  END(STRRCHR)
>> >>  #endif
>> >> --
>> >> 2.34.1
>> >>
Noah Goldstein Nov. 8, 2023, 9:22 p.m. UTC | #8
On Wed, Nov 8, 2023 at 3:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Nov 8, 2023 at 2:08 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> >
> >
> > On Wed, Nov 8, 2023 at 11:22 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >>
> >> On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >> >
> >> >
> >> >
> >> > On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >> >>
> >> >> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
> >> >> was missing the CPU_FEATURE checks for VBMI2 in the
> >> >> ifunc/ifunc-impl-list.
> >> >>
> >> >> The fix is either to add those checks or change the logic to not use
> >> >> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
> >> >> implementation is usable on SKX.
> >> >>
> >> >> New implementation is a bit slower, but this is in a cold path so its
> >> >> probably okay.
> >> >> ---
> >> >>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
> >> >>  1 file changed, 43 insertions(+), 21 deletions(-)
> >> >>
> >> >> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> >> index cd6a0a870a..b174d43208 100644
> >> >> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> >> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >> >> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >> >>         andl    $(PAGE_SIZE - 1), %eax
> >> >>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> >> >>         jg      L(cross_page_boundary)
> >> >> -
> >> >> +L(page_cross_continue):
> >> >>         VMOVU   (%rdi), %VMM(1)
> >> >>         /* k0 has a 1 for each zero CHAR in YMM1.  */
> >> >>         VPTESTN %VMM(1), %VMM(1), %k0
> >> >> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
> >> >>         test    %VGPR(rsi), %VGPR(rsi)
> >> >>         jz      L(aligned_more)
> >> >>         /* fallthrough: zero CHAR in first VEC.  */
> >> >> -L(page_cross_return):
> >> >> +
> >> >>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
> >> >>         VPCMPEQ %VMATCH, %VMM(1), %k1
> >> >>         KMOV    %k1, %VGPR(rax)
> >> >> +L(page_cross_return):
> >> >>         /* Build mask up until first zero CHAR (used to mask of
> >> >>            potential search CHAR matches past the end of the string).  */
> >> >>         blsmsk  %VGPR(rsi), %VGPR(rsi)
> >> >> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
> >> >>
> >> >>         .p2align 4,, 12
> >> >>  L(aligned_more):
> >> >> -L(page_cross_continue):
> >> >>         /* Need to keep original pointer incase VEC(1) has last match.  */
> >> >>         movq    %rdi, %rsi
> >> >>         andq    $-VEC_SIZE, %rdi
> >> >> @@ -340,34 +340,56 @@ L(return_new_match_ret):
> >> >>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >> >>         ret
> >> >>
> >> >> -       .p2align 4,, 4
> >> >>  L(cross_page_boundary):
> >> >> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> >> >> +          rax` sets pointer will all page offset bits cleared so
> >> >> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> >> >> +          before page cross (guaranteed to be safe to read). Doing this
> >> >> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> >> >> +          a bit of code size.  */
> >> >>         xorq    %rdi, %rax
> >> >> -       mov     $-1, %VRDX
> >> >> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> >> >> -       VPTESTN %VMM(6), %VMM(6), %k0
> >> >> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> >> >> +       VPTESTN %VMM(1), %VMM(1), %k0
> >> >>         KMOV    %k0, %VRSI
> >> >>
> >> >> +       /* Shift out zero CHAR matches that are before the beginning of
> >> >> +          src (rdi).  */
> >> >>  # ifdef USE_AS_WCSRCHR
> >> >>         movl    %edi, %ecx
> >> >> -       and     $(VEC_SIZE - 1), %ecx
> >> >> +       andl    $(VEC_SIZE - 1), %ecx
> >> >>         shrl    $2, %ecx
> >> >>  # endif
> >> >> -       shlx    %SHIFT_REG, %VRDX, %VRDX
> >> >> +       shrx    %SHIFT_REG, %VRSI, %VRSI
> >> >>
> >> >> -# ifdef USE_AS_WCSRCHR
> >> >> -       kmovw   %edx, %k1
> >> >> +       test    %VRSI, %VRSI
> >> >> +       jz      L(page_cross_continue)
> >> >> +
> >> >> +       /* Found zero CHAR so need to test for search CHAR.  */
> >> >> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
> >> >> +       KMOV    %k1, %VRAX
> >> >> +       /* Shift out search CHAR matches that are before the beginning of
> >> >> +          src (rdi).  */
> >> >> +       shrx    %SHIFT_REG, %VRAX, %VRAX
> >> >> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
> >> >> +          so to save code-size just re-use return logic for first VEC.
> >> >> +          This is relatively cold code (page cross).  */
> >> >> +# if VEC_SIZE == 64
> >> >> +       jmp     L(page_cross_return)
> >> >
> >> >
> >> > Can you please remove unconditional jump. It's simply confusing and doesn't help across the board.
> >> >
> >> > $ tail -7 /tmp/wcsrchr-evex.out
> >> >  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
> >> >  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
> >> >  30b: 21 f0                 and    %esi,%eax
> >> >  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
> >> >  30f: 0f bd c0             bsr    %eax,%eax
> >> >  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
> >> >  316: c3                   ret
> >> >
> >> > $ tail -7 /tmp/wcsrchr-evex512.out
> >> >  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
> >> >  305: 85 f6                 test   %esi,%esi
> >> >  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
> >> >  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
> >> >  314: c5 fb 93 c1           kmovd  %k1,%eax
> >> >  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
> >> >  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
> >> >
> >
> >
> > Unnecessary unconditional jump here, as the entire code will fit in the same cache line.
> > Also it will slow down execution.
> >
> > In general, I'm not in favor of unconditional jump unless it can provide significant perf boost.

Oh, you mean make the conditional apply only to strrchr-evex512, and exclude
wcsrchr-evex512.
Sure.
> >
> The repalcement code:
> ```
> blsmsk %rsi,%rsi            : +5
> and    %rsi,%rax             : +3
> je_imm32                        : +2
> bsr    %rax,%rax             : +4
> add    %rsi, %rax            : +3
> ret                                   : +1
>                                        :  + 18
> Starting at 0x334, that will spill to the next line.
>
> >> > $ tail -7 /tmp/strrchr-evex.out
> >> >  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
> >> >  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
> >> >  2f2: 21 f0                 and    %esi,%eax
> >> >  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
> >> >  2f6: 0f bd c0             bsr    %eax,%eax
> >> >  2f9: 48 01 f8             add    %rdi,%rax
> >> >  2fc: c3                   ret
> >> >
> >> >  $ tail -7 /tmp/strrchr-evex512.out
> >> >  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
> >> >  31a: 48 85 f6             test   %rsi,%rsi
> >> >  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
> >> >  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
> >> >  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
> >> >  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
> >> >  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
> >> It helps here (or at least saves a cache line). But not super invested
> >> either way.
> >> >
> >> >
> >> >>
> >> >> +    /* 6 bytes from cache-line.  */
> >> >>  # else
> >> >> -       KMOV    %VRDX, %k1
> >> >> +       /* Check if any search CHAR match in range.  */
> >> >> +       blsmsk  %VRSI, %VRSI
> >> >> +       and     %VRSI, %VRAX
> >> >> +       jz      L(ret2)
> >> >> +       bsr     %VRAX, %VRAX
> >> >> +#  ifdef USE_AS_WCSRCHR
> >> >> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >> >> +#  else
> >> >> +       addq    %rdi, %rax
> >> >> +#  endif
> >> >> +L(ret2):
> >> >> +       ret
> >> >> +       /* 3 bytes from cache-line.  */
> >> >>  # endif
> >> >> -
> >> >> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> >> >
> >> >
> >> > Please remove the VCOMPRESS definition as it's not used.
> >> >
> >> >>
> >> >> -       /* We could technically just jmp back after the vpcompress but
> >> >> -          it doesn't save any 16-byte blocks.  */
> >> >> -       shrx    %SHIFT_REG, %VRSI, %VRSI
> >> >> -       test    %VRSI, %VRSI
> >> >> -       jnz     L(page_cross_return)
> >> >> -       jmp     L(page_cross_continue)
> >> >> -       /* 1-byte from cache line.  */
> >> >>  END(STRRCHR)
> >> >>  #endif
> >> >> --
> >> >> 2.34.1
> >> >>
Noah Goldstein Nov. 8, 2023, 9:33 p.m. UTC | #9
On Wed, Nov 8, 2023 at 1:05 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Wed, Nov 1, 2023 at 3:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> strrchr-evex-base used `vpcompress{b|d}` in the page cross logic but
>> was missing the CPU_FEATURE checks for VBMI2 in the
>> ifunc/ifunc-impl-list.
>>
>> The fix is either to add those checks or change the logic to not use
>> `vpcompress{b|d}`. Choosing the latter here so that the strrchr-evex
>> implementation is usable on SKX.
>>
>> New implementation is a bit slower, but this is in a cold path so its
>> probably okay.
>> ---
>>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 64 +++++++++++++-------
>>  1 file changed, 43 insertions(+), 21 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> index cd6a0a870a..b174d43208 100644
>> --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
>> @@ -71,7 +71,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         andl    $(PAGE_SIZE - 1), %eax
>>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>>         jg      L(cross_page_boundary)
>> -
>> +L(page_cross_continue):
>>         VMOVU   (%rdi), %VMM(1)
>>         /* k0 has a 1 for each zero CHAR in YMM1.  */
>>         VPTESTN %VMM(1), %VMM(1), %k0
>> @@ -79,10 +79,11 @@ ENTRY_P2ALIGN(STRRCHR, 6)
>>         test    %VGPR(rsi), %VGPR(rsi)
>>         jz      L(aligned_more)
>>         /* fallthrough: zero CHAR in first VEC.  */
>> -L(page_cross_return):
>> +
>>         /* K1 has a 1 for each search CHAR match in VEC(1).  */
>>         VPCMPEQ %VMATCH, %VMM(1), %k1
>>         KMOV    %k1, %VGPR(rax)
>> +L(page_cross_return):
>>         /* Build mask up until first zero CHAR (used to mask of
>>            potential search CHAR matches past the end of the string).  */
>>         blsmsk  %VGPR(rsi), %VGPR(rsi)
>> @@ -167,7 +168,6 @@ L(first_vec_x1_return):
>>
>>         .p2align 4,, 12
>>  L(aligned_more):
>> -L(page_cross_continue):
>>         /* Need to keep original pointer incase VEC(1) has last match.  */
>>         movq    %rdi, %rsi
>>         andq    $-VEC_SIZE, %rdi
>> @@ -340,34 +340,56 @@ L(return_new_match_ret):
>>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>>         ret
>>
>> -       .p2align 4,, 4
>>  L(cross_page_boundary):
>> +       /* eax contains all the page offset bits of src (rdi). `xor rdi,
>> +          rax` sets pointer will all page offset bits cleared so
>> +          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
>> +          before page cross (guaranteed to be safe to read). Doing this
>> +          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
>> +          a bit of code size.  */
>>         xorq    %rdi, %rax
>> -       mov     $-1, %VRDX
>> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
>> -       VPTESTN %VMM(6), %VMM(6), %k0
>> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
>> +       VPTESTN %VMM(1), %VMM(1), %k0
>>         KMOV    %k0, %VRSI
>>
>> +       /* Shift out zero CHAR matches that are before the beginning of
>> +          src (rdi).  */
>>  # ifdef USE_AS_WCSRCHR
>>         movl    %edi, %ecx
>> -       and     $(VEC_SIZE - 1), %ecx
>> +       andl    $(VEC_SIZE - 1), %ecx
>>         shrl    $2, %ecx
>>  # endif
>> -       shlx    %SHIFT_REG, %VRDX, %VRDX
>> +       shrx    %SHIFT_REG, %VRSI, %VRSI
>>
>> -# ifdef USE_AS_WCSRCHR
>> -       kmovw   %edx, %k1
>> +       test    %VRSI, %VRSI
>> +       jz      L(page_cross_continue)
>> +
>> +       /* Found zero CHAR so need to test for search CHAR.  */
>> +       VPCMP   $0, %VMATCH, %VMM(1), %k1
>> +       KMOV    %k1, %VRAX
>> +       /* Shift out search CHAR matches that are before the beginning of
>> +          src (rdi).  */
>> +       shrx    %SHIFT_REG, %VRAX, %VRAX
>> +       /* For VEC_SIZE == 64 we are just at the end of a cacheline here
>> +          so to save code-size just re-use return logic for first VEC.
>> +          This is relatively cold code (page cross).  */
>> +# if VEC_SIZE == 64
>> +       jmp     L(page_cross_return)
>
>
> Can you please remove the unconditional jump? It's simply confusing and doesn't help across the board.
done for wcsrchr-evex512
>
> $ tail -7 /tmp/wcsrchr-evex.out
>  301: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>  306: c4 e2 48 f3 d6       blsmsk %esi,%esi
>  30b: 21 f0                 and    %esi,%eax
>  30d: 74 07                 je     316 <__wcsrchr_evex+0x316>
>  30f: 0f bd c0             bsr    %eax,%eax
>  312: 48 8d 04 87           lea    (%rdi,%rax,4),%rax
>  316: c3                   ret
>
> $ tail -7 /tmp/wcsrchr-evex512.out
>  300: c4 e2 73 f7 f6       shrx   %ecx,%esi,%esi
>  305: 85 f6                 test   %esi,%esi
>  307: 0f 84 0b fd ff ff     je     18 <__wcsrchr_evex512+0x18>
>  30d: 62 b3 75 40 1f c8 00 vpcmpeqd %zmm16,%zmm17,%k1
>  314: c5 fb 93 c1           kmovd  %k1,%eax
>  318: c4 e2 73 f7 c0       shrx   %ecx,%eax,%eax
>  31d: e9 18 fd ff ff       jmp    3a <__wcsrchr_evex512+0x3a>
>
> $ tail -7 /tmp/strrchr-evex.out
>  2e8: c4 e2 43 f7 c0       shrx   %edi,%eax,%eax
>  2ed: c4 e2 48 f3 d6       blsmsk %esi,%esi
>  2f2: 21 f0                 and    %esi,%eax
>  2f4: 74 06                 je     2fc <__strrchr_evex+0x2fc>
>  2f6: 0f bd c0             bsr    %eax,%eax
>  2f9: 48 01 f8             add    %rdi,%rax
>  2fc: c3                   ret
>
>  $ tail -7 /tmp/strrchr-evex512.out
>  315: c4 e2 c3 f7 f6       shrx   %rdi,%rsi,%rsi
>  31a: 48 85 f6             test   %rsi,%rsi
>  31d: 0f 84 f5 fc ff ff     je     18 <__strrchr_evex512+0x18>
>  323: 62 b3 75 40 3f c8 00 vpcmpeqb %zmm16,%zmm17,%k1
>  32a: c4 e1 fb 93 c1       kmovq  %k1,%rax
>  32f: c4 e2 c3 f7 c0       shrx   %rdi,%rax,%rax
>  334: e9 04 fd ff ff       jmp    3d <__strrchr_evex512+0x3d>
>
>
>>
>> +    /* 6 bytes from cache-line.  */
>>  # else
>> -       KMOV    %VRDX, %k1
>> +       /* Check if any search CHAR match in range.  */
>> +       blsmsk  %VRSI, %VRSI
>> +       and     %VRSI, %VRAX
>> +       jz      L(ret2)
>> +       bsr     %VRAX, %VRAX
>> +#  ifdef USE_AS_WCSRCHR
>> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>> +#  else
>> +       addq    %rdi, %rax
>> +#  endif
>> +L(ret2):
>> +       ret
>> +       /* 3 bytes from cache-line.  */
>>  # endif
>> -
>> -       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
>
>
> Please remove the VPCOMPRESS definition as it's not used.
done.
>
>>
>> -       /* We could technically just jmp back after the vpcompress but
>> -          it doesn't save any 16-byte blocks.  */
>> -       shrx    %SHIFT_REG, %VRSI, %VRSI
>> -       test    %VRSI, %VRSI
>> -       jnz     L(page_cross_return)
>> -       jmp     L(page_cross_continue)
>> -       /* 1-byte from cache line.  */
>>  END(STRRCHR)
>>  #endif
>> --
>> 2.34.1
>>
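To make the new L(cross_page_boundary) setup easier to follow, here is a
minimal C sketch of the two address/mask manipulations discussed above.  The
function names are made up, and PAGE_SIZE/VEC_SIZE are fixed here purely for
illustration (the real VEC_SIZE is 32 for the evex build and 64 for evex512).

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32   /* illustration only */

/* `andl $(PAGE_SIZE - 1), %eax` leaves the page-offset bits of src in
   eax, so `xorq %rdi, %rax` clears exactly those bits and yields the
   page base; PAGE_SIZE - VEC_SIZE from there is the last aligned VEC
   of src's page, which is always safe to read.  */
static inline const char *
last_vec_of_page (const char *src)
{
  uintptr_t page_off = (uintptr_t) src & (PAGE_SIZE - 1);
  uintptr_t page_base = (uintptr_t) src ^ page_off;
  return (const char *) (page_base + (PAGE_SIZE - VEC_SIZE));
}

/* The KMOV'd masks from that aligned load also cover CHARs that lie
   before src; `shrx %SHIFT_REG, ...` discards them.  For strrchr the
   shift count is src % VEC_SIZE; for wcsrchr it is additionally divided
   by CHAR_SIZE (the movl/andl/shrl $2 sequence) because each mask bit
   represents one wchar_t.  */
static inline uint64_t
discard_bits_before_src (uint64_t mask, uintptr_t src, unsigned int char_size)
{
  unsigned int shift = ((unsigned int) src & (VEC_SIZE - 1)) / char_size;
  return mask >> shift;
}

If the shifted NUL mask comes back zero, no NUL lies between src and the page
boundary and the code jumps back to L(page_cross_continue), i.e. the ordinary
unaligned first-VEC path.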
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index cd6a0a870a..b174d43208 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -71,7 +71,7 @@  ENTRY_P2ALIGN(STRRCHR, 6)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
-
+L(page_cross_continue):
 	VMOVU	(%rdi), %VMM(1)
 	/* k0 has a 1 for each zero CHAR in YMM1.  */
 	VPTESTN	%VMM(1), %VMM(1), %k0
@@ -79,10 +79,11 @@  ENTRY_P2ALIGN(STRRCHR, 6)
 	test	%VGPR(rsi), %VGPR(rsi)
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-L(page_cross_return):
+
 	/* K1 has a 1 for each search CHAR match in VEC(1).  */
 	VPCMPEQ	%VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VGPR(rax)
+L(page_cross_return):
 	/* Build mask up until first zero CHAR (used to mask off
 	   potential search CHAR matches past the end of the string).  */
 	blsmsk	%VGPR(rsi), %VGPR(rsi)
@@ -167,7 +168,6 @@  L(first_vec_x1_return):
 
 	.p2align 4,, 12
 L(aligned_more):
-L(page_cross_continue):
 	/* Need to keep original pointer in case VEC(1) has last match.  */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
@@ -340,34 +340,56 @@  L(return_new_match_ret):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 4
 L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets the pointer with all page offset bits cleared, so an
+	   offset of (PAGE_SIZE - VEC_SIZE) gives the last aligned VEC
+	   before the page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
 	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
 	KMOV	%k0, %VRSI
 
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
 	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
+	andl	$(VEC_SIZE - 1), %ecx
 	shrl	$2, %ecx
 # endif
-	shlx	%SHIFT_REG, %VRDX, %VRDX
+	shrx	%SHIFT_REG, %VRSI, %VRSI
 
-# ifdef USE_AS_WCSRCHR
-	kmovw	%edx, %k1
+	test	%VRSI, %VRSI
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrx	%SHIFT_REG, %VRAX, %VRAX
+	/* For VEC_SIZE == 64 we are just at the end of a cacheline here
+	   so to save code-size just re-use return logic for first VEC.
+	   This is relatively cold code (page cross).  */
+# if VEC_SIZE == 64
+	jmp	L(page_cross_return)
+    /* 6 bytes from cache-line.  */
 # else
-	KMOV	%VRDX, %k1
+	/* Check if any search CHAR match in range.  */
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret2)
+	bsr	%VRAX, %VRAX
+#  ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	addq	%rdi, %rax
+#  endif
+L(ret2):
+	ret
+	/* 3 bytes from cache-line.  */
 # endif
-
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks.  */
-	shrx	%SHIFT_REG, %VRSI, %VRSI
-	test	%VRSI, %VRSI
-	jnz	L(page_cross_return)
-	jmp	L(page_cross_continue)
-	/* 1-byte from cache line.  */
 END(STRRCHR)
 #endif
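Whatever code layout the page-cross path ends up with, it has to preserve
plain strrchr semantics: return the last occurrence of the search CHAR at or
before the terminating NUL (including a search for the NUL itself), or NULL
when there is none.  A trivial standalone sanity check of that contract (an
illustrative example, not part of glibc's test suite):

#include <assert.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const char *s = "abcabc";

  assert (strrchr (s, 'b') == s + 4);   /* last of the two 'b's */
  assert (strrchr (s, 'z') == NULL);    /* no occurrence */
  assert (strrchr (s, '\0') == s + 6);  /* the terminating NUL matches */

  puts ("ok");
  return 0;
}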