Message ID | 20221019004409.3623395-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v3,1/7] x86: Optimize memchr-evex.S and implement with VMM headers |
On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Optimizations are: > > 1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch > in short string case. > 2. Restructure code so that small strings are given the hot path. > - This is a net-zero on the benchmark suite but in general makes > sense as smaller sizes are far more common. > 3. Use more code-size efficient instructions. > - tzcnt ... -> bsf ... > - vpcmpb $0 ... -> vpcmpeq ... > 4. Align labels less aggressively, especially if it doesn't save fetch > blocks / causes the basic-block to span extra cache-lines. > > The optimizations (especially for point 2) make the memchr and > rawmemchr code essentially incompatible so split rawmemchr-evex > to a new file. > > Code Size Changes: > memchr-evex.S : -107 bytes > rawmemchr-evex.S : -53 bytes > > Net perf changes: > > Reported as geometric mean of all improvements / regressions from N=10 > runs of the benchtests. Value as New Time / Old Time so < 1.0 is > improvement and 1.0 is regression. > > memchr-evex.S : 0.928 > rawmemchr-evex.S : 0.986 (Less targets cross cache lines) > > Full results attached in email. > > Full check passes on x86-64. > --- > sysdeps/x86_64/multiarch/memchr-evex.S | 939 ++++++++++-------- > sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 9 +- > sysdeps/x86_64/multiarch/rawmemchr-evex.S | 313 +++++- > 3 files changed, 851 insertions(+), 410 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > index 0dd4f1dcce..23a1c0018e 100644 > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > @@ -21,17 +21,27 @@ > > #if ISA_SHOULD_BUILD (4) > > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > + > # ifndef MEMCHR > # define MEMCHR __memchr_evex > # endif > > # ifdef USE_AS_WMEMCHR > +# define PC_SHIFT_GPR rcx > +# define VPTESTN vptestnmd > # define VPBROADCAST vpbroadcastd > # define VPMINU vpminud > # define VPCMP vpcmpd > # define VPCMPEQ vpcmpeqd > # define CHAR_SIZE 4 > + > +# define USE_WIDE_CHAR > # else > +# define PC_SHIFT_GPR rdi > +# define VPTESTN vptestnmb > # define VPBROADCAST vpbroadcastb > # define VPMINU vpminub > # define VPCMP vpcmpb > @@ -39,534 +49,661 @@ > # define CHAR_SIZE 1 > # endif > > - /* In the 4x loop the RTM and non-RTM versions have data pointer > - off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater. > - This is represented by BASE_OFFSET. As well because the RTM > - version uses vpcmp which stores a bit per element compared where > - the non-RTM version uses vpcmpeq which stores a bit per byte > - compared RET_SCALE of CHAR_SIZE is only relevant for the RTM > - version. */ > -# ifdef USE_IN_RTM > +# include "reg-macros.h" > + > + > +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 > + doesn't have VEX encoding), use VEX encoding in loop so we > + can use vpcmpeqb + vptern which is more efficient than the > + EVEX alternative. 
*/ > +# if defined USE_IN_RTM || VEC_SIZE == 64 > +# undef COND_VZEROUPPER > +# undef VZEROUPPER_RETURN > +# undef VZEROUPPER > + > +# define COND_VZEROUPPER > +# define VZEROUPPER_RETURN ret > # define VZEROUPPER > -# define BASE_OFFSET (VEC_SIZE * 4) > -# define RET_SCALE CHAR_SIZE > + > +# define USE_TERN_IN_LOOP 0 > # else > +# define USE_TERN_IN_LOOP 1 > +# undef VZEROUPPER > # define VZEROUPPER vzeroupper > -# define BASE_OFFSET 0 > -# define RET_SCALE 1 > # endif > > - /* In the return from 4x loop memchr and rawmemchr versions have > - data pointers off by VEC_SIZE * 4 with memchr version being > - VEC_SIZE * 4 greater. */ > -# ifdef USE_AS_RAWMEMCHR > -# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4)) > -# define RAW_PTR_REG rcx > -# define ALGN_PTR_REG rdi > +# if USE_TERN_IN_LOOP > + /* Resulting bitmask for vpmovmskb has 4-bits set for each wchar > + so we don't want to multiply resulting index. */ > +# define TERN_CHAR_MULT 1 > + > +# ifdef USE_AS_WMEMCHR > +# define TEST_END() inc %VRCX > +# else > +# define TEST_END() add %rdx, %rcx > +# endif > # else > -# define RET_OFFSET BASE_OFFSET > -# define RAW_PTR_REG rdi > -# define ALGN_PTR_REG rcx > +# define TERN_CHAR_MULT CHAR_SIZE > +# define TEST_END() KORTEST %k2, %k3 > # endif > > -# define XMMZERO xmm23 > -# define YMMZERO ymm23 > -# define XMMMATCH xmm16 > -# define YMMMATCH ymm16 > -# define YMM1 ymm17 > -# define YMM2 ymm18 > -# define YMM3 ymm19 > -# define YMM4 ymm20 > -# define YMM5 ymm21 > -# define YMM6 ymm22 > +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > +# ifndef USE_AS_WMEMCHR > +# define GPR_X0_IS_RET 1 > +# else > +# define GPR_X0_IS_RET 0 > +# endif > +# define GPR_X0 rax > +# else > +# define GPR_X0_IS_RET 0 > +# define GPR_X0 rdx > +# endif > + > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > -# ifndef SECTION > -# define SECTION(p) p##.evex > +# if CHAR_PER_VEC == 64 > +# define LAST_VEC_OFFSET (VEC_SIZE * 3) > +# else > +# define LAST_VEC_OFFSET (VEC_SIZE * 2) > +# endif > +# if CHAR_PER_VEC >= 32 > +# define MASK_GPR(...) VGPR(__VA_ARGS__) > +# elif CHAR_PER_VEC == 16 > +# define MASK_GPR(reg) VGPR_SZ(reg, 16) > +# else > +# define MASK_GPR(reg) VGPR_SZ(reg, 8) > # endif > > -# define VEC_SIZE 32 > -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > -# define PAGE_SIZE 4096 > +# define VMATCH VMM(0) > +# define VMATCH_LO VMM_lo(0) > > - .section SECTION(.text),"ax",@progbits > +# define PAGE_SIZE 4096 > + > + > + .section SECTION(.text), "ax", @progbits > ENTRY_P2ALIGN (MEMCHR, 6) > -# ifndef USE_AS_RAWMEMCHR > /* Check for zero length. */ > test %RDX_LP, %RDX_LP > - jz L(zero) > + jz L(zero_0) > > -# ifdef __ILP32__ > +# ifdef __ILP32__ > /* Clear the upper 32 bits. */ > movl %edx, %edx > -# endif > # endif > - /* Broadcast CHAR to YMMMATCH. */ > - VPBROADCAST %esi, %YMMMATCH > + VPBROADCAST %esi, %VMATCH > /* Check if we may cross page boundary with one vector load. */ > movl %edi, %eax > andl $(PAGE_SIZE - 1), %eax > cmpl $(PAGE_SIZE - VEC_SIZE), %eax > - ja L(cross_page_boundary) > + ja L(page_cross) > + > + VPCMPEQ (%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > +# ifndef USE_AS_WMEMCHR > + /* If rcx is zero then tzcnt -> CHAR_PER_VEC. NB: there is a > + already a dependency between rcx and rsi so no worries about > + false-dep here. */ > + tzcnt %VRAX, %VRSI > + /* If rdx <= rsi then either 1) rcx was non-zero (there was a > + match) but it was out of bounds or 2) rcx was zero and rdx > + was <= VEC_SIZE so we are done scanning. 
*/ > + cmpq %rsi, %rdx > + /* NB: Use branch to return zero/non-zero. Common usage will > + branch on result of function (if return is null/non-null). > + This branch can be used to predict the ensuing one so there > + is no reason to extend the data-dependency with cmovcc. */ > + jbe L(zero_0) > + > + /* If rcx is zero then len must be > RDX, otherwise since we > + already tested len vs lzcnt(rcx) (in rsi) we are good to > + return this match. */ > + test %VRAX, %VRAX > + jz L(more_1x_vec) > + leaq (%rdi, %rsi), %rax > +# else > > - /* Check the first VEC_SIZE bytes. */ > - VPCMP $0, (%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* If length < CHAR_PER_VEC handle special. */ > + /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE > + > 1 so if rcx is tzcnt != CHAR_PER_VEC. */ > cmpq $CHAR_PER_VEC, %rdx > - jbe L(first_vec_x0) > -# endif > - testl %eax, %eax > - jz L(aligned_more) > - tzcntl %eax, %eax > -# ifdef USE_AS_WMEMCHR > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > + ja L(more_1x_vec) > + tzcnt %VRAX, %VRAX > + cmpl %eax, %edx > + jbe L(zero_0) > +L(first_vec_x0_ret): > leaq (%rdi, %rax, CHAR_SIZE), %rax > -# else > - addq %rdi, %rax > # endif > ret > > -# ifndef USE_AS_RAWMEMCHR > -L(zero): > - xorl %eax, %eax > - ret > - > - .p2align 4 > -L(first_vec_x0): > - /* Check if first match was before length. NB: tzcnt has false data- > - dependency on destination. eax already had a data-dependency on esi > - so this should have no affect here. */ > - tzcntl %eax, %esi > -# ifdef USE_AS_WMEMCHR > - leaq (%rdi, %rsi, CHAR_SIZE), %rdi > -# else > - addq %rsi, %rdi > -# endif > + /* Only fits in first cache line for VEC_SIZE == 32. */ > +# if VEC_SIZE == 32 > + .p2align 4,, 2 > +L(zero_0): > xorl %eax, %eax > - cmpl %esi, %edx > - cmovg %rdi, %rax > ret > # endif > > - .p2align 4 > -L(cross_page_boundary): > - /* Save pointer before aligning as its original value is > - necessary for computer return address if byte is found or > - adjusting length if it is not and this is memchr. */ > - movq %rdi, %rcx > - /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi > - for rawmemchr. */ > - andq $-VEC_SIZE, %ALGN_PTR_REG > - VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 > - kmovd %k0, %r8d > + .p2align 4,, 9 > +L(more_1x_vec): > # ifdef USE_AS_WMEMCHR > - /* NB: Divide shift count by 4 since each bit in K0 represent 4 > - bytes. */ > - sarl $2, %eax > -# endif > -# ifndef USE_AS_RAWMEMCHR > - movl $(PAGE_SIZE / CHAR_SIZE), %esi > - subl %eax, %esi > + /* If wmemchr still need to test if there was a match in first > + VEC. Use bsf to test here so we can reuse > + L(first_vec_x0_ret). */ > + bsf %VRAX, %VRAX > + jnz L(first_vec_x0_ret) > # endif > + > +L(page_cross_continue): > # ifdef USE_AS_WMEMCHR > - andl $(CHAR_PER_VEC - 1), %eax > -# endif > - /* Remove the leading bytes. */ > - sarxl %eax, %r8d, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* Check the end of data. */ > - cmpq %rsi, %rdx > - jbe L(first_vec_x0) > + /* We can't use end of the buffer to re-calculate length for > + wmemchr as len * CHAR_SIZE may overflow. */ > + leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > + sarq $2, %rax > + addq %rdx, %rax > +# else > + leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > # endif > - testl %eax, %eax > - jz L(cross_page_continue) > - tzcntl %eax, %eax > + > + /* rax contains remaining length - 1. 
-1 so we can get imm8 > + encoding in a few additional places saving code size. */ > + > + /* Needed regardless of remaining length. */ > + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRDX > + > + /* We cannot fold the above `sub %rdi, %rax` with the `cmp > + $(CHAR_PER_VEC * 2), %rax` because its possible for a very > + large length to overflow and cause the subtract to carry > + despite length being above CHAR_PER_VEC * 2. */ > + cmpq $(CHAR_PER_VEC * 2 - 1), %rax > + ja L(more_2x_vec) > +L(last_2x_vec): > + > + test %VRDX, %VRDX > + jnz L(first_vec_x1_check) > + > + /* Check the end of data. NB: use 8-bit operations to save code > + size. We no longer need the full-width of eax and will > + perform a write-only operation over eax so there will be no > + partial-register stalls. */ > + subb $(CHAR_PER_VEC * 1 - 1), %al > + jle L(zero_0) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > # ifdef USE_AS_WMEMCHR > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax > + /* For wmemchr against we can't take advantage of tzcnt(0) == > + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */ > + test %VRCX, %VRCX > + jz L(zero_0) > +# endif > + tzcnt %VRCX, %VRCX > + cmp %cl, %al > + > + /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32. We give > + fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is > + not enough space before the next cache line to fit the `lea` > + for return. */ > +# if VEC_SIZE == 64 > + ja L(first_vec_x2_ret) > +L(zero_0): > + xorl %eax, %eax > + ret > # else > - addq %RAW_PTR_REG, %rax > + jbe L(zero_0) > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > # endif > + > + .p2align 4,, 5 > +L(first_vec_x1_check): > + bsf %VRDX, %VRDX > + cmpb %dl, %al > + jb L(zero_4) > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > ret > > - .p2align 4 > -L(first_vec_x1): > - tzcntl %eax, %eax > - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax > + /* Fits at the end of the cache line here for VEC_SIZE == 32. > + */ > +# if VEC_SIZE == 32 > +L(zero_4): > + xorl %eax, %eax > ret > +# endif > > - .p2align 4 > + > + .p2align 4,, 4 > L(first_vec_x2): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + bsf %VRCX, %VRCX > +L(first_vec_x2_ret): > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > ret > > - .p2align 4 > -L(first_vec_x3): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + /* Fits at the end of the cache line here for VEC_SIZE == 64. > + */ > +# if VEC_SIZE == 64 > +L(zero_4): > + xorl %eax, %eax > ret > +# endif > > - .p2align 4 > -L(first_vec_x4): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > + .p2align 4,, 4 > +L(first_vec_x1): > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > ret > > - .p2align 5 > -L(aligned_more): > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > > -# ifndef USE_AS_RAWMEMCHR > - /* Align data to VEC_SIZE. */ > -L(cross_page_continue): > - xorl %ecx, %ecx > - subl %edi, %ecx > - andq $-VEC_SIZE, %rdi > - /* esi is for adjusting length to see if near the end. */ > - leal (VEC_SIZE * 5)(%rdi, %rcx), %esi > -# ifdef USE_AS_WMEMCHR > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarl $2, %esi > -# endif > -# else > - andq $-VEC_SIZE, %rdi > -L(cross_page_continue): > -# endif > - /* Load first VEC regardless. 
*/ > - VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* Adjust length. If near end handle specially. */ > - subq %rsi, %rdx > - jbe L(last_4x_vec_or_less) > -# endif > - testl %eax, %eax > + .p2align 4,, 5 > +L(more_2x_vec): > + /* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking > + length. */ > + > + > + /* Already computed matches for first VEC in rdx. */ > + test %VRDX, %VRDX > jnz L(first_vec_x1) > > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > jnz L(first_vec_x2) > > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + /* Needed regardless of next length check. */ > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + > + /* Check if we are near the end. */ > + cmpq $(CHAR_PER_VEC * 4 - 1), %rax > + ja L(more_4x_vec) > + > + test %VRCX, %VRCX > + jnz L(first_vec_x3_check) > + > + /* Use 8-bit instructions to save code size. We won't use full- > + width eax again and will perform a write-only operation to > + eax so no worries about partial-register stalls. */ > + subb $(CHAR_PER_VEC * 3), %al > + jb L(zero_2) > +L(last_vec_check): > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > +# ifdef USE_AS_WMEMCHR > + /* For wmemchr against we can't take advantage of tzcnt(0) == > + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */ > + test %VRCX, %VRCX > + jz L(zero_2) > +# endif > + tzcnt %VRCX, %VRCX > + cmp %cl, %al > + jae L(first_vec_x4_ret) > +L(zero_2): > + xorl %eax, %eax > + ret > + > + /* Fits at the end of the cache line here for VEC_SIZE == 64. > + For VEC_SIZE == 32 we put the return label at the end of > + L(first_vec_x4). */ > +# if VEC_SIZE == 64 > +L(first_vec_x4_ret): > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > +# endif > + > + .p2align 4,, 6 > +L(first_vec_x4): > + bsf %VRCX, %VRCX > +# if VEC_SIZE == 32 > + /* Place L(first_vec_x4_ret) here as we can't fit it in the same > + cache line as where it is called from so we might as well > + save code size by reusing return of L(first_vec_x4). */ > +L(first_vec_x4_ret): > +# endif > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > + > + .p2align 4,, 6 > +L(first_vec_x3_check): > + /* Need to adjust remaining length before checking. */ > + addb $-(CHAR_PER_VEC * 2), %al > + bsf %VRCX, %VRCX > + cmpb %cl, %al > + jb L(zero_2) > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > + > + .p2align 4,, 6 > +L(first_vec_x3): > + bsf %VRCX, %VRCX > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > + > + .p2align 4,, 3 > +# if !USE_TERN_IN_LOOP > + .p2align 4,, 10 > +# endif > +L(more_4x_vec): > + test %VRCX, %VRCX > jnz L(first_vec_x3) > > - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > jnz L(first_vec_x4) > > + subq $-(VEC_SIZE * 5), %rdi > + subq $(CHAR_PER_VEC * 8), %rax > + jb L(last_4x_vec) > > -# ifndef USE_AS_RAWMEMCHR > - /* Check if at last CHAR_PER_VEC * 4 length. */ > - subq $(CHAR_PER_VEC * 4), %rdx > - jbe L(last_4x_vec_or_less_cmpeq) > - /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */ > - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi > - > - /* Align data to VEC_SIZE * 4 for the loop and readjust length. 
> - */ > -# ifdef USE_AS_WMEMCHR > +# ifdef USE_AS_WMEMCHR > movl %edi, %ecx > - andq $-(4 * VEC_SIZE), %rdi > +# else > + addq %rdi, %rax > +# endif > + > + > +# if VEC_SIZE == 64 > + /* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`. No evex > + processor has partial register stalls (all have merging > + uop). If that changes this can be removed. */ > + xorb %dil, %dil > +# else > + andq $-(VEC_SIZE * 4), %rdi > +# endif > + > +# ifdef USE_AS_WMEMCHR > subl %edi, %ecx > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > sarl $2, %ecx > - addq %rcx, %rdx > -# else > - addq %rdi, %rdx > - andq $-(4 * VEC_SIZE), %rdi > - subq %rdi, %rdx > -# endif > + addq %rcx, %rax > # else > - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi > - andq $-(4 * VEC_SIZE), %rdi > + subq %rdi, %rax > # endif > -# ifdef USE_IN_RTM > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > -# else > - /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not > - encodable with EVEX registers (ymm16-ymm31). */ > - vmovdqa64 %YMMMATCH, %ymm0 > + > + > + > +# if USE_TERN_IN_LOOP > + /* copy VMATCH to low ymm so we can use vpcmpeq which is not > + encodable with EVEX registers. NB: this is VEC_SIZE == 32 > + only as there is no way to encode vpcmpeq with zmm0-15. */ > + vmovdqa64 %VMATCH, %VMATCH_LO > # endif > > - /* Compare 4 * VEC at a time forward. */ > - .p2align 4 > + .p2align 4,, 11 > L(loop_4x_vec): > - /* Two versions of the loop. One that does not require > - vzeroupper by not using ymm0-ymm15 and another does that require > - vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15 > - is used at all is because there is no EVEX encoding vpcmpeq and > - with vpcmpeq this loop can be performed more efficiently. The > - non-vzeroupper version is safe for RTM while the vzeroupper > - version should be prefered if RTM are not supported. */ > -# ifdef USE_IN_RTM > - /* It would be possible to save some instructions using 4x VPCMP > - but bottleneck on port 5 makes it not woth it. */ > - VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 > - /* xor will set bytes match esi to zero. */ > - vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 > - vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 > - VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 > - /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ > - VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} > - VPCMP $0, %YMM3, %YMMZERO, %k2 > -# else > + /* Two versions of the loop. One that does not require > + vzeroupper by not using ymmm0-15 and another does that > + require vzeroupper because it uses ymmm0-15. The reason why > + ymm0-15 is used at all is because there is no EVEX encoding > + vpcmpeq and with vpcmpeq this loop can be performed more > + efficiently. The non-vzeroupper version is safe for RTM > + while the vzeroupper version should be prefered if RTM are > + not supported. Which loop version we use is determined by > + USE_TERN_IN_LOOP. */ > + > +# if USE_TERN_IN_LOOP > /* Since vptern can only take 3x vectors fastest to do 1 vec > seperately with EVEX vpcmp. */ > # ifdef USE_AS_WMEMCHR > /* vptern can only accept masks for epi32/epi64 so can only save > - instruction using not equals mask on vptern with wmemchr. */ > - VPCMP $4, (%rdi), %YMMMATCH, %k1 > + instruction using not equals mask on vptern with wmemchr. > + */ > + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 > # else > - VPCMP $0, (%rdi), %YMMMATCH, %k1 > + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 > # endif > /* Compare 3x with vpcmpeq and or them all together with vptern. 
> */ > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 > + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2) > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) > # ifdef USE_AS_WMEMCHR > - /* This takes the not of or between ymm2, ymm3, ymm4 as well as > - combines result from VEC0 with zero mask. */ > - vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z} > - vpmovmskb %ymm4, %ecx > + /* This takes the not of or between VEC_lo(2), VEC_lo(3), > + VEC_lo(4) as well as combines result from VEC(0) with zero > + mask. */ > + vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z} > + vpmovmskb %VMM_lo(4), %VRCX > # else > - /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */ > - vpternlogd $254, %ymm2, %ymm3, %ymm4 > - vpmovmskb %ymm4, %ecx > - kmovd %k1, %eax > + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into > + VEC_lo(4). */ > + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) > + vpmovmskb %VMM_lo(4), %VRCX > + KMOV %k1, %edx > # endif > -# endif > > -# ifdef USE_AS_RAWMEMCHR > - subq $-(VEC_SIZE * 4), %rdi > -# endif > -# ifdef USE_IN_RTM > - kortestd %k2, %k3 > # else > -# ifdef USE_AS_WMEMCHR > - /* ecx contains not of matches. All 1s means no matches. incl will > - overflow and set zeroflag if that is the case. */ > - incl %ecx > -# else > - /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding > - to ecx is not an issue because if eax is non-zero it will be > - used for returning the match. If it is zero the add does > - nothing. */ > - addq %rax, %rcx > -# endif > + /* Loop version that uses EVEX encoding. */ > + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 > + vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2) > + vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3) > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPTESTN %VMM(3), %VMM(3), %k2 > # endif > -# ifdef USE_AS_RAWMEMCHR > - jz L(loop_4x_vec) > -# else > - jnz L(loop_4x_vec_end) > + > + > + TEST_END () > + jnz L(loop_vec_ret) > > subq $-(VEC_SIZE * 4), %rdi > > - subq $(CHAR_PER_VEC * 4), %rdx > - ja L(loop_4x_vec) > + subq $(CHAR_PER_VEC * 4), %rax > + jae L(loop_4x_vec) > > - /* Fall through into less than 4 remaining vectors of length case. > + /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop. > */ > - VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0 > - addq $(BASE_OFFSET - VEC_SIZE), %rdi > - kmovd %k0, %eax > - VZEROUPPER > - > -L(last_4x_vec_or_less): > - /* Check if first VEC contained match. */ > - testl %eax, %eax > - jnz L(first_vec_x1_check) > + COND_VZEROUPPER > > - /* If remaining length > CHAR_PER_VEC * 2. */ > - addl $(CHAR_PER_VEC * 2), %edx > - jg L(last_4x_vec) > - > -L(last_2x_vec): > - /* If remaining length < CHAR_PER_VEC. */ > - addl $CHAR_PER_VEC, %edx > - jle L(zero_end) > - > - /* Check VEC2 and compare any match with remaining length. */ > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - tzcntl %eax, %eax > - cmpl %eax, %edx > - jbe L(set_zero_end) > - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > -L(zero_end): > - ret > + .p2align 4,, 10 > +L(last_4x_vec): > + /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit > + instructions on eax from here on out. 
*/ > +# if CHAR_PER_VEC != 64 > + andl $(CHAR_PER_VEC * 4 - 1), %eax > +# endif > + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0 > + subq $(VEC_SIZE * 1), %rdi > + KMOV %k0, %VRDX > + cmpb $(CHAR_PER_VEC * 2 - 1), %al > + jbe L(last_2x_vec) > + test %VRDX, %VRDX > + jnz L(last_vec_x1_novzero) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_x2_novzero) > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(first_vec_x3_check) > + > + subb $(CHAR_PER_VEC * 3), %al > + jae L(last_vec_check) > > -L(set_zero_end): > xorl %eax, %eax > ret > > - .p2align 4 > -L(first_vec_x1_check): > - /* eax must be non-zero. Use bsfl to save code size. */ > - bsfl %eax, %eax > - /* Adjust length. */ > - subl $-(CHAR_PER_VEC * 4), %edx > - /* Check if match within remaining length. */ > - cmpl %eax, %edx > - jbe L(set_zero_end) > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax > +# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP > +L(last_vec_x2_novzero): > + addq $VEC_SIZE, %rdi > +L(last_vec_x1_novzero): > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > ret > +# endif > > - .p2align 4 > -L(loop_4x_vec_end): > +# if CHAR_PER_VEC == 64 > + /* Since we can't combine the last 2x VEC when CHAR_PER_VEC == > + 64 it needs a seperate return label. */ > + .p2align 4,, 4 > +L(last_vec_x2): > +L(last_vec_x2_novzero): > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax > + ret > # endif > - /* rawmemchr will fall through into this if match was found in > - loop. */ > > -# if defined USE_IN_RTM || defined USE_AS_WMEMCHR > - /* k1 has not of matches with VEC1. */ > - kmovd %k1, %eax > -# ifdef USE_AS_WMEMCHR > - subl $((1 << CHAR_PER_VEC) - 1), %eax > -# else > - incl %eax > -# endif > + .p2align 4,, 4 > +L(loop_vec_ret): > +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > + KMOV %k1, %VRAX > + inc %MASK_GPR(rax) > # else > - /* eax already has matches for VEC1. */ > - testl %eax, %eax > + test %VRDX, %VRDX > # endif > - jnz L(last_vec_x1_return) > + jnz L(last_vec_x0) > > -# ifdef USE_IN_RTM > - VPCMP $0, %YMM2, %YMMZERO, %k0 > - kmovd %k0, %eax > + > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(2), %VRDX > # else > - vpmovmskb %ymm2, %eax > + VPTESTN %VMM(2), %VMM(2), %k1 > + KMOV %k1, %VRDX > # endif > - testl %eax, %eax > - jnz L(last_vec_x2_return) > + test %VRDX, %VRDX > + jnz L(last_vec_x1) > > -# ifdef USE_IN_RTM > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(last_vec_x3_return) > > - kmovd %k3, %eax > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(3), %VRDX > # else > - vpmovmskb %ymm3, %eax > - /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */ > - salq $VEC_SIZE, %rcx > - orq %rcx, %rax > - tzcntq %rax, %rax > - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax > - VZEROUPPER > + KMOV %k2, %VRDX > # endif > - ret > > - .p2align 4,, 10 > -L(last_vec_x1_return): > - tzcntl %eax, %eax > -# if defined USE_AS_WMEMCHR || RET_OFFSET != 0 > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax > + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper > + (only if used VEX encoded loop). */ > + COND_VZEROUPPER > + > + /* Seperate logic for CHAR_PER_VEC == 64 vs the rest. 
For > + CHAR_PER_VEC we test the last 2x VEC seperately, for > + CHAR_PER_VEC <= 32 we can combine the results from the 2x > + VEC in a single GPR. */ > +# if CHAR_PER_VEC == 64 > +# if USE_TERN_IN_LOOP > +# error "Unsupported" > +# endif > + > + > + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ > + test %VRDX, %VRDX > + jnz L(last_vec_x2) > + KMOV %k3, %VRDX > # else > - addq %rdi, %rax > + /* CHAR_PER_VEC <= 32 so we can combine the results from the > + last 2x VEC. */ > + > +# if !USE_TERN_IN_LOOP > + KMOV %k3, %VRCX > +# endif > + salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx > + addq %rcx, %rdx > +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > +L(last_vec_x2_novzero): > +# endif > # endif > - VZEROUPPER > + bsf %rdx, %rdx > + leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax > ret > > - .p2align 4 > -L(last_vec_x2_return): > - tzcntl %eax, %eax > - /* NB: Multiply bytes by RET_SCALE to get the wchar_t count > - if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and > - USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */ > - leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax > - VZEROUPPER > + .p2align 4,, 8 > +L(last_vec_x1): > + COND_VZEROUPPER > +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > +L(last_vec_x1_novzero): > +# endif > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax > ret > > -# ifdef USE_IN_RTM > - .p2align 4 > -L(last_vec_x3_return): > - tzcntl %eax, %eax > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax > + > + .p2align 4,, 4 > +L(last_vec_x0): > + COND_VZEROUPPER > + bsf %VGPR(GPR_X0), %VGPR(GPR_X0) > +# if GPR_X0_IS_RET > + addq %rdi, %rax > +# else > + leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax > +# endif > ret > + > + .p2align 4,, 6 > +L(page_cross): > + /* Need to preserve eax to compute inbound bytes we are > + checking. */ > +# ifdef USE_AS_WMEMCHR > + movl %eax, %ecx > +# else > + xorl %ecx, %ecx > + subl %eax, %ecx > # endif > > -# ifndef USE_AS_RAWMEMCHR > - .p2align 4,, 5 > -L(last_4x_vec_or_less_cmpeq): > - VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - subq $-(VEC_SIZE * 4), %rdi > - /* Check first VEC regardless. */ > - testl %eax, %eax > - jnz L(first_vec_x1_check) > + xorq %rdi, %rax > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 > + KMOV %k0, %VRAX > > - /* If remaining length <= CHAR_PER_VEC * 2. */ > - addl $(CHAR_PER_VEC * 2), %edx > - jle L(last_2x_vec) > +# ifdef USE_AS_WMEMCHR > + /* NB: Divide by CHAR_SIZE to shift out out of bounds bytes. */ > + shrl $2, %ecx > + andl $(CHAR_PER_VEC - 1), %ecx > +# endif > > - .p2align 4 > -L(last_4x_vec): > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > > + shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX > > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - /* Create mask for possible matches within remaining length. */ > -# ifdef USE_AS_WMEMCHR > - movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx > - bzhil %edx, %ecx, %ecx > -# else > - movq $-1, %rcx > - bzhiq %rdx, %rcx, %rcx > -# endif > - /* Test matches in data against length match. */ > - andl %ecx, %eax > - jnz L(last_vec_x3) > +# ifdef USE_AS_WMEMCHR > + negl %ecx > +# endif > > - /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after > - remaining length was found to be > CHAR_PER_VEC * 2. 
*/ > - subl $CHAR_PER_VEC, %edx > - jbe L(zero_end2) > + /* mask lower bits from ecx (negative eax) to get bytes till > + next VEC. */ > + andl $(CHAR_PER_VEC - 1), %ecx > > + /* Check if VEC is entirely contained in the remainder of the > + page. */ > + cmpq %rcx, %rdx > + jbe L(page_cross_ret) > > - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - /* Shift remaining length mask for last VEC. */ > -# ifdef USE_AS_WMEMCHR > - shrl $CHAR_PER_VEC, %ecx > -# else > - shrq $CHAR_PER_VEC, %rcx > -# endif > - andl %ecx, %eax > - jz L(zero_end2) > - bsfl %eax, %eax > - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > -L(zero_end2): > - ret > + /* Length crosses the page so if rax is zero (no matches) > + continue. */ > + test %VRAX, %VRAX > + jz L(page_cross_continue) > > -L(last_vec_x2): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + /* if rdx > rcx then any match here must be in [buf:buf + len]. > + */ > + tzcnt %VRAX, %VRAX > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + addq %rdi, %rax > +# endif > ret > > - .p2align 4 > -L(last_vec_x3): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + .p2align 4,, 2 > +L(page_cross_zero): > + xorl %eax, %eax > ret > + > + .p2align 4,, 4 > +L(page_cross_ret): > + /* Search is entirely contained in page cross case. */ > +# ifdef USE_AS_WMEMCHR > + test %VRAX, %VRAX > + jz L(page_cross_zero) > +# endif > + tzcnt %VRAX, %VRAX > + cmpl %eax, %edx > + jbe L(page_cross_zero) > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + addq %rdi, %rax > # endif > - /* 7 bytes from next cache line. */ > + ret > END (MEMCHR) > #endif > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > index deda1ca395..2073eaa620 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > @@ -1,3 +1,6 @@ > -#define MEMCHR __rawmemchr_evex_rtm > -#define USE_AS_RAWMEMCHR 1 > -#include "memchr-evex-rtm.S" > +#define RAWMEMCHR __rawmemchr_evex_rtm > + > +#define USE_IN_RTM 1 > +#define SECTION(p) p##.evex.rtm > + > +#include "rawmemchr-evex.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > index dc1c450699..dad54def2b 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > @@ -1,7 +1,308 @@ > -#ifndef RAWMEMCHR > -# define RAWMEMCHR __rawmemchr_evex > -#endif > -#define USE_AS_RAWMEMCHR 1 > -#define MEMCHR RAWMEMCHR > +/* rawmemchr optimized with 256-bit EVEX instructions. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include <isa-level.h> > +#include <sysdep.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > + > +# ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex > +# endif > + > + > +# define PC_SHIFT_GPR rdi > +# define REG_WIDTH VEC_SIZE > +# define VPTESTN vptestnmb > +# define VPBROADCAST vpbroadcastb > +# define VPMINU vpminub > +# define VPCMP vpcmpb > +# define VPCMPEQ vpcmpeqb > +# define CHAR_SIZE 1 > + > +# include "reg-macros.h" > + > +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 > + doesn't have VEX encoding), use VEX encoding in loop so we > + can use vpcmpeqb + vptern which is more efficient than the > + EVEX alternative. */ > +# if defined USE_IN_RTM || VEC_SIZE == 64 > +# undef COND_VZEROUPPER > +# undef VZEROUPPER_RETURN > +# undef VZEROUPPER > + > + > +# define COND_VZEROUPPER > +# define VZEROUPPER_RETURN ret > +# define VZEROUPPER > + > +# define USE_TERN_IN_LOOP 0 > +# else > +# define USE_TERN_IN_LOOP 1 > +# undef VZEROUPPER > +# define VZEROUPPER vzeroupper > +# endif > + > +# define CHAR_PER_VEC VEC_SIZE > + > +# if CHAR_PER_VEC == 64 > + > +# define TAIL_RETURN_LBL first_vec_x2 > +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) > + > +# define FALLTHROUGH_RETURN_LBL first_vec_x3 > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) > + > +# else /* !(CHAR_PER_VEC == 64) */ > + > +# define TAIL_RETURN_LBL first_vec_x3 > +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) > + > +# define FALLTHROUGH_RETURN_LBL first_vec_x2 > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) > +# endif /* !(CHAR_PER_VEC == 64) */ > + > + > +# define VMATCH VMM(0) > +# define VMATCH_LO VMM_lo(0) > + > +# define PAGE_SIZE 4096 > + > + .section SECTION(.text), "ax", @progbits > +ENTRY_P2ALIGN (RAWMEMCHR, 6) > + VPBROADCAST %esi, %VMATCH > + /* Check if we may cross page boundary with one vector load. */ > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + VPCMPEQ (%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + > + test %VRAX, %VRAX > + jz L(aligned_more) > +L(first_vec_x0): > + bsf %VRAX, %VRAX > + addq %rdi, %rax > + ret > + > + .p2align 4,, 4 > +L(first_vec_x4): > + bsf %VRAX, %VRAX > + leaq (VEC_SIZE * 4)(%rdi, %rax), %rax > + ret > > -#include "memchr-evex.S" > + /* For VEC_SIZE == 32 we can fit this in aligning bytes so might > + as well place it more locally. For VEC_SIZE == 64 we reuse > + return code at the end of loop's return. */ > +# if VEC_SIZE == 32 > + .p2align 4,, 4 > +L(FALLTHROUGH_RETURN_LBL): > + bsf %VRAX, %VRAX > + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax > + ret > +# endif > + > + .p2align 4,, 6 > +L(page_cross): > + /* eax has lower page-offset bits of rdi so xor will zero them > + out. */ > + xorq %rdi, %rax > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 > + KMOV %k0, %VRAX > + > + /* Shift out out-of-bounds matches. */ > + shrx %VRDI, %VRAX, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x0) > + > + .p2align 4,, 10 > +L(aligned_more): > +L(page_cross_continue): > + /* Align pointer. 
*/ > + andq $(VEC_SIZE * -1), %rdi > + > + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x1) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x2) > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x3) > + > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x4) > + > + subq $-(VEC_SIZE * 1), %rdi > +# if VEC_SIZE == 64 > + /* Saves code size. No evex512 processor has partial register > + stalls. If that change this can be replaced with `andq > + $-(VEC_SIZE * 4), %rdi`. */ > + xorb %dil, %dil > +# else > + andq $-(VEC_SIZE * 4), %rdi > +# endif > + > +# if USE_TERN_IN_LOOP > + /* copy VMATCH to low ymm so we can use vpcmpeq which is not > + encodable with EVEX registers. NB: this is VEC_SIZE == 32 > + only as there is no way to encode vpcmpeq with zmm0-15. */ > + vmovdqa64 %VMATCH, %VMATCH_LO > +# endif > + > + .p2align 4 > +L(loop_4x_vec): > + /* Two versions of the loop. One that does not require > + vzeroupper by not using ymm0-15 and another does that > + require vzeroupper because it uses ymm0-15. The reason why > + ymm0-15 is used at all is because there is no EVEX encoding > + vpcmpeq and with vpcmpeq this loop can be performed more > + efficiently. The non-vzeroupper version is safe for RTM > + while the vzeroupper version should be prefered if RTM are > + not supported. Which loop version we use is determined by > + USE_TERN_IN_LOOP. */ > + > +# if USE_TERN_IN_LOOP > + /* Since vptern can only take 3x vectors fastest to do 1 vec > + seperately with EVEX vpcmp. */ > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 > + /* Compare 3x with vpcmpeq and or them all together with vptern. > + */ > + > + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2) > + subq $(VEC_SIZE * -4), %rdi > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) > + > + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into > + VEC_lo(4). */ > + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) > + vpmovmskb %VMM_lo(4), %VRCX > + > + KMOV %k1, %eax > + > + /* NB: rax has match from first VEC and rcx has matches from > + VEC 2-4. If rax is non-zero we will return that match. If > + rax is zero adding won't disturb the bits in rcx. */ > + add %rax, %rcx > +# else > + /* Loop version that uses EVEX encoding. */ > + VPCMP $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 > + vpxorq (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2) > + vpxorq (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3) > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPTESTN %VMM(3), %VMM(3), %k2 > + subq $(VEC_SIZE * -4), %rdi > + KORTEST %k2, %k3 > +# endif > + jz L(loop_4x_vec) > + > +# if USE_TERN_IN_LOOP > + test %VRAX, %VRAX > +# else > + KMOV %k1, %VRAX > + inc %VRAX > +# endif > + jnz L(last_vec_x0) > + > + > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(2), %VRAX > +# else > + VPTESTN %VMM(2), %VMM(2), %k1 > + KMOV %k1, %VRAX > +# endif > + test %VRAX, %VRAX > + jnz L(last_vec_x1) > + > + > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(3), %VRAX > +# else > + KMOV %k2, %VRAX > +# endif > + > + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper > + (only if used VEX encoded loop). */ > + COND_VZEROUPPER > + > + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for > + returning last 2x VEC. 
For VEC_SIZE == 64 we test each VEC > + individually, for VEC_SIZE == 32 we combine them in a single > + 64-bit GPR. */ > +# if CHAR_PER_VEC == 64 > +# if USE_TERN_IN_LOOP > +# error "Unsupported" > +# endif > + > + > + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ > + test %VRAX, %VRAX > + jnz L(first_vec_x2) > + KMOV %k3, %VRAX > +L(FALLTHROUGH_RETURN_LBL): > +# else > + /* CHAR_PER_VEC <= 32 so we can combine the results from the > + last 2x VEC. */ > +# if !USE_TERN_IN_LOOP > + KMOV %k3, %VRCX > +# endif > + salq $CHAR_PER_VEC, %rcx > + addq %rcx, %rax > +# endif > + bsf %rax, %rax > + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax > + ret > + > + .p2align 4,, 8 > +L(TAIL_RETURN_LBL): > + bsf %rax, %rax > + leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax > + ret > + > + .p2align 4,, 8 > +L(last_vec_x1): > + COND_VZEROUPPER > +L(first_vec_x1): > + bsf %VRAX, %VRAX > + leaq (VEC_SIZE * 1)(%rdi, %rax), %rax > + ret > + > + .p2align 4,, 8 > +L(last_vec_x0): > + COND_VZEROUPPER > + bsf %VRAX, %VRAX > + addq %rdi, %rax > + ret > +END (RAWMEMCHR) > +#endif > -- > 2.34.1 > LGTM. Thanks.
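
The short-string win described in point 1 of the commit message comes from tzcnt being defined to return the operand width for a zero source, so a single length compare covers both "no match in this vector" and "match located past the end". A minimal C sketch of that idea (editor's illustration with a made-up helper name, not code from the patch; it assumes the 256-bit build, i.e. VEC_SIZE == 32, and a BMI1 target):

```c
/* Editor's illustration of the tzcnt(0) == VEC_SIZE trick; not part of
   the patch, and the helper name is made up.  Requires BMI1
   (e.g. compile with -mbmi).  */
#include <immintrin.h>
#include <stddef.h>

#define VEC_SIZE 32	/* assuming the evex256 build */

static inline const char *
first_match_in_vec (const char *p, unsigned int mask, size_t len)
{
  /* _tzcnt_u32 (0) is defined as 32 (== VEC_SIZE), unlike bsf whose
     destination is undefined for a zero source.  */
  unsigned int idx = _tzcnt_u32 (mask);

  /* One compare handles both cases: no match at all (idx == VEC_SIZE,
     which is >= len whenever len <= VEC_SIZE) and a match located at
     or past len.  */
  if (len <= idx)
    return NULL;
  return p + idx;
}
```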
*/ - salq $VEC_SIZE, %rcx - orq %rcx, %rax - tzcntq %rax, %rax - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax - VZEROUPPER + KMOV %k2, %VRDX # endif - ret - .p2align 4,, 10 -L(last_vec_x1_return): - tzcntl %eax, %eax -# if defined USE_AS_WMEMCHR || RET_OFFSET != 0 - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper + (only if used VEX encoded loop). */ + COND_VZEROUPPER + + /* Seperate logic for CHAR_PER_VEC == 64 vs the rest. For + CHAR_PER_VEC we test the last 2x VEC seperately, for + CHAR_PER_VEC <= 32 we can combine the results from the 2x + VEC in a single GPR. */ +# if CHAR_PER_VEC == 64 +# if USE_TERN_IN_LOOP +# error "Unsupported" +# endif + + + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ + test %VRDX, %VRDX + jnz L(last_vec_x2) + KMOV %k3, %VRDX # else - addq %rdi, %rax + /* CHAR_PER_VEC <= 32 so we can combine the results from the + last 2x VEC. */ + +# if !USE_TERN_IN_LOOP + KMOV %k3, %VRCX +# endif + salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx + addq %rcx, %rdx +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP +L(last_vec_x2_novzero): +# endif # endif - VZEROUPPER + bsf %rdx, %rdx + leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax ret - .p2align 4 -L(last_vec_x2_return): - tzcntl %eax, %eax - /* NB: Multiply bytes by RET_SCALE to get the wchar_t count - if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and - USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */ - leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax - VZEROUPPER + .p2align 4,, 8 +L(last_vec_x1): + COND_VZEROUPPER +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP +L(last_vec_x1_novzero): +# endif + bsf %VRDX, %VRDX + leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax ret -# ifdef USE_IN_RTM - .p2align 4 -L(last_vec_x3_return): - tzcntl %eax, %eax - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax + + .p2align 4,, 4 +L(last_vec_x0): + COND_VZEROUPPER + bsf %VGPR(GPR_X0), %VGPR(GPR_X0) +# if GPR_X0_IS_RET + addq %rdi, %rax +# else + leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax +# endif ret + + .p2align 4,, 6 +L(page_cross): + /* Need to preserve eax to compute inbound bytes we are + checking. */ +# ifdef USE_AS_WMEMCHR + movl %eax, %ecx +# else + xorl %ecx, %ecx + subl %eax, %ecx # endif -# ifndef USE_AS_RAWMEMCHR - .p2align 4,, 5 -L(last_4x_vec_or_less_cmpeq): - VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - subq $-(VEC_SIZE * 4), %rdi - /* Check first VEC regardless. */ - testl %eax, %eax - jnz L(first_vec_x1_check) + xorq %rdi, %rax + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 + KMOV %k0, %VRAX - /* If remaining length <= CHAR_PER_VEC * 2. */ - addl $(CHAR_PER_VEC * 2), %edx - jle L(last_2x_vec) +# ifdef USE_AS_WMEMCHR + /* NB: Divide by CHAR_SIZE to shift out out of bounds bytes. */ + shrl $2, %ecx + andl $(CHAR_PER_VEC - 1), %ecx +# endif - .p2align 4 -L(last_4x_vec): - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(last_vec_x2) + shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - /* Create mask for possible matches within remaining length. */ -# ifdef USE_AS_WMEMCHR - movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx - bzhil %edx, %ecx, %ecx -# else - movq $-1, %rcx - bzhiq %rdx, %rcx, %rcx -# endif - /* Test matches in data against length match. 
*/ - andl %ecx, %eax - jnz L(last_vec_x3) +# ifdef USE_AS_WMEMCHR + negl %ecx +# endif - /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after - remaining length was found to be > CHAR_PER_VEC * 2. */ - subl $CHAR_PER_VEC, %edx - jbe L(zero_end2) + /* mask lower bits from ecx (negative eax) to get bytes till + next VEC. */ + andl $(CHAR_PER_VEC - 1), %ecx + /* Check if VEC is entirely contained in the remainder of the + page. */ + cmpq %rcx, %rdx + jbe L(page_cross_ret) - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - /* Shift remaining length mask for last VEC. */ -# ifdef USE_AS_WMEMCHR - shrl $CHAR_PER_VEC, %ecx -# else - shrq $CHAR_PER_VEC, %rcx -# endif - andl %ecx, %eax - jz L(zero_end2) - bsfl %eax, %eax - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax -L(zero_end2): - ret + /* Length crosses the page so if rax is zero (no matches) + continue. */ + test %VRAX, %VRAX + jz L(page_cross_continue) -L(last_vec_x2): - tzcntl %eax, %eax - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + /* if rdx > rcx then any match here must be in [buf:buf + len]. + */ + tzcnt %VRAX, %VRAX +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif ret - .p2align 4 -L(last_vec_x3): - tzcntl %eax, %eax - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + .p2align 4,, 2 +L(page_cross_zero): + xorl %eax, %eax ret + + .p2align 4,, 4 +L(page_cross_ret): + /* Search is entirely contained in page cross case. */ +# ifdef USE_AS_WMEMCHR + test %VRAX, %VRAX + jz L(page_cross_zero) +# endif + tzcnt %VRAX, %VRAX + cmpl %eax, %edx + jbe L(page_cross_zero) +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax # endif - /* 7 bytes from next cache line. */ + ret END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S index deda1ca395..2073eaa620 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S @@ -1,3 +1,6 @@ -#define MEMCHR __rawmemchr_evex_rtm -#define USE_AS_RAWMEMCHR 1 -#include "memchr-evex-rtm.S" +#define RAWMEMCHR __rawmemchr_evex_rtm + +#define USE_IN_RTM 1 +#define SECTION(p) p##.evex.rtm + +#include "rawmemchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S index dc1c450699..dad54def2b 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S @@ -1,7 +1,308 @@ -#ifndef RAWMEMCHR -# define RAWMEMCHR __rawmemchr_evex -#endif -#define USE_AS_RAWMEMCHR 1 -#define MEMCHR RAWMEMCHR +/* rawmemchr optimized with 256-bit EVEX instructions. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <isa-level.h> +#include <sysdep.h> + +#if ISA_SHOULD_BUILD (4) + +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + +# ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex +# endif + + +# define PC_SHIFT_GPR rdi +# define REG_WIDTH VEC_SIZE +# define VPTESTN vptestnmb +# define VPBROADCAST vpbroadcastb +# define VPMINU vpminub +# define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 + +# include "reg-macros.h" + +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 + doesn't have VEX encoding), use VEX encoding in loop so we + can use vpcmpeqb + vptern which is more efficient than the + EVEX alternative. */ +# if defined USE_IN_RTM || VEC_SIZE == 64 +# undef COND_VZEROUPPER +# undef VZEROUPPER_RETURN +# undef VZEROUPPER + + +# define COND_VZEROUPPER +# define VZEROUPPER_RETURN ret +# define VZEROUPPER + +# define USE_TERN_IN_LOOP 0 +# else +# define USE_TERN_IN_LOOP 1 +# undef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define CHAR_PER_VEC VEC_SIZE + +# if CHAR_PER_VEC == 64 + +# define TAIL_RETURN_LBL first_vec_x2 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) + +# define FALLTHROUGH_RETURN_LBL first_vec_x3 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# else /* !(CHAR_PER_VEC == 64) */ + +# define TAIL_RETURN_LBL first_vec_x3 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# define FALLTHROUGH_RETURN_LBL first_vec_x2 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) +# endif /* !(CHAR_PER_VEC == 64) */ + + +# define VMATCH VMM(0) +# define VMATCH_LO VMM_lo(0) + +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN (RAWMEMCHR, 6) + VPBROADCAST %esi, %VMATCH + /* Check if we may cross page boundary with one vector load. */ + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + VPCMPEQ (%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + + test %VRAX, %VRAX + jz L(aligned_more) +L(first_vec_x0): + bsf %VRAX, %VRAX + addq %rdi, %rax + ret + + .p2align 4,, 4 +L(first_vec_x4): + bsf %VRAX, %VRAX + leaq (VEC_SIZE * 4)(%rdi, %rax), %rax + ret -#include "memchr-evex.S" + /* For VEC_SIZE == 32 we can fit this in aligning bytes so might + as well place it more locally. For VEC_SIZE == 64 we reuse + return code at the end of loop's return. */ +# if VEC_SIZE == 32 + .p2align 4,, 4 +L(FALLTHROUGH_RETURN_LBL): + bsf %VRAX, %VRAX + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax + ret +# endif + + .p2align 4,, 6 +L(page_cross): + /* eax has lower page-offset bits of rdi so xor will zero them + out. */ + xorq %rdi, %rax + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 + KMOV %k0, %VRAX + + /* Shift out out-of-bounds matches. */ + shrx %VRDI, %VRAX, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x0) + + .p2align 4,, 10 +L(aligned_more): +L(page_cross_continue): + /* Align pointer. */ + andq $(VEC_SIZE * -1), %rdi + + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x2) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x4) + + subq $-(VEC_SIZE * 1), %rdi +# if VEC_SIZE == 64 + /* Saves code size. No evex512 processor has partial register + stalls. If that change this can be replaced with `andq + $-(VEC_SIZE * 4), %rdi`. 
*/ + xorb %dil, %dil +# else + andq $-(VEC_SIZE * 4), %rdi +# endif + +# if USE_TERN_IN_LOOP + /* copy VMATCH to low ymm so we can use vpcmpeq which is not + encodable with EVEX registers. NB: this is VEC_SIZE == 32 + only as there is no way to encode vpcmpeq with zmm0-15. */ + vmovdqa64 %VMATCH, %VMATCH_LO +# endif + + .p2align 4 +L(loop_4x_vec): + /* Two versions of the loop. One that does not require + vzeroupper by not using ymm0-15 and another does that + require vzeroupper because it uses ymm0-15. The reason why + ymm0-15 is used at all is because there is no EVEX encoding + vpcmpeq and with vpcmpeq this loop can be performed more + efficiently. The non-vzeroupper version is safe for RTM + while the vzeroupper version should be prefered if RTM are + not supported. Which loop version we use is determined by + USE_TERN_IN_LOOP. */ + +# if USE_TERN_IN_LOOP + /* Since vptern can only take 3x vectors fastest to do 1 vec + seperately with EVEX vpcmp. */ + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 + /* Compare 3x with vpcmpeq and or them all together with vptern. + */ + + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2) + subq $(VEC_SIZE * -4), %rdi + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) + + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into + VEC_lo(4). */ + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) + vpmovmskb %VMM_lo(4), %VRCX + + KMOV %k1, %eax + + /* NB: rax has match from first VEC and rcx has matches from + VEC 2-4. If rax is non-zero we will return that match. If + rax is zero adding won't disturb the bits in rcx. */ + add %rax, %rcx +# else + /* Loop version that uses EVEX encoding. */ + VPCMP $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 + vpxorq (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2) + vpxorq (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3) + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3 + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} + VPTESTN %VMM(3), %VMM(3), %k2 + subq $(VEC_SIZE * -4), %rdi + KORTEST %k2, %k3 +# endif + jz L(loop_4x_vec) + +# if USE_TERN_IN_LOOP + test %VRAX, %VRAX +# else + KMOV %k1, %VRAX + inc %VRAX +# endif + jnz L(last_vec_x0) + + +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(2), %VRAX +# else + VPTESTN %VMM(2), %VMM(2), %k1 + KMOV %k1, %VRAX +# endif + test %VRAX, %VRAX + jnz L(last_vec_x1) + + +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(3), %VRAX +# else + KMOV %k2, %VRAX +# endif + + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper + (only if used VEX encoded loop). */ + COND_VZEROUPPER + + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for + returning last 2x VEC. For VEC_SIZE == 64 we test each VEC + individually, for VEC_SIZE == 32 we combine them in a single + 64-bit GPR. */ +# if CHAR_PER_VEC == 64 +# if USE_TERN_IN_LOOP +# error "Unsupported" +# endif + + + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ + test %VRAX, %VRAX + jnz L(first_vec_x2) + KMOV %k3, %VRAX +L(FALLTHROUGH_RETURN_LBL): +# else + /* CHAR_PER_VEC <= 32 so we can combine the results from the + last 2x VEC. 
*/ +# if !USE_TERN_IN_LOOP + KMOV %k3, %VRCX +# endif + salq $CHAR_PER_VEC, %rcx + addq %rcx, %rax +# endif + bsf %rax, %rax + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax + ret + + .p2align 4,, 8 +L(TAIL_RETURN_LBL): + bsf %rax, %rax + leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax + ret + + .p2align 4,, 8 +L(last_vec_x1): + COND_VZEROUPPER +L(first_vec_x1): + bsf %VRAX, %VRAX + leaq (VEC_SIZE * 1)(%rdi, %rax), %rax + ret + + .p2align 4,, 8 +L(last_vec_x0): + COND_VZEROUPPER + bsf %VRAX, %VRAX + addq %rdi, %rax + ret +END (RAWMEMCHR) +#endif
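
A note for readers not familiar with the vpternlogd trick used in both VEX-encoded loops above: the immediate ($254 to OR three compare results together, $1 for the NOR variant on the wmemchr path) is an 8-entry truth table indexed by the three source bits at each position. Below is a rough scalar C model of that behaviour; it is an illustrative sketch only, not part of the patch, and ternlog32 is a made-up name.

#include <stdint.h>

/* Scalar model of vpternlogd: at every bit position the three source
   bits form a 3-bit index into the 8-bit immediate.  */
static uint32_t
ternlog32 (uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
{
  uint32_t out = 0;
  for (int i = 0; i < 32; i++)
    {
      unsigned idx = (((a >> i) & 1) << 2)
		     | (((b >> i) & 1) << 1)
		     | ((c >> i) & 1);
      out |= (uint32_t) ((imm >> idx) & 1) << i;
    }
  return out;
}

/* imm == 254 (0xfe): every index except 0b000 yields 1, i.e. a | b | c,
   which is how the three vpcmpeqb results are folded into VEC_lo(4).
   imm == 1 (0x01): only index 0b000 yields 1, i.e. ~(a | b | c), so the
   wmemchr path ends up with an "every lane is a non-match" mask once the
   {%k1}{z} zero-masking folds in the first vector's compare.  */

Because the three compares are folded in the vector domain, the VEX-encoded loop only needs a single vpmovmskb per iteration.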
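
The page-cross entry points in both files use the same idea: do one aligned load of the last VEC_SIZE bytes of the page (always safe), then shift the compare mask right by the pointer's misalignment so that bytes before the start pointer drop out, which is what the shrx does. Here is a scalar sketch of that idea, assuming VEC_SIZE == 32 and byte-sized chars; cmp_vec_eq and page_cross_check are hypothetical names, and the memchr length handling is simplified to the bounded-hit case.

#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE  32
#define PAGE_SIZE 4096

/* Scalar stand-in for vpcmpeqb + kmov: one mask bit per byte of the
   aligned VEC_SIZE-byte chunk, set where the byte equals c.  */
static uint32_t
cmp_vec_eq (const unsigned char *aligned_src, unsigned char c)
{
  uint32_t m = 0;
  for (int i = 0; i < VEC_SIZE; i++)
    m |= (uint32_t) (aligned_src[i] == c) << i;
  return m;
}

/* Only reached when s is within VEC_SIZE bytes of the end of its page,
   so (s & (VEC_SIZE - 1)) is also s's offset into that last vector.  */
static const void *
page_cross_check (const unsigned char *s, unsigned char c, size_t len)
{
  uintptr_t page = (uintptr_t) s & -(uintptr_t) PAGE_SIZE;
  uint32_t mask = cmp_vec_eq ((const unsigned char *) page
			      + (PAGE_SIZE - VEC_SIZE), c);

  /* Shift out the out-of-bounds bytes before s (the shrx in the asm).  */
  mask >>= (uintptr_t) s & (VEC_SIZE - 1);

  if (mask != 0)
    {
      size_t pos = __builtin_ctz (mask);
      return pos < len ? s + pos : NULL;	/* memchr also bounds the hit.  */
    }
  /* No match in the tail of this page: rawmemchr (and memchr, when len
     extends past the page) falls through to the aligned loop instead.  */
  return NULL;
}

The asm reaches the same page base with xorq %rdi, %rax, since eax already holds the low page-offset bits of the pointer at that point.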
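
Finally, the CHAR_PER_VEC <= 32 return path after the loop (the salq/addq/bsf sequence in both files) relies on the match masks of the loop's last two vectors fitting side by side in one 64-bit register, so a single bit scan finds the earliest match across both. A minimal sketch follows, assuming byte-sized chars and masks; combine_last_two is a hypothetical helper, not code from the patch.

#include <stddef.h>
#include <stdint.h>

/* mask2/mask3 hold one bit per char for the loop's 3rd and 4th vectors.
   mask2 only occupies the low char_per_vec bits, so shifting mask3 up
   and adding cannot disturb it (add == or here), and one ctz/bsf picks
   the first match across both vectors.  */
static const char *
combine_last_two (const char *third_vec_base, uint64_t mask2, uint64_t mask3,
		  unsigned char_per_vec)
{
  uint64_t combined = mask2 + (mask3 << char_per_vec);
  if (combined == 0)
    return NULL;
  return third_vec_base + __builtin_ctzll (combined);
}

When CHAR_PER_VEC == 64 the two masks no longer fit in one register, which is why that configuration keeps a separate return label for each of the last two vectors instead.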