Message ID | 20221019004409.3623395-4-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v3,1/7] x86: Optimize memchr-evex.S and implement with VMM headers |
On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Optimizations are: > 1. Use the fact that lzcnt(0) -> VEC_SIZE for memchr to save a branch > in short string case. > 2. Save several instructions in len = [VEC_SIZE, 4 * VEC_SIZE] case. > 3. Use more code-size efficient instructions. > - tzcnt ... -> bsf ... > - vpcmpb $0 ... -> vpcmpeq ... > > Code Size Changes: > memrchr-evex.S : -29 bytes > > Net perf changes: > > Reported as geometric mean of all improvements / regressions from N=10 > runs of the benchtests. Value as New Time / Old Time so < 1.0 is > improvement and 1.0 is regression. > > memrchr-evex.S : 0.949 (Mostly from improvements in small strings) > > Full results attached in email. > > Full check passes on x86-64. > --- > sysdeps/x86_64/multiarch/memrchr-evex.S | 538 ++++++++++++++---------- > 1 file changed, 324 insertions(+), 214 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S > index 550b328c5a..dbcf52808f 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-evex.S > +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S > @@ -21,17 +21,19 @@ > #if ISA_SHOULD_BUILD (4) > > # include <sysdep.h> > -# include "x86-evex256-vecs.h" > -# if VEC_SIZE != 32 > -# error "VEC_SIZE != 32 unimplemented" > + > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > # endif > > +# include "reg-macros.h" > + > # ifndef MEMRCHR > -# define MEMRCHR __memrchr_evex > +# define MEMRCHR __memrchr_evex > # endif > > -# define PAGE_SIZE 4096 > -# define VMMMATCH VMM(0) > +# define PAGE_SIZE 4096 > +# define VMATCH VMM(0) > > .section SECTION(.text), "ax", @progbits > ENTRY_P2ALIGN(MEMRCHR, 6) > @@ -43,294 +45,402 @@ ENTRY_P2ALIGN(MEMRCHR, 6) > # endif > jz L(zero_0) > > - /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > - correct page cross check and 2) it correctly sets up end ptr to be > - subtract by lzcnt aligned. */ > + /* Get end pointer. Minus one for three reasons. 1) It is > + necessary for a correct page cross check and 2) it correctly > + sets up end ptr to be subtract by lzcnt aligned. 3) it is a > + necessary step in aligning ptr. */ > leaq -1(%rdi, %rdx), %rax > - vpbroadcastb %esi, %VMMMATCH > + vpbroadcastb %esi, %VMATCH > > /* Check if we can load 1x VEC without cross a page. */ > testl $(PAGE_SIZE - VEC_SIZE), %eax > jz L(page_cross) > > - /* Don't use rax for pointer here because EVEX has better encoding with > - offset % VEC_SIZE == 0. */ > - vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0 > - kmovd %k0, %ecx > - > - /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ > - cmpq $VEC_SIZE, %rdx > - ja L(more_1x_vec) > -L(ret_vec_x0_test): > - > - /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > - will guarantee edx (len) is less than it. */ > - lzcntl %ecx, %ecx > - cmpl %ecx, %edx > - jle L(zero_0) > - subq %rcx, %rax > + /* Don't use rax for pointer here because EVEX has better > + encoding with offset % VEC_SIZE == 0. */ > + vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0 > + KMOV %k0, %VRCX > + > + /* If rcx is zero then lzcnt -> VEC_SIZE. NB: there is a > + already a dependency between rcx and rsi so no worries about > + false-dep here. */ > + lzcnt %VRCX, %VRSI > + /* If rdx <= rsi then either 1) rcx was non-zero (there was a > + match) but it was out of bounds or 2) rcx was zero and rdx > + was <= VEC_SIZE so we are done scanning. */ > + cmpq %rsi, %rdx > + /* NB: Use branch to return zero/non-zero. 
Common usage will > + branch on result of function (if return is null/non-null). > + This branch can be used to predict the ensuing one so there > + is no reason to extend the data-dependency with cmovcc. */ > + jbe L(zero_0) > + > + /* If rcx is zero then len must be > RDX, otherwise since we > + already tested len vs lzcnt(rcx) (in rsi) we are good to > + return this match. */ > + test %VRCX, %VRCX > + jz L(more_1x_vec) > + subq %rsi, %rax > ret > > - /* Fits in aligning bytes of first cache line. */ > + /* Fits in aligning bytes of first cache line for VEC_SIZE == > + 32. */ > +# if VEC_SIZE == 32 > + .p2align 4,, 2 > L(zero_0): > xorl %eax, %eax > ret > - > - .p2align 4,, 9 > -L(ret_vec_x0_dec): > - decq %rax > -L(ret_vec_x0): > - lzcntl %ecx, %ecx > - subq %rcx, %rax > - ret > +# endif > > .p2align 4,, 10 > L(more_1x_vec): > - testl %ecx, %ecx > - jnz L(ret_vec_x0) > - > /* Align rax (pointer to string). */ > andq $-VEC_SIZE, %rax > - > +L(page_cross_continue): > /* Recompute length after aligning. */ > - movq %rax, %rdx > + subq %rdi, %rax > > - /* Need no matter what. */ > - vpcmpb $0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0 > - kmovd %k0, %ecx > - > - subq %rdi, %rdx > - > - cmpq $(VEC_SIZE * 2), %rdx > + cmpq $(VEC_SIZE * 2), %rax > ja L(more_2x_vec) > + > L(last_2x_vec): > + vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0 > + KMOV %k0, %VRCX > > - /* Must dec rax because L(ret_vec_x0_test) expects it. */ > - decq %rax > - cmpl $VEC_SIZE, %edx > - jbe L(ret_vec_x0_test) > + test %VRCX, %VRCX > + jnz L(ret_vec_x0_test) > > - testl %ecx, %ecx > - jnz L(ret_vec_x0) > + /* If VEC_SIZE == 64 need to subtract because lzcntq won't > + implicitly add VEC_SIZE to match position. */ > +# if VEC_SIZE == 64 > + subl $VEC_SIZE, %eax > +# else > + cmpb $VEC_SIZE, %al > +# endif > + jle L(zero_2) > > - /* Don't use rax for pointer here because EVEX has better encoding with > - offset % VEC_SIZE == 0. */ > - vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0 > - kmovd %k0, %ecx > - /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ > + /* We adjusted rax (length) for VEC_SIZE == 64 so need seperate > + offsets. */ > +# if VEC_SIZE == 64 > + vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0 > +# else > + vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0 > +# endif > + KMOV %k0, %VRCX > + /* NB: 64-bit lzcnt. This will naturally add 32 to position for > + VEC_SIZE == 32. */ > lzcntq %rcx, %rcx > - cmpl %ecx, %edx > - jle L(zero_0) > - subq %rcx, %rax > - ret > - > - /* Inexpensive place to put this regarding code size / target alignments > - / ICache NLP. Necessary for 2-byte encoding of jump to page cross > - case which in turn is necessary for hot path (len <= VEC_SIZE) to fit > - in first cache line. */ > -L(page_cross): > - movq %rax, %rsi > - andq $-VEC_SIZE, %rsi > - vpcmpb $0, (%rsi), %VMMMATCH, %k0 > - kmovd %k0, %r8d > - /* Shift out negative alignment (because we are starting from endptr and > - working backwards). */ > - movl %eax, %ecx > - /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > - notl %ecx > - shlxl %ecx, %r8d, %ecx > - cmpq %rdi, %rsi > - ja L(more_1x_vec) > - lzcntl %ecx, %ecx > - cmpl %ecx, %edx > - jle L(zero_1) > - subq %rcx, %rax > + subl %ecx, %eax > + ja L(first_vec_x1_ret) > + /* If VEC_SIZE == 64 put L(zero_0) here as we can't fit in the > + first cache line (this is the second cache line). 
*/ > +# if VEC_SIZE == 64 > +L(zero_0): > +# endif > +L(zero_2): > + xorl %eax, %eax > ret > > - /* Continue creating zero labels that fit in aligning bytes and get > - 2-byte encoding / are in the same cache line as condition. */ > -L(zero_1): > - xorl %eax, %eax > + /* NB: Fits in aligning bytes before next cache line for > + VEC_SIZE == 32. For VEC_SIZE == 64 this is attached to > + L(first_vec_x0_test). */ > +# if VEC_SIZE == 32 > +L(first_vec_x1_ret): > + leaq -1(%rdi, %rax), %rax > ret > +# endif > > - .p2align 4,, 8 > -L(ret_vec_x1): > - /* This will naturally add 32 to position. */ > - bsrl %ecx, %ecx > - leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax > + .p2align 4,, 6 > +L(ret_vec_x0_test): > + lzcnt %VRCX, %VRCX > + subl %ecx, %eax > + jle L(zero_2) > +# if VEC_SIZE == 64 > + /* Reuse code at the end of L(ret_vec_x0_test) as we can't fit > + L(first_vec_x1_ret) in the same cache line as its jmp base > + so we might as well save code size. */ > +L(first_vec_x1_ret): > +# endif > + leaq -1(%rdi, %rax), %rax > ret > > - .p2align 4,, 8 > + .p2align 4,, 6 > +L(loop_last_4x_vec): > + /* Compute remaining length. */ > + subl %edi, %eax > +L(last_4x_vec): > + cmpl $(VEC_SIZE * 2), %eax > + jle L(last_2x_vec) > +# if VEC_SIZE == 32 > + /* Only align for VEC_SIZE == 32. For VEC_SIZE == 64 we need > + the spare bytes to align the loop properly. */ > + .p2align 4,, 10 > +# endif > L(more_2x_vec): > - testl %ecx, %ecx > - jnz L(ret_vec_x0_dec) > > - vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0 > - kmovd %k0, %ecx > - testl %ecx, %ecx > - jnz L(ret_vec_x1) > + /* Length > VEC_SIZE * 2 so check the first 2x VEC for match and > + return if either hit. */ > + vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0 > + KMOV %k0, %VRCX > + > + test %VRCX, %VRCX > + jnz L(first_vec_x0) > + > + vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(first_vec_x1) > > /* Need no matter what. */ > - vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0 > - kmovd %k0, %ecx > + vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0 > + KMOV %k0, %VRCX > > - subq $(VEC_SIZE * 4), %rdx > + /* Check if we are near the end. */ > + subq $(VEC_SIZE * 4), %rax > ja L(more_4x_vec) > > - cmpl $(VEC_SIZE * -1), %edx > - jle L(ret_vec_x2_test) > -L(last_vec): > - testl %ecx, %ecx > - jnz L(ret_vec_x2) > + test %VRCX, %VRCX > + jnz L(first_vec_x2_test) > > + /* Adjust length for final check and check if we are at the end. > + */ > + addl $(VEC_SIZE * 1), %eax > + jle L(zero_1) > > - /* Need no matter what. */ > - vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0 > - kmovd %k0, %ecx > - lzcntl %ecx, %ecx > - subq $(VEC_SIZE * 3 + 1), %rax > - subq %rcx, %rax > - cmpq %rax, %rdi > - ja L(zero_1) > + vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0 > + KMOV %k0, %VRCX > + > + lzcnt %VRCX, %VRCX > + subl %ecx, %eax > + ja L(first_vec_x3_ret) > +L(zero_1): > + xorl %eax, %eax > + ret > +L(first_vec_x3_ret): > + leaq -1(%rdi, %rax), %rax > ret > > - .p2align 4,, 8 > -L(ret_vec_x2_test): > - lzcntl %ecx, %ecx > - subq $(VEC_SIZE * 2 + 1), %rax > - subq %rcx, %rax > - cmpq %rax, %rdi > - ja L(zero_1) > + .p2align 4,, 6 > +L(first_vec_x2_test): > + /* Must adjust length before check. 
*/ > + subl $-(VEC_SIZE * 2 - 1), %eax > + lzcnt %VRCX, %VRCX > + subl %ecx, %eax > + jl L(zero_4) > + addq %rdi, %rax > ret > > - .p2align 4,, 8 > -L(ret_vec_x2): > - bsrl %ecx, %ecx > - leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax > + > + .p2align 4,, 10 > +L(first_vec_x0): > + bsr %VRCX, %VRCX > + leaq (VEC_SIZE * -1)(%rdi, %rax), %rax > + addq %rcx, %rax > ret > > - .p2align 4,, 8 > -L(ret_vec_x3): > - bsrl %ecx, %ecx > - leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax > + /* Fits unobtrusively here. */ > +L(zero_4): > + xorl %eax, %eax > + ret > + > + .p2align 4,, 10 > +L(first_vec_x1): > + bsr %VRCX, %VRCX > + leaq (VEC_SIZE * -2)(%rdi, %rax), %rax > + addq %rcx, %rax > ret > > .p2align 4,, 8 > +L(first_vec_x3): > + bsr %VRCX, %VRCX > + addq %rdi, %rax > + addq %rcx, %rax > + ret > + > + .p2align 4,, 6 > +L(first_vec_x2): > + bsr %VRCX, %VRCX > + leaq (VEC_SIZE * 1)(%rdi, %rax), %rax > + addq %rcx, %rax > + ret > + > + .p2align 4,, 2 > L(more_4x_vec): > - testl %ecx, %ecx > - jnz L(ret_vec_x2) > + test %VRCX, %VRCX > + jnz L(first_vec_x2) > > - vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0 > - kmovd %k0, %ecx > + vpcmpeqb (%rdi, %rax), %VMATCH, %k0 > + KMOV %k0, %VRCX > > - testl %ecx, %ecx > - jnz L(ret_vec_x3) > + test %VRCX, %VRCX > + jnz L(first_vec_x3) > > /* Check if near end before re-aligning (otherwise might do an > unnecessary loop iteration). */ > - addq $-(VEC_SIZE * 4), %rax > - cmpq $(VEC_SIZE * 4), %rdx > + cmpq $(VEC_SIZE * 4), %rax > jbe L(last_4x_vec) > > - decq %rax > - andq $-(VEC_SIZE * 4), %rax > - movq %rdi, %rdx > - /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > - lengths that overflow can be valid and break the comparison. */ > - andq $-(VEC_SIZE * 4), %rdx > + > + /* NB: We setup the loop to NOT use index-address-mode for the > + buffer. This costs some instructions & code size but avoids > + stalls due to unlaminated micro-fused instructions (as used > + in the loop) from being forced to issue in the same group > + (essentially narrowing the backend width). */ > + > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi > + because lengths that overflow can be valid and break the > + comparison. */ > +# if VEC_SIZE == 64 > + /* Use rdx as intermediate to compute rax, this gets us imm8 > + encoding which just allows the L(more_4x_vec) block to fit > + in 1 cache-line. */ > + leaq (VEC_SIZE * 4)(%rdi), %rdx > + leaq (VEC_SIZE * -1)(%rdx, %rax), %rax > + > + /* No evex machine has partial register stalls. This can be > + replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that > + changes. */ > + xorb %al, %al > + xorb %dl, %dl > +# else > + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax > + andq $(VEC_SIZE * -4), %rax > + leaq (VEC_SIZE * 4)(%rdi), %rdx > + andq $(VEC_SIZE * -4), %rdx > +# endif > + > > .p2align 4 > L(loop_4x_vec): > - /* Store 1 were not-equals and 0 where equals in k1 (used to mask later > - on). */ > - vpcmpb $4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1 > + /* NB: We could do the same optimization here as we do for > + memchr/rawmemchr by using VEX encoding in the loop for access > + to VEX vpcmpeqb + vpternlogd. Since memrchr is not as hot as > + memchr it may not be worth the extra code size, but if the > + need arises it an easy ~15% perf improvement to the loop. */ > + > + cmpq %rdx, %rax > + je L(loop_last_4x_vec) > + /* Store 1 were not-equals and 0 where equals in k1 (used to > + mask later on). */ > + vpcmpb $4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1 > > /* VEC(2/3) will have zero-byte where we found a CHAR. 
*/ > - vpxorq (VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2) > - vpxorq (VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3) > - vpcmpb $0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4 > + vpxorq (VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2) > + vpxorq (VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3) > + vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4 > > - /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where > - CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ > + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit > + where CHAR is found and VEC(2/3) have zero-byte where CHAR > + is found. */ > vpminub %VMM(2), %VMM(3), %VMM(3){%k1}{z} > vptestnmb %VMM(3), %VMM(3), %k2 > > - /* Any 1s and we found CHAR. */ > - kortestd %k2, %k4 > - jnz L(loop_end) > - > addq $-(VEC_SIZE * 4), %rax > - cmpq %rdx, %rax > - jne L(loop_4x_vec) > > - /* Need to re-adjust rdx / rax for L(last_4x_vec). */ > - subq $-(VEC_SIZE * 4), %rdx > - movq %rdx, %rax > - subl %edi, %edx > -L(last_4x_vec): > + /* Any 1s and we found CHAR. */ > + KORTEST %k2, %k4 > + jz L(loop_4x_vec) > + > > - /* Used no matter what. */ > - vpcmpb $0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0 > - kmovd %k0, %ecx > + /* K1 has non-matches for first VEC. inc; jz will overflow rcx > + iff all bytes where non-matches. */ > + KMOV %k1, %VRCX > + inc %VRCX > + jnz L(first_vec_x0_end) > > - cmpl $(VEC_SIZE * 2), %edx > - jbe L(last_2x_vec) > + vptestnmb %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(first_vec_x1_end) > + KMOV %k2, %VRCX > + > + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for > + returning last 2x VEC. For VEC_SIZE == 64 we test each VEC > + individually, for VEC_SIZE == 32 we combine them in a single > + 64-bit GPR. */ > +# if VEC_SIZE == 64 > + test %VRCX, %VRCX > + jnz L(first_vec_x2_end) > + KMOV %k4, %VRCX > +# else > + /* Combine last 2 VEC matches for VEC_SIZE == 32. If rcx (from > + VEC(3)) is zero (no CHAR in VEC(3)) then it won't affect the > + result in rsi (from VEC(4)). If rcx is non-zero then CHAR in > + VEC(3) and bsrq will use that position. */ > + KMOV %k4, %VRSI > + salq $32, %rcx > + orq %rsi, %rcx > +# endif > + bsrq %rcx, %rcx > + addq %rcx, %rax > + ret > > - testl %ecx, %ecx > - jnz L(ret_vec_x0_dec) > + .p2align 4,, 4 > +L(first_vec_x0_end): > + /* rcx has 1s at non-matches so we need to `not` it. We used > + `inc` to test if zero so use `neg` to complete the `not` so > + the last 1 bit represent a match. NB: (-x + 1 == ~x). */ > + neg %VRCX > + bsr %VRCX, %VRCX > + leaq (VEC_SIZE * 3)(%rcx, %rax), %rax > + ret > > + .p2align 4,, 10 > +L(first_vec_x1_end): > + bsr %VRCX, %VRCX > + leaq (VEC_SIZE * 2)(%rcx, %rax), %rax > + ret > > - vpcmpb $0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0 > - kmovd %k0, %ecx > +# if VEC_SIZE == 64 > + /* Since we can't combine the last 2x VEC for VEC_SIZE == 64 > + need return label for it. */ > + .p2align 4,, 4 > +L(first_vec_x2_end): > + bsr %VRCX, %VRCX > + leaq (VEC_SIZE * 1)(%rcx, %rax), %rax > + ret > +# endif > > - testl %ecx, %ecx > - jnz L(ret_vec_x1) > > - /* Used no matter what. */ > - vpcmpb $0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0 > - kmovd %k0, %ecx > + .p2align 4,, 4 > +L(page_cross): > + /* only lower bits of eax[log2(VEC_SIZE):0] are set so we can > + use movzbl to get the amount of bytes we are checking here. 
> + */ > + movzbl %al, %ecx > + andq $-VEC_SIZE, %rax > + vpcmpeqb (%rax), %VMATCH, %k0 > + KMOV %k0, %VRSI > > - cmpl $(VEC_SIZE * 3), %edx > - ja L(last_vec) > + /* eax was comptued as %rdi + %rdx - 1 so need to add back 1 > + here. */ > + leal 1(%rcx), %r8d > > - lzcntl %ecx, %ecx > - subq $(VEC_SIZE * 2 + 1), %rax > - subq %rcx, %rax > - cmpq %rax, %rdi > - jbe L(ret_1) > + /* Invert ecx to get shift count for byte matches out of range. > + */ > + notl %ecx > + shlx %VRCX, %VRSI, %VRSI > + > + /* if r8 < rdx then the entire [buf, buf + len] is handled in > + the page cross case. NB: we can't use the trick here we use > + in the non page-cross case because we aren't checking full > + VEC_SIZE. */ > + cmpq %r8, %rdx > + ja L(page_cross_check) > + lzcnt %VRSI, %VRSI > + subl %esi, %edx > + ja L(page_cross_ret) > xorl %eax, %eax > -L(ret_1): > ret > > - .p2align 4,, 6 > -L(loop_end): > - kmovd %k1, %ecx > - notl %ecx > - testl %ecx, %ecx > - jnz L(ret_vec_x0_end) > +L(page_cross_check): > + test %VRSI, %VRSI > + jz L(page_cross_continue) > > - vptestnmb %VMM(2), %VMM(2), %k0 > - kmovd %k0, %ecx > - testl %ecx, %ecx > - jnz L(ret_vec_x1_end) > - > - kmovd %k2, %ecx > - kmovd %k4, %esi > - /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > - then it won't affect the result in esi (VEC4). If ecx is non-zero > - then CHAR in VEC3 and bsrq will use that position. */ > - salq $32, %rcx > - orq %rsi, %rcx > - bsrq %rcx, %rcx > - addq %rcx, %rax > - ret > - .p2align 4,, 4 > -L(ret_vec_x0_end): > - addq $(VEC_SIZE), %rax > -L(ret_vec_x1_end): > - bsrl %ecx, %ecx > - leaq (VEC_SIZE * 2)(%rax, %rcx), %rax > + lzcnt %VRSI, %VRSI > + subl %esi, %edx > +L(page_cross_ret): > + leaq -1(%rdi, %rdx), %rax > ret > - > END(MEMRCHR) > #endif > -- > 2.34.1 > LGTM. Thanks.
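For readers skimming the patch, the short-string win in the first optimization listed in the commit message comes from lzcnt's defined behavior on zero: lzcnt of an all-zero match mask returns the operand width (VEC_SIZE), so a single length comparison covers both "no match in the vector" and "the last match sits before the buffer start". Below is a minimal C model of that tail path — not the glibc code — assuming VEC_SIZE == 32, LZCNT support (compile with -mlzcnt), and a `mask` already produced by the vector compare of the VEC_SIZE bytes ending at s + len (bit i set when byte s + len - VEC_SIZE + i equals the search character).

```c
#include <immintrin.h>
#include <stddef.h>

#define VEC_SIZE 32

/* Sketch of the len <= VEC_SIZE exit: `mask` covers the VEC_SIZE bytes
   ending at s + len, bit i <-> byte (s + len - VEC_SIZE + i).  */
static const char *
memrchr_tail (const char *s, size_t len, unsigned int mask)
{
  /* _lzcnt_u32 (0) == 32 == VEC_SIZE, so one comparison handles both
     "no match at all" and "last match lies before the buffer start";
     no separate test/branch on mask is needed first.  */
  unsigned int lz = _lzcnt_u32 (mask);
  if (len <= lz)
    return NULL;               /* No in-bounds match in this vector.  */
  return s + len - 1 - lz;     /* Highest-addressed in-bounds match.  */
}
```

When the mask is zero but len exceeds VEC_SIZE, the real routine falls through to the aligned path instead of returning NULL; the sketch only covers the short-string exit that the patch streamlines.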
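The reworked L(page_cross) path uses a related masking idea: align the load down to the vector boundary, then shift the match mask left by the negated offset of the end pointer so that bytes past the end fall off the top, leaving lzcnt to measure the distance from the last in-range byte back to the last match. The following is a rough C model of that step under the same VEC_SIZE == 32 / LZCNT assumptions; match_mask_32() is a hypothetical scalar stand-in for the vpcmpeqb + KMOV pair, and the final NULL abbreviates the real code's fall-through to the aligned scan.

```c
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE 32

/* Hypothetical stand-in for the vpcmpeqb + KMOV pair: bit i of the result
   is set when byte aligned_p[i] equals c.  The real code loads the full
   aligned vector; the page-cross precondition makes that safe.  */
static unsigned int
match_mask_32 (const char *aligned_p, int c)
{
  unsigned int m = 0;
  for (unsigned int i = 0; i < VEC_SIZE; i++)
    m |= (unsigned int) (((const unsigned char *) aligned_p)[i]
                         == (unsigned char) c) << i;
  return m;
}

static const char *
memrchr_page_cross (const char *s, size_t len, int c)
{
  const char *end1 = s + len - 1;                  /* endptr - 1, as in the patch.  */
  const char *base = (const char *) ((uintptr_t) end1 & -(uintptr_t) VEC_SIZE);
  unsigned int off = (uintptr_t) end1 % VEC_SIZE;  /* bytes [base, end1] are in range.  */
  unsigned int mask = match_mask_32 (base, c);

  /* Shift the bit for end1 up to bit 31 so matches past the end fall off
     the top; this mirrors the patch's notl + shlx pair, which only uses
     the low bits of the shift count.  */
  mask <<= VEC_SIZE - 1 - off;

  if (len <= (size_t) off + 1)
    {
      /* The whole buffer sits inside this aligned vector: the lzcnt result
         doubles as the bounds check, just like the non-cross tail.  */
      unsigned int lz = _lzcnt_u32 (mask);
      return len > lz ? s + len - 1 - lz : NULL;
    }
  /* The buffer extends below `base`, so any surviving match is in bounds.
     An empty mask means "keep scanning"; the real code jumps back into the
     aligned path, which this sketch abbreviates as NULL.  */
  return mask ? end1 - _lzcnt_u32 (mask) : NULL;
}
```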