Message ID | 20221018024901.3381469-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v1,1/7] x86: Optimize memchr-evex.S and implement with VMM headers |
On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Optimizations are: > > 1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch > in short string case. > 2. Restructure code so that small strings are given the hot path. > - This is a net-zero on the benchmark suite but in general makes > sense as smaller sizes are far more common. > 3. Use more code-size efficient instructions. > - tzcnt ... -> bsf ... > - vpcmpb $0 ... -> vpcmpeq ... > 4. Align labels less aggressively, especially if it doesn't save fetch > blocks / causes the basic-block to span extra cache-lines. > > The optimizations (especially for point 2) make the memchr and > rawmemchr code essentially incompatible so split rawmemchr-evex > to a new file. > > Code Size Changes: > memchr-evex.S : -107 bytes > rawmemchr-evex.S : -53 bytes > > Net perf changes: > > Reported as geometric mean of all improvements / regressions from N=10 > runs of the benchtests. Value as New Time / Old Time so < 1.0 is > improvement and 1.0 is regression. > > memchr-evex.S : 0.928 > rawmemchr-evex.S : 0.986 (Less targets cross cache lines) > > Full results attached in email. > > Full check passes on x86-64. > --- > sysdeps/x86_64/multiarch/memchr-evex.S | 939 ++++++++++-------- > sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 9 +- > sysdeps/x86_64/multiarch/rawmemchr-evex.S | 313 +++++- > 3 files changed, 851 insertions(+), 410 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > index 0dd4f1dcce..23a1c0018e 100644 > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > @@ -21,17 +21,27 @@ > > #if ISA_SHOULD_BUILD (4) > > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > + > # ifndef MEMCHR > # define MEMCHR __memchr_evex > # endif > > # ifdef USE_AS_WMEMCHR > +# define PC_SHIFT_GPR rcx > +# define VPTESTN vptestnmd > # define VPBROADCAST vpbroadcastd > # define VPMINU vpminud > # define VPCMP vpcmpd > # define VPCMPEQ vpcmpeqd > # define CHAR_SIZE 4 > + > +# define USE_WIDE_CHAR > # else > +# define PC_SHIFT_GPR rdi > +# define VPTESTN vptestnmb > # define VPBROADCAST vpbroadcastb > # define VPMINU vpminub > # define VPCMP vpcmpb > @@ -39,534 +49,661 @@ > # define CHAR_SIZE 1 > # endif > > - /* In the 4x loop the RTM and non-RTM versions have data pointer > - off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater. > - This is represented by BASE_OFFSET. As well because the RTM > - version uses vpcmp which stores a bit per element compared where > - the non-RTM version uses vpcmpeq which stores a bit per byte > - compared RET_SCALE of CHAR_SIZE is only relevant for the RTM > - version. */ > -# ifdef USE_IN_RTM > +# include "reg-macros.h" > + > + > +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 > + doesn't have VEX encoding), use VEX encoding in loop so we > + can use vpcmpeqb + vptern which is more efficient than the > + EVEX alternative. 
*/ > +# if defined USE_IN_RTM || VEC_SIZE == 64 > +# undef COND_VZEROUPPER > +# undef VZEROUPPER_RETURN > +# undef VZEROUPPER > + > +# define COND_VZEROUPPER > +# define VZEROUPPER_RETURN ret > # define VZEROUPPER > -# define BASE_OFFSET (VEC_SIZE * 4) > -# define RET_SCALE CHAR_SIZE > + > +# define USE_TERN_IN_LOOP 0 > # else > +# define USE_TERN_IN_LOOP 1 > +# undef VZEROUPPER > # define VZEROUPPER vzeroupper > -# define BASE_OFFSET 0 > -# define RET_SCALE 1 > # endif > > - /* In the return from 4x loop memchr and rawmemchr versions have > - data pointers off by VEC_SIZE * 4 with memchr version being > - VEC_SIZE * 4 greater. */ > -# ifdef USE_AS_RAWMEMCHR > -# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4)) > -# define RAW_PTR_REG rcx > -# define ALGN_PTR_REG rdi > +# if USE_TERN_IN_LOOP > + /* Resulting bitmask for vpmovmskb has 4-bits set for each wchar > + so we don't want to multiply resulting index. */ > +# define TERN_CHAR_MULT 1 > + > +# ifdef USE_AS_WMEMCHR > +# define TEST_END() inc %VRCX > +# else > +# define TEST_END() add %rdx, %rcx > +# endif > # else > -# define RET_OFFSET BASE_OFFSET > -# define RAW_PTR_REG rdi > -# define ALGN_PTR_REG rcx > +# define TERN_CHAR_MULT CHAR_SIZE > +# define TEST_END() KORTEST %k2, %k3 > # endif > > -# define XMMZERO xmm23 > -# define YMMZERO ymm23 > -# define XMMMATCH xmm16 > -# define YMMMATCH ymm16 > -# define YMM1 ymm17 > -# define YMM2 ymm18 > -# define YMM3 ymm19 > -# define YMM4 ymm20 > -# define YMM5 ymm21 > -# define YMM6 ymm22 > +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > +# ifndef USE_AS_WMEMCHR > +# define GPR_X0_IS_RET 1 > +# else > +# define GPR_X0_IS_RET 0 > +# endif > +# define GPR_X0 rax > +# else > +# define GPR_X0_IS_RET 0 > +# define GPR_X0 rdx > +# endif > + > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > -# ifndef SECTION > -# define SECTION(p) p##.evex > +# if CHAR_PER_VEC == 64 > +# define LAST_VEC_OFFSET (VEC_SIZE * 3) > +# else > +# define LAST_VEC_OFFSET (VEC_SIZE * 2) > +# endif > +# if CHAR_PER_VEC >= 32 > +# define MASK_GPR(...) VGPR(__VA_ARGS__) > +# elif CHAR_PER_VEC == 16 > +# define MASK_GPR(reg) VGPR_SZ(reg, 16) > +# else > +# define MASK_GPR(reg) VGPR_SZ(reg, 8) > # endif > > -# define VEC_SIZE 32 > -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > -# define PAGE_SIZE 4096 > +# define VMATCH VMM(0) > +# define VMATCH_LO VMM_lo(0) > > - .section SECTION(.text),"ax",@progbits > +# define PAGE_SIZE 4096 > + > + > + .section SECTION(.text), "ax", @progbits > ENTRY_P2ALIGN (MEMCHR, 6) > -# ifndef USE_AS_RAWMEMCHR > /* Check for zero length. */ > test %RDX_LP, %RDX_LP > - jz L(zero) > + jz L(zero_0) > > -# ifdef __ILP32__ > +# ifdef __ILP32__ > /* Clear the upper 32 bits. */ > movl %edx, %edx > -# endif > # endif > - /* Broadcast CHAR to YMMMATCH. */ > - VPBROADCAST %esi, %YMMMATCH > + VPBROADCAST %esi, %VMATCH > /* Check if we may cross page boundary with one vector load. */ > movl %edi, %eax > andl $(PAGE_SIZE - 1), %eax > cmpl $(PAGE_SIZE - VEC_SIZE), %eax > - ja L(cross_page_boundary) > + ja L(page_cross) > + > + VPCMPEQ (%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > +# ifndef USE_AS_WMEMCHR > + /* If rcx is zero then tzcnt -> CHAR_PER_VEC. NB: there is a > + already a dependency between rcx and rsi so no worries about > + false-dep here. */ > + tzcnt %VRAX, %VRSI > + /* If rdx <= rsi then either 1) rcx was non-zero (there was a > + match) but it was out of bounds or 2) rcx was zero and rdx > + was <= VEC_SIZE so we are done scanning. 
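[Note for readers following the short-string fast path described in the comment above: because tzcnt of an all-zero mask returns the operand width, a single comparison against the length covers both "no match in the first VEC" and "match found but past the length". Below is a rough C model of that idea, not the actual implementation: memchr_model is a hypothetical name, it assumes AVX2 + BMI1 and a 32-byte VEC, and it ignores the page-cross guard the patch performs first (compile with -mavx2 -mbmi).]

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Model of the first-VEC check: _tzcnt_u32 (0) == 32, so "n <= pos"
       rejects both the no-match case (for n <= 32) and an out-of-bounds
       match with a single branch.  */
    static void *
    memchr_model (const void *s, int c, size_t n)
    {
      const char *p = s;
      if (n == 0)
        return NULL;
      __m256i match = _mm256_set1_epi8 ((char) c);
      /* NB: unlike the real code this blindly loads 32 bytes; the patch
         guards this with the page-cross check.  */
      __m256i v = _mm256_loadu_si256 ((const __m256i *) p);
      uint32_t mask
        = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, match));
      unsigned int pos = _tzcnt_u32 (mask);  /* 32 when mask == 0.  */
      if (n <= pos)
        return NULL;  /* No match, or the first match is past the length.  */
      if (mask == 0)  /* n > 32 and nothing in the first VEC: keep going.  */
        return memchr (p + 32, c, n - 32);
      return (void *) (p + pos);
    }

    int
    main (void)
    {
      char buf[64] = "find the w here";
      printf ("%d\n", memchr_model (buf, 'w', 5) == NULL);      /* 1 */
      printf ("%d\n", memchr_model (buf, 'w', 16) == buf + 9);  /* 1 */
      return 0;
    }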
*/ > + cmpq %rsi, %rdx > + /* NB: Use branch to return zero/non-zero. Common usage will > + branch on result of function (if return is null/non-null). > + This branch can be used to predict the ensuing one so there > + is no reason to extend the data-dependency with cmovcc. */ > + jbe L(zero_0) > + > + /* If rcx is zero then len must be > RDX, otherwise since we > + already tested len vs lzcnt(rcx) (in rsi) we are good to > + return this match. */ > + test %VRAX, %VRAX > + jz L(more_1x_vec) > + leaq (%rdi, %rsi), %rax > +# else > > - /* Check the first VEC_SIZE bytes. */ > - VPCMP $0, (%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* If length < CHAR_PER_VEC handle special. */ > + /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE > + > 1 so if rcx is tzcnt != CHAR_PER_VEC. */ > cmpq $CHAR_PER_VEC, %rdx > - jbe L(first_vec_x0) > -# endif > - testl %eax, %eax > - jz L(aligned_more) > - tzcntl %eax, %eax > -# ifdef USE_AS_WMEMCHR > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > + ja L(more_1x_vec) > + tzcnt %VRAX, %VRAX > + cmpl %eax, %edx > + jbe L(zero_0) > +L(first_vec_x0_ret): > leaq (%rdi, %rax, CHAR_SIZE), %rax > -# else > - addq %rdi, %rax > # endif > ret > > -# ifndef USE_AS_RAWMEMCHR > -L(zero): > - xorl %eax, %eax > - ret > - > - .p2align 4 > -L(first_vec_x0): > - /* Check if first match was before length. NB: tzcnt has false data- > - dependency on destination. eax already had a data-dependency on esi > - so this should have no affect here. */ > - tzcntl %eax, %esi > -# ifdef USE_AS_WMEMCHR > - leaq (%rdi, %rsi, CHAR_SIZE), %rdi > -# else > - addq %rsi, %rdi > -# endif > + /* Only fits in first cache line for VEC_SIZE == 32. */ > +# if VEC_SIZE == 32 > + .p2align 4,, 2 > +L(zero_0): > xorl %eax, %eax > - cmpl %esi, %edx > - cmovg %rdi, %rax > ret > # endif > > - .p2align 4 > -L(cross_page_boundary): > - /* Save pointer before aligning as its original value is > - necessary for computer return address if byte is found or > - adjusting length if it is not and this is memchr. */ > - movq %rdi, %rcx > - /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi > - for rawmemchr. */ > - andq $-VEC_SIZE, %ALGN_PTR_REG > - VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 > - kmovd %k0, %r8d > + .p2align 4,, 9 > +L(more_1x_vec): > # ifdef USE_AS_WMEMCHR > - /* NB: Divide shift count by 4 since each bit in K0 represent 4 > - bytes. */ > - sarl $2, %eax > -# endif > -# ifndef USE_AS_RAWMEMCHR > - movl $(PAGE_SIZE / CHAR_SIZE), %esi > - subl %eax, %esi > + /* If wmemchr still need to test if there was a match in first > + VEC. Use bsf to test here so we can reuse > + L(first_vec_x0_ret). */ > + bsf %VRAX, %VRAX > + jnz L(first_vec_x0_ret) > # endif > + > +L(page_cross_continue): > # ifdef USE_AS_WMEMCHR > - andl $(CHAR_PER_VEC - 1), %eax > -# endif > - /* Remove the leading bytes. */ > - sarxl %eax, %r8d, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* Check the end of data. */ > - cmpq %rsi, %rdx > - jbe L(first_vec_x0) > + /* We can't use end of the buffer to re-calculate length for > + wmemchr as len * CHAR_SIZE may overflow. */ > + leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > + sarq $2, %rax > + addq %rdx, %rax > +# else > + leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > # endif > - testl %eax, %eax > - jz L(cross_page_continue) > - tzcntl %eax, %eax > + > + /* rax contains remaining length - 1. 
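[On the "len * CHAR_SIZE may overflow" remark in the wmemchr branch above: this is why the count is kept in characters rather than being converted to a byte end-pointer up front. A trivial illustration with a hypothetical value; the wraparound is well-defined for unsigned arithmetic but useless as an end offset:]

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      size_t len = SIZE_MAX / 2;  /* Huge but representable wmemchr length.  */
      size_t bytes = len * 4;     /* sizeof (wchar_t) on x86-64: wraps.  */
      printf ("chars %zu -> bytes %zu\n", len, bytes);
      return 0;
    }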
-1 so we can get imm8 > + encoding in a few additional places saving code size. */ > + > + /* Needed regardless of remaining length. */ > + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRDX > + > + /* We cannot fold the above `sub %rdi, %rax` with the `cmp > + $(CHAR_PER_VEC * 2), %rax` because its possible for a very > + large length to overflow and cause the subtract to carry > + despite length being above CHAR_PER_VEC * 2. */ > + cmpq $(CHAR_PER_VEC * 2 - 1), %rax > + ja L(more_2x_vec) > +L(last_2x_vec): > + > + test %VRDX, %VRDX > + jnz L(first_vec_x1_check) > + > + /* Check the end of data. NB: use 8-bit operations to save code > + size. We no longer need the full-width of eax and will > + perform a write-only operation over eax so there will be no > + partial-register stalls. */ > + subb $(CHAR_PER_VEC * 1 - 1), %al > + jle L(zero_0) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > # ifdef USE_AS_WMEMCHR > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax > + /* For wmemchr against we can't take advantage of tzcnt(0) == > + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */ > + test %VRCX, %VRCX > + jz L(zero_0) > +# endif > + tzcnt %VRCX, %VRCX > + cmp %cl, %al > + > + /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32. We give > + fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is > + not enough space before the next cache line to fit the `lea` > + for return. */ > +# if VEC_SIZE == 64 > + ja L(first_vec_x2_ret) > +L(zero_0): > + xorl %eax, %eax > + ret > # else > - addq %RAW_PTR_REG, %rax > + jbe L(zero_0) > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > # endif > + > + .p2align 4,, 5 > +L(first_vec_x1_check): > + bsf %VRDX, %VRDX > + cmpb %dl, %al > + jb L(zero_4) > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > ret > > - .p2align 4 > -L(first_vec_x1): > - tzcntl %eax, %eax > - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax > + /* Fits at the end of the cache line here for VEC_SIZE == 32. > + */ > +# if VEC_SIZE == 32 > +L(zero_4): > + xorl %eax, %eax > ret > +# endif > > - .p2align 4 > + > + .p2align 4,, 4 > L(first_vec_x2): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + bsf %VRCX, %VRCX > +L(first_vec_x2_ret): > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > ret > > - .p2align 4 > -L(first_vec_x3): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + /* Fits at the end of the cache line here for VEC_SIZE == 64. > + */ > +# if VEC_SIZE == 64 > +L(zero_4): > + xorl %eax, %eax > ret > +# endif > > - .p2align 4 > -L(first_vec_x4): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > + .p2align 4,, 4 > +L(first_vec_x1): > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > ret > > - .p2align 5 > -L(aligned_more): > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > > -# ifndef USE_AS_RAWMEMCHR > - /* Align data to VEC_SIZE. */ > -L(cross_page_continue): > - xorl %ecx, %ecx > - subl %edi, %ecx > - andq $-VEC_SIZE, %rdi > - /* esi is for adjusting length to see if near the end. */ > - leal (VEC_SIZE * 5)(%rdi, %rcx), %esi > -# ifdef USE_AS_WMEMCHR > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarl $2, %esi > -# endif > -# else > - andq $-VEC_SIZE, %rdi > -L(cross_page_continue): > -# endif > - /* Load first VEC regardless. 
*/ > - VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* Adjust length. If near end handle specially. */ > - subq %rsi, %rdx > - jbe L(last_4x_vec_or_less) > -# endif > - testl %eax, %eax > + .p2align 4,, 5 > +L(more_2x_vec): > + /* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking > + length. */ > + > + > + /* Already computed matches for first VEC in rdx. */ > + test %VRDX, %VRDX > jnz L(first_vec_x1) > > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > jnz L(first_vec_x2) > > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + /* Needed regardless of next length check. */ > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + > + /* Check if we are near the end. */ > + cmpq $(CHAR_PER_VEC * 4 - 1), %rax > + ja L(more_4x_vec) > + > + test %VRCX, %VRCX > + jnz L(first_vec_x3_check) > + > + /* Use 8-bit instructions to save code size. We won't use full- > + width eax again and will perform a write-only operation to > + eax so no worries about partial-register stalls. */ > + subb $(CHAR_PER_VEC * 3), %al > + jb L(zero_2) > +L(last_vec_check): > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > +# ifdef USE_AS_WMEMCHR > + /* For wmemchr against we can't take advantage of tzcnt(0) == > + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */ > + test %VRCX, %VRCX > + jz L(zero_2) > +# endif > + tzcnt %VRCX, %VRCX > + cmp %cl, %al > + jae L(first_vec_x4_ret) > +L(zero_2): > + xorl %eax, %eax > + ret > + > + /* Fits at the end of the cache line here for VEC_SIZE == 64. > + For VEC_SIZE == 32 we put the return label at the end of > + L(first_vec_x4). */ > +# if VEC_SIZE == 64 > +L(first_vec_x4_ret): > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > +# endif > + > + .p2align 4,, 6 > +L(first_vec_x4): > + bsf %VRCX, %VRCX > +# if VEC_SIZE == 32 > + /* Place L(first_vec_x4_ret) here as we can't fit it in the same > + cache line as where it is called from so we might as well > + save code size by reusing return of L(first_vec_x4). */ > +L(first_vec_x4_ret): > +# endif > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > + > + .p2align 4,, 6 > +L(first_vec_x3_check): > + /* Need to adjust remaining length before checking. */ > + addb $-(CHAR_PER_VEC * 2), %al > + bsf %VRCX, %VRCX > + cmpb %cl, %al > + jb L(zero_2) > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > + > + .p2align 4,, 6 > +L(first_vec_x3): > + bsf %VRCX, %VRCX > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > + > + .p2align 4,, 3 > +# if !USE_TERN_IN_LOOP > + .p2align 4,, 10 > +# endif > +L(more_4x_vec): > + test %VRCX, %VRCX > jnz L(first_vec_x3) > > - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > jnz L(first_vec_x4) > > + subq $-(VEC_SIZE * 5), %rdi > + subq $(CHAR_PER_VEC * 8), %rax > + jb L(last_4x_vec) > > -# ifndef USE_AS_RAWMEMCHR > - /* Check if at last CHAR_PER_VEC * 4 length. */ > - subq $(CHAR_PER_VEC * 4), %rdx > - jbe L(last_4x_vec_or_less_cmpeq) > - /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */ > - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi > - > - /* Align data to VEC_SIZE * 4 for the loop and readjust length. 
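[The loop setup here, in both the old and the new code, aligns the data pointer down to a 4 * VEC_SIZE boundary and folds the distance it backed up over into the remaining length so the end condition is unchanged. A small sketch of that bookkeeping, assuming VEC_SIZE == 32 and byte-sized characters (align_for_loop is an illustrative name):]

    #include <stdint.h>
    #include <stddef.h>

    /* Align *p down to a 4*VEC boundary for the unrolled loop and grow
       *remaining by the number of bytes we stepped back over.  */
    static void
    align_for_loop (const char **p, size_t *remaining)
    {
      uintptr_t addr = (uintptr_t) *p;
      size_t backup = addr & (4 * 32 - 1);
      *p = (const char *) (addr - backup);
      *remaining += backup;
    }

    int
    main (void)
    {
      static char buf[256] __attribute__ ((aligned (128)));
      const char *p = buf + 37;
      size_t rem = 100;
      align_for_loop (&p, &rem);
      return !(p == buf && rem == 137);  /* Expect exit status 0.  */
    }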
> - */ > -# ifdef USE_AS_WMEMCHR > +# ifdef USE_AS_WMEMCHR > movl %edi, %ecx > - andq $-(4 * VEC_SIZE), %rdi > +# else > + addq %rdi, %rax > +# endif > + > + > +# if VEC_SIZE == 64 > + /* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`. No evex > + processor has partial register stalls (all have merging > + uop). If that changes this can be removed. */ > + xorb %dil, %dil > +# else > + andq $-(VEC_SIZE * 4), %rdi > +# endif > + > +# ifdef USE_AS_WMEMCHR > subl %edi, %ecx > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > sarl $2, %ecx > - addq %rcx, %rdx > -# else > - addq %rdi, %rdx > - andq $-(4 * VEC_SIZE), %rdi > - subq %rdi, %rdx > -# endif > + addq %rcx, %rax > # else > - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi > - andq $-(4 * VEC_SIZE), %rdi > + subq %rdi, %rax > # endif > -# ifdef USE_IN_RTM > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > -# else > - /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not > - encodable with EVEX registers (ymm16-ymm31). */ > - vmovdqa64 %YMMMATCH, %ymm0 > + > + > + > +# if USE_TERN_IN_LOOP > + /* copy VMATCH to low ymm so we can use vpcmpeq which is not > + encodable with EVEX registers. NB: this is VEC_SIZE == 32 > + only as there is no way to encode vpcmpeq with zmm0-15. */ > + vmovdqa64 %VMATCH, %VMATCH_LO > # endif > > - /* Compare 4 * VEC at a time forward. */ > - .p2align 4 > + .p2align 4,, 11 > L(loop_4x_vec): > - /* Two versions of the loop. One that does not require > - vzeroupper by not using ymm0-ymm15 and another does that require > - vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15 > - is used at all is because there is no EVEX encoding vpcmpeq and > - with vpcmpeq this loop can be performed more efficiently. The > - non-vzeroupper version is safe for RTM while the vzeroupper > - version should be prefered if RTM are not supported. */ > -# ifdef USE_IN_RTM > - /* It would be possible to save some instructions using 4x VPCMP > - but bottleneck on port 5 makes it not woth it. */ > - VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 > - /* xor will set bytes match esi to zero. */ > - vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 > - vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 > - VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 > - /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ > - VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} > - VPCMP $0, %YMM3, %YMMZERO, %k2 > -# else > + /* Two versions of the loop. One that does not require > + vzeroupper by not using ymmm0-15 and another does that > + require vzeroupper because it uses ymmm0-15. The reason why > + ymm0-15 is used at all is because there is no EVEX encoding > + vpcmpeq and with vpcmpeq this loop can be performed more > + efficiently. The non-vzeroupper version is safe for RTM > + while the vzeroupper version should be prefered if RTM are > + not supported. Which loop version we use is determined by > + USE_TERN_IN_LOOP. */ > + > +# if USE_TERN_IN_LOOP > /* Since vptern can only take 3x vectors fastest to do 1 vec > seperately with EVEX vpcmp. */ > # ifdef USE_AS_WMEMCHR > /* vptern can only accept masks for epi32/epi64 so can only save > - instruction using not equals mask on vptern with wmemchr. */ > - VPCMP $4, (%rdi), %YMMMATCH, %k1 > + instruction using not equals mask on vptern with wmemchr. > + */ > + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 > # else > - VPCMP $0, (%rdi), %YMMMATCH, %k1 > + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 > # endif > /* Compare 3x with vpcmpeq and or them all together with vptern. 
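[The vpternlogd immediate used in this loop is what lets the VEX variant fold three compare results into one register: imm8 254 computes a | b | c per bit. A standalone intrinsics sketch of that reduction (any_match_96 is an illustrative name; needs AVX2 plus AVX512F/VL for _mm256_ternarylogic_epi32, e.g. -mavx512vl):]

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* OR the byte-compare results of three 32-byte vectors with a single
       vpternlogd (254 == 0b11111110: 0 only when all three inputs are 0),
       then turn it into an "any match in these 96 bytes" bitmask.  */
    static uint32_t
    any_match_96 (const char *p, char c)
    {
      __m256i match = _mm256_set1_epi8 (c);
      __m256i v1 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) p), match);
      __m256i v2 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) (p + 32)), match);
      __m256i v3 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) (p + 64)), match);
      __m256i any = _mm256_ternarylogic_epi32 (v1, v2, v3, 254);
      return (uint32_t) _mm256_movemask_epi8 (any);
    }

    int
    main (void)
    {
      char buf[96];
      memset (buf, 'a', sizeof buf);
      buf[70] = 'z';
      printf ("%d\n", any_match_96 (buf, 'z') != 0);  /* 1 */
      return 0;
    }

[The combined mask only says whether anything matched in the three vectors; on a hit the loop's return path then narrows down which vector it was.]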
> */ > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 > + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2) > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) > # ifdef USE_AS_WMEMCHR > - /* This takes the not of or between ymm2, ymm3, ymm4 as well as > - combines result from VEC0 with zero mask. */ > - vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z} > - vpmovmskb %ymm4, %ecx > + /* This takes the not of or between VEC_lo(2), VEC_lo(3), > + VEC_lo(4) as well as combines result from VEC(0) with zero > + mask. */ > + vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z} > + vpmovmskb %VMM_lo(4), %VRCX > # else > - /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */ > - vpternlogd $254, %ymm2, %ymm3, %ymm4 > - vpmovmskb %ymm4, %ecx > - kmovd %k1, %eax > + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into > + VEC_lo(4). */ > + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) > + vpmovmskb %VMM_lo(4), %VRCX > + KMOV %k1, %edx > # endif > -# endif > > -# ifdef USE_AS_RAWMEMCHR > - subq $-(VEC_SIZE * 4), %rdi > -# endif > -# ifdef USE_IN_RTM > - kortestd %k2, %k3 > # else > -# ifdef USE_AS_WMEMCHR > - /* ecx contains not of matches. All 1s means no matches. incl will > - overflow and set zeroflag if that is the case. */ > - incl %ecx > -# else > - /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding > - to ecx is not an issue because if eax is non-zero it will be > - used for returning the match. If it is zero the add does > - nothing. */ > - addq %rax, %rcx > -# endif > + /* Loop version that uses EVEX encoding. */ > + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 > + vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2) > + vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3) > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPTESTN %VMM(3), %VMM(3), %k2 > # endif > -# ifdef USE_AS_RAWMEMCHR > - jz L(loop_4x_vec) > -# else > - jnz L(loop_4x_vec_end) > + > + > + TEST_END () > + jnz L(loop_vec_ret) > > subq $-(VEC_SIZE * 4), %rdi > > - subq $(CHAR_PER_VEC * 4), %rdx > - ja L(loop_4x_vec) > + subq $(CHAR_PER_VEC * 4), %rax > + jae L(loop_4x_vec) > > - /* Fall through into less than 4 remaining vectors of length case. > + /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop. > */ > - VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0 > - addq $(BASE_OFFSET - VEC_SIZE), %rdi > - kmovd %k0, %eax > - VZEROUPPER > - > -L(last_4x_vec_or_less): > - /* Check if first VEC contained match. */ > - testl %eax, %eax > - jnz L(first_vec_x1_check) > + COND_VZEROUPPER > > - /* If remaining length > CHAR_PER_VEC * 2. */ > - addl $(CHAR_PER_VEC * 2), %edx > - jg L(last_4x_vec) > - > -L(last_2x_vec): > - /* If remaining length < CHAR_PER_VEC. */ > - addl $CHAR_PER_VEC, %edx > - jle L(zero_end) > - > - /* Check VEC2 and compare any match with remaining length. */ > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - tzcntl %eax, %eax > - cmpl %eax, %edx > - jbe L(set_zero_end) > - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > -L(zero_end): > - ret > + .p2align 4,, 10 > +L(last_4x_vec): > + /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit > + instructions on eax from here on out. 
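[The EVEX loop variant above uses a different reduction: a byte of x ^ match is zero exactly at a match, vpminub of two such vectors is zero wherever either matched, zero-masking with the not-equal mask of the first VEC folds that vector in as well, and vptestnmb turns "byte == 0" into a mask bit that KORTEST can check together with the fourth VEC's compare mask. A sketch with AVX512VL/BW intrinsics, illustrative only (any_match_4x32 is a made-up name; compile with -mavx512vl -mavx512bw):]

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Nonzero if any of the four 32-byte vectors at p contains c,
       mirroring the vpxorq / vpminub{k}{z} / vptestnmb / kortest idea.  */
    static int
    any_match_4x32 (const char *p, char c)
    {
      __m256i match = _mm256_set1_epi8 (c);
      __m256i v0 = _mm256_loadu_si256 ((const __m256i *) p);
      __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (p + 32));
      __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (p + 64));
      __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (p + 96));

      __mmask32 k1 = _mm256_cmpneq_epi8_mask (v0, match);  /* Clear where VEC0 matches.  */
      __m256i x1 = _mm256_xor_si256 (v1, match);           /* Zero bytes at matches.  */
      __m256i x2 = _mm256_xor_si256 (v2, match);
      /* min (x1, x2), forced to 0 wherever VEC0 matched (k1 bit clear).  */
      __m256i m = _mm256_maskz_min_epu8 (k1, x1, x2);
      __mmask32 k2 = _mm256_testn_epi8_mask (m, m);        /* Bit set where byte == 0.  */
      __mmask32 k3 = _mm256_cmpeq_epi8_mask (v3, match);
      return !_kortestz_mask32_u8 (k2, k3);                /* Any bit set in k2 | k3?  */
    }

    int
    main (void)
    {
      char buf[128];
      memset (buf, 'a', sizeof buf);
      printf ("%d\n", any_match_4x32 (buf, 'z'));  /* 0 */
      buf[97] = 'z';
      printf ("%d\n", any_match_4x32 (buf, 'z'));  /* 1 */
      return 0;
    }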
*/ > +# if CHAR_PER_VEC != 64 > + andl $(CHAR_PER_VEC * 4 - 1), %eax > +# endif > + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0 > + subq $(VEC_SIZE * 1), %rdi > + KMOV %k0, %VRDX > + cmpb $(CHAR_PER_VEC * 2 - 1), %al > + jbe L(last_2x_vec) > + test %VRDX, %VRDX > + jnz L(last_vec_x1_novzero) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_x2_novzero) > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(first_vec_x3_check) > + > + subb $(CHAR_PER_VEC * 3), %al > + jae L(last_vec_check) > > -L(set_zero_end): > xorl %eax, %eax > ret > > - .p2align 4 > -L(first_vec_x1_check): > - /* eax must be non-zero. Use bsfl to save code size. */ > - bsfl %eax, %eax > - /* Adjust length. */ > - subl $-(CHAR_PER_VEC * 4), %edx > - /* Check if match within remaining length. */ > - cmpl %eax, %edx > - jbe L(set_zero_end) > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax > +# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP > +L(last_vec_x2_novzero): > + addq $VEC_SIZE, %rdi > +L(last_vec_x1_novzero): > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > ret > +# endif > > - .p2align 4 > -L(loop_4x_vec_end): > +# if CHAR_PER_VEC == 64 > + /* Since we can't combine the last 2x VEC when CHAR_PER_VEC == > + 64 it needs a seperate return label. */ > + .p2align 4,, 4 > +L(last_vec_x2): > +L(last_vec_x2_novzero): > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax > + ret > # endif > - /* rawmemchr will fall through into this if match was found in > - loop. */ > > -# if defined USE_IN_RTM || defined USE_AS_WMEMCHR > - /* k1 has not of matches with VEC1. */ > - kmovd %k1, %eax > -# ifdef USE_AS_WMEMCHR > - subl $((1 << CHAR_PER_VEC) - 1), %eax > -# else > - incl %eax > -# endif > + .p2align 4,, 4 > +L(loop_vec_ret): > +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > + KMOV %k1, %VRAX > + inc %MASK_GPR(rax) > # else > - /* eax already has matches for VEC1. */ > - testl %eax, %eax > + test %VRDX, %VRDX > # endif > - jnz L(last_vec_x1_return) > + jnz L(last_vec_x0) > > -# ifdef USE_IN_RTM > - VPCMP $0, %YMM2, %YMMZERO, %k0 > - kmovd %k0, %eax > + > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(2), %VRDX > # else > - vpmovmskb %ymm2, %eax > + VPTESTN %VMM(2), %VMM(2), %k1 > + KMOV %k1, %VRDX > # endif > - testl %eax, %eax > - jnz L(last_vec_x2_return) > + test %VRDX, %VRDX > + jnz L(last_vec_x1) > > -# ifdef USE_IN_RTM > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(last_vec_x3_return) > > - kmovd %k3, %eax > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(3), %VRDX > # else > - vpmovmskb %ymm3, %eax > - /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */ > - salq $VEC_SIZE, %rcx > - orq %rcx, %rax > - tzcntq %rax, %rax > - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax > - VZEROUPPER > + KMOV %k2, %VRDX > # endif > - ret > > - .p2align 4,, 10 > -L(last_vec_x1_return): > - tzcntl %eax, %eax > -# if defined USE_AS_WMEMCHR || RET_OFFSET != 0 > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax > + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper > + (only if used VEX encoded loop). */ > + COND_VZEROUPPER > + > + /* Seperate logic for CHAR_PER_VEC == 64 vs the rest. 
For > + CHAR_PER_VEC we test the last 2x VEC seperately, for > + CHAR_PER_VEC <= 32 we can combine the results from the 2x > + VEC in a single GPR. */ > +# if CHAR_PER_VEC == 64 > +# if USE_TERN_IN_LOOP > +# error "Unsupported" > +# endif > + > + > + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ > + test %VRDX, %VRDX > + jnz L(last_vec_x2) > + KMOV %k3, %VRDX > # else > - addq %rdi, %rax > + /* CHAR_PER_VEC <= 32 so we can combine the results from the > + last 2x VEC. */ > + > +# if !USE_TERN_IN_LOOP > + KMOV %k3, %VRCX > +# endif > + salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx > + addq %rcx, %rdx > +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > +L(last_vec_x2_novzero): > +# endif > # endif > - VZEROUPPER > + bsf %rdx, %rdx > + leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax > ret > > - .p2align 4 > -L(last_vec_x2_return): > - tzcntl %eax, %eax > - /* NB: Multiply bytes by RET_SCALE to get the wchar_t count > - if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and > - USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */ > - leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax > - VZEROUPPER > + .p2align 4,, 8 > +L(last_vec_x1): > + COND_VZEROUPPER > +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > +L(last_vec_x1_novzero): > +# endif > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax > ret > > -# ifdef USE_IN_RTM > - .p2align 4 > -L(last_vec_x3_return): > - tzcntl %eax, %eax > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax > + > + .p2align 4,, 4 > +L(last_vec_x0): > + COND_VZEROUPPER > + bsf %VGPR(GPR_X0), %VGPR(GPR_X0) > +# if GPR_X0_IS_RET > + addq %rdi, %rax > +# else > + leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax > +# endif > ret > + > + .p2align 4,, 6 > +L(page_cross): > + /* Need to preserve eax to compute inbound bytes we are > + checking. */ > +# ifdef USE_AS_WMEMCHR > + movl %eax, %ecx > +# else > + xorl %ecx, %ecx > + subl %eax, %ecx > # endif > > -# ifndef USE_AS_RAWMEMCHR > - .p2align 4,, 5 > -L(last_4x_vec_or_less_cmpeq): > - VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - subq $-(VEC_SIZE * 4), %rdi > - /* Check first VEC regardless. */ > - testl %eax, %eax > - jnz L(first_vec_x1_check) > + xorq %rdi, %rax > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 > + KMOV %k0, %VRAX > > - /* If remaining length <= CHAR_PER_VEC * 2. */ > - addl $(CHAR_PER_VEC * 2), %edx > - jle L(last_2x_vec) > +# ifdef USE_AS_WMEMCHR > + /* NB: Divide by CHAR_SIZE to shift out out of bounds bytes. */ > + shrl $2, %ecx > + andl $(CHAR_PER_VEC - 1), %ecx > +# endif > > - .p2align 4 > -L(last_4x_vec): > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > > + shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX > > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - /* Create mask for possible matches within remaining length. */ > -# ifdef USE_AS_WMEMCHR > - movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx > - bzhil %edx, %ecx, %ecx > -# else > - movq $-1, %rcx > - bzhiq %rdx, %rcx, %rcx > -# endif > - /* Test matches in data against length match. */ > - andl %ecx, %eax > - jnz L(last_vec_x3) > +# ifdef USE_AS_WMEMCHR > + negl %ecx > +# endif > > - /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after > - remaining length was found to be > CHAR_PER_VEC * 2. 
*/ > - subl $CHAR_PER_VEC, %edx > - jbe L(zero_end2) > + /* mask lower bits from ecx (negative eax) to get bytes till > + next VEC. */ > + andl $(CHAR_PER_VEC - 1), %ecx > > + /* Check if VEC is entirely contained in the remainder of the > + page. */ > + cmpq %rcx, %rdx > + jbe L(page_cross_ret) > > - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - /* Shift remaining length mask for last VEC. */ > -# ifdef USE_AS_WMEMCHR > - shrl $CHAR_PER_VEC, %ecx > -# else > - shrq $CHAR_PER_VEC, %rcx > -# endif > - andl %ecx, %eax > - jz L(zero_end2) > - bsfl %eax, %eax > - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > -L(zero_end2): > - ret > + /* Length crosses the page so if rax is zero (no matches) > + continue. */ > + test %VRAX, %VRAX > + jz L(page_cross_continue) > > -L(last_vec_x2): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + /* if rdx > rcx then any match here must be in [buf:buf + len]. > + */ > + tzcnt %VRAX, %VRAX > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + addq %rdi, %rax > +# endif > ret > > - .p2align 4 > -L(last_vec_x3): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + .p2align 4,, 2 > +L(page_cross_zero): > + xorl %eax, %eax > ret > + > + .p2align 4,, 4 > +L(page_cross_ret): > + /* Search is entirely contained in page cross case. */ > +# ifdef USE_AS_WMEMCHR > + test %VRAX, %VRAX > + jz L(page_cross_zero) > +# endif > + tzcnt %VRAX, %VRAX > + cmpl %eax, %edx > + jbe L(page_cross_zero) > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + addq %rdi, %rax > # endif > - /* 7 bytes from next cache line. */ > + ret > END (MEMCHR) > #endif > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > index deda1ca395..2073eaa620 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > @@ -1,3 +1,6 @@ > -#define MEMCHR __rawmemchr_evex_rtm > -#define USE_AS_RAWMEMCHR 1 > -#include "memchr-evex-rtm.S" > +#define RAWMEMCHR __rawmemchr_evex_rtm > + > +#define USE_IN_RTM 1 > +#define SECTION(p) p##.evex.rtm > + > +#include "rawmemchr-evex.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > index dc1c450699..dad54def2b 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > @@ -1,7 +1,308 @@ > -#ifndef RAWMEMCHR > -# define RAWMEMCHR __rawmemchr_evex > -#endif > -#define USE_AS_RAWMEMCHR 1 > -#define MEMCHR RAWMEMCHR > +/* rawmemchr optimized with 256-bit EVEX instructions. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
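[For readers less familiar with the interface the new rawmemchr-evex.S file implements: rawmemchr is the GNU extension that searches with no length bound, the caller guarantees the byte is present, so the implementation only ever has to worry about page boundaries, never an end pointer. A trivial reference version for comparison (rawmemchr_ref is just an illustrative name):]

    #include <stdio.h>

    /* Reference semantics only: the caller guarantees c occurs in s.  */
    static void *
    rawmemchr_ref (const void *s, int c)
    {
      const unsigned char *p = s;
      while (*p != (unsigned char) c)
        ++p;
      return (void *) p;
    }

    int
    main (void)
    {
      const char *s = "abcdef";
      printf ("%td\n", (char *) rawmemchr_ref (s, 'd') - s);  /* 3 */
      return 0;
    }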
*/ > + > +#include <isa-level.h> > +#include <sysdep.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > + > +# ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex > +# endif > + > + > +# define PC_SHIFT_GPR rdi > +# define REG_WIDTH VEC_SIZE > +# define VPTESTN vptestnmb > +# define VPBROADCAST vpbroadcastb > +# define VPMINU vpminub > +# define VPCMP vpcmpb > +# define VPCMPEQ vpcmpeqb > +# define CHAR_SIZE 1 > + > +# include "reg-macros.h" > + > +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 > + doesn't have VEX encoding), use VEX encoding in loop so we > + can use vpcmpeqb + vptern which is more efficient than the > + EVEX alternative. */ > +# if defined USE_IN_RTM || VEC_SIZE == 64 > +# undef COND_VZEROUPPER > +# undef VZEROUPPER_RETURN > +# undef VZEROUPPER > + > + > +# define COND_VZEROUPPER > +# define VZEROUPPER_RETURN ret > +# define VZEROUPPER > + > +# define USE_TERN_IN_LOOP 0 > +# else > +# define USE_TERN_IN_LOOP 1 > +# undef VZEROUPPER > +# define VZEROUPPER vzeroupper > +# endif > + > +# define CHAR_PER_VEC VEC_SIZE > + > +# if CHAR_PER_VEC == 64 > + > +# define TAIL_RETURN_LBL first_vec_x2 > +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) > + > +# define FALLTHROUGH_RETURN_LBL first_vec_x3 > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) > + > +# else /* !(CHAR_PER_VEC == 64) */ > + > +# define TAIL_RETURN_LBL first_vec_x3 > +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) > + > +# define FALLTHROUGH_RETURN_LBL first_vec_x2 > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) > +# endif /* !(CHAR_PER_VEC == 64) */ > + > + > +# define VMATCH VMM(0) > +# define VMATCH_LO VMM_lo(0) > + > +# define PAGE_SIZE 4096 > + > + .section SECTION(.text), "ax", @progbits > +ENTRY_P2ALIGN (RAWMEMCHR, 6) > + VPBROADCAST %esi, %VMATCH > + /* Check if we may cross page boundary with one vector load. */ > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + VPCMPEQ (%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + > + test %VRAX, %VRAX > + jz L(aligned_more) > +L(first_vec_x0): > + bsf %VRAX, %VRAX > + addq %rdi, %rax > + ret > + > + .p2align 4,, 4 > +L(first_vec_x4): > + bsf %VRAX, %VRAX > + leaq (VEC_SIZE * 4)(%rdi, %rax), %rax > + ret > > -#include "memchr-evex.S" > + /* For VEC_SIZE == 32 we can fit this in aligning bytes so might > + as well place it more locally. For VEC_SIZE == 64 we reuse > + return code at the end of loop's return. */ > +# if VEC_SIZE == 32 > + .p2align 4,, 4 > +L(FALLTHROUGH_RETURN_LBL): > + bsf %VRAX, %VRAX > + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax > + ret > +# endif > + > + .p2align 4,, 6 > +L(page_cross): > + /* eax has lower page-offset bits of rdi so xor will zero them > + out. */ > + xorq %rdi, %rax > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 > + KMOV %k0, %VRAX > + > + /* Shift out out-of-bounds matches. */ > + shrx %VRDI, %VRAX, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x0) > + > + .p2align 4,, 10 > +L(aligned_more): > +L(page_cross_continue): > + /* Align pointer. 
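[The L(page_cross) sequence above (xorq to recover the page base, a load of the last VEC of the page, then shrx by the original pointer) can be modeled in C roughly as below. Assumptions: 4 KiB pages and a 32-byte VEC; first_vec_mask is an illustrative name; reading bytes before the start pointer is fine at the assembly level but formally out of bounds in C, so treat this strictly as a model (compile with -mavx2).]

    #include <immintrin.h>
    #include <stdint.h>

    /* Match mask for the bytes from p up to the end of p's page: if a full
       32-byte load would cross into the next page, load the last 32 bytes
       of the current page instead and shift out the bytes before p.  A
       zero result only means "no match before the page end"; the caller
       must continue on the next page.  */
    static uint32_t
    first_vec_mask (const char *p, char c)
    {
      uintptr_t addr = (uintptr_t) p;
      __m256i match = _mm256_set1_epi8 (c);
      if ((addr & 4095) <= 4096 - 32)
        {
          __m256i v = _mm256_loadu_si256 ((const __m256i *) p);
          return (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, match));
        }
      const char *last_vec = (const char *) ((addr & ~(uintptr_t) 4095) + 4096 - 32);
      __m256i v = _mm256_loadu_si256 ((const __m256i *) last_vec);
      uint32_t mask
        = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, match));
      /* Like shrx %rdi: only the low 5 bits of the pointer matter for a
         32-bit shift, and here they equal p - last_vec.  */
      return mask >> (addr & 31);
    }

    int
    main (void)
    {
      static char buf[8192] __attribute__ ((aligned (4096)));
      buf[4095] = 'z';  /* Last byte of the first page.  */
      return first_vec_mask (buf + 4090, 'z') == 0;  /* Expect exit status 0.  */
    }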
*/ > + andq $(VEC_SIZE * -1), %rdi > + > + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x1) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x2) > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x3) > + > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x4) > + > + subq $-(VEC_SIZE * 1), %rdi > +# if VEC_SIZE == 64 > + /* Saves code size. No evex512 processor has partial register > + stalls. If that change this can be replaced with `andq > + $-(VEC_SIZE * 4), %rdi`. */ > + xorb %dil, %dil > +# else > + andq $-(VEC_SIZE * 4), %rdi > +# endif > + > +# if USE_TERN_IN_LOOP > + /* copy VMATCH to low ymm so we can use vpcmpeq which is not > + encodable with EVEX registers. NB: this is VEC_SIZE == 32 > + only as there is no way to encode vpcmpeq with zmm0-15. */ > + vmovdqa64 %VMATCH, %VMATCH_LO > +# endif > + > + .p2align 4 > +L(loop_4x_vec): > + /* Two versions of the loop. One that does not require > + vzeroupper by not using ymm0-15 and another does that > + require vzeroupper because it uses ymm0-15. The reason why > + ymm0-15 is used at all is because there is no EVEX encoding > + vpcmpeq and with vpcmpeq this loop can be performed more > + efficiently. The non-vzeroupper version is safe for RTM > + while the vzeroupper version should be prefered if RTM are > + not supported. Which loop version we use is determined by > + USE_TERN_IN_LOOP. */ > + > +# if USE_TERN_IN_LOOP > + /* Since vptern can only take 3x vectors fastest to do 1 vec > + seperately with EVEX vpcmp. */ > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 > + /* Compare 3x with vpcmpeq and or them all together with vptern. > + */ > + > + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2) > + subq $(VEC_SIZE * -4), %rdi > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) > + > + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into > + VEC_lo(4). */ > + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) > + vpmovmskb %VMM_lo(4), %VRCX > + > + KMOV %k1, %eax > + > + /* NB: rax has match from first VEC and rcx has matches from > + VEC 2-4. If rax is non-zero we will return that match. If > + rax is zero adding won't disturb the bits in rcx. */ > + add %rax, %rcx > +# else > + /* Loop version that uses EVEX encoding. */ > + VPCMP $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 > + vpxorq (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2) > + vpxorq (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3) > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPTESTN %VMM(3), %VMM(3), %k2 > + subq $(VEC_SIZE * -4), %rdi > + KORTEST %k2, %k3 > +# endif > + jz L(loop_4x_vec) > + > +# if USE_TERN_IN_LOOP > + test %VRAX, %VRAX > +# else > + KMOV %k1, %VRAX > + inc %VRAX > +# endif > + jnz L(last_vec_x0) > + > + > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(2), %VRAX > +# else > + VPTESTN %VMM(2), %VMM(2), %k1 > + KMOV %k1, %VRAX > +# endif > + test %VRAX, %VRAX > + jnz L(last_vec_x1) > + > + > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(3), %VRAX > +# else > + KMOV %k2, %VRAX > +# endif > + > + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper > + (only if used VEX encoded loop). */ > + COND_VZEROUPPER > + > + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for > + returning last 2x VEC. 
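[On the point made in the comments here about combining the results of the last 2x VEC in a single GPR when VEC_SIZE == 32: the two 32-bit masks fit in one 64-bit register, so a single shift-and-add followed by one bsf finds the first match across both vectors, as in this sketch (first_match_last_2x is an illustrative name):]

    #include <stdint.h>
    #include <stdio.h>

    /* mask_lo covers VEC N, mask_hi covers VEC N+1; returns the byte
       offset of the first match relative to VEC N's base, or -1 if none.
       Mirrors the salq $32 / addq / bsf sequence.  */
    static long
    first_match_last_2x (uint32_t mask_lo, uint32_t mask_hi)
    {
      uint64_t both = ((uint64_t) mask_hi << 32) | mask_lo;
      if (both == 0)
        return -1;
      return __builtin_ctzll (both);
    }

    int
    main (void)
    {
      printf ("%ld\n", first_match_last_2x (0, 1u << 3));        /* 35 */
      printf ("%ld\n", first_match_last_2x (1u << 7, 1u << 3));  /* 7 */
      return 0;
    }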
For VEC_SIZE == 64 we test each VEC > + individually, for VEC_SIZE == 32 we combine them in a single > + 64-bit GPR. */ > +# if CHAR_PER_VEC == 64 > +# if USE_TERN_IN_LOOP > +# error "Unsupported" > +# endif > + > + > + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ > + test %VRAX, %VRAX > + jnz L(first_vec_x2) > + KMOV %k3, %VRAX > +L(FALLTHROUGH_RETURN_LBL): > +# else > + /* CHAR_PER_VEC <= 32 so we can combine the results from the > + last 2x VEC. */ > +# if !USE_TERN_IN_LOOP > + KMOV %k3, %VRCX > +# endif > + salq $CHAR_PER_VEC, %rcx > + addq %rcx, %rax > +# endif > + bsf %rax, %rax > + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax > + ret > + > + .p2align 4,, 8 > +L(TAIL_RETURN_LBL): > + bsf %rax, %rax > + leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax > + ret > + > + .p2align 4,, 8 > +L(last_vec_x1): > + COND_VZEROUPPER > +L(first_vec_x1): > + bsf %VRAX, %VRAX > + leaq (VEC_SIZE * 1)(%rdi, %rax), %rax > + ret > + > + .p2align 4,, 8 > +L(last_vec_x0): > + COND_VZEROUPPER > + bsf %VRAX, %VRAX > + addq %rdi, %rax > + ret > +END (RAWMEMCHR) > +#endif > -- > 2.34.1 > Results For: rawmemchr alignment,char ,length ,__rawmemchr_evex ,__rawmemchr_evex_orig 0 ,0 ,1 ,2.878 ,2.891 ,0.996 0 ,0 ,10 ,2.876 ,2.886 ,0.997 0 ,0 ,1024 ,22.832 ,23.58 ,0.968 0 ,0 ,11 ,2.886 ,2.887 ,0.999 0 ,0 ,12 ,2.864 ,2.871 ,0.998 0 ,0 ,128 ,5.816 ,6.014 ,0.967 0 ,0 ,13 ,2.854 ,2.863 ,0.997 0 ,0 ,14 ,2.886 ,2.865 ,1.007 0 ,0 ,15 ,2.863 ,2.886 ,0.992 0 ,0 ,16 ,2.859 ,2.857 ,1.001 0 ,0 ,17 ,2.848 ,2.881 ,0.988 0 ,0 ,18 ,2.854 ,2.865 ,0.996 0 ,0 ,19 ,2.878 ,2.872 ,1.002 0 ,0 ,2 ,2.887 ,2.9 ,0.995 0 ,0 ,20 ,2.857 ,2.862 ,0.998 0 ,0 ,21 ,2.861 ,2.86 ,1.0 0 ,0 ,22 ,2.854 ,2.873 ,0.993 0 ,0 ,23 ,2.872 ,2.861 ,1.004 0 ,0 ,24 ,2.853 ,2.855 ,0.999 0 ,0 ,25 ,2.85 ,2.853 ,0.999 0 ,0 ,256 ,10.355 ,10.703 ,0.968 0 ,0 ,26 ,2.86 ,2.853 ,1.002 0 ,0 ,27 ,2.846 ,2.861 ,0.995 0 ,0 ,28 ,2.849 ,2.861 ,0.996 0 ,0 ,29 ,2.867 ,2.868 ,1.0 0 ,0 ,3 ,2.863 ,2.892 ,0.99 0 ,0 ,30 ,2.855 ,2.869 ,0.995 0 ,0 ,31 ,2.842 ,2.867 ,0.991 0 ,0 ,32 ,4.245 ,4.28 ,0.992 0 ,0 ,4 ,2.875 ,2.894 ,0.994 0 ,0 ,5 ,2.887 ,2.893 ,0.998 0 ,0 ,512 ,14.736 ,15.229 ,0.968 0 ,0 ,6 ,2.876 ,2.868 ,1.003 0 ,0 ,64 ,4.957 ,4.968 ,0.998 0 ,0 ,7 ,2.893 ,2.88 ,1.004 0 ,0 ,8 ,2.856 ,2.867 ,0.996 0 ,0 ,9 ,2.872 ,2.885 ,0.996 0 ,23 ,1 ,2.826 ,2.859 ,0.988 0 ,23 ,10 ,2.861 ,2.876 ,0.995 0 ,23 ,1023 ,21.322 ,22.016 ,0.968 0 ,23 ,1024 ,22.76 ,23.532 ,0.967 0 ,23 ,11 ,2.872 ,2.875 ,0.999 0 ,23 ,12 ,2.872 ,2.881 ,0.997 0 ,23 ,127 ,5.293 ,5.38 ,0.984 0 ,23 ,1279 ,24.974 ,25.923 ,0.963 0 ,23 ,128 ,5.904 ,5.683 ,1.039 0 ,23 ,1280 ,26.229 ,27.041 ,0.97 0 ,23 ,13 ,2.878 ,2.87 ,1.003 0 ,23 ,14 ,2.843 ,2.87 ,0.991 0 ,23 ,15 ,2.864 ,2.873 ,0.997 0 ,23 ,1535 ,28.787 ,29.899 ,0.963 0 ,23 ,1536 ,30.286 ,31.62 ,0.958 0 ,23 ,159 ,6.12 ,6.081 ,1.006 0 ,23 ,16 ,2.879 ,2.868 ,1.004 0 ,23 ,160 ,8.965 ,9.035 ,0.992 0 ,23 ,17 ,2.861 ,2.884 ,0.992 0 ,23 ,1791 ,32.274 ,33.92 ,0.951 0 ,23 ,1792 ,33.701 ,35.386 ,0.952 0 ,23 ,18 ,2.861 ,2.873 ,0.996 0 ,23 ,19 ,2.848 ,2.865 ,0.994 0 ,23 ,191 ,8.858 ,9.03 ,0.981 0 ,23 ,192 ,9.255 ,9.801 ,0.944 0 ,23 ,2 ,2.889 ,2.897 ,0.997 0 ,23 ,20 ,2.843 ,2.846 ,0.999 0 ,23 ,2047 ,36.33 ,37.384 ,0.972 0 ,23 ,2048 ,37.147 ,38.863 ,0.956 0 ,23 ,21 ,2.855 ,2.86 ,0.998 0 ,23 ,22 ,2.843 ,2.846 ,0.999 0 ,23 ,223 ,8.993 ,9.551 ,0.942 0 ,23 ,224 ,9.1 ,9.656 ,0.942 0 ,23 ,23 ,2.847 ,2.852 ,0.998 0 ,23 ,24 ,2.854 ,2.854 ,1.0 0 ,23 ,25 ,2.863 ,2.873 ,0.996 0 ,23 ,255 ,9.087 ,9.693 ,0.938 0 ,23 ,2559 ,50.009 ,57.564 ,0.869 0 ,23 ,256 ,10.385 ,10.78 ,0.963 0 ,23 ,2560 
,44.992 ,49.487 ,0.909 0 ,23 ,26 ,2.859 ,2.86 ,0.999 0 ,23 ,27 ,2.856 ,2.861 ,0.998 0 ,23 ,28 ,2.862 ,2.853 ,1.003 0 ,23 ,29 ,2.853 ,2.851 ,1.001 0 ,23 ,3 ,2.89 ,2.917 ,0.991 0 ,23 ,30 ,2.871 ,2.888 ,0.994 0 ,23 ,3071 ,70.078 ,66.366 ,1.056 0 ,23 ,3072 ,51.136 ,54.752 ,0.934 0 ,23 ,31 ,2.848 ,2.857 ,0.997 0 ,23 ,319 ,10.808 ,11.072 ,0.976 0 ,23 ,32 ,4.202 ,4.195 ,1.002 0 ,23 ,320 ,11.071 ,11.839 ,0.935 0 ,23 ,3583 ,82.389 ,81.245 ,1.014 0 ,23 ,3584 ,58.072 ,62.416 ,0.93 0 ,23 ,383 ,11.152 ,11.866 ,0.94 0 ,23 ,384 ,12.533 ,12.761 ,0.982 0 ,23 ,4 ,2.868 ,2.892 ,0.992 0 ,23 ,447 ,12.916 ,13.313 ,0.97 0 ,23 ,448 ,13.303 ,13.954 ,0.953 0 ,23 ,5 ,2.885 ,2.875 ,1.004 0 ,23 ,511 ,13.28 ,13.871 ,0.957 0 ,23 ,512 ,14.792 ,15.284 ,0.968 0 ,23 ,6 ,2.857 ,2.87 ,0.995 0 ,23 ,63 ,4.277 ,4.283 ,0.999 0 ,23 ,639 ,15.31 ,16.14 ,0.949 0 ,23 ,64 ,4.961 ,4.961 ,1.0 0 ,23 ,640 ,16.757 ,17.581 ,0.953 0 ,23 ,7 ,2.881 ,2.875 ,1.002 0 ,23 ,767 ,17.31 ,18.654 ,0.928 0 ,23 ,768 ,19.421 ,19.879 ,0.977 0 ,23 ,8 ,2.871 ,2.878 ,0.998 0 ,23 ,895 ,19.345 ,20.32 ,0.952 0 ,23 ,896 ,21.683 ,21.331 ,1.017 0 ,23 ,9 ,2.904 ,2.868 ,1.013 0 ,23 ,95 ,4.989 ,4.945 ,1.009 0 ,23 ,96 ,5.382 ,5.098 ,1.056 1 ,0 ,64 ,4.945 ,4.953 ,0.998 1 ,23 ,64 ,4.998 ,4.95 ,1.01 2 ,0 ,64 ,4.92 ,4.939 ,0.996 2 ,23 ,64 ,4.95 ,4.957 ,0.999 3 ,0 ,64 ,4.964 ,4.954 ,1.002 3 ,23 ,64 ,4.943 ,4.978 ,0.993 4 ,0 ,64 ,4.981 ,4.968 ,1.003 4 ,23 ,64 ,4.949 ,4.969 ,0.996 5 ,0 ,64 ,4.923 ,4.932 ,0.998 5 ,23 ,64 ,4.931 ,4.931 ,1.0 6 ,0 ,64 ,4.794 ,4.799 ,0.999 6 ,23 ,64 ,4.803 ,4.8 ,1.001 0.9859952989629946 Results For: memchr align,invert_pos ,len ,pos ,seek_char ,__memchr_evex ,__memchr_evex_orig 0 ,0 ,0 ,1 ,0 ,3.473 ,4.166 ,0.834 0 ,0 ,0 ,1 ,23 ,3.505 ,4.181 ,0.838 0 ,0 ,1 ,2 ,0 ,3.488 ,3.485 ,1.001 0 ,0 ,1 ,2 ,23 ,3.472 ,3.469 ,1.001 0 ,0 ,10 ,11 ,0 ,3.665 ,4.443 ,0.825 0 ,0 ,10 ,11 ,23 ,3.485 ,3.856 ,0.904 0 ,0 ,10 ,9 ,0 ,3.646 ,3.872 ,0.942 0 ,0 ,10 ,9 ,23 ,3.661 ,3.771 ,0.971 0 ,0 ,1024 ,1024 ,23 ,21.347 ,20.117 ,1.061 0 ,0 ,1024 ,1056 ,23 ,21.66 ,20.361 ,1.064 0 ,0 ,1024 ,1088 ,23 ,22.226 ,20.41 ,1.089 0 ,0 ,1024 ,1120 ,23 ,21.754 ,20.29 ,1.072 0 ,0 ,1024 ,1152 ,23 ,21.777 ,20.303 ,1.073 0 ,0 ,1024 ,1184 ,23 ,21.532 ,20.325 ,1.059 0 ,0 ,1024 ,1216 ,23 ,21.862 ,20.278 ,1.078 0 ,0 ,1024 ,1248 ,23 ,21.539 ,20.218 ,1.065 0 ,0 ,1024 ,1280 ,23 ,21.725 ,20.265 ,1.072 0 ,0 ,1024 ,1312 ,23 ,21.756 ,20.352 ,1.069 0 ,0 ,1024 ,1344 ,23 ,21.772 ,20.247 ,1.075 0 ,0 ,1024 ,1376 ,23 ,21.542 ,20.363 ,1.058 0 ,0 ,1024 ,1408 ,23 ,21.573 ,20.319 ,1.062 0 ,0 ,1024 ,640 ,23 ,16.42 ,16.53 ,0.993 0 ,0 ,1024 ,672 ,23 ,16.664 ,16.655 ,1.001 0 ,0 ,1024 ,704 ,23 ,17.763 ,17.228 ,1.031 0 ,0 ,1024 ,736 ,23 ,18.094 ,17.306 ,1.046 0 ,0 ,1024 ,768 ,23 ,18.683 ,18.971 ,0.985 0 ,0 ,1024 ,800 ,23 ,18.738 ,18.792 ,0.997 0 ,0 ,1024 ,832 ,23 ,19.831 ,19.277 ,1.029 0 ,0 ,1024 ,864 ,23 ,19.749 ,19.052 ,1.037 0 ,0 ,1024 ,896 ,23 ,20.025 ,19.218 ,1.042 0 ,0 ,1024 ,928 ,23 ,21.18 ,19.66 ,1.077 0 ,0 ,1024 ,960 ,23 ,20.96 ,21.487 ,0.975 0 ,0 ,1024 ,992 ,23 ,22.066 ,20.802 ,1.061 0 ,0 ,1056 ,1024 ,23 ,21.801 ,20.757 ,1.05 0 ,0 ,1088 ,1024 ,23 ,21.457 ,20.95 ,1.024 0 ,0 ,11 ,10 ,0 ,3.617 ,3.812 ,0.949 0 ,0 ,11 ,10 ,23 ,3.701 ,3.848 ,0.962 0 ,0 ,11 ,12 ,0 ,3.482 ,3.759 ,0.926 0 ,0 ,11 ,12 ,23 ,3.513 ,3.78 ,0.929 0 ,0 ,112 ,16 ,23 ,3.56 ,3.807 ,0.935 0 ,0 ,1120 ,1024 ,23 ,21.753 ,20.777 ,1.047 0 ,0 ,1152 ,1024 ,23 ,21.724 ,20.948 ,1.037 0 ,0 ,1184 ,1024 ,23 ,22.588 ,22.291 ,1.013 0 ,0 ,12 ,11 ,0 ,3.588 ,3.76 ,0.954 0 ,0 ,12 ,11 ,23 ,3.737 ,3.853 ,0.97 0 ,0 ,12 ,13 ,0 ,3.504 ,3.843 ,0.912 0 ,0 ,12 ,13 ,23 
,3.498 ,3.807 ,0.919 0 ,0 ,1216 ,1024 ,23 ,22.525 ,22.172 ,1.016 0 ,0 ,1248 ,1024 ,23 ,22.882 ,22.391 ,1.022 0 ,0 ,128 ,128 ,23 ,5.46 ,6.528 ,0.836 0 ,0 ,128 ,160 ,23 ,5.622 ,6.848 ,0.821 0 ,0 ,128 ,192 ,23 ,5.653 ,6.872 ,0.823 0 ,0 ,128 ,224 ,23 ,6.018 ,7.722 ,0.779 0 ,0 ,128 ,256 ,23 ,5.693 ,6.915 ,0.823 0 ,0 ,128 ,288 ,23 ,5.669 ,7.024 ,0.807 0 ,0 ,128 ,32 ,23 ,4.641 ,5.73 ,0.81 0 ,0 ,128 ,320 ,23 ,5.588 ,6.872 ,0.813 0 ,0 ,128 ,352 ,23 ,5.571 ,6.87 ,0.811 0 ,0 ,128 ,384 ,23 ,5.61 ,6.913 ,0.811 0 ,0 ,128 ,416 ,23 ,5.545 ,6.835 ,0.811 0 ,0 ,128 ,448 ,23 ,5.586 ,6.908 ,0.809 0 ,0 ,128 ,480 ,23 ,5.59 ,6.674 ,0.837 0 ,0 ,128 ,512 ,23 ,5.58 ,6.76 ,0.825 0 ,0 ,128 ,64 ,23 ,5.036 ,6.123 ,0.823 0 ,0 ,128 ,96 ,23 ,6.141 ,6.397 ,0.96 0 ,0 ,1280 ,1024 ,23 ,22.328 ,22.221 ,1.005 0 ,0 ,13 ,12 ,0 ,3.551 ,3.81 ,0.932 0 ,0 ,13 ,12 ,23 ,3.644 ,3.956 ,0.921 0 ,0 ,13 ,14 ,0 ,3.498 ,3.775 ,0.926 0 ,0 ,13 ,14 ,23 ,3.489 ,3.785 ,0.922 0 ,0 ,1312 ,1024 ,23 ,22.724 ,22.229 ,1.022 0 ,0 ,1344 ,1024 ,23 ,22.405 ,22.205 ,1.009 0 ,0 ,1376 ,1024 ,23 ,22.286 ,22.346 ,0.997 0 ,0 ,14 ,13 ,0 ,3.548 ,3.805 ,0.932 0 ,0 ,14 ,13 ,23 ,3.612 ,3.9 ,0.926 0 ,0 ,14 ,15 ,0 ,3.491 ,3.771 ,0.926 0 ,0 ,14 ,15 ,23 ,3.507 ,3.819 ,0.918 0 ,0 ,1408 ,1024 ,23 ,22.468 ,22.266 ,1.009 0 ,0 ,144 ,16 ,23 ,3.633 ,3.828 ,0.949 0 ,0 ,15 ,14 ,0 ,3.642 ,3.863 ,0.943 0 ,0 ,15 ,14 ,23 ,3.69 ,3.832 ,0.963 0 ,0 ,15 ,16 ,0 ,3.501 ,3.894 ,0.899 0 ,0 ,15 ,16 ,23 ,3.611 ,3.839 ,0.941 0 ,0 ,16 ,112 ,23 ,3.497 ,3.909 ,0.895 0 ,0 ,16 ,144 ,23 ,3.501 ,3.925 ,0.892 0 ,0 ,16 ,15 ,0 ,3.658 ,3.857 ,0.948 0 ,0 ,16 ,15 ,23 ,3.87 ,3.787 ,1.022 0 ,0 ,16 ,16 ,23 ,3.425 ,3.711 ,0.923 0 ,0 ,16 ,17 ,0 ,3.5 ,3.848 ,0.909 0 ,0 ,16 ,17 ,23 ,3.494 ,3.82 ,0.914 0 ,0 ,16 ,176 ,23 ,3.476 ,3.88 ,0.896 0 ,0 ,16 ,208 ,23 ,3.464 ,3.799 ,0.912 0 ,0 ,16 ,240 ,23 ,3.468 ,3.85 ,0.901 0 ,0 ,16 ,272 ,23 ,3.516 ,3.848 ,0.914 0 ,0 ,16 ,304 ,23 ,3.497 ,3.869 ,0.904 0 ,0 ,16 ,336 ,23 ,3.491 ,3.822 ,0.913 0 ,0 ,16 ,368 ,23 ,3.484 ,3.798 ,0.917 0 ,0 ,16 ,400 ,23 ,3.493 ,3.877 ,0.901 0 ,0 ,16 ,48 ,23 ,3.48 ,3.823 ,0.91 0 ,0 ,16 ,80 ,23 ,3.497 ,3.868 ,0.904 0 ,0 ,160 ,128 ,23 ,6.651 ,7.158 ,0.929 0 ,0 ,160 ,256 ,23 ,6.136 ,7.605 ,0.807 0 ,0 ,160 ,32 ,23 ,4.882 ,5.71 ,0.855 0 ,0 ,160 ,512 ,23 ,6.102 ,6.676 ,0.914 0 ,0 ,160 ,64 ,23 ,5.311 ,6.122 ,0.867 0 ,0 ,1664 ,2048 ,23 ,31.73 ,29.774 ,1.066 0 ,0 ,1696 ,2048 ,23 ,31.282 ,29.567 ,1.058 0 ,0 ,17 ,16 ,0 ,3.66 ,3.868 ,0.946 0 ,0 ,17 ,16 ,23 ,3.803 ,3.855 ,0.986 0 ,0 ,17 ,18 ,0 ,3.477 ,3.893 ,0.893 0 ,0 ,17 ,18 ,23 ,3.475 ,3.809 ,0.912 0 ,0 ,1728 ,2048 ,23 ,32.093 ,30.336 ,1.058 0 ,0 ,176 ,16 ,23 ,3.665 ,3.884 ,0.944 0 ,0 ,1760 ,2048 ,23 ,32.968 ,30.894 ,1.067 0 ,0 ,1792 ,2048 ,23 ,33.445 ,31.817 ,1.051 0 ,0 ,18 ,17 ,0 ,3.701 ,3.785 ,0.978 0 ,0 ,18 ,17 ,23 ,3.743 ,3.833 ,0.977 0 ,0 ,18 ,19 ,0 ,3.478 ,3.837 ,0.907 0 ,0 ,18 ,19 ,23 ,3.463 ,3.868 ,0.895 0 ,0 ,1824 ,2048 ,23 ,33.291 ,31.768 ,1.048 0 ,0 ,1856 ,2048 ,23 ,33.922 ,32.431 ,1.046 0 ,0 ,1888 ,2048 ,23 ,35.392 ,33.135 ,1.068 0 ,0 ,19 ,18 ,0 ,3.616 ,3.791 ,0.954 0 ,0 ,19 ,18 ,23 ,3.813 ,3.807 ,1.002 0 ,0 ,19 ,20 ,0 ,3.465 ,3.795 ,0.913 0 ,0 ,19 ,20 ,23 ,3.458 ,3.811 ,0.907 0 ,0 ,192 ,128 ,23 ,6.158 ,6.144 ,1.002 0 ,0 ,192 ,256 ,23 ,7.663 ,7.608 ,1.007 0 ,0 ,192 ,32 ,23 ,4.818 ,5.133 ,0.939 0 ,0 ,192 ,512 ,23 ,7.465 ,7.249 ,1.03 0 ,0 ,192 ,64 ,23 ,5.125 ,5.188 ,0.988 0 ,0 ,1920 ,2048 ,23 ,35.59 ,33.388 ,1.066 0 ,0 ,1952 ,2048 ,23 ,35.15 ,33.167 ,1.06 0 ,0 ,1984 ,2048 ,23 ,35.715 ,33.95 ,1.052 0 ,0 ,2 ,1 ,0 ,3.496 ,3.642 ,0.96 0 ,0 ,2 ,1 ,23 ,3.466 ,3.444 ,1.007 0 ,0 ,2 ,3 ,0 ,3.501 ,3.677 ,0.952 
0 ,0 ,2 ,3 ,23 ,3.553 ,3.604 ,0.986 0 ,0 ,20 ,19 ,0 ,3.573 ,3.804 ,0.939 0 ,0 ,20 ,19 ,23 ,3.815 ,3.834 ,0.995 0 ,0 ,20 ,21 ,0 ,3.481 ,3.778 ,0.921 0 ,0 ,20 ,21 ,23 ,3.481 ,3.833 ,0.908 0 ,0 ,2016 ,2048 ,23 ,36.429 ,34.281 ,1.063 0 ,0 ,2048 ,1024 ,0 ,23.047 ,22.507 ,1.024 0 ,0 ,2048 ,1024 ,23 ,22.719 ,22.414 ,1.014 0 ,0 ,2048 ,128 ,0 ,6.151 ,6.026 ,1.021 0 ,0 ,2048 ,128 ,23 ,6.186 ,6.083 ,1.017 0 ,0 ,2048 ,1664 ,23 ,32.613 ,31.399 ,1.039 0 ,0 ,2048 ,1696 ,23 ,32.519 ,31.396 ,1.036 0 ,0 ,2048 ,1728 ,23 ,34.272 ,32.097 ,1.068 0 ,0 ,2048 ,1760 ,23 ,33.56 ,32.092 ,1.046 0 ,0 ,2048 ,1792 ,23 ,34.325 ,35.3 ,0.972 0 ,0 ,2048 ,1824 ,23 ,34.551 ,33.401 ,1.034 0 ,0 ,2048 ,1856 ,23 ,35.717 ,34.195 ,1.044 0 ,0 ,2048 ,1888 ,23 ,35.653 ,34.074 ,1.046 0 ,0 ,2048 ,1920 ,23 ,35.127 ,33.787 ,1.04 0 ,0 ,2048 ,1952 ,23 ,37.31 ,33.955 ,1.099 0 ,0 ,2048 ,1984 ,23 ,36.119 ,36.15 ,0.999 0 ,0 ,2048 ,2016 ,23 ,37.774 ,35.764 ,1.056 0 ,0 ,2048 ,2048 ,0 ,37.794 ,35.197 ,1.074 0 ,0 ,2048 ,2048 ,23 ,37.135 ,34.502 ,1.076 0 ,0 ,2048 ,2080 ,23 ,37.593 ,34.836 ,1.079 0 ,0 ,2048 ,2112 ,23 ,37.494 ,34.934 ,1.073 0 ,0 ,2048 ,2144 ,23 ,37.47 ,35.042 ,1.069 0 ,0 ,2048 ,2176 ,23 ,37.51 ,34.77 ,1.079 0 ,0 ,2048 ,2208 ,23 ,37.512 ,34.873 ,1.076 0 ,0 ,2048 ,2240 ,23 ,37.81 ,35.223 ,1.073 0 ,0 ,2048 ,2272 ,23 ,37.648 ,34.795 ,1.082 0 ,0 ,2048 ,2304 ,23 ,37.628 ,34.938 ,1.077 0 ,0 ,2048 ,2336 ,23 ,37.607 ,34.815 ,1.08 0 ,0 ,2048 ,2368 ,23 ,37.661 ,34.828 ,1.081 0 ,0 ,2048 ,2400 ,23 ,37.711 ,34.934 ,1.08 0 ,0 ,2048 ,2432 ,23 ,37.428 ,34.937 ,1.071 0 ,0 ,2048 ,256 ,0 ,10.418 ,10.646 ,0.979 0 ,0 ,2048 ,256 ,23 ,10.448 ,10.688 ,0.978 0 ,0 ,2048 ,32 ,0 ,4.639 ,5.259 ,0.882 0 ,0 ,2048 ,32 ,23 ,4.822 ,5.232 ,0.922 0 ,0 ,2048 ,512 ,0 ,14.497 ,14.909 ,0.972 0 ,0 ,2048 ,512 ,23 ,14.652 ,14.994 ,0.977 0 ,0 ,2048 ,64 ,0 ,5.159 ,5.176 ,0.997 0 ,0 ,2048 ,64 ,23 ,5.135 ,5.157 ,0.996 0 ,0 ,208 ,16 ,23 ,3.6 ,3.935 ,0.915 0 ,0 ,2080 ,2048 ,23 ,37.366 ,35.59 ,1.05 0 ,0 ,21 ,20 ,0 ,3.618 ,3.93 ,0.921 0 ,0 ,21 ,20 ,23 ,3.826 ,3.756 ,1.019 0 ,0 ,21 ,22 ,0 ,3.456 ,3.754 ,0.92 0 ,0 ,21 ,22 ,23 ,3.421 ,3.825 ,0.895 0 ,0 ,2112 ,2048 ,23 ,37.713 ,35.722 ,1.056 0 ,0 ,2144 ,2048 ,23 ,37.058 ,35.878 ,1.033 0 ,0 ,2176 ,2048 ,23 ,37.001 ,35.798 ,1.034 0 ,0 ,22 ,21 ,0 ,3.53 ,3.708 ,0.952 0 ,0 ,22 ,21 ,23 ,3.705 ,3.821 ,0.97 0 ,0 ,22 ,23 ,0 ,3.385 ,3.744 ,0.904 0 ,0 ,22 ,23 ,23 ,3.6 ,4.397 ,0.819 0 ,0 ,2208 ,2048 ,23 ,37.641 ,37.406 ,1.006 0 ,0 ,224 ,128 ,23 ,6.174 ,6.209 ,0.994 0 ,0 ,224 ,256 ,23 ,8.043 ,8.168 ,0.985 0 ,0 ,224 ,32 ,23 ,5.2 ,5.013 ,1.037 0 ,0 ,224 ,512 ,23 ,7.923 ,7.845 ,1.01 0 ,0 ,224 ,64 ,23 ,5.059 ,5.266 ,0.961 0 ,0 ,2240 ,2048 ,23 ,38.457 ,37.305 ,1.031 0 ,0 ,2272 ,2048 ,23 ,38.433 ,37.216 ,1.033 0 ,0 ,23 ,22 ,0 ,3.593 ,3.725 ,0.964 0 ,0 ,23 ,22 ,23 ,3.689 ,3.827 ,0.964 0 ,0 ,23 ,24 ,0 ,3.422 ,3.765 ,0.909 0 ,0 ,23 ,24 ,23 ,3.445 ,3.745 ,0.92 0 ,0 ,2304 ,2048 ,23 ,37.974 ,37.383 ,1.016 0 ,0 ,2336 ,2048 ,23 ,38.69 ,37.569 ,1.03 0 ,0 ,2368 ,2048 ,23 ,38.716 ,37.644 ,1.028 0 ,0 ,24 ,23 ,0 ,3.549 ,3.806 ,0.932 0 ,0 ,24 ,23 ,23 ,3.738 ,3.762 ,0.994 0 ,0 ,24 ,25 ,0 ,3.342 ,3.681 ,0.908 0 ,0 ,24 ,25 ,23 ,3.341 ,3.823 ,0.874 0 ,0 ,240 ,16 ,23 ,3.642 ,3.859 ,0.944 0 ,0 ,2400 ,2048 ,23 ,38.162 ,37.283 ,1.024 0 ,0 ,2432 ,2048 ,23 ,38.212 ,37.582 ,1.017 0 ,0 ,25 ,24 ,0 ,3.61 ,3.795 ,0.951 0 ,0 ,25 ,24 ,23 ,3.695 ,3.769 ,0.98 0 ,0 ,25 ,26 ,0 ,3.351 ,3.7 ,0.906 0 ,0 ,25 ,26 ,23 ,3.322 ,3.734 ,0.89 0 ,0 ,256 ,128 ,23 ,6.204 ,6.079 ,1.02 0 ,0 ,256 ,160 ,23 ,7.927 ,7.624 ,1.04 0 ,0 ,256 ,192 ,23 ,7.865 ,7.782 ,1.011 0 ,0 ,256 ,224 ,23 ,8.83 ,8.766 ,1.007 0 ,0 
[Attached benchmark results, continued: one entry per benchmark configuration, each reporting the new time, old time, and new/old ratio. Geometric mean of new/old across all entries: 0.9281712548418259.]
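For anyone re-deriving the headline number: the reported 0.928 for memchr-evex.S is just the geometric mean of the per-entry new/old ratios above. A minimal sketch of that calculation (it assumes the ratios have been saved one per line to a hypothetical ratios.txt; this is illustration only, not part of the patch):

    # Geometric mean of new/old benchmark ratios.
    import math

    # Hypothetical input: the last column of the results above, one value per line.
    with open("ratios.txt") as f:
        ratios = [float(line) for line in f if line.strip()]

    geomean = math.exp(sum(math.log(r) for r in ratios) / len(ratios))
    print(geomean)   # ~0.928 for the memchr-evex.S results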
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index 0dd4f1dcce..23a1c0018e 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -21,17 +21,27 @@ #if ISA_SHOULD_BUILD (4) +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + # ifndef MEMCHR # define MEMCHR __memchr_evex # endif # ifdef USE_AS_WMEMCHR +# define PC_SHIFT_GPR rcx +# define VPTESTN vptestnmd # define VPBROADCAST vpbroadcastd # define VPMINU vpminud # define VPCMP vpcmpd # define VPCMPEQ vpcmpeqd # define CHAR_SIZE 4 + +# define USE_WIDE_CHAR # else +# define PC_SHIFT_GPR rdi +# define VPTESTN vptestnmb # define VPBROADCAST vpbroadcastb # define VPMINU vpminub # define VPCMP vpcmpb @@ -39,534 +49,661 @@ # define CHAR_SIZE 1 # endif - /* In the 4x loop the RTM and non-RTM versions have data pointer - off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater. - This is represented by BASE_OFFSET. As well because the RTM - version uses vpcmp which stores a bit per element compared where - the non-RTM version uses vpcmpeq which stores a bit per byte - compared RET_SCALE of CHAR_SIZE is only relevant for the RTM - version. */ -# ifdef USE_IN_RTM +# include "reg-macros.h" + + +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 + doesn't have VEX encoding), use VEX encoding in loop so we + can use vpcmpeqb + vptern which is more efficient than the + EVEX alternative. */ +# if defined USE_IN_RTM || VEC_SIZE == 64 +# undef COND_VZEROUPPER +# undef VZEROUPPER_RETURN +# undef VZEROUPPER + +# define COND_VZEROUPPER +# define VZEROUPPER_RETURN ret # define VZEROUPPER -# define BASE_OFFSET (VEC_SIZE * 4) -# define RET_SCALE CHAR_SIZE + +# define USE_TERN_IN_LOOP 0 # else +# define USE_TERN_IN_LOOP 1 +# undef VZEROUPPER # define VZEROUPPER vzeroupper -# define BASE_OFFSET 0 -# define RET_SCALE 1 # endif - /* In the return from 4x loop memchr and rawmemchr versions have - data pointers off by VEC_SIZE * 4 with memchr version being - VEC_SIZE * 4 greater. */ -# ifdef USE_AS_RAWMEMCHR -# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4)) -# define RAW_PTR_REG rcx -# define ALGN_PTR_REG rdi +# if USE_TERN_IN_LOOP + /* Resulting bitmask for vpmovmskb has 4-bits set for each wchar + so we don't want to multiply resulting index. */ +# define TERN_CHAR_MULT 1 + +# ifdef USE_AS_WMEMCHR +# define TEST_END() inc %VRCX +# else +# define TEST_END() add %rdx, %rcx +# endif # else -# define RET_OFFSET BASE_OFFSET -# define RAW_PTR_REG rdi -# define ALGN_PTR_REG rcx +# define TERN_CHAR_MULT CHAR_SIZE +# define TEST_END() KORTEST %k2, %k3 # endif -# define XMMZERO xmm23 -# define YMMZERO ymm23 -# define XMMMATCH xmm16 -# define YMMMATCH ymm16 -# define YMM1 ymm17 -# define YMM2 ymm18 -# define YMM3 ymm19 -# define YMM4 ymm20 -# define YMM5 ymm21 -# define YMM6 ymm22 +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP +# ifndef USE_AS_WMEMCHR +# define GPR_X0_IS_RET 1 +# else +# define GPR_X0_IS_RET 0 +# endif +# define GPR_X0 rax +# else +# define GPR_X0_IS_RET 0 +# define GPR_X0 rdx +# endif + +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) -# ifndef SECTION -# define SECTION(p) p##.evex +# if CHAR_PER_VEC == 64 +# define LAST_VEC_OFFSET (VEC_SIZE * 3) +# else +# define LAST_VEC_OFFSET (VEC_SIZE * 2) +# endif +# if CHAR_PER_VEC >= 32 +# define MASK_GPR(...) 
VGPR(__VA_ARGS__) +# elif CHAR_PER_VEC == 16 +# define MASK_GPR(reg) VGPR_SZ(reg, 16) +# else +# define MASK_GPR(reg) VGPR_SZ(reg, 8) # endif -# define VEC_SIZE 32 -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) -# define PAGE_SIZE 4096 +# define VMATCH VMM(0) +# define VMATCH_LO VMM_lo(0) - .section SECTION(.text),"ax",@progbits +# define PAGE_SIZE 4096 + + + .section SECTION(.text), "ax", @progbits ENTRY_P2ALIGN (MEMCHR, 6) -# ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP - jz L(zero) + jz L(zero_0) -# ifdef __ILP32__ +# ifdef __ILP32__ /* Clear the upper 32 bits. */ movl %edx, %edx -# endif # endif - /* Broadcast CHAR to YMMMATCH. */ - VPBROADCAST %esi, %YMMMATCH + VPBROADCAST %esi, %VMATCH /* Check if we may cross page boundary with one vector load. */ movl %edi, %eax andl $(PAGE_SIZE - 1), %eax cmpl $(PAGE_SIZE - VEC_SIZE), %eax - ja L(cross_page_boundary) + ja L(page_cross) + + VPCMPEQ (%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX +# ifndef USE_AS_WMEMCHR + /* If rcx is zero then tzcnt -> CHAR_PER_VEC. NB: there is a + already a dependency between rcx and rsi so no worries about + false-dep here. */ + tzcnt %VRAX, %VRSI + /* If rdx <= rsi then either 1) rcx was non-zero (there was a + match) but it was out of bounds or 2) rcx was zero and rdx + was <= VEC_SIZE so we are done scanning. */ + cmpq %rsi, %rdx + /* NB: Use branch to return zero/non-zero. Common usage will + branch on result of function (if return is null/non-null). + This branch can be used to predict the ensuing one so there + is no reason to extend the data-dependency with cmovcc. */ + jbe L(zero_0) + + /* If rcx is zero then len must be > RDX, otherwise since we + already tested len vs lzcnt(rcx) (in rsi) we are good to + return this match. */ + test %VRAX, %VRAX + jz L(more_1x_vec) + leaq (%rdi, %rsi), %rax +# else - /* Check the first VEC_SIZE bytes. */ - VPCMP $0, (%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax -# ifndef USE_AS_RAWMEMCHR - /* If length < CHAR_PER_VEC handle special. */ + /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE + > 1 so if rcx is tzcnt != CHAR_PER_VEC. */ cmpq $CHAR_PER_VEC, %rdx - jbe L(first_vec_x0) -# endif - testl %eax, %eax - jz L(aligned_more) - tzcntl %eax, %eax -# ifdef USE_AS_WMEMCHR - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + ja L(more_1x_vec) + tzcnt %VRAX, %VRAX + cmpl %eax, %edx + jbe L(zero_0) +L(first_vec_x0_ret): leaq (%rdi, %rax, CHAR_SIZE), %rax -# else - addq %rdi, %rax # endif ret -# ifndef USE_AS_RAWMEMCHR -L(zero): - xorl %eax, %eax - ret - - .p2align 4 -L(first_vec_x0): - /* Check if first match was before length. NB: tzcnt has false data- - dependency on destination. eax already had a data-dependency on esi - so this should have no affect here. */ - tzcntl %eax, %esi -# ifdef USE_AS_WMEMCHR - leaq (%rdi, %rsi, CHAR_SIZE), %rdi -# else - addq %rsi, %rdi -# endif + /* Only fits in first cache line for VEC_SIZE == 32. */ +# if VEC_SIZE == 32 + .p2align 4,, 2 +L(zero_0): xorl %eax, %eax - cmpl %esi, %edx - cmovg %rdi, %rax ret # endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi - for rawmemchr. 
*/ - andq $-VEC_SIZE, %ALGN_PTR_REG - VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 - kmovd %k0, %r8d + .p2align 4,, 9 +L(more_1x_vec): # ifdef USE_AS_WMEMCHR - /* NB: Divide shift count by 4 since each bit in K0 represent 4 - bytes. */ - sarl $2, %eax -# endif -# ifndef USE_AS_RAWMEMCHR - movl $(PAGE_SIZE / CHAR_SIZE), %esi - subl %eax, %esi + /* If wmemchr still need to test if there was a match in first + VEC. Use bsf to test here so we can reuse + L(first_vec_x0_ret). */ + bsf %VRAX, %VRAX + jnz L(first_vec_x0_ret) # endif + +L(page_cross_continue): # ifdef USE_AS_WMEMCHR - andl $(CHAR_PER_VEC - 1), %eax -# endif - /* Remove the leading bytes. */ - sarxl %eax, %r8d, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + /* We can't use end of the buffer to re-calculate length for + wmemchr as len * CHAR_SIZE may overflow. */ + leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax + sarq $2, %rax + addq %rdx, %rax +# else + leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax + + /* rax contains remaining length - 1. -1 so we can get imm8 + encoding in a few additional places saving code size. */ + + /* Needed regardless of remaining length. */ + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 + KMOV %k0, %VRDX + + /* We cannot fold the above `sub %rdi, %rax` with the `cmp + $(CHAR_PER_VEC * 2), %rax` because its possible for a very + large length to overflow and cause the subtract to carry + despite length being above CHAR_PER_VEC * 2. */ + cmpq $(CHAR_PER_VEC * 2 - 1), %rax + ja L(more_2x_vec) +L(last_2x_vec): + + test %VRDX, %VRDX + jnz L(first_vec_x1_check) + + /* Check the end of data. NB: use 8-bit operations to save code + size. We no longer need the full-width of eax and will + perform a write-only operation over eax so there will be no + partial-register stalls. */ + subb $(CHAR_PER_VEC * 1 - 1), %al + jle L(zero_0) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX # ifdef USE_AS_WMEMCHR - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax + /* For wmemchr against we can't take advantage of tzcnt(0) == + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */ + test %VRCX, %VRCX + jz L(zero_0) +# endif + tzcnt %VRCX, %VRCX + cmp %cl, %al + + /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32. We give + fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is + not enough space before the next cache line to fit the `lea` + for return. */ +# if VEC_SIZE == 64 + ja L(first_vec_x2_ret) +L(zero_0): + xorl %eax, %eax + ret # else - addq %RAW_PTR_REG, %rax + jbe L(zero_0) + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax + ret # endif + + .p2align 4,, 5 +L(first_vec_x1_check): + bsf %VRDX, %VRDX + cmpb %dl, %al + jb L(zero_4) + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax ret - .p2align 4 -L(first_vec_x1): - tzcntl %eax, %eax - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax + /* Fits at the end of the cache line here for VEC_SIZE == 32. 
+ */ +# if VEC_SIZE == 32 +L(zero_4): + xorl %eax, %eax ret +# endif - .p2align 4 + + .p2align 4,, 4 L(first_vec_x2): - tzcntl %eax, %eax - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + bsf %VRCX, %VRCX +L(first_vec_x2_ret): + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax ret - .p2align 4 -L(first_vec_x3): - tzcntl %eax, %eax - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + /* Fits at the end of the cache line here for VEC_SIZE == 64. + */ +# if VEC_SIZE == 64 +L(zero_4): + xorl %eax, %eax ret +# endif - .p2align 4 -L(first_vec_x4): - tzcntl %eax, %eax - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + .p2align 4,, 4 +L(first_vec_x1): + bsf %VRDX, %VRDX + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax ret - .p2align 5 -L(aligned_more): - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ -# ifndef USE_AS_RAWMEMCHR - /* Align data to VEC_SIZE. */ -L(cross_page_continue): - xorl %ecx, %ecx - subl %edi, %ecx - andq $-VEC_SIZE, %rdi - /* esi is for adjusting length to see if near the end. */ - leal (VEC_SIZE * 5)(%rdi, %rcx), %esi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %esi -# endif -# else - andq $-VEC_SIZE, %rdi -L(cross_page_continue): -# endif - /* Load first VEC regardless. */ - VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax -# ifndef USE_AS_RAWMEMCHR - /* Adjust length. If near end handle specially. */ - subq %rsi, %rdx - jbe L(last_4x_vec_or_less) -# endif - testl %eax, %eax + .p2align 4,, 5 +L(more_2x_vec): + /* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking + length. */ + + + /* Already computed matches for first VEC in rdx. */ + test %VRDX, %VRDX jnz L(first_vec_x1) - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - testl %eax, %eax + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX jnz L(first_vec_x2) - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - testl %eax, %eax + /* Needed regardless of next length check. */ + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX + + /* Check if we are near the end. */ + cmpq $(CHAR_PER_VEC * 4 - 1), %rax + ja L(more_4x_vec) + + test %VRCX, %VRCX + jnz L(first_vec_x3_check) + + /* Use 8-bit instructions to save code size. We won't use full- + width eax again and will perform a write-only operation to + eax so no worries about partial-register stalls. */ + subb $(CHAR_PER_VEC * 3), %al + jb L(zero_2) +L(last_vec_check): + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX +# ifdef USE_AS_WMEMCHR + /* For wmemchr against we can't take advantage of tzcnt(0) == + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */ + test %VRCX, %VRCX + jz L(zero_2) +# endif + tzcnt %VRCX, %VRCX + cmp %cl, %al + jae L(first_vec_x4_ret) +L(zero_2): + xorl %eax, %eax + ret + + /* Fits at the end of the cache line here for VEC_SIZE == 64. + For VEC_SIZE == 32 we put the return label at the end of + L(first_vec_x4). */ +# if VEC_SIZE == 64 +L(first_vec_x4_ret): + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax + ret +# endif + + .p2align 4,, 6 +L(first_vec_x4): + bsf %VRCX, %VRCX +# if VEC_SIZE == 32 + /* Place L(first_vec_x4_ret) here as we can't fit it in the same + cache line as where it is called from so we might as well + save code size by reusing return of L(first_vec_x4). 
*/ +L(first_vec_x4_ret): +# endif + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax + ret + + .p2align 4,, 6 +L(first_vec_x3_check): + /* Need to adjust remaining length before checking. */ + addb $-(CHAR_PER_VEC * 2), %al + bsf %VRCX, %VRCX + cmpb %cl, %al + jb L(zero_2) + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax + ret + + .p2align 4,, 6 +L(first_vec_x3): + bsf %VRCX, %VRCX + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax + ret + + .p2align 4,, 3 +# if !USE_TERN_IN_LOOP + .p2align 4,, 10 +# endif +L(more_4x_vec): + test %VRCX, %VRCX jnz L(first_vec_x3) - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - testl %eax, %eax + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX jnz L(first_vec_x4) + subq $-(VEC_SIZE * 5), %rdi + subq $(CHAR_PER_VEC * 8), %rax + jb L(last_4x_vec) -# ifndef USE_AS_RAWMEMCHR - /* Check if at last CHAR_PER_VEC * 4 length. */ - subq $(CHAR_PER_VEC * 4), %rdx - jbe L(last_4x_vec_or_less_cmpeq) - /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */ - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi - - /* Align data to VEC_SIZE * 4 for the loop and readjust length. - */ -# ifdef USE_AS_WMEMCHR +# ifdef USE_AS_WMEMCHR movl %edi, %ecx - andq $-(4 * VEC_SIZE), %rdi +# else + addq %rdi, %rax +# endif + + +# if VEC_SIZE == 64 + /* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`. No evex + processor has partial register stalls (all have merging + uop). If that changes this can be removed. */ + xorb %dil, %dil +# else + andq $-(VEC_SIZE * 4), %rdi +# endif + +# ifdef USE_AS_WMEMCHR subl %edi, %ecx - /* NB: Divide bytes by 4 to get the wchar_t count. */ sarl $2, %ecx - addq %rcx, %rdx -# else - addq %rdi, %rdx - andq $-(4 * VEC_SIZE), %rdi - subq %rdi, %rdx -# endif + addq %rcx, %rax # else - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi - andq $-(4 * VEC_SIZE), %rdi + subq %rdi, %rax # endif -# ifdef USE_IN_RTM - vpxorq %XMMZERO, %XMMZERO, %XMMZERO -# else - /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not - encodable with EVEX registers (ymm16-ymm31). */ - vmovdqa64 %YMMMATCH, %ymm0 + + + +# if USE_TERN_IN_LOOP + /* copy VMATCH to low ymm so we can use vpcmpeq which is not + encodable with EVEX registers. NB: this is VEC_SIZE == 32 + only as there is no way to encode vpcmpeq with zmm0-15. */ + vmovdqa64 %VMATCH, %VMATCH_LO # endif - /* Compare 4 * VEC at a time forward. */ - .p2align 4 + .p2align 4,, 11 L(loop_4x_vec): - /* Two versions of the loop. One that does not require - vzeroupper by not using ymm0-ymm15 and another does that require - vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15 - is used at all is because there is no EVEX encoding vpcmpeq and - with vpcmpeq this loop can be performed more efficiently. The - non-vzeroupper version is safe for RTM while the vzeroupper - version should be prefered if RTM are not supported. */ -# ifdef USE_IN_RTM - /* It would be possible to save some instructions using 4x VPCMP - but bottleneck on port 5 makes it not woth it. */ - VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 - /* xor will set bytes match esi to zero. */ - vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 - vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 - VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 - /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ - VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} - VPCMP $0, %YMM3, %YMMZERO, %k2 -# else + /* Two versions of the loop. 
One that does not require + vzeroupper by not using ymmm0-15 and another does that + require vzeroupper because it uses ymmm0-15. The reason why + ymm0-15 is used at all is because there is no EVEX encoding + vpcmpeq and with vpcmpeq this loop can be performed more + efficiently. The non-vzeroupper version is safe for RTM + while the vzeroupper version should be prefered if RTM are + not supported. Which loop version we use is determined by + USE_TERN_IN_LOOP. */ + +# if USE_TERN_IN_LOOP /* Since vptern can only take 3x vectors fastest to do 1 vec seperately with EVEX vpcmp. */ # ifdef USE_AS_WMEMCHR /* vptern can only accept masks for epi32/epi64 so can only save - instruction using not equals mask on vptern with wmemchr. */ - VPCMP $4, (%rdi), %YMMMATCH, %k1 + instruction using not equals mask on vptern with wmemchr. + */ + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 # else - VPCMP $0, (%rdi), %YMMMATCH, %k1 + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 # endif /* Compare 3x with vpcmpeq and or them all together with vptern. */ - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2) + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) # ifdef USE_AS_WMEMCHR - /* This takes the not of or between ymm2, ymm3, ymm4 as well as - combines result from VEC0 with zero mask. */ - vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z} - vpmovmskb %ymm4, %ecx + /* This takes the not of or between VEC_lo(2), VEC_lo(3), + VEC_lo(4) as well as combines result from VEC(0) with zero + mask. */ + vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z} + vpmovmskb %VMM_lo(4), %VRCX # else - /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */ - vpternlogd $254, %ymm2, %ymm3, %ymm4 - vpmovmskb %ymm4, %ecx - kmovd %k1, %eax + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into + VEC_lo(4). */ + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) + vpmovmskb %VMM_lo(4), %VRCX + KMOV %k1, %edx # endif -# endif -# ifdef USE_AS_RAWMEMCHR - subq $-(VEC_SIZE * 4), %rdi -# endif -# ifdef USE_IN_RTM - kortestd %k2, %k3 # else -# ifdef USE_AS_WMEMCHR - /* ecx contains not of matches. All 1s means no matches. incl will - overflow and set zeroflag if that is the case. */ - incl %ecx -# else - /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding - to ecx is not an issue because if eax is non-zero it will be - used for returning the match. If it is zero the add does - nothing. */ - addq %rax, %rcx -# endif + /* Loop version that uses EVEX encoding. */ + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 + vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2) + vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3) + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3 + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} + VPTESTN %VMM(3), %VMM(3), %k2 # endif -# ifdef USE_AS_RAWMEMCHR - jz L(loop_4x_vec) -# else - jnz L(loop_4x_vec_end) + + + TEST_END () + jnz L(loop_vec_ret) subq $-(VEC_SIZE * 4), %rdi - subq $(CHAR_PER_VEC * 4), %rdx - ja L(loop_4x_vec) + subq $(CHAR_PER_VEC * 4), %rax + jae L(loop_4x_vec) - /* Fall through into less than 4 remaining vectors of length case. + /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop. */ - VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0 - addq $(BASE_OFFSET - VEC_SIZE), %rdi - kmovd %k0, %eax - VZEROUPPER - -L(last_4x_vec_or_less): - /* Check if first VEC contained match. 
*/ - testl %eax, %eax - jnz L(first_vec_x1_check) + COND_VZEROUPPER - /* If remaining length > CHAR_PER_VEC * 2. */ - addl $(CHAR_PER_VEC * 2), %edx - jg L(last_4x_vec) - -L(last_2x_vec): - /* If remaining length < CHAR_PER_VEC. */ - addl $CHAR_PER_VEC, %edx - jle L(zero_end) - - /* Check VEC2 and compare any match with remaining length. */ - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax - cmpl %eax, %edx - jbe L(set_zero_end) - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax -L(zero_end): - ret + .p2align 4,, 10 +L(last_4x_vec): + /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit + instructions on eax from here on out. */ +# if CHAR_PER_VEC != 64 + andl $(CHAR_PER_VEC * 4 - 1), %eax +# endif + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0 + subq $(VEC_SIZE * 1), %rdi + KMOV %k0, %VRDX + cmpb $(CHAR_PER_VEC * 2 - 1), %al + jbe L(last_2x_vec) + test %VRDX, %VRDX + jnz L(last_vec_x1_novzero) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_x2_novzero) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(first_vec_x3_check) + + subb $(CHAR_PER_VEC * 3), %al + jae L(last_vec_check) -L(set_zero_end): xorl %eax, %eax ret - .p2align 4 -L(first_vec_x1_check): - /* eax must be non-zero. Use bsfl to save code size. */ - bsfl %eax, %eax - /* Adjust length. */ - subl $-(CHAR_PER_VEC * 4), %edx - /* Check if match within remaining length. */ - cmpl %eax, %edx - jbe L(set_zero_end) - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax +# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP +L(last_vec_x2_novzero): + addq $VEC_SIZE, %rdi +L(last_vec_x1_novzero): + bsf %VRDX, %VRDX + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax ret +# endif - .p2align 4 -L(loop_4x_vec_end): +# if CHAR_PER_VEC == 64 + /* Since we can't combine the last 2x VEC when CHAR_PER_VEC == + 64 it needs a seperate return label. */ + .p2align 4,, 4 +L(last_vec_x2): +L(last_vec_x2_novzero): + bsf %VRDX, %VRDX + leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax + ret # endif - /* rawmemchr will fall through into this if match was found in - loop. */ -# if defined USE_IN_RTM || defined USE_AS_WMEMCHR - /* k1 has not of matches with VEC1. */ - kmovd %k1, %eax -# ifdef USE_AS_WMEMCHR - subl $((1 << CHAR_PER_VEC) - 1), %eax -# else - incl %eax -# endif + .p2align 4,, 4 +L(loop_vec_ret): +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP + KMOV %k1, %VRAX + inc %MASK_GPR(rax) # else - /* eax already has matches for VEC1. */ - testl %eax, %eax + test %VRDX, %VRDX # endif - jnz L(last_vec_x1_return) + jnz L(last_vec_x0) -# ifdef USE_IN_RTM - VPCMP $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %eax + +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(2), %VRDX # else - vpmovmskb %ymm2, %eax + VPTESTN %VMM(2), %VMM(2), %k1 + KMOV %k1, %VRDX # endif - testl %eax, %eax - jnz L(last_vec_x2_return) + test %VRDX, %VRDX + jnz L(last_vec_x1) -# ifdef USE_IN_RTM - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x3_return) - kmovd %k3, %eax - tzcntl %eax, %eax - leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(3), %VRDX # else - vpmovmskb %ymm3, %eax - /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). 
*/ - salq $VEC_SIZE, %rcx - orq %rcx, %rax - tzcntq %rax, %rax - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax - VZEROUPPER + KMOV %k2, %VRDX # endif - ret - .p2align 4,, 10 -L(last_vec_x1_return): - tzcntl %eax, %eax -# if defined USE_AS_WMEMCHR || RET_OFFSET != 0 - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper + (only if used VEX encoded loop). */ + COND_VZEROUPPER + + /* Seperate logic for CHAR_PER_VEC == 64 vs the rest. For + CHAR_PER_VEC we test the last 2x VEC seperately, for + CHAR_PER_VEC <= 32 we can combine the results from the 2x + VEC in a single GPR. */ +# if CHAR_PER_VEC == 64 +# if USE_TERN_IN_LOOP +# error "Unsupported" +# endif + + + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ + test %VRDX, %VRDX + jnz L(last_vec_x2) + KMOV %k3, %VRDX # else - addq %rdi, %rax + /* CHAR_PER_VEC <= 32 so we can combine the results from the + last 2x VEC. */ + +# if !USE_TERN_IN_LOOP + KMOV %k3, %VRCX +# endif + salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx + addq %rcx, %rdx +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP +L(last_vec_x2_novzero): +# endif # endif - VZEROUPPER + bsf %rdx, %rdx + leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax ret - .p2align 4 -L(last_vec_x2_return): - tzcntl %eax, %eax - /* NB: Multiply bytes by RET_SCALE to get the wchar_t count - if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and - USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */ - leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax - VZEROUPPER + .p2align 4,, 8 +L(last_vec_x1): + COND_VZEROUPPER +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP +L(last_vec_x1_novzero): +# endif + bsf %VRDX, %VRDX + leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax ret -# ifdef USE_IN_RTM - .p2align 4 -L(last_vec_x3_return): - tzcntl %eax, %eax - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax + + .p2align 4,, 4 +L(last_vec_x0): + COND_VZEROUPPER + bsf %VGPR(GPR_X0), %VGPR(GPR_X0) +# if GPR_X0_IS_RET + addq %rdi, %rax +# else + leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax +# endif ret + + .p2align 4,, 6 +L(page_cross): + /* Need to preserve eax to compute inbound bytes we are + checking. */ +# ifdef USE_AS_WMEMCHR + movl %eax, %ecx +# else + xorl %ecx, %ecx + subl %eax, %ecx # endif -# ifndef USE_AS_RAWMEMCHR - .p2align 4,, 5 -L(last_4x_vec_or_less_cmpeq): - VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - subq $-(VEC_SIZE * 4), %rdi - /* Check first VEC regardless. */ - testl %eax, %eax - jnz L(first_vec_x1_check) + xorq %rdi, %rax + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 + KMOV %k0, %VRAX - /* If remaining length <= CHAR_PER_VEC * 2. */ - addl $(CHAR_PER_VEC * 2), %edx - jle L(last_2x_vec) +# ifdef USE_AS_WMEMCHR + /* NB: Divide by CHAR_SIZE to shift out out of bounds bytes. */ + shrl $2, %ecx + andl $(CHAR_PER_VEC - 1), %ecx +# endif - .p2align 4 -L(last_4x_vec): - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(last_vec_x2) + shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - /* Create mask for possible matches within remaining length. */ -# ifdef USE_AS_WMEMCHR - movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx - bzhil %edx, %ecx, %ecx -# else - movq $-1, %rcx - bzhiq %rdx, %rcx, %rcx -# endif - /* Test matches in data against length match. 
*/ - andl %ecx, %eax - jnz L(last_vec_x3) +# ifdef USE_AS_WMEMCHR + negl %ecx +# endif - /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after - remaining length was found to be > CHAR_PER_VEC * 2. */ - subl $CHAR_PER_VEC, %edx - jbe L(zero_end2) + /* mask lower bits from ecx (negative eax) to get bytes till + next VEC. */ + andl $(CHAR_PER_VEC - 1), %ecx + /* Check if VEC is entirely contained in the remainder of the + page. */ + cmpq %rcx, %rdx + jbe L(page_cross_ret) - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - /* Shift remaining length mask for last VEC. */ -# ifdef USE_AS_WMEMCHR - shrl $CHAR_PER_VEC, %ecx -# else - shrq $CHAR_PER_VEC, %rcx -# endif - andl %ecx, %eax - jz L(zero_end2) - bsfl %eax, %eax - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax -L(zero_end2): - ret + /* Length crosses the page so if rax is zero (no matches) + continue. */ + test %VRAX, %VRAX + jz L(page_cross_continue) -L(last_vec_x2): - tzcntl %eax, %eax - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + /* if rdx > rcx then any match here must be in [buf:buf + len]. + */ + tzcnt %VRAX, %VRAX +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif ret - .p2align 4 -L(last_vec_x3): - tzcntl %eax, %eax - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + .p2align 4,, 2 +L(page_cross_zero): + xorl %eax, %eax ret + + .p2align 4,, 4 +L(page_cross_ret): + /* Search is entirely contained in page cross case. */ +# ifdef USE_AS_WMEMCHR + test %VRAX, %VRAX + jz L(page_cross_zero) +# endif + tzcnt %VRAX, %VRAX + cmpl %eax, %edx + jbe L(page_cross_zero) +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax # endif - /* 7 bytes from next cache line. */ + ret END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S index deda1ca395..2073eaa620 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S @@ -1,3 +1,6 @@ -#define MEMCHR __rawmemchr_evex_rtm -#define USE_AS_RAWMEMCHR 1 -#include "memchr-evex-rtm.S" +#define RAWMEMCHR __rawmemchr_evex_rtm + +#define USE_IN_RTM 1 +#define SECTION(p) p##.evex.rtm + +#include "rawmemchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S index dc1c450699..dad54def2b 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S @@ -1,7 +1,308 @@ -#ifndef RAWMEMCHR -# define RAWMEMCHR __rawmemchr_evex -#endif -#define USE_AS_RAWMEMCHR 1 -#define MEMCHR RAWMEMCHR +/* rawmemchr optimized with 256-bit EVEX instructions. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <isa-level.h> +#include <sysdep.h> + +#if ISA_SHOULD_BUILD (4) + +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + +# ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex +# endif + + +# define PC_SHIFT_GPR rdi +# define REG_WIDTH VEC_SIZE +# define VPTESTN vptestnmb +# define VPBROADCAST vpbroadcastb +# define VPMINU vpminub +# define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 + +# include "reg-macros.h" + +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 + doesn't have VEX encoding), use VEX encoding in loop so we + can use vpcmpeqb + vptern which is more efficient than the + EVEX alternative. */ +# if defined USE_IN_RTM || VEC_SIZE == 64 +# undef COND_VZEROUPPER +# undef VZEROUPPER_RETURN +# undef VZEROUPPER + + +# define COND_VZEROUPPER +# define VZEROUPPER_RETURN ret +# define VZEROUPPER + +# define USE_TERN_IN_LOOP 0 +# else +# define USE_TERN_IN_LOOP 1 +# undef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define CHAR_PER_VEC VEC_SIZE + +# if CHAR_PER_VEC == 64 + +# define TAIL_RETURN_LBL first_vec_x2 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) + +# define FALLTHROUGH_RETURN_LBL first_vec_x3 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# else /* !(CHAR_PER_VEC == 64) */ + +# define TAIL_RETURN_LBL first_vec_x3 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# define FALLTHROUGH_RETURN_LBL first_vec_x2 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) +# endif /* !(CHAR_PER_VEC == 64) */ + + +# define VMATCH VMM(0) +# define VMATCH_LO VMM_lo(0) + +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN (RAWMEMCHR, 6) + VPBROADCAST %esi, %VMATCH + /* Check if we may cross page boundary with one vector load. */ + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + VPCMPEQ (%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + + test %VRAX, %VRAX + jz L(aligned_more) +L(first_vec_x0): + bsf %VRAX, %VRAX + addq %rdi, %rax + ret + + .p2align 4,, 4 +L(first_vec_x4): + bsf %VRAX, %VRAX + leaq (VEC_SIZE * 4)(%rdi, %rax), %rax + ret -#include "memchr-evex.S" + /* For VEC_SIZE == 32 we can fit this in aligning bytes so might + as well place it more locally. For VEC_SIZE == 64 we reuse + return code at the end of loop's return. */ +# if VEC_SIZE == 32 + .p2align 4,, 4 +L(FALLTHROUGH_RETURN_LBL): + bsf %VRAX, %VRAX + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax + ret +# endif + + .p2align 4,, 6 +L(page_cross): + /* eax has lower page-offset bits of rdi so xor will zero them + out. */ + xorq %rdi, %rax + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 + KMOV %k0, %VRAX + + /* Shift out out-of-bounds matches. */ + shrx %VRDI, %VRAX, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x0) + + .p2align 4,, 10 +L(aligned_more): +L(page_cross_continue): + /* Align pointer. */ + andq $(VEC_SIZE * -1), %rdi + + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x2) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x4) + + subq $-(VEC_SIZE * 1), %rdi +# if VEC_SIZE == 64 + /* Saves code size. No evex512 processor has partial register + stalls. If that change this can be replaced with `andq + $-(VEC_SIZE * 4), %rdi`. 
*/ + xorb %dil, %dil +# else + andq $-(VEC_SIZE * 4), %rdi +# endif + +# if USE_TERN_IN_LOOP + /* copy VMATCH to low ymm so we can use vpcmpeq which is not + encodable with EVEX registers. NB: this is VEC_SIZE == 32 + only as there is no way to encode vpcmpeq with zmm0-15. */ + vmovdqa64 %VMATCH, %VMATCH_LO +# endif + + .p2align 4 +L(loop_4x_vec): + /* Two versions of the loop. One that does not require + vzeroupper by not using ymm0-15 and another does that + require vzeroupper because it uses ymm0-15. The reason why + ymm0-15 is used at all is because there is no EVEX encoding + vpcmpeq and with vpcmpeq this loop can be performed more + efficiently. The non-vzeroupper version is safe for RTM + while the vzeroupper version should be prefered if RTM are + not supported. Which loop version we use is determined by + USE_TERN_IN_LOOP. */ + +# if USE_TERN_IN_LOOP + /* Since vptern can only take 3x vectors fastest to do 1 vec + seperately with EVEX vpcmp. */ + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 + /* Compare 3x with vpcmpeq and or them all together with vptern. + */ + + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2) + subq $(VEC_SIZE * -4), %rdi + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) + + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into + VEC_lo(4). */ + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) + vpmovmskb %VMM_lo(4), %VRCX + + KMOV %k1, %eax + + /* NB: rax has match from first VEC and rcx has matches from + VEC 2-4. If rax is non-zero we will return that match. If + rax is zero adding won't disturb the bits in rcx. */ + add %rax, %rcx +# else + /* Loop version that uses EVEX encoding. */ + VPCMP $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 + vpxorq (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2) + vpxorq (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3) + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3 + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} + VPTESTN %VMM(3), %VMM(3), %k2 + subq $(VEC_SIZE * -4), %rdi + KORTEST %k2, %k3 +# endif + jz L(loop_4x_vec) + +# if USE_TERN_IN_LOOP + test %VRAX, %VRAX +# else + KMOV %k1, %VRAX + inc %VRAX +# endif + jnz L(last_vec_x0) + + +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(2), %VRAX +# else + VPTESTN %VMM(2), %VMM(2), %k1 + KMOV %k1, %VRAX +# endif + test %VRAX, %VRAX + jnz L(last_vec_x1) + + +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(3), %VRAX +# else + KMOV %k2, %VRAX +# endif + + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper + (only if used VEX encoded loop). */ + COND_VZEROUPPER + + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for + returning last 2x VEC. For VEC_SIZE == 64 we test each VEC + individually, for VEC_SIZE == 32 we combine them in a single + 64-bit GPR. */ +# if CHAR_PER_VEC == 64 +# if USE_TERN_IN_LOOP +# error "Unsupported" +# endif + + + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ + test %VRAX, %VRAX + jnz L(first_vec_x2) + KMOV %k3, %VRAX +L(FALLTHROUGH_RETURN_LBL): +# else + /* CHAR_PER_VEC <= 32 so we can combine the results from the + last 2x VEC. 
*/ +# if !USE_TERN_IN_LOOP + KMOV %k3, %VRCX +# endif + salq $CHAR_PER_VEC, %rcx + addq %rcx, %rax +# endif + bsf %rax, %rax + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax + ret + + .p2align 4,, 8 +L(TAIL_RETURN_LBL): + bsf %rax, %rax + leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax + ret + + .p2align 4,, 8 +L(last_vec_x1): + COND_VZEROUPPER +L(first_vec_x1): + bsf %VRAX, %VRAX + leaq (VEC_SIZE * 1)(%rdi, %rax), %rax + ret + + .p2align 4,, 8 +L(last_vec_x0): + COND_VZEROUPPER + bsf %VRAX, %VRAX + addq %rdi, %rax + ret +END (RAWMEMCHR) +#endif
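
A note on the memchr page-cross return path (L(page_cross_ret)) in the hunk above: for the byte variants a single `cmpl %eax, %edx; jbe` covers both "no match" and "match past the end", because tzcnt of a zero mask yields the operand width, which always compares greater than or equal to the remaining length on that path (wmemchr keeps an explicit zero test instead). A minimal scalar sketch of that check, assuming GCC-style builtins and a hypothetical match_mask parameter standing in for the KMOV'd compare result:

#include <stdint.h>
#include <stddef.h>

/* Scalar sketch of L(page_cross_ret) for the byte variants: a zero
   mask is modeled as tzcnt returning the operand width (32 here),
   which is never below the remaining length on this path, so one
   comparison handles "no match" and "match out of bounds" at once.  */
static const void *
page_cross_result (const char *p, uint32_t match_mask, size_t len)
{
  unsigned idx = match_mask ? (unsigned) __builtin_ctz (match_mask) : 32;
  if (idx >= len)
    return NULL;
  return p + idx;
}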
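The `salq`/`addq`/`bsf` sequence that both tails use when CHAR_PER_VEC <= 32 packs the match masks of the last two vectors into one 64-bit value so a single bit scan finds the earliest match in either. A scalar sketch of the idea, modeling the k-register (non-ternlog) case with hypothetical names: mask_lo/mask_hi stand in for the two KMOV results and base already points at the first of the two vectors (rdi plus the LAST_VEC/FALLTHROUGH_RETURN offset):

#include <stdint.h>
#include <stddef.h>

/* Sketch of the CHAR_PER_VEC <= 32 tail: the later vector's mask is
   shifted up by CHAR_PER_VEC and added (the bit ranges are disjoint,
   so add == or), then one bit scan covers both vectors at once.  */
static const char *
last_two_vec (const char *base, uint32_t mask_lo, uint32_t mask_hi,
	      unsigned char_per_vec)
{
  uint64_t combined = (uint64_t) mask_lo
		      + ((uint64_t) mask_hi << char_per_vec);
  if (combined == 0)
    return NULL;	/* the assembly never reaches its tail with an
			   empty combined mask; this is only defensive.  */
  /* __builtin_ctzll plays the role of bsf here.  */
  return base + __builtin_ctzll (combined);
}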
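For the rawmemchr entry path, the page-cross case loads the last VEC_SIZE bytes of the current page and then uses the pointer's own low bits as the shrx count to discard matches that fall before the start pointer. A byte-wise C sketch of that path, assuming VEC_SIZE == 32 and a hypothetical match_mask() helper standing in for VPCMPEQ + KMOV:

#include <stdint.h>
#include <stddef.h>

enum { VEC_SIZE = 32, PAGE_SIZE = 4096 };

/* Hypothetical stand-in for VPCMPEQ + KMOV: bit i is set when
   buf[i] == c.  */
static uint32_t
match_mask (const unsigned char *buf, unsigned char c)
{
  uint32_t m = 0;
  for (int i = 0; i < VEC_SIZE; i++)
    m |= (uint32_t) (buf[i] == c) << i;
  return m;
}

/* Sketch of L(page_cross): it is only reached when p sits in the last
   VEC_SIZE - 1 bytes of its page, so the last aligned vector of the
   page covers p, and because PAGE_SIZE - VEC_SIZE is itself a multiple
   of VEC_SIZE, shifting the mask right by p % VEC_SIZE lines bit 0 up
   with *p and drops the out-of-bounds (earlier) matches.  */
static const void *
page_cross_check (const unsigned char *p, unsigned char c)
{
  const unsigned char *page = (const unsigned char *)
    ((uintptr_t) p & ~(uintptr_t) (PAGE_SIZE - 1));
  uint32_t m = match_mask (page + PAGE_SIZE - VEC_SIZE, c);
  m >>= (uintptr_t) p % VEC_SIZE;	/* what shrx %rdi does above */
  if (m != 0)
    return p + __builtin_ctz (m);
  return NULL;		/* caller falls through to the aligned loop */
}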
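The `vpternlogd $254` in the VEX-encoded loop merges three vpcmpeqb results in one instruction: 254 (0xfe) is simply the 8-entry truth table of a three-input OR (and since OR is symmetric, the operand ordering of vpternlog does not matter here). A small standalone check of that immediate, in plain C with nothing glibc-specific:

#include <assert.h>
#include <stdio.h>

/* vpternlog computes, per bit, row ((a << 2) | (b << 1) | c) of its
   8-bit immediate.  Verify that immediate 254 (0xfe) reproduces
   a | b | c, which is how the loop folds three vpcmpeqb results into
   VEC_lo(4).  */
int
main (void)
{
  const unsigned imm = 254;
  for (unsigned a = 0; a <= 1; a++)
    for (unsigned b = 0; b <= 1; b++)
      for (unsigned c = 0; c <= 1; c++)
	{
	  unsigned ternlog_bit = (imm >> ((a << 2) | (b << 1) | c)) & 1;
	  assert (ternlog_bit == (a | b | c));
	}
  puts ("imm8 = 254 is a 3-input OR");
  return 0;
}

The same immediate-as-truth-table reading applies to any other vpternlog use: each of the 256 possible imm8 values selects one boolean function of the three source bits.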
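The EVEX-encoded loop variant gets by without four separate equal-compares: one not-equal compare (k1), two XORs that turn a match into a zero byte, a zero-masked VPMINU, and a VPTESTN leave just two mask registers for KORTEST to test. A per-lane scalar model of that dataflow, a sketch only, with hypothetical names (b4..b7 are the corresponding bytes of the four loaded vectors, c the broadcast character):

#include <stdint.h>
#include <stdbool.h>

/* Per-lane model of the EVEX loop body: k2 ends up set for a lane iff
   the target appeared in vector 4, 5, or 6 at that lane, and k3 covers
   vector 7, so KORTEST %k2, %k3 decides whether any of the four
   vectors contained the target.  */
static bool
lane_has_match (uint8_t b4, uint8_t b5, uint8_t b6, uint8_t b7, uint8_t c)
{
  bool k1 = (b4 != c);			/* VPCMP $4 (not-equal) */
  uint8_t x5 = b5 ^ c, x6 = b6 ^ c;	/* vpxorq: 0 on a match */
  /* VPMINU with {k1}{z}: lanes where b4 already matched are forced
     to zero, so one zero test covers all three vectors.  */
  uint8_t m = k1 ? (x5 < x6 ? x5 : x6) : 0;
  bool k2 = (m == 0);			/* VPTESTN */
  bool k3 = (b7 == c);			/* VPCMPEQ */
  return k2 || k3;			/* KORTEST %k2, %k3 */
}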