Message ID: 20221019004409.3623395-6-goldstein.w.n@gmail.com
State: New
Series: [v3,1/7] x86: Optimize memchr-evex.S and implement with VMM headers

On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Unused at the moment, but evex512 strcmp, strncmp, strcasecmp{l}, and
> strncasecmp{l} functions can be added by including strcmp-evex.S with
> "x86-evex512-vecs.h" defined.
>
> In addition save code size a bit in a few places.
>
> 1. tzcnt ...  -> bsf ...
> 2. vpcmp{b|d} $0 ... -> vpcmpeq{b|d}
>
> This saves a touch of code size but has minimal net affect.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strcmp-evex.S | 676 ++++++++++++++++---------
>  1 file changed, 430 insertions(+), 246 deletions(-)

LGTM. Thanks.
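
Both code-size tweaks called out in the commit message above are pure encoding savings. The snippet below is an illustration of the general principle, not code taken from the patch; the register choices are arbitrary, and it assumes the mask being scanned is known to be non-zero at the bsf sites (the patch keeps tzcnt at the one site where the mask may be zero).

	/* Illustration only -- not part of the patch.  tzcnt is encoded as
	   bsf plus an f3 prefix, and the two agree whenever the source is
	   non-zero.  */
	tzcnt	%ecx, %ecx		/* f3 0f bc c9 -> 4 bytes.  */
	bsf	%ecx, %ecx		/* 0f bc c9 -> 3 bytes.  */

	/* vpcmp{b|d} carries an imm8 predicate; predicate 0 ("equal") has a
	   dedicated opcode form (vpcmpeq{b|d}) that needs no immediate.  */
	vpcmpb	$0, %zmm17, %zmm16, %k1	/* evex + opcode + modrm + imm8 -> 7 bytes.  */
	vpcmpeqb %zmm17, %zmm16, %k1	/* evex + opcode + modrm -> 6 bytes.  */
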
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S index e482d0167f..756a3bb8d6 100644 --- a/sysdeps/x86_64/multiarch/strcmp-evex.S +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S @@ -20,6 +20,10 @@ #if ISA_SHOULD_BUILD (4) +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + # define STRCMP_ISA _evex # include "strcmp-naming.h" @@ -35,41 +39,57 @@ # define PAGE_SIZE 4096 /* VEC_SIZE = Number of bytes in a ymm register. */ -# define VEC_SIZE 32 # define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR) -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 - # ifdef USE_AS_WCSCMP -# define TESTEQ subl $0xff, /* Compare packed dwords. */ # define VPCMP vpcmpd +# define VPCMPEQ vpcmpeqd # define VPMINU vpminud # define VPTESTM vptestmd # define VPTESTNM vptestnmd /* 1 dword char == 4 bytes. */ # define SIZE_OF_CHAR 4 + +# define TESTEQ sub $((1 << CHAR_PER_VEC) - 1), + +# define USE_WIDE_CHAR # else -# define TESTEQ incl /* Compare packed bytes. */ # define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb # define VPMINU vpminub # define VPTESTM vptestmb # define VPTESTNM vptestnmb /* 1 byte char == 1 byte. */ # define SIZE_OF_CHAR 1 + +# define TESTEQ inc +# endif + +# include "reg-macros.h" + +# if VEC_SIZE == 64 +# define RODATA_SECTION rodata.cst64 +# else +# define RODATA_SECTION rodata.cst32 +# endif + +# if CHAR_PER_VEC == 64 +# define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 3) +# else +# define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 2) # endif # ifdef USE_AS_STRNCMP -# define LOOP_REG r9d +# define LOOP_REG VR9 # define LOOP_REG64 r9 # define OFFSET_REG8 r9b # define OFFSET_REG r9d # define OFFSET_REG64 r9 # else -# define LOOP_REG edx +# define LOOP_REG VRDX # define LOOP_REG64 rdx # define OFFSET_REG8 dl @@ -83,32 +103,6 @@ # define VEC_OFFSET (-VEC_SIZE) # endif -# define XMM0 xmm17 -# define XMM1 xmm18 - -# define XMM10 xmm27 -# define XMM11 xmm28 -# define XMM12 xmm29 -# define XMM13 xmm30 -# define XMM14 xmm31 - - -# define YMM0 ymm17 -# define YMM1 ymm18 -# define YMM2 ymm19 -# define YMM3 ymm20 -# define YMM4 ymm21 -# define YMM5 ymm22 -# define YMM6 ymm23 -# define YMM7 ymm24 -# define YMM8 ymm25 -# define YMM9 ymm26 -# define YMM10 ymm27 -# define YMM11 ymm28 -# define YMM12 ymm29 -# define YMM13 ymm30 -# define YMM14 ymm31 - # ifdef USE_AS_STRCASECMP_L # define BYTE_LOOP_REG OFFSET_REG # else @@ -125,61 +119,72 @@ # endif # endif -# define LCASE_MIN_YMM %YMM12 -# define LCASE_MAX_YMM %YMM13 -# define CASE_ADD_YMM %YMM14 +# define LCASE_MIN_V VMM(12) +# define LCASE_MAX_V VMM(13) +# define CASE_ADD_V VMM(14) -# define LCASE_MIN_XMM %XMM12 -# define LCASE_MAX_XMM %XMM13 -# define CASE_ADD_XMM %XMM14 +# if VEC_SIZE == 64 +# define LCASE_MIN_YMM VMM_256(12) +# define LCASE_MAX_YMM VMM_256(13) +# define CASE_ADD_YMM VMM_256(14) +# endif + +# define LCASE_MIN_XMM VMM_128(12) +# define LCASE_MAX_XMM VMM_128(13) +# define CASE_ADD_XMM VMM_128(14) /* NB: wcsncmp uses r11 but strcasecmp is never used in conjunction with wcscmp. 
*/ # define TOLOWER_BASE %r11 # ifdef USE_AS_STRCASECMP_L -# define _REG(x, y) x ## y -# define REG(x, y) _REG(x, y) -# define TOLOWER(reg1, reg2, ext) \ - vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \ - vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \ - vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \ - vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \ - vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \ - vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6} - -# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst -# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM) -# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM) - -# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \ - TOLOWER (s1_reg, s2_reg, ext); \ - VPCMP $0, s1_reg, s2_reg, reg_out - -# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \ - VMOVU s2_mem, s2_reg; \ - CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) - -# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM) -# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM) - -# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM) -# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM) +# define _REG(x, y) x ## y +# define REG(x, y) _REG(x, y) +# define TOLOWER(reg1, reg2, ext, vec_macro) \ + vpsubb %REG(LCASE_MIN_, ext), reg1, %vec_macro(10); \ + vpsubb %REG(LCASE_MIN_, ext), reg2, %vec_macro(11); \ + vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5; \ + vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6; \ + vpaddb reg1, %REG(CASE_ADD_, ext), reg1{%k5}; \ + vpaddb reg2, %REG(CASE_ADD_, ext), reg2{%k6} + +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst +# define TOLOWER_VMM(...) TOLOWER(__VA_ARGS__, V, VMM) +# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM, VMM_256) +# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM, VMM_128) + +# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro) \ + TOLOWER (s1_reg, s2_reg, ext, vec_macro); \ + VPCMPEQ s1_reg, s2_reg, reg_out + +# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro) \ + VMOVU s2_mem, s2_reg; \ + CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro) + +# define CMP_R1_R2_VMM(...) CMP_R1_R2(__VA_ARGS__, V, VMM) +# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM, VMM_256) +# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM, VMM_128) + +# define CMP_R1_S2_VMM(...) CMP_R1_S2(__VA_ARGS__, V, VMM) +# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM, VMM_256) +# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM, VMM_128) # else # define TOLOWER_gpr(...) +# define TOLOWER_VMM(...) # define TOLOWER_YMM(...) # define TOLOWER_XMM(...) -# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \ - VPCMP $0, s2_reg, s1_reg, reg_out +# define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out) \ + VPCMPEQ s2_reg, s1_reg, reg_out -# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__) +# define CMP_R1_R2_YMM(...) CMP_R1_R2_VMM(__VA_ARGS__) +# define CMP_R1_R2_XMM(...) CMP_R1_R2_VMM(__VA_ARGS__) -# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \ - VPCMP $0, s2_mem, s1_reg, reg_out - -# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__) +# define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out) \ + VPCMPEQ s2_mem, s1_reg, reg_out +# define CMP_R1_S2_YMM(...) CMP_R1_S2_VMM(__VA_ARGS__) +# define CMP_R1_S2_XMM(...) CMP_R1_S2_VMM(__VA_ARGS__) # endif /* Warning! @@ -203,7 +208,7 @@ the maximum offset is reached before a difference is found, zero is returned. 
*/ - .section .text.evex, "ax", @progbits + .section SECTION(.text), "ax", @progbits .align 16 .type STRCMP, @function .globl STRCMP @@ -232,7 +237,7 @@ STRCMP: # else mov (%LOCALE_REG), %RAX_LP # endif - testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) + testb $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) jne STRCASECMP_L_NONASCII leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE # endif @@ -254,28 +259,46 @@ STRCMP: # endif # if defined USE_AS_STRCASECMP_L - .section .rodata.cst32, "aM", @progbits, 32 - .align 32 + .section RODATA_SECTION, "aM", @progbits, VEC_SIZE + .align VEC_SIZE L(lcase_min): .quad 0x4141414141414141 .quad 0x4141414141414141 .quad 0x4141414141414141 .quad 0x4141414141414141 +# if VEC_SIZE == 64 + .quad 0x4141414141414141 + .quad 0x4141414141414141 + .quad 0x4141414141414141 + .quad 0x4141414141414141 +# endif L(lcase_max): .quad 0x1a1a1a1a1a1a1a1a .quad 0x1a1a1a1a1a1a1a1a .quad 0x1a1a1a1a1a1a1a1a .quad 0x1a1a1a1a1a1a1a1a +# if VEC_SIZE == 64 + .quad 0x1a1a1a1a1a1a1a1a + .quad 0x1a1a1a1a1a1a1a1a + .quad 0x1a1a1a1a1a1a1a1a + .quad 0x1a1a1a1a1a1a1a1a +# endif L(case_add): .quad 0x2020202020202020 .quad 0x2020202020202020 .quad 0x2020202020202020 .quad 0x2020202020202020 +# if VEC_SIZE == 64 + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .quad 0x2020202020202020 +# endif .previous - vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM - vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM - vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM + VMOVA L(lcase_min)(%rip), %LCASE_MIN_V + VMOVA L(lcase_max)(%rip), %LCASE_MAX_V + VMOVA L(case_add)(%rip), %CASE_ADD_V # endif movl %edi, %eax @@ -288,12 +311,12 @@ L(case_add): L(no_page_cross): /* Safe to compare 4x vectors. */ - VMOVU (%rdi), %YMM0 - VPTESTM %YMM0, %YMM0, %k2 + VMOVU (%rdi), %VMM(0) + VPTESTM %VMM(0), %VMM(0), %k2 /* Each bit cleared in K1 represents a mismatch or a null CHAR in YMM0 and 32 bytes at (%rsi). */ - CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} - kmovd %k1, %ecx + CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2} + KMOV %k1, %VRCX # ifdef USE_AS_STRNCMP cmpq $CHAR_PER_VEC, %rdx jbe L(vec_0_test_len) @@ -303,14 +326,14 @@ L(no_page_cross): wcscmp/wcsncmp. */ /* All 1s represents all equals. TESTEQ will overflow to zero in - all equals case. Otherwise 1s will carry until position of first - mismatch. */ - TESTEQ %ecx + all equals case. Otherwise 1s will carry until position of + first mismatch. */ + TESTEQ %VRCX jz L(more_3x_vec) .p2align 4,, 4 L(return_vec_0): - tzcntl %ecx, %ecx + bsf %VRCX, %VRCX # ifdef USE_AS_WCSCMP movl (%rdi, %rcx, SIZE_OF_CHAR), %edx xorl %eax, %eax @@ -321,7 +344,16 @@ L(return_vec_0): orl $1, %eax # else movzbl (%rdi, %rcx), %eax + /* For VEC_SIZE == 64 use movb instead of movzbl to save a byte + and keep logic for len <= VEC_SIZE (common) in just the + first cache line. NB: No evex512 processor has partial- + register stalls. If that changes this ifdef can be disabled + without affecting correctness. */ +# if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64 + movb (%rsi, %rcx), %cl +# else movzbl (%rsi, %rcx), %ecx +# endif TOLOWER_gpr (%rax, %eax) TOLOWER_gpr (%rcx, %ecx) subl %ecx, %eax @@ -332,8 +364,8 @@ L(ret0): # ifdef USE_AS_STRNCMP .p2align 4,, 4 L(vec_0_test_len): - notl %ecx - bzhil %edx, %ecx, %eax + not %VRCX + bzhi %VRDX, %VRCX, %VRAX jnz L(return_vec_0) /* Align if will cross fetch block. 
*/ .p2align 4,, 2 @@ -372,7 +404,7 @@ L(ret1): .p2align 4,, 10 L(return_vec_1): - tzcntl %ecx, %ecx + bsf %VRCX, %VRCX # ifdef USE_AS_STRNCMP /* rdx must be > CHAR_PER_VEC so its safe to subtract without worrying about underflow. */ @@ -401,24 +433,41 @@ L(ret2): .p2align 4,, 10 # ifdef USE_AS_STRNCMP L(return_vec_3): -# if CHAR_PER_VEC <= 16 +# if CHAR_PER_VEC <= 32 + /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_3) without + additional branches by adjusting the bit positions from + VEC3. We can't do this for CHAR_PER_VEC == 64. */ +# if CHAR_PER_VEC <= 16 sall $CHAR_PER_VEC, %ecx -# else +# else salq $CHAR_PER_VEC, %rcx +# endif +# else + /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just + check it. */ + bsf %VRCX, %VRCX + addl $(CHAR_PER_VEC), %ecx + cmpq %rcx, %rdx + ja L(ret_vec_3_finish) + xorl %eax, %eax + ret # endif # endif + + /* If CHAR_PER_VEC == 64 we can't combine matches from the last + 2x VEC so need seperate return label. */ L(return_vec_2): # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) - tzcntl %ecx, %ecx + bsf %VRCX, %VRCX # else - tzcntq %rcx, %rcx + bsfq %rcx, %rcx # endif - # ifdef USE_AS_STRNCMP cmpq %rcx, %rdx jbe L(ret_zero) # endif +L(ret_vec_3_finish): # ifdef USE_AS_WCSCMP movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx xorl %eax, %eax @@ -440,7 +489,7 @@ L(ret3): # ifndef USE_AS_STRNCMP .p2align 4,, 10 L(return_vec_3): - tzcntl %ecx, %ecx + bsf %VRCX, %VRCX # ifdef USE_AS_WCSCMP movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx xorl %eax, %eax @@ -465,11 +514,11 @@ L(ret4): .p2align 5 L(more_3x_vec): /* Safe to compare 4x vectors. */ - VMOVU (VEC_SIZE)(%rdi), %YMM0 - VPTESTM %YMM0, %YMM0, %k2 - CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} - kmovd %k1, %ecx - TESTEQ %ecx + VMOVU (VEC_SIZE)(%rdi), %VMM(0) + VPTESTM %VMM(0), %VMM(0), %k2 + CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2} + KMOV %k1, %VRCX + TESTEQ %VRCX jnz L(return_vec_1) # ifdef USE_AS_STRNCMP @@ -477,18 +526,18 @@ L(more_3x_vec): jbe L(ret_zero) # endif - VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 - VPTESTM %YMM0, %YMM0, %k2 - CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2} - kmovd %k1, %ecx - TESTEQ %ecx + VMOVU (VEC_SIZE * 2)(%rdi), %VMM(0) + VPTESTM %VMM(0), %VMM(0), %k2 + CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2} + KMOV %k1, %VRCX + TESTEQ %VRCX jnz L(return_vec_2) - VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 - VPTESTM %YMM0, %YMM0, %k2 - CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2} - kmovd %k1, %ecx - TESTEQ %ecx + VMOVU (VEC_SIZE * 3)(%rdi), %VMM(0) + VPTESTM %VMM(0), %VMM(0), %k2 + CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2} + KMOV %k1, %VRCX + TESTEQ %VRCX jnz L(return_vec_3) # ifdef USE_AS_STRNCMP @@ -565,110 +614,123 @@ L(loop): /* Loop entry after handling page cross during loop. */ L(loop_skip_page_cross_check): - VMOVA (VEC_SIZE * 0)(%rdi), %YMM0 - VMOVA (VEC_SIZE * 1)(%rdi), %YMM2 - VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 - VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + VMOVA (VEC_SIZE * 0)(%rdi), %VMM(0) + VMOVA (VEC_SIZE * 1)(%rdi), %VMM(2) + VMOVA (VEC_SIZE * 2)(%rdi), %VMM(4) + VMOVA (VEC_SIZE * 3)(%rdi), %VMM(6) - VPMINU %YMM0, %YMM2, %YMM8 - VPMINU %YMM4, %YMM6, %YMM9 + VPMINU %VMM(0), %VMM(2), %VMM(8) + VPMINU %VMM(4), %VMM(6), %VMM(9) /* A zero CHAR in YMM9 means that there is a null CHAR. */ - VPMINU %YMM8, %YMM9, %YMM9 + VPMINU %VMM(8), %VMM(9), %VMM(9) /* Each bit set in K1 represents a non-null CHAR in YMM9. 
*/ - VPTESTM %YMM9, %YMM9, %k1 + VPTESTM %VMM(9), %VMM(9), %k1 # ifndef USE_AS_STRCASECMP_L - vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 - vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 - vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + vpxorq (VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1) + vpxorq (VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3) + vpxorq (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5) /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while oring with YMM1. Result is stored in YMM6. */ - vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6) # else - VMOVU (VEC_SIZE * 0)(%rsi), %YMM1 - TOLOWER_YMM (%YMM0, %YMM1) - VMOVU (VEC_SIZE * 1)(%rsi), %YMM3 - TOLOWER_YMM (%YMM2, %YMM3) - VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 - TOLOWER_YMM (%YMM4, %YMM5) - VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 - TOLOWER_YMM (%YMM6, %YMM7) - vpxorq %YMM0, %YMM1, %YMM1 - vpxorq %YMM2, %YMM3, %YMM3 - vpxorq %YMM4, %YMM5, %YMM5 - vpternlogd $0xde, %YMM7, %YMM1, %YMM6 + VMOVU (VEC_SIZE * 0)(%rsi), %VMM(1) + TOLOWER_VMM (%VMM(0), %VMM(1)) + VMOVU (VEC_SIZE * 1)(%rsi), %VMM(3) + TOLOWER_VMM (%VMM(2), %VMM(3)) + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(5) + TOLOWER_VMM (%VMM(4), %VMM(5)) + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(7) + TOLOWER_VMM (%VMM(6), %VMM(7)) + vpxorq %VMM(0), %VMM(1), %VMM(1) + vpxorq %VMM(2), %VMM(3), %VMM(3) + vpxorq %VMM(4), %VMM(5), %VMM(5) + vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6) # endif /* Or together YMM3, YMM5, and YMM6. */ - vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6) /* A non-zero CHAR in YMM6 represents a mismatch. */ - VPTESTNM %YMM6, %YMM6, %k0{%k1} - kmovd %k0, %LOOP_REG + VPTESTNM %VMM(6), %VMM(6), %k0{%k1} + KMOV %k0, %LOOP_REG TESTEQ %LOOP_REG jz L(loop) /* Find which VEC has the mismatch of end of string. */ - VPTESTM %YMM0, %YMM0, %k1 - VPTESTNM %YMM1, %YMM1, %k0{%k1} - kmovd %k0, %ecx - TESTEQ %ecx + VPTESTM %VMM(0), %VMM(0), %k1 + VPTESTNM %VMM(1), %VMM(1), %k0{%k1} + KMOV %k0, %VRCX + TESTEQ %VRCX jnz L(return_vec_0_end) - VPTESTM %YMM2, %YMM2, %k1 - VPTESTNM %YMM3, %YMM3, %k0{%k1} - kmovd %k0, %ecx - TESTEQ %ecx + VPTESTM %VMM(2), %VMM(2), %k1 + VPTESTNM %VMM(3), %VMM(3), %k0{%k1} + KMOV %k0, %VRCX + TESTEQ %VRCX jnz L(return_vec_1_end) - /* Handle VEC 2 and 3 without branches. */ + /* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32. + */ L(return_vec_2_3_end): # ifdef USE_AS_STRNCMP subq $(CHAR_PER_VEC * 2), %rdx jbe L(ret_zero_end) # endif - VPTESTM %YMM4, %YMM4, %k1 - VPTESTNM %YMM5, %YMM5, %k0{%k1} - kmovd %k0, %ecx - TESTEQ %ecx + VPTESTM %VMM(4), %VMM(4), %k1 + VPTESTNM %VMM(5), %VMM(5), %k0{%k1} + KMOV %k0, %VRCX + TESTEQ %VRCX # if CHAR_PER_VEC <= 16 sall $CHAR_PER_VEC, %LOOP_REG orl %ecx, %LOOP_REG -# else +# elif CHAR_PER_VEC <= 32 salq $CHAR_PER_VEC, %LOOP_REG64 orq %rcx, %LOOP_REG64 +# else + /* We aren't combining last 2x VEC so branch on second the last. + */ + jnz L(return_vec_2_end) # endif -L(return_vec_3_end): + /* LOOP_REG contains matches for null/mismatch from the loop. If - VEC 0,1,and 2 all have no null and no mismatches then mismatch - must entirely be from VEC 3 which is fully represented by - LOOP_REG. */ + VEC 0,1,and 2 all have no null and no mismatches then + mismatch must entirely be from VEC 3 which is fully + represented by LOOP_REG. 
*/ # if CHAR_PER_VEC <= 16 - tzcntl %LOOP_REG, %LOOP_REG + bsf %LOOP_REG, %LOOP_REG # else - tzcntq %LOOP_REG64, %LOOP_REG64 + bsfq %LOOP_REG64, %LOOP_REG64 # endif # ifdef USE_AS_STRNCMP + + /* If CHAR_PER_VEC == 64 we can't combine last 2x VEC so need to + adj length before last comparison. */ +# if CHAR_PER_VEC == 64 + subq $CHAR_PER_VEC, %rdx + jbe L(ret_zero_end) +# endif + cmpq %LOOP_REG64, %rdx jbe L(ret_zero_end) # endif # ifdef USE_AS_WCSCMP - movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx + movl (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx xorl %eax, %eax - cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx + cmpl (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx je L(ret5) setl %al negl %eax xorl %r8d, %eax # else - movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax - movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx + movzbl (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax + movzbl (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx TOLOWER_gpr (%rax, %eax) TOLOWER_gpr (%rcx, %ecx) subl %ecx, %eax @@ -686,23 +748,39 @@ L(ret_zero_end): # endif + /* The L(return_vec_N_end) differ from L(return_vec_N) in that - they use the value of `r8` to negate the return value. This is - because the page cross logic can swap `rdi` and `rsi`. */ + they use the value of `r8` to negate the return value. This + is because the page cross logic can swap `rdi` and `rsi`. + */ .p2align 4,, 10 # ifdef USE_AS_STRNCMP L(return_vec_1_end): -# if CHAR_PER_VEC <= 16 +# if CHAR_PER_VEC <= 32 + /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end) + without additional branches by adjusting the bit positions + from VEC1. We can't do this for CHAR_PER_VEC == 64. */ +# if CHAR_PER_VEC <= 16 sall $CHAR_PER_VEC, %ecx -# else +# else salq $CHAR_PER_VEC, %rcx +# endif +# else + /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just + check it. */ + bsf %VRCX, %VRCX + addl $(CHAR_PER_VEC), %ecx + cmpq %rcx, %rdx + ja L(ret_vec_0_end_finish) + xorl %eax, %eax + ret # endif # endif L(return_vec_0_end): # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) - tzcntl %ecx, %ecx + bsf %VRCX, %VRCX # else - tzcntq %rcx, %rcx + bsfq %rcx, %rcx # endif # ifdef USE_AS_STRNCMP @@ -710,6 +788,7 @@ L(return_vec_0_end): jbe L(ret_zero_end) # endif +L(ret_vec_0_end_finish): # ifdef USE_AS_WCSCMP movl (%rdi, %rcx, SIZE_OF_CHAR), %edx xorl %eax, %eax @@ -737,7 +816,7 @@ L(ret6): # ifndef USE_AS_STRNCMP .p2align 4,, 10 L(return_vec_1_end): - tzcntl %ecx, %ecx + bsf %VRCX, %VRCX # ifdef USE_AS_WCSCMP movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx xorl %eax, %eax @@ -760,6 +839,41 @@ L(ret7): # endif + /* If CHAR_PER_VEC == 64 we can't combine matches from the last + 2x VEC so need seperate return label. */ +# if CHAR_PER_VEC == 64 +L(return_vec_2_end): + bsf %VRCX, %VRCX +# ifdef USE_AS_STRNCMP + cmpq %rcx, %rdx + jbe L(ret_zero_end) +# endif +# ifdef USE_AS_WCSCMP + movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax + cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx + je L(ret31) + setl %al + negl %eax + /* This is the non-zero case for `eax` so just xorl with `r8d` + flip is `rdi` and `rsi` where swapped. */ + xorl %r8d, %eax +# else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + /* Flip `eax` if `rdi` and `rsi` where swapped in page cross + logic. Subtract `r8d` after xor for zero case. 
*/ + xorl %r8d, %eax + subl %r8d, %eax +# endif +L(ret13): + ret +# endif + + /* Page cross in rsi in next 4x VEC. */ /* TODO: Improve logic here. */ @@ -778,11 +892,11 @@ L(page_cross_during_loop): cmpl $-(VEC_SIZE * 3), %eax jle L(less_1x_vec_till_page_cross) - VMOVA (%rdi), %YMM0 - VPTESTM %YMM0, %YMM0, %k2 - CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} - kmovd %k1, %ecx - TESTEQ %ecx + VMOVA (%rdi), %VMM(0) + VPTESTM %VMM(0), %VMM(0), %k2 + CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2} + KMOV %k1, %VRCX + TESTEQ %VRCX jnz L(return_vec_0_end) /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ @@ -799,9 +913,9 @@ L(less_1x_vec_till_page_cross): to read back -VEC_SIZE. If rdi is truly at the start of a page here, it means the previous page (rdi - VEC_SIZE) has already been loaded earlier so must be valid. */ - VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 - VPTESTM %YMM0, %YMM0, %k2 - CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2} + VMOVU -VEC_SIZE(%rdi, %rax), %VMM(0) + VPTESTM %VMM(0), %VMM(0), %k2 + CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2} /* Mask of potentially valid bits. The lower bits can be out of range comparisons (but safe regarding page crosses). */ @@ -813,12 +927,12 @@ L(less_1x_vec_till_page_cross): shlxl %ecx, %r10d, %ecx movzbl %cl, %r10d # else - movl $-1, %ecx - shlxl %esi, %ecx, %r10d + mov $-1, %VRCX + shlx %VRSI, %VRCX, %VR10 # endif - kmovd %k1, %ecx - notl %ecx + KMOV %k1, %VRCX + not %VRCX # ifdef USE_AS_STRNCMP @@ -838,12 +952,10 @@ L(less_1x_vec_till_page_cross): /* Readjust eax before potentially returning to the loop. */ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax - andl %r10d, %ecx + and %VR10, %VRCX jz L(loop_skip_page_cross_check) - .p2align 4,, 3 -L(return_page_cross_end): - tzcntl %ecx, %ecx + bsf %VRCX, %VRCX # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx @@ -874,8 +986,12 @@ L(ret8): # ifdef USE_AS_STRNCMP .p2align 4,, 10 L(return_page_cross_end_check): - andl %r10d, %ecx - tzcntl %ecx, %ecx + and %VR10, %VRCX + /* Need to use tzcnt here as VRCX may be zero. If VRCX is zero + tzcnt(VRCX) will be CHAR_PER and remaining length (edx) is + guranteed to be <= CHAR_PER_VEC so we will only use the return + idx if VRCX was non-zero. */ + tzcnt %VRCX, %VRCX leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx # ifdef USE_AS_WCSCMP sall $2, %edx @@ -892,11 +1008,11 @@ L(more_2x_vec_till_page_cross): /* If more 2x vec till cross we will complete a full loop iteration here. */ - VMOVA VEC_SIZE(%rdi), %YMM0 - VPTESTM %YMM0, %YMM0, %k2 - CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} - kmovd %k1, %ecx - TESTEQ %ecx + VMOVA VEC_SIZE(%rdi), %VMM(0) + VPTESTM %VMM(0), %VMM(0), %k2 + CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2} + KMOV %k1, %VRCX + TESTEQ %VRCX jnz L(return_vec_1_end) # ifdef USE_AS_STRNCMP @@ -907,18 +1023,18 @@ L(more_2x_vec_till_page_cross): subl $-(VEC_SIZE * 4), %eax /* Safe to include comparisons from lower bytes. 
@@ -907,18 +1023,18 @@ L(more_2x_vec_till_page_cross):
 	subl	$-(VEC_SIZE * 4), %eax

 	/* Safe to include comparisons from lower bytes.  */
-	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_0)

-	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_1)

 # ifdef USE_AS_STRNCMP
@@ -937,30 +1053,30 @@ L(more_2x_vec_till_page_cross):
 # endif

 	/* Finish the loop.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
-	VPMINU	%YMM4, %YMM6, %YMM9
-	VPTESTM	%YMM9, %YMM9, %k1
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(4)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(6)
+	VPMINU	%VMM(4), %VMM(6), %VMM(9)
+	VPTESTM	%VMM(9), %VMM(9), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
 # else
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
-# endif
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
-	kmovd	%k0, %LOOP_REG
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(5)
+	TOLOWER_VMM (%VMM(4), %VMM(5))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	TOLOWER_VMM (%VMM(6), %VMM(7))
+	vpxorq	%VMM(4), %VMM(5), %VMM(5)
+	vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
+# endif
+	VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
+	KMOV	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)

 	/* Best for code size to include ucond-jmp here. Would be faster
-	   if this case is hot to duplicate the L(return_vec_2_3_end) code
-	   as fall-through and have jump back to loop on mismatch
+	   if this case is hot to duplicate the L(return_vec_2_3_end)
+	   code as fall-through and have jump back to loop on mismatch
 	   comparison.  */
 	subq	$-(VEC_SIZE * 4), %rdi
 	subq	$-(VEC_SIZE * 4), %rsi
@@ -980,7 +1096,7 @@ L(ret_zero_in_loop_page_cross):
 L(return_vec_page_cross_0):
 	addl	$-VEC_SIZE, %eax
 L(return_vec_page_cross_1):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 # ifdef USE_AS_STRNCMP
@@ -1023,8 +1139,8 @@ L(ret9):
 L(page_cross):
 # ifndef USE_AS_STRNCMP
 	/* If both are VEC aligned we don't need any special logic here.
-	   Only valid for strcmp where stop condition is guranteed to be
-	   reachable by just reading memory.  */
+	   Only valid for strcmp where stop condition is guaranteed to
+	   be reachable by just reading memory.  */
 	testl	$((VEC_SIZE - 1) << 20), %eax
 	jz	L(no_page_cross)
 # endif
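On the `vpternlogd $0xde` retained in the loop-finish block above: the immediate of vpternlog is just the truth table of the wanted three-input boolean function, and 0xde encodes exactly the `YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6)` expression noted in the comment, so a single instruction folds what would otherwise be a second vpxorq plus a vpor of the two mismatch masks. A throwaway C check of that immediate (illustrative only; the operand-to-bit mapping follows the manual's destination/source ordering):

#include <stdio.h>

/* Bit (a*4 + b*2 + c) of a vpternlog immediate is the function value
   for inputs a (destination), b (first source), c (second source).  */
static unsigned
ternlog_imm (int (*f) (int, int, int))
{
  unsigned imm = 0;
  for (int a = 0; a < 2; a++)
    for (int b = 0; b < 2; b++)
      for (int c = 0; c < 2; c++)
        imm |= (unsigned) f (a, b, c) << (a * 4 + b * 2 + c);
  return imm;
}

/* "Mismatch seen in either XOR result": b | (c ^ a).  */
static int
or_of_xor (int a, int b, int c)
{
  return b | (c ^ a);
}

int
main (void)
{
  printf ("0x%02x\n", ternlog_imm (or_of_xor)); /* prints 0xde */
  return 0;
}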
@@ -1065,11 +1181,11 @@ L(page_cross):
 	   loadable memory until within 1x VEC of page cross.  */
 	.p2align 4,, 8
 L(page_cross_loop):
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(check_ret_vec_page_cross)
 	addl	$CHAR_PER_VEC, %OFFSET_REG
 # ifdef USE_AS_STRNCMP
@@ -1087,13 +1203,13 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	   */
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}

-	kmovd	%k1, %ecx
+	KMOV	%k1, %VRCX
 # ifdef USE_AS_STRNCMP
 	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
 	cmpq	%rax, %rdx
@@ -1104,7 +1220,7 @@ L(page_cross_loop):
 	addq	%rdi, %rdx
 # endif
 # endif
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 	jz	L(prepare_loop_no_len)

 	.p2align 4,, 4
 L(ret_vec_page_cross):
 # ifndef USE_AS_STRNCMP
 L(check_ret_vec_page_cross):
 # endif
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 L(ret_vec_page_cross_cont):
 # ifdef USE_AS_WCSCMP
@@ -1139,9 +1255,9 @@ L(ret12):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(check_ret_vec_page_cross2):
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 L(check_ret_vec_page_cross):
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 	cmpq	%rcx, %rdx
 	ja	L(ret_vec_page_cross_cont)
@@ -1180,8 +1296,71 @@ L(less_1x_vec_till_page):
 # ifdef USE_AS_WCSCMP
 	shrl	$2, %eax
 # endif
+
+	/* Find largest load size we can use. VEC_SIZE == 64 only check
+	   if we can do a full ymm load.  */
+# if VEC_SIZE == 64
+
+	cmpl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
+	ja	L(less_32_till_page)
+
+
+	/* Use 32 byte comparison.  */
+	VMOVU	(%rdi), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+	jnz	L(check_ret_vec_page_cross)
+	movl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
+# ifdef USE_AS_STRNCMP
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subl	%eax, %OFFSET_REG
+# else
+	/* Explicit check for 32 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+	jz	L(prepare_loop)
+# endif
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+	jnz	L(check_ret_vec_page_cross)
+# ifdef USE_AS_STRNCMP
+	addl	$(32 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# else
+	leaq	(32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# endif
+	jmp	L(prepare_loop_aligned)
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case64):
+	xorl	%eax, %eax
+	ret
+# endif
+L(less_32_till_page):
+# endif
+
 	/* Find largest load size we can use.  */
-	cmpl	$(16 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
 	ja	L(less_16_till_page)

 	/* Use 16 byte comparison.  */
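The VEC_SIZE == 64 block above, and the 16-, 8- and 4-byte fallbacks in the following hunks, all implement the same idea: when a full vector load could stray into the next (possibly unmapped) page, drop to the widest load that still fits in front of the boundary, and only move past the page edge once those characters are known not to end the comparison. A rough C sketch of the size-selection rule, under the simplifying assumption of byte-sized characters and with made-up names (it abstracts away the patch's register bookkeeping):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

/* Widest load (up to VEC_SIZE bytes, halving down to 4) that can be
   issued at ADDR without touching the following page; 1 means "fall
   back to the byte loop".  */
static size_t
safe_load_size (uintptr_t addr, size_t vec_size)
{
  size_t to_page = PAGE_SIZE - (addr & (PAGE_SIZE - 1));
  for (size_t width = vec_size; width >= 4; width /= 2)
    if (width <= to_page)
      return width;
  return 1;
}

int
main (void)
{
  /* 48 bytes before a page boundary with 64-byte vectors: use the
     32-byte (ymm) comparison, as in the VEC_SIZE == 64 block above.  */
  printf ("%zu\n", safe_load_size (PAGE_SIZE - 48, 64));
  /* Only 3 bytes left before the boundary: byte loop.  */
  printf ("%zu\n", safe_load_size (PAGE_SIZE - 3, 64));
  return 0;
}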
@@ -1195,9 +1374,14 @@ L(less_1x_vec_till_page):
 	incw	%cx
 # endif
 	jnz	L(check_ret_vec_page_cross)
-	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+
+	movl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
 # ifdef USE_AS_STRNCMP
+# if VEC_SIZE == 32
 	cmpq	%OFFSET_REG64, %rdx
+# else
+	cmpq	$(16 / SIZE_OF_CHAR), %rdx
+# endif
 	jbe	L(ret_zero_page_cross_slow_case0)
 	subl	%eax, %OFFSET_REG
 # else
@@ -1239,7 +1423,7 @@ L(ret_zero_page_cross_slow_case0):

 	.p2align 4,, 10
 L(less_16_till_page):
-	cmpl	$(24 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
 	ja	L(less_8_till_page)

 	/* Use 8 byte comparison.  */
@@ -1260,7 +1444,7 @@ L(less_16_till_page):
 	cmpq	$(8 / SIZE_OF_CHAR), %rdx
 	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
-	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG

 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1320,7 +1504,7 @@ L(ret_less_8_wcs):
 	ret

 # else
-	cmpl	$28, %eax
+	cmpl	$(VEC_SIZE - 4), %eax
 	ja	L(less_4_till_page)

 	vmovd	(%rdi), %xmm0
@@ -1335,7 +1519,7 @@ L(ret_less_8_wcs):
 	cmpq	$4, %rdx
 	jbe	L(ret_zero_page_cross_slow_case1)
 # endif
-	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG

 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1386,7 +1570,7 @@ L(less_4_loop):
 # endif
 	incq	%rdi
 	/* end condition is reach page boundary (rdi is aligned).  */
-	testl	$31, %edi
+	testb	$(VEC_SIZE - 1), %dil
 	jnz	L(less_4_loop)
 	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
 	addq	$-(VEC_SIZE * 4), %rdi