@@ -26,48 +26,75 @@
# define STRCHR __strchr_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
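+/* x86-evex256-vecs.h selects 256-bit vectors (VEC_SIZE == 32); a
+   wrapper wanting the 512-bit variant includes its vec header before
+   this file so the #ifndef above leaves it in place.  */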
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
+# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
# define VPTESTN vptestnmd
+# define VPTEST vptestmd
# define VPMINU vpminud
# define CHAR_REG esi
-# define SHIFT_REG ecx
+# define SHIFT_REG rcx
# define CHAR_SIZE 4
+
+# define USE_WIDE_CHAR
# else
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
# define VPTESTN vptestnmb
+# define VPTEST vptestmb
# define VPMINU vpminub
# define CHAR_REG sil
-# define SHIFT_REG edx
+# define SHIFT_REG rdi
# define CHAR_SIZE 1
# endif
-# define XMMZERO xmm16
-
-# define YMMZERO ymm16
-# define YMM0 ymm17
-# define YMM1 ymm18
-# define YMM2 ymm19
-# define YMM3 ymm20
-# define YMM4 ymm21
-# define YMM5 ymm22
-# define YMM6 ymm23
-# define YMM7 ymm24
-# define YMM8 ymm25
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
- .section .text.evex,"ax",@progbits
-ENTRY_P2ALIGN (STRCHR, 5)
- /* Broadcast CHAR to YMM0. */
- VPBROADCAST %esi, %YMM0
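+/* reg-macros.h supplies the width-agnostic GPR and mask helpers used
+   below (VGPR, VGPR_SZ, VRAX/VRCX/VRDX, KMOV, KOR, KORTEST), sized
+   according to VEC_SIZE and USE_WIDE_CHAR.  */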
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+# define MASK_GPR rcx
+# define LOOP_REG rax
+
+# define COND_MASK(k_reg) {%k_reg}
+# else
+# define MASK_GPR rax
+# define LOOP_REG rdi
+
+# define COND_MASK(k_reg)
+# endif
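+/* For VEC_SIZE == 64 COND_MASK expands to a {%k_reg} predicate so the
+   esi non-match masks (k2/k3) can be applied directly by VPTEST.  For
+   VEC_SIZE == 32 it expands to nothing since the matches are already
+   folded into the vector results via VPMINU.  */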
+
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+
+# if CHAR_PER_VEC == 64
+# define LAST_VEC_OFFSET (VEC_SIZE * 3)
+# define TESTZ(reg) incq %VGPR_SZ(reg, 64)
+# else
+
+# if CHAR_PER_VEC == 32
+# define TESTZ(reg) incl %VGPR_SZ(reg, 32)
+# elif CHAR_PER_VEC == 16
+# define TESTZ(reg) incw %VGPR_SZ(reg, 16)
+# else
+# define TESTZ(reg) incb %VGPR_SZ(reg, 8)
+# endif
+
+# define LAST_VEC_OFFSET (VEC_SIZE * 2)
+# endif
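+/* TESTZ(reg) relies on the masks below having 1s at lanes with no
+   match/null: inc of an all-ones sub-register wraps to zero (setting
+   ZF), while any zero bit stops the carry (e.g. 0b0111 -> 0b1000), so
+   bsf of the result yields the index of the first match/null.  */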
+
+# define VMATCH VMM(0)
+
+# define PAGE_SIZE 4096
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRCHR, 6)
+	/* Broadcast CHAR to VMATCH.  */
+ VPBROADCAST %esi, %VMATCH
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
/* Check if we cross page boundary with one vector load.
@@ -75,19 +102,27 @@ ENTRY_P2ALIGN (STRCHR, 5)
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page_boundary)
+
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null bytes. */
- VMOVU (%rdi), %YMM1
-
+ VMOVU (%rdi), %VMM(1)
/* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPTESTN %YMM2, %YMM2, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ vpxorq %VMM(1), %VMATCH, %VMM(2)
+ VPMINU %VMM(2), %VMM(1), %VMM(2)
+ /* Each bit in K0 represents a CHAR or a null byte in VEC_1. */
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRAX
+# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL
+	/* If VEC_SIZE == 64 && STRCHRNUL use bsf to test the condition
+	   so that all logic for a match/null in the first VEC fits in
+	   1x cache line.  This has a slight cost for larger sizes.  */
+ bsf %VRAX, %VRAX
+ jz L(aligned_more)
+# else
+ test %VRAX, %VRAX
jz L(aligned_more)
- tzcntl %eax, %eax
+ bsf %VRAX, %VRAX
+# endif
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -109,287 +144,374 @@ ENTRY_P2ALIGN (STRCHR, 5)
# endif
ret
-
-
- .p2align 4,, 10
-L(first_vec_x4):
-# ifndef USE_AS_STRCHRNUL
- /* Check to see if first match was CHAR (k0) or null (k1). */
- kmovd %k0, %eax
- tzcntl %eax, %eax
- kmovd %k1, %ecx
- /* bzhil will not be 0 if first match was null. */
- bzhil %eax, %ecx, %ecx
- jne L(zero)
-# else
- /* Combine CHAR and null matches. */
- kord %k0, %k1, %k0
- kmovd %k0, %eax
- tzcntl %eax, %eax
-# endif
- /* NB: Multiply sizeof char type (1 or 4) to get the number of
- bytes. */
- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
- ret
-
# ifndef USE_AS_STRCHRNUL
L(zero):
xorl %eax, %eax
ret
# endif
-
- .p2align 4
+ .p2align 4,, 2
+L(first_vec_x3):
+ subq $-(VEC_SIZE * 2), %rdi
+# if VEC_SIZE == 32
+ /* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32.
+ For VEC_SIZE == 64 the registers don't match. */
+L(last_vec_x2):
+# endif
L(first_vec_x1):
-	/* Use bsf here to save 1-byte keeping keeping the block in 1x
-	   fetch block. eax guranteed non-zero.  */
-	bsfl	%eax, %eax
+	/* Use bsf here to save 1 byte, keeping the block in 1x fetch
+	   block.  rcx is guaranteed non-zero.  */
+	bsf	%VRCX, %VRCX
# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ /* Found CHAR or the null byte. */
+ cmp (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
jne L(zero)
-
# endif
/* NB: Multiply sizeof char type (1 or 4) to get the number of
bytes. */
- leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ leaq (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax
ret
- .p2align 4,, 10
+ .p2align 4,, 2
+L(first_vec_x4):
+ subq $-(VEC_SIZE * 2), %rdi
L(first_vec_x2):
# ifndef USE_AS_STRCHRNUL
/* Check to see if first match was CHAR (k0) or null (k1). */
- kmovd %k0, %eax
- tzcntl %eax, %eax
- kmovd %k1, %ecx
+ KMOV %k0, %VRAX
+ tzcnt %VRAX, %VRAX
+ KMOV %k1, %VRCX
/* bzhil will not be 0 if first match was null. */
- bzhil %eax, %ecx, %ecx
+ bzhi %VRAX, %VRCX, %VRCX
jne L(zero)
# else
/* Combine CHAR and null matches. */
- kord %k0, %k1, %k0
- kmovd %k0, %eax
- tzcntl %eax, %eax
+ KOR %k0, %k1, %k0
+ KMOV %k0, %VRAX
+ bsf %VRAX, %VRAX
# endif
/* NB: Multiply sizeof char type (1 or 4) to get the number of
bytes. */
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
- .p2align 4,, 10
-L(first_vec_x3):
- /* Use bsf here to save 1-byte keeping keeping the block in 1x
- fetch block. eax guranteed non-zero. */
- bsfl %eax, %eax
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
- jne L(zero)
+# ifdef USE_AS_STRCHRNUL
+	/* We use this as a hook to get an imm8 encoding for the jump to
+	   L(cross_page_boundary).  This allows the hot case of a
+	   match/null-term in the first VEC to fit entirely in 1 cache
+	   line.  */
+L(cross_page_boundary):
+ jmp L(cross_page_boundary_real)
# endif
- /* NB: Multiply sizeof char type (1 or 4) to get the number of
- bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
- ret
.p2align 4
L(aligned_more):
+L(cross_page_continue):
/* Align data to VEC_SIZE. */
andq $-VEC_SIZE, %rdi
-L(cross_page_continue):
- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
- data is only aligned to VEC_SIZE. Use two alternating methods
- for checking VEC to balance latency and port contention. */
- /* This method has higher latency but has better port
- distribution. */
- VMOVA (VEC_SIZE)(%rdi), %YMM1
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. Use two alternating
+ methods for checking VEC to balance latency and port
+ contention. */
+
+ /* Method(1) with 8c latency:
+ For VEC_SIZE == 32:
+ p0 * 1.83, p1 * 0.83, p5 * 1.33
+ For VEC_SIZE == 64:
+ p0 * 2.50, p1 * 0.00, p5 * 1.50 */
+ VMOVA (VEC_SIZE)(%rdi), %VMM(1)
/* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPTESTN %YMM2, %YMM2, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ vpxorq %VMM(1), %VMATCH, %VMM(2)
+ VPMINU %VMM(2), %VMM(1), %VMM(2)
+ /* Each bit in K0 represents a CHAR or a null byte in VEC_1. */
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
jnz L(first_vec_x1)
- /* This method has higher latency but has better port
- distribution. */
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
- /* Each bit in K0 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMM0, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPTESTN %YMM1, %YMM1, %k1
- kortestd %k0, %k1
+ /* Method(2) with 6c latency:
+ For VEC_SIZE == 32:
+ p0 * 1.00, p1 * 0.00, p5 * 2.00
+ For VEC_SIZE == 64:
+ p0 * 1.00, p1 * 0.00, p5 * 2.00 */
+ VMOVA (VEC_SIZE * 2)(%rdi), %VMM(1)
+ /* Each bit in K0 represents a CHAR in VEC_1. */
+ VPCMPEQ %VMM(1), %VMATCH, %k0
+ /* Each bit in K1 represents a CHAR in VEC_1. */
+ VPTESTN %VMM(1), %VMM(1), %k1
+ KORTEST %k0, %k1
jnz L(first_vec_x2)
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
+	/* By swapping between Method 1/2 we get fairer port
+	   distribution and better throughput.  */
+
+ VMOVA (VEC_SIZE * 3)(%rdi), %VMM(1)
/* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPTESTN %YMM2, %YMM2, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ vpxorq %VMM(1), %VMATCH, %VMM(2)
+ VPMINU %VMM(2), %VMM(1), %VMM(2)
+ /* Each bit in K0 represents a CHAR or a null byte in VEC_1. */
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
jnz L(first_vec_x3)
- VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
- /* Each bit in K0 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMM0, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPTESTN %YMM1, %YMM1, %k1
- kortestd %k0, %k1
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
+ /* Each bit in K0 represents a CHAR in VEC_1. */
+ VPCMPEQ %VMM(1), %VMATCH, %k0
+ /* Each bit in K1 represents a CHAR in VEC_1. */
+ VPTESTN %VMM(1), %VMM(1), %k1
+ KORTEST %k0, %k1
jnz L(first_vec_x4)
/* Align data to VEC_SIZE * 4 for the loop. */
+# if VEC_SIZE == 64
+	/* Use rax for the loop reg as it allows the loop to fit in
+	   exactly 2 cache lines (more efficient imm32 + gpr
+	   encoding).  */
+ leaq (VEC_SIZE)(%rdi), %rax
+ /* No partial register stalls on evex512 processors. */
+ xorb %al, %al
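+	/* rdi is already VEC_SIZE aligned here, so clearing the low
+	   byte of rax == rdi + VEC_SIZE rounds it down to a
+	   VEC_SIZE * 4 (256 byte) boundary, mirroring the andq in the
+	   VEC_SIZE == 32 path.  */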
+# else
+ /* For VEC_SIZE == 32 continue using rdi for loop reg so we can
+ reuse more code and save space. */
addq $VEC_SIZE, %rdi
andq $-(VEC_SIZE * 4), %rdi
-
+# endif
.p2align 4
L(loop_4x_vec):
- /* Check 4x VEC at a time. No penalty to imm32 offset with evex
- encoding. */
- VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
- VMOVA (VEC_SIZE * 5)(%rdi), %YMM2
- VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
- VMOVA (VEC_SIZE * 7)(%rdi), %YMM4
-
- /* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+ /* Check 4x VEC at a time. No penalty for imm32 offset with evex
+ encoding (if offset % VEC_SIZE == 0). */
+ VMOVA (VEC_SIZE * 4)(%LOOP_REG), %VMM(1)
+ VMOVA (VEC_SIZE * 5)(%LOOP_REG), %VMM(2)
+ VMOVA (VEC_SIZE * 6)(%LOOP_REG), %VMM(3)
+ VMOVA (VEC_SIZE * 7)(%LOOP_REG), %VMM(4)
+
+	/* Collect bits where VEC_1 does NOT match esi.  This is later
+	   used to mask off results (collecting non-matches allows us to
+	   save an instruction on combining).  */
+ VPCMP $4, %VMATCH, %VMM(1), %k1
+
+	/* Two methods for the loop depending on VEC_SIZE.  This is
+	   because with zmm registers VPMINU can only run on p0 (as
+	   opposed to p0/p1 for ymm) so it is less preferred.  */
+# if VEC_SIZE == 32
+ /* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
zero. */
- vpxorq %YMM1, %YMM0, %YMM5
- /* For YMM2 and YMM4 cmp not equals to CHAR and store result in
- k register. Its possible to save either 1 or 2 instructions
- using cmp no equals method for either YMM1 or YMM1 and YMM3
- respectively but bottleneck on p5 makes it not worth it. */
- VPCMP $4, %YMM0, %YMM2, %k2
- vpxorq %YMM3, %YMM0, %YMM7
- VPCMP $4, %YMM0, %YMM4, %k4
-
- /* Use min to select all zeros from either xor or end of string).
- */
- VPMINU %YMM1, %YMM5, %YMM1
- VPMINU %YMM3, %YMM7, %YMM3
+ vpxorq %VMM(2), %VMATCH, %VMM(6)
+ vpxorq %VMM(3), %VMATCH, %VMM(7)
- /* Use min + zeromask to select for zeros. Since k2 and k4 will
- have 0 as positions that matched with CHAR which will set
- zero in the corresponding destination bytes in YMM2 / YMM4.
- */
- VPMINU %YMM1, %YMM2, %YMM2{%k2}{z}
- VPMINU %YMM3, %YMM4, %YMM4
- VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
-
- VPTESTN %YMM4, %YMM4, %k1
- kmovd %k1, %ecx
- subq $-(VEC_SIZE * 4), %rdi
- testl %ecx, %ecx
+ /* Find non-matches in VEC_4 while combining with non-matches
+	   from VEC_1.  NB: Try to use masked predicate execution on
+	   instructions that have a mask result, as it has no latency
+	   penalty.  */
+ VPCMP $4, %VMATCH, %VMM(4), %k4{%k1}
+
+ /* Combined zeros from VEC_1 / VEC_2 (search for null term). */
+ VPMINU %VMM(1), %VMM(2), %VMM(2)
+
+	/* Use min to select all zeros from either the xor or the end of
+	   the string.  */
+ VPMINU %VMM(3), %VMM(7), %VMM(3)
+ VPMINU %VMM(2), %VMM(6), %VMM(2)
+
+	/* Combined zeros from VEC_3 / VEC_4 (search for null term).  */
+ VPMINU %VMM(3), %VMM(4), %VMM(4)
+
+ /* Combined zeros from VEC_2 / VEC_4 (this has all null term and
+ esi matches for VEC_2 / VEC_3). */
+ VPMINU %VMM(2), %VMM(4), %VMM(4)
+# else
+ /* Collect non-matches for VEC_2. */
+ VPCMP $4, %VMM(2), %VMATCH, %k2
+
+ /* Combined zeros from VEC_1 / VEC_2 (search for null term). */
+ VPMINU %VMM(1), %VMM(2), %VMM(2)
+
+ /* Find non-matches in VEC_3/VEC_4 while combining with non-
+ matches from VEC_1/VEC_2 respectively. */
+ VPCMP $4, %VMM(3), %VMATCH, %k3{%k1}
+ VPCMP $4, %VMM(4), %VMATCH, %k4{%k2}
+
+ /* Finish combining zeros in all VECs. */
+ VPMINU %VMM(3), %VMM(4), %VMM(4)
+
+ /* Combine in esi matches for VEC_3 (if there was a match with
+ esi, the corresponding bit in %k3 is zero so the
+	   VPMINU_MASKZ will have a zero in the result).  NB: This makes
+	   the VPMINU 3c latency.  The only way to avoid it is to create
+	   a 12c dependency chain on all the `VPCMP $4, ...`, which has
+	   higher total latency.  */
+ VPMINU %VMM(2), %VMM(4), %VMM(4){%k3}{z}
+# endif
+ VPTEST %VMM(4), %VMM(4), %k0{%k4}
+ KMOV %k0, %VRDX
+ subq $-(VEC_SIZE * 4), %LOOP_REG
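+	/* NB: For VEC_SIZE == 32 advancing by 128 via `subq $-128`
+	   keeps an imm8 encoding (-128 fits in a signed byte while
+	   +128 does not).  */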
+
+	/* TESTZ is an inc using the proper register width for
+	   CHAR_PER_VEC.  An esi match or null-term match leaves a zero
+	   bit in rdx so the inc won't wrap around to zero.  */
+ TESTZ (rdx)
jz L(loop_4x_vec)
- VPTESTN %YMM1, %YMM1, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
+ VPTEST %VMM(1), %VMM(1), %k0{%k1}
+ KMOV %k0, %VGPR(MASK_GPR)
+ TESTZ (MASK_GPR)
+# if VEC_SIZE == 32
+ /* We can reuse the return code in page_cross logic for VEC_SIZE
+ == 32. */
+ jnz L(last_vec_x1_vec_size32)
+# else
+ jnz L(last_vec_x1_vec_size64)
+# endif
+
- VPTESTN %YMM2, %YMM2, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+	/* COND_MASK integrates the esi matches for VEC_SIZE == 64.  For
+	   VEC_SIZE == 32 they are already integrated.  */
+ VPTEST %VMM(2), %VMM(2), %k0 COND_MASK(k2)
+ KMOV %k0, %VRCX
+ TESTZ (rcx)
jnz L(last_vec_x2)
- VPTESTN %YMM3, %YMM3, %k0
- kmovd %k0, %eax
- /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
-# ifdef USE_AS_WCSCHR
- sall $8, %ecx
- orl %ecx, %eax
- bsfl %eax, %eax
+ VPTEST %VMM(3), %VMM(3), %k0 COND_MASK(k3)
+ KMOV %k0, %VRCX
+# if CHAR_PER_VEC == 64
+ TESTZ (rcx)
+ jnz L(last_vec_x3)
# else
- salq $32, %rcx
- orq %rcx, %rax
- bsfq %rax, %rax
+ salq $CHAR_PER_VEC, %rdx
+ TESTZ (rcx)
+ orq %rcx, %rdx
# endif
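+	/* rdx now holds the first match/null: for CHAR_PER_VEC < 64 the
+	   low bits come from VEC_3 and the bits shifted up by
+	   CHAR_PER_VEC from VEC_4, so the single bsf below plus
+	   LAST_VEC_OFFSET resolves the address.  For CHAR_PER_VEC == 64
+	   a VEC_3 hit was handled above, so rdx only describes VEC_4.  */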
+
+ bsfq %rdx, %rdx
+
# ifndef USE_AS_STRCHRNUL
/* Check if match was CHAR or null. */
- cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ cmp (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG
jne L(zero_end)
# endif
/* NB: Multiply sizeof char type (1 or 4) to get the number of
bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ leaq (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax
ret
- .p2align 4,, 8
-L(last_vec_x1):
- bsfl %eax, %eax
-# ifdef USE_AS_WCSCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes.
- */
- leaq (%rdi, %rax, CHAR_SIZE), %rax
-# else
- addq %rdi, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+ xorl %eax, %eax
+ ret
# endif
-# ifndef USE_AS_STRCHRNUL
+
+	/* Separate return label for the last VEC_1 because for VEC_SIZE
+	   == 32 we can reuse the return code in the page-cross logic
+	   but VEC_SIZE == 64 has mismatched registers.  */
+# if VEC_SIZE == 64
+ .p2align 4,, 8
+L(last_vec_x1_vec_size64):
+ bsf %VRCX, %VRCX
+# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
- cmp (%rax), %CHAR_REG
+ cmp (%rax, %rcx, CHAR_SIZE), %CHAR_REG
jne L(zero_end)
-# endif
-
+# endif
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes.
+ */
+ leaq (%rax, %rcx, CHAR_SIZE), %rax
+# else
+ addq %rcx, %rax
+# endif
ret
+	/* Since we can't combine the last 2x matches for CHAR_PER_VEC
+	   == 64 we need a return label for the last VEC_3.  */
+# if CHAR_PER_VEC == 64
.p2align 4,, 8
+L(last_vec_x3):
+ addq $VEC_SIZE, %LOOP_REG
+# endif
+
+ /* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
+ reuse L(first_vec_x3) due to register mismatch. */
L(last_vec_x2):
- bsfl %eax, %eax
-# ifndef USE_AS_STRCHRNUL
+ bsf %VGPR(MASK_GPR), %VGPR(MASK_GPR)
+# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
- cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ cmp (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
jne L(zero_end)
-# endif
+# endif
/* NB: Multiply sizeof char type (1 or 4) to get the number of
bytes. */
- leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ leaq (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax
ret
+# endif
- /* Cold case for crossing page with first load. */
- .p2align 4,, 8
+ /* Cold case for crossing page with first load. */
+ .p2align 4,, 10
+# ifndef USE_AS_STRCHRNUL
L(cross_page_boundary):
- movq %rdi, %rdx
+# endif
+L(cross_page_boundary_real):
/* Align rdi. */
- andq $-VEC_SIZE, %rdi
- VMOVA (%rdi), %YMM1
- /* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPTESTN %YMM2, %YMM2, %k0
- kmovd %k0, %eax
+ xorq %rdi, %rax
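+	/* rax held the page offset of rdi, so the xor leaves the
+	   page-aligned base of rdi's page in rax while rdi itself is
+	   preserved for computing the return address.  The aligned load
+	   below reads the last VEC of that page and so cannot cross
+	   into the next page.  */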
+ VMOVA (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	/* Use the high-latency method of getting matches to save code
+	   size.  */
+
+ /* K1 has 1s where VEC(1) does NOT match esi. */
+ VPCMP $4, %VMM(1), %VMATCH, %k1
+	/* K0 has ones where K1 is 1 (no esi match) and the element is
+	   non-zero (not the null terminator).  */
+ VPTEST %VMM(1), %VMM(1), %k0{%k1}
+ KMOV %k0, %VRAX
/* Remove the leading bits. */
# ifdef USE_AS_WCSCHR
- movl %edx, %SHIFT_REG
+ movl %edi, %VGPR_SZ(SHIFT_REG, 32)
/* NB: Divide shift count by 4 since each bit in K1 represent 4
bytes. */
- sarl $2, %SHIFT_REG
- andl $(CHAR_PER_VEC - 1), %SHIFT_REG
+ sarl $2, %VGPR_SZ(SHIFT_REG, 32)
+ andl $(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32)
+
+	/* For wcschr we need to invert the matches as we can't rely on
+	   the signed shift to bring in ones.  There is no sarx for
+	   gpr8/16.  Also note we can't use inc here as the lower bits
+	   represent matches out of range so we can't rely on the
+	   overflow.  */
+	xorl	$((1 << CHAR_PER_VEC) - 1), %eax
+# endif
+	/* Use an arithmetic shift so that leading 1s are filled in.  */
+ sarx %VGPR(SHIFT_REG), %VRAX, %VRAX
+ /* If eax is all ones then no matches for esi or NULL. */
+
+# ifdef USE_AS_WCSCHR
+ test %VRAX, %VRAX
+# else
+ inc %VRAX
# endif
- sarxl %SHIFT_REG, %eax, %eax
- /* If eax is zero continue. */
- testl %eax, %eax
jz L(cross_page_continue)
- bsfl %eax, %eax
+ .p2align 4,, 10
+L(last_vec_x1_vec_size32):
+ bsf %VRAX, %VRAX
# ifdef USE_AS_WCSCHR
- /* NB: Multiply wchar_t count by 4 to get the number of
- bytes. */
- leaq (%rdx, %rax, CHAR_SIZE), %rax
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes.
+ */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- addq %rdx, %rax
+ addq %rdi, %rax
# endif
# ifndef USE_AS_STRCHRNUL
/* Check to see if match was CHAR or null. */
cmp (%rax), %CHAR_REG
- je L(cross_page_ret)
-L(zero_end):
- xorl %eax, %eax
-L(cross_page_ret):
+ jne L(zero_end_0)
# endif
ret
+# ifndef USE_AS_STRCHRNUL
+L(zero_end_0):
+ xorl %eax, %eax
+ ret
+# endif
END (STRCHR)
#endif