@@ -27,10 +27,12 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
# define CHAR_REG esi
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
# define CHAR_REG sil
# endif
@@ -39,7 +41,8 @@
# endif
# define VEC_SIZE 32
-
+# define PAGE_SIZE 4096
+
.section .text.avx,"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
@@ -47,10 +50,10 @@ ENTRY (STRCHR)
vmovd %esi, %xmm0
vpxor %xmm9, %xmm9, %xmm9
VPBROADCAST %xmm0, %ymm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Check if one vector load from the start crosses a page boundary,
+    i.e. the page offset of the start is above PAGE_SIZE - VEC_SIZE.  */
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
@@ -63,45 +66,11 @@ ENTRY (STRCHR)
jnz L(first_vec_x0)
/* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
- jmp L(more_4x_vec)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- /* Found CHAR or the null byte. */
- tzcntl %eax, %eax
- addq %rcx, %rax
-# ifdef USE_AS_STRCHRNUL
- addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
+ andq $-VEC_SIZE, %rdi
L(aligned_more):
addq $VEC_SIZE, %rdi
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ /* Check the next 3 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
vmovdqa (%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -127,19 +96,9 @@ L(more_4x_vec):
testl %eax, %eax
jnz L(first_vec_x2)
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x3)
-
- addq $(VEC_SIZE * 4), %rdi
-
+
/* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
+ addq $(VEC_SIZE * 3), %rdi
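+ /* rdi points at the first unchecked, VEC_SIZE-aligned byte; the
+    round down below may re-check up to 3 vectors but never skips
+    unchecked data.  */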
andq $-(4 * VEC_SIZE), %rdi
.p2align 4
@@ -150,34 +109,61 @@ L(loop_4x_vec):
vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
- VPCMPEQ %ymm5, %ymm0, %ymm1
- VPCMPEQ %ymm6, %ymm0, %ymm2
- VPCMPEQ %ymm7, %ymm0, %ymm3
- VPCMPEQ %ymm8, %ymm0, %ymm4
-
- VPCMPEQ %ymm5, %ymm9, %ymm5
- VPCMPEQ %ymm6, %ymm9, %ymm6
- VPCMPEQ %ymm7, %ymm9, %ymm7
- VPCMPEQ %ymm8, %ymm9, %ymm8
+ /* XOR with the broadcast CHAR leaves only the elements equal to
+    CHAR (esi) as 0.  */
+ vpxor %ymm5, %ymm0, %ymm1
+ vpxor %ymm6, %ymm0, %ymm2
+ vpxor %ymm7, %ymm0, %ymm3
+ vpxor %ymm8, %ymm0, %ymm4
- vpor %ymm1, %ymm5, %ymm1
- vpor %ymm2, %ymm6, %ymm2
- vpor %ymm3, %ymm7, %ymm3
- vpor %ymm4, %ymm8, %ymm4
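+ /* Unsigned min with the original data: the result has a zero
+    element exactly where the input held CHAR or the null
+    terminator.  */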
+ VPMINU %ymm1, %ymm5, %ymm1
+ VPMINU %ymm2, %ymm6, %ymm2
+ VPMINU %ymm3, %ymm7, %ymm3
+ VPMINU %ymm4, %ymm8, %ymm4
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
- vpor %ymm5, %ymm6, %ymm5
+ VPMINU %ymm5, %ymm6, %ymm5
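+ /* ymm5 has a zero element iff any of the 4 vectors contained CHAR
+    or a null terminator; compare with zero to build the mask.  */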
+ VPCMPEQ %ymm5, %ymm9, %ymm5
vpmovmskb %ymm5, %eax
+ addq $(VEC_SIZE * 4), %rdi
+
testl %eax, %eax
- jnz L(4x_vec_end)
+ jz L(loop_4x_vec)
- addq $(VEC_SIZE * 4), %rdi
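+ /* rdi was advanced past the matching block inside the loop; step
+    back so the first_vec_x* return paths can use their usual
+    offsets.  */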
+ subq $(VEC_SIZE * 4), %rdi
+
+L(4x_vec_end):
+ VPCMPEQ %ymm1, %ymm9, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ VPCMPEQ %ymm2, %ymm9, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ VPCMPEQ %ymm4, %ymm9, %ymm4
+ vpmovmskb %ymm4, %eax
- jmp L(loop_4x_vec)
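+ /* The combined mask was non-zero and the first 3 vectors had no
+    match, so the match must be in the fourth vector; no extra test
+    is needed.  */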
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 3), %rdi
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
.p2align 4
L(first_vec_x0):
/* Found CHAR or the null byte. */
@@ -197,7 +183,7 @@ L(first_vec_x0):
L(first_vec_x1):
tzcntl %eax, %eax
# ifdef USE_AS_STRCHRNUL
- addq $VEC_SIZE, %rax
+ addq $VEC_SIZE, %rdi
addq %rdi, %rax
# else
xorl %edx, %edx
@@ -212,7 +198,7 @@ L(first_vec_x1):
L(first_vec_x2):
tzcntl %eax, %eax
# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 2), %rax
+ addq $(VEC_SIZE * 2), %rdi
addq %rdi, %rax
# else
xorl %edx, %edx
@@ -223,32 +209,47 @@ L(first_vec_x2):
VZEROUPPER
ret
+ /* Cold case for crossing page with first load. */
.p2align 4
-L(4x_vec_end):
+L(cross_page_boundary):
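+ /* Align the pointer down and load the whole vector containing the
+    start of the string; the match bits for bytes before the start
+    are shifted out below.  */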
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
+ /* Remove the leading bits. */
+ sarxl %ecx, %eax, %eax
testl %eax, %eax
- jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
+ jnz L(cross_page_return)
+
+ /* Check a second aligned block here so that, together with the 3
+    blocks checked from L(aligned_more), a full 4 * VEC_SIZE is
+    covered before the loop aligns down to 4 * VEC_SIZE.  */
+ andq $-VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rdi
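+ /* This block's load is aligned, so clear the misalignment
+    adjustment applied in L(cross_page_return).  */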
+ xorl %ecx, %ecx
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
-L(first_vec_x3):
+ jz L(aligned_more)
+
+L(cross_page_return):
tzcntl %eax, %eax
+ addq %rcx, %rax
# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 3), %rax
addq %rdi, %rax
# else
xorl %edx, %edx
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+ leaq (%rdi, %rax), %rax
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
-
+
END (STRCHR)
-#endif
+#endif
@@ -37,6 +37,8 @@ IFUNC_SELECTOR (void)
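+ /* The AVX2 version now uses tzcnt (BMI1) and sarx (BMI2).  */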
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI1)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
return OPTIMIZE (avx2);