@@ -49,132 +49,144 @@
.section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
- movl %edi, %ecx
-# ifndef USE_AS_STRCHRNUL
- xorl %edx, %edx
-# endif
-
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ VPBROADCAST %xmm0, %ymm0
vpxor %xmm9, %xmm9, %xmm9
- VPBROADCAST %xmm0, %ymm0
/* Check if we cross page boundary with one vector load. */
- andl $(PAGE_SIZE - 1), %ecx
- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
- ja L(cross_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
- jz L(more_vecs)
+ jz L(aligned_more)
tzcntl %eax, %eax
- /* Found CHAR or the null byte. */
- addq %rdi, %rax
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
-
- .p2align 4
-L(more_vecs):
- /* Align data for aligned loads in the loop. */
- andq $-VEC_SIZE, %rdi
-L(aligned_more):
-
- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vmovdqa VEC_SIZE(%rdi), %ymm8
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- vmovdqa VEC_SIZE(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
-
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
-
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jz L(prep_loop_4x)
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+ /* .p2align 5 helps keep performance more consistent if ENTRY()
+ alignment % 32 was either 16 or 0. As well this makes the
+ alignment % 32 of the loop_4x_vec fixed which makes tuning it
+ easier. */
+ .p2align 5
+L(first_vec_x4):
tzcntl %eax, %eax
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+ addq $(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
# endif
+ addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
- /* Found CHAR or the null byte. */
- addq %rdi, %rax
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
-# endif
+L(zero):
+ xorl %eax, %eax
VZEROUPPER_RETURN
+# endif
+
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
- leaq VEC_SIZE(%rdi, %rax), %rax
+ incq %rdi
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
# endif
+ addq %rdi, %rax
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
+ addq $(VEC_SIZE + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
+# endif
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
# endif
+ addq %rdi, %rax
VZEROUPPER_RETURN
-L(prep_loop_4x):
- /* Align data to 4 * VEC_SIZE. */
- andq $-(VEC_SIZE * 4), %rdi
+ .p2align 4
+L(aligned_more):
+ /* Align data to VEC_SIZE - 1. This is the same number of
+ instructions as using andq -VEC_SIZE but saves 4 bytes of code on
+ x4 check. */
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
+ data is only aligned to VEC_SIZE. */
+ vmovdqa 1(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+ /* Align data to VEC_SIZE * 4 - 1. */
+ addq $(VEC_SIZE * 4 + 1), %rdi
+ andq $-(VEC_SIZE * 4), %rdi
.p2align 4
L(loop_4x_vec):
/* Compare 4 * VEC at a time forward. */
- vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
- vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+ vmovdqa (%rdi), %ymm5
+ vmovdqa (VEC_SIZE)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
/* Leaves only CHARS matching esi as 0. */
vpxor %ymm5, %ymm0, %ymm1
@@ -190,62 +202,102 @@ L(loop_4x_vec):
VPMINU %ymm1, %ymm2, %ymm5
VPMINU %ymm3, %ymm4, %ymm6
- VPMINU %ymm5, %ymm6, %ymm5
+ VPMINU %ymm5, %ymm6, %ymm6
- VPCMPEQ %ymm5, %ymm9, %ymm5
- vpmovmskb %ymm5, %eax
+ VPCMPEQ %ymm6, %ymm9, %ymm6
+ vpmovmskb %ymm6, %ecx
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
- addq $(VEC_SIZE * 4), %rdi
- testl %eax, %eax
- jz L(loop_4x_vec)
- VPCMPEQ %ymm1, %ymm9, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm1, %ymm9, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(last_vec_x0)
+
- VPCMPEQ %ymm2, %ymm9, %ymm2
- vpmovmskb %ymm2, %eax
+ VPCMPEQ %ymm5, %ymm9, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(last_vec_x1)
+
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used if
+ the first 3 other VEC all did not contain a match. */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ subq $(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero_end)
+# endif
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
- VPCMPEQ %ymm3, %ymm9, %ymm3
- VPCMPEQ %ymm4, %ymm9, %ymm4
- vpmovmskb %ymm3, %ecx
- vpmovmskb %ymm4, %eax
- salq $32, %rax
- orq %rcx, %rax
- tzcntq %rax, %rax
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+ .p2align 4
+L(last_vec_x0):
+ tzcntl %eax, %eax
+ addq $-(VEC_SIZE * 4), %rdi
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero_end)
# endif
+ addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
+
+ .p2align 4
+L(last_vec_x1):
+ tzcntl %eax, %eax
+ subq $(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero_end)
+# endif
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+
/* Cold case for crossing page with first load. */
.p2align 4
L(cross_page_boundary):
- andq $-VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
-
- vmovdqa (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
+ movq %rdi, %rdx
+ /* Align rdi to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
+ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bits. */
- sarxl %ecx, %eax, %eax
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ so no need to manually mod edx. */
+ sarxl %edx, %eax, %eax
testl %eax, %eax
- jz L(aligned_more)
+ jz L(cross_page_continue)
tzcntl %eax, %eax
- addq %rcx, %rdi
- addq %rdi, %rax
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ xorl %ecx, %ecx
+ /* Found CHAR or the null byte. */
+ cmp (%rdx, %rax), %CHAR_REG
+ leaq (%rdx, %rax), %rax
+ cmovne %rcx, %rax
+# else
+ addq %rdx, %rax
# endif
- VZEROUPPER_RETURN
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
END (STRCHR)
# endif
No bug. This commit optimizes strchr-avx2.S. The optimizations are all small things such as saving an ALU in the alignment process, saving a few instructions in the loop return, saving some bytes in the main loop, and increasing the ILP in the return cases. test-strchr, test-strchrnul, test-wcschr, and test-wcschrnul are all passing. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> --- Tests were run on the following CPUs: Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html All times are the geometric mean of N=20. The unit of time is seconds. "Cur" refers to the current implementation "New" refers to this patch's implementation For strchr-evex the numbers are a near universal improvement. The only exception seems to be that the [32, 64] case is marginally slower for Tigerlake and about even on Icelake (less than the gain in the [0, 31] case). Overall though I think the numbers show a sizable improvement, particularly once the 4x loop is hit. 
Results For Tigerlake strchr-evex size, algn, Cur T , New T , Win , Dif 32 , 0 , 4.89 , 5.23 , Cur , 0.34 32 , 1 , 4.67 , 5.09 , Cur , 0.42 64 , 0 , 5.59 , 5.46 , New , 0.13 64 , 2 , 5.52 , 5.43 , New , 0.09 128 , 0 , 8.04 , 7.44 , New , 0.6 128 , 3 , 8.0 , 7.45 , New , 0.55 256 , 0 , 14.7 , 12.94 , New , 1.76 256 , 4 , 14.78 , 13.03 , New , 1.75 512 , 0 , 20.37 , 19.05 , New , 1.32 512 , 5 , 20.34 , 18.98 , New , 1.36 1024, 0 , 31.62 , 28.24 , New , 3.38 1024, 6 , 31.55 , 28.2 , New , 3.35 2048, 0 , 53.22 , 47.12 , New , 6.1 2048, 7 , 53.15 , 47.0 , New , 6.15 64 , 1 , 5.45 , 5.41 , New , 0.04 64 , 3 , 5.46 , 5.39 , New , 0.07 64 , 4 , 5.48 , 5.39 , New , 0.09 64 , 5 , 5.54 , 5.39 , New , 0.15 64 , 6 , 5.47 , 5.41 , New , 0.06 64 , 7 , 5.46 , 5.39 , New , 0.07 256 , 16 , 14.58 , 12.92 , New , 1.66 256 , 32 , 15.36 , 13.54 , New , 1.82 256 , 48 , 15.49 , 13.71 , New , 1.78 256 , 64 , 16.53 , 14.78 , New , 1.75 256 , 80 , 16.57 , 14.82 , New , 1.75 256 , 96 , 13.26 , 11.99 , New , 1.27 256 , 112 , 13.36 , 12.07 , New , 1.29 0 , 0 , 3.75 , 3.09 , New , 0.66 1 , 0 , 3.75 , 3.09 , New , 0.66 2 , 0 , 3.74 , 3.09 , New , 0.65 3 , 0 , 3.74 , 3.09 , New , 0.65 4 , 0 , 3.74 , 3.09 , New , 0.65 5 , 0 , 3.74 , 3.1 , New , 0.64 6 , 0 , 3.74 , 3.1 , New , 0.64 7 , 0 , 3.74 , 3.09 , New , 0.65 8 , 0 , 3.74 , 3.09 , New , 0.65 9 , 0 , 3.74 , 3.1 , New , 0.64 10 , 0 , 3.75 , 3.09 , New , 0.66 11 , 0 , 3.75 , 3.1 , New , 0.65 12 , 0 , 3.74 , 3.1 , New , 0.64 13 , 0 , 3.77 , 3.1 , New , 0.67 14 , 0 , 3.78 , 3.1 , New , 0.68 15 , 0 , 3.82 , 3.1 , New , 0.72 16 , 0 , 3.76 , 3.1 , New , 0.66 17 , 0 , 3.8 , 3.1 , New , 0.7 18 , 0 , 3.77 , 3.1 , New , 0.67 19 , 0 , 3.81 , 3.1 , New , 0.71 20 , 0 , 3.77 , 3.13 , New , 0.64 21 , 0 , 3.8 , 3.11 , New , 0.69 22 , 0 , 3.82 , 3.11 , New , 0.71 23 , 0 , 3.77 , 3.11 , New , 0.66 24 , 0 , 3.77 , 3.11 , New , 0.66 25 , 0 , 3.76 , 3.11 , New , 0.65 26 , 0 , 3.76 , 3.11 , New , 0.65 27 , 0 , 3.76 , 3.11 , New , 0.65 28 , 0 , 3.77 , 3.11 , New , 0.66 
29 , 0 , 3.76 , 3.11 , New , 0.65 30 , 0 , 3.76 , 3.11 , New , 0.65 31 , 0 , 3.76 , 3.11 , New , 0.65 Results For Icelake strchr-evex size, algn, Cur T , New T , Win , Dif 32 , 0 , 3.57 , 3.77 , Cur , 0.2 32 , 1 , 3.36 , 3.34 , New , 0.02 64 , 0 , 3.77 , 3.64 , New , 0.13 64 , 2 , 3.73 , 3.58 , New , 0.15 128 , 0 , 5.22 , 4.92 , New , 0.3 128 , 3 , 5.16 , 4.94 , New , 0.22 256 , 0 , 9.83 , 8.8 , New , 1.03 256 , 4 , 9.89 , 8.77 , New , 1.12 512 , 0 , 13.47 , 12.77 , New , 0.7 512 , 5 , 13.58 , 12.74 , New , 0.84 1024, 0 , 20.33 , 18.46 , New , 1.87 1024, 6 , 20.28 , 18.39 , New , 1.89 2048, 0 , 35.45 , 31.59 , New , 3.86 2048, 7 , 35.44 , 31.66 , New , 3.78 64 , 1 , 3.76 , 3.62 , New , 0.14 64 , 3 , 3.7 , 3.6 , New , 0.1 64 , 4 , 3.71 , 3.62 , New , 0.09 64 , 5 , 3.74 , 3.61 , New , 0.13 64 , 6 , 3.74 , 3.61 , New , 0.13 64 , 7 , 3.72 , 3.62 , New , 0.1 256 , 16 , 9.81 , 8.77 , New , 1.04 256 , 32 , 10.25 , 9.24 , New , 1.01 256 , 48 , 10.48 , 9.39 , New , 1.09 256 , 64 , 11.09 , 10.11 , New , 0.98 256 , 80 , 11.09 , 10.09 , New , 1.0 256 , 96 , 8.88 , 8.09 , New , 0.79 256 , 112 , 8.84 , 8.16 , New , 0.68 0 , 0 , 2.31 , 2.08 , New , 0.23 1 , 0 , 2.36 , 2.09 , New , 0.27 2 , 0 , 2.39 , 2.12 , New , 0.27 3 , 0 , 2.4 , 2.14 , New , 0.26 4 , 0 , 2.42 , 2.15 , New , 0.27 5 , 0 , 2.4 , 2.15 , New , 0.25 6 , 0 , 2.38 , 2.15 , New , 0.23 7 , 0 , 2.36 , 2.15 , New , 0.21 8 , 0 , 2.41 , 2.16 , New , 0.25 9 , 0 , 2.37 , 2.14 , New , 0.23 10 , 0 , 2.36 , 2.16 , New , 0.2 11 , 0 , 2.36 , 2.17 , New , 0.19 12 , 0 , 2.35 , 2.15 , New , 0.2 13 , 0 , 2.37 , 2.16 , New , 0.21 14 , 0 , 2.37 , 2.16 , New , 0.21 15 , 0 , 2.39 , 2.15 , New , 0.24 16 , 0 , 2.36 , 2.14 , New , 0.22 17 , 0 , 2.35 , 2.14 , New , 0.21 18 , 0 , 2.36 , 2.14 , New , 0.22 19 , 0 , 2.37 , 2.14 , New , 0.23 20 , 0 , 2.37 , 2.16 , New , 0.21 21 , 0 , 2.38 , 2.16 , New , 0.22 22 , 0 , 2.38 , 2.14 , New , 0.24 23 , 0 , 2.33 , 2.11 , New , 0.22 24 , 0 , 2.3 , 2.07 , New , 0.23 25 , 0 , 2.27 , 2.06 , New , 0.21 26 , 0 
, 2.26 , 2.06 , New , 0.2 27 , 0 , 2.28 , 2.1 , New , 0.18 28 , 0 , 2.34 , 2.13 , New , 0.21 29 , 0 , 2.34 , 2.09 , New , 0.25 30 , 0 , 2.29 , 2.09 , New , 0.2 31 , 0 , 2.31 , 2.08 , New , 0.23 For strchr-avx the results are a lot closer as the optimizations where smaller but the trend is improvement. Especially on Skylake (which is the only one of the benchmark CPUs that this will actually be used on). Results For Skylake strchr-avx2 size, algn, Cur T , New T , Win , Dif 32 , 0 , 6.04 , 5.02 , New , 1.02 32 , 1 , 6.19 , 4.94 , New , 1.25 64 , 0 , 6.68 , 5.92 , New , 0.76 64 , 2 , 6.59 , 5.95 , New , 0.64 128 , 0 , 7.66 , 7.42 , New , 0.24 128 , 3 , 7.66 , 7.4 , New , 0.26 256 , 0 , 14.68 , 12.93 , New , 1.75 256 , 4 , 14.74 , 12.88 , New , 1.86 512 , 0 , 20.81 , 17.47 , New , 3.34 512 , 5 , 20.73 , 17.44 , New , 3.29 1024, 0 , 33.16 , 27.06 , New , 6.1 1024, 6 , 33.15 , 27.09 , New , 6.06 2048, 0 , 59.06 , 56.15 , New , 2.91 2048, 7 , 59.0 , 53.92 , New , 5.08 64 , 1 , 6.56 , 5.86 , New , 0.7 64 , 3 , 6.55 , 5.99 , New , 0.56 64 , 4 , 6.61 , 5.96 , New , 0.65 64 , 5 , 6.52 , 5.94 , New , 0.58 64 , 6 , 6.62 , 5.95 , New , 0.67 64 , 7 , 6.61 , 6.11 , New , 0.5 256 , 16 , 14.64 , 12.85 , New , 1.79 256 , 32 , 15.2 , 12.97 , New , 2.23 256 , 48 , 15.13 , 13.33 , New , 1.8 256 , 64 , 16.18 , 13.46 , New , 2.72 256 , 80 , 16.26 , 13.49 , New , 2.77 256 , 96 , 13.13 , 11.43 , New , 1.7 256 , 112 , 13.12 , 11.4 , New , 1.72 0 , 0 , 5.36 , 4.25 , New , 1.11 1 , 0 , 5.28 , 4.24 , New , 1.04 2 , 0 , 5.27 , 4.2 , New , 1.07 3 , 0 , 5.27 , 4.23 , New , 1.04 4 , 0 , 5.36 , 4.3 , New , 1.06 5 , 0 , 5.35 , 4.29 , New , 1.06 6 , 0 , 5.38 , 4.35 , New , 1.03 7 , 0 , 5.39 , 4.28 , New , 1.11 8 , 0 , 5.5 , 4.45 , New , 1.05 9 , 0 , 5.47 , 4.43 , New , 1.04 10 , 0 , 5.5 , 4.4 , New , 1.1 11 , 0 , 5.51 , 4.44 , New , 1.07 12 , 0 , 5.49 , 4.44 , New , 1.05 13 , 0 , 5.49 , 4.46 , New , 1.03 14 , 0 , 5.49 , 4.46 , New , 1.03 15 , 0 , 5.51 , 4.43 , New , 1.08 16 , 0 , 5.52 , 4.48 , New , 
1.04 17 , 0 , 5.57 , 4.47 , New , 1.1 18 , 0 , 5.56 , 4.52 , New , 1.04 19 , 0 , 5.54 , 4.46 , New , 1.08 20 , 0 , 5.53 , 4.48 , New , 1.05 21 , 0 , 5.54 , 4.48 , New , 1.06 22 , 0 , 5.57 , 4.45 , New , 1.12 23 , 0 , 5.57 , 4.48 , New , 1.09 24 , 0 , 5.53 , 4.43 , New , 1.1 25 , 0 , 5.49 , 4.42 , New , 1.07 26 , 0 , 5.5 , 4.44 , New , 1.06 27 , 0 , 5.48 , 4.44 , New , 1.04 28 , 0 , 5.48 , 4.43 , New , 1.05 29 , 0 , 5.54 , 4.41 , New , 1.13 30 , 0 , 5.49 , 4.4 , New , 1.09 31 , 0 , 5.46 , 4.4 , New , 1.06 Results For Tigerlake strchr-avx2 size, algn, Cur T , New T , Win , Dif 32 , 0 , 5.88 , 5.47 , New , 0.41 32 , 1 , 5.73 , 5.46 , New , 0.27 64 , 0 , 6.32 , 6.1 , New , 0.22 64 , 2 , 6.17 , 6.11 , New , 0.06 128 , 0 , 7.93 , 7.68 , New , 0.25 128 , 3 , 7.93 , 7.73 , New , 0.2 256 , 0 , 14.87 , 14.5 , New , 0.37 256 , 4 , 14.96 , 14.59 , New , 0.37 512 , 0 , 21.25 , 20.18 , New , 1.07 512 , 5 , 21.25 , 20.11 , New , 1.14 1024, 0 , 33.17 , 31.26 , New , 1.91 1024, 6 , 33.14 , 31.13 , New , 2.01 2048, 0 , 53.39 , 52.51 , New , 0.88 2048, 7 , 53.3 , 52.34 , New , 0.96 64 , 1 , 6.11 , 6.09 , New , 0.02 64 , 3 , 6.04 , 6.01 , New , 0.03 64 , 4 , 6.04 , 6.03 , New , 0.01 64 , 5 , 6.13 , 6.05 , New , 0.08 64 , 6 , 6.09 , 6.06 , New , 0.03 64 , 7 , 6.04 , 6.03 , New , 0.01 256 , 16 , 14.77 , 14.39 , New , 0.38 256 , 32 , 15.58 , 15.27 , New , 0.31 256 , 48 , 15.88 , 15.32 , New , 0.56 256 , 64 , 16.85 , 16.01 , New , 0.84 256 , 80 , 16.83 , 16.03 , New , 0.8 256 , 96 , 13.5 , 13.14 , New , 0.36 256 , 112 , 13.71 , 13.24 , New , 0.47 0 , 0 , 3.78 , 3.76 , New , 0.02 1 , 0 , 3.79 , 3.76 , New , 0.03 2 , 0 , 3.82 , 3.77 , New , 0.05 3 , 0 , 3.78 , 3.76 , New , 0.02 4 , 0 , 3.75 , 3.75 , Eq , 0.0 5 , 0 , 3.77 , 3.74 , New , 0.03 6 , 0 , 3.78 , 3.76 , New , 0.02 7 , 0 , 3.91 , 3.85 , New , 0.06 8 , 0 , 3.76 , 3.77 , Cur , 0.01 9 , 0 , 3.75 , 3.75 , Eq , 0.0 10 , 0 , 3.76 , 3.76 , Eq , 0.0 11 , 0 , 3.77 , 3.75 , New , 0.02 12 , 0 , 3.79 , 3.77 , New , 0.02 13 , 0 , 3.86 , 3.86 , 
Eq , 0.0 14 , 0 , 4.2 , 4.2 , Eq , 0.0 15 , 0 , 4.17 , 4.07 , New , 0.1 16 , 0 , 4.1 , 4.1 , Eq , 0.0 17 , 0 , 4.12 , 4.09 , New , 0.03 18 , 0 , 4.12 , 4.12 , Eq , 0.0 19 , 0 , 4.18 , 4.09 , New , 0.09 20 , 0 , 4.14 , 4.09 , New , 0.05 21 , 0 , 4.15 , 4.11 , New , 0.04 22 , 0 , 4.23 , 4.13 , New , 0.1 23 , 0 , 4.18 , 4.16 , New , 0.02 24 , 0 , 4.13 , 4.21 , Cur , 0.08 25 , 0 , 4.17 , 4.15 , New , 0.02 26 , 0 , 4.17 , 4.16 , New , 0.01 27 , 0 , 4.18 , 4.16 , New , 0.02 28 , 0 , 4.17 , 4.15 , New , 0.02 29 , 0 , 4.2 , 4.13 , New , 0.07 30 , 0 , 4.16 , 4.12 , New , 0.04 31 , 0 , 4.15 , 4.15 , Eq , 0.0 Results For Icelake strchr-avx2 size, algn, Cur T , New T , Win , Dif 32 , 0 , 3.73 , 3.72 , New , 0.01 32 , 1 , 3.46 , 3.44 , New , 0.02 64 , 0 , 3.96 , 3.87 , New , 0.09 64 , 2 , 3.92 , 3.87 , New , 0.05 128 , 0 , 5.15 , 4.9 , New , 0.25 128 , 3 , 5.12 , 4.87 , New , 0.25 256 , 0 , 9.79 , 9.45 , New , 0.34 256 , 4 , 9.76 , 9.52 , New , 0.24 512 , 0 , 13.93 , 12.89 , New , 1.04 512 , 5 , 13.84 , 13.02 , New , 0.82 1024, 0 , 21.41 , 19.92 , New , 1.49 1024, 6 , 21.69 , 20.12 , New , 1.57 2048, 0 , 35.12 , 33.83 , New , 1.29 2048, 7 , 35.13 , 33.99 , New , 1.14 64 , 1 , 3.96 , 3.9 , New , 0.06 64 , 3 , 3.88 , 3.86 , New , 0.02 64 , 4 , 3.87 , 3.83 , New , 0.04 64 , 5 , 3.9 , 3.85 , New , 0.05 64 , 6 , 3.9 , 3.89 , New , 0.01 64 , 7 , 3.9 , 3.84 , New , 0.06 256 , 16 , 9.76 , 9.4 , New , 0.36 256 , 32 , 10.36 , 9.97 , New , 0.39 256 , 48 , 10.5 , 10.02 , New , 0.48 256 , 64 , 11.13 , 10.55 , New , 0.58 256 , 80 , 11.14 , 10.56 , New , 0.58 256 , 96 , 8.98 , 8.57 , New , 0.41 256 , 112 , 9.1 , 8.66 , New , 0.44 0 , 0 , 2.52 , 2.49 , New , 0.03 1 , 0 , 2.56 , 2.53 , New , 0.03 2 , 0 , 2.6 , 2.54 , New , 0.06 3 , 0 , 2.63 , 2.58 , New , 0.05 4 , 0 , 2.63 , 2.6 , New , 0.03 5 , 0 , 2.65 , 2.62 , New , 0.03 6 , 0 , 2.75 , 2.73 , New , 0.02 7 , 0 , 2.73 , 2.76 , Cur , 0.03 8 , 0 , 2.61 , 2.6 , New , 0.01 9 , 0 , 2.73 , 2.74 , Cur , 0.01 10 , 0 , 2.72 , 2.71 , New , 0.01 11 , 0 , 
2.74 , 2.72 , New , 0.02 12 , 0 , 2.73 , 2.74 , Cur , 0.01 13 , 0 , 2.73 , 2.75 , Cur , 0.02 14 , 0 , 2.74 , 2.72 , New , 0.02 15 , 0 , 2.74 , 2.72 , New , 0.02 16 , 0 , 2.75 , 2.74 , New , 0.01 17 , 0 , 2.73 , 2.74 , Cur , 0.01 18 , 0 , 2.72 , 2.73 , Cur , 0.01 19 , 0 , 2.74 , 2.72 , New , 0.02 20 , 0 , 2.75 , 2.71 , New , 0.04 21 , 0 , 2.74 , 2.74 , Eq , 0.0 22 , 0 , 2.73 , 2.74 , Cur , 0.01 23 , 0 , 2.7 , 2.72 , Cur , 0.02 24 , 0 , 2.68 , 2.68 , Eq , 0.0 25 , 0 , 2.65 , 2.63 , New , 0.02 26 , 0 , 2.64 , 2.62 , New , 0.02 27 , 0 , 2.71 , 2.68 , New , 0.03 28 , 0 , 2.72 , 2.68 , New , 0.04 29 , 0 , 2.68 , 2.74 , Cur , 0.06 30 , 0 , 2.65 , 2.65 , Eq , 0.0 31 , 0 , 2.7 , 2.68 , New , 0.02 sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++---------- 1 file changed, 173 insertions(+), 121 deletions(-)