diff mbox series

[v1,3/3] x86: Optimize memchr-evex.S

Message ID 20210503084435.160548-3-goldstein.w.n@gmail.com
State New
Headers show
Series [v1,1/3] Bench: Expand bench-memchr.c | expand

Commit Message

Noah Goldstein May 3, 2021, 8:44 a.m. UTC
No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
Tests were run on the following CPUs:

Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html

Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html

All times are the geometric mean of N=20. The unit of time is
seconds.

"Cur" refers to the current implementation
"New" refers to this patch's implementation

Note: The numbers for size = [1, 32] are highly dependent on function
alignment. That being said, the new implementation, which uses cmovcc
instead of a branch for the [1, 32] case (mostly because of the high
variance with different alignments), is far more consistent and
performs about as well (and should only be a bigger improvement in
cases where the sizes / positions are not 100% predictable).

For memchr-evex the numbers are a near universal improvement. The case
where the current implementation is better is size = 0; for size =
[1, 32] with pos < size the two implementations are about the
same. For size = [1, 32] with pos > size, for medium range sizes, and
for large sizes, however, the new implementation is faster.

Results For Tigerlake memchr-evex
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    5.58  , 5.22  , New   , 0.36  
256   , 1     , , 64    5.22  , 4.93  , New   , 0.29  
2048  , 0     , , 64    5.22  , 4.89  , New   , 0.33  
256   , 2     , , 64    5.14  , 4.81  , New   , 0.33  
2048  , 0     , , 128   6.3   , 5.67  , New   , 0.63  
256   , 3     , , 64    5.22  , 4.9   , New   , 0.32  
2048  , 0     , , 256   11.07 , 10.92 , New   , 0.15  
256   , 4     , , 64    5.16  , 4.86  , New   , 0.3   
2048  , 0     , , 512   15.66 , 14.81 , New   , 0.85  
256   , 5     , , 64    5.15  , 4.84  , New   , 0.31  
2048  , 0     , , 1024  25.7  , 23.02 , New   , 2.68  
256   , 6     , , 64    5.12  , 4.89  , New   , 0.23  
2048  , 0     , , 2048  42.34 , 37.71 , New   , 4.63  
256   , 7     , , 64    5.03  , 4.62  , New   , 0.41  
192   , 1     , , 32    4.96  , 4.28  , New   , 0.68  
256   , 1     , , 32    4.95  , 4.28  , New   , 0.67  
512   , 1     , , 32    4.94  , 4.29  , New   , 0.65  
192   , 2     , , 64    5.1   , 4.8   , New   , 0.3   
512   , 2     , , 64    5.12  , 4.72  , New   , 0.4   
192   , 3     , , 96    5.54  , 5.12  , New   , 0.42  
256   , 3     , , 96    5.52  , 5.15  , New   , 0.37  
512   , 3     , , 96    5.51  , 5.16  , New   , 0.35  
192   , 4     , , 128   6.1   , 5.53  , New   , 0.57  
256   , 4     , , 128   6.09  , 5.49  , New   , 0.6   
512   , 4     , , 128   6.08  , 5.48  , New   , 0.6   
192   , 5     , , 160   7.42  , 6.71  , New   , 0.71  
256   , 5     , , 160   6.86  , 6.71  , New   , 0.15  
512   , 5     , , 160   9.28  , 8.68  , New   , 0.6   
192   , 6     , , 192   7.94  , 7.47  , New   , 0.47  
256   , 6     , , 192   7.62  , 7.17  , New   , 0.45  
512   , 6     , , 192   9.2   , 9.16  , New   , 0.04  
192   , 7     , , 224   8.02  , 7.43  , New   , 0.59  
256   , 7     , , 224   8.34  , 7.85  , New   , 0.49  
512   , 7     , , 224   9.89  , 9.16  , New   , 0.73  
2     , 0     , , 1     3.0   , 3.0   , Eq    , 0.0
2     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
0     , 0     , , 1     3.01  , 3.6   , Cur   , 0.59  
0     , 1     , , 1     3.01  , 3.6   , Cur   , 0.59  
3     , 0     , , 2     3.0   , 3.0   , Eq    , 0.0
3     , 2     , , 2     3.0   , 3.0   , Eq    , 0.0
1     , 0     , , 2     3.6   , 3.0   , New   , 0.6   
1     , 2     , , 2     3.6   , 3.0   , New   , 0.6   
4     , 0     , , 3     3.01  , 3.01  , Eq    , 0.0
4     , 3     , , 3     3.01  , 3.01  , Eq    , 0.0
2     , 0     , , 3     3.62  , 3.02  , New   , 0.6   
2     , 3     , , 3     3.62  , 3.03  , New   , 0.59  
5     , 0     , , 4     3.02  , 3.03  , Cur   , 0.01  
5     , 4     , , 4     3.02  , 3.02  , Eq    , 0.0
3     , 0     , , 4     3.63  , 3.02  , New   , 0.61  
3     , 4     , , 4     3.63  , 3.04  , New   , 0.59  
6     , 0     , , 5     3.05  , 3.04  , New   , 0.01  
6     , 5     , , 5     3.02  , 3.02  , Eq    , 0.0
4     , 0     , , 5     3.63  , 3.02  , New   , 0.61  
4     , 5     , , 5     3.64  , 3.03  , New   , 0.61  
7     , 0     , , 6     3.03  , 3.03  , Eq    , 0.0
7     , 6     , , 6     3.02  , 3.02  , Eq    , 0.0
5     , 0     , , 6     3.64  , 3.01  , New   , 0.63  
5     , 6     , , 6     3.64  , 3.03  , New   , 0.61  
8     , 0     , , 7     3.03  , 3.04  , Cur   , 0.01  
8     , 7     , , 7     3.04  , 3.04  , Eq    , 0.0
6     , 0     , , 7     3.67  , 3.04  , New   , 0.63  
6     , 7     , , 7     3.65  , 3.05  , New   , 0.6   
9     , 0     , , 8     3.05  , 3.05  , Eq    , 0.0
7     , 0     , , 8     3.67  , 3.05  , New   , 0.62  
10    , 0     , , 9     3.06  , 3.06  , Eq    , 0.0
10    , 1     , , 9     3.06  , 3.06  , Eq    , 0.0
8     , 0     , , 9     3.67  , 3.06  , New   , 0.61  
8     , 1     , , 9     3.67  , 3.06  , New   , 0.61  
11    , 0     , , 10    3.06  , 3.06  , Eq    , 0.0
11    , 2     , , 10    3.07  , 3.06  , New   , 0.01  
9     , 0     , , 10    3.67  , 3.05  , New   , 0.62  
9     , 2     , , 10    3.67  , 3.06  , New   , 0.61  
12    , 0     , , 11    3.06  , 3.06  , Eq    , 0.0
12    , 3     , , 11    3.06  , 3.06  , Eq    , 0.0
10    , 0     , , 11    3.67  , 3.06  , New   , 0.61  
10    , 3     , , 11    3.67  , 3.06  , New   , 0.61  
13    , 0     , , 12    3.06  , 3.07  , Cur   , 0.01  
13    , 4     , , 12    3.06  , 3.07  , Cur   , 0.01  
11    , 0     , , 12    3.67  , 3.11  , New   , 0.56  
11    , 4     , , 12    3.68  , 3.12  , New   , 0.56  
14    , 0     , , 13    3.07  , 3.1   , Cur   , 0.03  
14    , 5     , , 13    3.06  , 3.07  , Cur   , 0.01  
12    , 0     , , 13    3.67  , 3.07  , New   , 0.6   
12    , 5     , , 13    3.67  , 3.08  , New   , 0.59  
15    , 0     , , 14    3.06  , 3.06  , Eq    , 0.0
15    , 6     , , 14    3.07  , 3.06  , New   , 0.01  
13    , 0     , , 14    3.67  , 3.06  , New   , 0.61  
13    , 6     , , 14    3.68  , 3.06  , New   , 0.62  
16    , 0     , , 15    3.06  , 3.06  , Eq    , 0.0
16    , 7     , , 15    3.06  , 3.05  , New   , 0.01  
14    , 0     , , 15    3.68  , 3.06  , New   , 0.62  
14    , 7     , , 15    3.67  , 3.06  , New   , 0.61  
17    , 0     , , 16    3.07  , 3.06  , New   , 0.01  
15    , 0     , , 16    3.68  , 3.06  , New   , 0.62  
18    , 0     , , 17    3.06  , 3.06  , Eq    , 0.0
18    , 1     , , 17    3.06  , 3.06  , Eq    , 0.0
16    , 0     , , 17    3.67  , 3.06  , New   , 0.61  
16    , 1     , , 17    3.67  , 3.05  , New   , 0.62  
19    , 0     , , 18    3.07  , 3.06  , New   , 0.01  
19    , 2     , , 18    3.06  , 3.06  , Eq    , 0.0
17    , 0     , , 18    3.68  , 3.08  , New   , 0.6   
17    , 2     , , 18    3.68  , 3.06  , New   , 0.62  
20    , 0     , , 19    3.06  , 3.06  , Eq    , 0.0
20    , 3     , , 19    3.06  , 3.06  , Eq    , 0.0
18    , 0     , , 19    3.68  , 3.06  , New   , 0.62  
18    , 3     , , 19    3.68  , 3.06  , New   , 0.62  
21    , 0     , , 20    3.06  , 3.06  , Eq    , 0.0
21    , 4     , , 20    3.06  , 3.06  , Eq    , 0.0
19    , 0     , , 20    3.67  , 3.06  , New   , 0.61  
19    , 4     , , 20    3.67  , 3.06  , New   , 0.61  
22    , 0     , , 21    3.06  , 3.06  , Eq    , 0.0
22    , 5     , , 21    3.06  , 3.06  , Eq    , 0.0
20    , 0     , , 21    3.67  , 3.05  , New   , 0.62  
20    , 5     , , 21    3.68  , 3.06  , New   , 0.62  
23    , 0     , , 22    3.07  , 3.06  , New   , 0.01  
23    , 6     , , 22    3.06  , 3.06  , Eq    , 0.0
21    , 0     , , 22    3.68  , 3.07  , New   , 0.61  
21    , 6     , , 22    3.67  , 3.06  , New   , 0.61  
24    , 0     , , 23    3.19  , 3.06  , New   , 0.13  
24    , 7     , , 23    3.08  , 3.06  , New   , 0.02  
22    , 0     , , 23    3.69  , 3.06  , New   , 0.63  
22    , 7     , , 23    3.68  , 3.06  , New   , 0.62  
25    , 0     , , 24    3.07  , 3.06  , New   , 0.01  
23    , 0     , , 24    3.68  , 3.06  , New   , 0.62  
26    , 0     , , 25    3.06  , 3.05  , New   , 0.01  
26    , 1     , , 25    3.07  , 3.06  , New   , 0.01  
24    , 0     , , 25    3.67  , 3.05  , New   , 0.62  
24    , 1     , , 25    3.68  , 3.06  , New   , 0.62  
27    , 0     , , 26    3.12  , 3.06  , New   , 0.06  
27    , 2     , , 26    3.08  , 3.06  , New   , 0.02  
25    , 0     , , 26    3.69  , 3.06  , New   , 0.63  
25    , 2     , , 26    3.67  , 3.06  , New   , 0.61  
28    , 0     , , 27    3.06  , 3.06  , Eq    , 0.0
28    , 3     , , 27    3.06  , 3.06  , Eq    , 0.0
26    , 0     , , 27    3.67  , 3.06  , New   , 0.61  
26    , 3     , , 27    3.67  , 3.06  , New   , 0.61  
29    , 0     , , 28    3.06  , 3.06  , Eq    , 0.0
29    , 4     , , 28    3.06  , 3.06  , Eq    , 0.0
27    , 0     , , 28    3.68  , 3.05  , New   , 0.63  
27    , 4     , , 28    3.67  , 3.06  , New   , 0.61  
30    , 0     , , 29    3.06  , 3.06  , Eq    , 0.0
30    , 5     , , 29    3.06  , 3.06  , Eq    , 0.0
28    , 0     , , 29    3.67  , 3.06  , New   , 0.61  
28    , 5     , , 29    3.68  , 3.06  , New   , 0.62  
31    , 0     , , 30    3.06  , 3.06  , Eq    , 0.0
31    , 6     , , 30    3.06  , 3.06  , Eq    , 0.0
29    , 0     , , 30    3.68  , 3.06  , New   , 0.62  
29    , 6     , , 30    3.7   , 3.06  , New   , 0.64  
32    , 0     , , 31    3.17  , 3.06  , New   , 0.11  
32    , 7     , , 31    3.12  , 3.06  , New   , 0.06  
30    , 0     , , 31    3.68  , 3.06  , New   , 0.62  
30    , 7     , , 31    3.68  , 3.06  , New   , 0.62

Results For Icelake memchr-evex
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    4.94  , 4.26  , New   , 0.68  
256   , 1     , , 64    4.5   , 4.13  , New   , 0.37  
2048  , 0     , , 64    4.19  , 3.9   , New   , 0.29  
256   , 2     , , 64    4.19  , 3.87  , New   , 0.32  
2048  , 0     , , 128   4.96  , 4.53  , New   , 0.43  
256   , 3     , , 64    4.07  , 3.86  , New   , 0.21  
2048  , 0     , , 256   8.77  , 8.61  , New   , 0.16  
256   , 4     , , 64    4.08  , 3.87  , New   , 0.21  
2048  , 0     , , 512   12.22 , 11.67 , New   , 0.55  
256   , 5     , , 64    4.12  , 3.83  , New   , 0.29  
2048  , 0     , , 1024  20.06 , 18.09 , New   , 1.97  
256   , 6     , , 64    4.2   , 3.95  , New   , 0.25  
2048  , 0     , , 2048  33.83 , 30.62 , New   , 3.21  
256   , 7     , , 64    4.3   , 4.04  , New   , 0.26  
192   , 1     , , 32    4.2   , 3.71  , New   , 0.49  
256   , 1     , , 32    4.24  , 3.76  , New   , 0.48  
512   , 1     , , 32    4.29  , 3.74  , New   , 0.55  
192   , 2     , , 64    4.42  , 4.0   , New   , 0.42  
512   , 2     , , 64    4.17  , 3.83  , New   , 0.34  
192   , 3     , , 96    4.44  , 4.26  , New   , 0.18  
256   , 3     , , 96    4.45  , 4.14  , New   , 0.31  
512   , 3     , , 96    4.42  , 4.15  , New   , 0.27  
192   , 4     , , 128   4.93  , 4.45  , New   , 0.48  
256   , 4     , , 128   4.93  , 4.47  , New   , 0.46  
512   , 4     , , 128   4.95  , 4.47  , New   , 0.48  
192   , 5     , , 160   5.95  , 5.44  , New   , 0.51  
256   , 5     , , 160   5.59  , 5.47  , New   , 0.12  
512   , 5     , , 160   7.59  , 7.34  , New   , 0.25  
192   , 6     , , 192   6.53  , 6.08  , New   , 0.45  
256   , 6     , , 192   6.2   , 5.88  , New   , 0.32  
512   , 6     , , 192   7.53  , 7.62  , Cur   , 0.09  
192   , 7     , , 224   6.62  , 6.12  , New   , 0.5   
256   , 7     , , 224   6.79  , 6.51  , New   , 0.28  
512   , 7     , , 224   8.12  , 7.61  , New   , 0.51  
2     , 0     , , 1     2.5   , 2.54  , Cur   , 0.04  
2     , 1     , , 1     2.56  , 2.55  , New   , 0.01  
0     , 0     , , 1     2.57  , 3.12  , Cur   , 0.55  
0     , 1     , , 1     2.59  , 3.14  , Cur   , 0.55  
3     , 0     , , 2     2.62  , 2.63  , Cur   , 0.01  
3     , 2     , , 2     2.66  , 2.67  , Cur   , 0.01  
1     , 0     , , 2     3.24  , 2.72  , New   , 0.52  
1     , 2     , , 2     3.28  , 2.75  , New   , 0.53  
4     , 0     , , 3     2.78  , 2.8   , Cur   , 0.02  
4     , 3     , , 3     2.8   , 2.82  , Cur   , 0.02  
2     , 0     , , 3     3.38  , 2.86  , New   , 0.52  
2     , 3     , , 3     3.41  , 2.89  , New   , 0.52  
5     , 0     , , 4     2.88  , 2.91  , Cur   , 0.03  
5     , 4     , , 4     2.88  , 2.92  , Cur   , 0.04  
3     , 0     , , 4     3.48  , 2.93  , New   , 0.55  
3     , 4     , , 4     3.47  , 2.93  , New   , 0.54  
6     , 0     , , 5     2.95  , 2.94  , New   , 0.01  
6     , 5     , , 5     2.91  , 2.92  , Cur   , 0.01  
4     , 0     , , 5     3.47  , 2.9   , New   , 0.57  
4     , 5     , , 5     3.43  , 2.91  , New   , 0.52  
7     , 0     , , 6     2.87  , 2.9   , Cur   , 0.03  
7     , 6     , , 6     2.87  , 2.89  , Cur   , 0.02  
5     , 0     , , 6     3.44  , 2.88  , New   , 0.56  
5     , 6     , , 6     3.41  , 2.87  , New   , 0.54  
8     , 0     , , 7     2.86  , 2.87  , Cur   , 0.01  
8     , 7     , , 7     2.86  , 2.87  , Cur   , 0.01  
6     , 0     , , 7     3.43  , 2.87  , New   , 0.56  
6     , 7     , , 7     3.44  , 2.87  , New   , 0.57  
9     , 0     , , 8     2.86  , 2.88  , Cur   , 0.02  
7     , 0     , , 8     3.41  , 2.89  , New   , 0.52  
10    , 0     , , 9     2.83  , 2.87  , Cur   , 0.04  
10    , 1     , , 9     2.82  , 2.87  , Cur   , 0.05  
8     , 0     , , 9     3.4   , 2.89  , New   , 0.51  
8     , 1     , , 9     3.41  , 2.87  , New   , 0.54  
11    , 0     , , 10    2.83  , 2.88  , Cur   , 0.05  
11    , 2     , , 10    2.84  , 2.88  , Cur   , 0.04  
9     , 0     , , 10    3.41  , 2.87  , New   , 0.54  
9     , 2     , , 10    3.41  , 2.88  , New   , 0.53  
12    , 0     , , 11    2.83  , 2.89  , Cur   , 0.06  
12    , 3     , , 11    2.85  , 2.87  , Cur   , 0.02  
10    , 0     , , 11    3.41  , 2.87  , New   , 0.54  
10    , 3     , , 11    3.42  , 2.88  , New   , 0.54  
13    , 0     , , 12    2.86  , 2.87  , Cur   , 0.01  
13    , 4     , , 12    2.84  , 2.88  , Cur   , 0.04  
11    , 0     , , 12    3.43  , 2.87  , New   , 0.56  
11    , 4     , , 12    3.49  , 2.87  , New   , 0.62  
14    , 0     , , 13    2.85  , 2.86  , Cur   , 0.01  
14    , 5     , , 13    2.85  , 2.86  , Cur   , 0.01  
12    , 0     , , 13    3.41  , 2.86  , New   , 0.55  
12    , 5     , , 13    3.44  , 2.85  , New   , 0.59  
15    , 0     , , 14    2.83  , 2.87  , Cur   , 0.04  
15    , 6     , , 14    2.82  , 2.86  , Cur   , 0.04  
13    , 0     , , 14    3.41  , 2.86  , New   , 0.55  
13    , 6     , , 14    3.4   , 2.86  , New   , 0.54  
16    , 0     , , 15    2.84  , 2.86  , Cur   , 0.02  
16    , 7     , , 15    2.83  , 2.85  , Cur   , 0.02  
14    , 0     , , 15    3.41  , 2.85  , New   , 0.56  
14    , 7     , , 15    3.39  , 2.87  , New   , 0.52  
17    , 0     , , 16    2.83  , 2.87  , Cur   , 0.04  
15    , 0     , , 16    3.4   , 2.85  , New   , 0.55  
18    , 0     , , 17    2.83  , 2.86  , Cur   , 0.03  
18    , 1     , , 17    2.85  , 2.84  , New   , 0.01  
16    , 0     , , 17    3.41  , 2.85  , New   , 0.56  
16    , 1     , , 17    3.4   , 2.86  , New   , 0.54  
19    , 0     , , 18    2.8   , 2.84  , Cur   , 0.04  
19    , 2     , , 18    2.82  , 2.83  , Cur   , 0.01  
17    , 0     , , 18    3.39  , 2.86  , New   , 0.53  
17    , 2     , , 18    3.39  , 2.84  , New   , 0.55  
20    , 0     , , 19    2.85  , 2.87  , Cur   , 0.02  
20    , 3     , , 19    2.88  , 2.87  , New   , 0.01  
18    , 0     , , 19    3.38  , 2.85  , New   , 0.53  
18    , 3     , , 19    3.4   , 2.85  , New   , 0.55  
21    , 0     , , 20    2.83  , 2.85  , Cur   , 0.02  
21    , 4     , , 20    2.88  , 2.85  , New   , 0.03  
19    , 0     , , 20    3.39  , 2.84  , New   , 0.55  
19    , 4     , , 20    3.39  , 2.96  , New   , 0.43  
22    , 0     , , 21    2.84  , 2.9   , Cur   , 0.06  
22    , 5     , , 21    2.81  , 2.84  , Cur   , 0.03  
20    , 0     , , 21    3.41  , 2.81  , New   , 0.6   
20    , 5     , , 21    3.38  , 2.83  , New   , 0.55  
23    , 0     , , 22    2.8   , 2.82  , Cur   , 0.02  
23    , 6     , , 22    2.81  , 2.83  , Cur   , 0.02  
21    , 0     , , 22    3.35  , 2.81  , New   , 0.54  
21    , 6     , , 22    3.34  , 2.81  , New   , 0.53  
24    , 0     , , 23    2.77  , 2.84  , Cur   , 0.07  
24    , 7     , , 23    2.78  , 2.8   , Cur   , 0.02  
22    , 0     , , 23    3.34  , 2.79  , New   , 0.55  
22    , 7     , , 23    3.32  , 2.79  , New   , 0.53  
25    , 0     , , 24    2.77  , 2.8   , Cur   , 0.03  
23    , 0     , , 24    3.29  , 2.79  , New   , 0.5   
26    , 0     , , 25    2.73  , 2.78  , Cur   , 0.05  
26    , 1     , , 25    2.75  , 2.79  , Cur   , 0.04  
24    , 0     , , 25    3.27  , 2.79  , New   , 0.48  
24    , 1     , , 25    3.27  , 2.77  , New   , 0.5   
27    , 0     , , 26    2.72  , 2.78  , Cur   , 0.06  
27    , 2     , , 26    2.75  , 2.76  , Cur   , 0.01  
25    , 0     , , 26    3.29  , 2.73  , New   , 0.56  
25    , 2     , , 26    3.3   , 2.76  , New   , 0.54  
28    , 0     , , 27    2.75  , 2.79  , Cur   , 0.04  
28    , 3     , , 27    2.77  , 2.77  , Eq    , 0.0
26    , 0     , , 27    3.28  , 2.78  , New   , 0.5   
26    , 3     , , 27    3.29  , 2.78  , New   , 0.51  
29    , 0     , , 28    2.74  , 2.76  , Cur   , 0.02  
29    , 4     , , 28    2.74  , 2.77  , Cur   , 0.03  
27    , 0     , , 28    3.3   , 2.76  , New   , 0.54  
27    , 4     , , 28    3.3   , 2.74  , New   , 0.56  
30    , 0     , , 29    2.72  , 2.76  , Cur   , 0.04  
30    , 5     , , 29    2.74  , 2.75  , Cur   , 0.01  
28    , 0     , , 29    3.25  , 2.73  , New   , 0.52  
28    , 5     , , 29    3.3   , 2.73  , New   , 0.57  
31    , 0     , , 30    2.73  , 2.77  , Cur   , 0.04  
31    , 6     , , 30    2.74  , 2.76  , Cur   , 0.02  
29    , 0     , , 30    3.25  , 2.73  , New   , 0.52  
29    , 6     , , 30    3.26  , 2.74  , New   , 0.52  
32    , 0     , , 31    2.73  , 2.74  , Cur   , 0.01  
32    , 7     , , 31    2.73  , 2.75  , Cur   , 0.02  
30    , 0     , , 31    3.24  , 2.72  , New   , 0.52  
30    , 7     , , 31    3.24  , 2.72  , New   , 0.52

For memchr-avx2 the improvements are more modest though again near
universal. The improvement is most significant for medium sizes and
small sizes with pos > size. For small sizes with pos < size and for
large sizes the two implementations perform roughly the same.

Results For Tigerlake memchr-avx2
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    6.15  , 6.27  , Cur   , 0.12  
256   , 1     , , 64    6.21  , 6.03  , New   , 0.18  
2048  , 0     , , 64    6.07  , 5.95  , New   , 0.12  
256   , 2     , , 64    6.01  , 5.8   , New   , 0.21  
2048  , 0     , , 128   7.05  , 6.55  , New   , 0.5   
256   , 3     , , 64    6.14  , 5.83  , New   , 0.31  
2048  , 0     , , 256   11.78 , 11.78 , Eq    , 0.0
256   , 4     , , 64    6.1   , 5.85  , New   , 0.25  
2048  , 0     , , 512   16.32 , 15.96 , New   , 0.36  
256   , 5     , , 64    6.1   , 5.77  , New   , 0.33  
2048  , 0     , , 1024  25.38 , 25.18 , New   , 0.2   
256   , 6     , , 64    6.08  , 5.88  , New   , 0.2   
2048  , 0     , , 2048  38.56 , 38.32 , New   , 0.24  
256   , 7     , , 64    5.93  , 5.68  , New   , 0.25  
192   , 1     , , 32    5.49  , 5.3   , New   , 0.19  
256   , 1     , , 32    5.5   , 5.28  , New   , 0.22  
512   , 1     , , 32    5.48  , 5.32  , New   , 0.16  
192   , 2     , , 64    6.1   , 5.73  , New   , 0.37  
512   , 2     , , 64    5.88  , 5.72  , New   , 0.16  
192   , 3     , , 96    6.31  , 5.93  , New   , 0.38  
256   , 3     , , 96    6.32  , 5.93  , New   , 0.39  
512   , 3     , , 96    6.2   , 5.94  , New   , 0.26  
192   , 4     , , 128   6.65  , 6.4   , New   , 0.25  
256   , 4     , , 128   6.6   , 6.37  , New   , 0.23  
512   , 4     , , 128   6.74  , 6.33  , New   , 0.41  
192   , 5     , , 160   7.78  , 7.4   , New   , 0.38  
256   , 5     , , 160   7.18  , 7.4   , Cur   , 0.22  
512   , 5     , , 160   9.81  , 9.44  , New   , 0.37  
192   , 6     , , 192   9.12  , 7.77  , New   , 1.35  
256   , 6     , , 192   7.97  , 7.66  , New   , 0.31  
512   , 6     , , 192   10.14 , 9.95  , New   , 0.19  
192   , 7     , , 224   8.96  , 7.78  , New   , 1.18  
256   , 7     , , 224   8.52  , 8.23  , New   , 0.29  
512   , 7     , , 224   10.33 , 9.98  , New   , 0.35  
2     , 0     , , 1     3.61  , 3.6   , New   , 0.01  
2     , 1     , , 1     3.6   , 3.6   , Eq    , 0.0
0     , 0     , , 1     3.02  , 3.0   , New   , 0.02  
0     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
3     , 0     , , 2     3.6   , 3.6   , Eq    , 0.0
3     , 2     , , 2     3.61  , 3.6   , New   , 0.01  
1     , 0     , , 2     4.82  , 3.6   , New   , 1.22  
1     , 2     , , 2     4.81  , 3.6   , New   , 1.21  
4     , 0     , , 3     3.61  , 3.61  , Eq    , 0.0
4     , 3     , , 3     3.62  , 3.61  , New   , 0.01  
2     , 0     , , 3     4.82  , 3.62  , New   , 1.2   
2     , 3     , , 3     4.83  , 3.63  , New   , 1.2   
5     , 0     , , 4     3.63  , 3.64  , Cur   , 0.01  
5     , 4     , , 4     3.63  , 3.62  , New   , 0.01  
3     , 0     , , 4     4.84  , 3.62  , New   , 1.22  
3     , 4     , , 4     4.84  , 3.64  , New   , 1.2   
6     , 0     , , 5     3.66  , 3.64  , New   , 0.02  
6     , 5     , , 5     3.65  , 3.62  , New   , 0.03  
4     , 0     , , 5     4.83  , 3.63  , New   , 1.2   
4     , 5     , , 5     4.85  , 3.64  , New   , 1.21  
7     , 0     , , 6     3.76  , 3.79  , Cur   , 0.03  
7     , 6     , , 6     3.76  , 3.72  , New   , 0.04  
5     , 0     , , 6     4.84  , 3.62  , New   , 1.22  
5     , 6     , , 6     4.85  , 3.64  , New   , 1.21  
8     , 0     , , 7     3.64  , 3.65  , Cur   , 0.01  
8     , 7     , , 7     3.65  , 3.65  , Eq    , 0.0
6     , 0     , , 7     4.88  , 3.64  , New   , 1.24  
6     , 7     , , 7     4.87  , 3.65  , New   , 1.22  
9     , 0     , , 8     3.66  , 3.66  , Eq    , 0.0
7     , 0     , , 8     4.89  , 3.66  , New   , 1.23  
10    , 0     , , 9     3.67  , 3.67  , Eq    , 0.0
10    , 1     , , 9     3.67  , 3.67  , Eq    , 0.0
8     , 0     , , 9     4.9   , 3.67  , New   , 1.23  
8     , 1     , , 9     4.9   , 3.67  , New   , 1.23  
11    , 0     , , 10    3.68  , 3.67  , New   , 0.01  
11    , 2     , , 10    3.69  , 3.67  , New   , 0.02  
9     , 0     , , 10    4.9   , 3.67  , New   , 1.23  
9     , 2     , , 10    4.9   , 3.67  , New   , 1.23  
12    , 0     , , 11    3.71  , 3.68  , New   , 0.03  
12    , 3     , , 11    3.71  , 3.67  , New   , 0.04  
10    , 0     , , 11    4.9   , 3.67  , New   , 1.23  
10    , 3     , , 11    4.9   , 3.67  , New   , 1.23  
13    , 0     , , 12    4.24  , 4.23  , New   , 0.01  
13    , 4     , , 12    4.23  , 4.23  , Eq    , 0.0
11    , 0     , , 12    4.9   , 3.7   , New   , 1.2   
11    , 4     , , 12    4.9   , 3.73  , New   , 1.17  
14    , 0     , , 13    3.99  , 4.01  , Cur   , 0.02  
14    , 5     , , 13    3.98  , 3.98  , Eq    , 0.0
12    , 0     , , 13    4.9   , 3.69  , New   , 1.21  
12    , 5     , , 13    4.9   , 3.69  , New   , 1.21  
15    , 0     , , 14    3.99  , 3.97  , New   , 0.02  
15    , 6     , , 14    4.0   , 4.0   , Eq    , 0.0
13    , 0     , , 14    4.9   , 3.67  , New   , 1.23  
13    , 6     , , 14    4.9   , 3.67  , New   , 1.23  
16    , 0     , , 15    3.99  , 4.02  , Cur   , 0.03  
16    , 7     , , 15    4.01  , 3.96  , New   , 0.05  
14    , 0     , , 15    4.93  , 3.67  , New   , 1.26  
14    , 7     , , 15    4.92  , 3.67  , New   , 1.25  
17    , 0     , , 16    4.04  , 3.99  , New   , 0.05  
15    , 0     , , 16    5.42  , 4.22  , New   , 1.2   
18    , 0     , , 17    4.01  , 3.97  , New   , 0.04  
18    , 1     , , 17    3.99  , 3.98  , New   , 0.01  
16    , 0     , , 17    5.22  , 3.98  , New   , 1.24  
16    , 1     , , 17    5.19  , 3.98  , New   , 1.21  
19    , 0     , , 18    4.0   , 3.99  , New   , 0.01  
19    , 2     , , 18    4.03  , 3.97  , New   , 0.06  
17    , 0     , , 18    5.18  , 3.99  , New   , 1.19  
17    , 2     , , 18    5.18  , 3.98  , New   , 1.2   
20    , 0     , , 19    4.02  , 3.98  , New   , 0.04  
20    , 3     , , 19    4.0   , 3.98  , New   , 0.02  
18    , 0     , , 19    5.19  , 3.97  , New   , 1.22  
18    , 3     , , 19    5.21  , 3.98  , New   , 1.23  
21    , 0     , , 20    3.98  , 4.0   , Cur   , 0.02  
21    , 4     , , 20    4.0   , 4.0   , Eq    , 0.0
19    , 0     , , 20    5.19  , 3.99  , New   , 1.2   
19    , 4     , , 20    5.17  , 3.99  , New   , 1.18  
22    , 0     , , 21    4.03  , 3.98  , New   , 0.05  
22    , 5     , , 21    4.01  , 3.95  , New   , 0.06  
20    , 0     , , 21    5.19  , 4.0   , New   , 1.19  
20    , 5     , , 21    5.21  , 3.99  , New   , 1.22  
23    , 0     , , 22    4.06  , 3.97  , New   , 0.09  
23    , 6     , , 22    4.02  , 3.98  , New   , 0.04  
21    , 0     , , 22    5.2   , 4.02  , New   , 1.18  
21    , 6     , , 22    5.22  , 4.0   , New   , 1.22  
24    , 0     , , 23    4.15  , 3.98  , New   , 0.17  
24    , 7     , , 23    4.0   , 4.01  , Cur   , 0.01  
22    , 0     , , 23    5.28  , 4.0   , New   , 1.28  
22    , 7     , , 23    5.22  , 3.99  , New   , 1.23  
25    , 0     , , 24    4.1   , 4.04  , New   , 0.06  
23    , 0     , , 24    5.23  , 4.04  , New   , 1.19  
26    , 0     , , 25    4.1   , 4.06  , New   , 0.04  
26    , 1     , , 25    4.07  , 3.99  , New   , 0.08  
24    , 0     , , 25    5.26  , 4.02  , New   , 1.24  
24    , 1     , , 25    5.21  , 4.0   , New   , 1.21  
27    , 0     , , 26    4.17  , 4.03  , New   , 0.14  
27    , 2     , , 26    4.09  , 4.03  , New   , 0.06  
25    , 0     , , 26    5.29  , 4.1   , New   , 1.19  
25    , 2     , , 26    5.25  , 4.0   , New   , 1.25  
28    , 0     , , 27    4.06  , 4.1   , Cur   , 0.04  
28    , 3     , , 27    4.09  , 4.04  , New   , 0.05  
26    , 0     , , 27    5.26  , 4.04  , New   , 1.22  
26    , 3     , , 27    5.28  , 4.01  , New   , 1.27  
29    , 0     , , 28    4.07  , 4.02  , New   , 0.05  
29    , 4     , , 28    4.07  , 4.05  , New   , 0.02  
27    , 0     , , 28    5.25  , 4.02  , New   , 1.23  
27    , 4     , , 28    5.25  , 4.03  , New   , 1.22  
30    , 0     , , 29    4.14  , 4.06  , New   , 0.08  
30    , 5     , , 29    4.08  , 4.04  , New   , 0.04  
28    , 0     , , 29    5.26  , 4.07  , New   , 1.19  
28    , 5     , , 29    5.28  , 4.04  , New   , 1.24  
31    , 0     , , 30    4.09  , 4.08  , New   , 0.01  
31    , 6     , , 30    4.1   , 4.08  , New   , 0.02  
29    , 0     , , 30    5.28  , 4.05  , New   , 1.23  
29    , 6     , , 30    5.24  , 4.07  , New   , 1.17  
32    , 0     , , 31    4.1   , 4.13  , Cur   , 0.03  
32    , 7     , , 31    4.16  , 4.09  , New   , 0.07  
30    , 0     , , 31    5.31  , 4.09  , New   , 1.22  
30    , 7     , , 31    5.28  , 4.08  , New   , 1.2

Results For Icelake memchr-avx2
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    5.74  , 5.08  , New   , 0.66  
256   , 1     , , 64    5.16  , 4.93  , New   , 0.23  
2048  , 0     , , 64    4.86  , 4.69  , New   , 0.17  
256   , 2     , , 64    4.78  , 4.7   , New   , 0.08  
2048  , 0     , , 128   5.64  , 5.0   , New   , 0.64  
256   , 3     , , 64    4.64  , 4.59  , New   , 0.05  
2048  , 0     , , 256   9.07  , 9.17  , Cur   , 0.1   
256   , 4     , , 64    4.7   , 4.6   , New   , 0.1   
2048  , 0     , , 512   12.56 , 12.33 , New   , 0.23  
256   , 5     , , 64    4.72  , 4.61  , New   , 0.11  
2048  , 0     , , 1024  19.36 , 19.49 , Cur   , 0.13  
256   , 6     , , 64    4.82  , 4.69  , New   , 0.13  
2048  , 0     , , 2048  29.99 , 30.53 , Cur   , 0.54  
256   , 7     , , 64    4.9   , 4.85  , New   , 0.05  
192   , 1     , , 32    4.89  , 4.45  , New   , 0.44  
256   , 1     , , 32    4.93  , 4.44  , New   , 0.49  
512   , 1     , , 32    4.97  , 4.45  , New   , 0.52  
192   , 2     , , 64    5.04  , 4.65  , New   , 0.39  
512   , 2     , , 64    4.75  , 4.66  , New   , 0.09  
192   , 3     , , 96    5.14  , 4.66  , New   , 0.48  
256   , 3     , , 96    5.12  , 4.66  , New   , 0.46  
512   , 3     , , 96    5.13  , 4.62  , New   , 0.51  
192   , 4     , , 128   5.65  , 4.95  , New   , 0.7   
256   , 4     , , 128   5.63  , 4.95  , New   , 0.68  
512   , 4     , , 128   5.68  , 4.96  , New   , 0.72  
192   , 5     , , 160   6.1   , 5.84  , New   , 0.26  
256   , 5     , , 160   5.58  , 5.84  , Cur   , 0.26  
512   , 5     , , 160   7.95  , 7.74  , New   , 0.21  
192   , 6     , , 192   7.07  , 6.23  , New   , 0.84  
256   , 6     , , 192   6.34  , 6.09  , New   , 0.25  
512   , 6     , , 192   8.17  , 8.13  , New   , 0.04  
192   , 7     , , 224   7.06  , 6.23  , New   , 0.83  
256   , 7     , , 224   6.76  , 6.65  , New   , 0.11  
512   , 7     , , 224   8.29  , 8.08  , New   , 0.21  
2     , 0     , , 1     3.0   , 3.04  , Cur   , 0.04  
2     , 1     , , 1     3.06  , 3.07  , Cur   , 0.01  
0     , 0     , , 1     2.57  , 2.59  , Cur   , 0.02  
0     , 1     , , 1     2.6   , 2.61  , Cur   , 0.01  
3     , 0     , , 2     3.15  , 3.17  , Cur   , 0.02  
3     , 2     , , 2     3.19  , 3.21  , Cur   , 0.02  
1     , 0     , , 2     4.32  , 3.25  , New   , 1.07  
1     , 2     , , 2     4.36  , 3.31  , New   , 1.05  
4     , 0     , , 3     3.5   , 3.52  , Cur   , 0.02  
4     , 3     , , 3     3.52  , 3.54  , Cur   , 0.02  
2     , 0     , , 3     4.51  , 3.43  , New   , 1.08  
2     , 3     , , 3     4.56  , 3.47  , New   , 1.09  
5     , 0     , , 4     3.61  , 3.65  , Cur   , 0.04  
5     , 4     , , 4     3.63  , 3.67  , Cur   , 0.04  
3     , 0     , , 4     4.64  , 3.51  , New   , 1.13  
3     , 4     , , 4     4.7   , 3.51  , New   , 1.19  
6     , 0     , , 5     3.66  , 3.68  , Cur   , 0.02  
6     , 5     , , 5     3.69  , 3.65  , New   , 0.04  
4     , 0     , , 5     4.7   , 3.49  , New   , 1.21  
4     , 5     , , 5     4.58  , 3.48  , New   , 1.1   
7     , 0     , , 6     3.6   , 3.65  , Cur   , 0.05  
7     , 6     , , 6     3.59  , 3.64  , Cur   , 0.05  
5     , 0     , , 6     4.74  , 3.65  , New   , 1.09  
5     , 6     , , 6     4.73  , 3.64  , New   , 1.09  
8     , 0     , , 7     3.6   , 3.61  , Cur   , 0.01  
8     , 7     , , 7     3.6   , 3.61  , Cur   , 0.01  
6     , 0     , , 7     4.73  , 3.6   , New   , 1.13  
6     , 7     , , 7     4.73  , 3.62  , New   , 1.11  
9     , 0     , , 8     3.59  , 3.62  , Cur   , 0.03  
7     , 0     , , 8     4.72  , 3.64  , New   , 1.08  
10    , 0     , , 9     3.57  , 3.62  , Cur   , 0.05  
10    , 1     , , 9     3.56  , 3.61  , Cur   , 0.05  
8     , 0     , , 9     4.69  , 3.63  , New   , 1.06  
8     , 1     , , 9     4.71  , 3.61  , New   , 1.1   
11    , 0     , , 10    3.58  , 3.62  , Cur   , 0.04  
11    , 2     , , 10    3.59  , 3.63  , Cur   , 0.04  
9     , 0     , , 10    4.72  , 3.61  , New   , 1.11  
9     , 2     , , 10    4.7   , 3.61  , New   , 1.09  
12    , 0     , , 11    3.58  , 3.63  , Cur   , 0.05  
12    , 3     , , 11    3.58  , 3.62  , Cur   , 0.04  
10    , 0     , , 11    4.7   , 3.6   , New   , 1.1   
10    , 3     , , 11    4.73  , 3.64  , New   , 1.09  
13    , 0     , , 12    3.6   , 3.6   , Eq    , 0.0
13    , 4     , , 12    3.57  , 3.62  , Cur   , 0.05  
11    , 0     , , 12    4.73  , 3.62  , New   , 1.11  
11    , 4     , , 12    4.79  , 3.61  , New   , 1.18  
14    , 0     , , 13    3.61  , 3.62  , Cur   , 0.01  
14    , 5     , , 13    3.59  , 3.59  , Eq    , 0.0
12    , 0     , , 13    4.7   , 3.61  , New   , 1.09  
12    , 5     , , 13    4.75  , 3.58  , New   , 1.17  
15    , 0     , , 14    3.58  , 3.62  , Cur   , 0.04  
15    , 6     , , 14    3.59  , 3.62  , Cur   , 0.03  
13    , 0     , , 14    4.68  , 3.6   , New   , 1.08  
13    , 6     , , 14    4.68  , 3.63  , New   , 1.05  
16    , 0     , , 15    3.57  , 3.6   , Cur   , 0.03  
16    , 7     , , 15    3.55  , 3.59  , Cur   , 0.04  
14    , 0     , , 15    4.69  , 3.61  , New   , 1.08  
14    , 7     , , 15    4.69  , 3.61  , New   , 1.08  
17    , 0     , , 16    3.56  , 3.61  , Cur   , 0.05  
15    , 0     , , 16    4.71  , 3.58  , New   , 1.13  
18    , 0     , , 17    3.57  , 3.65  , Cur   , 0.08  
18    , 1     , , 17    3.58  , 3.59  , Cur   , 0.01  
16    , 0     , , 17    4.7   , 3.58  , New   , 1.12  
16    , 1     , , 17    4.68  , 3.59  , New   , 1.09  
19    , 0     , , 18    3.51  , 3.58  , Cur   , 0.07  
19    , 2     , , 18    3.55  , 3.58  , Cur   , 0.03  
17    , 0     , , 18    4.69  , 3.61  , New   , 1.08  
17    , 2     , , 18    4.68  , 3.61  , New   , 1.07  
20    , 0     , , 19    3.57  , 3.6   , Cur   , 0.03  
20    , 3     , , 19    3.59  , 3.59  , Eq    , 0.0
18    , 0     , , 19    4.68  , 3.59  , New   , 1.09  
18    , 3     , , 19    4.67  , 3.57  , New   , 1.1   
21    , 0     , , 20    3.61  , 3.58  , New   , 0.03  
21    , 4     , , 20    3.62  , 3.6   , New   , 0.02  
19    , 0     , , 20    4.74  , 3.57  , New   , 1.17  
19    , 4     , , 20    4.69  , 3.7   , New   , 0.99  
22    , 0     , , 21    3.57  , 3.64  , Cur   , 0.07  
22    , 5     , , 21    3.55  , 3.6   , Cur   , 0.05  
20    , 0     , , 21    4.72  , 3.55  , New   , 1.17  
20    , 5     , , 21    4.66  , 3.55  , New   , 1.11  
23    , 0     , , 22    3.56  , 3.56  , Eq    , 0.0
23    , 6     , , 22    3.54  , 3.56  , Cur   , 0.02  
21    , 0     , , 22    4.65  , 3.53  , New   , 1.12  
21    , 6     , , 22    4.62  , 3.56  , New   , 1.06  
24    , 0     , , 23    3.5   , 3.54  , Cur   , 0.04  
24    , 7     , , 23    3.52  , 3.53  , Cur   , 0.01  
22    , 0     , , 23    4.61  , 3.51  , New   , 1.1   
22    , 7     , , 23    4.6   , 3.51  , New   , 1.09  
25    , 0     , , 24    3.5   , 3.53  , Cur   , 0.03  
23    , 0     , , 24    4.54  , 3.5   , New   , 1.04  
26    , 0     , , 25    3.47  , 3.49  , Cur   , 0.02  
26    , 1     , , 25    3.46  , 3.51  , Cur   , 0.05  
24    , 0     , , 25    4.53  , 3.51  , New   , 1.02  
24    , 1     , , 25    4.51  , 3.51  , New   , 1.0   
27    , 0     , , 26    3.44  , 3.51  , Cur   , 0.07  
27    , 2     , , 26    3.51  , 3.52  , Cur   , 0.01  
25    , 0     , , 26    4.56  , 3.46  , New   , 1.1   
25    , 2     , , 26    4.55  , 3.47  , New   , 1.08  
28    , 0     , , 27    3.47  , 3.5   , Cur   , 0.03  
28    , 3     , , 27    3.48  , 3.47  , New   , 0.01  
26    , 0     , , 27    4.52  , 3.44  , New   , 1.08  
26    , 3     , , 27    4.55  , 3.46  , New   , 1.09  
29    , 0     , , 28    3.45  , 3.49  , Cur   , 0.04  
29    , 4     , , 28    3.5   , 3.5   , Eq    , 0.0
27    , 0     , , 28    4.56  , 3.49  , New   , 1.07  
27    , 4     , , 28    4.5   , 3.49  , New   , 1.01  
30    , 0     , , 29    3.44  , 3.48  , Cur   , 0.04  
30    , 5     , , 29    3.46  , 3.47  , Cur   , 0.01  
28    , 0     , , 29    4.49  , 3.43  , New   , 1.06  
28    , 5     , , 29    4.57  , 3.45  , New   , 1.12  
31    , 0     , , 30    3.48  , 3.48  , Eq    , 0.0
31    , 6     , , 30    3.46  , 3.49  , Cur   , 0.03  
29    , 0     , , 30    4.49  , 3.44  , New   , 1.05  
29    , 6     , , 30    4.53  , 3.44  , New   , 1.09  
32    , 0     , , 31    3.44  , 3.45  , Cur   , 0.01  
32    , 7     , , 31    3.46  , 3.51  , Cur   , 0.05  
30    , 0     , , 31    4.48  , 3.42  , New   , 1.06  
30    , 7     , , 31    4.48  , 3.44  , New   , 1.04


Results For Skylake memchr-avx2
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    6.61  , 5.4   , New   , 1.21  
256   , 1     , , 64    6.52  , 5.68  , New   , 0.84  
2048  , 0     , , 64    6.03  , 5.47  , New   , 0.56  
256   , 2     , , 64    6.07  , 5.42  , New   , 0.65  
2048  , 0     , , 128   7.01  , 5.83  , New   , 1.18  
256   , 3     , , 64    6.24  , 5.68  , New   , 0.56  
2048  , 0     , , 256   11.03 , 9.86  , New   , 1.17  
256   , 4     , , 64    6.17  , 5.49  , New   , 0.68  
2048  , 0     , , 512   14.11 , 13.41 , New   , 0.7   
256   , 5     , , 64    6.03  , 5.45  , New   , 0.58  
2048  , 0     , , 1024  19.82 , 19.92 , Cur   , 0.1   
256   , 6     , , 64    6.14  , 5.7   , New   , 0.44  
2048  , 0     , , 2048  30.9  , 30.59 , New   , 0.31  
256   , 7     , , 64    6.05  , 5.64  , New   , 0.41  
192   , 1     , , 32    5.6   , 4.89  , New   , 0.71  
256   , 1     , , 32    5.59  , 5.07  , New   , 0.52  
512   , 1     , , 32    5.58  , 4.93  , New   , 0.65  
192   , 2     , , 64    6.14  , 5.46  , New   , 0.68  
512   , 2     , , 64    5.95  , 5.38  , New   , 0.57  
192   , 3     , , 96    6.6   , 5.74  , New   , 0.86  
256   , 3     , , 96    6.48  , 5.37  , New   , 1.11  
512   , 3     , , 96    6.56  , 5.44  , New   , 1.12  
192   , 4     , , 128   7.04  , 6.02  , New   , 1.02  
256   , 4     , , 128   6.96  , 5.89  , New   , 1.07  
512   , 4     , , 128   6.97  , 5.99  , New   , 0.98  
192   , 5     , , 160   8.49  , 7.07  , New   , 1.42  
256   , 5     , , 160   8.1   , 6.96  , New   , 1.14  
512   , 5     , , 160   10.48 , 9.14  , New   , 1.34  
192   , 6     , , 192   8.46  , 8.52  , Cur   , 0.06  
256   , 6     , , 192   8.53  , 7.58  , New   , 0.95  
512   , 6     , , 192   10.88 , 9.06  , New   , 1.82  
192   , 7     , , 224   8.59  , 8.35  , New   , 0.24  
256   , 7     , , 224   8.86  , 7.91  , New   , 0.95  
512   , 7     , , 224   10.89 , 8.98  , New   , 1.91  
2     , 0     , , 1     4.28  , 3.62  , New   , 0.66  
2     , 1     , , 1     4.32  , 3.75  , New   , 0.57  
0     , 0     , , 1     3.76  , 3.24  , New   , 0.52  
0     , 1     , , 1     3.7   , 3.19  , New   , 0.51  
3     , 0     , , 2     4.16  , 3.67  , New   , 0.49  
3     , 2     , , 2     4.21  , 3.68  , New   , 0.53  
1     , 0     , , 2     4.25  , 3.74  , New   , 0.51  
1     , 2     , , 2     4.4   , 3.82  , New   , 0.58  
4     , 0     , , 3     4.43  , 3.88  , New   , 0.55  
4     , 3     , , 3     4.34  , 3.8   , New   , 0.54  
2     , 0     , , 3     4.33  , 3.79  , New   , 0.54  
2     , 3     , , 3     4.37  , 3.84  , New   , 0.53  
5     , 0     , , 4     4.45  , 3.87  , New   , 0.58  
5     , 4     , , 4     4.41  , 3.84  , New   , 0.57  
3     , 0     , , 4     4.34  , 3.83  , New   , 0.51  
3     , 4     , , 4     4.35  , 3.82  , New   , 0.53  
6     , 0     , , 5     4.41  , 3.88  , New   , 0.53  
6     , 5     , , 5     4.41  , 3.88  , New   , 0.53  
4     , 0     , , 5     4.35  , 3.84  , New   , 0.51  
4     , 5     , , 5     4.37  , 3.85  , New   , 0.52  
7     , 0     , , 6     4.4   , 3.84  , New   , 0.56  
7     , 6     , , 6     4.39  , 3.83  , New   , 0.56  
5     , 0     , , 6     4.37  , 3.85  , New   , 0.52  
5     , 6     , , 6     4.4   , 3.86  , New   , 0.54  
8     , 0     , , 7     4.39  , 3.88  , New   , 0.51  
8     , 7     , , 7     4.4   , 3.83  , New   , 0.57  
6     , 0     , , 7     4.39  , 3.85  , New   , 0.54  
6     , 7     , , 7     4.38  , 3.87  , New   , 0.51  
9     , 0     , , 8     4.47  , 3.96  , New   , 0.51  
7     , 0     , , 8     4.37  , 3.85  , New   , 0.52  
10    , 0     , , 9     4.61  , 4.08  , New   , 0.53  
10    , 1     , , 9     4.61  , 4.09  , New   , 0.52  
8     , 0     , , 9     4.37  , 3.85  , New   , 0.52  
8     , 1     , , 9     4.37  , 3.85  , New   , 0.52  
11    , 0     , , 10    4.68  , 4.06  , New   , 0.62  
11    , 2     , , 10    4.56  , 4.1   , New   , 0.46  
9     , 0     , , 10    4.36  , 3.83  , New   , 0.53  
9     , 2     , , 10    4.37  , 3.83  , New   , 0.54  
12    , 0     , , 11    4.62  , 4.05  , New   , 0.57  
12    , 3     , , 11    4.63  , 4.06  , New   , 0.57  
10    , 0     , , 11    4.38  , 3.86  , New   , 0.52  
10    , 3     , , 11    4.41  , 3.86  , New   , 0.55  
13    , 0     , , 12    4.57  , 4.08  , New   , 0.49  
13    , 4     , , 12    4.59  , 4.12  , New   , 0.47  
11    , 0     , , 12    4.45  , 4.0   , New   , 0.45  
11    , 4     , , 12    4.51  , 4.04  , New   , 0.47  
14    , 0     , , 13    4.64  , 4.16  , New   , 0.48  
14    , 5     , , 13    4.67  , 4.1   , New   , 0.57  
12    , 0     , , 13    4.58  , 4.08  , New   , 0.5   
12    , 5     , , 13    4.6   , 4.1   , New   , 0.5   
15    , 0     , , 14    4.61  , 4.05  , New   , 0.56  
15    , 6     , , 14    4.59  , 4.06  , New   , 0.53  
13    , 0     , , 14    4.57  , 4.06  , New   , 0.51  
13    , 6     , , 14    4.57  , 4.05  , New   , 0.52  
16    , 0     , , 15    4.62  , 4.05  , New   , 0.57  
16    , 7     , , 15    4.63  , 4.06  , New   , 0.57  
14    , 0     , , 15    4.61  , 4.06  , New   , 0.55  
14    , 7     , , 15    4.59  , 4.05  , New   , 0.54  
17    , 0     , , 16    4.58  , 4.08  , New   , 0.5   
15    , 0     , , 16    4.64  , 4.06  , New   , 0.58  
18    , 0     , , 17    4.56  , 4.17  , New   , 0.39  
18    , 1     , , 17    4.59  , 4.09  , New   , 0.5   
16    , 0     , , 17    4.59  , 4.07  , New   , 0.52  
16    , 1     , , 17    4.58  , 4.04  , New   , 0.54  
19    , 0     , , 18    4.61  , 4.05  , New   , 0.56  
19    , 2     , , 18    4.6   , 4.08  , New   , 0.52  
17    , 0     , , 18    4.64  , 4.11  , New   , 0.53  
17    , 2     , , 18    4.56  , 4.13  , New   , 0.43  
20    , 0     , , 19    4.77  , 4.3   , New   , 0.47  
20    , 3     , , 19    4.6   , 4.14  , New   , 0.46  
18    , 0     , , 19    4.72  , 4.02  , New   , 0.7   
18    , 3     , , 19    4.53  , 4.01  , New   , 0.52  
21    , 0     , , 20    4.66  , 4.26  , New   , 0.4   
21    , 4     , , 20    4.74  , 4.07  , New   , 0.67  
19    , 0     , , 20    4.62  , 4.12  , New   , 0.5   
19    , 4     , , 20    4.57  , 4.04  , New   , 0.53  
22    , 0     , , 21    4.61  , 4.13  , New   , 0.48  
22    , 5     , , 21    4.64  , 4.08  , New   , 0.56  
20    , 0     , , 21    4.49  , 4.01  , New   , 0.48  
20    , 5     , , 21    4.58  , 4.06  , New   , 0.52  
23    , 0     , , 22    4.62  , 4.13  , New   , 0.49  
23    , 6     , , 22    4.72  , 4.27  , New   , 0.45  
21    , 0     , , 22    4.65  , 3.97  , New   , 0.68  
21    , 6     , , 22    4.5   , 4.02  , New   , 0.48  
24    , 0     , , 23    4.78  , 4.07  , New   , 0.71  
24    , 7     , , 23    4.67  , 4.23  , New   , 0.44  
22    , 0     , , 23    4.49  , 3.99  , New   , 0.5   
22    , 7     , , 23    4.56  , 4.03  , New   , 0.53  
25    , 0     , , 24    4.6   , 4.15  , New   , 0.45  
23    , 0     , , 24    4.57  , 4.06  , New   , 0.51  
26    , 0     , , 25    4.54  , 4.14  , New   , 0.4   
26    , 1     , , 25    4.72  , 4.1   , New   , 0.62  
24    , 0     , , 25    4.52  , 4.13  , New   , 0.39  
24    , 1     , , 25    4.55  , 4.0   , New   , 0.55  
27    , 0     , , 26    4.51  , 4.06  , New   , 0.45  
27    , 2     , , 26    4.53  , 4.16  , New   , 0.37  
25    , 0     , , 26    4.59  , 4.09  , New   , 0.5   
25    , 2     , , 26    4.55  , 4.01  , New   , 0.54  
28    , 0     , , 27    4.59  , 3.99  , New   , 0.6   
28    , 3     , , 27    4.57  , 3.95  , New   , 0.62  
26    , 0     , , 27    4.55  , 4.15  , New   , 0.4   
26    , 3     , , 27    4.57  , 3.99  , New   , 0.58  
29    , 0     , , 28    4.41  , 4.03  , New   , 0.38  
29    , 4     , , 28    4.59  , 4.02  , New   , 0.57  
27    , 0     , , 28    4.63  , 4.08  , New   , 0.55  
27    , 4     , , 28    4.44  , 4.02  , New   , 0.42  
30    , 0     , , 29    4.53  , 3.93  , New   , 0.6   
30    , 5     , , 29    4.55  , 3.88  , New   , 0.67  
28    , 0     , , 29    4.49  , 3.9   , New   , 0.59  
28    , 5     , , 29    4.44  , 3.94  , New   , 0.5   
31    , 0     , , 30    4.41  , 3.85  , New   , 0.56  
31    , 6     , , 30    4.48  , 3.86  , New   , 0.62  
29    , 0     , , 30    4.55  , 3.94  , New   , 0.61  
29    , 6     , , 30    4.32  , 3.95  , New   , 0.37  
32    , 0     , , 31    4.36  , 3.91  , New   , 0.45  
32    , 7     , , 31    4.37  , 3.89  , New   , 0.48  
30    , 0     , , 31    4.65  , 3.9   , New   , 0.75  
30    , 7     , , 31    4.42  , 3.93  , New   , 0.49  

 sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
 1 file changed, 349 insertions(+), 231 deletions(-)

Comments

H.J. Lu May 3, 2021, 6:58 p.m. UTC | #1
On Mon, May 03, 2021 at 04:44:38AM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-evex.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> saving some ALU in the alignment process, and most importantly
> increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> test-wmemchr are all passing.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> Tests were run on the following CPUs:
> 
> Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> 
> Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
> 
> Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
> 
> All times are the geometric mean of N=20. The unit of time is
> seconds.
> 
> "Cur" refers to the current implementation
> "New" refers to this patch's implementation
> 
> Note: The numbers for size = [1, 32] are highly dependent on function
> alignment. That being said, the new implementation, which uses cmovcc
> instead of a branch for the [1, 32] case (mostly because of the high
> variance with different alignments), is far more consistent and
> performs about as well (and should only be a bigger improvement in
> cases where the sizes / position are not 100% predictable).
> 
> For memchr-evex the numbers are a near universal improvement. The one
> case where the current implementation is better is size = 0. For size =
> [1, 32] with pos < size the two implementations are about the
> same. For size = [1, 32] with pos > size, for medium range sizes, and
> for large sizes, however, the new implementation is faster.
> 
> Results For Tigerlake memchr-evex
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    5.58  , 5.22  , New   , 0.36  
> 256   , 1     , , 64    5.22  , 4.93  , New   , 0.29  
> 2048  , 0     , , 64    5.22  , 4.89  , New   , 0.33  
> 256   , 2     , , 64    5.14  , 4.81  , New   , 0.33  
> 2048  , 0     , , 128   6.3   , 5.67  , New   , 0.63  
> 256   , 3     , , 64    5.22  , 4.9   , New   , 0.32  
> 2048  , 0     , , 256   11.07 , 10.92 , New   , 0.15  
> 256   , 4     , , 64    5.16  , 4.86  , New   , 0.3   
> 2048  , 0     , , 512   15.66 , 14.81 , New   , 0.85  
> 256   , 5     , , 64    5.15  , 4.84  , New   , 0.31  
> 2048  , 0     , , 1024  25.7  , 23.02 , New   , 2.68  
> 256   , 6     , , 64    5.12  , 4.89  , New   , 0.23  
> 2048  , 0     , , 2048  42.34 , 37.71 , New   , 4.63  
> 256   , 7     , , 64    5.03  , 4.62  , New   , 0.41  
> 192   , 1     , , 32    4.96  , 4.28  , New   , 0.68  
> 256   , 1     , , 32    4.95  , 4.28  , New   , 0.67  
> 512   , 1     , , 32    4.94  , 4.29  , New   , 0.65  
> 192   , 2     , , 64    5.1   , 4.8   , New   , 0.3   
> 512   , 2     , , 64    5.12  , 4.72  , New   , 0.4   
> 192   , 3     , , 96    5.54  , 5.12  , New   , 0.42  
> 256   , 3     , , 96    5.52  , 5.15  , New   , 0.37  
> 512   , 3     , , 96    5.51  , 5.16  , New   , 0.35  
> 192   , 4     , , 128   6.1   , 5.53  , New   , 0.57  
> 256   , 4     , , 128   6.09  , 5.49  , New   , 0.6   
> 512   , 4     , , 128   6.08  , 5.48  , New   , 0.6   
> 192   , 5     , , 160   7.42  , 6.71  , New   , 0.71  
> 256   , 5     , , 160   6.86  , 6.71  , New   , 0.15  
> 512   , 5     , , 160   9.28  , 8.68  , New   , 0.6   
> 192   , 6     , , 192   7.94  , 7.47  , New   , 0.47  
> 256   , 6     , , 192   7.62  , 7.17  , New   , 0.45  
> 512   , 6     , , 192   9.2   , 9.16  , New   , 0.04  
> 192   , 7     , , 224   8.02  , 7.43  , New   , 0.59  
> 256   , 7     , , 224   8.34  , 7.85  , New   , 0.49  
> 512   , 7     , , 224   9.89  , 9.16  , New   , 0.73  
> 2     , 0     , , 1     3.0   , 3.0   , Eq    , 0.0
> 2     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
> 0     , 0     , , 1     3.01  , 3.6   , Cur   , 0.59  
> 0     , 1     , , 1     3.01  , 3.6   , Cur   , 0.59  
> 3     , 0     , , 2     3.0   , 3.0   , Eq    , 0.0
> 3     , 2     , , 2     3.0   , 3.0   , Eq    , 0.0
> 1     , 0     , , 2     3.6   , 3.0   , New   , 0.6   
> 1     , 2     , , 2     3.6   , 3.0   , New   , 0.6   
> 4     , 0     , , 3     3.01  , 3.01  , Eq    , 0.0
> 4     , 3     , , 3     3.01  , 3.01  , Eq    , 0.0
> 2     , 0     , , 3     3.62  , 3.02  , New   , 0.6   
> 2     , 3     , , 3     3.62  , 3.03  , New   , 0.59  
> 5     , 0     , , 4     3.02  , 3.03  , Cur   , 0.01  
> 5     , 4     , , 4     3.02  , 3.02  , Eq    , 0.0
> 3     , 0     , , 4     3.63  , 3.02  , New   , 0.61  
> 3     , 4     , , 4     3.63  , 3.04  , New   , 0.59  
> 6     , 0     , , 5     3.05  , 3.04  , New   , 0.01  
> 6     , 5     , , 5     3.02  , 3.02  , Eq    , 0.0
> 4     , 0     , , 5     3.63  , 3.02  , New   , 0.61  
> 4     , 5     , , 5     3.64  , 3.03  , New   , 0.61  
> 7     , 0     , , 6     3.03  , 3.03  , Eq    , 0.0
> 7     , 6     , , 6     3.02  , 3.02  , Eq    , 0.0
> 5     , 0     , , 6     3.64  , 3.01  , New   , 0.63  
> 5     , 6     , , 6     3.64  , 3.03  , New   , 0.61  
> 8     , 0     , , 7     3.03  , 3.04  , Cur   , 0.01  
> 8     , 7     , , 7     3.04  , 3.04  , Eq    , 0.0
> 6     , 0     , , 7     3.67  , 3.04  , New   , 0.63  
> 6     , 7     , , 7     3.65  , 3.05  , New   , 0.6   
> 9     , 0     , , 8     3.05  , 3.05  , Eq    , 0.0
> 7     , 0     , , 8     3.67  , 3.05  , New   , 0.62  
> 10    , 0     , , 9     3.06  , 3.06  , Eq    , 0.0
> 10    , 1     , , 9     3.06  , 3.06  , Eq    , 0.0
> 8     , 0     , , 9     3.67  , 3.06  , New   , 0.61  
> 8     , 1     , , 9     3.67  , 3.06  , New   , 0.61  
> 11    , 0     , , 10    3.06  , 3.06  , Eq    , 0.0
> 11    , 2     , , 10    3.07  , 3.06  , New   , 0.01  
> 9     , 0     , , 10    3.67  , 3.05  , New   , 0.62  
> 9     , 2     , , 10    3.67  , 3.06  , New   , 0.61  
> 12    , 0     , , 11    3.06  , 3.06  , Eq    , 0.0
> 12    , 3     , , 11    3.06  , 3.06  , Eq    , 0.0
> 10    , 0     , , 11    3.67  , 3.06  , New   , 0.61  
> 10    , 3     , , 11    3.67  , 3.06  , New   , 0.61  
> 13    , 0     , , 12    3.06  , 3.07  , Cur   , 0.01  
> 13    , 4     , , 12    3.06  , 3.07  , Cur   , 0.01  
> 11    , 0     , , 12    3.67  , 3.11  , New   , 0.56  
> 11    , 4     , , 12    3.68  , 3.12  , New   , 0.56  
> 14    , 0     , , 13    3.07  , 3.1   , Cur   , 0.03  
> 14    , 5     , , 13    3.06  , 3.07  , Cur   , 0.01  
> 12    , 0     , , 13    3.67  , 3.07  , New   , 0.6   
> 12    , 5     , , 13    3.67  , 3.08  , New   , 0.59  
> 15    , 0     , , 14    3.06  , 3.06  , Eq    , 0.0
> 15    , 6     , , 14    3.07  , 3.06  , New   , 0.01  
> 13    , 0     , , 14    3.67  , 3.06  , New   , 0.61  
> 13    , 6     , , 14    3.68  , 3.06  , New   , 0.62  
> 16    , 0     , , 15    3.06  , 3.06  , Eq    , 0.0
> 16    , 7     , , 15    3.06  , 3.05  , New   , 0.01  
> 14    , 0     , , 15    3.68  , 3.06  , New   , 0.62  
> 14    , 7     , , 15    3.67  , 3.06  , New   , 0.61  
> 17    , 0     , , 16    3.07  , 3.06  , New   , 0.01  
> 15    , 0     , , 16    3.68  , 3.06  , New   , 0.62  
> 18    , 0     , , 17    3.06  , 3.06  , Eq    , 0.0
> 18    , 1     , , 17    3.06  , 3.06  , Eq    , 0.0
> 16    , 0     , , 17    3.67  , 3.06  , New   , 0.61  
> 16    , 1     , , 17    3.67  , 3.05  , New   , 0.62  
> 19    , 0     , , 18    3.07  , 3.06  , New   , 0.01  
> 19    , 2     , , 18    3.06  , 3.06  , Eq    , 0.0
> 17    , 0     , , 18    3.68  , 3.08  , New   , 0.6   
> 17    , 2     , , 18    3.68  , 3.06  , New   , 0.62  
> 20    , 0     , , 19    3.06  , 3.06  , Eq    , 0.0
> 20    , 3     , , 19    3.06  , 3.06  , Eq    , 0.0
> 18    , 0     , , 19    3.68  , 3.06  , New   , 0.62  
> 18    , 3     , , 19    3.68  , 3.06  , New   , 0.62  
> 21    , 0     , , 20    3.06  , 3.06  , Eq    , 0.0
> 21    , 4     , , 20    3.06  , 3.06  , Eq    , 0.0
> 19    , 0     , , 20    3.67  , 3.06  , New   , 0.61  
> 19    , 4     , , 20    3.67  , 3.06  , New   , 0.61  
> 22    , 0     , , 21    3.06  , 3.06  , Eq    , 0.0
> 22    , 5     , , 21    3.06  , 3.06  , Eq    , 0.0
> 20    , 0     , , 21    3.67  , 3.05  , New   , 0.62  
> 20    , 5     , , 21    3.68  , 3.06  , New   , 0.62  
> 23    , 0     , , 22    3.07  , 3.06  , New   , 0.01  
> 23    , 6     , , 22    3.06  , 3.06  , Eq    , 0.0
> 21    , 0     , , 22    3.68  , 3.07  , New   , 0.61  
> 21    , 6     , , 22    3.67  , 3.06  , New   , 0.61  
> 24    , 0     , , 23    3.19  , 3.06  , New   , 0.13  
> 24    , 7     , , 23    3.08  , 3.06  , New   , 0.02  
> 22    , 0     , , 23    3.69  , 3.06  , New   , 0.63  
> 22    , 7     , , 23    3.68  , 3.06  , New   , 0.62  
> 25    , 0     , , 24    3.07  , 3.06  , New   , 0.01  
> 23    , 0     , , 24    3.68  , 3.06  , New   , 0.62  
> 26    , 0     , , 25    3.06  , 3.05  , New   , 0.01  
> 26    , 1     , , 25    3.07  , 3.06  , New   , 0.01  
> 24    , 0     , , 25    3.67  , 3.05  , New   , 0.62  
> 24    , 1     , , 25    3.68  , 3.06  , New   , 0.62  
> 27    , 0     , , 26    3.12  , 3.06  , New   , 0.06  
> 27    , 2     , , 26    3.08  , 3.06  , New   , 0.02  
> 25    , 0     , , 26    3.69  , 3.06  , New   , 0.63  
> 25    , 2     , , 26    3.67  , 3.06  , New   , 0.61  
> 28    , 0     , , 27    3.06  , 3.06  , Eq    , 0.0
> 28    , 3     , , 27    3.06  , 3.06  , Eq    , 0.0
> 26    , 0     , , 27    3.67  , 3.06  , New   , 0.61  
> 26    , 3     , , 27    3.67  , 3.06  , New   , 0.61  
> 29    , 0     , , 28    3.06  , 3.06  , Eq    , 0.0
> 29    , 4     , , 28    3.06  , 3.06  , Eq    , 0.0
> 27    , 0     , , 28    3.68  , 3.05  , New   , 0.63  
> 27    , 4     , , 28    3.67  , 3.06  , New   , 0.61  
> 30    , 0     , , 29    3.06  , 3.06  , Eq    , 0.0
> 30    , 5     , , 29    3.06  , 3.06  , Eq    , 0.0
> 28    , 0     , , 29    3.67  , 3.06  , New   , 0.61  
> 28    , 5     , , 29    3.68  , 3.06  , New   , 0.62  
> 31    , 0     , , 30    3.06  , 3.06  , Eq    , 0.0
> 31    , 6     , , 30    3.06  , 3.06  , Eq    , 0.0
> 29    , 0     , , 30    3.68  , 3.06  , New   , 0.62  
> 29    , 6     , , 30    3.7   , 3.06  , New   , 0.64  
> 32    , 0     , , 31    3.17  , 3.06  , New   , 0.11  
> 32    , 7     , , 31    3.12  , 3.06  , New   , 0.06  
> 30    , 0     , , 31    3.68  , 3.06  , New   , 0.62  
> 30    , 7     , , 31    3.68  , 3.06  , New   , 0.62
> 
> Results For Icelake memchr-evex
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    4.94  , 4.26  , New   , 0.68  
> 256   , 1     , , 64    4.5   , 4.13  , New   , 0.37  
> 2048  , 0     , , 64    4.19  , 3.9   , New   , 0.29  
> 256   , 2     , , 64    4.19  , 3.87  , New   , 0.32  
> 2048  , 0     , , 128   4.96  , 4.53  , New   , 0.43  
> 256   , 3     , , 64    4.07  , 3.86  , New   , 0.21  
> 2048  , 0     , , 256   8.77  , 8.61  , New   , 0.16  
> 256   , 4     , , 64    4.08  , 3.87  , New   , 0.21  
> 2048  , 0     , , 512   12.22 , 11.67 , New   , 0.55  
> 256   , 5     , , 64    4.12  , 3.83  , New   , 0.29  
> 2048  , 0     , , 1024  20.06 , 18.09 , New   , 1.97  
> 256   , 6     , , 64    4.2   , 3.95  , New   , 0.25  
> 2048  , 0     , , 2048  33.83 , 30.62 , New   , 3.21  
> 256   , 7     , , 64    4.3   , 4.04  , New   , 0.26  
> 192   , 1     , , 32    4.2   , 3.71  , New   , 0.49  
> 256   , 1     , , 32    4.24  , 3.76  , New   , 0.48  
> 512   , 1     , , 32    4.29  , 3.74  , New   , 0.55  
> 192   , 2     , , 64    4.42  , 4.0   , New   , 0.42  
> 512   , 2     , , 64    4.17  , 3.83  , New   , 0.34  
> 192   , 3     , , 96    4.44  , 4.26  , New   , 0.18  
> 256   , 3     , , 96    4.45  , 4.14  , New   , 0.31  
> 512   , 3     , , 96    4.42  , 4.15  , New   , 0.27  
> 192   , 4     , , 128   4.93  , 4.45  , New   , 0.48  
> 256   , 4     , , 128   4.93  , 4.47  , New   , 0.46  
> 512   , 4     , , 128   4.95  , 4.47  , New   , 0.48  
> 192   , 5     , , 160   5.95  , 5.44  , New   , 0.51  
> 256   , 5     , , 160   5.59  , 5.47  , New   , 0.12  
> 512   , 5     , , 160   7.59  , 7.34  , New   , 0.25  
> 192   , 6     , , 192   6.53  , 6.08  , New   , 0.45  
> 256   , 6     , , 192   6.2   , 5.88  , New   , 0.32  
> 512   , 6     , , 192   7.53  , 7.62  , Cur   , 0.09  
> 192   , 7     , , 224   6.62  , 6.12  , New   , 0.5   
> 256   , 7     , , 224   6.79  , 6.51  , New   , 0.28  
> 512   , 7     , , 224   8.12  , 7.61  , New   , 0.51  
> 2     , 0     , , 1     2.5   , 2.54  , Cur   , 0.04  
> 2     , 1     , , 1     2.56  , 2.55  , New   , 0.01  
> 0     , 0     , , 1     2.57  , 3.12  , Cur   , 0.55  
> 0     , 1     , , 1     2.59  , 3.14  , Cur   , 0.55  
> 3     , 0     , , 2     2.62  , 2.63  , Cur   , 0.01  
> 3     , 2     , , 2     2.66  , 2.67  , Cur   , 0.01  
> 1     , 0     , , 2     3.24  , 2.72  , New   , 0.52  
> 1     , 2     , , 2     3.28  , 2.75  , New   , 0.53  
> 4     , 0     , , 3     2.78  , 2.8   , Cur   , 0.02  
> 4     , 3     , , 3     2.8   , 2.82  , Cur   , 0.02  
> 2     , 0     , , 3     3.38  , 2.86  , New   , 0.52  
> 2     , 3     , , 3     3.41  , 2.89  , New   , 0.52  
> 5     , 0     , , 4     2.88  , 2.91  , Cur   , 0.03  
> 5     , 4     , , 4     2.88  , 2.92  , Cur   , 0.04  
> 3     , 0     , , 4     3.48  , 2.93  , New   , 0.55  
> 3     , 4     , , 4     3.47  , 2.93  , New   , 0.54  
> 6     , 0     , , 5     2.95  , 2.94  , New   , 0.01  
> 6     , 5     , , 5     2.91  , 2.92  , Cur   , 0.01  
> 4     , 0     , , 5     3.47  , 2.9   , New   , 0.57  
> 4     , 5     , , 5     3.43  , 2.91  , New   , 0.52  
> 7     , 0     , , 6     2.87  , 2.9   , Cur   , 0.03  
> 7     , 6     , , 6     2.87  , 2.89  , Cur   , 0.02  
> 5     , 0     , , 6     3.44  , 2.88  , New   , 0.56  
> 5     , 6     , , 6     3.41  , 2.87  , New   , 0.54  
> 8     , 0     , , 7     2.86  , 2.87  , Cur   , 0.01  
> 8     , 7     , , 7     2.86  , 2.87  , Cur   , 0.01  
> 6     , 0     , , 7     3.43  , 2.87  , New   , 0.56  
> 6     , 7     , , 7     3.44  , 2.87  , New   , 0.57  
> 9     , 0     , , 8     2.86  , 2.88  , Cur   , 0.02  
> 7     , 0     , , 8     3.41  , 2.89  , New   , 0.52  
> 10    , 0     , , 9     2.83  , 2.87  , Cur   , 0.04  
> 10    , 1     , , 9     2.82  , 2.87  , Cur   , 0.05  
> 8     , 0     , , 9     3.4   , 2.89  , New   , 0.51  
> 8     , 1     , , 9     3.41  , 2.87  , New   , 0.54  
> 11    , 0     , , 10    2.83  , 2.88  , Cur   , 0.05  
> 11    , 2     , , 10    2.84  , 2.88  , Cur   , 0.04  
> 9     , 0     , , 10    3.41  , 2.87  , New   , 0.54  
> 9     , 2     , , 10    3.41  , 2.88  , New   , 0.53  
> 12    , 0     , , 11    2.83  , 2.89  , Cur   , 0.06  
> 12    , 3     , , 11    2.85  , 2.87  , Cur   , 0.02  
> 10    , 0     , , 11    3.41  , 2.87  , New   , 0.54  
> 10    , 3     , , 11    3.42  , 2.88  , New   , 0.54  
> 13    , 0     , , 12    2.86  , 2.87  , Cur   , 0.01  
> 13    , 4     , , 12    2.84  , 2.88  , Cur   , 0.04  
> 11    , 0     , , 12    3.43  , 2.87  , New   , 0.56  
> 11    , 4     , , 12    3.49  , 2.87  , New   , 0.62  
> 14    , 0     , , 13    2.85  , 2.86  , Cur   , 0.01  
> 14    , 5     , , 13    2.85  , 2.86  , Cur   , 0.01  
> 12    , 0     , , 13    3.41  , 2.86  , New   , 0.55  
> 12    , 5     , , 13    3.44  , 2.85  , New   , 0.59  
> 15    , 0     , , 14    2.83  , 2.87  , Cur   , 0.04  
> 15    , 6     , , 14    2.82  , 2.86  , Cur   , 0.04  
> 13    , 0     , , 14    3.41  , 2.86  , New   , 0.55  
> 13    , 6     , , 14    3.4   , 2.86  , New   , 0.54  
> 16    , 0     , , 15    2.84  , 2.86  , Cur   , 0.02  
> 16    , 7     , , 15    2.83  , 2.85  , Cur   , 0.02  
> 14    , 0     , , 15    3.41  , 2.85  , New   , 0.56  
> 14    , 7     , , 15    3.39  , 2.87  , New   , 0.52  
> 17    , 0     , , 16    2.83  , 2.87  , Cur   , 0.04  
> 15    , 0     , , 16    3.4   , 2.85  , New   , 0.55  
> 18    , 0     , , 17    2.83  , 2.86  , Cur   , 0.03  
> 18    , 1     , , 17    2.85  , 2.84  , New   , 0.01  
> 16    , 0     , , 17    3.41  , 2.85  , New   , 0.56  
> 16    , 1     , , 17    3.4   , 2.86  , New   , 0.54  
> 19    , 0     , , 18    2.8   , 2.84  , Cur   , 0.04  
> 19    , 2     , , 18    2.82  , 2.83  , Cur   , 0.01  
> 17    , 0     , , 18    3.39  , 2.86  , New   , 0.53  
> 17    , 2     , , 18    3.39  , 2.84  , New   , 0.55  
> 20    , 0     , , 19    2.85  , 2.87  , Cur   , 0.02  
> 20    , 3     , , 19    2.88  , 2.87  , New   , 0.01  
> 18    , 0     , , 19    3.38  , 2.85  , New   , 0.53  
> 18    , 3     , , 19    3.4   , 2.85  , New   , 0.55  
> 21    , 0     , , 20    2.83  , 2.85  , Cur   , 0.02  
> 21    , 4     , , 20    2.88  , 2.85  , New   , 0.03  
> 19    , 0     , , 20    3.39  , 2.84  , New   , 0.55  
> 19    , 4     , , 20    3.39  , 2.96  , New   , 0.43  
> 22    , 0     , , 21    2.84  , 2.9   , Cur   , 0.06  
> 22    , 5     , , 21    2.81  , 2.84  , Cur   , 0.03  
> 20    , 0     , , 21    3.41  , 2.81  , New   , 0.6   
> 20    , 5     , , 21    3.38  , 2.83  , New   , 0.55  
> 23    , 0     , , 22    2.8   , 2.82  , Cur   , 0.02  
> 23    , 6     , , 22    2.81  , 2.83  , Cur   , 0.02  
> 21    , 0     , , 22    3.35  , 2.81  , New   , 0.54  
> 21    , 6     , , 22    3.34  , 2.81  , New   , 0.53  
> 24    , 0     , , 23    2.77  , 2.84  , Cur   , 0.07  
> 24    , 7     , , 23    2.78  , 2.8   , Cur   , 0.02  
> 22    , 0     , , 23    3.34  , 2.79  , New   , 0.55  
> 22    , 7     , , 23    3.32  , 2.79  , New   , 0.53  
> 25    , 0     , , 24    2.77  , 2.8   , Cur   , 0.03  
> 23    , 0     , , 24    3.29  , 2.79  , New   , 0.5   
> 26    , 0     , , 25    2.73  , 2.78  , Cur   , 0.05  
> 26    , 1     , , 25    2.75  , 2.79  , Cur   , 0.04  
> 24    , 0     , , 25    3.27  , 2.79  , New   , 0.48  
> 24    , 1     , , 25    3.27  , 2.77  , New   , 0.5   
> 27    , 0     , , 26    2.72  , 2.78  , Cur   , 0.06  
> 27    , 2     , , 26    2.75  , 2.76  , Cur   , 0.01  
> 25    , 0     , , 26    3.29  , 2.73  , New   , 0.56  
> 25    , 2     , , 26    3.3   , 2.76  , New   , 0.54  
> 28    , 0     , , 27    2.75  , 2.79  , Cur   , 0.04  
> 28    , 3     , , 27    2.77  , 2.77  , Eq    , 0.0
> 26    , 0     , , 27    3.28  , 2.78  , New   , 0.5   
> 26    , 3     , , 27    3.29  , 2.78  , New   , 0.51  
> 29    , 0     , , 28    2.74  , 2.76  , Cur   , 0.02  
> 29    , 4     , , 28    2.74  , 2.77  , Cur   , 0.03  
> 27    , 0     , , 28    3.3   , 2.76  , New   , 0.54  
> 27    , 4     , , 28    3.3   , 2.74  , New   , 0.56  
> 30    , 0     , , 29    2.72  , 2.76  , Cur   , 0.04  
> 30    , 5     , , 29    2.74  , 2.75  , Cur   , 0.01  
> 28    , 0     , , 29    3.25  , 2.73  , New   , 0.52  
> 28    , 5     , , 29    3.3   , 2.73  , New   , 0.57  
> 31    , 0     , , 30    2.73  , 2.77  , Cur   , 0.04  
> 31    , 6     , , 30    2.74  , 2.76  , Cur   , 0.02  
> 29    , 0     , , 30    3.25  , 2.73  , New   , 0.52  
> 29    , 6     , , 30    3.26  , 2.74  , New   , 0.52  
> 32    , 0     , , 31    2.73  , 2.74  , Cur   , 0.01  
> 32    , 7     , , 31    2.73  , 2.75  , Cur   , 0.02  
> 30    , 0     , , 31    3.24  , 2.72  , New   , 0.52  
> 30    , 7     , , 31    3.24  , 2.72  , New   , 0.52
> 
> For memchr-avx2 the improvements are more modest, though again near
> universal. The improvement is most significant for medium sizes and
> small sizes with pos > size. For small sizes with pos < size and for
> large sizes the two implementations perform roughly the same.
> 
> Results For Tigerlake memchr-avx2
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    6.15  , 6.27  , Cur   , 0.12  
> 256   , 1     , , 64    6.21  , 6.03  , New   , 0.18  
> 2048  , 0     , , 64    6.07  , 5.95  , New   , 0.12  
> 256   , 2     , , 64    6.01  , 5.8   , New   , 0.21  
> 2048  , 0     , , 128   7.05  , 6.55  , New   , 0.5   
> 256   , 3     , , 64    6.14  , 5.83  , New   , 0.31  
> 2048  , 0     , , 256   11.78 , 11.78 , Eq    , 0.0
> 256   , 4     , , 64    6.1   , 5.85  , New   , 0.25  
> 2048  , 0     , , 512   16.32 , 15.96 , New   , 0.36  
> 256   , 5     , , 64    6.1   , 5.77  , New   , 0.33  
> 2048  , 0     , , 1024  25.38 , 25.18 , New   , 0.2   
> 256   , 6     , , 64    6.08  , 5.88  , New   , 0.2   
> 2048  , 0     , , 2048  38.56 , 38.32 , New   , 0.24  
> 256   , 7     , , 64    5.93  , 5.68  , New   , 0.25  
> 192   , 1     , , 32    5.49  , 5.3   , New   , 0.19  
> 256   , 1     , , 32    5.5   , 5.28  , New   , 0.22  
> 512   , 1     , , 32    5.48  , 5.32  , New   , 0.16  
> 192   , 2     , , 64    6.1   , 5.73  , New   , 0.37  
> 512   , 2     , , 64    5.88  , 5.72  , New   , 0.16  
> 192   , 3     , , 96    6.31  , 5.93  , New   , 0.38  
> 256   , 3     , , 96    6.32  , 5.93  , New   , 0.39  
> 512   , 3     , , 96    6.2   , 5.94  , New   , 0.26  
> 192   , 4     , , 128   6.65  , 6.4   , New   , 0.25  
> 256   , 4     , , 128   6.6   , 6.37  , New   , 0.23  
> 512   , 4     , , 128   6.74  , 6.33  , New   , 0.41  
> 192   , 5     , , 160   7.78  , 7.4   , New   , 0.38  
> 256   , 5     , , 160   7.18  , 7.4   , Cur   , 0.22  
> 512   , 5     , , 160   9.81  , 9.44  , New   , 0.37  
> 192   , 6     , , 192   9.12  , 7.77  , New   , 1.35  
> 256   , 6     , , 192   7.97  , 7.66  , New   , 0.31  
> 512   , 6     , , 192   10.14 , 9.95  , New   , 0.19  
> 192   , 7     , , 224   8.96  , 7.78  , New   , 1.18  
> 256   , 7     , , 224   8.52  , 8.23  , New   , 0.29  
> 512   , 7     , , 224   10.33 , 9.98  , New   , 0.35  
> 2     , 0     , , 1     3.61  , 3.6   , New   , 0.01  
> 2     , 1     , , 1     3.6   , 3.6   , Eq    , 0.0
> 0     , 0     , , 1     3.02  , 3.0   , New   , 0.02  
> 0     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
> 3     , 0     , , 2     3.6   , 3.6   , Eq    , 0.0
> 3     , 2     , , 2     3.61  , 3.6   , New   , 0.01  
> 1     , 0     , , 2     4.82  , 3.6   , New   , 1.22  
> 1     , 2     , , 2     4.81  , 3.6   , New   , 1.21  
> 4     , 0     , , 3     3.61  , 3.61  , Eq    , 0.0
> 4     , 3     , , 3     3.62  , 3.61  , New   , 0.01  
> 2     , 0     , , 3     4.82  , 3.62  , New   , 1.2   
> 2     , 3     , , 3     4.83  , 3.63  , New   , 1.2   
> 5     , 0     , , 4     3.63  , 3.64  , Cur   , 0.01  
> 5     , 4     , , 4     3.63  , 3.62  , New   , 0.01  
> 3     , 0     , , 4     4.84  , 3.62  , New   , 1.22  
> 3     , 4     , , 4     4.84  , 3.64  , New   , 1.2   
> 6     , 0     , , 5     3.66  , 3.64  , New   , 0.02  
> 6     , 5     , , 5     3.65  , 3.62  , New   , 0.03  
> 4     , 0     , , 5     4.83  , 3.63  , New   , 1.2   
> 4     , 5     , , 5     4.85  , 3.64  , New   , 1.21  
> 7     , 0     , , 6     3.76  , 3.79  , Cur   , 0.03  
> 7     , 6     , , 6     3.76  , 3.72  , New   , 0.04  
> 5     , 0     , , 6     4.84  , 3.62  , New   , 1.22  
> 5     , 6     , , 6     4.85  , 3.64  , New   , 1.21  
> 8     , 0     , , 7     3.64  , 3.65  , Cur   , 0.01  
> 8     , 7     , , 7     3.65  , 3.65  , Eq    , 0.0
> 6     , 0     , , 7     4.88  , 3.64  , New   , 1.24  
> 6     , 7     , , 7     4.87  , 3.65  , New   , 1.22  
> 9     , 0     , , 8     3.66  , 3.66  , Eq    , 0.0
> 7     , 0     , , 8     4.89  , 3.66  , New   , 1.23  
> 10    , 0     , , 9     3.67  , 3.67  , Eq    , 0.0
> 10    , 1     , , 9     3.67  , 3.67  , Eq    , 0.0
> 8     , 0     , , 9     4.9   , 3.67  , New   , 1.23  
> 8     , 1     , , 9     4.9   , 3.67  , New   , 1.23  
> 11    , 0     , , 10    3.68  , 3.67  , New   , 0.01  
> 11    , 2     , , 10    3.69  , 3.67  , New   , 0.02  
> 9     , 0     , , 10    4.9   , 3.67  , New   , 1.23  
> 9     , 2     , , 10    4.9   , 3.67  , New   , 1.23  
> 12    , 0     , , 11    3.71  , 3.68  , New   , 0.03  
> 12    , 3     , , 11    3.71  , 3.67  , New   , 0.04  
> 10    , 0     , , 11    4.9   , 3.67  , New   , 1.23  
> 10    , 3     , , 11    4.9   , 3.67  , New   , 1.23  
> 13    , 0     , , 12    4.24  , 4.23  , New   , 0.01  
> 13    , 4     , , 12    4.23  , 4.23  , Eq    , 0.0
> 11    , 0     , , 12    4.9   , 3.7   , New   , 1.2   
> 11    , 4     , , 12    4.9   , 3.73  , New   , 1.17  
> 14    , 0     , , 13    3.99  , 4.01  , Cur   , 0.02  
> 14    , 5     , , 13    3.98  , 3.98  , Eq    , 0.0
> 12    , 0     , , 13    4.9   , 3.69  , New   , 1.21  
> 12    , 5     , , 13    4.9   , 3.69  , New   , 1.21  
> 15    , 0     , , 14    3.99  , 3.97  , New   , 0.02  
> 15    , 6     , , 14    4.0   , 4.0   , Eq    , 0.0
> 13    , 0     , , 14    4.9   , 3.67  , New   , 1.23  
> 13    , 6     , , 14    4.9   , 3.67  , New   , 1.23  
> 16    , 0     , , 15    3.99  , 4.02  , Cur   , 0.03  
> 16    , 7     , , 15    4.01  , 3.96  , New   , 0.05  
> 14    , 0     , , 15    4.93  , 3.67  , New   , 1.26  
> 14    , 7     , , 15    4.92  , 3.67  , New   , 1.25  
> 17    , 0     , , 16    4.04  , 3.99  , New   , 0.05  
> 15    , 0     , , 16    5.42  , 4.22  , New   , 1.2   
> 18    , 0     , , 17    4.01  , 3.97  , New   , 0.04  
> 18    , 1     , , 17    3.99  , 3.98  , New   , 0.01  
> 16    , 0     , , 17    5.22  , 3.98  , New   , 1.24  
> 16    , 1     , , 17    5.19  , 3.98  , New   , 1.21  
> 19    , 0     , , 18    4.0   , 3.99  , New   , 0.01  
> 19    , 2     , , 18    4.03  , 3.97  , New   , 0.06  
> 17    , 0     , , 18    5.18  , 3.99  , New   , 1.19  
> 17    , 2     , , 18    5.18  , 3.98  , New   , 1.2   
> 20    , 0     , , 19    4.02  , 3.98  , New   , 0.04  
> 20    , 3     , , 19    4.0   , 3.98  , New   , 0.02  
> 18    , 0     , , 19    5.19  , 3.97  , New   , 1.22  
> 18    , 3     , , 19    5.21  , 3.98  , New   , 1.23  
> 21    , 0     , , 20    3.98  , 4.0   , Cur   , 0.02  
> 21    , 4     , , 20    4.0   , 4.0   , Eq    , 0.0
> 19    , 0     , , 20    5.19  , 3.99  , New   , 1.2   
> 19    , 4     , , 20    5.17  , 3.99  , New   , 1.18  
> 22    , 0     , , 21    4.03  , 3.98  , New   , 0.05  
> 22    , 5     , , 21    4.01  , 3.95  , New   , 0.06  
> 20    , 0     , , 21    5.19  , 4.0   , New   , 1.19  
> 20    , 5     , , 21    5.21  , 3.99  , New   , 1.22  
> 23    , 0     , , 22    4.06  , 3.97  , New   , 0.09  
> 23    , 6     , , 22    4.02  , 3.98  , New   , 0.04  
> 21    , 0     , , 22    5.2   , 4.02  , New   , 1.18  
> 21    , 6     , , 22    5.22  , 4.0   , New   , 1.22  
> 24    , 0     , , 23    4.15  , 3.98  , New   , 0.17  
> 24    , 7     , , 23    4.0   , 4.01  , Cur   , 0.01  
> 22    , 0     , , 23    5.28  , 4.0   , New   , 1.28  
> 22    , 7     , , 23    5.22  , 3.99  , New   , 1.23  
> 25    , 0     , , 24    4.1   , 4.04  , New   , 0.06  
> 23    , 0     , , 24    5.23  , 4.04  , New   , 1.19  
> 26    , 0     , , 25    4.1   , 4.06  , New   , 0.04  
> 26    , 1     , , 25    4.07  , 3.99  , New   , 0.08  
> 24    , 0     , , 25    5.26  , 4.02  , New   , 1.24  
> 24    , 1     , , 25    5.21  , 4.0   , New   , 1.21  
> 27    , 0     , , 26    4.17  , 4.03  , New   , 0.14  
> 27    , 2     , , 26    4.09  , 4.03  , New   , 0.06  
> 25    , 0     , , 26    5.29  , 4.1   , New   , 1.19  
> 25    , 2     , , 26    5.25  , 4.0   , New   , 1.25  
> 28    , 0     , , 27    4.06  , 4.1   , Cur   , 0.04  
> 28    , 3     , , 27    4.09  , 4.04  , New   , 0.05  
> 26    , 0     , , 27    5.26  , 4.04  , New   , 1.22  
> 26    , 3     , , 27    5.28  , 4.01  , New   , 1.27  
> 29    , 0     , , 28    4.07  , 4.02  , New   , 0.05  
> 29    , 4     , , 28    4.07  , 4.05  , New   , 0.02  
> 27    , 0     , , 28    5.25  , 4.02  , New   , 1.23  
> 27    , 4     , , 28    5.25  , 4.03  , New   , 1.22  
> 30    , 0     , , 29    4.14  , 4.06  , New   , 0.08  
> 30    , 5     , , 29    4.08  , 4.04  , New   , 0.04  
> 28    , 0     , , 29    5.26  , 4.07  , New   , 1.19  
> 28    , 5     , , 29    5.28  , 4.04  , New   , 1.24  
> 31    , 0     , , 30    4.09  , 4.08  , New   , 0.01  
> 31    , 6     , , 30    4.1   , 4.08  , New   , 0.02  
> 29    , 0     , , 30    5.28  , 4.05  , New   , 1.23  
> 29    , 6     , , 30    5.24  , 4.07  , New   , 1.17  
> 32    , 0     , , 31    4.1   , 4.13  , Cur   , 0.03  
> 32    , 7     , , 31    4.16  , 4.09  , New   , 0.07  
> 30    , 0     , , 31    5.31  , 4.09  , New   , 1.22  
> 30    , 7     , , 31    5.28  , 4.08  , New   , 1.2
> 
> Results For Icelake memchr-avx2
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    5.74  , 5.08  , New   , 0.66  
> 256   , 1     , , 64    5.16  , 4.93  , New   , 0.23  
> 2048  , 0     , , 64    4.86  , 4.69  , New   , 0.17  
> 256   , 2     , , 64    4.78  , 4.7   , New   , 0.08  
> 2048  , 0     , , 128   5.64  , 5.0   , New   , 0.64  
> 256   , 3     , , 64    4.64  , 4.59  , New   , 0.05  
> 2048  , 0     , , 256   9.07  , 9.17  , Cur   , 0.1   
> 256   , 4     , , 64    4.7   , 4.6   , New   , 0.1   
> 2048  , 0     , , 512   12.56 , 12.33 , New   , 0.23  
> 256   , 5     , , 64    4.72  , 4.61  , New   , 0.11  
> 2048  , 0     , , 1024  19.36 , 19.49 , Cur   , 0.13  
> 256   , 6     , , 64    4.82  , 4.69  , New   , 0.13  
> 2048  , 0     , , 2048  29.99 , 30.53 , Cur   , 0.54  
> 256   , 7     , , 64    4.9   , 4.85  , New   , 0.05  
> 192   , 1     , , 32    4.89  , 4.45  , New   , 0.44  
> 256   , 1     , , 32    4.93  , 4.44  , New   , 0.49  
> 512   , 1     , , 32    4.97  , 4.45  , New   , 0.52  
> 192   , 2     , , 64    5.04  , 4.65  , New   , 0.39  
> 512   , 2     , , 64    4.75  , 4.66  , New   , 0.09  
> 192   , 3     , , 96    5.14  , 4.66  , New   , 0.48  
> 256   , 3     , , 96    5.12  , 4.66  , New   , 0.46  
> 512   , 3     , , 96    5.13  , 4.62  , New   , 0.51  
> 192   , 4     , , 128   5.65  , 4.95  , New   , 0.7   
> 256   , 4     , , 128   5.63  , 4.95  , New   , 0.68  
> 512   , 4     , , 128   5.68  , 4.96  , New   , 0.72  
> 192   , 5     , , 160   6.1   , 5.84  , New   , 0.26  
> 256   , 5     , , 160   5.58  , 5.84  , Cur   , 0.26  
> 512   , 5     , , 160   7.95  , 7.74  , New   , 0.21  
> 192   , 6     , , 192   7.07  , 6.23  , New   , 0.84  
> 256   , 6     , , 192   6.34  , 6.09  , New   , 0.25  
> 512   , 6     , , 192   8.17  , 8.13  , New   , 0.04  
> 192   , 7     , , 224   7.06  , 6.23  , New   , 0.83  
> 256   , 7     , , 224   6.76  , 6.65  , New   , 0.11  
> 512   , 7     , , 224   8.29  , 8.08  , New   , 0.21  
> 2     , 0     , , 1     3.0   , 3.04  , Cur   , 0.04  
> 2     , 1     , , 1     3.06  , 3.07  , Cur   , 0.01  
> 0     , 0     , , 1     2.57  , 2.59  , Cur   , 0.02  
> 0     , 1     , , 1     2.6   , 2.61  , Cur   , 0.01  
> 3     , 0     , , 2     3.15  , 3.17  , Cur   , 0.02  
> 3     , 2     , , 2     3.19  , 3.21  , Cur   , 0.02  
> 1     , 0     , , 2     4.32  , 3.25  , New   , 1.07  
> 1     , 2     , , 2     4.36  , 3.31  , New   , 1.05  
> 4     , 0     , , 3     3.5   , 3.52  , Cur   , 0.02  
> 4     , 3     , , 3     3.52  , 3.54  , Cur   , 0.02  
> 2     , 0     , , 3     4.51  , 3.43  , New   , 1.08  
> 2     , 3     , , 3     4.56  , 3.47  , New   , 1.09  
> 5     , 0     , , 4     3.61  , 3.65  , Cur   , 0.04  
> 5     , 4     , , 4     3.63  , 3.67  , Cur   , 0.04  
> 3     , 0     , , 4     4.64  , 3.51  , New   , 1.13  
> 3     , 4     , , 4     4.7   , 3.51  , New   , 1.19  
> 6     , 0     , , 5     3.66  , 3.68  , Cur   , 0.02  
> 6     , 5     , , 5     3.69  , 3.65  , New   , 0.04  
> 4     , 0     , , 5     4.7   , 3.49  , New   , 1.21  
> 4     , 5     , , 5     4.58  , 3.48  , New   , 1.1   
> 7     , 0     , , 6     3.6   , 3.65  , Cur   , 0.05  
> 7     , 6     , , 6     3.59  , 3.64  , Cur   , 0.05  
> 5     , 0     , , 6     4.74  , 3.65  , New   , 1.09  
> 5     , 6     , , 6     4.73  , 3.64  , New   , 1.09  
> 8     , 0     , , 7     3.6   , 3.61  , Cur   , 0.01  
> 8     , 7     , , 7     3.6   , 3.61  , Cur   , 0.01  
> 6     , 0     , , 7     4.73  , 3.6   , New   , 1.13  
> 6     , 7     , , 7     4.73  , 3.62  , New   , 1.11  
> 9     , 0     , , 8     3.59  , 3.62  , Cur   , 0.03  
> 7     , 0     , , 8     4.72  , 3.64  , New   , 1.08  
> 10    , 0     , , 9     3.57  , 3.62  , Cur   , 0.05  
> 10    , 1     , , 9     3.56  , 3.61  , Cur   , 0.05  
> 8     , 0     , , 9     4.69  , 3.63  , New   , 1.06  
> 8     , 1     , , 9     4.71  , 3.61  , New   , 1.1   
> 11    , 0     , , 10    3.58  , 3.62  , Cur   , 0.04  
> 11    , 2     , , 10    3.59  , 3.63  , Cur   , 0.04  
> 9     , 0     , , 10    4.72  , 3.61  , New   , 1.11  
> 9     , 2     , , 10    4.7   , 3.61  , New   , 1.09  
> 12    , 0     , , 11    3.58  , 3.63  , Cur   , 0.05  
> 12    , 3     , , 11    3.58  , 3.62  , Cur   , 0.04  
> 10    , 0     , , 11    4.7   , 3.6   , New   , 1.1   
> 10    , 3     , , 11    4.73  , 3.64  , New   , 1.09  
> 13    , 0     , , 12    3.6   , 3.6   , Eq    , 0.0
> 13    , 4     , , 12    3.57  , 3.62  , Cur   , 0.05  
> 11    , 0     , , 12    4.73  , 3.62  , New   , 1.11  
> 11    , 4     , , 12    4.79  , 3.61  , New   , 1.18  
> 14    , 0     , , 13    3.61  , 3.62  , Cur   , 0.01  
> 14    , 5     , , 13    3.59  , 3.59  , Eq    , 0.0
> 12    , 0     , , 13    4.7   , 3.61  , New   , 1.09  
> 12    , 5     , , 13    4.75  , 3.58  , New   , 1.17  
> 15    , 0     , , 14    3.58  , 3.62  , Cur   , 0.04  
> 15    , 6     , , 14    3.59  , 3.62  , Cur   , 0.03  
> 13    , 0     , , 14    4.68  , 3.6   , New   , 1.08  
> 13    , 6     , , 14    4.68  , 3.63  , New   , 1.05  
> 16    , 0     , , 15    3.57  , 3.6   , Cur   , 0.03  
> 16    , 7     , , 15    3.55  , 3.59  , Cur   , 0.04  
> 14    , 0     , , 15    4.69  , 3.61  , New   , 1.08  
> 14    , 7     , , 15    4.69  , 3.61  , New   , 1.08  
> 17    , 0     , , 16    3.56  , 3.61  , Cur   , 0.05  
> 15    , 0     , , 16    4.71  , 3.58  , New   , 1.13  
> 18    , 0     , , 17    3.57  , 3.65  , Cur   , 0.08  
> 18    , 1     , , 17    3.58  , 3.59  , Cur   , 0.01  
> 16    , 0     , , 17    4.7   , 3.58  , New   , 1.12  
> 16    , 1     , , 17    4.68  , 3.59  , New   , 1.09  
> 19    , 0     , , 18    3.51  , 3.58  , Cur   , 0.07  
> 19    , 2     , , 18    3.55  , 3.58  , Cur   , 0.03  
> 17    , 0     , , 18    4.69  , 3.61  , New   , 1.08  
> 17    , 2     , , 18    4.68  , 3.61  , New   , 1.07  
> 20    , 0     , , 19    3.57  , 3.6   , Cur   , 0.03  
> 20    , 3     , , 19    3.59  , 3.59  , Eq    , 0.0
> 18    , 0     , , 19    4.68  , 3.59  , New   , 1.09  
> 18    , 3     , , 19    4.67  , 3.57  , New   , 1.1   
> 21    , 0     , , 20    3.61  , 3.58  , New   , 0.03  
> 21    , 4     , , 20    3.62  , 3.6   , New   , 0.02  
> 19    , 0     , , 20    4.74  , 3.57  , New   , 1.17  
> 19    , 4     , , 20    4.69  , 3.7   , New   , 0.99  
> 22    , 0     , , 21    3.57  , 3.64  , Cur   , 0.07  
> 22    , 5     , , 21    3.55  , 3.6   , Cur   , 0.05  
> 20    , 0     , , 21    4.72  , 3.55  , New   , 1.17  
> 20    , 5     , , 21    4.66  , 3.55  , New   , 1.11  
> 23    , 0     , , 22    3.56  , 3.56  , Eq    , 0.0
> 23    , 6     , , 22    3.54  , 3.56  , Cur   , 0.02  
> 21    , 0     , , 22    4.65  , 3.53  , New   , 1.12  
> 21    , 6     , , 22    4.62  , 3.56  , New   , 1.06  
> 24    , 0     , , 23    3.5   , 3.54  , Cur   , 0.04  
> 24    , 7     , , 23    3.52  , 3.53  , Cur   , 0.01  
> 22    , 0     , , 23    4.61  , 3.51  , New   , 1.1   
> 22    , 7     , , 23    4.6   , 3.51  , New   , 1.09  
> 25    , 0     , , 24    3.5   , 3.53  , Cur   , 0.03  
> 23    , 0     , , 24    4.54  , 3.5   , New   , 1.04  
> 26    , 0     , , 25    3.47  , 3.49  , Cur   , 0.02  
> 26    , 1     , , 25    3.46  , 3.51  , Cur   , 0.05  
> 24    , 0     , , 25    4.53  , 3.51  , New   , 1.02  
> 24    , 1     , , 25    4.51  , 3.51  , New   , 1.0   
> 27    , 0     , , 26    3.44  , 3.51  , Cur   , 0.07  
> 27    , 2     , , 26    3.51  , 3.52  , Cur   , 0.01  
> 25    , 0     , , 26    4.56  , 3.46  , New   , 1.1   
> 25    , 2     , , 26    4.55  , 3.47  , New   , 1.08  
> 28    , 0     , , 27    3.47  , 3.5   , Cur   , 0.03  
> 28    , 3     , , 27    3.48  , 3.47  , New   , 0.01  
> 26    , 0     , , 27    4.52  , 3.44  , New   , 1.08  
> 26    , 3     , , 27    4.55  , 3.46  , New   , 1.09  
> 29    , 0     , , 28    3.45  , 3.49  , Cur   , 0.04  
> 29    , 4     , , 28    3.5   , 3.5   , Eq    , 0.0
> 27    , 0     , , 28    4.56  , 3.49  , New   , 1.07  
> 27    , 4     , , 28    4.5   , 3.49  , New   , 1.01  
> 30    , 0     , , 29    3.44  , 3.48  , Cur   , 0.04  
> 30    , 5     , , 29    3.46  , 3.47  , Cur   , 0.01  
> 28    , 0     , , 29    4.49  , 3.43  , New   , 1.06  
> 28    , 5     , , 29    4.57  , 3.45  , New   , 1.12  
> 31    , 0     , , 30    3.48  , 3.48  , Eq    , 0.0
> 31    , 6     , , 30    3.46  , 3.49  , Cur   , 0.03  
> 29    , 0     , , 30    4.49  , 3.44  , New   , 1.05  
> 29    , 6     , , 30    4.53  , 3.44  , New   , 1.09  
> 32    , 0     , , 31    3.44  , 3.45  , Cur   , 0.01  
> 32    , 7     , , 31    3.46  , 3.51  , Cur   , 0.05  
> 30    , 0     , , 31    4.48  , 3.42  , New   , 1.06  
> 30    , 7     , , 31    4.48  , 3.44  , New   , 1.04
> 
> 
> Results For Skylake memchr-avx2
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    6.61  , 5.4   , New   , 1.21  
> 256   , 1     , , 64    6.52  , 5.68  , New   , 0.84  
> 2048  , 0     , , 64    6.03  , 5.47  , New   , 0.56  
> 256   , 2     , , 64    6.07  , 5.42  , New   , 0.65  
> 2048  , 0     , , 128   7.01  , 5.83  , New   , 1.18  
> 256   , 3     , , 64    6.24  , 5.68  , New   , 0.56  
> 2048  , 0     , , 256   11.03 , 9.86  , New   , 1.17  
> 256   , 4     , , 64    6.17  , 5.49  , New   , 0.68  
> 2048  , 0     , , 512   14.11 , 13.41 , New   , 0.7   
> 256   , 5     , , 64    6.03  , 5.45  , New   , 0.58  
> 2048  , 0     , , 1024  19.82 , 19.92 , Cur   , 0.1   
> 256   , 6     , , 64    6.14  , 5.7   , New   , 0.44  
> 2048  , 0     , , 2048  30.9  , 30.59 , New   , 0.31  
> 256   , 7     , , 64    6.05  , 5.64  , New   , 0.41  
> 192   , 1     , , 32    5.6   , 4.89  , New   , 0.71  
> 256   , 1     , , 32    5.59  , 5.07  , New   , 0.52  
> 512   , 1     , , 32    5.58  , 4.93  , New   , 0.65  
> 192   , 2     , , 64    6.14  , 5.46  , New   , 0.68  
> 512   , 2     , , 64    5.95  , 5.38  , New   , 0.57  
> 192   , 3     , , 96    6.6   , 5.74  , New   , 0.86  
> 256   , 3     , , 96    6.48  , 5.37  , New   , 1.11  
> 512   , 3     , , 96    6.56  , 5.44  , New   , 1.12  
> 192   , 4     , , 128   7.04  , 6.02  , New   , 1.02  
> 256   , 4     , , 128   6.96  , 5.89  , New   , 1.07  
> 512   , 4     , , 128   6.97  , 5.99  , New   , 0.98  
> 192   , 5     , , 160   8.49  , 7.07  , New   , 1.42  
> 256   , 5     , , 160   8.1   , 6.96  , New   , 1.14  
> 512   , 5     , , 160   10.48 , 9.14  , New   , 1.34  
> 192   , 6     , , 192   8.46  , 8.52  , Cur   , 0.06  
> 256   , 6     , , 192   8.53  , 7.58  , New   , 0.95  
> 512   , 6     , , 192   10.88 , 9.06  , New   , 1.82  
> 192   , 7     , , 224   8.59  , 8.35  , New   , 0.24  
> 256   , 7     , , 224   8.86  , 7.91  , New   , 0.95  
> 512   , 7     , , 224   10.89 , 8.98  , New   , 1.91  
> 2     , 0     , , 1     4.28  , 3.62  , New   , 0.66  
> 2     , 1     , , 1     4.32  , 3.75  , New   , 0.57  
> 0     , 0     , , 1     3.76  , 3.24  , New   , 0.52  
> 0     , 1     , , 1     3.7   , 3.19  , New   , 0.51  
> 3     , 0     , , 2     4.16  , 3.67  , New   , 0.49  
> 3     , 2     , , 2     4.21  , 3.68  , New   , 0.53  
> 1     , 0     , , 2     4.25  , 3.74  , New   , 0.51  
> 1     , 2     , , 2     4.4   , 3.82  , New   , 0.58  
> 4     , 0     , , 3     4.43  , 3.88  , New   , 0.55  
> 4     , 3     , , 3     4.34  , 3.8   , New   , 0.54  
> 2     , 0     , , 3     4.33  , 3.79  , New   , 0.54  
> 2     , 3     , , 3     4.37  , 3.84  , New   , 0.53  
> 5     , 0     , , 4     4.45  , 3.87  , New   , 0.58  
> 5     , 4     , , 4     4.41  , 3.84  , New   , 0.57  
> 3     , 0     , , 4     4.34  , 3.83  , New   , 0.51  
> 3     , 4     , , 4     4.35  , 3.82  , New   , 0.53  
> 6     , 0     , , 5     4.41  , 3.88  , New   , 0.53  
> 6     , 5     , , 5     4.41  , 3.88  , New   , 0.53  
> 4     , 0     , , 5     4.35  , 3.84  , New   , 0.51  
> 4     , 5     , , 5     4.37  , 3.85  , New   , 0.52  
> 7     , 0     , , 6     4.4   , 3.84  , New   , 0.56  
> 7     , 6     , , 6     4.39  , 3.83  , New   , 0.56  
> 5     , 0     , , 6     4.37  , 3.85  , New   , 0.52  
> 5     , 6     , , 6     4.4   , 3.86  , New   , 0.54  
> 8     , 0     , , 7     4.39  , 3.88  , New   , 0.51  
> 8     , 7     , , 7     4.4   , 3.83  , New   , 0.57  
> 6     , 0     , , 7     4.39  , 3.85  , New   , 0.54  
> 6     , 7     , , 7     4.38  , 3.87  , New   , 0.51  
> 9     , 0     , , 8     4.47  , 3.96  , New   , 0.51  
> 7     , 0     , , 8     4.37  , 3.85  , New   , 0.52  
> 10    , 0     , , 9     4.61  , 4.08  , New   , 0.53  
> 10    , 1     , , 9     4.61  , 4.09  , New   , 0.52  
> 8     , 0     , , 9     4.37  , 3.85  , New   , 0.52  
> 8     , 1     , , 9     4.37  , 3.85  , New   , 0.52  
> 11    , 0     , , 10    4.68  , 4.06  , New   , 0.62  
> 11    , 2     , , 10    4.56  , 4.1   , New   , 0.46  
> 9     , 0     , , 10    4.36  , 3.83  , New   , 0.53  
> 9     , 2     , , 10    4.37  , 3.83  , New   , 0.54  
> 12    , 0     , , 11    4.62  , 4.05  , New   , 0.57  
> 12    , 3     , , 11    4.63  , 4.06  , New   , 0.57  
> 10    , 0     , , 11    4.38  , 3.86  , New   , 0.52  
> 10    , 3     , , 11    4.41  , 3.86  , New   , 0.55  
> 13    , 0     , , 12    4.57  , 4.08  , New   , 0.49  
> 13    , 4     , , 12    4.59  , 4.12  , New   , 0.47  
> 11    , 0     , , 12    4.45  , 4.0   , New   , 0.45  
> 11    , 4     , , 12    4.51  , 4.04  , New   , 0.47  
> 14    , 0     , , 13    4.64  , 4.16  , New   , 0.48  
> 14    , 5     , , 13    4.67  , 4.1   , New   , 0.57  
> 12    , 0     , , 13    4.58  , 4.08  , New   , 0.5   
> 12    , 5     , , 13    4.6   , 4.1   , New   , 0.5   
> 15    , 0     , , 14    4.61  , 4.05  , New   , 0.56  
> 15    , 6     , , 14    4.59  , 4.06  , New   , 0.53  
> 13    , 0     , , 14    4.57  , 4.06  , New   , 0.51  
> 13    , 6     , , 14    4.57  , 4.05  , New   , 0.52  
> 16    , 0     , , 15    4.62  , 4.05  , New   , 0.57  
> 16    , 7     , , 15    4.63  , 4.06  , New   , 0.57  
> 14    , 0     , , 15    4.61  , 4.06  , New   , 0.55  
> 14    , 7     , , 15    4.59  , 4.05  , New   , 0.54  
> 17    , 0     , , 16    4.58  , 4.08  , New   , 0.5   
> 15    , 0     , , 16    4.64  , 4.06  , New   , 0.58  
> 18    , 0     , , 17    4.56  , 4.17  , New   , 0.39  
> 18    , 1     , , 17    4.59  , 4.09  , New   , 0.5   
> 16    , 0     , , 17    4.59  , 4.07  , New   , 0.52  
> 16    , 1     , , 17    4.58  , 4.04  , New   , 0.54  
> 19    , 0     , , 18    4.61  , 4.05  , New   , 0.56  
> 19    , 2     , , 18    4.6   , 4.08  , New   , 0.52  
> 17    , 0     , , 18    4.64  , 4.11  , New   , 0.53  
> 17    , 2     , , 18    4.56  , 4.13  , New   , 0.43  
> 20    , 0     , , 19    4.77  , 4.3   , New   , 0.47  
> 20    , 3     , , 19    4.6   , 4.14  , New   , 0.46  
> 18    , 0     , , 19    4.72  , 4.02  , New   , 0.7   
> 18    , 3     , , 19    4.53  , 4.01  , New   , 0.52  
> 21    , 0     , , 20    4.66  , 4.26  , New   , 0.4   
> 21    , 4     , , 20    4.74  , 4.07  , New   , 0.67  
> 19    , 0     , , 20    4.62  , 4.12  , New   , 0.5   
> 19    , 4     , , 20    4.57  , 4.04  , New   , 0.53  
> 22    , 0     , , 21    4.61  , 4.13  , New   , 0.48  
> 22    , 5     , , 21    4.64  , 4.08  , New   , 0.56  
> 20    , 0     , , 21    4.49  , 4.01  , New   , 0.48  
> 20    , 5     , , 21    4.58  , 4.06  , New   , 0.52  
> 23    , 0     , , 22    4.62  , 4.13  , New   , 0.49  
> 23    , 6     , , 22    4.72  , 4.27  , New   , 0.45  
> 21    , 0     , , 22    4.65  , 3.97  , New   , 0.68  
> 21    , 6     , , 22    4.5   , 4.02  , New   , 0.48  
> 24    , 0     , , 23    4.78  , 4.07  , New   , 0.71  
> 24    , 7     , , 23    4.67  , 4.23  , New   , 0.44  
> 22    , 0     , , 23    4.49  , 3.99  , New   , 0.5   
> 22    , 7     , , 23    4.56  , 4.03  , New   , 0.53  
> 25    , 0     , , 24    4.6   , 4.15  , New   , 0.45  
> 23    , 0     , , 24    4.57  , 4.06  , New   , 0.51  
> 26    , 0     , , 25    4.54  , 4.14  , New   , 0.4   
> 26    , 1     , , 25    4.72  , 4.1   , New   , 0.62  
> 24    , 0     , , 25    4.52  , 4.13  , New   , 0.39  
> 24    , 1     , , 25    4.55  , 4.0   , New   , 0.55  
> 27    , 0     , , 26    4.51  , 4.06  , New   , 0.45  
> 27    , 2     , , 26    4.53  , 4.16  , New   , 0.37  
> 25    , 0     , , 26    4.59  , 4.09  , New   , 0.5   
> 25    , 2     , , 26    4.55  , 4.01  , New   , 0.54  
> 28    , 0     , , 27    4.59  , 3.99  , New   , 0.6   
> 28    , 3     , , 27    4.57  , 3.95  , New   , 0.62  
> 26    , 0     , , 27    4.55  , 4.15  , New   , 0.4   
> 26    , 3     , , 27    4.57  , 3.99  , New   , 0.58  
> 29    , 0     , , 28    4.41  , 4.03  , New   , 0.38  
> 29    , 4     , , 28    4.59  , 4.02  , New   , 0.57  
> 27    , 0     , , 28    4.63  , 4.08  , New   , 0.55  
> 27    , 4     , , 28    4.44  , 4.02  , New   , 0.42  
> 30    , 0     , , 29    4.53  , 3.93  , New   , 0.6   
> 30    , 5     , , 29    4.55  , 3.88  , New   , 0.67  
> 28    , 0     , , 29    4.49  , 3.9   , New   , 0.59  
> 28    , 5     , , 29    4.44  , 3.94  , New   , 0.5   
> 31    , 0     , , 30    4.41  , 3.85  , New   , 0.56  
> 31    , 6     , , 30    4.48  , 3.86  , New   , 0.62  
> 29    , 0     , , 30    4.55  , 3.94  , New   , 0.61  
> 29    , 6     , , 30    4.32  , 3.95  , New   , 0.37  
> 32    , 0     , , 31    4.36  , 3.91  , New   , 0.45  
> 32    , 7     , , 31    4.37  , 3.89  , New   , 0.48  
> 30    , 0     , , 31    4.65  , 3.9   , New   , 0.75  
> 30    , 7     , , 31    4.42  , 3.93  , New   , 0.49  
> 
>  sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
>  1 file changed, 349 insertions(+), 231 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 6dd5d67b90..65c16ef8a4 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -26,14 +26,28 @@
>  
>  # ifdef USE_AS_WMEMCHR
>  #  define VPBROADCAST	vpbroadcastd
> -#  define VPCMP		vpcmpd
> -#  define SHIFT_REG	r8d
> +#  define VPMINU	vpminud
> +#  define VPCMP	vpcmpd
> +#  define VPCMPEQ	vpcmpeqd
> +#  define CHAR_SIZE	4
>  # else
>  #  define VPBROADCAST	vpbroadcastb
> -#  define VPCMP		vpcmpb
> -#  define SHIFT_REG	ecx
> +#  define VPMINU	vpminub
> +#  define VPCMP	vpcmpb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define CHAR_SIZE	1
>  # endif
>  
> +# ifdef USE_AS_RAWMEMCHR
> +#  define RAW_PTR_REG	rcx
> +#  define ALGN_PTR_REG	rdi
> +# else
> +#  define RAW_PTR_REG	rdi
> +#  define ALGN_PTR_REG	rcx
> +# endif
> +
> +#define XZERO		xmm23

Add a space before define.  Rename XZERO to XMMZERO.

> +#define YZERO		ymm23

Add a space before define.  Rename YZERO to YMMZERO.

>  # define XMMMATCH	xmm16
>  # define YMMMATCH	ymm16
>  # define YMM1		ymm17
> @@ -44,18 +58,16 @@
>  # define YMM6		ymm22
>  
>  # define VEC_SIZE 32
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +# define PAGE_SIZE 4096
>  
>  	.section .text.evex,"ax",@progbits
> -ENTRY (MEMCHR)
> +ENTRY(MEMCHR)

No need for this change.

>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check for zero length.  */
>  	test	%RDX_LP, %RDX_LP
>  	jz	L(zero)
> -# endif
> -	movl	%edi, %ecx
> -# ifdef USE_AS_WMEMCHR
> -	shl	$2, %RDX_LP
> -# else
> +
>  #  ifdef __ILP32__
>  	/* Clear the upper 32 bits.  */
>  	movl	%edx, %edx
> @@ -63,319 +75,425 @@ ENTRY (MEMCHR)
>  # endif
>  	/* Broadcast CHAR to YMMMATCH.  */
>  	VPBROADCAST %esi, %YMMMATCH
> -	/* Check if we may cross page boundary with one vector load.  */
> -	andl	$(2 * VEC_SIZE - 1), %ecx
> -	cmpl	$VEC_SIZE, %ecx
> -	ja	L(cros_page_boundary)
> +	/* Check if we may cross page boundary with one
> +	   vector load.  */

Fit comments to 72 columns.

> +	movl	%edi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(cross_page_boundary)
>  
>  	/* Check the first VEC_SIZE bytes.  */
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -
> +	VPCMP	$0, (%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
>  # ifndef USE_AS_RAWMEMCHR
> -	jnz	L(first_vec_x0_check)
> -	/* Adjust length and check the end of data.  */
> -	subq	$VEC_SIZE, %rdx
> -	jbe	L(zero)
> +	/* If length < CHAR_PER_VEC handle special.  */
> +	cmpq	$CHAR_PER_VEC, %rdx
> +	jbe	L(first_vec_x0)
> +# endif
> +	testl	%eax, %eax
> +	jz	L(aligned_more)
> +	tzcntl	%eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply wchar_t count by CHAR_SIZE to get the
> +	   number of bytes.  */

Fit comments to 72 columns.

> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -	jnz	L(first_vec_x0)
> +	addq	%rdi, %rax
>  # endif
> -
> -	/* Align data for aligned loads in the loop.  */
> -	addq	$VEC_SIZE, %rdi
> -	andl	$(VEC_SIZE - 1), %ecx
> -	andq	$-VEC_SIZE, %rdi
> +	ret
>  
>  # ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> -	addq	%rcx, %rdx
> -
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> -	jmp	L(more_4x_vec)
> +L(zero):
> +	xorl	%eax, %eax
> +	ret
>  
> +	.p2align 5
> +L(first_vec_x0):
> +	/* Check if first match was before length.  */
> +	tzcntl	%eax, %eax
> +	xorl	%ecx, %ecx
> +	cmpl	%eax, %edx
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +	cmovle	%rcx, %rax
> +	ret
> +# else
> +	/* NB: first_vec_x0 is 17 bytes which will leave
> +	   cross_page_boundary (which is relatively cold) close
> +	   enough to ideal alignment. So only realign
> +	   L(cross_page_boundary) if rawmemchr.  */

Fit comments to 72 columns.

>  	.p2align 4
> -L(cros_page_boundary):
> -	andl	$(VEC_SIZE - 1), %ecx
> +# endif
> +L(cross_page_boundary):
> +	/* Save pointer before aligning as its original
> +	   value is necessary for computing the return address if byte is
> +	   found or adjusting length if it is not and this is
> +	   memchr.  */

Fit comments to 72 columns.

> +	movq	%rdi, %rcx
> +	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
> +	   for memchr and rdi for rawmemchr.  */

Fit comments to 72 columns.

> +	andq	$-VEC_SIZE, %ALGN_PTR_REG
> +	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> +	kmovd	%k0, %r8d
>  # ifdef USE_AS_WMEMCHR
> -	/* NB: Divide shift count by 4 since each bit in K1 represent 4
> -	   bytes.  */
> -	movl	%ecx, %SHIFT_REG
> -	sarl	$2, %SHIFT_REG
> +	/* NB: Divide shift count by 4 since each bit in
> +	   K0 represents 4 bytes.  */
> +	sarl	$2, %eax
> +# endif
> +# ifndef USE_AS_RAWMEMCHR
> +	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
> +	subl	%eax, %esi
>  # endif
> -	andq	$-VEC_SIZE, %rdi
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	/* Remove the leading bytes.  */
> -	sarxl	%SHIFT_REG, %eax, %eax
> -	testl	%eax, %eax
> -	jz	L(aligned_more)
> -	tzcntl	%eax, %eax
>  # ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +	andl	$(CHAR_PER_VEC - 1), %eax
>  # endif
> +	/* Remove the leading bytes.  */
> +	sarxl	%eax, %r8d, %eax
>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> +	cmpq	%rsi, %rdx
> +	jbe	L(first_vec_x0)
> +# endif
> +	testl	%eax, %eax
> +	jz	L(cross_page_continue)
> +	tzcntl	%eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply wchar_t count by CHAR_SIZE to get the
> +	   number of bytes.  */
> +	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +# else
> +	addq	%RAW_PTR_REG, %rax
>  # endif
> -	addq	%rdi, %rax
> -	addq	%rcx, %rax
>  	ret
>  
>  	.p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> -	   overflow.  */
> -	negq	%rcx
> -	addq	$VEC_SIZE, %rcx
> +L(first_vec_x1):
> +	tzcntl	%eax, %eax
> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
>  
> -	/* Check the end of data.  */
> -	subq	%rcx, %rdx
> -	jbe	L(zero)
> -# endif
> +	.p2align 4
> +L(first_vec_x2):
> +	tzcntl	%eax, %eax
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
>  
> -	addq	$VEC_SIZE, %rdi
> +	.p2align 4
> +L(first_vec_x3):
> +	tzcntl	%eax, %eax
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
> +
> +	.p2align 4
> +L(first_vec_x4):
> +	tzcntl	%eax, %eax
> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
> +
> +	.p2align 5
> +L(aligned_more):
> +	/* Check the first 4 * VEC_SIZE.  Only one
> +	   VEC_SIZE at a time since data is only aligned to
> +	   VEC_SIZE.  */

Fit comments to 72 columns.

>  
>  # ifndef USE_AS_RAWMEMCHR
> -	subq	$(VEC_SIZE * 4), %rdx
> +	/* Align data to VEC_SIZE.  */
> +L(cross_page_continue):
> +	xorl	%ecx, %ecx
> +	subl	%edi, %ecx
> +	andq	$-VEC_SIZE, %rdi
> +	/* esi is for adjusting length to see if near the
> +	   end.  */

Fit comments to 72 columns.

> +	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
> +#  ifdef USE_AS_WMEMCHR
> +	/* NB: Divide bytes by 4 to get the wchar_t
> +	   count.  */
> +	sarl	$2, %esi
> +#  endif
> +# else
> +	andq	$-VEC_SIZE, %rdi
> +L(cross_page_continue):
> +# endif
> +	/* Load first VEC regardless.  */
> +	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +	/* Adjust length. If near end handle specially.
> +	 */

Fit comments to 72 columns.

> +	subq	%rsi, %rdx
>  	jbe	L(last_4x_vec_or_less)
>  # endif
> -
> -L(more_4x_vec):
> -	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -	   since data is only aligned to VEC_SIZE.  */
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x1)
>  
> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x2)
>  
> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x3)
>  
> -	addq	$(VEC_SIZE * 4), %rdi
> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x4)
> +
>  
>  # ifndef USE_AS_RAWMEMCHR
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> +	/* Check if at last CHAR_PER_VEC * 4 length.  */
> +	subq	$(CHAR_PER_VEC * 4), %rdx
> +	jbe	L(last_4x_vec_or_less_cmpeq)
> +	addq	$VEC_SIZE, %rdi
>  
> -	/* Align data to 4 * VEC_SIZE.  */
> -	movq	%rdi, %rcx
> -	andl	$(4 * VEC_SIZE - 1), %ecx
> +	/* Align data to VEC_SIZE * 4 for the loop and
> +	   readjust length.  */

Fit comments to 72 columns.

> +#  ifdef USE_AS_WMEMCHR
> +	movl	%edi, %ecx
>  	andq	$-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> +	andl	$(VEC_SIZE * 4 - 1), %ecx
> +	/* NB: Divide bytes by 4 to get the wchar_t
> +	   count.  */

Fit comments to 72 columns.

> +	sarl	$2, %ecx
>  	addq	%rcx, %rdx
> +#  else
> +	addq	%rdi, %rdx
> +	andq	$-(4 * VEC_SIZE), %rdi
> +	subq	%rdi, %rdx
> +#  endif
> +# else
> +	addq	$VEC_SIZE, %rdi
> +	andq	$-(4 * VEC_SIZE), %rdi
>  # endif
>  
> +	vpxorq	%XZERO, %XZERO, %XZERO
> +
> +	/* Compare 4 * VEC at a time forward.  */
>  	.p2align 4
>  L(loop_4x_vec):
> -	/* Compare 4 * VEC at a time forward.  */
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> -	kord	%k1, %k2, %k5
> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> -
> -	kord	%k3, %k4, %k6
> -	kortestd %k5, %k6
> -	jnz	L(4x_vec_end)
> -
> -	addq	$(VEC_SIZE * 4), %rdi
> -
> +	/* It would be possible to save some instructions
> +	   using 4x VPCMP but bottleneck on port 5 makes it not worth
> +	   it.  */

Fit comments to 72 columns.

> +	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> +	/* xor will set bytes matching esi to zero.  */
> +	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> +	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> +	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> +	/* Reduce VEC2 / VEC3 with min and VEC1 with zero
> +	   mask.  */

Fit comments to 72 columns.

> +	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
> +	VPCMP	$0, %YMM3, %YZERO, %k2
>  # ifdef USE_AS_RAWMEMCHR
> -	jmp	L(loop_4x_vec)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	kortestd %k2, %k3
> +	jz	L(loop_4x_vec)
>  # else
> -	subq	$(VEC_SIZE * 4), %rdx
> -	ja	L(loop_4x_vec)
> +	kortestd %k2, %k3
> +	jnz	L(loop_4x_vec_end)
>  
> -L(last_4x_vec_or_less):
> -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -	addl	$(VEC_SIZE * 2), %edx
> -	jle	L(last_2x_vec)
> +	subq	$-(VEC_SIZE * 4), %rdi
>  
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> +	subq	$(CHAR_PER_VEC * 4), %rdx
> +	ja	L(loop_4x_vec)
>  
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	/* Fall through into less than 4 remaining
> +	   vectors of length case.  */

Fit comments to 72 columns.

> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	addq	$(VEC_SIZE * 3), %rdi
> +	.p2align 4
> +L(last_4x_vec_or_less):
> +	/* Check if first VEC contained match.  */
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> +	jnz	L(first_vec_x1_check)
>  
> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> +	/* If remaining length > CHAR_PER_VEC * 2.  */
> +	addl	$(CHAR_PER_VEC * 2), %edx
> +	jg	L(last_4x_vec)
>  
> -	jnz	L(first_vec_x2_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> +L(last_2x_vec):
> +	/* If remaining length < CHAR_PER_VEC.  */
> +	addl	$CHAR_PER_VEC, %edx
> +	jle	L(zero_end)
> +
> +	/* Check VEC2 and compare any match with
> +	   remaining length.  */

Fit comments to 72 columns.

> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	tzcntl	%eax, %eax
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end):
> +	ret
>  
> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
>  
> -	jnz	L(first_vec_x3_check)
> +	.p2align 4
> +L(first_vec_x1_check):
> +	tzcntl	%eax, %eax
> +	/* Adjust length.  */
> +	subl	$-(CHAR_PER_VEC * 4), %edx
> +	/* Check if match within remaining length.  */
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
> +L(set_zero_end):
>  	xorl	%eax, %eax
>  	ret
>  
>  	.p2align 4
> -L(last_2x_vec):
> -	addl	$(VEC_SIZE * 2), %edx
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> +L(loop_4x_vec_end):
> +# endif
> +	/* rawmemchr will fall through into this if match
> +	   was found in loop.  */

Fit comments to 72 columns.

> +
> +	/* k1 holds the NOT (inverse) of the matches with VEC1.  */
>  	kmovd	%k1, %eax
> -	testl	%eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	subl	$((1 << CHAR_PER_VEC) - 1), %eax
> +# else
> +	incl	%eax
> +# endif
> +	jnz	L(last_vec_x1_return)
>  
> -	jnz	L(first_vec_x0_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> +	VPCMP	$0, %YMM2, %YZERO, %k0
> +	kmovd	%k0, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x2_return)
>  
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	kmovd	%k2, %eax
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1_check)
> -	xorl	%eax, %eax
> -	ret
> +	jnz	L(last_vec_x3_return)
>  
> -	.p2align 4
> -L(first_vec_x0_check):
> +	kmovd	%k3, %eax
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	%rdi, %rax
>  	ret
>  
>  	.p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_x1_return):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> -# endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$VEC_SIZE, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +#  ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
>  	addq	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(first_vec_x2_check):
> -	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +#  endif
> +# else
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 2), %rax
> -	addq	%rdi, %rax
>  	ret
>  
>  	.p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */
> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */
> +	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 3), %rax
> -	addq	%rdi, %rax
>  	ret
>  
>  	.p2align 4
> -L(zero):
> -	xorl	%eax, %eax
> -	ret
> -# endif
> -
> -	.p2align 4
> -L(first_vec_x0):
> +L(last_vec_x3_return):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	(%rdi, %rax, 4), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -	addq	%rdi, %rax
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(last_4x_vec_or_less_cmpeq):
> +	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	/* Check first VEC regardless.  */
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x1_check)
> +
> +	/* If remaining length <= CHAR_PER_VEC * 2.  */
> +	addl	$(CHAR_PER_VEC * 2), %edx
> +	jle	L(last_2x_vec)
> +
>  	.p2align 4
> -L(first_vec_x1):
> +L(last_4x_vec):
> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x2)
> +
> +
> +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	/* Create mask for possible matches within
> +	   remaining length.  */

Fit comments to 72 columns.

> +#  ifdef USE_AS_WMEMCHR
> +	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> +	bzhil	%edx, %ecx, %ecx
> +#  else
> +	movq	$-1, %rcx
> +	bzhiq	%rdx, %rcx, %rcx
> +#  endif
> +	/* Test matches in data against length match.  */
> +	andl	%ecx, %eax
> +	jnz	L(last_vec_x3)
> +
> +	/* if remaining length <= CHAR_PER_VEC * 3 (Note
> +	   this is after remaining length was found to be >
> +	   CHAR_PER_VEC * 2).  */

Fit comments to 72 columns.

> +	subl	$CHAR_PER_VEC, %edx
> +	jbe	L(zero_end2)
> +
> +
> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	/* Shift remaining length mask for last VEC.  */
> +#  ifdef USE_AS_WMEMCHR
> +	shrl	$CHAR_PER_VEC, %ecx
> +#  else
> +	shrq	$CHAR_PER_VEC, %rcx
> +#  endif
> +	andl	%ecx, %eax
> +	jz	L(zero_end2)
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> -	addq	$VEC_SIZE, %rax
> -	addq	%rdi, %rax
> -# endif
> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end2):
>  	ret
>  
> -	.p2align 4
> -L(first_vec_x2):
> +L(last_vec_x2):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> -# else
> -	addq	$(VEC_SIZE * 2), %rax
> -	addq	%rdi, %rax
> -# endif
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  	ret
>  
>  	.p2align 4
> -L(4x_vec_end):
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -	kmovd	%k2, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> -	kmovd	%k3, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x2)
> -	kmovd	%k4, %eax
> -	testl	%eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> -# else
> -	addq	$(VEC_SIZE * 3), %rax
> -	addq	%rdi, %rax
> -# endif
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
>  	ret
> +# endif
>  
> -END (MEMCHR)
> +END(MEMCHR)

No need for this change.

>  #endif
> -- 
> 2.29.2
> 

Thanks.

H.J.
Noah Goldstein May 3, 2021, 8:06 p.m. UTC | #2
On Mon, May 3, 2021 at 2:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:44:38AM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-evex.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > saving some ALU in the alignment process, and most importantly
> > increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> > test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > Tests were run on the following CPUs:
> >
> > Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> >
> > Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
> >
> > Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
> >
> > All times are the geometric mean of N=20. The unit of time is
> > seconds.
> >
> > "Cur" refers to the current implementation
> > "New" refers to this patch's implementation
> >
> > Note: The numbers for size = [1, 32] are highly dependent on function
> > alignment. That being said the new implementation which uses cmovcc
> > instead of a branch (mostly for the reason of high variance with
> > different alignments) for the [1, 32] case is far more consistent and
> > performs about as well (and should only be a bigger improvement in
> > cases where the sizes / position are not 100% predictable).
> >
> > For memchr-evex the numbers are a near universal improvement. The case
> > where the current implementation is better is for size = 0, and for
> > size = [1, 32] with pos < size the two implementations are about the
> > same. For size = [1, 32] with pos > size, for medium range sizes, and
> > large size, however, the new implementation is faster.
> >
> > Results For Tigerlake memchr-evex
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    5.58  , 5.22  , New   , 0.36
> > 256   , 1     , , 64    5.22  , 4.93  , New   , 0.29
> > 2048  , 0     , , 64    5.22  , 4.89  , New   , 0.33
> > 256   , 2     , , 64    5.14  , 4.81  , New   , 0.33
> > 2048  , 0     , , 128   6.3   , 5.67  , New   , 0.63
> > 256   , 3     , , 64    5.22  , 4.9   , New   , 0.32
> > 2048  , 0     , , 256   11.07 , 10.92 , New   , 0.15
> > 256   , 4     , , 64    5.16  , 4.86  , New   , 0.3
> > 2048  , 0     , , 512   15.66 , 14.81 , New   , 0.85
> > 256   , 5     , , 64    5.15  , 4.84  , New   , 0.31
> > 2048  , 0     , , 1024  25.7  , 23.02 , New   , 2.68
> > 256   , 6     , , 64    5.12  , 4.89  , New   , 0.23
> > 2048  , 0     , , 2048  42.34 , 37.71 , New   , 4.63
> > 256   , 7     , , 64    5.03  , 4.62  , New   , 0.41
> > 192   , 1     , , 32    4.96  , 4.28  , New   , 0.68
> > 256   , 1     , , 32    4.95  , 4.28  , New   , 0.67
> > 512   , 1     , , 32    4.94  , 4.29  , New   , 0.65
> > 192   , 2     , , 64    5.1   , 4.8   , New   , 0.3
> > 512   , 2     , , 64    5.12  , 4.72  , New   , 0.4
> > 192   , 3     , , 96    5.54  , 5.12  , New   , 0.42
> > 256   , 3     , , 96    5.52  , 5.15  , New   , 0.37
> > 512   , 3     , , 96    5.51  , 5.16  , New   , 0.35
> > 192   , 4     , , 128   6.1   , 5.53  , New   , 0.57
> > 256   , 4     , , 128   6.09  , 5.49  , New   , 0.6
> > 512   , 4     , , 128   6.08  , 5.48  , New   , 0.6
> > 192   , 5     , , 160   7.42  , 6.71  , New   , 0.71
> > 256   , 5     , , 160   6.86  , 6.71  , New   , 0.15
> > 512   , 5     , , 160   9.28  , 8.68  , New   , 0.6
> > 192   , 6     , , 192   7.94  , 7.47  , New   , 0.47
> > 256   , 6     , , 192   7.62  , 7.17  , New   , 0.45
> > 512   , 6     , , 192   9.2   , 9.16  , New   , 0.04
> > 192   , 7     , , 224   8.02  , 7.43  , New   , 0.59
> > 256   , 7     , , 224   8.34  , 7.85  , New   , 0.49
> > 512   , 7     , , 224   9.89  , 9.16  , New   , 0.73
> > 2     , 0     , , 1     3.0   , 3.0   , Eq    , 0.0
> > 2     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
> > 0     , 0     , , 1     3.01  , 3.6   , Cur   , 0.59
> > 0     , 1     , , 1     3.01  , 3.6   , Cur   , 0.59
> > 3     , 0     , , 2     3.0   , 3.0   , Eq    , 0.0
> > 3     , 2     , , 2     3.0   , 3.0   , Eq    , 0.0
> > 1     , 0     , , 2     3.6   , 3.0   , New   , 0.6
> > 1     , 2     , , 2     3.6   , 3.0   , New   , 0.6
> > 4     , 0     , , 3     3.01  , 3.01  , Eq    , 0.0
> > 4     , 3     , , 3     3.01  , 3.01  , Eq    , 0.0
> > 2     , 0     , , 3     3.62  , 3.02  , New   , 0.6
> > 2     , 3     , , 3     3.62  , 3.03  , New   , 0.59
> > 5     , 0     , , 4     3.02  , 3.03  , Cur   , 0.01
> > 5     , 4     , , 4     3.02  , 3.02  , Eq    , 0.0
> > 3     , 0     , , 4     3.63  , 3.02  , New   , 0.61
> > 3     , 4     , , 4     3.63  , 3.04  , New   , 0.59
> > 6     , 0     , , 5     3.05  , 3.04  , New   , 0.01
> > 6     , 5     , , 5     3.02  , 3.02  , Eq    , 0.0
> > 4     , 0     , , 5     3.63  , 3.02  , New   , 0.61
> > 4     , 5     , , 5     3.64  , 3.03  , New   , 0.61
> > 7     , 0     , , 6     3.03  , 3.03  , Eq    , 0.0
> > 7     , 6     , , 6     3.02  , 3.02  , Eq    , 0.0
> > 5     , 0     , , 6     3.64  , 3.01  , New   , 0.63
> > 5     , 6     , , 6     3.64  , 3.03  , New   , 0.61
> > 8     , 0     , , 7     3.03  , 3.04  , Cur   , 0.01
> > 8     , 7     , , 7     3.04  , 3.04  , Eq    , 0.0
> > 6     , 0     , , 7     3.67  , 3.04  , New   , 0.63
> > 6     , 7     , , 7     3.65  , 3.05  , New   , 0.6
> > 9     , 0     , , 8     3.05  , 3.05  , Eq    , 0.0
> > 7     , 0     , , 8     3.67  , 3.05  , New   , 0.62
> > 10    , 0     , , 9     3.06  , 3.06  , Eq    , 0.0
> > 10    , 1     , , 9     3.06  , 3.06  , Eq    , 0.0
> > 8     , 0     , , 9     3.67  , 3.06  , New   , 0.61
> > 8     , 1     , , 9     3.67  , 3.06  , New   , 0.61
> > 11    , 0     , , 10    3.06  , 3.06  , Eq    , 0.0
> > 11    , 2     , , 10    3.07  , 3.06  , New   , 0.01
> > 9     , 0     , , 10    3.67  , 3.05  , New   , 0.62
> > 9     , 2     , , 10    3.67  , 3.06  , New   , 0.61
> > 12    , 0     , , 11    3.06  , 3.06  , Eq    , 0.0
> > 12    , 3     , , 11    3.06  , 3.06  , Eq    , 0.0
> > 10    , 0     , , 11    3.67  , 3.06  , New   , 0.61
> > 10    , 3     , , 11    3.67  , 3.06  , New   , 0.61
> > 13    , 0     , , 12    3.06  , 3.07  , Cur   , 0.01
> > 13    , 4     , , 12    3.06  , 3.07  , Cur   , 0.01
> > 11    , 0     , , 12    3.67  , 3.11  , New   , 0.56
> > 11    , 4     , , 12    3.68  , 3.12  , New   , 0.56
> > 14    , 0     , , 13    3.07  , 3.1   , Cur   , 0.03
> > 14    , 5     , , 13    3.06  , 3.07  , Cur   , 0.01
> > 12    , 0     , , 13    3.67  , 3.07  , New   , 0.6
> > 12    , 5     , , 13    3.67  , 3.08  , New   , 0.59
> > 15    , 0     , , 14    3.06  , 3.06  , Eq    , 0.0
> > 15    , 6     , , 14    3.07  , 3.06  , New   , 0.01
> > 13    , 0     , , 14    3.67  , 3.06  , New   , 0.61
> > 13    , 6     , , 14    3.68  , 3.06  , New   , 0.62
> > 16    , 0     , , 15    3.06  , 3.06  , Eq    , 0.0
> > 16    , 7     , , 15    3.06  , 3.05  , New   , 0.01
> > 14    , 0     , , 15    3.68  , 3.06  , New   , 0.62
> > 14    , 7     , , 15    3.67  , 3.06  , New   , 0.61
> > 17    , 0     , , 16    3.07  , 3.06  , New   , 0.01
> > 15    , 0     , , 16    3.68  , 3.06  , New   , 0.62
> > 18    , 0     , , 17    3.06  , 3.06  , Eq    , 0.0
> > 18    , 1     , , 17    3.06  , 3.06  , Eq    , 0.0
> > 16    , 0     , , 17    3.67  , 3.06  , New   , 0.61
> > 16    , 1     , , 17    3.67  , 3.05  , New   , 0.62
> > 19    , 0     , , 18    3.07  , 3.06  , New   , 0.01
> > 19    , 2     , , 18    3.06  , 3.06  , Eq    , 0.0
> > 17    , 0     , , 18    3.68  , 3.08  , New   , 0.6
> > 17    , 2     , , 18    3.68  , 3.06  , New   , 0.62
> > 20    , 0     , , 19    3.06  , 3.06  , Eq    , 0.0
> > 20    , 3     , , 19    3.06  , 3.06  , Eq    , 0.0
> > 18    , 0     , , 19    3.68  , 3.06  , New   , 0.62
> > 18    , 3     , , 19    3.68  , 3.06  , New   , 0.62
> > 21    , 0     , , 20    3.06  , 3.06  , Eq    , 0.0
> > 21    , 4     , , 20    3.06  , 3.06  , Eq    , 0.0
> > 19    , 0     , , 20    3.67  , 3.06  , New   , 0.61
> > 19    , 4     , , 20    3.67  , 3.06  , New   , 0.61
> > 22    , 0     , , 21    3.06  , 3.06  , Eq    , 0.0
> > 22    , 5     , , 21    3.06  , 3.06  , Eq    , 0.0
> > 20    , 0     , , 21    3.67  , 3.05  , New   , 0.62
> > 20    , 5     , , 21    3.68  , 3.06  , New   , 0.62
> > 23    , 0     , , 22    3.07  , 3.06  , New   , 0.01
> > 23    , 6     , , 22    3.06  , 3.06  , Eq    , 0.0
> > 21    , 0     , , 22    3.68  , 3.07  , New   , 0.61
> > 21    , 6     , , 22    3.67  , 3.06  , New   , 0.61
> > 24    , 0     , , 23    3.19  , 3.06  , New   , 0.13
> > 24    , 7     , , 23    3.08  , 3.06  , New   , 0.02
> > 22    , 0     , , 23    3.69  , 3.06  , New   , 0.63
> > 22    , 7     , , 23    3.68  , 3.06  , New   , 0.62
> > 25    , 0     , , 24    3.07  , 3.06  , New   , 0.01
> > 23    , 0     , , 24    3.68  , 3.06  , New   , 0.62
> > 26    , 0     , , 25    3.06  , 3.05  , New   , 0.01
> > 26    , 1     , , 25    3.07  , 3.06  , New   , 0.01
> > 24    , 0     , , 25    3.67  , 3.05  , New   , 0.62
> > 24    , 1     , , 25    3.68  , 3.06  , New   , 0.62
> > 27    , 0     , , 26    3.12  , 3.06  , New   , 0.06
> > 27    , 2     , , 26    3.08  , 3.06  , New   , 0.02
> > 25    , 0     , , 26    3.69  , 3.06  , New   , 0.63
> > 25    , 2     , , 26    3.67  , 3.06  , New   , 0.61
> > 28    , 0     , , 27    3.06  , 3.06  , Eq    , 0.0
> > 28    , 3     , , 27    3.06  , 3.06  , Eq    , 0.0
> > 26    , 0     , , 27    3.67  , 3.06  , New   , 0.61
> > 26    , 3     , , 27    3.67  , 3.06  , New   , 0.61
> > 29    , 0     , , 28    3.06  , 3.06  , Eq    , 0.0
> > 29    , 4     , , 28    3.06  , 3.06  , Eq    , 0.0
> > 27    , 0     , , 28    3.68  , 3.05  , New   , 0.63
> > 27    , 4     , , 28    3.67  , 3.06  , New   , 0.61
> > 30    , 0     , , 29    3.06  , 3.06  , Eq    , 0.0
> > 30    , 5     , , 29    3.06  , 3.06  , Eq    , 0.0
> > 28    , 0     , , 29    3.67  , 3.06  , New   , 0.61
> > 28    , 5     , , 29    3.68  , 3.06  , New   , 0.62
> > 31    , 0     , , 30    3.06  , 3.06  , Eq    , 0.0
> > 31    , 6     , , 30    3.06  , 3.06  , Eq    , 0.0
> > 29    , 0     , , 30    3.68  , 3.06  , New   , 0.62
> > 29    , 6     , , 30    3.7   , 3.06  , New   , 0.64
> > 32    , 0     , , 31    3.17  , 3.06  , New   , 0.11
> > 32    , 7     , , 31    3.12  , 3.06  , New   , 0.06
> > 30    , 0     , , 31    3.68  , 3.06  , New   , 0.62
> > 30    , 7     , , 31    3.68  , 3.06  , New   , 0.62
> >
> > Results For Icelake memchr-evex
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    4.94  , 4.26  , New   , 0.68
> > 256   , 1     , , 64    4.5   , 4.13  , New   , 0.37
> > 2048  , 0     , , 64    4.19  , 3.9   , New   , 0.29
> > 256   , 2     , , 64    4.19  , 3.87  , New   , 0.32
> > 2048  , 0     , , 128   4.96  , 4.53  , New   , 0.43
> > 256   , 3     , , 64    4.07  , 3.86  , New   , 0.21
> > 2048  , 0     , , 256   8.77  , 8.61  , New   , 0.16
> > 256   , 4     , , 64    4.08  , 3.87  , New   , 0.21
> > 2048  , 0     , , 512   12.22 , 11.67 , New   , 0.55
> > 256   , 5     , , 64    4.12  , 3.83  , New   , 0.29
> > 2048  , 0     , , 1024  20.06 , 18.09 , New   , 1.97
> > 256   , 6     , , 64    4.2   , 3.95  , New   , 0.25
> > 2048  , 0     , , 2048  33.83 , 30.62 , New   , 3.21
> > 256   , 7     , , 64    4.3   , 4.04  , New   , 0.26
> > 192   , 1     , , 32    4.2   , 3.71  , New   , 0.49
> > 256   , 1     , , 32    4.24  , 3.76  , New   , 0.48
> > 512   , 1     , , 32    4.29  , 3.74  , New   , 0.55
> > 192   , 2     , , 64    4.42  , 4.0   , New   , 0.42
> > 512   , 2     , , 64    4.17  , 3.83  , New   , 0.34
> > 192   , 3     , , 96    4.44  , 4.26  , New   , 0.18
> > 256   , 3     , , 96    4.45  , 4.14  , New   , 0.31
> > 512   , 3     , , 96    4.42  , 4.15  , New   , 0.27
> > 192   , 4     , , 128   4.93  , 4.45  , New   , 0.48
> > 256   , 4     , , 128   4.93  , 4.47  , New   , 0.46
> > 512   , 4     , , 128   4.95  , 4.47  , New   , 0.48
> > 192   , 5     , , 160   5.95  , 5.44  , New   , 0.51
> > 256   , 5     , , 160   5.59  , 5.47  , New   , 0.12
> > 512   , 5     , , 160   7.59  , 7.34  , New   , 0.25
> > 192   , 6     , , 192   6.53  , 6.08  , New   , 0.45
> > 256   , 6     , , 192   6.2   , 5.88  , New   , 0.32
> > 512   , 6     , , 192   7.53  , 7.62  , Cur   , 0.09
> > 192   , 7     , , 224   6.62  , 6.12  , New   , 0.5
> > 256   , 7     , , 224   6.79  , 6.51  , New   , 0.28
> > 512   , 7     , , 224   8.12  , 7.61  , New   , 0.51
> > 2     , 0     , , 1     2.5   , 2.54  , Cur   , 0.04
> > 2     , 1     , , 1     2.56  , 2.55  , New   , 0.01
> > 0     , 0     , , 1     2.57  , 3.12  , Cur   , 0.55
> > 0     , 1     , , 1     2.59  , 3.14  , Cur   , 0.55
> > 3     , 0     , , 2     2.62  , 2.63  , Cur   , 0.01
> > 3     , 2     , , 2     2.66  , 2.67  , Cur   , 0.01
> > 1     , 0     , , 2     3.24  , 2.72  , New   , 0.52
> > 1     , 2     , , 2     3.28  , 2.75  , New   , 0.53
> > 4     , 0     , , 3     2.78  , 2.8   , Cur   , 0.02
> > 4     , 3     , , 3     2.8   , 2.82  , Cur   , 0.02
> > 2     , 0     , , 3     3.38  , 2.86  , New   , 0.52
> > 2     , 3     , , 3     3.41  , 2.89  , New   , 0.52
> > 5     , 0     , , 4     2.88  , 2.91  , Cur   , 0.03
> > 5     , 4     , , 4     2.88  , 2.92  , Cur   , 0.04
> > 3     , 0     , , 4     3.48  , 2.93  , New   , 0.55
> > 3     , 4     , , 4     3.47  , 2.93  , New   , 0.54
> > 6     , 0     , , 5     2.95  , 2.94  , New   , 0.01
> > 6     , 5     , , 5     2.91  , 2.92  , Cur   , 0.01
> > 4     , 0     , , 5     3.47  , 2.9   , New   , 0.57
> > 4     , 5     , , 5     3.43  , 2.91  , New   , 0.52
> > 7     , 0     , , 6     2.87  , 2.9   , Cur   , 0.03
> > 7     , 6     , , 6     2.87  , 2.89  , Cur   , 0.02
> > 5     , 0     , , 6     3.44  , 2.88  , New   , 0.56
> > 5     , 6     , , 6     3.41  , 2.87  , New   , 0.54
> > 8     , 0     , , 7     2.86  , 2.87  , Cur   , 0.01
> > 8     , 7     , , 7     2.86  , 2.87  , Cur   , 0.01
> > 6     , 0     , , 7     3.43  , 2.87  , New   , 0.56
> > 6     , 7     , , 7     3.44  , 2.87  , New   , 0.57
> > 9     , 0     , , 8     2.86  , 2.88  , Cur   , 0.02
> > 7     , 0     , , 8     3.41  , 2.89  , New   , 0.52
> > 10    , 0     , , 9     2.83  , 2.87  , Cur   , 0.04
> > 10    , 1     , , 9     2.82  , 2.87  , Cur   , 0.05
> > 8     , 0     , , 9     3.4   , 2.89  , New   , 0.51
> > 8     , 1     , , 9     3.41  , 2.87  , New   , 0.54
> > 11    , 0     , , 10    2.83  , 2.88  , Cur   , 0.05
> > 11    , 2     , , 10    2.84  , 2.88  , Cur   , 0.04
> > 9     , 0     , , 10    3.41  , 2.87  , New   , 0.54
> > 9     , 2     , , 10    3.41  , 2.88  , New   , 0.53
> > 12    , 0     , , 11    2.83  , 2.89  , Cur   , 0.06
> > 12    , 3     , , 11    2.85  , 2.87  , Cur   , 0.02
> > 10    , 0     , , 11    3.41  , 2.87  , New   , 0.54
> > 10    , 3     , , 11    3.42  , 2.88  , New   , 0.54
> > 13    , 0     , , 12    2.86  , 2.87  , Cur   , 0.01
> > 13    , 4     , , 12    2.84  , 2.88  , Cur   , 0.04
> > 11    , 0     , , 12    3.43  , 2.87  , New   , 0.56
> > 11    , 4     , , 12    3.49  , 2.87  , New   , 0.62
> > 14    , 0     , , 13    2.85  , 2.86  , Cur   , 0.01
> > 14    , 5     , , 13    2.85  , 2.86  , Cur   , 0.01
> > 12    , 0     , , 13    3.41  , 2.86  , New   , 0.55
> > 12    , 5     , , 13    3.44  , 2.85  , New   , 0.59
> > 15    , 0     , , 14    2.83  , 2.87  , Cur   , 0.04
> > 15    , 6     , , 14    2.82  , 2.86  , Cur   , 0.04
> > 13    , 0     , , 14    3.41  , 2.86  , New   , 0.55
> > 13    , 6     , , 14    3.4   , 2.86  , New   , 0.54
> > 16    , 0     , , 15    2.84  , 2.86  , Cur   , 0.02
> > 16    , 7     , , 15    2.83  , 2.85  , Cur   , 0.02
> > 14    , 0     , , 15    3.41  , 2.85  , New   , 0.56
> > 14    , 7     , , 15    3.39  , 2.87  , New   , 0.52
> > 17    , 0     , , 16    2.83  , 2.87  , Cur   , 0.04
> > 15    , 0     , , 16    3.4   , 2.85  , New   , 0.55
> > 18    , 0     , , 17    2.83  , 2.86  , Cur   , 0.03
> > 18    , 1     , , 17    2.85  , 2.84  , New   , 0.01
> > 16    , 0     , , 17    3.41  , 2.85  , New   , 0.56
> > 16    , 1     , , 17    3.4   , 2.86  , New   , 0.54
> > 19    , 0     , , 18    2.8   , 2.84  , Cur   , 0.04
> > 19    , 2     , , 18    2.82  , 2.83  , Cur   , 0.01
> > 17    , 0     , , 18    3.39  , 2.86  , New   , 0.53
> > 17    , 2     , , 18    3.39  , 2.84  , New   , 0.55
> > 20    , 0     , , 19    2.85  , 2.87  , Cur   , 0.02
> > 20    , 3     , , 19    2.88  , 2.87  , New   , 0.01
> > 18    , 0     , , 19    3.38  , 2.85  , New   , 0.53
> > 18    , 3     , , 19    3.4   , 2.85  , New   , 0.55
> > 21    , 0     , , 20    2.83  , 2.85  , Cur   , 0.02
> > 21    , 4     , , 20    2.88  , 2.85  , New   , 0.03
> > 19    , 0     , , 20    3.39  , 2.84  , New   , 0.55
> > 19    , 4     , , 20    3.39  , 2.96  , New   , 0.43
> > 22    , 0     , , 21    2.84  , 2.9   , Cur   , 0.06
> > 22    , 5     , , 21    2.81  , 2.84  , Cur   , 0.03
> > 20    , 0     , , 21    3.41  , 2.81  , New   , 0.6
> > 20    , 5     , , 21    3.38  , 2.83  , New   , 0.55
> > 23    , 0     , , 22    2.8   , 2.82  , Cur   , 0.02
> > 23    , 6     , , 22    2.81  , 2.83  , Cur   , 0.02
> > 21    , 0     , , 22    3.35  , 2.81  , New   , 0.54
> > 21    , 6     , , 22    3.34  , 2.81  , New   , 0.53
> > 24    , 0     , , 23    2.77  , 2.84  , Cur   , 0.07
> > 24    , 7     , , 23    2.78  , 2.8   , Cur   , 0.02
> > 22    , 0     , , 23    3.34  , 2.79  , New   , 0.55
> > 22    , 7     , , 23    3.32  , 2.79  , New   , 0.53
> > 25    , 0     , , 24    2.77  , 2.8   , Cur   , 0.03
> > 23    , 0     , , 24    3.29  , 2.79  , New   , 0.5
> > 26    , 0     , , 25    2.73  , 2.78  , Cur   , 0.05
> > 26    , 1     , , 25    2.75  , 2.79  , Cur   , 0.04
> > 24    , 0     , , 25    3.27  , 2.79  , New   , 0.48
> > 24    , 1     , , 25    3.27  , 2.77  , New   , 0.5
> > 27    , 0     , , 26    2.72  , 2.78  , Cur   , 0.06
> > 27    , 2     , , 26    2.75  , 2.76  , Cur   , 0.01
> > 25    , 0     , , 26    3.29  , 2.73  , New   , 0.56
> > 25    , 2     , , 26    3.3   , 2.76  , New   , 0.54
> > 28    , 0     , , 27    2.75  , 2.79  , Cur   , 0.04
> > 28    , 3     , , 27    2.77  , 2.77  , Eq    , 0.0
> > 26    , 0     , , 27    3.28  , 2.78  , New   , 0.5
> > 26    , 3     , , 27    3.29  , 2.78  , New   , 0.51
> > 29    , 0     , , 28    2.74  , 2.76  , Cur   , 0.02
> > 29    , 4     , , 28    2.74  , 2.77  , Cur   , 0.03
> > 27    , 0     , , 28    3.3   , 2.76  , New   , 0.54
> > 27    , 4     , , 28    3.3   , 2.74  , New   , 0.56
> > 30    , 0     , , 29    2.72  , 2.76  , Cur   , 0.04
> > 30    , 5     , , 29    2.74  , 2.75  , Cur   , 0.01
> > 28    , 0     , , 29    3.25  , 2.73  , New   , 0.52
> > 28    , 5     , , 29    3.3   , 2.73  , New   , 0.57
> > 31    , 0     , , 30    2.73  , 2.77  , Cur   , 0.04
> > 31    , 6     , , 30    2.74  , 2.76  , Cur   , 0.02
> > 29    , 0     , , 30    3.25  , 2.73  , New   , 0.52
> > 29    , 6     , , 30    3.26  , 2.74  , New   , 0.52
> > 32    , 0     , , 31    2.73  , 2.74  , Cur   , 0.01
> > 32    , 7     , , 31    2.73  , 2.75  , Cur   , 0.02
> > 30    , 0     , , 31    3.24  , 2.72  , New   , 0.52
> > 30    , 7     , , 31    3.24  , 2.72  , New   , 0.52
> >
> > For memchr-avx2 the improvements are more modest, though again near
> > universal. The improvement is most significant for medium sizes and
> > small sizes with pos > size. For small sizes with pos < size and for
> > large sizes the two implementations perform roughly the same.
> >
> > Results For Tigerlake memchr-avx2
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    6.15  , 6.27  , Cur   , 0.12
> > 256   , 1     , , 64    6.21  , 6.03  , New   , 0.18
> > 2048  , 0     , , 64    6.07  , 5.95  , New   , 0.12
> > 256   , 2     , , 64    6.01  , 5.8   , New   , 0.21
> > 2048  , 0     , , 128   7.05  , 6.55  , New   , 0.5
> > 256   , 3     , , 64    6.14  , 5.83  , New   , 0.31
> > 2048  , 0     , , 256   11.78 , 11.78 , Eq    , 0.0
> > 256   , 4     , , 64    6.1   , 5.85  , New   , 0.25
> > 2048  , 0     , , 512   16.32 , 15.96 , New   , 0.36
> > 256   , 5     , , 64    6.1   , 5.77  , New   , 0.33
> > 2048  , 0     , , 1024  25.38 , 25.18 , New   , 0.2
> > 256   , 6     , , 64    6.08  , 5.88  , New   , 0.2
> > 2048  , 0     , , 2048  38.56 , 38.32 , New   , 0.24
> > 256   , 7     , , 64    5.93  , 5.68  , New   , 0.25
> > 192   , 1     , , 32    5.49  , 5.3   , New   , 0.19
> > 256   , 1     , , 32    5.5   , 5.28  , New   , 0.22
> > 512   , 1     , , 32    5.48  , 5.32  , New   , 0.16
> > 192   , 2     , , 64    6.1   , 5.73  , New   , 0.37
> > 512   , 2     , , 64    5.88  , 5.72  , New   , 0.16
> > 192   , 3     , , 96    6.31  , 5.93  , New   , 0.38
> > 256   , 3     , , 96    6.32  , 5.93  , New   , 0.39
> > 512   , 3     , , 96    6.2   , 5.94  , New   , 0.26
> > 192   , 4     , , 128   6.65  , 6.4   , New   , 0.25
> > 256   , 4     , , 128   6.6   , 6.37  , New   , 0.23
> > 512   , 4     , , 128   6.74  , 6.33  , New   , 0.41
> > 192   , 5     , , 160   7.78  , 7.4   , New   , 0.38
> > 256   , 5     , , 160   7.18  , 7.4   , Cur   , 0.22
> > 512   , 5     , , 160   9.81  , 9.44  , New   , 0.37
> > 192   , 6     , , 192   9.12  , 7.77  , New   , 1.35
> > 256   , 6     , , 192   7.97  , 7.66  , New   , 0.31
> > 512   , 6     , , 192   10.14 , 9.95  , New   , 0.19
> > 192   , 7     , , 224   8.96  , 7.78  , New   , 1.18
> > 256   , 7     , , 224   8.52  , 8.23  , New   , 0.29
> > 512   , 7     , , 224   10.33 , 9.98  , New   , 0.35
> > 2     , 0     , , 1     3.61  , 3.6   , New   , 0.01
> > 2     , 1     , , 1     3.6   , 3.6   , Eq    , 0.0
> > 0     , 0     , , 1     3.02  , 3.0   , New   , 0.02
> > 0     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
> > 3     , 0     , , 2     3.6   , 3.6   , Eq    , 0.0
> > 3     , 2     , , 2     3.61  , 3.6   , New   , 0.01
> > 1     , 0     , , 2     4.82  , 3.6   , New   , 1.22
> > 1     , 2     , , 2     4.81  , 3.6   , New   , 1.21
> > 4     , 0     , , 3     3.61  , 3.61  , Eq    , 0.0
> > 4     , 3     , , 3     3.62  , 3.61  , New   , 0.01
> > 2     , 0     , , 3     4.82  , 3.62  , New   , 1.2
> > 2     , 3     , , 3     4.83  , 3.63  , New   , 1.2
> > 5     , 0     , , 4     3.63  , 3.64  , Cur   , 0.01
> > 5     , 4     , , 4     3.63  , 3.62  , New   , 0.01
> > 3     , 0     , , 4     4.84  , 3.62  , New   , 1.22
> > 3     , 4     , , 4     4.84  , 3.64  , New   , 1.2
> > 6     , 0     , , 5     3.66  , 3.64  , New   , 0.02
> > 6     , 5     , , 5     3.65  , 3.62  , New   , 0.03
> > 4     , 0     , , 5     4.83  , 3.63  , New   , 1.2
> > 4     , 5     , , 5     4.85  , 3.64  , New   , 1.21
> > 7     , 0     , , 6     3.76  , 3.79  , Cur   , 0.03
> > 7     , 6     , , 6     3.76  , 3.72  , New   , 0.04
> > 5     , 0     , , 6     4.84  , 3.62  , New   , 1.22
> > 5     , 6     , , 6     4.85  , 3.64  , New   , 1.21
> > 8     , 0     , , 7     3.64  , 3.65  , Cur   , 0.01
> > 8     , 7     , , 7     3.65  , 3.65  , Eq    , 0.0
> > 6     , 0     , , 7     4.88  , 3.64  , New   , 1.24
> > 6     , 7     , , 7     4.87  , 3.65  , New   , 1.22
> > 9     , 0     , , 8     3.66  , 3.66  , Eq    , 0.0
> > 7     , 0     , , 8     4.89  , 3.66  , New   , 1.23
> > 10    , 0     , , 9     3.67  , 3.67  , Eq    , 0.0
> > 10    , 1     , , 9     3.67  , 3.67  , Eq    , 0.0
> > 8     , 0     , , 9     4.9   , 3.67  , New   , 1.23
> > 8     , 1     , , 9     4.9   , 3.67  , New   , 1.23
> > 11    , 0     , , 10    3.68  , 3.67  , New   , 0.01
> > 11    , 2     , , 10    3.69  , 3.67  , New   , 0.02
> > 9     , 0     , , 10    4.9   , 3.67  , New   , 1.23
> > 9     , 2     , , 10    4.9   , 3.67  , New   , 1.23
> > 12    , 0     , , 11    3.71  , 3.68  , New   , 0.03
> > 12    , 3     , , 11    3.71  , 3.67  , New   , 0.04
> > 10    , 0     , , 11    4.9   , 3.67  , New   , 1.23
> > 10    , 3     , , 11    4.9   , 3.67  , New   , 1.23
> > 13    , 0     , , 12    4.24  , 4.23  , New   , 0.01
> > 13    , 4     , , 12    4.23  , 4.23  , Eq    , 0.0
> > 11    , 0     , , 12    4.9   , 3.7   , New   , 1.2
> > 11    , 4     , , 12    4.9   , 3.73  , New   , 1.17
> > 14    , 0     , , 13    3.99  , 4.01  , Cur   , 0.02
> > 14    , 5     , , 13    3.98  , 3.98  , Eq    , 0.0
> > 12    , 0     , , 13    4.9   , 3.69  , New   , 1.21
> > 12    , 5     , , 13    4.9   , 3.69  , New   , 1.21
> > 15    , 0     , , 14    3.99  , 3.97  , New   , 0.02
> > 15    , 6     , , 14    4.0   , 4.0   , Eq    , 0.0
> > 13    , 0     , , 14    4.9   , 3.67  , New   , 1.23
> > 13    , 6     , , 14    4.9   , 3.67  , New   , 1.23
> > 16    , 0     , , 15    3.99  , 4.02  , Cur   , 0.03
> > 16    , 7     , , 15    4.01  , 3.96  , New   , 0.05
> > 14    , 0     , , 15    4.93  , 3.67  , New   , 1.26
> > 14    , 7     , , 15    4.92  , 3.67  , New   , 1.25
> > 17    , 0     , , 16    4.04  , 3.99  , New   , 0.05
> > 15    , 0     , , 16    5.42  , 4.22  , New   , 1.2
> > 18    , 0     , , 17    4.01  , 3.97  , New   , 0.04
> > 18    , 1     , , 17    3.99  , 3.98  , New   , 0.01
> > 16    , 0     , , 17    5.22  , 3.98  , New   , 1.24
> > 16    , 1     , , 17    5.19  , 3.98  , New   , 1.21
> > 19    , 0     , , 18    4.0   , 3.99  , New   , 0.01
> > 19    , 2     , , 18    4.03  , 3.97  , New   , 0.06
> > 17    , 0     , , 18    5.18  , 3.99  , New   , 1.19
> > 17    , 2     , , 18    5.18  , 3.98  , New   , 1.2
> > 20    , 0     , , 19    4.02  , 3.98  , New   , 0.04
> > 20    , 3     , , 19    4.0   , 3.98  , New   , 0.02
> > 18    , 0     , , 19    5.19  , 3.97  , New   , 1.22
> > 18    , 3     , , 19    5.21  , 3.98  , New   , 1.23
> > 21    , 0     , , 20    3.98  , 4.0   , Cur   , 0.02
> > 21    , 4     , , 20    4.0   , 4.0   , Eq    , 0.0
> > 19    , 0     , , 20    5.19  , 3.99  , New   , 1.2
> > 19    , 4     , , 20    5.17  , 3.99  , New   , 1.18
> > 22    , 0     , , 21    4.03  , 3.98  , New   , 0.05
> > 22    , 5     , , 21    4.01  , 3.95  , New   , 0.06
> > 20    , 0     , , 21    5.19  , 4.0   , New   , 1.19
> > 20    , 5     , , 21    5.21  , 3.99  , New   , 1.22
> > 23    , 0     , , 22    4.06  , 3.97  , New   , 0.09
> > 23    , 6     , , 22    4.02  , 3.98  , New   , 0.04
> > 21    , 0     , , 22    5.2   , 4.02  , New   , 1.18
> > 21    , 6     , , 22    5.22  , 4.0   , New   , 1.22
> > 24    , 0     , , 23    4.15  , 3.98  , New   , 0.17
> > 24    , 7     , , 23    4.0   , 4.01  , Cur   , 0.01
> > 22    , 0     , , 23    5.28  , 4.0   , New   , 1.28
> > 22    , 7     , , 23    5.22  , 3.99  , New   , 1.23
> > 25    , 0     , , 24    4.1   , 4.04  , New   , 0.06
> > 23    , 0     , , 24    5.23  , 4.04  , New   , 1.19
> > 26    , 0     , , 25    4.1   , 4.06  , New   , 0.04
> > 26    , 1     , , 25    4.07  , 3.99  , New   , 0.08
> > 24    , 0     , , 25    5.26  , 4.02  , New   , 1.24
> > 24    , 1     , , 25    5.21  , 4.0   , New   , 1.21
> > 27    , 0     , , 26    4.17  , 4.03  , New   , 0.14
> > 27    , 2     , , 26    4.09  , 4.03  , New   , 0.06
> > 25    , 0     , , 26    5.29  , 4.1   , New   , 1.19
> > 25    , 2     , , 26    5.25  , 4.0   , New   , 1.25
> > 28    , 0     , , 27    4.06  , 4.1   , Cur   , 0.04
> > 28    , 3     , , 27    4.09  , 4.04  , New   , 0.05
> > 26    , 0     , , 27    5.26  , 4.04  , New   , 1.22
> > 26    , 3     , , 27    5.28  , 4.01  , New   , 1.27
> > 29    , 0     , , 28    4.07  , 4.02  , New   , 0.05
> > 29    , 4     , , 28    4.07  , 4.05  , New   , 0.02
> > 27    , 0     , , 28    5.25  , 4.02  , New   , 1.23
> > 27    , 4     , , 28    5.25  , 4.03  , New   , 1.22
> > 30    , 0     , , 29    4.14  , 4.06  , New   , 0.08
> > 30    , 5     , , 29    4.08  , 4.04  , New   , 0.04
> > 28    , 0     , , 29    5.26  , 4.07  , New   , 1.19
> > 28    , 5     , , 29    5.28  , 4.04  , New   , 1.24
> > 31    , 0     , , 30    4.09  , 4.08  , New   , 0.01
> > 31    , 6     , , 30    4.1   , 4.08  , New   , 0.02
> > 29    , 0     , , 30    5.28  , 4.05  , New   , 1.23
> > 29    , 6     , , 30    5.24  , 4.07  , New   , 1.17
> > 32    , 0     , , 31    4.1   , 4.13  , Cur   , 0.03
> > 32    , 7     , , 31    4.16  , 4.09  , New   , 0.07
> > 30    , 0     , , 31    5.31  , 4.09  , New   , 1.22
> > 30    , 7     , , 31    5.28  , 4.08  , New   , 1.2
> >
> > Results For Icelake memchr-avx2
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    5.74  , 5.08  , New   , 0.66
> > 256   , 1     , , 64    5.16  , 4.93  , New   , 0.23
> > 2048  , 0     , , 64    4.86  , 4.69  , New   , 0.17
> > 256   , 2     , , 64    4.78  , 4.7   , New   , 0.08
> > 2048  , 0     , , 128   5.64  , 5.0   , New   , 0.64
> > 256   , 3     , , 64    4.64  , 4.59  , New   , 0.05
> > 2048  , 0     , , 256   9.07  , 9.17  , Cur   , 0.1
> > 256   , 4     , , 64    4.7   , 4.6   , New   , 0.1
> > 2048  , 0     , , 512   12.56 , 12.33 , New   , 0.23
> > 256   , 5     , , 64    4.72  , 4.61  , New   , 0.11
> > 2048  , 0     , , 1024  19.36 , 19.49 , Cur   , 0.13
> > 256   , 6     , , 64    4.82  , 4.69  , New   , 0.13
> > 2048  , 0     , , 2048  29.99 , 30.53 , Cur   , 0.54
> > 256   , 7     , , 64    4.9   , 4.85  , New   , 0.05
> > 192   , 1     , , 32    4.89  , 4.45  , New   , 0.44
> > 256   , 1     , , 32    4.93  , 4.44  , New   , 0.49
> > 512   , 1     , , 32    4.97  , 4.45  , New   , 0.52
> > 192   , 2     , , 64    5.04  , 4.65  , New   , 0.39
> > 512   , 2     , , 64    4.75  , 4.66  , New   , 0.09
> > 192   , 3     , , 96    5.14  , 4.66  , New   , 0.48
> > 256   , 3     , , 96    5.12  , 4.66  , New   , 0.46
> > 512   , 3     , , 96    5.13  , 4.62  , New   , 0.51
> > 192   , 4     , , 128   5.65  , 4.95  , New   , 0.7
> > 256   , 4     , , 128   5.63  , 4.95  , New   , 0.68
> > 512   , 4     , , 128   5.68  , 4.96  , New   , 0.72
> > 192   , 5     , , 160   6.1   , 5.84  , New   , 0.26
> > 256   , 5     , , 160   5.58  , 5.84  , Cur   , 0.26
> > 512   , 5     , , 160   7.95  , 7.74  , New   , 0.21
> > 192   , 6     , , 192   7.07  , 6.23  , New   , 0.84
> > 256   , 6     , , 192   6.34  , 6.09  , New   , 0.25
> > 512   , 6     , , 192   8.17  , 8.13  , New   , 0.04
> > 192   , 7     , , 224   7.06  , 6.23  , New   , 0.83
> > 256   , 7     , , 224   6.76  , 6.65  , New   , 0.11
> > 512   , 7     , , 224   8.29  , 8.08  , New   , 0.21
> > 2     , 0     , , 1     3.0   , 3.04  , Cur   , 0.04
> > 2     , 1     , , 1     3.06  , 3.07  , Cur   , 0.01
> > 0     , 0     , , 1     2.57  , 2.59  , Cur   , 0.02
> > 0     , 1     , , 1     2.6   , 2.61  , Cur   , 0.01
> > 3     , 0     , , 2     3.15  , 3.17  , Cur   , 0.02
> > 3     , 2     , , 2     3.19  , 3.21  , Cur   , 0.02
> > 1     , 0     , , 2     4.32  , 3.25  , New   , 1.07
> > 1     , 2     , , 2     4.36  , 3.31  , New   , 1.05
> > 4     , 0     , , 3     3.5   , 3.52  , Cur   , 0.02
> > 4     , 3     , , 3     3.52  , 3.54  , Cur   , 0.02
> > 2     , 0     , , 3     4.51  , 3.43  , New   , 1.08
> > 2     , 3     , , 3     4.56  , 3.47  , New   , 1.09
> > 5     , 0     , , 4     3.61  , 3.65  , Cur   , 0.04
> > 5     , 4     , , 4     3.63  , 3.67  , Cur   , 0.04
> > 3     , 0     , , 4     4.64  , 3.51  , New   , 1.13
> > 3     , 4     , , 4     4.7   , 3.51  , New   , 1.19
> > 6     , 0     , , 5     3.66  , 3.68  , Cur   , 0.02
> > 6     , 5     , , 5     3.69  , 3.65  , New   , 0.04
> > 4     , 0     , , 5     4.7   , 3.49  , New   , 1.21
> > 4     , 5     , , 5     4.58  , 3.48  , New   , 1.1
> > 7     , 0     , , 6     3.6   , 3.65  , Cur   , 0.05
> > 7     , 6     , , 6     3.59  , 3.64  , Cur   , 0.05
> > 5     , 0     , , 6     4.74  , 3.65  , New   , 1.09
> > 5     , 6     , , 6     4.73  , 3.64  , New   , 1.09
> > 8     , 0     , , 7     3.6   , 3.61  , Cur   , 0.01
> > 8     , 7     , , 7     3.6   , 3.61  , Cur   , 0.01
> > 6     , 0     , , 7     4.73  , 3.6   , New   , 1.13
> > 6     , 7     , , 7     4.73  , 3.62  , New   , 1.11
> > 9     , 0     , , 8     3.59  , 3.62  , Cur   , 0.03
> > 7     , 0     , , 8     4.72  , 3.64  , New   , 1.08
> > 10    , 0     , , 9     3.57  , 3.62  , Cur   , 0.05
> > 10    , 1     , , 9     3.56  , 3.61  , Cur   , 0.05
> > 8     , 0     , , 9     4.69  , 3.63  , New   , 1.06
> > 8     , 1     , , 9     4.71  , 3.61  , New   , 1.1
> > 11    , 0     , , 10    3.58  , 3.62  , Cur   , 0.04
> > 11    , 2     , , 10    3.59  , 3.63  , Cur   , 0.04
> > 9     , 0     , , 10    4.72  , 3.61  , New   , 1.11
> > 9     , 2     , , 10    4.7   , 3.61  , New   , 1.09
> > 12    , 0     , , 11    3.58  , 3.63  , Cur   , 0.05
> > 12    , 3     , , 11    3.58  , 3.62  , Cur   , 0.04
> > 10    , 0     , , 11    4.7   , 3.6   , New   , 1.1
> > 10    , 3     , , 11    4.73  , 3.64  , New   , 1.09
> > 13    , 0     , , 12    3.6   , 3.6   , Eq    , 0.0
> > 13    , 4     , , 12    3.57  , 3.62  , Cur   , 0.05
> > 11    , 0     , , 12    4.73  , 3.62  , New   , 1.11
> > 11    , 4     , , 12    4.79  , 3.61  , New   , 1.18
> > 14    , 0     , , 13    3.61  , 3.62  , Cur   , 0.01
> > 14    , 5     , , 13    3.59  , 3.59  , Eq    , 0.0
> > 12    , 0     , , 13    4.7   , 3.61  , New   , 1.09
> > 12    , 5     , , 13    4.75  , 3.58  , New   , 1.17
> > 15    , 0     , , 14    3.58  , 3.62  , Cur   , 0.04
> > 15    , 6     , , 14    3.59  , 3.62  , Cur   , 0.03
> > 13    , 0     , , 14    4.68  , 3.6   , New   , 1.08
> > 13    , 6     , , 14    4.68  , 3.63  , New   , 1.05
> > 16    , 0     , , 15    3.57  , 3.6   , Cur   , 0.03
> > 16    , 7     , , 15    3.55  , 3.59  , Cur   , 0.04
> > 14    , 0     , , 15    4.69  , 3.61  , New   , 1.08
> > 14    , 7     , , 15    4.69  , 3.61  , New   , 1.08
> > 17    , 0     , , 16    3.56  , 3.61  , Cur   , 0.05
> > 15    , 0     , , 16    4.71  , 3.58  , New   , 1.13
> > 18    , 0     , , 17    3.57  , 3.65  , Cur   , 0.08
> > 18    , 1     , , 17    3.58  , 3.59  , Cur   , 0.01
> > 16    , 0     , , 17    4.7   , 3.58  , New   , 1.12
> > 16    , 1     , , 17    4.68  , 3.59  , New   , 1.09
> > 19    , 0     , , 18    3.51  , 3.58  , Cur   , 0.07
> > 19    , 2     , , 18    3.55  , 3.58  , Cur   , 0.03
> > 17    , 0     , , 18    4.69  , 3.61  , New   , 1.08
> > 17    , 2     , , 18    4.68  , 3.61  , New   , 1.07
> > 20    , 0     , , 19    3.57  , 3.6   , Cur   , 0.03
> > 20    , 3     , , 19    3.59  , 3.59  , Eq    , 0.0
> > 18    , 0     , , 19    4.68  , 3.59  , New   , 1.09
> > 18    , 3     , , 19    4.67  , 3.57  , New   , 1.1
> > 21    , 0     , , 20    3.61  , 3.58  , New   , 0.03
> > 21    , 4     , , 20    3.62  , 3.6   , New   , 0.02
> > 19    , 0     , , 20    4.74  , 3.57  , New   , 1.17
> > 19    , 4     , , 20    4.69  , 3.7   , New   , 0.99
> > 22    , 0     , , 21    3.57  , 3.64  , Cur   , 0.07
> > 22    , 5     , , 21    3.55  , 3.6   , Cur   , 0.05
> > 20    , 0     , , 21    4.72  , 3.55  , New   , 1.17
> > 20    , 5     , , 21    4.66  , 3.55  , New   , 1.11
> > 23    , 0     , , 22    3.56  , 3.56  , Eq    , 0.0
> > 23    , 6     , , 22    3.54  , 3.56  , Cur   , 0.02
> > 21    , 0     , , 22    4.65  , 3.53  , New   , 1.12
> > 21    , 6     , , 22    4.62  , 3.56  , New   , 1.06
> > 24    , 0     , , 23    3.5   , 3.54  , Cur   , 0.04
> > 24    , 7     , , 23    3.52  , 3.53  , Cur   , 0.01
> > 22    , 0     , , 23    4.61  , 3.51  , New   , 1.1
> > 22    , 7     , , 23    4.6   , 3.51  , New   , 1.09
> > 25    , 0     , , 24    3.5   , 3.53  , Cur   , 0.03
> > 23    , 0     , , 24    4.54  , 3.5   , New   , 1.04
> > 26    , 0     , , 25    3.47  , 3.49  , Cur   , 0.02
> > 26    , 1     , , 25    3.46  , 3.51  , Cur   , 0.05
> > 24    , 0     , , 25    4.53  , 3.51  , New   , 1.02
> > 24    , 1     , , 25    4.51  , 3.51  , New   , 1.0
> > 27    , 0     , , 26    3.44  , 3.51  , Cur   , 0.07
> > 27    , 2     , , 26    3.51  , 3.52  , Cur   , 0.01
> > 25    , 0     , , 26    4.56  , 3.46  , New   , 1.1
> > 25    , 2     , , 26    4.55  , 3.47  , New   , 1.08
> > 28    , 0     , , 27    3.47  , 3.5   , Cur   , 0.03
> > 28    , 3     , , 27    3.48  , 3.47  , New   , 0.01
> > 26    , 0     , , 27    4.52  , 3.44  , New   , 1.08
> > 26    , 3     , , 27    4.55  , 3.46  , New   , 1.09
> > 29    , 0     , , 28    3.45  , 3.49  , Cur   , 0.04
> > 29    , 4     , , 28    3.5   , 3.5   , Eq    , 0.0
> > 27    , 0     , , 28    4.56  , 3.49  , New   , 1.07
> > 27    , 4     , , 28    4.5   , 3.49  , New   , 1.01
> > 30    , 0     , , 29    3.44  , 3.48  , Cur   , 0.04
> > 30    , 5     , , 29    3.46  , 3.47  , Cur   , 0.01
> > 28    , 0     , , 29    4.49  , 3.43  , New   , 1.06
> > 28    , 5     , , 29    4.57  , 3.45  , New   , 1.12
> > 31    , 0     , , 30    3.48  , 3.48  , Eq    , 0.0
> > 31    , 6     , , 30    3.46  , 3.49  , Cur   , 0.03
> > 29    , 0     , , 30    4.49  , 3.44  , New   , 1.05
> > 29    , 6     , , 30    4.53  , 3.44  , New   , 1.09
> > 32    , 0     , , 31    3.44  , 3.45  , Cur   , 0.01
> > 32    , 7     , , 31    3.46  , 3.51  , Cur   , 0.05
> > 30    , 0     , , 31    4.48  , 3.42  , New   , 1.06
> > 30    , 7     , , 31    4.48  , 3.44  , New   , 1.04
> >
> >
> > Results For Skylake memchr-avx2
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    6.61  , 5.4   , New   , 1.21
> > 256   , 1     , , 64    6.52  , 5.68  , New   , 0.84
> > 2048  , 0     , , 64    6.03  , 5.47  , New   , 0.56
> > 256   , 2     , , 64    6.07  , 5.42  , New   , 0.65
> > 2048  , 0     , , 128   7.01  , 5.83  , New   , 1.18
> > 256   , 3     , , 64    6.24  , 5.68  , New   , 0.56
> > 2048  , 0     , , 256   11.03 , 9.86  , New   , 1.17
> > 256   , 4     , , 64    6.17  , 5.49  , New   , 0.68
> > 2048  , 0     , , 512   14.11 , 13.41 , New   , 0.7
> > 256   , 5     , , 64    6.03  , 5.45  , New   , 0.58
> > 2048  , 0     , , 1024  19.82 , 19.92 , Cur   , 0.1
> > 256   , 6     , , 64    6.14  , 5.7   , New   , 0.44
> > 2048  , 0     , , 2048  30.9  , 30.59 , New   , 0.31
> > 256   , 7     , , 64    6.05  , 5.64  , New   , 0.41
> > 192   , 1     , , 32    5.6   , 4.89  , New   , 0.71
> > 256   , 1     , , 32    5.59  , 5.07  , New   , 0.52
> > 512   , 1     , , 32    5.58  , 4.93  , New   , 0.65
> > 192   , 2     , , 64    6.14  , 5.46  , New   , 0.68
> > 512   , 2     , , 64    5.95  , 5.38  , New   , 0.57
> > 192   , 3     , , 96    6.6   , 5.74  , New   , 0.86
> > 256   , 3     , , 96    6.48  , 5.37  , New   , 1.11
> > 512   , 3     , , 96    6.56  , 5.44  , New   , 1.12
> > 192   , 4     , , 128   7.04  , 6.02  , New   , 1.02
> > 256   , 4     , , 128   6.96  , 5.89  , New   , 1.07
> > 512   , 4     , , 128   6.97  , 5.99  , New   , 0.98
> > 192   , 5     , , 160   8.49  , 7.07  , New   , 1.42
> > 256   , 5     , , 160   8.1   , 6.96  , New   , 1.14
> > 512   , 5     , , 160   10.48 , 9.14  , New   , 1.34
> > 192   , 6     , , 192   8.46  , 8.52  , Cur   , 0.06
> > 256   , 6     , , 192   8.53  , 7.58  , New   , 0.95
> > 512   , 6     , , 192   10.88 , 9.06  , New   , 1.82
> > 192   , 7     , , 224   8.59  , 8.35  , New   , 0.24
> > 256   , 7     , , 224   8.86  , 7.91  , New   , 0.95
> > 512   , 7     , , 224   10.89 , 8.98  , New   , 1.91
> > 2     , 0     , , 1     4.28  , 3.62  , New   , 0.66
> > 2     , 1     , , 1     4.32  , 3.75  , New   , 0.57
> > 0     , 0     , , 1     3.76  , 3.24  , New   , 0.52
> > 0     , 1     , , 1     3.7   , 3.19  , New   , 0.51
> > 3     , 0     , , 2     4.16  , 3.67  , New   , 0.49
> > 3     , 2     , , 2     4.21  , 3.68  , New   , 0.53
> > 1     , 0     , , 2     4.25  , 3.74  , New   , 0.51
> > 1     , 2     , , 2     4.4   , 3.82  , New   , 0.58
> > 4     , 0     , , 3     4.43  , 3.88  , New   , 0.55
> > 4     , 3     , , 3     4.34  , 3.8   , New   , 0.54
> > 2     , 0     , , 3     4.33  , 3.79  , New   , 0.54
> > 2     , 3     , , 3     4.37  , 3.84  , New   , 0.53
> > 5     , 0     , , 4     4.45  , 3.87  , New   , 0.58
> > 5     , 4     , , 4     4.41  , 3.84  , New   , 0.57
> > 3     , 0     , , 4     4.34  , 3.83  , New   , 0.51
> > 3     , 4     , , 4     4.35  , 3.82  , New   , 0.53
> > 6     , 0     , , 5     4.41  , 3.88  , New   , 0.53
> > 6     , 5     , , 5     4.41  , 3.88  , New   , 0.53
> > 4     , 0     , , 5     4.35  , 3.84  , New   , 0.51
> > 4     , 5     , , 5     4.37  , 3.85  , New   , 0.52
> > 7     , 0     , , 6     4.4   , 3.84  , New   , 0.56
> > 7     , 6     , , 6     4.39  , 3.83  , New   , 0.56
> > 5     , 0     , , 6     4.37  , 3.85  , New   , 0.52
> > 5     , 6     , , 6     4.4   , 3.86  , New   , 0.54
> > 8     , 0     , , 7     4.39  , 3.88  , New   , 0.51
> > 8     , 7     , , 7     4.4   , 3.83  , New   , 0.57
> > 6     , 0     , , 7     4.39  , 3.85  , New   , 0.54
> > 6     , 7     , , 7     4.38  , 3.87  , New   , 0.51
> > 9     , 0     , , 8     4.47  , 3.96  , New   , 0.51
> > 7     , 0     , , 8     4.37  , 3.85  , New   , 0.52
> > 10    , 0     , , 9     4.61  , 4.08  , New   , 0.53
> > 10    , 1     , , 9     4.61  , 4.09  , New   , 0.52
> > 8     , 0     , , 9     4.37  , 3.85  , New   , 0.52
> > 8     , 1     , , 9     4.37  , 3.85  , New   , 0.52
> > 11    , 0     , , 10    4.68  , 4.06  , New   , 0.62
> > 11    , 2     , , 10    4.56  , 4.1   , New   , 0.46
> > 9     , 0     , , 10    4.36  , 3.83  , New   , 0.53
> > 9     , 2     , , 10    4.37  , 3.83  , New   , 0.54
> > 12    , 0     , , 11    4.62  , 4.05  , New   , 0.57
> > 12    , 3     , , 11    4.63  , 4.06  , New   , 0.57
> > 10    , 0     , , 11    4.38  , 3.86  , New   , 0.52
> > 10    , 3     , , 11    4.41  , 3.86  , New   , 0.55
> > 13    , 0     , , 12    4.57  , 4.08  , New   , 0.49
> > 13    , 4     , , 12    4.59  , 4.12  , New   , 0.47
> > 11    , 0     , , 12    4.45  , 4.0   , New   , 0.45
> > 11    , 4     , , 12    4.51  , 4.04  , New   , 0.47
> > 14    , 0     , , 13    4.64  , 4.16  , New   , 0.48
> > 14    , 5     , , 13    4.67  , 4.1   , New   , 0.57
> > 12    , 0     , , 13    4.58  , 4.08  , New   , 0.5
> > 12    , 5     , , 13    4.6   , 4.1   , New   , 0.5
> > 15    , 0     , , 14    4.61  , 4.05  , New   , 0.56
> > 15    , 6     , , 14    4.59  , 4.06  , New   , 0.53
> > 13    , 0     , , 14    4.57  , 4.06  , New   , 0.51
> > 13    , 6     , , 14    4.57  , 4.05  , New   , 0.52
> > 16    , 0     , , 15    4.62  , 4.05  , New   , 0.57
> > 16    , 7     , , 15    4.63  , 4.06  , New   , 0.57
> > 14    , 0     , , 15    4.61  , 4.06  , New   , 0.55
> > 14    , 7     , , 15    4.59  , 4.05  , New   , 0.54
> > 17    , 0     , , 16    4.58  , 4.08  , New   , 0.5
> > 15    , 0     , , 16    4.64  , 4.06  , New   , 0.58
> > 18    , 0     , , 17    4.56  , 4.17  , New   , 0.39
> > 18    , 1     , , 17    4.59  , 4.09  , New   , 0.5
> > 16    , 0     , , 17    4.59  , 4.07  , New   , 0.52
> > 16    , 1     , , 17    4.58  , 4.04  , New   , 0.54
> > 19    , 0     , , 18    4.61  , 4.05  , New   , 0.56
> > 19    , 2     , , 18    4.6   , 4.08  , New   , 0.52
> > 17    , 0     , , 18    4.64  , 4.11  , New   , 0.53
> > 17    , 2     , , 18    4.56  , 4.13  , New   , 0.43
> > 20    , 0     , , 19    4.77  , 4.3   , New   , 0.47
> > 20    , 3     , , 19    4.6   , 4.14  , New   , 0.46
> > 18    , 0     , , 19    4.72  , 4.02  , New   , 0.7
> > 18    , 3     , , 19    4.53  , 4.01  , New   , 0.52
> > 21    , 0     , , 20    4.66  , 4.26  , New   , 0.4
> > 21    , 4     , , 20    4.74  , 4.07  , New   , 0.67
> > 19    , 0     , , 20    4.62  , 4.12  , New   , 0.5
> > 19    , 4     , , 20    4.57  , 4.04  , New   , 0.53
> > 22    , 0     , , 21    4.61  , 4.13  , New   , 0.48
> > 22    , 5     , , 21    4.64  , 4.08  , New   , 0.56
> > 20    , 0     , , 21    4.49  , 4.01  , New   , 0.48
> > 20    , 5     , , 21    4.58  , 4.06  , New   , 0.52
> > 23    , 0     , , 22    4.62  , 4.13  , New   , 0.49
> > 23    , 6     , , 22    4.72  , 4.27  , New   , 0.45
> > 21    , 0     , , 22    4.65  , 3.97  , New   , 0.68
> > 21    , 6     , , 22    4.5   , 4.02  , New   , 0.48
> > 24    , 0     , , 23    4.78  , 4.07  , New   , 0.71
> > 24    , 7     , , 23    4.67  , 4.23  , New   , 0.44
> > 22    , 0     , , 23    4.49  , 3.99  , New   , 0.5
> > 22    , 7     , , 23    4.56  , 4.03  , New   , 0.53
> > 25    , 0     , , 24    4.6   , 4.15  , New   , 0.45
> > 23    , 0     , , 24    4.57  , 4.06  , New   , 0.51
> > 26    , 0     , , 25    4.54  , 4.14  , New   , 0.4
> > 26    , 1     , , 25    4.72  , 4.1   , New   , 0.62
> > 24    , 0     , , 25    4.52  , 4.13  , New   , 0.39
> > 24    , 1     , , 25    4.55  , 4.0   , New   , 0.55
> > 27    , 0     , , 26    4.51  , 4.06  , New   , 0.45
> > 27    , 2     , , 26    4.53  , 4.16  , New   , 0.37
> > 25    , 0     , , 26    4.59  , 4.09  , New   , 0.5
> > 25    , 2     , , 26    4.55  , 4.01  , New   , 0.54
> > 28    , 0     , , 27    4.59  , 3.99  , New   , 0.6
> > 28    , 3     , , 27    4.57  , 3.95  , New   , 0.62
> > 26    , 0     , , 27    4.55  , 4.15  , New   , 0.4
> > 26    , 3     , , 27    4.57  , 3.99  , New   , 0.58
> > 29    , 0     , , 28    4.41  , 4.03  , New   , 0.38
> > 29    , 4     , , 28    4.59  , 4.02  , New   , 0.57
> > 27    , 0     , , 28    4.63  , 4.08  , New   , 0.55
> > 27    , 4     , , 28    4.44  , 4.02  , New   , 0.42
> > 30    , 0     , , 29    4.53  , 3.93  , New   , 0.6
> > 30    , 5     , , 29    4.55  , 3.88  , New   , 0.67
> > 28    , 0     , , 29    4.49  , 3.9   , New   , 0.59
> > 28    , 5     , , 29    4.44  , 3.94  , New   , 0.5
> > 31    , 0     , , 30    4.41  , 3.85  , New   , 0.56
> > 31    , 6     , , 30    4.48  , 3.86  , New   , 0.62
> > 29    , 0     , , 30    4.55  , 3.94  , New   , 0.61
> > 29    , 6     , , 30    4.32  , 3.95  , New   , 0.37
> > 32    , 0     , , 31    4.36  , 3.91  , New   , 0.45
> > 32    , 7     , , 31    4.37  , 3.89  , New   , 0.48
> > 30    , 0     , , 31    4.65  , 3.9   , New   , 0.75
> > 30    , 7     , , 31    4.42  , 3.93  , New   , 0.49
> >
> >  sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
> >  1 file changed, 349 insertions(+), 231 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> > index 6dd5d67b90..65c16ef8a4 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> > @@ -26,14 +26,28 @@
> >
> >  # ifdef USE_AS_WMEMCHR
> >  #  define VPBROADCAST        vpbroadcastd
> > -#  define VPCMP              vpcmpd
> > -#  define SHIFT_REG  r8d
> > +#  define VPMINU     vpminud
> > +#  define VPCMP      vpcmpd
> > +#  define VPCMPEQ    vpcmpeqd
> > +#  define CHAR_SIZE  4
> >  # else
> >  #  define VPBROADCAST        vpbroadcastb
> > -#  define VPCMP              vpcmpb
> > -#  define SHIFT_REG  ecx
> > +#  define VPMINU     vpminub
> > +#  define VPCMP      vpcmpb
> > +#  define VPCMPEQ    vpcmpeqb
> > +#  define CHAR_SIZE  1
> >  # endif
> >
> > +# ifdef USE_AS_RAWMEMCHR
> > +#  define RAW_PTR_REG        rcx
> > +#  define ALGN_PTR_REG       rdi
> > +# else
> > +#  define RAW_PTR_REG        rdi
> > +#  define ALGN_PTR_REG       rcx
> > +# endif
> > +
> > +#define XZERO                xmm23
>
> Add a space before define.  Rename XZERO to XMMZERO.

Done.

>
> > +#define YZERO                ymm23
>
> Add a space before define.  Rename YZERO to YMMZERO.

Done.

>
> >  # define XMMMATCH    xmm16
> >  # define YMMMATCH    ymm16
> >  # define YMM1                ymm17
> > @@ -44,18 +58,16 @@
> >  # define YMM6                ymm22
> >
> >  # define VEC_SIZE 32
> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> > +# define PAGE_SIZE 4096
> >
> >       .section .text.evex,"ax",@progbits
> > -ENTRY (MEMCHR)
> > +ENTRY(MEMCHR)
>
> No need for this change.

Fixed.

>
> >  # ifndef USE_AS_RAWMEMCHR
> >       /* Check for zero length.  */
> >       test    %RDX_LP, %RDX_LP
> >       jz      L(zero)
> > -# endif
> > -     movl    %edi, %ecx
> > -# ifdef USE_AS_WMEMCHR
> > -     shl     $2, %RDX_LP
> > -# else
> > +
> >  #  ifdef __ILP32__
> >       /* Clear the upper 32 bits.  */
> >       movl    %edx, %edx
> > @@ -63,319 +75,425 @@ ENTRY (MEMCHR)
> >  # endif
> >       /* Broadcast CHAR to YMMMATCH.  */
> >       VPBROADCAST %esi, %YMMMATCH
> > -     /* Check if we may cross page boundary with one vector load.  */
> > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > -     cmpl    $VEC_SIZE, %ecx
> > -     ja      L(cros_page_boundary)
> > +     /* Check if we may cross page boundary with one
> > +        vector load.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     movl    %edi, %eax
> > +     andl    $(PAGE_SIZE - 1), %eax
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +     ja      L(cross_page_boundary)
> >
> >       /* Check the first VEC_SIZE bytes.  */
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -
> > +     VPCMP   $0, (%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> >  # ifndef USE_AS_RAWMEMCHR
> > -     jnz     L(first_vec_x0_check)
> > -     /* Adjust length and check the end of data.  */
> > -     subq    $VEC_SIZE, %rdx
> > -     jbe     L(zero)
> > +     /* If length < CHAR_PER_VEC handle special.  */
> > +     cmpq    $CHAR_PER_VEC, %rdx
> > +     jbe     L(first_vec_x0)
> > +# endif
> > +     testl   %eax, %eax
> > +     jz      L(aligned_more)
> > +     tzcntl  %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > +     /* NB: Multiply wchar_t count by CHAR_SIZE to get the
> > +        number of bytes.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> > -     jnz     L(first_vec_x0)
> > +     addq    %rdi, %rax
> >  # endif
> > -
> > -     /* Align data for aligned loads in the loop.  */
> > -     addq    $VEC_SIZE, %rdi
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > +     ret
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > -     addq    %rcx, %rdx
> > -
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > -     jmp     L(more_4x_vec)
> > +L(zero):
> > +     xorl    %eax, %eax
> > +     ret
> >
> > +     .p2align 5
> > +L(first_vec_x0):
> > +     /* Check if first match was before length.  */
> > +     tzcntl  %eax, %eax
> > +     xorl    %ecx, %ecx
> > +     cmpl    %eax, %edx
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +     cmovle  %rcx, %rax
> > +     ret
> > +# else
> > +     /* NB: first_vec_x0 is 17 bytes which will leave
> > +        cross_page_boundary (which is relatively cold) close
> > +        enough to ideal alignment. So only realign
> > +        L(cross_page_boundary) if rawmemchr.  */
>
> Fit comments to 72 columns.

Fixed.

>
> >       .p2align 4
> > -L(cros_page_boundary):
> > -     andl    $(VEC_SIZE - 1), %ecx
> > +# endif
> > +L(cross_page_boundary):
> > +     /* Save pointer before aligning as its original
> > +        value is necessary for computing the return address if byte is
> > +        found or adjusting length if it is not and this is
> > +        memchr.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     movq    %rdi, %rcx
> > +     /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
> > +        for memchr and rdi for rawmemchr.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     andq    $-VEC_SIZE, %ALGN_PTR_REG
> > +     VPCMP   $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> > +     kmovd   %k0, %r8d
> >  # ifdef USE_AS_WMEMCHR
> > -     /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > -        bytes.  */
> > -     movl    %ecx, %SHIFT_REG
> > -     sarl    $2, %SHIFT_REG
> > +     /* NB: Divide shift count by 4 since each bit in
> > +        K0 represent 4 bytes.  */
> > +     sarl    $2, %eax
> > +# endif
> > +# ifndef USE_AS_RAWMEMCHR
> > +     movl    $(PAGE_SIZE / CHAR_SIZE), %esi
> > +     subl    %eax, %esi
> >  # endif
> > -     andq    $-VEC_SIZE, %rdi
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     /* Remove the leading bytes.  */
> > -     sarxl   %SHIFT_REG, %eax, %eax
> > -     testl   %eax, %eax
> > -     jz      L(aligned_more)
> > -     tzcntl  %eax, %eax
> >  # ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +     andl    $(CHAR_PER_VEC - 1), %eax
> >  # endif
> > +     /* Remove the leading bytes.  */
> > +     sarxl   %eax, %r8d, %eax
> >  # ifndef USE_AS_RAWMEMCHR
> >       /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > +     cmpq    %rsi, %rdx
> > +     jbe     L(first_vec_x0)
> > +# endif
> > +     testl   %eax, %eax
> > +     jz      L(cross_page_continue)
> > +     tzcntl  %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
> > +     leaq    (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> > +# else
> > +     addq    %RAW_PTR_REG, %rax
> >  # endif
> > -     addq    %rdi, %rax
> > -     addq    %rcx, %rax
> >       ret
> >
> >       .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > -        instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > -        overflow.  */
> > -     negq    %rcx
> > -     addq    $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > +     tzcntl  %eax, %eax
> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> >
> > -     /* Check the end of data.  */
> > -     subq    %rcx, %rdx
> > -     jbe     L(zero)
> > -# endif
> > +     .p2align 4
> > +L(first_vec_x2):
> > +     tzcntl  %eax, %eax
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> >
> > -     addq    $VEC_SIZE, %rdi
> > +     .p2align 4
> > +L(first_vec_x3):
> > +     tzcntl  %eax, %eax
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> > +
> > +     .p2align 4
> > +L(first_vec_x4):
> > +     tzcntl  %eax, %eax
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> > +
> > +     .p2align 5
> > +L(aligned_more):
> > +     /* Check the first 4 * VEC_SIZE.  Only one
> > +        VEC_SIZE at a time since data is only aligned to
> > +        VEC_SIZE.  */
>
> Fit comments to 72 columns.

Fixed.

>
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     subq    $(VEC_SIZE * 4), %rdx
> > +     /* Align data to VEC_SIZE.  */
> > +L(cross_page_continue):
> > +     xorl    %ecx, %ecx
> > +     subl    %edi, %ecx
> > +     andq    $-VEC_SIZE, %rdi
> > +     /* esi is for adjusting length to see if near the
> > +        end.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     leal    (VEC_SIZE * 5)(%rdi, %rcx), %esi
> > +#  ifdef USE_AS_WMEMCHR
> > +     /* NB: Divide bytes by 4 to get the wchar_t
> > +        count.  */
> > +     sarl    $2, %esi
> > +#  endif
> > +# else
> > +     andq    $-VEC_SIZE, %rdi
> > +L(cross_page_continue):
> > +# endif
> > +     /* Load first VEC regardless.  */
> > +     VPCMP   $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > +     /* Adjust length. If near end handle specially.
> > +      */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     subq    %rsi, %rdx
> >       jbe     L(last_4x_vec_or_less)
> >  # endif
> > -
> > -L(more_4x_vec):
> > -     /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > -        since data is only aligned to VEC_SIZE.  */
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x1)
> >
> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x2)
> >
> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x3)
> >
> > -     addq    $(VEC_SIZE * 4), %rdi
> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x4)
> > +
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > +     /* Check if at last CHAR_PER_VEC * 4 length.  */
> > +     subq    $(CHAR_PER_VEC * 4), %rdx
> > +     jbe     L(last_4x_vec_or_less_cmpeq)
> > +     addq    $VEC_SIZE, %rdi
> >
> > -     /* Align data to 4 * VEC_SIZE.  */
> > -     movq    %rdi, %rcx
> > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > +     /* Align data to VEC_SIZE * 4 for the loop and
> > +        readjust length.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +#  ifdef USE_AS_WMEMCHR
> > +     movl    %edi, %ecx
> >       andq    $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > +     /* NB: Divide bytes by 4 to get the wchar_t
> > +        count.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     sarl    $2, %ecx
> >       addq    %rcx, %rdx
> > +#  else
> > +     addq    %rdi, %rdx
> > +     andq    $-(4 * VEC_SIZE), %rdi
> > +     subq    %rdi, %rdx
> > +#  endif
> > +# else
> > +     addq    $VEC_SIZE, %rdi
> > +     andq    $-(4 * VEC_SIZE), %rdi
> >  # endif
> >
> > +     vpxorq  %XZERO, %XZERO, %XZERO
> > +
> > +     /* Compare 4 * VEC at a time forward.  */
> >       .p2align 4
> >  L(loop_4x_vec):
> > -     /* Compare 4 * VEC at a time forward.  */
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> > -     kord    %k1, %k2, %k5
> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> > -
> > -     kord    %k3, %k4, %k6
> > -     kortestd %k5, %k6
> > -     jnz     L(4x_vec_end)
> > -
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -
> > +     /* It would be possible to save some instructions
> > +        using 4x VPCMP but bottleneck on port 5 makes it not worth
> > +        it.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     VPCMP   $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> > +     /* xor will set bytes matching esi to zero.  */
> > +     vpxorq  (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> > +     vpxorq  (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> > +     VPCMP   $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> > +     /* Reduce VEC2 / VEC3 with min and VEC1 with zero
> > +        mask.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     VPMINU  %YMM2, %YMM3, %YMM3 {%k1} {z}
> > +     VPCMP   $0, %YMM3, %YZERO, %k2
> >  # ifdef USE_AS_RAWMEMCHR
> > -     jmp     L(loop_4x_vec)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     kortestd %k2, %k3
> > +     jz      L(loop_4x_vec)
> >  # else
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     ja      L(loop_4x_vec)
> > +     kortestd %k2, %k3
> > +     jnz     L(loop_4x_vec_end)
> >
> > -L(last_4x_vec_or_less):
> > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     jle     L(last_2x_vec)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> >
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > +     subq    $(CHAR_PER_VEC * 4), %rdx
> > +     ja      L(loop_4x_vec)
> >
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     /* Fall through into less than 4 remaining
> > +        vectors of length case.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     addq    $(VEC_SIZE * 3), %rdi
> > +     .p2align 4
> > +L(last_4x_vec_or_less):
> > +     /* Check if first VEC contained match.  */
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > +     jnz     L(first_vec_x1_check)
> >
> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > +     /* If remaining length > CHAR_PER_VEC * 2.  */
> > +     addl    $(CHAR_PER_VEC * 2), %edx
> > +     jg      L(last_4x_vec)
> >
> > -     jnz     L(first_vec_x2_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > +L(last_2x_vec):
> > +     /* If remaining length < CHAR_PER_VEC.  */
> > +     addl    $CHAR_PER_VEC, %edx
> > +     jle     L(zero_end)
> > +
> > +     /* Check VEC2 and compare any match with
> > +        remaining length.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     tzcntl  %eax, %eax
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end):
> > +     ret
> >
> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> >
> > -     jnz     L(first_vec_x3_check)
> > +     .p2align 4
> > +L(first_vec_x1_check):
> > +     tzcntl  %eax, %eax
> > +     /* Adjust length.  */
> > +     subl    $-(CHAR_PER_VEC * 4), %edx
> > +     /* Check if match within remaining length.  */
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> > +L(set_zero_end):
> >       xorl    %eax, %eax
> >       ret
> >
> >       .p2align 4
> > -L(last_2x_vec):
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > +L(loop_4x_vec_end):
> > +# endif
> > +     /* rawmemchr will fall through into this if match
> > +        was found in loop.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +
> > +     /* k1 holds the inverse of the matches with VEC1.  */
> >       kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > +     subl    $((1 << CHAR_PER_VEC) - 1), %eax
> > +# else
> > +     incl    %eax
> > +# endif
> > +     jnz     L(last_vec_x1_return)
> >
> > -     jnz     L(first_vec_x0_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > +     VPCMP   $0, %YMM2, %YZERO, %k0
> > +     kmovd   %k0, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2_return)
> >
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     kmovd   %k2, %eax
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1_check)
> > -     xorl    %eax, %eax
> > -     ret
> > +     jnz     L(last_vec_x3_return)
> >
> > -     .p2align 4
> > -L(first_vec_x0_check):
> > +     kmovd   %k3, %eax
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +     leaq    (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    %rdi, %rax
> >       ret
> >
> >       .p2align 4
> > -L(first_vec_x1_check):
> > +L(last_vec_x1_return):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > -# endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $VEC_SIZE, %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +#  ifdef USE_AS_WMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +#  else
> >       addq    %rdi, %rax
> > -     ret
> > -
> > -     .p2align 4
> > -L(first_vec_x2_check):
> > -     tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +#  endif
> > +# else
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 2), %rax
> > -     addq    %rdi, %rax
> >       ret
> >
> >       .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
> > +     leaq    (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 3), %rax
> > -     addq    %rdi, %rax
> >       ret
> >
> >       .p2align 4
> > -L(zero):
> > -     xorl    %eax, %eax
> > -     ret
> > -# endif
> > -
> > -     .p2align 4
> > -L(first_vec_x0):
> > +L(last_vec_x3_return):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    (%rdi, %rax, 4), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> > -     addq    %rdi, %rax
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     leaq    (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> >       ret
> >
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(last_4x_vec_or_less_cmpeq):
> > +     VPCMP   $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     /* Check first VEC regardless.  */
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x1_check)
> > +
> > +     /* If remaining length <= CHAR_PER_VEC * 2.  */
> > +     addl    $(CHAR_PER_VEC * 2), %edx
> > +     jle     L(last_2x_vec)
> > +
> >       .p2align 4
> > -L(first_vec_x1):
> > +L(last_4x_vec):
> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2)
> > +
> > +
> > +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     /* Create mask for possible matches within
> > +        remaining length.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +#  ifdef USE_AS_WMEMCHR
> > +     movl    $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> > +     bzhil   %edx, %ecx, %ecx
> > +#  else
> > +     movq    $-1, %rcx
> > +     bzhiq   %rdx, %rcx, %rcx
> > +#  endif
> > +     /* Test matches in data against length match.  */
> > +     andl    %ecx, %eax
> > +     jnz     L(last_vec_x3)
> > +
> > +     /* if remaining length <= CHAR_PER_VEC * 3 (Note
> > +        this is after remaining length was found to be >
> > +        CHAR_PER_VEC * 2).  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     subl    $CHAR_PER_VEC, %edx
> > +     jbe     L(zero_end2)
> > +
> > +
> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     /* Shift remaining length mask for last VEC.  */
> > +#  ifdef USE_AS_WMEMCHR
> > +     shrl    $CHAR_PER_VEC, %ecx
> > +#  else
> > +     shrq    $CHAR_PER_VEC, %rcx
> > +#  endif
> > +     andl    %ecx, %eax
> > +     jz      L(zero_end2)
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > -     addq    $VEC_SIZE, %rax
> > -     addq    %rdi, %rax
> > -# endif
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end2):
> >       ret
> >
> > -     .p2align 4
> > -L(first_vec_x2):
> > +L(last_vec_x2):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> > -# else
> > -     addq    $(VEC_SIZE * 2), %rax
> > -     addq    %rdi, %rax
> > -# endif
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >       ret
> >
> >       .p2align 4
> > -L(4x_vec_end):
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -     kmovd   %k2, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > -     kmovd   %k3, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x2)
> > -     kmovd   %k4, %eax
> > -     testl   %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> > -# else
> > -     addq    $(VEC_SIZE * 3), %rax
> > -     addq    %rdi, %rax
> > -# endif
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> >       ret
> > +# endif
> >
> > -END (MEMCHR)
> > +END(MEMCHR)
>
> No need for this change.

Fixed.
>
> >  #endif
> > --
> > 2.29.2
> >
>
> Thanks.
>
> H.J.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b90..65c16ef8a4 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@ 
 
 # ifdef USE_AS_WMEMCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPMINU	vpminud
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPMINU	vpminub
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 # endif
 
+# ifdef USE_AS_RAWMEMCHR
+#  define RAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define RAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+# endif
+
+#define XZERO		xmm23
+#define YZERO		ymm23
 # define XMMMATCH	xmm16
 # define YMMMATCH	ymm16
 # define YMM1		ymm17
@@ -44,18 +58,16 @@ 
 # define YMM6		ymm22
 
 # define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
 
 	.section .text.evex,"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY(MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
 	jz	L(zero)
-# endif
-	movl	%edi, %ecx
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
+
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -63,319 +75,425 @@  ENTRY (MEMCHR)
 # endif
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Check if we may cross page boundary with one
+	   vector load.  */
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-
+	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
+	/* If length <= CHAR_PER_VEC handle specially.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	jnz	L(first_vec_x0)
+	addq	%rdi, %rax
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	ret
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	jmp	L(more_4x_vec)
+L(zero):
+	xorl	%eax, %eax
+	ret
 
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	cmovle	%rcx, %rax
+	ret
+# else
+	/* NB: first_vec_x0 is 17 bytes which will leave
+	   cross_page_boundary (which is relatively cold) close
+	   enough to ideal alignment. So only realign
+	   L(cross_page_boundary) if rawmemchr.  */
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original
+	   value is necessary for computing the return address if byte is
+	   found or adjusting length if it is not and this is
+	   memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
+	   for memchr and rdi for rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	kmovd	%k0, %r8d
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	/* NB: Divide shift count by 4 since each bit in
+	   K0 represent 4 bytes.  */
+	sarl	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
+	subl	%eax, %esi
 # endif
-	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	/* Remove the leading bytes.  */
-	sarxl	%SHIFT_REG, %eax, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
 # endif
+	/* Remove the leading bytes.  */
+	sarxl	%eax, %r8d, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+	addq	%RAW_PTR_REG, %rax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
 	ret
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 5
+L(aligned_more):
+	/* Check the first 4 * VEC_SIZE.  Only one
+	   VEC_SIZE at a time since data is only aligned to
+	   VEC_SIZE.  */
 
 # ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
+	/* Align data to VEC_SIZE.  */
+L(cross_page_continue):
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	andq	$-VEC_SIZE, %rdi
+	/* esi is for adjusting length to see if near the
+	   end.  */
+	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t
+	   count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.
+	 */
+	subq	%rsi, %rdx
 	jbe	L(last_4x_vec_or_less)
 # endif
-
-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
 
 # ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	/* Check if at last CHAR_PER_VEC * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	addq	$VEC_SIZE, %rdi
 
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
+	/* Align data to VEC_SIZE * 4 for the loop and
+	   readjust length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	%edi, %ecx
 	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* NB: Divide bytes by 4 to get the wchar_t
+	   count.  */
+	sarl	$2, %ecx
 	addq	%rcx, %rdx
+#  else
+	addq	%rdi, %rdx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rdx
+#  endif
+# else
+	addq	$VEC_SIZE, %rdi
+	andq	$-(4 * VEC_SIZE), %rdi
 # endif
 
+	vpxorq	%XZERO, %XZERO, %XZERO
+
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	/* It would be possible to save some instructions
+	   using 4x VPCMP but bottleneck on port 5 makes it not worth
+	   it.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	/* xor will set bytes matching esi to zero.  */
+	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	/* Reduce VEC2 / VEC3 with min and VEC1 with zero
+	   mask.  */
+	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPCMP	$0, %YMM3, %YZERO, %k2
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd %k2, %k3
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec)
+	kortestd %k2, %k3
+	jnz	L(loop_4x_vec_end)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
 
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop_4x_vec)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	/* Fall through into less than 4 remaining
+	   vectors of length case.  */
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	addq	$(VEC_SIZE * 3), %rdi
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	/* If remaining length > CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jg	L(last_4x_vec)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < CHAR_PER_VEC.  */
+	addl	$CHAR_PER_VEC, %edx
+	jle	L(zero_end)
+
+	/* Check VEC2 and compare any match with
+	   remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+	ret
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
 
-	jnz	L(first_vec_x3_check)
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Adjust length.  */
+	subl	$-(CHAR_PER_VEC * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match
+	   was found in loop.  */
+
+	/* k1 holds the inverse of the matches with VEC1.  */
 	kmovd	%k1, %eax
-	testl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+# else
+	incl	%eax
+# endif
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+	VPCMP	$0, %YMM2, %YZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	kmovd	%k2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	ret
+	jnz	L(last_vec_x3_return)
 
-	.p2align 4
-L(first_vec_x0_check):
+	kmovd	%k3, %eax
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Scale the wchar_t index by CHAR_SIZE to get
+	   the byte offset of the match.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
 	addq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(first_vec_x2_check):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+#  endif
+# else
+	/* NB: Scale the char index by CHAR_SIZE to get the
+	   byte offset of the match.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Scale the char index by CHAR_SIZE to get the
+	   byte offset of the match.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	/* NB: Scale the char index by CHAR_SIZE to get the
+	   byte offset of the match.  */
+	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Scale the char index by CHAR_SIZE to get the
+	   byte offset of the match.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
+	/* NB: Scale the char index by CHAR_SIZE to get the
+	   byte offset of the match.  */
+	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 	ret
 
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jle	L(last_2x_vec)
+
 	.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Create a mask of the positions that lie within
+	   the remaining length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+	bzhil	%edx, %ecx, %ecx
+#  else
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+#  endif
+	/* Test matches in data against the length mask.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* If remaining length <= CHAR_PER_VEC * 3.  (Note
+	   this is after remaining length was found to be >
+	   CHAR_PER_VEC * 2.)  */
+	subl	$CHAR_PER_VEC, %edx
+	jbe	L(zero_end2)
+
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Shift the remaining-length mask down for the last VEC.  */
+#  ifdef USE_AS_WMEMCHR
+	shrl	$CHAR_PER_VEC, %ecx
+#  else
+	shrq	$CHAR_PER_VEC, %rcx
+#  endif
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
 	ret
 
-	.p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
-L(4x_vec_end):
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
+# endif
 
-END (MEMCHR)
+END(MEMCHR)
 #endif