| Message ID | 20220415055132.1257272-3-goldstein.w.n@gmail.com |
| --- | --- |
| State | New |
| Series | [v1,1/3] x86: Optimize memcmp SSE2 in memcmp.S |
On Thu, Apr 14, 2022 at 10:51 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Old code was both inefficient and wasted code size. New code (-62
> bytes) and comparable or better performance in the page cross case.
>
> geometric_mean(N=20) of page cross cases New / Original: 0.960
>
> size, align0, align1, ret, New Time/Old Time
> 1, 4095, 0, 0, 1.001
> 1, 4095, 0, 1, 0.999
> 1, 4095, 0, -1, 1.0
> 2, 4094, 0, 0, 1.0
> 2, 4094, 0, 1, 1.0
> 2, 4094, 0, -1, 1.0
> 3, 4093, 0, 0, 1.0
> 3, 4093, 0, 1, 1.0
> 3, 4093, 0, -1, 1.0
> 4, 4092, 0, 0, 0.987
> 4, 4092, 0, 1, 1.0
> 4, 4092, 0, -1, 1.0
> 5, 4091, 0, 0, 0.984
> 5, 4091, 0, 1, 1.002
> 5, 4091, 0, -1, 1.005
> 6, 4090, 0, 0, 0.993
> 6, 4090, 0, 1, 1.001
> 6, 4090, 0, -1, 1.003
> 7, 4089, 0, 0, 0.991
> 7, 4089, 0, 1, 1.0
> 7, 4089, 0, -1, 1.001
> 8, 4088, 0, 0, 0.875
> 8, 4088, 0, 1, 0.881
> 8, 4088, 0, -1, 0.888
> 9, 4087, 0, 0, 0.872
> 9, 4087, 0, 1, 0.879
> 9, 4087, 0, -1, 0.883
> 10, 4086, 0, 0, 0.878
> 10, 4086, 0, 1, 0.886
> 10, 4086, 0, -1, 0.873
> 11, 4085, 0, 0, 0.878
> 11, 4085, 0, 1, 0.881
> 11, 4085, 0, -1, 0.879
> 12, 4084, 0, 0, 0.873
> 12, 4084, 0, 1, 0.889
> 12, 4084, 0, -1, 0.875
> 13, 4083, 0, 0, 0.873
> 13, 4083, 0, 1, 0.863
> 13, 4083, 0, -1, 0.863
> 14, 4082, 0, 0, 0.838
> 14, 4082, 0, 1, 0.869
> 14, 4082, 0, -1, 0.877
> 15, 4081, 0, 0, 0.841
> 15, 4081, 0, 1, 0.869
> 15, 4081, 0, -1, 0.876
> 16, 4080, 0, 0, 0.988
> 16, 4080, 0, 1, 0.99
> 16, 4080, 0, -1, 0.989
> 17, 4079, 0, 0, 0.978
> 17, 4079, 0, 1, 0.981
> 17, 4079, 0, -1, 0.98
> 18, 4078, 0, 0, 0.981
> 18, 4078, 0, 1, 0.98
> 18, 4078, 0, -1, 0.985
> 19, 4077, 0, 0, 0.977
> 19, 4077, 0, 1, 0.979
> 19, 4077, 0, -1, 0.986
> 20, 4076, 0, 0, 0.977
> 20, 4076, 0, 1, 0.986
> 20, 4076, 0, -1, 0.984
> 21, 4075, 0, 0, 0.977
> 21, 4075, 0, 1, 0.983
> 21, 4075, 0, -1, 0.988
> 22, 4074, 0, 0, 0.983
> 22, 4074, 0, 1, 0.994
> 22, 4074, 0, -1, 0.993
> 23, 4073, 0, 0, 0.98
> 23, 4073, 0, 1, 0.992
> 23, 4073, 0, -1, 0.995
> 24, 4072, 0, 0, 0.989
> 24, 4072, 0, 1, 0.989
> 24, 4072, 0, -1, 0.991
> 25, 4071, 0, 0, 0.99
> 25, 4071, 0, 1, 0.999
> 25, 4071, 0, -1, 0.996
> 26, 4070, 0, 0, 0.993
> 26, 4070, 0, 1, 0.995
> 26, 4070, 0, -1, 0.998
> 27, 4069, 0, 0, 0.993
> 27, 4069, 0, 1, 0.999
> 27, 4069, 0, -1, 1.0
> 28, 4068, 0, 0, 0.997
> 28, 4068, 0, 1, 1.0
> 28, 4068, 0, -1, 0.999
> 29, 4067, 0, 0, 0.996
> 29, 4067, 0, 1, 0.999
> 29, 4067, 0, -1, 0.999
> 30, 4066, 0, 0, 0.991
> 30, 4066, 0, 1, 1.001
> 30, 4066, 0, -1, 0.999
> 31, 4065, 0, 0, 0.988
> 31, 4065, 0, 1, 0.998
> 31, 4065, 0, -1, 0.998
> ---
>  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
>  1 file changed, 61 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> index a34ea1645d..210c9925b6 100644
> --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> @@ -429,22 +429,21 @@ L(page_cross_less_vec):
>  # ifndef USE_AS_WMEMCMP
>  	cmpl	$8, %edx
>  	jae	L(between_8_15)
> +	/* Fall through for [4, 7]. */
>  	cmpl	$4, %edx
> -	jae	L(between_4_7)
> +	jb	L(between_2_3)
>
> -	/* Load as big endian to avoid branches. */
> -	movzwl	(%rdi), %eax
> -	movzwl	(%rsi), %ecx
> -	shll	$8, %eax
> -	shll	$8, %ecx
> -	bswap	%eax
> -	bswap	%ecx
> -	movzbl	-1(%rdi, %rdx), %edi
> -	movzbl	-1(%rsi, %rdx), %esi
> -	orl	%edi, %eax
> -	orl	%esi, %ecx
> -	/* Subtraction is okay because the upper 8 bits are zero. */
> -	subl	%ecx, %eax
> +	movbe	(%rdi), %eax
> +	movbe	(%rsi), %ecx
> +	shlq	$32, %rax
> +	shlq	$32, %rcx
> +	movbe	-4(%rdi, %rdx), %edi
> +	movbe	-4(%rsi, %rdx), %esi
> +	orq	%rdi, %rax
> +	orq	%rsi, %rcx
> +	subq	%rcx, %rax
> +	/* Fast path for return zero. */
> +	jnz	L(ret_nonzero)
>  	/* No ymm register was touched. */
>  	ret
>
> @@ -457,9 +456,33 @@ L(one_or_less):
>  	/* No ymm register was touched. */
>  	ret
>
> +	.p2align 4,, 5
> +L(ret_nonzero):
> +	sbbl	%eax, %eax
> +	orl	$1, %eax
> +	/* No ymm register was touched. */
> +	ret
> +
> +	.p2align 4,, 2
> +L(zero):
> +	xorl	%eax, %eax
> +	/* No ymm register was touched. */
> +	ret
> +
>  	.p2align 4
>  L(between_8_15):
> -# endif
> +	movbe	(%rdi), %rax
> +	movbe	(%rsi), %rcx
> +	subq	%rcx, %rax
> +	jnz	L(ret_nonzero)
> +	movbe	-8(%rdi, %rdx), %rax
> +	movbe	-8(%rsi, %rdx), %rcx
> +	subq	%rcx, %rax
> +	/* Fast path for return zero. */
> +	jnz	L(ret_nonzero)
> +	/* No ymm register was touched. */
> +	ret
> +# else
>  	/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
>  	vmovq	(%rdi), %xmm1
>  	vmovq	(%rsi), %xmm2
> @@ -475,16 +498,13 @@ L(between_8_15):
>  	VPCMPEQ	%xmm1, %xmm2, %xmm2
>  	vpmovmskb %xmm2, %eax
>  	subl	$0xffff, %eax
> +	/* Fast path for return zero. */
>  	jnz	L(return_vec_0)
>  	/* No ymm register was touched. */
>  	ret
> +# endif
>
> -	.p2align 4
> -L(zero):
> -	xorl	%eax, %eax
> -	ret
> -
> -	.p2align 4
> +	.p2align 4,, 10
>  L(between_16_31):
>  	/* From 16 to 31 bytes.  No branch when size == 16. */
>  	vmovdqu	(%rsi), %xmm2
> @@ -501,11 +521,17 @@ L(between_16_31):
>  	VPCMPEQ	(%rdi), %xmm2, %xmm2
>  	vpmovmskb %xmm2, %eax
>  	subl	$0xffff, %eax
> +	/* Fast path for return zero. */
>  	jnz	L(return_vec_0)
>  	/* No ymm register was touched. */
>  	ret
>
>  # ifdef USE_AS_WMEMCMP
> +	.p2align 4,, 2
> +L(zero):
> +	xorl	%eax, %eax
> +	ret
> +
>  	.p2align 4
>  L(one_or_less):
>  	jb	L(zero)
> @@ -520,22 +546,20 @@ L(one_or_less):
>  # else
>
>  	.p2align 4
> -L(between_4_7):
> -	/* Load as big endian with overlapping movbe to avoid branches.
> -	 */
> -	movbe	(%rdi), %eax
> -	movbe	(%rsi), %ecx
> -	shlq	$32, %rax
> -	shlq	$32, %rcx
> -	movbe	-4(%rdi, %rdx), %edi
> -	movbe	-4(%rsi, %rdx), %esi
> -	orq	%rdi, %rax
> -	orq	%rsi, %rcx
> -	subq	%rcx, %rax
> -	jz	L(zero_4_7)
> -	sbbl	%eax, %eax
> -	orl	$1, %eax
> -L(zero_4_7):
> +L(between_2_3):
> +	/* Load as big endian to avoid branches. */
> +	movzwl	(%rdi), %eax
> +	movzwl	(%rsi), %ecx
> +	bswap	%eax
> +	bswap	%ecx
> +	shrl	%eax
> +	shrl	%ecx
> +	movzbl	-1(%rdi, %rdx), %edi
> +	movzbl	-1(%rsi, %rdx), %esi
> +	orl	%edi, %eax
> +	orl	%esi, %ecx
> +	/* Subtraction is okay because the upper bit is zero. */
> +	subl	%ecx, %eax
>  	/* No ymm register was touched. */
>  	ret
>  # endif
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
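For readers following the quoted diff: the new fall-through path handles lengths in [4, 7] with two overlapping big-endian (movbe) loads per buffer and a single 64-bit subtraction, so there is no length-dependent branching. A rough C model of that idea is below; the function and helper names are illustrative (not from glibc), and this is a sketch of the technique rather than the actual implementation.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative helper: load 4 bytes as a big-endian value, which is
   what a single movbe instruction does in the patch.  */
static inline uint64_t
load32_be (const unsigned char *p)
{
  uint32_t v;
  memcpy (&v, p, 4);            /* unaligned load */
  return __builtin_bswap32 (v);
}

/* Branchless compare for 4 <= n <= 7: the head load covers bytes
   [0, 4) and the tail load covers [n - 4, n), so together they cover
   the whole range even though they may overlap.  Packing the head
   above the tail in one 64-bit word preserves lexicographic order.  */
static int
cmp_4_to_7 (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;
  uint64_t a = (load32_be (p1) << 32) | load32_be (p1 + n - 4);
  uint64_t b = (load32_be (p2) << 32) | load32_be (p2 + n - 4);
  if (a == b)
    return 0;                   /* fast path for return zero */
  /* Mirrors the "sbbl %eax, %eax; orl $1, %eax" tail: -1 or 1.  */
  return a < b ? -1 : 1;
}
```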
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index a34ea1645d..210c9925b6 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
 # ifndef USE_AS_WMEMCMP
 	cmpl	$8, %edx
 	jae	L(between_8_15)
+	/* Fall through for [4, 7]. */
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jb	L(between_2_3)

-	/* Load as big endian to avoid branches. */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero. */
-	subl	%ecx, %eax
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero. */
+	jnz	L(ret_nonzero)
 	/* No ymm register was touched. */
 	ret

@@ -457,9 +456,33 @@ L(one_or_less):
 	/* No ymm register was touched. */
 	ret

+	.p2align 4,, 5
+L(ret_nonzero):
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	/* No ymm register was touched. */
+	ret
+
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	/* No ymm register was touched. */
+	ret
+
 	.p2align 4
 L(between_8_15):
-# endif
+	movbe	(%rdi), %rax
+	movbe	(%rsi), %rcx
+	subq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	movbe	-8(%rdi, %rdx), %rax
+	movbe	-8(%rsi, %rdx), %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero. */
+	jnz	L(ret_nonzero)
+	/* No ymm register was touched. */
+	ret
+# else
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
 	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero. */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched. */
 	ret
+# endif

-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
+	.p2align 4,, 10
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16. */
 	vmovdqu	(%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
 	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero. */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched. */
 	ret

 # ifdef USE_AS_WMEMCMP
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(one_or_less):
 	jb	L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
 # else

 	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+L(between_2_3):
+	/* Load as big endian to avoid branches. */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	bswap	%eax
+	bswap	%ecx
+	shrl	%eax
+	shrl	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper bit is zero. */
+	subl	%ecx, %eax
 	/* No ymm register was touched. */
 	ret
 # endif
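The new L(between_8_15) block in the diff above applies the same movbe idea with two 8-byte big-endian loads per buffer and an early exit once the first pair differs, so the all-equal case runs straight through to a zero return. A hedged C sketch of the equivalent logic follows; names are illustrative (not glibc's), and unaligned loads are modeled with memcpy plus __builtin_bswap64.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative helper: 8-byte big-endian load, as one movbe does.  */
static inline uint64_t
load64_be (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, 8);            /* unaligned load */
  return __builtin_bswap64 (v);
}

/* Equivalent of the sbb/or tail: map an unequal pair to -1 or 1.  */
static inline int
sign_from_be (uint64_t a, uint64_t b)
{
  return a < b ? -1 : 1;
}

/* Compare 8 <= n <= 15 with two possibly overlapping 8-byte loads:
   bytes [0, 8) first, then bytes [n - 8, n) only if the first pair
   was equal.  */
static int
cmp_8_to_15 (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;
  uint64_t a = load64_be (p1);
  uint64_t b = load64_be (p2);
  if (a != b)
    return sign_from_be (a, b);
  a = load64_be (p1 + n - 8);
  b = load64_be (p2 + n - 8);
  if (a != b)
    return sign_from_be (a, b);
  return 0;                     /* fast path for return zero */
}
```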