@@ -19,10 +19,13 @@
#include "sysdep.h"
ENTRY ( __strcmp_sse2_unaligned)
- movl %edi, %eax
- xorl %edx, %edx
pxor %xmm7, %xmm7
- orl %esi, %eax
+ /* Use the byte loop if reading 64 bytes from either string could
+    cross a page boundary (page offset greater than 4096 - 64).  */
+ movl %esi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+
+ movl %edi, %eax
andl $4095, %eax
cmpl $4032, %eax
jg L(cross_page)
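
The prologue now tests each pointer's page offset exactly, instead of OR-ing the two addresses together; the OR could exceed the threshold even when both 64-byte reads were safe. A minimal C sketch of the predicate being computed (assuming 4 KiB pages; the helper name is mine):

#include <stdint.h>

/* Illustrative helper: nonzero iff a 64-byte load from P would cross
   a 4096-byte page boundary, i.e. its page offset exceeds 4096 - 64.  */
static int
may_cross_page (const void *p)
{
  return ((uintptr_t) p & 4095) > 4032;
}

Offsets up to and including 4032 keep all four 16-byte loads inside the page, which is why the branch is jg rather than jge.
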
@@ -30,13 +33,11 @@ ENTRY ( __strcmp_sse2_unaligned)
movdqu (%rsi), %xmm0
pcmpeqb %xmm1, %xmm0
pminub %xmm1, %xmm0
- pxor %xmm1, %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- testq %rax, %rax
+ /* A byte of %xmm0 is now zero exactly where the strings differ or
+    both terminate; comparing with the zeroed %xmm7 exposes that as
+    a bitmask.  */
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
je L(next_48_bytes)
-L(return):
- bsfq %rax, %rdx
+ bsf %eax, %edx /* Index of the first difference or NUL.  */
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
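
This first block is the classic SSE2 "difference or NUL" test: pcmpeqb marks equal bytes, pminub against one source turns every interesting lane into zero, and a final compare against the all-zero %xmm7 makes the result movemask-able. A hedged intrinsics rendering of the chunk test (illustrative helper, not part of the patch):

#include <emmintrin.h>

/* Bit i of the result is set iff a[i] != b[i], or a[i] == b[i] == 0.
   Mirrors the pcmpeqb + pminub + pcmpeqb-with-zero sequence.  */
static unsigned int
chunk_mask (__m128i a, __m128i b)
{
  __m128i eq = _mm_cmpeq_epi8 (a, b);   /* 0xff where the bytes match.  */
  __m128i t = _mm_min_epu8 (eq, a);     /* 0 where mismatch or NUL.  */
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (t, _mm_setzero_si128 ()));
}
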
@@ -50,29 +51,35 @@ L(next_48_bytes):
pcmpeqb %xmm6, %xmm3
movdqu 32(%rsi), %xmm2
pminub %xmm6, %xmm3
- pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm7, %xmm3
movdqu 48(%rdi), %xmm4
pcmpeqb %xmm5, %xmm2
- pmovmskb %xmm3, %edx
+ pmovmskb %xmm3, %edx
movdqu 48(%rsi), %xmm0
pminub %xmm5, %xmm2
- pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm7, %xmm2
pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm2, %eax
- salq $16, %rdx
+ pmovmskb %xmm2, %eax
+ sal $16, %edx /* 32-bit shift suffices: pmovmskb zeroed the upper half of %rdx.  */
pminub %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm7, %xmm0
salq $32, %rax
orq %rdx, %rax
- pmovmskb %xmm0, %ecx
- movq %rcx, %rdx
- salq $48, %rdx
- orq %rdx, %rax
- jne L(return)
+ pmovmskb %xmm0, %ecx
+ salq $48, %rcx
+ orq %rcx, %rax /* Bit i of %rax is set iff byte i of the first 64 differs or is NUL.  */
+ je L(main_loop_header)
+L(return):
+ bsf %rax, %rdx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+ ret
+
+
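
The relocated L(return) operates on a single 64-bit word: the three chunk masks are shifted to bit positions 16, 32 and 48 (bits 0-15 are clear by construction, since the first 16 bytes were handled above), and one bsf finds the first hit. Roughly, in C, with a GCC builtin standing in for bsf:

/* Illustrative: m16, m32, m48 are the pmovmskb results for bytes 16-31,
   32-47 and 48-63.  The caller guarantees at least one bit is set,
   because the je to L(main_loop_header) was not taken.  */
static int
first_hit (unsigned int m16, unsigned int m32, unsigned int m48)
{
  unsigned long long m = ((unsigned long long) m16 << 16)
                         | ((unsigned long long) m32 << 32)
                         | ((unsigned long long) m48 << 48);
  return __builtin_ctzll (m);   /* bsf: index of the lowest set bit.  */
}

The two movzbl loads then turn that index into the signed byte difference that strcmp returns.
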
L(main_loop_header):
leaq 64(%rdi), %rdx
movl $4096, %ecx
- pxor %xmm9, %xmm9
andq $-64, %rdx
subq %rdi, %rdx
leaq (%rdi, %rdx), %rax
@@ -82,16 +89,11 @@ L(main_loop_header):
subq %rsi, %rcx
shrq $6, %rcx
movq %rcx, %rsi
- jmp L(loop_start)
.p2align 4
L(loop):
- addq $64, %rax
- addq $64, %rdx
-L(loop_start):
- testq %rsi, %rsi
- leaq -1(%rsi), %rsi
- je L(loop_cross_page)
+ /* add $-1 clears CF only when %rsi was zero, so ja (CF=0 and ZF=0)
+    is taken exactly when the page-crossing countdown has run out.  */
+ add $-1, %rsi
+ ja L(loop_cross_page)
L(back_to_loop):
movdqu (%rdx), %xmm0
movdqu 16(%rdx), %xmm1
@@ -104,61 +106,57 @@ L(back_to_loop):
movdqu 48(%rdx), %xmm6
pminub %xmm3, %xmm1
movdqa 32(%rax), %xmm2
- pminub %xmm1, %xmm0
movdqa 48(%rax), %xmm3
pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm6
+ addq $64, %rax /* Advance early; the mismatch tail below compensates with -64 offsets.  */
pminub %xmm2, %xmm5
pminub %xmm3, %xmm6
- pminub %xmm5, %xmm0
- pminub %xmm6, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %ecx
+ addq $64, %rdx
+ /* Fold the four "difference or NUL" results: a hit anywhere in the
+    64 bytes leaves a zero byte in %xmm6.  */
+ pminub %xmm5, %xmm6
+ pminub %xmm1, %xmm6
+ pminub %xmm0, %xmm6
+ pcmpeqb %xmm7, %xmm6
+ pmovmskb %xmm6, %ecx
testl %ecx, %ecx
je L(loop)
- pcmpeqb %xmm7, %xmm5
- movdqu (%rdx), %xmm0
- pcmpeqb %xmm7, %xmm1
- movdqa (%rax), %xmm2
- pcmpeqb %xmm2, %xmm0
- pminub %xmm2, %xmm0
- pcmpeqb %xmm7, %xmm6
pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
+ pcmpeqb %xmm7, %xmm1
+ pcmpeqb %xmm7, %xmm5
+ pmovmskb %xmm0, %edi
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm5, %r8d
+ /* %rcx is the folded mask; bits that chunks 0-2 contribute to it sit
+    strictly above those chunks' exact bits, so the bsf below still
+    finds the first real hit.  */
+ salq $48, %rcx
salq $32, %r8
- pmovmskb %xmm6, %esi
orq %r8, %rcx
orq %rdi, %rcx
- salq $48, %rsi
+ sal $16, %esi
orq %rsi, %rcx
bsfq %rcx, %rcx
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
+ movzbl -64(%rax, %rcx), %eax /* Base pointers were already advanced by 64.  */
+ movzbl -64(%rdx, %rcx), %edx
subl %edx, %eax
ret
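
One main-loop iteration now folds the four per-chunk results into %xmm6 with a pminub tree before the single pcmpeqb/pmovmskb, and advances both pointers inside the body so the loop falls through without the old jmp L(loop_start); the %xmm9 scratch zeroing is dropped as well. A rough intrinsics model of the fast path of one iteration (a sketch under the same alignment assumption as the movdqa loads; not the patch's exact instruction schedule):

#include <emmintrin.h>

/* Nonzero iff some byte in the 64-byte block differs or is NUL.
   A must be 16-byte aligned, as with the movdqa loads above.  */
static int
block64_hit (const unsigned char *a, const unsigned char *b)
{
  __m128i acc = _mm_set1_epi8 ((char) 0xff);
  for (int i = 0; i < 64; i += 16)
    {
      __m128i va = _mm_load_si128 ((const __m128i *) (a + i));
      __m128i vb = _mm_loadu_si128 ((const __m128i *) (b + i));
      __m128i eq = _mm_cmpeq_epi8 (va, vb);
      acc = _mm_min_epu8 (acc, _mm_min_epu8 (eq, va));
    }
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (acc, _mm_setzero_si128 ()));
}
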
.p2align 4
L(loop_cross_page):
- xor %r10, %r10
+ xor %ecx, %ecx
movq %rdx, %r9
and $63, %r9
- subq %r9, %r10
- movdqa (%rdx, %r10), %xmm0
- movdqa 16(%rdx, %r10), %xmm1
- movdqu (%rax, %r10), %xmm2
- movdqu 16(%rax, %r10), %xmm3
+ subq %r9, %rcx /* %rcx = -(%rdx & 63): back up to the 64-byte block start.  */
+ movdqa (%rdx, %rcx), %xmm0
+ movdqa 16(%rdx, %rcx), %xmm1
+ movdqu (%rax, %rcx), %xmm2
+ movdqu 16(%rax, %rcx), %xmm3
pcmpeqb %xmm2, %xmm0
- movdqa 32(%rdx, %r10), %xmm5
+ movdqa 32(%rdx, %rcx), %xmm5
pcmpeqb %xmm3, %xmm1
pminub %xmm2, %xmm0
- movdqa 48(%rdx, %r10), %xmm6
+ movdqa 48(%rdx, %rcx), %xmm6
pminub %xmm3, %xmm1
- movdqu 32(%rax, %r10), %xmm2
- movdqu 48(%rax, %r10), %xmm3
+ movdqu 32(%rax, %rcx), %xmm2
+ movdqu 48(%rax, %rcx), %xmm3
pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm6
pminub %xmm2, %xmm5
@@ -169,12 +167,12 @@ L(loop_cross_page):
pcmpeqb %xmm7, %xmm5
pcmpeqb %xmm7, %xmm6
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm5, %r8d
+ pmovmskb %xmm0, %edi
+ sal $16, %ecx
salq $32, %r8
- pmovmskb %xmm6, %esi
+ pmovmskb %xmm6, %esi
orq %r8, %rdi
orq %rcx, %rdi
salq $48, %rsi
@@ -190,20 +188,21 @@ L(loop_cross_page):
subl %edx, %eax
ret
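
L(loop_cross_page) re-reads the block from the previous 64-byte boundary of %rdx so the movdqa loads stay aligned; the xor/sub pair just materializes the negative displacement. As a small illustrative helper:

#include <stdint.h>

/* The negative offset that backs P up to the start of its 64-byte
   block, as built into %rcx by the xor/sub above.  */
static intptr_t
block_backoff (uintptr_t p)
{
  return -(intptr_t) (p & 63);
}
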
+L(cross_page):
+ xorl %edx, %edx /* %rdx counts bytes compared so far.  */
+ jmp L(cross_page_loop_start)
.p2align 4
L(cross_page_loop):
- cmpb %cl, %al
- jne L(different)
- addq $1, %rdx
- cmpq $64, %rdx
+ add $1, %edx
+ cmp $64, %edx
je L(main_loop_header)
-L(cross_page):
+L(cross_page_loop_start):
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %ecx
- testb %al, %al
+ subl %ecx, %eax /* The difference doubles as the return value.  */
+ jne L(different)
+ test %ecx, %ecx /* Bytes were equal; stop at NUL with %eax already 0.  */
+ jne L(cross_page_loop)
- xorl %eax, %eax
L(different):
- subl %ecx, %eax
ret
END (__strcmp_sse2_unaligned)
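
For reference, the restructured byte loop computes the difference first, letting one subl serve as both the exit test and the return value, and only then checks for NUL. A hedged C equivalent (the helper name and the done flag are mine):

/* Compare at most 64 bytes one at a time, as in L(cross_page).
   *done distinguishes a final answer from "re-enter the vector
   loop at L(main_loop_header)".  */
static int
cross_page_cmp (const unsigned char *a, const unsigned char *b, int *done)
{
  *done = 1;
  for (int i = 0; i < 64; i++)
    {
      int d = a[i] - b[i];
      if (d != 0)
        return d;               /* subl/jne: the difference is the result.  */
      if (b[i] == 0)
        return 0;               /* test fell through: both strings ended.  */
    }
  *done = 0;
  return 0;
}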