
[v3,neleai/string-x64] Improve memcmp performance and fix regression.

Message ID 20150621104732.GA16055@domone
State New

Commit Message

Ondřej Bílka June 21, 2015, 10:47 a.m. UTC
On Fri, Jun 19, 2015 at 05:53:04PM +0200, Ondřej Bílka wrote:
> On Thu, Jun 18, 2015 at 10:09:10AM +0200, Ondřej Bílka wrote:
> > Hi,
> > 
> > As I submitted a memcmp improvement before, in 2013, here is a new version
> > that improves performance a bit more.
> > 
> > Also, when I browsed the results I found that memcmp-sse4 is in fact a
> > regression on the i7 Nehalem, Ivy Bridge and Haswell architectures. There
> > it is beaten by the old sse2 code by more than 10%.
> >

Also, I tried different headers to see if I could improve the avx2
version. It turned out that the byte-by-byte loop that I use for the
cross-page case is best. If I always use that, it beats the sse4
version on the gcc workload.

The main problem is that branch misprediction kills performance, and I
couldn't make the decision based on n fast enough.

> > The main idea of the new implementation is the same; the performance
> > problem is that a lot of inputs were identical with small n.
> > For that case I found that the following approach gives the best
> > performance when n<64 is likely.
> > 
> > if (!cross_page (s1) && !cross_page (s2))
> >   {
> >     mask = get_mask (EQ (EQ (LOAD (s1), LOAD (s2)), zero));
> >     mask2 = mask & ((2 << (n - 1)) - 1);
> >     if (mask2)
> >       return s1[first_byte (mask2)] - s2[first_byte (mask2)];
> >     if (n <= 16)
> >       return 0;
> >     mask |= get_mask (EQ (EQ (LOAD (s1 + 16), LOAD (s2 + 16)), zero)) << 16;
> >     mask |= get_mask (EQ (EQ (LOAD (s1 + 32), LOAD (s2 + 32)), zero)) << 32;
> >     mask |= get_mask (EQ (EQ (LOAD (s1 + 48), LOAD (s2 + 48)), zero)) << 48;
> >     mask2 = mask & ((2 << (n - 1)) - 1);
> >     if (mask2)
> >       return s1[first_byte (mask2)] - s2[first_byte (mask2)];
> >     if (n <= 64)
> >       return 0;
> >     if (mask)
> >       return s1[first_byte (mask)] - s2[first_byte (mask)];
> >   }
> > 
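For concreteness, the primitives above map onto SSE2 intrinsics roughly
as follows. This is a minimal sketch, not the patch itself; diff_mask16
is a hypothetical helper name, first_byte is just a count-trailing-zeros
(bsf), and cross_page mirrors the page-offset check in the assembly:

#include <emmintrin.h>
#include <stdint.h>

/* Would a 64-byte read starting at p run past a 4096-byte page?  */
static int
cross_page (const void *p)
{
  return ((uintptr_t) p & 4095) > 4096 - 64;
}

/* Bitmask with bit i set when the 16-byte blocks at s1 and s2 differ
   in byte i; implements get_mask (EQ (EQ (LOAD, LOAD), zero)).  */
static uint64_t
diff_mask16 (const unsigned char *s1, const unsigned char *s2)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
  __m128i eq = _mm_cmpeq_epi8 (a, b);  /* 0xff where bytes are equal.  */
  /* Comparing with zero inverts: 0xff where bytes differ.  */
  __m128i ne = _mm_cmpeq_epi8 (eq, _mm_setzero_si128 ());
  return (uint64_t) _mm_movemask_epi8 (ne);
}

The header above is then mask = diff_mask16 (s1, s2), followed by
mask |= diff_mask16 (s1 + 16, s2 + 16) << 16, and so on.
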
> > I haven't yet checked using just registers and a byteswap to eliminate
> > the need to get the exact byte position, as I wrote in a related thread.
> > 
> > I could improve this a bit more; I lose a lot of cycles in the
> > loop-ending conditions. The problem is that I need to handle that an
> > unaligned s2 may read from the next page, so I would need to add more
> > complicated logic to compute the number of loop iterations.
> > 
> > That's related to avx2. I included it as an RFC, but it harms
> > performance on Haswell.
> > 
> > Last is wmemcmp, which I would also need to convert; for now I just
> > moved memcmp-sse4 there.
> > 
> > A profile can be found here:
> > 
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/memcmp_profile.html
> > 
> I updated the new version. I removed avx2 for now; I will submit it
> when I find out how it could improve performance.
> 
> The second change is that I added wmemcmp conditionals, so now I could
> delete memcmp-sse4 and wmemcmp-sse4.
> 
> 
After finding the bts trick for strncmp I also tried to use it in
memcmp. The problem is that in memcmp my previous control flow was
better: for memcmp it is likely that the arguments are equal, so I save
the cost of bsf and of comparing bytes.

The only improvement was that using bts with the same control flow saves
a few cycles, giving around a 2% improvement on the gcc workload.
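
Concretely, the bts trick computes the mask of the first n bytes without
a variable shift. A rough C equivalent of the xor/bts/sub sequence in
L(back_header) below (below_n_mask is a hypothetical name):

#include <stdint.h>

/* Equivalent of: xor %ecx,%ecx; bts %rdx,%rcx; sub $1,%rcx.
   bts takes the bit offset modulo 64, so this yields the low n bits
   for n < 64 and 0 for n == 64; differences with n >= 64 are caught
   by the separate unmasked test on the full 64-byte mask.  */
static uint64_t
below_n_mask (uint64_t n)
{
  uint64_t m = 1ULL << (n & 63);
  return m - 1;
}

With that, mask2 = mask & below_n_mask (n) selects only differences
below n, which is what the pseudocode's (2 << (n - 1)) - 1 expresses.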

Also, in the cross-page case the only optimization was to unroll the
byte-by-byte loop, as switching to bigger comparisons caused more
overhead than it saved.
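
The cross-page path then has roughly this shape — a sketch of the
unrolled byte-by-byte loop matching L(cross_page) in the patch
(cross_page_cmp is a hypothetical name):

static int
cross_page_cmp (const unsigned char *s1, const unsigned char *s2,
                size_t n)
{
  while (n)
    {
      /* Four byte comparisons per iteration; wider loads could read
         past the end of the page.  */
      if (s1[0] != s2[0]) return s1[0] - s2[0];
      if (n == 1) return 0;
      if (s1[1] != s2[1]) return s1[1] - s2[1];
      if (n == 2) return 0;
      if (s1[2] != s2[2]) return s1[2] - s2[2];
      if (n == 3) return 0;
      if (s1[3] != s2[3]) return s1[3] - s2[3];
      n -= 4;
      s1 += 4;
      s2 += 4;
    }
  return 0;
}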

So what about the following version?

	* sysdeps/x86_64/memcmp.S: New implementation.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Remove memcmp-sse4.
	* sysdeps/x86_64/multiarch/Makefile (routines): Remove memcmp-sse4.
	* sysdeps/x86_64/multiarch/memcmp.S: Likewise.
	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Removed.
	* sysdeps/x86_64/multiarch/wmemcmp-sse4.S: Likewise.
> 

---
 sysdeps/x86_64/memcmp.S                          |  512 +++----
 sysdeps/x86_64/multiarch/Makefile                |    6 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c       |    9 +-
 sysdeps/x86_64/multiarch/memcmp-avx2.S           |    3 +
 sysdeps/x86_64/multiarch/memcmp-sse4.S           | 1776 ----------------------
 sysdeps/x86_64/multiarch/memcmp.S                |   25 +-
 sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S |    9 +-
 sysdeps/x86_64/multiarch/wmemcmp-sse4.S          |    4 -
 sysdeps/x86_64/multiarch/wmemcmp.S               |   12 +-
 9 files changed, 221 insertions(+), 2135 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2.S
 delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
 delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S

Comments

Ondřej Bílka July 3, 2015, 7:45 a.m. UTC | #1
On Sun, Jun 21, 2015 at 12:47:32PM +0200, Ondřej Bílka wrote:
> diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
> index f636716..88c0c4a 100644
> --- a/sysdeps/x86_64/memcmp.S
> +++ b/sysdeps/x86_64/memcmp.S
> @@ -19,340 +19,204 @@
>  
>  #include <sysdep.h>
>  
> +#ifndef MEMCMP
> +# define MEMCMP memcmp
> +#endif
> +
>  	.text
> -ENTRY (memcmp)
> -	test	%rdx, %rdx
> -	jz	L(finz)
> -	cmpq	$1, %rdx
> -	jle	L(finr1b)
> -	subq	%rdi, %rsi
> -	movq	%rdx, %r10
> -	cmpq	$32, %r10
> -	jge	L(gt32)
> -	/* Handle small chunks and last block of less than 32 bytes.  */
> -L(small):
> -	testq	$1, %r10
> -	jz	L(s2b)
> -	movzbl	(%rdi),	%eax
> -	movzbl	(%rdi, %rsi), %edx
> -	subq    $1, %r10
> -	je	L(finz1)
> -	addq	$1, %rdi
> -	subl	%edx, %eax
> -	jnz	L(exit)
> -L(s2b):
> -	testq	$2, %r10
> -	jz	L(s4b)
> -	movzwl	(%rdi),	%eax
> -	movzwl	(%rdi, %rsi), %edx
> -	subq    $2, %r10
> -	je	L(fin2_7)
> -	addq	$2, %rdi
> -	cmpl	%edx, %eax
> -	jnz	L(fin2_7)
> -L(s4b):
> -	testq	$4, %r10
> -	jz	L(s8b)
> -	movl	(%rdi),	%eax
> -	movl	(%rdi, %rsi), %edx
> -	subq    $4, %r10
> -	je	L(fin2_7)
> -	addq	$4, %rdi
> -	cmpl	%edx, %eax
> -	jnz	L(fin2_7)
> -L(s8b):
> -	testq	$8, %r10
> -	jz	L(s16b)
> -	movq	(%rdi),	%rax
> -	movq	(%rdi, %rsi), %rdx
> -	subq    $8, %r10
> -	je	L(fin2_7)
> -	addq	$8, %rdi
> -	cmpq	%rdx, %rax
> -	jnz	L(fin2_7)
> -L(s16b):
> -	movdqu    (%rdi), %xmm1
> -	movdqu    (%rdi, %rsi), %xmm0
> -	pcmpeqb   %xmm0, %xmm1
> -	pmovmskb  %xmm1, %edx
> -	xorl	  %eax, %eax
> -	subl      $0xffff, %edx
> -	jz	  L(finz)
> -	bsfl      %edx, %ecx
> -	leaq	 (%rdi, %rcx), %rcx
> -	movzbl	 (%rcx), %eax
> -	movzbl	 (%rsi, %rcx), %edx
> -	jmp	 L(finz1)
> +ENTRY (MEMCMP)
> +	testq	%rdx, %rdx
> +	je	L(return_zero)
> +#ifdef AS_WMEMCMP
> +	shl	$2, %rdx
> +#endif
> +	pxor	%xmm4, %xmm4
> +	movl	%edi, %eax
> +	andl	$4095, %eax
> +	cmpl	$4032, %eax
> +	ja	L(cross_page_start)
> +L(handle_end):
> +	movl	%esi, %eax
> +	andl	$4095, %eax
> +	cmpl	$4032, %eax
> +	ja	L(cross_page_start)
> +L(back_header):
> +	xor	%ecx, %ecx
> +	bts	%rdx, %rcx
> +	sub	$1, %rcx
> +	movdqu	(%rdi), %xmm0
> +	movdqu	(%rsi), %xmm1
> +	pcmpeqb	%xmm1, %xmm0
> +	pcmpeqb	%xmm4, %xmm0
> +	pmovmskb %xmm0, %eax
> +	and	%ecx, %eax
> +	jne	L(different)
> +	cmpq	$16, %rdx
> +	ja	L(next)
> +	ret
> +L(next):
> +	pmovmskb %xmm0, %r8d
> +	movdqu	16(%rdi), %xmm2
> +	movdqu	16(%rsi), %xmm6
> +	movdqu	32(%rdi), %xmm1
> +	pcmpeqb	%xmm6, %xmm2
> +	movdqu	32(%rsi), %xmm5
> +	pcmpeqb	%xmm4, %xmm2
> +	pcmpeqb	%xmm5, %xmm1
> +	movdqu	48(%rdi), %xmm7
> +	pmovmskb %xmm2, %eax
> +	movdqu	48(%rsi), %xmm3
> +	pcmpeqb	%xmm4, %xmm1
> +	pmovmskb %xmm1, %r9d
> +	sal	$16, %eax
> +	pcmpeqb	%xmm3, %xmm7
> +	salq	$32, %r9
> +	pcmpeqb	%xmm4, %xmm7
> +	orq	%r9, %rax
> +	orq	%r8, %rax
> +	pmovmskb %xmm7, %r8d
> +	salq	$48, %r8
> +	orq	%r8, %rax
> +	movq	%rax, %r8
> +	andq	%rcx, %rax
> +	jne	L(different)
> +	cmpq	$64, %rdx
> +	jb	L(return_zero)
> +	movq	%r8, %rax
> +	testq	%rax, %rax
> +	jne	L(different)
> +L(align_loop):
> +	leaq	64(%rdi), %rax
> +	andq	$-64, %rax
> +	subq	%rdi, %rax
> +	subq	%rax, %rdx
> +	addq	%rax, %rdi
> +	addq	%rax, %rsi
> +	cmpq	$64, %rdx
> +	ja	L(loop_start)
> +	testq	%rdx, %rdx
> +	jne	L(handle_end)
> +	xorl	%eax, %eax
> +	ret
>  
> -	.p2align 4,, 4
> -L(finr1b):
> -	movzbl	(%rdi), %eax
> -	movzbl  (%rsi), %edx
> -L(finz1):
> +	.p2align 4
> +L(different):
> +	bsfq	%rax, %rdx
> +#ifdef AS_WMEMCMP
> +	and	$-4, %rdx
> +	mov	(%rdi,%rdx), %eax
> +	mov	(%rsi,%rdx), %edx
>  	subl	%edx, %eax
> -L(exit):
> +	jg	L(ret1)
> +	jl	L(ret_neg_1)
>  	ret
> -
> -	.p2align 4,, 4
> -L(fin2_7):
> -	cmpq	%rdx, %rax
> -	jz	L(finz)
> -	movq	%rax, %r11
> -	subq	%rdx, %r11
> -	bsfq	%r11, %rcx
> -	sarq	$3, %rcx
> -	salq	$3, %rcx
> -	sarq	%cl, %rax
> -	movzbl  %al, %eax
> -	sarq	%cl, %rdx
> -	movzbl  %dl, %edx
> +L(ret1):
> +	mov $1, %eax
> +	ret
> +L(ret_neg_1):
> +	mov $-1, %eax
> +	ret
> +#else
> +	movzbl	(%rdi,%rdx), %eax
> +	movzbl	(%rsi,%rdx), %edx
>  	subl	%edx, %eax
>  	ret
> -
> -	.p2align 4,, 4
> -L(finz):
> +#endif
> +L(return_zero):
> +	xor	%eax, %eax
> +	ret
> +	.p2align 4
> +L(loop):
> +	subq	$64, %rdx
> +	addq	$64, %rdi
> +	addq	$64, %rsi
> +	cmpq	$64, %rdx
> +	jbe	L(less_64_bytes)
> +L(loop_start):
> +	movdqu	(%rsi), %xmm0
> +	movdqu	16(%rsi), %xmm1
> +	pcmpeqb	(%rdi), %xmm0
> +	movdqu	32(%rsi), %xmm2
> +	pcmpeqb	16(%rdi), %xmm1
> +	movdqu	48(%rsi), %xmm3
> +	pcmpeqb	32(%rdi), %xmm2
> +	pcmpeqb	48(%rdi), %xmm3
> +	pminub	%xmm0, %xmm3
> +	pminub	%xmm1, %xmm3
> +	pminub	%xmm2, %xmm3
> +	pcmpeqb	%xmm4, %xmm3
> +	pmovmskb %xmm3, %eax
> +	testl	%eax, %eax
> +	je	L(loop)
> +	shl	$48, %rax
> +	pcmpeqb	%xmm4, %xmm0
> +	pcmpeqb	%xmm4, %xmm1
> +	pcmpeqb	%xmm4, %xmm2
> +	pmovmskb %xmm0, %r8
> +	pmovmskb %xmm1, %rcx
> +	pmovmskb %xmm2, %r9
> +	shl	$16, %ecx
> +	shl	$32, %r9
> +	or	%r8, %rax
> +	or	%r9, %rax
> +	or	%rcx, %rax
> +	jmp	L(different)
> +
> +	.p2align 4
> +L(less_64_bytes):
> +	testq	%rdx, %rdx
> +	jne	L(handle_end)
>  	xorl	%eax, %eax
>  	ret
>  
> -	/* For blocks bigger than 32 bytes
> -	   1. Advance one of the addr pointer to be 16B aligned.
> -	   2. Treat the case of both addr pointers aligned to 16B
> -	      separately to avoid movdqu.
> -	   3. Handle any blocks of greater than 64 consecutive bytes with
> -	      unrolling to reduce branches.
> -	   4. At least one addr pointer is 16B aligned, use memory version
> -	      of pcmbeqb.
> -	*/
> -	.p2align 4,, 4
> -L(gt32):
> -	movq	%rdx, %r11
> -	addq	%rdi, %r11
> -	movq	%rdi, %r8
> -
> -	andq	$15, %r8
> -	jz	L(16am)
> -	/* Both pointers may be misaligned.  */
> -	movdqu	(%rdi),	%xmm1
> -	movdqu	(%rdi, %rsi), %xmm0
> -	pcmpeqb   %xmm0, %xmm1
> -	pmovmskb  %xmm1, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	neg	 %r8
> -	leaq    16(%rdi, %r8), %rdi
> -L(16am):
> -	/* Handle two 16B aligned pointers separately.  */
> -	testq   $15, %rsi
> -	jz      L(ATR)
> -	testq	$16, %rdi
> -	jz	L(A32)
> -	movdqu	(%rdi, %rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq	$16, %rdi
> -L(A32):
> -	movq	%r11, %r10
> -	andq	$-32, %r10
> -	cmpq	%r10, %rdi
> -        jge	L(mt16)
> -	/* Pre-unroll to be ready for unrolled 64B loop.  */
> -	testq	$32, %rdi
> -	jz	L(A64)
> -	movdqu    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqu    (%rdi,%rsi), %xmm0
> -	pcmpeqb  (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -L(A64):
> -	movq	%r11, %r10
> -	andq	$-64, %r10
> -	cmpq	%r10, %rdi
> -        jge	L(mt32)
> -
> -L(A64main):
> -	movdqu    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqu    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqu    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqu    (%rdi,%rsi), %xmm0
> -	pcmpeqb  (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	cmpq       %rdi, %r10
> -	jne       L(A64main)
> -
> -L(mt32):
> -	movq	%r11, %r10
> -	andq	$-32, %r10
> -	cmpq	%r10, %rdi
> -        jge	L(mt16)
>  
> -L(A32main):
> -	movdqu    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqu    (%rdi,%rsi), %xmm0
> -	pcmpeqb  (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	cmpq       %rdi, %r10
> -	jne       L(A32main)
> -L(mt16):
> -	subq       %rdi, %r11
> -	je	  L(finz)
> -	movq	  %r11, %r10
> -	jmp	  L(small)
> -
> -	.p2align 4,, 4
> -L(neq):
> -	bsfl      %edx, %ecx
> -	movzbl	 (%rdi, %rcx), %eax
> -	addq	 %rdi, %rsi
> -	movzbl	 (%rsi,%rcx), %edx
> -	jmp	 L(finz1)
> -
> -	.p2align 4,, 4
> -L(ATR):
> -	movq	%r11, %r10
> -	andq	$-32, %r10
> -	cmpq	%r10, %rdi
> -        jge	L(mt16)
> -	testq	$16, %rdi
> -	jz	L(ATR32)
> -
> -	movdqa    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -	cmpq       %rdi, %r10
> -	je       L(mt16)
> -
> -L(ATR32):
> -	movq	%r11, %r10
> -	andq	$-64, %r10
> -	testq	$32, %rdi
> -	jz	L(ATR64)
> -
> -	movdqa    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqa    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -L(ATR64):
> -	cmpq       %rdi, %r10
> -	je	   L(mt32)
> -
> -L(ATR64main):
> -	movdqa    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqa    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqa    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqa    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -	cmpq       %rdi, %r10
> -	jne       L(ATR64main)
> -
> -	movq	%r11, %r10
> -	andq	$-32, %r10
> -	cmpq	%r10, %rdi
> -        jge	L(mt16)
> -
> -L(ATR32res):
> -	movdqa    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	movdqa    (%rdi,%rsi), %xmm0
> -	pcmpeqb   (%rdi), %xmm0
> -	pmovmskb  %xmm0, %edx
> -	subl      $0xffff, %edx
> -	jnz       L(neq)
> -	addq       $16, %rdi
> -
> -	cmpq	  %r10, %rdi
> -	jne       L(ATR32res)
> -
> -	subq       %rdi, %r11
> -	je	  L(finz)
> -	movq	  %r11, %r10
> -	jmp	  L(small)
> -	/* Align to 16byte to improve instruction fetch.  */
> -	.p2align 4,, 4
> -END(memcmp)
> +	.p2align 4
> +L(cross_page_start):
> +	cmp	$64, %rdx
> +	ja	L(back_header)
> +
> +	.p2align 4
> +L(cross_page):
> +	test	%edx, %edx
> +	je	L(return_zero)
> +#ifdef AS_WMEMCMP
> +	mov	(%rdi), %eax
> +	mov	(%rsi), %ecx
> +	subl	%ecx, %eax
> +	jg	L(ret1)
> +	jl	L(ret_neg_1)
> +#else
> +	movzbl	(%rdi), %eax
> +	movzbl	(%rsi), %ecx
> +	subl	%ecx, %eax
> +	jne	L(return)
> +	cmp	$1, %edx
> +	je	L(return)
> +	movzbl	1(%rdi), %eax
> +	movzbl	1(%rsi), %ecx
> +	subl	%ecx, %eax
> +	jne	L(return)
> +	cmp	$2, %edx
> +	je	L(return)
> +	movzbl	2(%rdi), %eax
> +	movzbl	2(%rsi), %ecx
> +	subl	%ecx, %eax
> +	jne	L(return)
> +	cmp	$3, %edx
> +	je	L(return)
> +	movzbl	3(%rdi), %eax
> +	movzbl	3(%rsi), %ecx
> +	subl	%ecx, %eax
> +	jne	L(return)
> +#endif
> +	sub	$4, %edx
> +	add	$4, %rdi
> +	add	$4, %rsi
> +	jmp	L(cross_page)
> +L(return):
> +	ret
> +END(MEMCMP)
>  
> -#undef bcmp
> +#undef	bcmp
>  weak_alias (memcmp, bcmp)
>  libc_hidden_builtin_def (memcmp)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index c573744..679db2a 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -8,7 +8,7 @@ ifeq ($(subdir),string)
>  
>  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>  		   strcmp-sse2-unaligned strncmp-ssse3 \
> -		   memcmp-sse4 memcpy-ssse3 \
> +		   memcpy-ssse3 \
>  		   memcpy-sse2-unaligned mempcpy-ssse3 \
>  		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>  		   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
> @@ -29,10 +29,10 @@ CFLAGS-strspn-c.c += -msse4
>  endif
>  
>  ifeq (yes,$(config-cflags-avx2))
> -sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2
> +sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
>  endif
>  endif
>  
>  ifeq ($(subdir),wcsmbs)
> -sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
> +sysdep_routines += wmemcmp-sse2-unaligned wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
>  endif
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index d398e43..b3dbe65 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -39,10 +39,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
>    IFUNC_IMPL (i, name, memcmp,
> -	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
> -			      __memcmp_sse4_1)
> +	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_AVX2, __memcmp_avx2)
>  	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
> -	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
> +	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned))
>  
>    /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
>    IFUNC_IMPL (i, name, __memmove_chk,
> @@ -211,8 +210,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/x86_64/multiarch/wmemcmp.S.  */
>    IFUNC_IMPL (i, name, wmemcmp,
> -	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1,
> -			      __wmemcmp_sse4_1)
> +	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1,
> +			      __wmemcmp_sse2_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3,
>  			      __wmemcmp_ssse3)
>  	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
> diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
> new file mode 100644
> index 0000000..60483bf
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
> @@ -0,0 +1,3 @@
> +#define USE_AVX2
> +#define MEMCMP __memcmp_avx2
> +#include "../memcmp.S"
> diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
> deleted file mode 100644
> index 533fece..0000000
> --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
> +++ /dev/null
> @@ -1,1776 +0,0 @@
> -/* memcmp with SSE4.1, wmemcmp with SSE4.1
> -   Copyright (C) 2010-2015 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -
> -# ifndef MEMCMP
> -#  define MEMCMP	__memcmp_sse4_1
> -# endif
> -
> -# define JMPTBL(I, B)	(I - B)
> -
> -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
> -  lea		TABLE(%rip), %r11;				\
> -  movslq	(%r11, INDEX, SCALE), %rcx;			\
> -  add		%r11, %rcx;					\
> -  jmp		*%rcx;						\
> -  ud2
> -
> -/* Warning!
> -           wmemcmp has to use SIGNED comparison for elements.
> -           memcmp has to use UNSIGNED comparison for elemnts.
> -*/
> -
> -	.section .text.sse4.1,"ax",@progbits
> -ENTRY (MEMCMP)
> -# ifdef USE_AS_WMEMCMP
> -	shl	$2, %rdx
> -# endif
> -	pxor	%xmm0, %xmm0
> -	cmp	$79, %rdx
> -	ja	L(79bytesormore)
> -# ifndef USE_AS_WMEMCMP
> -	cmp	$1, %rdx
> -	je	L(firstbyte)
> -# endif
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -# ifndef USE_AS_WMEMCMP
> -	.p2align 4
> -L(firstbyte):
> -	movzbl	(%rdi), %eax
> -	movzbl	(%rsi), %ecx
> -	sub	%ecx, %eax
> -	ret
> -# endif
> -
> -	.p2align 4
> -L(79bytesormore):
> -	movdqu	(%rsi), %xmm1
> -	movdqu	(%rdi), %xmm2
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -	mov	%rsi, %rcx
> -	and	$-16, %rsi
> -	add	$16, %rsi
> -	sub	%rsi, %rcx
> -
> -	sub	%rcx, %rdi
> -	add	%rcx, %rdx
> -	test	$0xf, %rdi
> -	jz	L(2aligned)
> -
> -	cmp	$128, %rdx
> -	ja	L(128bytesormore)
> -L(less128bytes):
> -	sub	$64, %rdx
> -
> -	movdqu	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqu	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -
> -	movdqu	32(%rdi), %xmm2
> -	pxor	32(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(48bytesin256)
> -
> -	movdqu	48(%rdi), %xmm2
> -	pxor	48(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(64bytesin256)
> -	cmp	$32, %rdx
> -	jb	L(less32bytesin64)
> -
> -	movdqu	64(%rdi), %xmm2
> -	pxor	64(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(80bytesin256)
> -
> -	movdqu	80(%rdi), %xmm2
> -	pxor	80(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(96bytesin256)
> -	sub	$32, %rdx
> -	add	$32, %rdi
> -	add	$32, %rsi
> -L(less32bytesin64):
> -	add	$64, %rdi
> -	add	$64, %rsi
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -L(128bytesormore):
> -	cmp	$512, %rdx
> -	ja	L(512bytesormore)
> -	cmp	$256, %rdx
> -	ja	L(less512bytes)
> -L(less256bytes):
> -	sub	$128, %rdx
> -
> -	movdqu	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqu	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -
> -	movdqu	32(%rdi), %xmm2
> -	pxor	32(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(48bytesin256)
> -
> -	movdqu	48(%rdi), %xmm2
> -	pxor	48(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(64bytesin256)
> -
> -	movdqu	64(%rdi), %xmm2
> -	pxor	64(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(80bytesin256)
> -
> -	movdqu	80(%rdi), %xmm2
> -	pxor	80(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(96bytesin256)
> -
> -	movdqu	96(%rdi), %xmm2
> -	pxor	96(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(112bytesin256)
> -
> -	movdqu	112(%rdi), %xmm2
> -	pxor	112(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(128bytesin256)
> -
> -	add	$128, %rsi
> -	add	$128, %rdi
> -
> -	cmp	$64, %rdx
> -	jae	L(less128bytes)
> -
> -	cmp	$32, %rdx
> -	jb	L(less32bytesin128)
> -
> -	movdqu	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqu	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -	sub	$32, %rdx
> -	add	$32, %rdi
> -	add	$32, %rsi
> -L(less32bytesin128):
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -L(less512bytes):
> -	sub	$256, %rdx
> -	movdqu	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqu	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -
> -	movdqu	32(%rdi), %xmm2
> -	pxor	32(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(48bytesin256)
> -
> -	movdqu	48(%rdi), %xmm2
> -	pxor	48(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(64bytesin256)
> -
> -	movdqu	64(%rdi), %xmm2
> -	pxor	64(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(80bytesin256)
> -
> -	movdqu	80(%rdi), %xmm2
> -	pxor	80(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(96bytesin256)
> -
> -	movdqu	96(%rdi), %xmm2
> -	pxor	96(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(112bytesin256)
> -
> -	movdqu	112(%rdi), %xmm2
> -	pxor	112(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(128bytesin256)
> -
> -	movdqu	128(%rdi), %xmm2
> -	pxor	128(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(144bytesin256)
> -
> -	movdqu	144(%rdi), %xmm2
> -	pxor	144(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(160bytesin256)
> -
> -	movdqu	160(%rdi), %xmm2
> -	pxor	160(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(176bytesin256)
> -
> -	movdqu	176(%rdi), %xmm2
> -	pxor	176(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(192bytesin256)
> -
> -	movdqu	192(%rdi), %xmm2
> -	pxor	192(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(208bytesin256)
> -
> -	movdqu	208(%rdi), %xmm2
> -	pxor	208(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(224bytesin256)
> -
> -	movdqu	224(%rdi), %xmm2
> -	pxor	224(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(240bytesin256)
> -
> -	movdqu	240(%rdi), %xmm2
> -	pxor	240(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(256bytesin256)
> -
> -	add	$256, %rsi
> -	add	$256, %rdi
> -
> -	cmp	$128, %rdx
> -	jae	L(less256bytes)
> -
> -	cmp	$64, %rdx
> -	jae	L(less128bytes)
> -
> -	cmp	$32, %rdx
> -	jb	L(less32bytesin256)
> -
> -	movdqu	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqu	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -	sub	$32, %rdx
> -	add	$32, %rdi
> -	add	$32, %rsi
> -L(less32bytesin256):
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -	.p2align 4
> -L(512bytesormore):
> -# ifdef DATA_CACHE_SIZE_HALF
> -	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
> -# else
> -	mov	__x86_data_cache_size_half(%rip), %R8_LP
> -# endif
> -	mov	%r8, %r9
> -	shr	$1, %r8
> -	add	%r9, %r8
> -	cmp	%r8, %rdx
> -	ja	L(L2_L3_cache_unaglined)
> -	sub	$64, %rdx
> -	.p2align 4
> -L(64bytesormore_loop):
> -	movdqu	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	movdqa	%xmm2, %xmm1
> -
> -	movdqu	16(%rdi), %xmm3
> -	pxor	16(%rsi), %xmm3
> -	por	%xmm3, %xmm1
> -
> -	movdqu	32(%rdi), %xmm4
> -	pxor	32(%rsi), %xmm4
> -	por	%xmm4, %xmm1
> -
> -	movdqu	48(%rdi), %xmm5
> -	pxor	48(%rsi), %xmm5
> -	por	%xmm5, %xmm1
> -
> -	ptest	%xmm1, %xmm0
> -	jnc	L(64bytesormore_loop_end)
> -	add	$64, %rsi
> -	add	$64, %rdi
> -	sub	$64, %rdx
> -	jae	L(64bytesormore_loop)
> -
> -	add	$64, %rdx
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -L(L2_L3_cache_unaglined):
> -	sub	$64, %rdx
> -	.p2align 4
> -L(L2_L3_unaligned_128bytes_loop):
> -	prefetchnta 0x1c0(%rdi)
> -	prefetchnta 0x1c0(%rsi)
> -	movdqu	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	movdqa	%xmm2, %xmm1
> -
> -	movdqu	16(%rdi), %xmm3
> -	pxor	16(%rsi), %xmm3
> -	por	%xmm3, %xmm1
> -
> -	movdqu	32(%rdi), %xmm4
> -	pxor	32(%rsi), %xmm4
> -	por	%xmm4, %xmm1
> -
> -	movdqu	48(%rdi), %xmm5
> -	pxor	48(%rsi), %xmm5
> -	por	%xmm5, %xmm1
> -
> -	ptest	%xmm1, %xmm0
> -	jnc	L(64bytesormore_loop_end)
> -	add	$64, %rsi
> -	add	$64, %rdi
> -	sub	$64, %rdx
> -	jae	L(L2_L3_unaligned_128bytes_loop)
> -
> -	add	$64, %rdx
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -/*
> - * This case is for machines which are sensitive for unaligned instructions.
> - */
> -	.p2align 4
> -L(2aligned):
> -	cmp	$128, %rdx
> -	ja	L(128bytesormorein2aligned)
> -L(less128bytesin2aligned):
> -	sub	$64, %rdx
> -
> -	movdqa	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqa	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -
> -	movdqa	32(%rdi), %xmm2
> -	pxor	32(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(48bytesin256)
> -
> -	movdqa	48(%rdi), %xmm2
> -	pxor	48(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(64bytesin256)
> -	cmp	$32, %rdx
> -	jb	L(less32bytesin64in2alinged)
> -
> -	movdqa	64(%rdi), %xmm2
> -	pxor	64(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(80bytesin256)
> -
> -	movdqa	80(%rdi), %xmm2
> -	pxor	80(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(96bytesin256)
> -	sub	$32, %rdx
> -	add	$32, %rdi
> -	add	$32, %rsi
> -L(less32bytesin64in2alinged):
> -	add	$64, %rdi
> -	add	$64, %rsi
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -	.p2align 4
> -L(128bytesormorein2aligned):
> -	cmp	$512, %rdx
> -	ja	L(512bytesormorein2aligned)
> -	cmp	$256, %rdx
> -	ja	L(256bytesormorein2aligned)
> -L(less256bytesin2alinged):
> -	sub	$128, %rdx
> -
> -	movdqa	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqa	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -
> -	movdqa	32(%rdi), %xmm2
> -	pxor	32(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(48bytesin256)
> -
> -	movdqa	48(%rdi), %xmm2
> -	pxor	48(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(64bytesin256)
> -
> -	movdqa	64(%rdi), %xmm2
> -	pxor	64(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(80bytesin256)
> -
> -	movdqa	80(%rdi), %xmm2
> -	pxor	80(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(96bytesin256)
> -
> -	movdqa	96(%rdi), %xmm2
> -	pxor	96(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(112bytesin256)
> -
> -	movdqa	112(%rdi), %xmm2
> -	pxor	112(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(128bytesin256)
> -
> -	add	$128, %rsi
> -	add	$128, %rdi
> -
> -	cmp	$64, %rdx
> -	jae	L(less128bytesin2aligned)
> -
> -	cmp	$32, %rdx
> -	jb	L(less32bytesin128in2aligned)
> -
> -	movdqu	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqu	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -	sub	$32, %rdx
> -	add	$32, %rdi
> -	add	$32, %rsi
> -L(less32bytesin128in2aligned):
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -	.p2align 4
> -L(256bytesormorein2aligned):
> -
> -	sub	$256, %rdx
> -	movdqa	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqa	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -
> -	movdqa	32(%rdi), %xmm2
> -	pxor	32(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(48bytesin256)
> -
> -	movdqa	48(%rdi), %xmm2
> -	pxor	48(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(64bytesin256)
> -
> -	movdqa	64(%rdi), %xmm2
> -	pxor	64(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(80bytesin256)
> -
> -	movdqa	80(%rdi), %xmm2
> -	pxor	80(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(96bytesin256)
> -
> -	movdqa	96(%rdi), %xmm2
> -	pxor	96(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(112bytesin256)
> -
> -	movdqa	112(%rdi), %xmm2
> -	pxor	112(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(128bytesin256)
> -
> -	movdqa	128(%rdi), %xmm2
> -	pxor	128(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(144bytesin256)
> -
> -	movdqa	144(%rdi), %xmm2
> -	pxor	144(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(160bytesin256)
> -
> -	movdqa	160(%rdi), %xmm2
> -	pxor	160(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(176bytesin256)
> -
> -	movdqa	176(%rdi), %xmm2
> -	pxor	176(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(192bytesin256)
> -
> -	movdqa	192(%rdi), %xmm2
> -	pxor	192(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(208bytesin256)
> -
> -	movdqa	208(%rdi), %xmm2
> -	pxor	208(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(224bytesin256)
> -
> -	movdqa	224(%rdi), %xmm2
> -	pxor	224(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(240bytesin256)
> -
> -	movdqa	240(%rdi), %xmm2
> -	pxor	240(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(256bytesin256)
> -
> -	add	$256, %rsi
> -	add	$256, %rdi
> -
> -	cmp	$128, %rdx
> -	jae	L(less256bytesin2alinged)
> -
> -	cmp	$64, %rdx
> -	jae	L(less128bytesin2aligned)
> -
> -	cmp	$32, %rdx
> -	jb	L(less32bytesin256in2alinged)
> -
> -	movdqa	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytesin256)
> -
> -	movdqa	16(%rdi), %xmm2
> -	pxor	16(%rsi), %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(32bytesin256)
> -	sub	$32, %rdx
> -	add	$32, %rdi
> -	add	$32, %rsi
> -L(less32bytesin256in2alinged):
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -	.p2align 4
> -L(512bytesormorein2aligned):
> -# ifdef DATA_CACHE_SIZE_HALF
> -	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
> -# else
> -	mov	__x86_data_cache_size_half(%rip), %R8_LP
> -# endif
> -	mov	%r8, %r9
> -	shr	$1, %r8
> -	add	%r9, %r8
> -	cmp	%r8, %rdx
> -	ja	L(L2_L3_cache_aglined)
> -
> -	sub	$64, %rdx
> -	.p2align 4
> -L(64bytesormore_loopin2aligned):
> -	movdqa	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	movdqa	%xmm2, %xmm1
> -
> -	movdqa	16(%rdi), %xmm3
> -	pxor	16(%rsi), %xmm3
> -	por	%xmm3, %xmm1
> -
> -	movdqa	32(%rdi), %xmm4
> -	pxor	32(%rsi), %xmm4
> -	por	%xmm4, %xmm1
> -
> -	movdqa	48(%rdi), %xmm5
> -	pxor	48(%rsi), %xmm5
> -	por	%xmm5, %xmm1
> -
> -	ptest	%xmm1, %xmm0
> -	jnc	L(64bytesormore_loop_end)
> -	add	$64, %rsi
> -	add	$64, %rdi
> -	sub	$64, %rdx
> -	jae	L(64bytesormore_loopin2aligned)
> -
> -	add	$64, %rdx
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -L(L2_L3_cache_aglined):
> -	sub	$64, %rdx
> -
> -	.p2align 4
> -L(L2_L3_aligned_128bytes_loop):
> -	prefetchnta 0x1c0(%rdi)
> -	prefetchnta 0x1c0(%rsi)
> -	movdqa	(%rdi), %xmm2
> -	pxor	(%rsi), %xmm2
> -	movdqa	%xmm2, %xmm1
> -
> -	movdqa	16(%rdi), %xmm3
> -	pxor	16(%rsi), %xmm3
> -	por	%xmm3, %xmm1
> -
> -	movdqa	32(%rdi), %xmm4
> -	pxor	32(%rsi), %xmm4
> -	por	%xmm4, %xmm1
> -
> -	movdqa	48(%rdi), %xmm5
> -	pxor	48(%rsi), %xmm5
> -	por	%xmm5, %xmm1
> -
> -	ptest	%xmm1, %xmm0
> -	jnc	L(64bytesormore_loop_end)
> -	add	$64, %rsi
> -	add	$64, %rdi
> -	sub	$64, %rdx
> -	jae	L(L2_L3_aligned_128bytes_loop)
> -
> -	add	$64, %rdx
> -	add	%rdx, %rsi
> -	add	%rdx, %rdi
> -	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
> -
> -
> -	.p2align 4
> -L(64bytesormore_loop_end):
> -	add	$16, %rdi
> -	add	$16, %rsi
> -	ptest	%xmm2, %xmm0
> -	jnc	L(16bytes)
> -
> -	add	$16, %rdi
> -	add	$16, %rsi
> -	ptest	%xmm3, %xmm0
> -	jnc	L(16bytes)
> -
> -	add	$16, %rdi
> -	add	$16, %rsi
> -	ptest	%xmm4, %xmm0
> -	jnc	L(16bytes)
> -
> -	add	$16, %rdi
> -	add	$16, %rsi
> -	jmp	L(16bytes)
> -
> -L(256bytesin256):
> -	add	$256, %rdi
> -	add	$256, %rsi
> -	jmp	L(16bytes)
> -L(240bytesin256):
> -	add	$240, %rdi
> -	add	$240, %rsi
> -	jmp	L(16bytes)
> -L(224bytesin256):
> -	add	$224, %rdi
> -	add	$224, %rsi
> -	jmp	L(16bytes)
> -L(208bytesin256):
> -	add	$208, %rdi
> -	add	$208, %rsi
> -	jmp	L(16bytes)
> -L(192bytesin256):
> -	add	$192, %rdi
> -	add	$192, %rsi
> -	jmp	L(16bytes)
> -L(176bytesin256):
> -	add	$176, %rdi
> -	add	$176, %rsi
> -	jmp	L(16bytes)
> -L(160bytesin256):
> -	add	$160, %rdi
> -	add	$160, %rsi
> -	jmp	L(16bytes)
> -L(144bytesin256):
> -	add	$144, %rdi
> -	add	$144, %rsi
> -	jmp	L(16bytes)
> -L(128bytesin256):
> -	add	$128, %rdi
> -	add	$128, %rsi
> -	jmp	L(16bytes)
> -L(112bytesin256):
> -	add	$112, %rdi
> -	add	$112, %rsi
> -	jmp	L(16bytes)
> -L(96bytesin256):
> -	add	$96, %rdi
> -	add	$96, %rsi
> -	jmp	L(16bytes)
> -L(80bytesin256):
> -	add	$80, %rdi
> -	add	$80, %rsi
> -	jmp	L(16bytes)
> -L(64bytesin256):
> -	add	$64, %rdi
> -	add	$64, %rsi
> -	jmp	L(16bytes)
> -L(48bytesin256):
> -	add	$16, %rdi
> -	add	$16, %rsi
> -L(32bytesin256):
> -	add	$16, %rdi
> -	add	$16, %rsi
> -L(16bytesin256):
> -	add	$16, %rdi
> -	add	$16, %rsi
> -L(16bytes):
> -	mov	-16(%rdi), %rax
> -	mov	-16(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -L(8bytes):
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(12bytes):
> -	mov	-12(%rdi), %rax
> -	mov	-12(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -L(4bytes):
> -	mov	-4(%rsi), %ecx
> -# ifndef USE_AS_WMEMCMP
> -	mov	-4(%rdi), %eax
> -	cmp	%eax, %ecx
> -# else
> -	cmp	-4(%rdi), %ecx
> -# endif
> -	jne	L(diffin4bytes)
> -L(0bytes):
> -	xor	%eax, %eax
> -	ret
> -
> -# ifndef USE_AS_WMEMCMP
> -/* unreal case for wmemcmp */
> -	.p2align 4
> -L(65bytes):
> -	movdqu	-65(%rdi), %xmm1
> -	movdqu	-65(%rsi), %xmm2
> -	mov	$-65, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(49bytes):
> -	movdqu	-49(%rdi), %xmm1
> -	movdqu	-49(%rsi), %xmm2
> -	mov	$-49, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(33bytes):
> -	movdqu	-33(%rdi), %xmm1
> -	movdqu	-33(%rsi), %xmm2
> -	mov	$-33, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(17bytes):
> -	mov	-17(%rdi), %rax
> -	mov	-17(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -L(9bytes):
> -	mov	-9(%rdi), %rax
> -	mov	-9(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	movzbl	-1(%rdi), %eax
> -	movzbl	-1(%rsi), %edx
> -	sub	%edx, %eax
> -	ret
> -
> -	.p2align 4
> -L(13bytes):
> -	mov	-13(%rdi), %rax
> -	mov	-13(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(5bytes):
> -	mov	-5(%rdi), %eax
> -	mov	-5(%rsi), %ecx
> -	cmp	%eax, %ecx
> -	jne	L(diffin4bytes)
> -	movzbl	-1(%rdi), %eax
> -	movzbl	-1(%rsi), %edx
> -	sub	%edx, %eax
> -	ret
> -
> -	.p2align 4
> -L(66bytes):
> -	movdqu	-66(%rdi), %xmm1
> -	movdqu	-66(%rsi), %xmm2
> -	mov	$-66, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(50bytes):
> -	movdqu	-50(%rdi), %xmm1
> -	movdqu	-50(%rsi), %xmm2
> -	mov	$-50, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(34bytes):
> -	movdqu	-34(%rdi), %xmm1
> -	movdqu	-34(%rsi), %xmm2
> -	mov	$-34, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(18bytes):
> -	mov	-18(%rdi), %rax
> -	mov	-18(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -L(10bytes):
> -	mov	-10(%rdi), %rax
> -	mov	-10(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	movzwl	-2(%rdi), %eax
> -	movzwl	-2(%rsi), %ecx
> -	cmp	%cl, %al
> -	jne	L(end)
> -	and	$0xffff, %eax
> -	and	$0xffff, %ecx
> -	sub	%ecx, %eax
> -	ret
> -
> -	.p2align 4
> -L(14bytes):
> -	mov	-14(%rdi), %rax
> -	mov	-14(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(6bytes):
> -	mov	-6(%rdi), %eax
> -	mov	-6(%rsi), %ecx
> -	cmp	%eax, %ecx
> -	jne	L(diffin4bytes)
> -L(2bytes):
> -	movzwl	-2(%rsi), %ecx
> -	movzwl	-2(%rdi), %eax
> -	cmp	%cl, %al
> -	jne	L(end)
> -	and	$0xffff, %eax
> -	and	$0xffff, %ecx
> -	sub	%ecx, %eax
> -	ret
> -
> -	.p2align 4
> -L(67bytes):
> -	movdqu	-67(%rdi), %xmm2
> -	movdqu	-67(%rsi), %xmm1
> -	mov	$-67, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(51bytes):
> -	movdqu	-51(%rdi), %xmm2
> -	movdqu	-51(%rsi), %xmm1
> -	mov	$-51, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(35bytes):
> -	movdqu	-35(%rsi), %xmm1
> -	movdqu	-35(%rdi), %xmm2
> -	mov	$-35, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(19bytes):
> -	mov	-19(%rdi), %rax
> -	mov	-19(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -L(11bytes):
> -	mov	-11(%rdi), %rax
> -	mov	-11(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	mov	-4(%rdi), %eax
> -	mov	-4(%rsi), %ecx
> -	cmp	%eax, %ecx
> -	jne	L(diffin4bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(15bytes):
> -	mov	-15(%rdi), %rax
> -	mov	-15(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(7bytes):
> -	mov	-7(%rdi), %eax
> -	mov	-7(%rsi), %ecx
> -	cmp	%eax, %ecx
> -	jne	L(diffin4bytes)
> -	mov	-4(%rdi), %eax
> -	mov	-4(%rsi), %ecx
> -	cmp	%eax, %ecx
> -	jne	L(diffin4bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(3bytes):
> -	movzwl	-3(%rdi), %eax
> -	movzwl	-3(%rsi), %ecx
> -	cmp	%eax, %ecx
> -	jne	L(diffin2bytes)
> -L(1bytes):
> -	movzbl	-1(%rdi), %eax
> -	movzbl	-1(%rsi), %ecx
> -	sub	%ecx, %eax
> -	ret
> -# endif
> -
> -	.p2align 4
> -L(68bytes):
> -	movdqu	-68(%rdi), %xmm2
> -	movdqu	-68(%rsi), %xmm1
> -	mov	$-68, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(52bytes):
> -	movdqu	-52(%rdi), %xmm2
> -	movdqu	-52(%rsi), %xmm1
> -	mov	$-52, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(36bytes):
> -	movdqu	-36(%rdi), %xmm2
> -	movdqu	-36(%rsi), %xmm1
> -	mov	$-36, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(20bytes):
> -	movdqu	-20(%rdi), %xmm2
> -	movdqu	-20(%rsi), %xmm1
> -	mov	$-20, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-4(%rsi), %ecx
> -
> -# ifndef USE_AS_WMEMCMP
> -	mov	-4(%rdi), %eax
> -	cmp	%eax, %ecx
> -# else
> -	cmp	-4(%rdi), %ecx
> -# endif
> -	jne	L(diffin4bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -# ifndef USE_AS_WMEMCMP
> -/* unreal cases for wmemcmp */
> -	.p2align 4
> -L(69bytes):
> -	movdqu	-69(%rsi), %xmm1
> -	movdqu	-69(%rdi), %xmm2
> -	mov	$-69, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(53bytes):
> -	movdqu	-53(%rsi), %xmm1
> -	movdqu	-53(%rdi), %xmm2
> -	mov	$-53, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(37bytes):
> -	movdqu	-37(%rsi), %xmm1
> -	movdqu	-37(%rdi), %xmm2
> -	mov	$-37, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(21bytes):
> -	movdqu	-21(%rsi), %xmm1
> -	movdqu	-21(%rdi), %xmm2
> -	mov	$-21, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(70bytes):
> -	movdqu	-70(%rsi), %xmm1
> -	movdqu	-70(%rdi), %xmm2
> -	mov	$-70, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(54bytes):
> -	movdqu	-54(%rsi), %xmm1
> -	movdqu	-54(%rdi), %xmm2
> -	mov	$-54, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(38bytes):
> -	movdqu	-38(%rsi), %xmm1
> -	movdqu	-38(%rdi), %xmm2
> -	mov	$-38, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(22bytes):
> -	movdqu	-22(%rsi), %xmm1
> -	movdqu	-22(%rdi), %xmm2
> -	mov	$-22, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(71bytes):
> -	movdqu	-71(%rsi), %xmm1
> -	movdqu	-71(%rdi), %xmm2
> -	mov	$-71, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(55bytes):
> -	movdqu	-55(%rdi), %xmm2
> -	movdqu	-55(%rsi), %xmm1
> -	mov	$-55, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(39bytes):
> -	movdqu	-39(%rdi), %xmm2
> -	movdqu	-39(%rsi), %xmm1
> -	mov	$-39, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(23bytes):
> -	movdqu	-23(%rdi), %xmm2
> -	movdqu	-23(%rsi), %xmm1
> -	mov	$-23, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -# endif
> -
> -	.p2align 4
> -L(72bytes):
> -	movdqu	-72(%rsi), %xmm1
> -	movdqu	-72(%rdi), %xmm2
> -	mov	$-72, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(56bytes):
> -	movdqu	-56(%rdi), %xmm2
> -	movdqu	-56(%rsi), %xmm1
> -	mov	$-56, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(40bytes):
> -	movdqu	-40(%rdi), %xmm2
> -	movdqu	-40(%rsi), %xmm1
> -	mov	$-40, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(24bytes):
> -	movdqu	-24(%rdi), %xmm2
> -	movdqu	-24(%rsi), %xmm1
> -	mov	$-24, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -
> -	mov	-8(%rsi), %rcx
> -	mov	-8(%rdi), %rax
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -# ifndef USE_AS_WMEMCMP
> -/* unreal cases for wmemcmp */
> -	.p2align 4
> -L(73bytes):
> -	movdqu	-73(%rsi), %xmm1
> -	movdqu	-73(%rdi), %xmm2
> -	mov	$-73, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(57bytes):
> -	movdqu	-57(%rdi), %xmm2
> -	movdqu	-57(%rsi), %xmm1
> -	mov	$-57, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(41bytes):
> -	movdqu	-41(%rdi), %xmm2
> -	movdqu	-41(%rsi), %xmm1
> -	mov	$-41, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(25bytes):
> -	movdqu	-25(%rdi), %xmm2
> -	movdqu	-25(%rsi), %xmm1
> -	mov	$-25, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-9(%rdi), %rax
> -	mov	-9(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	movzbl	-1(%rdi), %eax
> -	movzbl	-1(%rsi), %ecx
> -	sub	%ecx, %eax
> -	ret
> -
> -	.p2align 4
> -L(74bytes):
> -	movdqu	-74(%rsi), %xmm1
> -	movdqu	-74(%rdi), %xmm2
> -	mov	$-74, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(58bytes):
> -	movdqu	-58(%rdi), %xmm2
> -	movdqu	-58(%rsi), %xmm1
> -	mov	$-58, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(42bytes):
> -	movdqu	-42(%rdi), %xmm2
> -	movdqu	-42(%rsi), %xmm1
> -	mov	$-42, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(26bytes):
> -	movdqu	-26(%rdi), %xmm2
> -	movdqu	-26(%rsi), %xmm1
> -	mov	$-26, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-10(%rdi), %rax
> -	mov	-10(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	movzwl	-2(%rdi), %eax
> -	movzwl	-2(%rsi), %ecx
> -	jmp	L(diffin2bytes)
> -
> -	.p2align 4
> -L(75bytes):
> -	movdqu	-75(%rsi), %xmm1
> -	movdqu	-75(%rdi), %xmm2
> -	mov	$-75, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(59bytes):
> -	movdqu	-59(%rdi), %xmm2
> -	movdqu	-59(%rsi), %xmm1
> -	mov	$-59, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(43bytes):
> -	movdqu	-43(%rdi), %xmm2
> -	movdqu	-43(%rsi), %xmm1
> -	mov	$-43, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(27bytes):
> -	movdqu	-27(%rdi), %xmm2
> -	movdqu	-27(%rsi), %xmm1
> -	mov	$-27, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-11(%rdi), %rax
> -	mov	-11(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	mov	-4(%rdi), %eax
> -	mov	-4(%rsi), %ecx
> -	cmp	%eax, %ecx
> -	jne	L(diffin4bytes)
> -	xor	%eax, %eax
> -	ret
> -# endif
> -	.p2align 4
> -L(76bytes):
> -	movdqu	-76(%rsi), %xmm1
> -	movdqu	-76(%rdi), %xmm2
> -	mov	$-76, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(60bytes):
> -	movdqu	-60(%rdi), %xmm2
> -	movdqu	-60(%rsi), %xmm1
> -	mov	$-60, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(44bytes):
> -	movdqu	-44(%rdi), %xmm2
> -	movdqu	-44(%rsi), %xmm1
> -	mov	$-44, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(28bytes):
> -	movdqu	-28(%rdi), %xmm2
> -	movdqu	-28(%rsi), %xmm1
> -	mov	$-28, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-12(%rdi), %rax
> -	mov	-12(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	mov	-4(%rsi), %ecx
> -# ifndef USE_AS_WMEMCMP
> -	mov	-4(%rdi), %eax
> -	cmp	%eax, %ecx
> -# else
> -	cmp	-4(%rdi), %ecx
> -# endif
> -	jne	L(diffin4bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -# ifndef USE_AS_WMEMCMP
> -/* unreal cases for wmemcmp */
> -	.p2align 4
> -L(77bytes):
> -	movdqu	-77(%rsi), %xmm1
> -	movdqu	-77(%rdi), %xmm2
> -	mov	$-77, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(61bytes):
> -	movdqu	-61(%rdi), %xmm2
> -	movdqu	-61(%rsi), %xmm1
> -	mov	$-61, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(45bytes):
> -	movdqu	-45(%rdi), %xmm2
> -	movdqu	-45(%rsi), %xmm1
> -	mov	$-45, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(29bytes):
> -	movdqu	-29(%rdi), %xmm2
> -	movdqu	-29(%rsi), %xmm1
> -	mov	$-29, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -
> -	mov	-13(%rdi), %rax
> -	mov	-13(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(78bytes):
> -	movdqu	-78(%rsi), %xmm1
> -	movdqu	-78(%rdi), %xmm2
> -	mov	$-78, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(62bytes):
> -	movdqu	-62(%rdi), %xmm2
> -	movdqu	-62(%rsi), %xmm1
> -	mov	$-62, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(46bytes):
> -	movdqu	-46(%rdi), %xmm2
> -	movdqu	-46(%rsi), %xmm1
> -	mov	$-46, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(30bytes):
> -	movdqu	-30(%rdi), %xmm2
> -	movdqu	-30(%rsi), %xmm1
> -	mov	$-30, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-14(%rdi), %rax
> -	mov	-14(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -	.p2align 4
> -L(79bytes):
> -	movdqu	-79(%rsi), %xmm1
> -	movdqu	-79(%rdi), %xmm2
> -	mov	$-79, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(63bytes):
> -	movdqu	-63(%rdi), %xmm2
> -	movdqu	-63(%rsi), %xmm1
> -	mov	$-63, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(47bytes):
> -	movdqu	-47(%rdi), %xmm2
> -	movdqu	-47(%rsi), %xmm1
> -	mov	$-47, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(31bytes):
> -	movdqu	-31(%rdi), %xmm2
> -	movdqu	-31(%rsi), %xmm1
> -	mov	$-31, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -	mov	-15(%rdi), %rax
> -	mov	-15(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -# endif
> -	.p2align 4
> -L(64bytes):
> -	movdqu	-64(%rdi), %xmm2
> -	movdqu	-64(%rsi), %xmm1
> -	mov	$-64, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(48bytes):
> -	movdqu	-48(%rdi), %xmm2
> -	movdqu	-48(%rsi), %xmm1
> -	mov	$-48, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -L(32bytes):
> -	movdqu	-32(%rdi), %xmm2
> -	movdqu	-32(%rsi), %xmm1
> -	mov	$-32, %dl
> -	pxor	%xmm1, %xmm2
> -	ptest	%xmm2, %xmm0
> -	jnc	L(less16bytes)
> -
> -	mov	-16(%rdi), %rax
> -	mov	-16(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -
> -	mov	-8(%rdi), %rax
> -	mov	-8(%rsi), %rcx
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	xor	%eax, %eax
> -	ret
> -
> -/*
> - * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
> - */
> -	.p2align 3
> -L(less16bytes):
> -	movsbq	%dl, %rdx
> -	mov	(%rsi, %rdx), %rcx
> -	mov	(%rdi, %rdx), %rax
> -	cmp	%rax, %rcx
> -	jne	L(diffin8bytes)
> -	mov	8(%rsi, %rdx), %rcx
> -	mov	8(%rdi, %rdx), %rax
> -L(diffin8bytes):
> -	cmp	%eax, %ecx
> -	jne	L(diffin4bytes)
> -	shr	$32, %rcx
> -	shr	$32, %rax
> -
> -# ifdef USE_AS_WMEMCMP
> -/* for wmemcmp */
> -	cmp	%eax, %ecx
> -	jne	L(diffin4bytes)
> -	xor	%eax, %eax
> -	ret
> -# endif
> -
> -L(diffin4bytes):
> -# ifndef USE_AS_WMEMCMP
> -	cmp	%cx, %ax
> -	jne	L(diffin2bytes)
> -	shr	$16, %ecx
> -	shr	$16, %eax
> -L(diffin2bytes):
> -	cmp	%cl, %al
> -	jne	L(end)
> -	and	$0xffff, %eax
> -	and	$0xffff, %ecx
> -	sub	%ecx, %eax
> -	ret
> -
> -	.p2align 4
> -L(end):
> -	and	$0xff, %eax
> -	and	$0xff, %ecx
> -	sub	%ecx, %eax
> -	ret
> -# else
> -
> -/* for wmemcmp */
> -	mov	$1, %eax
> -	jl	L(nequal_bigger)
> -	neg	%eax
> -	ret
> -
> -	.p2align 4
> -L(nequal_bigger):
> -	ret
> -
> -L(unreal_case):
> -	xor	%eax, %eax
> -	ret
> -# endif
> -
> -END (MEMCMP)
> -
> -	.section .rodata.sse4.1,"a",@progbits
> -	.p2align 3
> -# ifndef USE_AS_WMEMCMP
> -L(table_64bytes):
> -	.int	JMPTBL (L(0bytes), L(table_64bytes))
> -	.int	JMPTBL (L(1bytes), L(table_64bytes))
> -	.int	JMPTBL (L(2bytes), L(table_64bytes))
> -	.int	JMPTBL (L(3bytes), L(table_64bytes))
> -	.int	JMPTBL (L(4bytes), L(table_64bytes))
> -	.int	JMPTBL (L(5bytes), L(table_64bytes))
> -	.int	JMPTBL (L(6bytes), L(table_64bytes))
> -	.int	JMPTBL (L(7bytes), L(table_64bytes))
> -	.int	JMPTBL (L(8bytes), L(table_64bytes))
> -	.int	JMPTBL (L(9bytes), L(table_64bytes))
> -	.int	JMPTBL (L(10bytes), L(table_64bytes))
> -	.int	JMPTBL (L(11bytes), L(table_64bytes))
> -	.int	JMPTBL (L(12bytes), L(table_64bytes))
> -	.int	JMPTBL (L(13bytes), L(table_64bytes))
> -	.int	JMPTBL (L(14bytes), L(table_64bytes))
> -	.int	JMPTBL (L(15bytes), L(table_64bytes))
> -	.int	JMPTBL (L(16bytes), L(table_64bytes))
> -	.int	JMPTBL (L(17bytes), L(table_64bytes))
> -	.int	JMPTBL (L(18bytes), L(table_64bytes))
> -	.int	JMPTBL (L(19bytes), L(table_64bytes))
> -	.int	JMPTBL (L(20bytes), L(table_64bytes))
> -	.int	JMPTBL (L(21bytes), L(table_64bytes))
> -	.int	JMPTBL (L(22bytes), L(table_64bytes))
> -	.int	JMPTBL (L(23bytes), L(table_64bytes))
> -	.int	JMPTBL (L(24bytes), L(table_64bytes))
> -	.int	JMPTBL (L(25bytes), L(table_64bytes))
> -	.int	JMPTBL (L(26bytes), L(table_64bytes))
> -	.int	JMPTBL (L(27bytes), L(table_64bytes))
> -	.int	JMPTBL (L(28bytes), L(table_64bytes))
> -	.int	JMPTBL (L(29bytes), L(table_64bytes))
> -	.int	JMPTBL (L(30bytes), L(table_64bytes))
> -	.int	JMPTBL (L(31bytes), L(table_64bytes))
> -	.int	JMPTBL (L(32bytes), L(table_64bytes))
> -	.int	JMPTBL (L(33bytes), L(table_64bytes))
> -	.int	JMPTBL (L(34bytes), L(table_64bytes))
> -	.int	JMPTBL (L(35bytes), L(table_64bytes))
> -	.int	JMPTBL (L(36bytes), L(table_64bytes))
> -	.int	JMPTBL (L(37bytes), L(table_64bytes))
> -	.int	JMPTBL (L(38bytes), L(table_64bytes))
> -	.int	JMPTBL (L(39bytes), L(table_64bytes))
> -	.int	JMPTBL (L(40bytes), L(table_64bytes))
> -	.int	JMPTBL (L(41bytes), L(table_64bytes))
> -	.int	JMPTBL (L(42bytes), L(table_64bytes))
> -	.int	JMPTBL (L(43bytes), L(table_64bytes))
> -	.int	JMPTBL (L(44bytes), L(table_64bytes))
> -	.int	JMPTBL (L(45bytes), L(table_64bytes))
> -	.int	JMPTBL (L(46bytes), L(table_64bytes))
> -	.int	JMPTBL (L(47bytes), L(table_64bytes))
> -	.int	JMPTBL (L(48bytes), L(table_64bytes))
> -	.int	JMPTBL (L(49bytes), L(table_64bytes))
> -	.int	JMPTBL (L(50bytes), L(table_64bytes))
> -	.int	JMPTBL (L(51bytes), L(table_64bytes))
> -	.int	JMPTBL (L(52bytes), L(table_64bytes))
> -	.int	JMPTBL (L(53bytes), L(table_64bytes))
> -	.int	JMPTBL (L(54bytes), L(table_64bytes))
> -	.int	JMPTBL (L(55bytes), L(table_64bytes))
> -	.int	JMPTBL (L(56bytes), L(table_64bytes))
> -	.int	JMPTBL (L(57bytes), L(table_64bytes))
> -	.int	JMPTBL (L(58bytes), L(table_64bytes))
> -	.int	JMPTBL (L(59bytes), L(table_64bytes))
> -	.int	JMPTBL (L(60bytes), L(table_64bytes))
> -	.int	JMPTBL (L(61bytes), L(table_64bytes))
> -	.int	JMPTBL (L(62bytes), L(table_64bytes))
> -	.int	JMPTBL (L(63bytes), L(table_64bytes))
> -	.int	JMPTBL (L(64bytes), L(table_64bytes))
> -	.int	JMPTBL (L(65bytes), L(table_64bytes))
> -	.int	JMPTBL (L(66bytes), L(table_64bytes))
> -	.int	JMPTBL (L(67bytes), L(table_64bytes))
> -	.int	JMPTBL (L(68bytes), L(table_64bytes))
> -	.int	JMPTBL (L(69bytes), L(table_64bytes))
> -	.int	JMPTBL (L(70bytes), L(table_64bytes))
> -	.int	JMPTBL (L(71bytes), L(table_64bytes))
> -	.int	JMPTBL (L(72bytes), L(table_64bytes))
> -	.int	JMPTBL (L(73bytes), L(table_64bytes))
> -	.int	JMPTBL (L(74bytes), L(table_64bytes))
> -	.int	JMPTBL (L(75bytes), L(table_64bytes))
> -	.int	JMPTBL (L(76bytes), L(table_64bytes))
> -	.int	JMPTBL (L(77bytes), L(table_64bytes))
> -	.int	JMPTBL (L(78bytes), L(table_64bytes))
> -	.int	JMPTBL (L(79bytes), L(table_64bytes))
> -# else
> -L(table_64bytes):
> -	.int	JMPTBL (L(0bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(4bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(8bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(12bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(16bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(20bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(24bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(28bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(32bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(36bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(40bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(44bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(48bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(52bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(56bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(60bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(64bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(68bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(72bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(76bytes), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
> -# endif
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
> index f8b4636..5d87a17 100644
> --- a/sysdeps/x86_64/multiarch/memcmp.S
> +++ b/sysdeps/x86_64/multiarch/memcmp.S
> @@ -29,33 +29,28 @@ ENTRY(memcmp)
>  	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
>  	jne	1f
>  	call	__init_cpu_features
> -
> -1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> +	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
>  	jnz	2f
> -	leaq	__memcmp_sse2(%rip), %rax
> -	ret
> -
> -2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
> -	jz	3f
> -	leaq	__memcmp_sse4_1(%rip), %rax
> +1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> +	jnz	3f
> +2:	leaq	__memcmp_sse2_unaligned(%rip), %rax
>  	ret
>  
>  3:	leaq	__memcmp_ssse3(%rip), %rax
>  	ret
> -
>  END(memcmp)
>  
>  # undef ENTRY
>  # define ENTRY(name) \
> -	.type __memcmp_sse2, @function; \
> +	.type __memcmp_sse2_unaligned, @function; \
>  	.p2align 4; \
> -	.globl __memcmp_sse2; \
> -	.hidden __memcmp_sse2; \
> -	__memcmp_sse2: cfi_startproc; \
> +	.globl __memcmp_sse2_unaligned; \
> +	.hidden __memcmp_sse2_unaligned; \
> +	__memcmp_sse2_unaligned: cfi_startproc; \
>  	CALL_MCOUNT
>  # undef END
>  # define END(name) \
> -	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
> +	cfi_endproc; .size __memcmp_sse2_unaligned, .-__memcmp_sse2_unaligned
>  
>  # ifdef SHARED
>  #  undef libc_hidden_builtin_def
> @@ -63,7 +58,7 @@ END(memcmp)
>     they will be called without setting up EBX needed for PLT which is
>     used by IFUNC.  */
>  #  define libc_hidden_builtin_def(name) \
> -	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
> +	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2_unaligned
>  # endif
>  #endif
>  
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> index 695a236..5dd8d44 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> @@ -201,6 +201,10 @@ L(prepare_loop):
>  	movdqu	%xmm2, 96(%rdi)
>  	movdqu	%xmm3, 112(%rdi)
>  
> +#ifdef USE_AVX2
> +	vpxor	%xmm5, %xmm5, %xmm5
> +#endif
> +
>  	subq	%rsi, %rdi
>  	add	$64, %rsi
>  	andq	$-64, %rsi
> @@ -348,10 +352,13 @@ L(cross_loop):
>  	sub	$1, %rcx
>  	ja	L(cross_loop)
>  
> +#ifdef USE_AVX2
> +	vpxor	%xmm5, %xmm5, %xmm5
> +#else
>  	pxor	%xmm5, %xmm5
>  	pxor	%xmm6, %xmm6
>  	pxor	%xmm7, %xmm7
> -
> +#endif
>  	lea	-64(%rsi), %rdx
>  	andq	$-64, %rdx
>  	addq	%rdx, %rdi
> diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
> deleted file mode 100644
> index b07973a..0000000
> --- a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_WMEMCMP 1
> -#define MEMCMP __wmemcmp_sse4_1
> -
> -#include "memcmp-sse4.S"
> diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
> index 109e245..dabd3ed 100644
> --- a/sysdeps/x86_64/multiarch/wmemcmp.S
> +++ b/sysdeps/x86_64/multiarch/wmemcmp.S
> @@ -30,18 +30,16 @@ ENTRY(wmemcmp)
>  	jne	1f
>  	call	__init_cpu_features
>  
> -1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> +	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
>  	jnz	2f
> -	leaq	__wmemcmp_sse2(%rip), %rax
> -	ret
> -
> -2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
> -	jz	3f
> -	leaq	__wmemcmp_sse4_1(%rip), %rax
> +1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> +	jnz	3f
> +2:	leaq	__wmemcmp_sse2_unaligned(%rip), %rax
>  	ret
>  
>  3:	leaq	__wmemcmp_ssse3(%rip), %rax
>  	ret
>  
> +
>  END(wmemcmp)
>  #endif
> -- 
> 1.8.4.rc3
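
For reference, the new header in the attached patch leans on two small
idioms that are easy to miss in the assembly.  In rough C terms (a
sketch only; may_cross_page and first_n_mask are illustrative names,
not symbols from the patch):

    #include <stdint.h>

    /* A 64-byte header read starting at p stays inside one 4 KiB page
       exactly when the page offset is at most 4096 - 64 = 4032; the
       entry code branches to L(cross_page_start) when it is larger.  */
    static int may_cross_page (const void *p)
    {
      return ((uintptr_t) p & 4095) > 4032;
    }

    /* The xor; bts %rdx, %rcx; sub $1, %rcx sequence in L(back_header)
       builds the mask selecting the first n bytes of a combined
       pmovmskb result.  bts uses n mod 64, so the mask is exact only
       for n < 64; for larger n the header tolerates the wrap-around
       because it re-tests the full 64-byte mask once cmpq $64 shows
       that all 64 bytes are in range.  */
    static uint64_t first_n_mask (unsigned n) /* assumes 1 <= n <= 63 */
    {
      return ((uint64_t) 1 << n) - 1;
    }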

Patch

diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index f636716..88c0c4a 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -19,340 +19,204 @@ 
 
 #include <sysdep.h>
 
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
 	.text
-ENTRY (memcmp)
-	test	%rdx, %rdx
-	jz	L(finz)
-	cmpq	$1, %rdx
-	jle	L(finr1b)
-	subq	%rdi, %rsi
-	movq	%rdx, %r10
-	cmpq	$32, %r10
-	jge	L(gt32)
-	/* Handle small chunks and last block of less than 32 bytes.  */
-L(small):
-	testq	$1, %r10
-	jz	L(s2b)
-	movzbl	(%rdi),	%eax
-	movzbl	(%rdi, %rsi), %edx
-	subq    $1, %r10
-	je	L(finz1)
-	addq	$1, %rdi
-	subl	%edx, %eax
-	jnz	L(exit)
-L(s2b):
-	testq	$2, %r10
-	jz	L(s4b)
-	movzwl	(%rdi),	%eax
-	movzwl	(%rdi, %rsi), %edx
-	subq    $2, %r10
-	je	L(fin2_7)
-	addq	$2, %rdi
-	cmpl	%edx, %eax
-	jnz	L(fin2_7)
-L(s4b):
-	testq	$4, %r10
-	jz	L(s8b)
-	movl	(%rdi),	%eax
-	movl	(%rdi, %rsi), %edx
-	subq    $4, %r10
-	je	L(fin2_7)
-	addq	$4, %rdi
-	cmpl	%edx, %eax
-	jnz	L(fin2_7)
-L(s8b):
-	testq	$8, %r10
-	jz	L(s16b)
-	movq	(%rdi),	%rax
-	movq	(%rdi, %rsi), %rdx
-	subq    $8, %r10
-	je	L(fin2_7)
-	addq	$8, %rdi
-	cmpq	%rdx, %rax
-	jnz	L(fin2_7)
-L(s16b):
-	movdqu    (%rdi), %xmm1
-	movdqu    (%rdi, %rsi), %xmm0
-	pcmpeqb   %xmm0, %xmm1
-	pmovmskb  %xmm1, %edx
-	xorl	  %eax, %eax
-	subl      $0xffff, %edx
-	jz	  L(finz)
-	bsfl      %edx, %ecx
-	leaq	 (%rdi, %rcx), %rcx
-	movzbl	 (%rcx), %eax
-	movzbl	 (%rsi, %rcx), %edx
-	jmp	 L(finz1)
+ENTRY (MEMCMP)
+	testq	%rdx, %rdx
+	je	L(return_zero)
+#ifdef AS_WMEMCMP
+	shl	$2, %rdx
+#endif
+	pxor	%xmm4, %xmm4
+	movl	%edi, %eax
+	andl	$4095, %eax
+	cmpl	$4032, %eax
+	ja	L(cross_page_start)
+L(handle_end):
+	movl	%esi, %eax
+	andl	$4095, %eax
+	cmpl	$4032, %eax
+	ja	L(cross_page_start)
+L(back_header):
+	xor	%ecx, %ecx
+	bts	%rdx, %rcx
+	sub	$1, %rcx
+	movdqu	(%rdi), %xmm0
+	movdqu	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	and	%ecx, %eax
+	jne	L(different)
+	cmpq	$16, %rdx
+	ja	L(next)
+	ret
+L(next):
+	pmovmskb %xmm0, %r8d
+	movdqu	16(%rdi), %xmm2
+	movdqu	16(%rsi), %xmm6
+	movdqu	32(%rdi), %xmm1
+	pcmpeqb	%xmm6, %xmm2
+	movdqu	32(%rsi), %xmm5
+	pcmpeqb	%xmm4, %xmm2
+	pcmpeqb	%xmm5, %xmm1
+	movdqu	48(%rdi), %xmm7
+	pmovmskb %xmm2, %eax
+	movdqu	48(%rsi), %xmm3
+	pcmpeqb	%xmm4, %xmm1
+	pmovmskb %xmm1, %r9d
+	sal	$16, %eax
+	pcmpeqb	%xmm3, %xmm7
+	salq	$32, %r9
+	pcmpeqb	%xmm4, %xmm7
+	orq	%r9, %rax
+	orq	%r8, %rax
+	pmovmskb %xmm7, %r8d
+	salq	$48, %r8
+	orq	%r8, %rax
+	movq	%rax, %r8
+	andq	%rcx, %rax
+	jne	L(different)
+	cmpq	$64, %rdx
+	jb	L(return_zero)
+	movq	%r8, %rax
+	testq	%rax, %rax
+	jne	L(different)
+L(align_loop):
+	leaq	64(%rdi), %rax
+	andq	$-64, %rax
+	subq	%rdi, %rax
+	subq	%rax, %rdx
+	addq	%rax, %rdi
+	addq	%rax, %rsi
+	cmpq	$64, %rdx
+	ja	L(loop_start)
+	testq	%rdx, %rdx
+	jne	L(handle_end)
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4,, 4
-L(finr1b):
-	movzbl	(%rdi), %eax
-	movzbl  (%rsi), %edx
-L(finz1):
+	.p2align 4
+L(different):
+	bsfq	%rax, %rdx
+#ifdef AS_WMEMCMP
+	and	$-4, %rdx
+	mov	(%rdi,%rdx), %eax
+	mov	(%rsi,%rdx), %edx
 	subl	%edx, %eax
-L(exit):
+	jg	L(ret1)
+	jl	L(ret_neg_1)
 	ret
-
-	.p2align 4,, 4
-L(fin2_7):
-	cmpq	%rdx, %rax
-	jz	L(finz)
-	movq	%rax, %r11
-	subq	%rdx, %r11
-	bsfq	%r11, %rcx
-	sarq	$3, %rcx
-	salq	$3, %rcx
-	sarq	%cl, %rax
-	movzbl  %al, %eax
-	sarq	%cl, %rdx
-	movzbl  %dl, %edx
+L(ret1):
+	mov $1, %eax
+	ret
+L(ret_neg_1):
+	mov $-1, %eax
+	ret
+#else
+	movzbl	(%rdi,%rdx), %eax
+	movzbl	(%rsi,%rdx), %edx
 	subl	%edx, %eax
 	ret
-
-	.p2align 4,, 4
-L(finz):
+#endif
+L(return_zero):
+	xor	%eax, %eax
+	ret
+	.p2align 4
+L(loop):
+	subq	$64, %rdx
+	addq	$64, %rdi
+	addq	$64, %rsi
+	cmpq	$64, %rdx
+	jbe	L(less_64_bytes)
+L(loop_start):
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm0
+	movdqu	32(%rsi), %xmm2
+	pcmpeqb	16(%rdi), %xmm1
+	movdqu	48(%rsi), %xmm3
+	pcmpeqb	32(%rdi), %xmm2
+	pcmpeqb	48(%rdi), %xmm3
+	pminub	%xmm0, %xmm3
+	pminub	%xmm1, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm4, %xmm3
+	pmovmskb %xmm3, %eax
+	testl	%eax, %eax
+	je	L(loop)
+	shl	$48, %rax
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	pcmpeqb	%xmm4, %xmm2
+	pmovmskb %xmm0, %r8
+	pmovmskb %xmm1, %rcx
+	pmovmskb %xmm2, %r9
+	shl	$16, %ecx
+	shl	$32, %r9
+	or	%r8, %rax
+	or	%r9, %rax
+	or	%rcx, %rax
+	jmp	L(different)
+
+	.p2align 4
+L(less_64_bytes):
+	testq	%rdx, %rdx
+	jne	L(handle_end)
 	xorl	%eax, %eax
 	ret
 
-	/* For blocks bigger than 32 bytes
-	   1. Advance one of the addr pointers to be 16B aligned.
-	   2. Treat the case of both addr pointers aligned to 16B
-	      separately to avoid movdqu.
-	   3. Handle any blocks of greater than 64 consecutive bytes with
-	      unrolling to reduce branches.
-	   4. At least one addr pointer is 16B aligned, use memory version
-	      of pcmpeqb.
-	*/
-	.p2align 4,, 4
-L(gt32):
-	movq	%rdx, %r11
-	addq	%rdi, %r11
-	movq	%rdi, %r8
-
-	andq	$15, %r8
-	jz	L(16am)
-	/* Both pointers may be misaligned.  */
-	movdqu	(%rdi),	%xmm1
-	movdqu	(%rdi, %rsi), %xmm0
-	pcmpeqb   %xmm0, %xmm1
-	pmovmskb  %xmm1, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	neg	 %r8
-	leaq    16(%rdi, %r8), %rdi
-L(16am):
-	/* Handle two 16B aligned pointers separately.  */
-	testq   $15, %rsi
-	jz      L(ATR)
-	testq	$16, %rdi
-	jz	L(A32)
-	movdqu	(%rdi, %rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq	$16, %rdi
-L(A32):
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jge	L(mt16)
-	/* Pre-unroll to be ready for unrolled 64B loop.  */
-	testq	$32, %rdi
-	jz	L(A64)
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb  (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-L(A64):
-	movq	%r11, %r10
-	andq	$-64, %r10
-	cmpq	%r10, %rdi
-        jge	L(mt32)
-
-L(A64main):
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb  (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	cmpq       %rdi, %r10
-	jne       L(A64main)
-
-L(mt32):
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jge	L(mt16)
 
-L(A32main):
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb  (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	cmpq       %rdi, %r10
-	jne       L(A32main)
-L(mt16):
-	subq       %rdi, %r11
-	je	  L(finz)
-	movq	  %r11, %r10
-	jmp	  L(small)
-
-	.p2align 4,, 4
-L(neq):
-	bsfl      %edx, %ecx
-	movzbl	 (%rdi, %rcx), %eax
-	addq	 %rdi, %rsi
-	movzbl	 (%rsi,%rcx), %edx
-	jmp	 L(finz1)
-
-	.p2align 4,, 4
-L(ATR):
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jge	L(mt16)
-	testq	$16, %rdi
-	jz	L(ATR32)
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-	cmpq       %rdi, %r10
-	je       L(mt16)
-
-L(ATR32):
-	movq	%r11, %r10
-	andq	$-64, %r10
-	testq	$32, %rdi
-	jz	L(ATR64)
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-L(ATR64):
-	cmpq       %rdi, %r10
-	je	   L(mt32)
-
-L(ATR64main):
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-	cmpq       %rdi, %r10
-	jne       L(ATR64main)
-
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jge	L(mt16)
-
-L(ATR32res):
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	cmpq	  %r10, %rdi
-	jne       L(ATR32res)
-
-	subq       %rdi, %r11
-	je	  L(finz)
-	movq	  %r11, %r10
-	jmp	  L(small)
-	/* Align to 16byte to improve instruction fetch.  */
-	.p2align 4,, 4
-END(memcmp)
+	.p2align 4
+L(cross_page_start):
+	cmp	$64, %rdx
+	ja	L(back_header)
+
+	.p2align 4
+L(cross_page):
+	test	%edx, %edx
+	je	L(return_zero)
+#ifdef AS_WMEMCMP
+	mov	(%rdi), %eax
+	mov	(%rsi), %ecx
+	subl	%ecx, %eax
+	jg	L(ret1)
+	jl	L(ret_neg_1)
+#else
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax
+	jne	L(return)
+	cmp	$1, %edx
+	je	L(return)
+	movzbl	1(%rdi), %eax
+	movzbl	1(%rsi), %ecx
+	subl	%ecx, %eax
+	jne	L(return)
+	cmp	$2, %edx
+	je	L(return)
+	movzbl	2(%rdi), %eax
+	movzbl	2(%rsi), %ecx
+	subl	%ecx, %eax
+	jne	L(return)
+	cmp	$3, %edx
+	je	L(return)
+	movzbl	3(%rdi), %eax
+	movzbl	3(%rsi), %ecx
+	subl	%ecx, %eax
+	jne	L(return)
+#endif
+	sub	$4, %edx
+	add	$4, %rdi
+	add	$4, %rsi
+	jmp	L(cross_page)
+L(return):
+	ret
+END(MEMCMP)
 
-#undef bcmp
+#undef	bcmp
 weak_alias (memcmp, bcmp)
 libc_hidden_builtin_def (memcmp)
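
The 64-byte main loop above (L(loop_start)) folds the four pcmpeqb
results with pminub, so one pcmpeqb against zero plus pmovmskb is
enough to decide whether anything in the chunk differs; only on a
mismatch does it rebuild the exact per-byte mask before jumping to
L(different).  In SSE2 intrinsics one loop step looks roughly like
this (a sketch under those assumptions, not code from the patch):

    #include <emmintrin.h>

    /* Nonzero iff some byte differs in a 64-byte chunk.  A lane of the
       folded minimum is 0x00 exactly when at least one of the four
       equality masks is 0x00 there, i.e. some load pair mismatched.  */
    static int chunk64_differs (const unsigned char *s1,
                                const unsigned char *s2)
    {
      __m128i e0 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) s1),
                                   _mm_loadu_si128 ((const __m128i *) s2));
      __m128i e1 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (s1 + 16)),
                                   _mm_loadu_si128 ((const __m128i *) (s2 + 16)));
      __m128i e2 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (s1 + 32)),
                                   _mm_loadu_si128 ((const __m128i *) (s2 + 32)));
      __m128i e3 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (s1 + 48)),
                                   _mm_loadu_si128 ((const __m128i *) (s2 + 48)));
      __m128i folded = _mm_min_epu8 (_mm_min_epu8 (e0, e1),
                                     _mm_min_epu8 (e2, e3));
      return _mm_movemask_epi8 (_mm_cmpeq_epi8 (folded,
                                                _mm_setzero_si128 ())) != 0;
    }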
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index c573744..679db2a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,7 +8,7 @@  ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
-		   memcmp-sse4 memcpy-ssse3 \
+		   memcpy-ssse3 \
 		   memcpy-sse2-unaligned mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
@@ -29,10 +29,10 @@  CFLAGS-strspn-c.c += -msse4
 endif
 
 ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
 endif
 endif
 
 ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+sysdep_routines += wmemcmp-sse2-unaligned wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d398e43..b3dbe65 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -39,10 +39,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
   IFUNC_IMPL (i, name, memcmp,
-	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
-			      __memcmp_sse4_1)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_AVX2, __memcmp_avx2)
 	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
-	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
+	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned))
 
   /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
   IFUNC_IMPL (i, name, __memmove_chk,
@@ -211,8 +210,8 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wmemcmp.S.  */
   IFUNC_IMPL (i, name, wmemcmp,
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1,
-			      __wmemcmp_sse4_1)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1,
+			      __wmemcmp_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3,
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
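
The intended selection order, as in the multiarch/memcmp.S diff quoted
further up, comes down to roughly the following C logic (a sketch; the
cpu_has helper and the extern declarations are hypothetical stand-ins
for the __cpu_features bit tests, added only to keep it self-contained):

    typedef int (*memcmp_fn) (const void *, const void *, unsigned long);
    extern int cpu_has (int bit);
    extern const int bit_Fast_Unaligned_Load, bit_SSSE3;
    extern int __memcmp_sse2_unaligned (const void *, const void *,
                                        unsigned long);
    extern int __memcmp_ssse3 (const void *, const void *, unsigned long);

    /* Prefer the unaligned SSE2 version whenever unaligned loads are
       fast; otherwise use SSSE3 if available, with the unaligned SSE2
       version as the final fallback.  */
    static memcmp_fn select_memcmp (void)
    {
      if (cpu_has (bit_Fast_Unaligned_Load))   /* jnz 2f path */
        return __memcmp_sse2_unaligned;
      if (cpu_has (bit_SSSE3))                 /* jnz 3f path */
        return __memcmp_ssse3;
      return __memcmp_sse2_unaligned;          /* 2: default */
    }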
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
new file mode 100644
index 0000000..60483bf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -0,0 +1,3 @@ 
+#define USE_AVX2
+#define MEMCMP __memcmp_avx2
+#include "../memcmp.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
deleted file mode 100644
index 533fece..0000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,1776 +0,0 @@ 
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-#  define MEMCMP	__memcmp_sse4_1
-# endif
-
-# define JMPTBL(I, B)	(I - B)
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-  lea		TABLE(%rip), %r11;				\
-  movslq	(%r11, INDEX, SCALE), %rcx;			\
-  add		%r11, %rcx;					\
-  jmp		*%rcx;						\
-  ud2
-
-/* Warning!
-           wmemcmp has to use SIGNED comparison for elements.
-           memcmp has to use UNSIGNED comparison for elements.
-*/
-
-	.section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %rdx
-# endif
-	pxor	%xmm0, %xmm0
-	cmp	$79, %rdx
-	ja	L(79bytesormore)
-# ifndef USE_AS_WMEMCMP
-	cmp	$1, %rdx
-	je	L(firstbyte)
-# endif
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-# ifndef USE_AS_WMEMCMP
-	.p2align 4
-L(firstbyte):
-	movzbl	(%rdi), %eax
-	movzbl	(%rsi), %ecx
-	sub	%ecx, %eax
-	ret
-# endif
-
-	.p2align 4
-L(79bytesormore):
-	movdqu	(%rsi), %xmm1
-	movdqu	(%rdi), %xmm2
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-	mov	%rsi, %rcx
-	and	$-16, %rsi
-	add	$16, %rsi
-	sub	%rsi, %rcx
-
-	sub	%rcx, %rdi
-	add	%rcx, %rdx
-	test	$0xf, %rdi
-	jz	L(2aligned)
-
-	cmp	$128, %rdx
-	ja	L(128bytesormore)
-L(less128bytes):
-	sub	$64, %rdx
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqu	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqu	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-	cmp	$32, %rdx
-	jb	L(less32bytesin64)
-
-	movdqu	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqu	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin64):
-	add	$64, %rdi
-	add	$64, %rsi
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(128bytesormore):
-	cmp	$512, %rdx
-	ja	L(512bytesormore)
-	cmp	$256, %rdx
-	ja	L(less512bytes)
-L(less256bytes):
-	sub	$128, %rdx
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqu	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqu	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-
-	movdqu	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqu	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-
-	movdqu	96(%rdi), %xmm2
-	pxor	96(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(112bytesin256)
-
-	movdqu	112(%rdi), %xmm2
-	pxor	112(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(128bytesin256)
-
-	add	$128, %rsi
-	add	$128, %rdi
-
-	cmp	$64, %rdx
-	jae	L(less128bytes)
-
-	cmp	$32, %rdx
-	jb	L(less32bytesin128)
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin128):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(less512bytes):
-	sub	$256, %rdx
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqu	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqu	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-
-	movdqu	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqu	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-
-	movdqu	96(%rdi), %xmm2
-	pxor	96(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(112bytesin256)
-
-	movdqu	112(%rdi), %xmm2
-	pxor	112(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(128bytesin256)
-
-	movdqu	128(%rdi), %xmm2
-	pxor	128(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(144bytesin256)
-
-	movdqu	144(%rdi), %xmm2
-	pxor	144(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(160bytesin256)
-
-	movdqu	160(%rdi), %xmm2
-	pxor	160(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(176bytesin256)
-
-	movdqu	176(%rdi), %xmm2
-	pxor	176(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(192bytesin256)
-
-	movdqu	192(%rdi), %xmm2
-	pxor	192(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(208bytesin256)
-
-	movdqu	208(%rdi), %xmm2
-	pxor	208(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(224bytesin256)
-
-	movdqu	224(%rdi), %xmm2
-	pxor	224(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(240bytesin256)
-
-	movdqu	240(%rdi), %xmm2
-	pxor	240(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(256bytesin256)
-
-	add	$256, %rsi
-	add	$256, %rdi
-
-	cmp	$128, %rdx
-	jae	L(less256bytes)
-
-	cmp	$64, %rdx
-	jae	L(less128bytes)
-
-	cmp	$32, %rdx
-	jb	L(less32bytesin256)
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin256):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-	.p2align 4
-L(512bytesormore):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	mov	%r8, %r9
-	shr	$1, %r8
-	add	%r9, %r8
-	cmp	%r8, %rdx
-	ja	L(L2_L3_cache_unaglined)
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loop):
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	movdqa	%xmm2, %xmm1
-
-	movdqu	16(%rdi), %xmm3
-	pxor	16(%rsi), %xmm3
-	por	%xmm3, %xmm1
-
-	movdqu	32(%rdi), %xmm4
-	pxor	32(%rsi), %xmm4
-	por	%xmm4, %xmm1
-
-	movdqu	48(%rdi), %xmm5
-	pxor	48(%rsi), %xmm5
-	por	%xmm5, %xmm1
-
-	ptest	%xmm1, %xmm0
-	jnc	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	jae	L(64bytesormore_loop)
-
-	add	$64, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(L2_L3_cache_unaglined):
-	sub	$64, %rdx
-	.p2align 4
-L(L2_L3_unaligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	movdqa	%xmm2, %xmm1
-
-	movdqu	16(%rdi), %xmm3
-	pxor	16(%rsi), %xmm3
-	por	%xmm3, %xmm1
-
-	movdqu	32(%rdi), %xmm4
-	pxor	32(%rsi), %xmm4
-	por	%xmm4, %xmm1
-
-	movdqu	48(%rdi), %xmm5
-	pxor	48(%rsi), %xmm5
-	por	%xmm5, %xmm1
-
-	ptest	%xmm1, %xmm0
-	jnc	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	jae	L(L2_L3_unaligned_128bytes_loop)
-
-	add	$64, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-/*
- * This case is for machines which are sensitive to unaligned instructions.
- */
-	.p2align 4
-L(2aligned):
-	cmp	$128, %rdx
-	ja	L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
-	sub	$64, %rdx
-
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqa	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqa	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqa	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-	cmp	$32, %rdx
-	jb	L(less32bytesin64in2alinged)
-
-	movdqa	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqa	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin64in2alinged):
-	add	$64, %rdi
-	add	$64, %rsi
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-	.p2align 4
-L(128bytesormorein2aligned):
-	cmp	$512, %rdx
-	ja	L(512bytesormorein2aligned)
-	cmp	$256, %rdx
-	ja	L(256bytesormorein2aligned)
-L(less256bytesin2alinged):
-	sub	$128, %rdx
-
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqa	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqa	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqa	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-
-	movdqa	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqa	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-
-	movdqa	96(%rdi), %xmm2
-	pxor	96(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(112bytesin256)
-
-	movdqa	112(%rdi), %xmm2
-	pxor	112(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(128bytesin256)
-
-	add	$128, %rsi
-	add	$128, %rdi
-
-	cmp	$64, %rdx
-	jae	L(less128bytesin2aligned)
-
-	cmp	$32, %rdx
-	jb	L(less32bytesin128in2aligned)
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin128in2aligned):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-	.p2align 4
-L(256bytesormorein2aligned):
-
-	sub	$256, %rdx
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqa	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqa	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqa	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-
-	movdqa	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqa	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-
-	movdqa	96(%rdi), %xmm2
-	pxor	96(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(112bytesin256)
-
-	movdqa	112(%rdi), %xmm2
-	pxor	112(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(128bytesin256)
-
-	movdqa	128(%rdi), %xmm2
-	pxor	128(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(144bytesin256)
-
-	movdqa	144(%rdi), %xmm2
-	pxor	144(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(160bytesin256)
-
-	movdqa	160(%rdi), %xmm2
-	pxor	160(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(176bytesin256)
-
-	movdqa	176(%rdi), %xmm2
-	pxor	176(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(192bytesin256)
-
-	movdqa	192(%rdi), %xmm2
-	pxor	192(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(208bytesin256)
-
-	movdqa	208(%rdi), %xmm2
-	pxor	208(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(224bytesin256)
-
-	movdqa	224(%rdi), %xmm2
-	pxor	224(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(240bytesin256)
-
-	movdqa	240(%rdi), %xmm2
-	pxor	240(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(256bytesin256)
-
-	add	$256, %rsi
-	add	$256, %rdi
-
-	cmp	$128, %rdx
-	jae	L(less256bytesin2alinged)
-
-	cmp	$64, %rdx
-	jae	L(less128bytesin2aligned)
-
-	cmp	$32, %rdx
-	jb	L(less32bytesin256in2alinged)
-
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqa	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin256in2alinged):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-	.p2align 4
-L(512bytesormorein2aligned):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	mov	%r8, %r9
-	shr	$1, %r8
-	add	%r9, %r8
-	cmp	%r8, %rdx
-	ja	L(L2_L3_cache_aglined)
-
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loopin2aligned):
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	movdqa	%xmm2, %xmm1
-
-	movdqa	16(%rdi), %xmm3
-	pxor	16(%rsi), %xmm3
-	por	%xmm3, %xmm1
-
-	movdqa	32(%rdi), %xmm4
-	pxor	32(%rsi), %xmm4
-	por	%xmm4, %xmm1
-
-	movdqa	48(%rdi), %xmm5
-	pxor	48(%rsi), %xmm5
-	por	%xmm5, %xmm1
-
-	ptest	%xmm1, %xmm0
-	jnc	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	jae	L(64bytesormore_loopin2aligned)
-
-	add	$64, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-L(L2_L3_cache_aglined):
-	sub	$64, %rdx
-
-	.p2align 4
-L(L2_L3_aligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	movdqa	%xmm2, %xmm1
-
-	movdqa	16(%rdi), %xmm3
-	pxor	16(%rsi), %xmm3
-	por	%xmm3, %xmm1
-
-	movdqa	32(%rdi), %xmm4
-	pxor	32(%rsi), %xmm4
-	por	%xmm4, %xmm1
-
-	movdqa	48(%rdi), %xmm5
-	pxor	48(%rsi), %xmm5
-	por	%xmm5, %xmm1
-
-	ptest	%xmm1, %xmm0
-	jnc	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	jae	L(L2_L3_aligned_128bytes_loop)
-
-	add	$64, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-
-	.p2align 4
-L(64bytesormore_loop_end):
-	add	$16, %rdi
-	add	$16, %rsi
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytes)
-
-	add	$16, %rdi
-	add	$16, %rsi
-	ptest	%xmm3, %xmm0
-	jnc	L(16bytes)
-
-	add	$16, %rdi
-	add	$16, %rsi
-	ptest	%xmm4, %xmm0
-	jnc	L(16bytes)
-
-	add	$16, %rdi
-	add	$16, %rsi
-	jmp	L(16bytes)
-
-L(256bytesin256):
-	add	$256, %rdi
-	add	$256, %rsi
-	jmp	L(16bytes)
-L(240bytesin256):
-	add	$240, %rdi
-	add	$240, %rsi
-	jmp	L(16bytes)
-L(224bytesin256):
-	add	$224, %rdi
-	add	$224, %rsi
-	jmp	L(16bytes)
-L(208bytesin256):
-	add	$208, %rdi
-	add	$208, %rsi
-	jmp	L(16bytes)
-L(192bytesin256):
-	add	$192, %rdi
-	add	$192, %rsi
-	jmp	L(16bytes)
-L(176bytesin256):
-	add	$176, %rdi
-	add	$176, %rsi
-	jmp	L(16bytes)
-L(160bytesin256):
-	add	$160, %rdi
-	add	$160, %rsi
-	jmp	L(16bytes)
-L(144bytesin256):
-	add	$144, %rdi
-	add	$144, %rsi
-	jmp	L(16bytes)
-L(128bytesin256):
-	add	$128, %rdi
-	add	$128, %rsi
-	jmp	L(16bytes)
-L(112bytesin256):
-	add	$112, %rdi
-	add	$112, %rsi
-	jmp	L(16bytes)
-L(96bytesin256):
-	add	$96, %rdi
-	add	$96, %rsi
-	jmp	L(16bytes)
-L(80bytesin256):
-	add	$80, %rdi
-	add	$80, %rsi
-	jmp	L(16bytes)
-L(64bytesin256):
-	add	$64, %rdi
-	add	$64, %rsi
-	jmp	L(16bytes)
-L(48bytesin256):
-	add	$16, %rdi
-	add	$16, %rsi
-L(32bytesin256):
-	add	$16, %rdi
-	add	$16, %rsi
-L(16bytesin256):
-	add	$16, %rdi
-	add	$16, %rsi
-L(16bytes):
-	mov	-16(%rdi), %rax
-	mov	-16(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(8bytes):
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(12bytes):
-	mov	-12(%rdi), %rax
-	mov	-12(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(4bytes):
-	mov	-4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
-	mov	-4(%rdi), %eax
-	cmp	%eax, %ecx
-# else
-	cmp	-4(%rdi), %ecx
-# endif
-	jne	L(diffin4bytes)
-L(0bytes):
-	xor	%eax, %eax
-	ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal case for wmemcmp */
-	.p2align 4
-L(65bytes):
-	movdqu	-65(%rdi), %xmm1
-	movdqu	-65(%rsi), %xmm2
-	mov	$-65, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(49bytes):
-	movdqu	-49(%rdi), %xmm1
-	movdqu	-49(%rsi), %xmm2
-	mov	$-49, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(33bytes):
-	movdqu	-33(%rdi), %xmm1
-	movdqu	-33(%rsi), %xmm2
-	mov	$-33, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(17bytes):
-	mov	-17(%rdi), %rax
-	mov	-17(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(9bytes):
-	mov	-9(%rdi), %rax
-	mov	-9(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	movzbl	-1(%rdi), %eax
-	movzbl	-1(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(13bytes):
-	mov	-13(%rdi), %rax
-	mov	-13(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(5bytes):
-	mov	-5(%rdi), %eax
-	mov	-5(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	movzbl	-1(%rdi), %eax
-	movzbl	-1(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(66bytes):
-	movdqu	-66(%rdi), %xmm1
-	movdqu	-66(%rsi), %xmm2
-	mov	$-66, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(50bytes):
-	movdqu	-50(%rdi), %xmm1
-	movdqu	-50(%rsi), %xmm2
-	mov	$-50, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(34bytes):
-	movdqu	-34(%rdi), %xmm1
-	movdqu	-34(%rsi), %xmm2
-	mov	$-34, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(18bytes):
-	mov	-18(%rdi), %rax
-	mov	-18(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(10bytes):
-	mov	-10(%rdi), %rax
-	mov	-10(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	movzwl	-2(%rdi), %eax
-	movzwl	-2(%rsi), %ecx
-	cmp	%cl, %al
-	jne	L(end)
-	and	$0xffff, %eax
-	and	$0xffff, %ecx
-	sub	%ecx, %eax
-	ret
-
-	.p2align 4
-L(14bytes):
-	mov	-14(%rdi), %rax
-	mov	-14(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(6bytes):
-	mov	-6(%rdi), %eax
-	mov	-6(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-L(2bytes):
-	movzwl	-2(%rsi), %ecx
-	movzwl	-2(%rdi), %eax
-	cmp	%cl, %al
-	jne	L(end)
-	and	$0xffff, %eax
-	and	$0xffff, %ecx
-	sub	%ecx, %eax
-	ret
-
-	.p2align 4
-L(67bytes):
-	movdqu	-67(%rdi), %xmm2
-	movdqu	-67(%rsi), %xmm1
-	mov	$-67, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(51bytes):
-	movdqu	-51(%rdi), %xmm2
-	movdqu	-51(%rsi), %xmm1
-	mov	$-51, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(35bytes):
-	movdqu	-35(%rsi), %xmm1
-	movdqu	-35(%rdi), %xmm2
-	mov	$-35, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(19bytes):
-	mov	-19(%rdi), %rax
-	mov	-19(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(11bytes):
-	mov	-11(%rdi), %rax
-	mov	-11(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-4(%rdi), %eax
-	mov	-4(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(15bytes):
-	mov	-15(%rdi), %rax
-	mov	-15(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(7bytes):
-	mov	-7(%rdi), %eax
-	mov	-7(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	mov	-4(%rdi), %eax
-	mov	-4(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(3bytes):
-	movzwl	-3(%rdi), %eax
-	movzwl	-3(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin2bytes)
-L(1bytes):
-	movzbl	-1(%rdi), %eax
-	movzbl	-1(%rsi), %ecx
-	sub	%ecx, %eax
-	ret
-# endif
-
-	.p2align 4
-L(68bytes):
-	movdqu	-68(%rdi), %xmm2
-	movdqu	-68(%rsi), %xmm1
-	mov	$-68, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(52bytes):
-	movdqu	-52(%rdi), %xmm2
-	movdqu	-52(%rsi), %xmm1
-	mov	$-52, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(36bytes):
-	movdqu	-36(%rdi), %xmm2
-	movdqu	-36(%rsi), %xmm1
-	mov	$-36, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(20bytes):
-	movdqu	-20(%rdi), %xmm2
-	movdqu	-20(%rsi), %xmm1
-	mov	$-20, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-4(%rsi), %ecx
-
-# ifndef USE_AS_WMEMCMP
-	mov	-4(%rdi), %eax
-	cmp	%eax, %ecx
-# else
-	cmp	-4(%rdi), %ecx
-# endif
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
-	.p2align 4
-L(69bytes):
-	movdqu	-69(%rsi), %xmm1
-	movdqu	-69(%rdi), %xmm2
-	mov	$-69, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(53bytes):
-	movdqu	-53(%rsi), %xmm1
-	movdqu	-53(%rdi), %xmm2
-	mov	$-53, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(37bytes):
-	movdqu	-37(%rsi), %xmm1
-	movdqu	-37(%rdi), %xmm2
-	mov	$-37, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(21bytes):
-	movdqu	-21(%rsi), %xmm1
-	movdqu	-21(%rdi), %xmm2
-	mov	$-21, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(70bytes):
-	movdqu	-70(%rsi), %xmm1
-	movdqu	-70(%rdi), %xmm2
-	mov	$-70, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(54bytes):
-	movdqu	-54(%rsi), %xmm1
-	movdqu	-54(%rdi), %xmm2
-	mov	$-54, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(38bytes):
-	movdqu	-38(%rsi), %xmm1
-	movdqu	-38(%rdi), %xmm2
-	mov	$-38, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(22bytes):
-	movdqu	-22(%rsi), %xmm1
-	movdqu	-22(%rdi), %xmm2
-	mov	$-22, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(71bytes):
-	movdqu	-71(%rsi), %xmm1
-	movdqu	-71(%rdi), %xmm2
-	mov	$-71, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(55bytes):
-	movdqu	-55(%rdi), %xmm2
-	movdqu	-55(%rsi), %xmm1
-	mov	$-55, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(39bytes):
-	movdqu	-39(%rdi), %xmm2
-	movdqu	-39(%rsi), %xmm1
-	mov	$-39, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(23bytes):
-	movdqu	-23(%rdi), %xmm2
-	movdqu	-23(%rsi), %xmm1
-	mov	$-23, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(72bytes):
-	movdqu	-72(%rsi), %xmm1
-	movdqu	-72(%rdi), %xmm2
-	mov	$-72, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(56bytes):
-	movdqu	-56(%rdi), %xmm2
-	movdqu	-56(%rsi), %xmm1
-	mov	$-56, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(40bytes):
-	movdqu	-40(%rdi), %xmm2
-	movdqu	-40(%rsi), %xmm1
-	mov	$-40, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(24bytes):
-	movdqu	-24(%rdi), %xmm2
-	movdqu	-24(%rsi), %xmm1
-	mov	$-24, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-
-	mov	-8(%rsi), %rcx
-	mov	-8(%rdi), %rax
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
-	.p2align 4
-L(73bytes):
-	movdqu	-73(%rsi), %xmm1
-	movdqu	-73(%rdi), %xmm2
-	mov	$-73, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(57bytes):
-	movdqu	-57(%rdi), %xmm2
-	movdqu	-57(%rsi), %xmm1
-	mov	$-57, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(41bytes):
-	movdqu	-41(%rdi), %xmm2
-	movdqu	-41(%rsi), %xmm1
-	mov	$-41, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(25bytes):
-	movdqu	-25(%rdi), %xmm2
-	movdqu	-25(%rsi), %xmm1
-	mov	$-25, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-9(%rdi), %rax
-	mov	-9(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	movzbl	-1(%rdi), %eax
-	movzbl	-1(%rsi), %ecx
-	sub	%ecx, %eax
-	ret
-
-	.p2align 4
-L(74bytes):
-	movdqu	-74(%rsi), %xmm1
-	movdqu	-74(%rdi), %xmm2
-	mov	$-74, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(58bytes):
-	movdqu	-58(%rdi), %xmm2
-	movdqu	-58(%rsi), %xmm1
-	mov	$-58, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(42bytes):
-	movdqu	-42(%rdi), %xmm2
-	movdqu	-42(%rsi), %xmm1
-	mov	$-42, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(26bytes):
-	movdqu	-26(%rdi), %xmm2
-	movdqu	-26(%rsi), %xmm1
-	mov	$-26, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-10(%rdi), %rax
-	mov	-10(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	movzwl	-2(%rdi), %eax
-	movzwl	-2(%rsi), %ecx
-	jmp	L(diffin2bytes)
-
-	.p2align 4
-L(75bytes):
-	movdqu	-75(%rsi), %xmm1
-	movdqu	-75(%rdi), %xmm2
-	mov	$-75, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(59bytes):
-	movdqu	-59(%rdi), %xmm2
-	movdqu	-59(%rsi), %xmm1
-	mov	$-59, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(43bytes):
-	movdqu	-43(%rdi), %xmm2
-	movdqu	-43(%rsi), %xmm1
-	mov	$-43, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(27bytes):
-	movdqu	-27(%rdi), %xmm2
-	movdqu	-27(%rsi), %xmm1
-	mov	$-27, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-11(%rdi), %rax
-	mov	-11(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-4(%rdi), %eax
-	mov	-4(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-# endif
-	.p2align 4
-L(76bytes):
-	movdqu	-76(%rsi), %xmm1
-	movdqu	-76(%rdi), %xmm2
-	mov	$-76, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(60bytes):
-	movdqu	-60(%rdi), %xmm2
-	movdqu	-60(%rsi), %xmm1
-	mov	$-60, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(44bytes):
-	movdqu	-44(%rdi), %xmm2
-	movdqu	-44(%rsi), %xmm1
-	mov	$-44, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(28bytes):
-	movdqu	-28(%rdi), %xmm2
-	movdqu	-28(%rsi), %xmm1
-	mov	$-28, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-12(%rdi), %rax
-	mov	-12(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
-	mov	-4(%rdi), %eax
-	cmp	%eax, %ecx
-# else
-	cmp	-4(%rdi), %ecx
-# endif
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
-	.p2align 4
-L(77bytes):
-	movdqu	-77(%rsi), %xmm1
-	movdqu	-77(%rdi), %xmm2
-	mov	$-77, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(61bytes):
-	movdqu	-61(%rdi), %xmm2
-	movdqu	-61(%rsi), %xmm1
-	mov	$-61, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(45bytes):
-	movdqu	-45(%rdi), %xmm2
-	movdqu	-45(%rsi), %xmm1
-	mov	$-45, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(29bytes):
-	movdqu	-29(%rdi), %xmm2
-	movdqu	-29(%rsi), %xmm1
-	mov	$-29, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-
-	mov	-13(%rdi), %rax
-	mov	-13(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(78bytes):
-	movdqu	-78(%rsi), %xmm1
-	movdqu	-78(%rdi), %xmm2
-	mov	$-78, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(62bytes):
-	movdqu	-62(%rdi), %xmm2
-	movdqu	-62(%rsi), %xmm1
-	mov	$-62, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(46bytes):
-	movdqu	-46(%rdi), %xmm2
-	movdqu	-46(%rsi), %xmm1
-	mov	$-46, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(30bytes):
-	movdqu	-30(%rdi), %xmm2
-	movdqu	-30(%rsi), %xmm1
-	mov	$-30, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-14(%rdi), %rax
-	mov	-14(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(79bytes):
-	movdqu	-79(%rsi), %xmm1
-	movdqu	-79(%rdi), %xmm2
-	mov	$-79, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(63bytes):
-	movdqu	-63(%rdi), %xmm2
-	movdqu	-63(%rsi), %xmm1
-	mov	$-63, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(47bytes):
-	movdqu	-47(%rdi), %xmm2
-	movdqu	-47(%rsi), %xmm1
-	mov	$-47, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(31bytes):
-	movdqu	-31(%rdi), %xmm2
-	movdqu	-31(%rsi), %xmm1
-	mov	$-31, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-15(%rdi), %rax
-	mov	-15(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-# endif
-	.p2align 4
-L(64bytes):
-	movdqu	-64(%rdi), %xmm2
-	movdqu	-64(%rsi), %xmm1
-	mov	$-64, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(48bytes):
-	movdqu	-48(%rdi), %xmm2
-	movdqu	-48(%rsi), %xmm1
-	mov	$-48, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(32bytes):
-	movdqu	-32(%rdi), %xmm2
-	movdqu	-32(%rsi), %xmm1
-	mov	$-32, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-
-	mov	-16(%rdi), %rax
-	mov	-16(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-/*
- * Aligned to 8 bytes to avoid two taken branches in one 16-byte aligned code block.
- */
-	.p2align 3
-L(less16bytes):
-	movsbq	%dl, %rdx
-	mov	(%rsi, %rdx), %rcx
-	mov	(%rdi, %rdx), %rax
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	8(%rsi, %rdx), %rcx
-	mov	8(%rdi, %rdx), %rax
-L(diffin8bytes):
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	shr	$32, %rcx
-	shr	$32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-# endif
-
-L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
-	cmp	%cx, %ax
-	jne	L(diffin2bytes)
-	shr	$16, %ecx
-	shr	$16, %eax
-L(diffin2bytes):
-	cmp	%cl, %al
-	jne	L(end)
-	and	$0xffff, %eax
-	and	$0xffff, %ecx
-	sub	%ecx, %eax
-	ret
-
-	.p2align 4
-L(end):
-	and	$0xff, %eax
-	and	$0xff, %ecx
-	sub	%ecx, %eax
-	ret
-# else
-
-/* for wmemcmp */
-	mov	$1, %eax
-	jl	L(nequal_bigger)
-	neg	%eax
-	ret
-
-	.p2align 4
-L(nequal_bigger):
-	ret
-
-L(unreal_case):
-	xor	%eax, %eax
-	ret
-# endif
-
-END (MEMCMP)
-
-	.section .rodata.sse4.1,"a",@progbits
-	.p2align 3
-# ifndef USE_AS_WMEMCMP
-L(table_64bytes):
-	.int	JMPTBL (L(0bytes), L(table_64bytes))
-	.int	JMPTBL (L(1bytes), L(table_64bytes))
-	.int	JMPTBL (L(2bytes), L(table_64bytes))
-	.int	JMPTBL (L(3bytes), L(table_64bytes))
-	.int	JMPTBL (L(4bytes), L(table_64bytes))
-	.int	JMPTBL (L(5bytes), L(table_64bytes))
-	.int	JMPTBL (L(6bytes), L(table_64bytes))
-	.int	JMPTBL (L(7bytes), L(table_64bytes))
-	.int	JMPTBL (L(8bytes), L(table_64bytes))
-	.int	JMPTBL (L(9bytes), L(table_64bytes))
-	.int	JMPTBL (L(10bytes), L(table_64bytes))
-	.int	JMPTBL (L(11bytes), L(table_64bytes))
-	.int	JMPTBL (L(12bytes), L(table_64bytes))
-	.int	JMPTBL (L(13bytes), L(table_64bytes))
-	.int	JMPTBL (L(14bytes), L(table_64bytes))
-	.int	JMPTBL (L(15bytes), L(table_64bytes))
-	.int	JMPTBL (L(16bytes), L(table_64bytes))
-	.int	JMPTBL (L(17bytes), L(table_64bytes))
-	.int	JMPTBL (L(18bytes), L(table_64bytes))
-	.int	JMPTBL (L(19bytes), L(table_64bytes))
-	.int	JMPTBL (L(20bytes), L(table_64bytes))
-	.int	JMPTBL (L(21bytes), L(table_64bytes))
-	.int	JMPTBL (L(22bytes), L(table_64bytes))
-	.int	JMPTBL (L(23bytes), L(table_64bytes))
-	.int	JMPTBL (L(24bytes), L(table_64bytes))
-	.int	JMPTBL (L(25bytes), L(table_64bytes))
-	.int	JMPTBL (L(26bytes), L(table_64bytes))
-	.int	JMPTBL (L(27bytes), L(table_64bytes))
-	.int	JMPTBL (L(28bytes), L(table_64bytes))
-	.int	JMPTBL (L(29bytes), L(table_64bytes))
-	.int	JMPTBL (L(30bytes), L(table_64bytes))
-	.int	JMPTBL (L(31bytes), L(table_64bytes))
-	.int	JMPTBL (L(32bytes), L(table_64bytes))
-	.int	JMPTBL (L(33bytes), L(table_64bytes))
-	.int	JMPTBL (L(34bytes), L(table_64bytes))
-	.int	JMPTBL (L(35bytes), L(table_64bytes))
-	.int	JMPTBL (L(36bytes), L(table_64bytes))
-	.int	JMPTBL (L(37bytes), L(table_64bytes))
-	.int	JMPTBL (L(38bytes), L(table_64bytes))
-	.int	JMPTBL (L(39bytes), L(table_64bytes))
-	.int	JMPTBL (L(40bytes), L(table_64bytes))
-	.int	JMPTBL (L(41bytes), L(table_64bytes))
-	.int	JMPTBL (L(42bytes), L(table_64bytes))
-	.int	JMPTBL (L(43bytes), L(table_64bytes))
-	.int	JMPTBL (L(44bytes), L(table_64bytes))
-	.int	JMPTBL (L(45bytes), L(table_64bytes))
-	.int	JMPTBL (L(46bytes), L(table_64bytes))
-	.int	JMPTBL (L(47bytes), L(table_64bytes))
-	.int	JMPTBL (L(48bytes), L(table_64bytes))
-	.int	JMPTBL (L(49bytes), L(table_64bytes))
-	.int	JMPTBL (L(50bytes), L(table_64bytes))
-	.int	JMPTBL (L(51bytes), L(table_64bytes))
-	.int	JMPTBL (L(52bytes), L(table_64bytes))
-	.int	JMPTBL (L(53bytes), L(table_64bytes))
-	.int	JMPTBL (L(54bytes), L(table_64bytes))
-	.int	JMPTBL (L(55bytes), L(table_64bytes))
-	.int	JMPTBL (L(56bytes), L(table_64bytes))
-	.int	JMPTBL (L(57bytes), L(table_64bytes))
-	.int	JMPTBL (L(58bytes), L(table_64bytes))
-	.int	JMPTBL (L(59bytes), L(table_64bytes))
-	.int	JMPTBL (L(60bytes), L(table_64bytes))
-	.int	JMPTBL (L(61bytes), L(table_64bytes))
-	.int	JMPTBL (L(62bytes), L(table_64bytes))
-	.int	JMPTBL (L(63bytes), L(table_64bytes))
-	.int	JMPTBL (L(64bytes), L(table_64bytes))
-	.int	JMPTBL (L(65bytes), L(table_64bytes))
-	.int	JMPTBL (L(66bytes), L(table_64bytes))
-	.int	JMPTBL (L(67bytes), L(table_64bytes))
-	.int	JMPTBL (L(68bytes), L(table_64bytes))
-	.int	JMPTBL (L(69bytes), L(table_64bytes))
-	.int	JMPTBL (L(70bytes), L(table_64bytes))
-	.int	JMPTBL (L(71bytes), L(table_64bytes))
-	.int	JMPTBL (L(72bytes), L(table_64bytes))
-	.int	JMPTBL (L(73bytes), L(table_64bytes))
-	.int	JMPTBL (L(74bytes), L(table_64bytes))
-	.int	JMPTBL (L(75bytes), L(table_64bytes))
-	.int	JMPTBL (L(76bytes), L(table_64bytes))
-	.int	JMPTBL (L(77bytes), L(table_64bytes))
-	.int	JMPTBL (L(78bytes), L(table_64bytes))
-	.int	JMPTBL (L(79bytes), L(table_64bytes))
-# else
-L(table_64bytes):
-	.int	JMPTBL (L(0bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(4bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(8bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(12bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(16bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(20bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(24bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(28bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(32bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(36bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(40bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(44bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(48bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(52bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(56bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(60bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(64bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(68bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(72bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(76bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-# endif
-#endif
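
For reference, the removed L(diffin8bytes)/L(diffin4bytes)/L(diffin2bytes)
cascade resolves a mismatching pair of 8-byte loads by halving the compared
width until the first differing byte decides the result; on little-endian
x86-64 the low byte is the byte at the lowest address, so narrowing from the
low end yields the first difference in memory order.  A minimal C sketch of
that narrowing (a hypothetical helper, not code from this patch):

  #include <stdint.h>

  /* a and b are corresponding 8-byte loads from s1 and s2 that are
     known to differ.  */
  static int
  diff_in_8bytes (uint64_t a, uint64_t b)
  {
    if ((uint32_t) a == (uint32_t) b)
      {
        a >>= 32;
        b >>= 32;
      }
    if ((uint16_t) a == (uint16_t) b)
      {
        a >>= 16;
        b >>= 16;
      }
    if ((uint8_t) a != (uint8_t) b)
      return (int) (a & 0xff) - (int) (b & 0xff);
    /* Low bytes equal: the 16-bit subtraction is decided by the second
       byte, matching the and $0xffff/sub path above.  */
    return (int) (a & 0xffff) - (int) (b & 0xffff);
  }
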
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index f8b4636..5d87a17 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -29,33 +29,28 @@  ENTRY(memcmp)
 	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
 	jne	1f
 	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+1:	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
 	jnz	2f
-	leaq	__memcmp_sse2(%rip), %rax
-	ret
-
-2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
-	jz	3f
-	leaq	__memcmp_sse4_1(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	3f
+2:	leaq	__memcmp_sse2_unaligned(%rip), %rax
 	ret
 
 3:	leaq	__memcmp_ssse3(%rip), %rax
 	ret
-
 END(memcmp)
 
 # undef ENTRY
 # define ENTRY(name) \
-	.type __memcmp_sse2, @function; \
+	.type __memcmp_sse2_unaligned, @function; \
 	.p2align 4; \
-	.globl __memcmp_sse2; \
-	.hidden __memcmp_sse2; \
-	__memcmp_sse2: cfi_startproc; \
+	.globl __memcmp_sse2_unaligned; \
+	.hidden __memcmp_sse2_unaligned; \
+	__memcmp_sse2_unaligned: cfi_startproc; \
 	CALL_MCOUNT
 # undef END
 # define END(name) \
-	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
+	cfi_endproc; .size __memcmp_sse2_unaligned, .-__memcmp_sse2_unaligned
 
 # ifdef SHARED
 #  undef libc_hidden_builtin_def
@@ -63,7 +58,7 @@  END(memcmp)
    they will be called without setting up EBX needed for PLT which is
    used by IFUNC.  */
 #  define libc_hidden_builtin_def(name) \
-	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2_unaligned
 # endif
 #endif
 
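
The net effect of the rewritten selector, as a C sketch (the two ints stand
in for the bit_Fast_Unaligned_Load and bit_SSSE3 tests against
__cpu_features; this is illustrative, not glibc code):

  #include <stddef.h>

  extern int __memcmp_sse2_unaligned (const void *, const void *, size_t);
  extern int __memcmp_ssse3 (const void *, const void *, size_t);

  extern int fast_unaligned_load, has_ssse3;

  typedef int (*memcmp_fn) (const void *, const void *, size_t);

  static memcmp_fn
  select_memcmp (void)
  {
    if (fast_unaligned_load)
      return __memcmp_sse2_unaligned;   /* replaces the sse4_1 variant  */
    if (has_ssse3)
      return __memcmp_ssse3;
    return __memcmp_sse2_unaligned;     /* also serves as the baseline  */
  }
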
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
index 695a236..5dd8d44 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
@@ -201,6 +201,10 @@  L(prepare_loop):
 	movdqu	%xmm2, 96(%rdi)
 	movdqu	%xmm3, 112(%rdi)
 
+#ifdef USE_AVX2
+	vpxor	%xmm5, %xmm5, %xmm5
+#endif
+
 	subq	%rsi, %rdi
 	add	$64, %rsi
 	andq	$-64, %rsi
@@ -348,10 +352,13 @@  L(cross_loop):
 	sub	$1, %rcx
 	ja	L(cross_loop)
 
+#ifdef USE_AVX2
+	vpxor	%xmm5, %xmm5, %xmm5
+#else
 	pxor	%xmm5, %xmm5
 	pxor	%xmm6, %xmm6
 	pxor	%xmm7, %xmm7
-
+#endif
 	lea	-64(%rsi), %rdx
 	andq	$-64, %rdx
 	addq	%rdx, %rdi
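
A note on the two stpcpy hunks: in a USE_AVX2 build the zero register has
to be produced by a VEX-encoded instruction, since mixing the legacy pxor
with 256-bit AVX code can incur SSE/AVX transition stalls on Intel
hardware, and the AVX2 loop needs only the single zero in %xmm5 where the
SSE path zeroes three registers.  In intrinsics terms (illustrative only):

  #include <immintrin.h>

  static inline __m128i
  vex_zero (void)
  {
    /* Compiled with -mavx2 this becomes vpxor %xmm0, %xmm0, %xmm0: the
       zero stays in the VEX domain and the upper half of the
       corresponding ymm register is cleared as well.  */
    return _mm_setzero_si128 ();
  }
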
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
deleted file mode 100644
index b07973a..0000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
+++ /dev/null
@@ -1,4 +0,0 @@ 
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_sse4_1
-
-#include "memcmp-sse4.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index 109e245..dabd3ed 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -30,18 +30,15 @@  ENTRY(wmemcmp)
 	jne	1f
 	call	__init_cpu_features
 
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+1:	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
 	jnz	2f
-	leaq	__wmemcmp_sse2(%rip), %rax
-	ret
-
-2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
-	jz	3f
-	leaq	__wmemcmp_sse4_1(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	3f
+2:	leaq	__wmemcmp_sse2_unaligned(%rip), %rax
 	ret
 
 3:	leaq	__wmemcmp_ssse3(%rip), %rax
 	ret
 
 END(wmemcmp)
 #endif
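
As with memcmp, wmemcmp now picks __wmemcmp_sse2_unaligned except on SSSE3
machines without Fast_Unaligned_Load.  The removed sse4_1 code returned ±1
from a signed 32-bit comparison (the jl/neg sequence in the deleted file
above), because wchar_t differences can overflow the byte-subtraction trick
memcmp uses; any selected variant has to match the sign of this
hypothetical reference loop:

  #include <wchar.h>

  int
  wmemcmp_ref (const wchar_t *s1, const wchar_t *s2, size_t n)
  {
    for (size_t i = 0; i < n; i++)
      if (s1[i] != s2[i])
        return s1[i] < s2[i] ? -1 : 1;
    return 0;
  }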