diff mbox series

[v1,17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S

Message ID 20220323215734.3927131-17-goldstein.w.n@gmail.com
State New
Headers show
Series [v1,01/23] benchtests: Use json-lib in bench-strchr.c | expand

Commit Message

Noah Goldstein March 23, 2022, 9:57 p.m. UTC
Slightly faster method of doing TOLOWER that saves an
instruction.

Also replace the hard coded 5-byte no with .p2align 4. On builds with
CET enabled this misaligned entry to strcasecmp.

geometric_mean(N=40) of all benchmarks New / Original: .894

All string/memory tests pass.
---
Geomtric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, New Time / Old Time
     1,      1,      1,      127,               0.903
     2,      2,      2,      127,               0.905
     3,      3,      3,      127,               0.877
     4,      4,      4,      127,               0.888
     5,      5,      5,      127,               0.901
     6,      6,      6,      127,               0.954
     7,      7,      7,      127,               0.932
     8,      0,      0,      127,               0.918
     9,      1,      1,      127,               0.914
    10,      2,      2,      127,               0.877
    11,      3,      3,      127,               0.909
    12,      4,      4,      127,               0.876
    13,      5,      5,      127,               0.886
    14,      6,      6,      127,               0.914
    15,      7,      7,      127,               0.939
     4,      0,      0,      127,               0.963
     4,      0,      0,      254,               0.943
     8,      0,      0,      254,               0.927
    16,      0,      0,      127,               0.876
    16,      0,      0,      254,               0.865
    32,      0,      0,      127,               0.865
    32,      0,      0,      254,               0.862
    64,      0,      0,      127,               0.863
    64,      0,      0,      254,               0.896
   128,      0,      0,      127,               0.885
   128,      0,      0,      254,               0.882
   256,      0,      0,      127,                0.87
   256,      0,      0,      254,               0.869
   512,      0,      0,      127,               0.832
   512,      0,      0,      254,               0.848
  1024,      0,      0,      127,               0.835
  1024,      0,      0,      254,               0.843
    16,      1,      2,      127,               0.914
    16,      2,      1,      254,               0.949
    32,      2,      4,      127,               0.955
    32,      4,      2,      254,               1.004
    64,      3,      6,      127,               0.844
    64,      6,      3,      254,               0.905
   128,      4,      0,      127,               0.889
   128,      0,      4,      254,               0.845
   256,      5,      2,      127,               0.929
   256,      2,      5,      254,               0.907
   512,      6,      4,      127,               0.837
   512,      4,      6,      254,               0.862
  1024,      7,      6,      127,               0.895
  1024,      6,      7,      254,                0.89

 sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 35 deletions(-)

Comments

H.J. Lu March 24, 2022, 7:02 p.m. UTC | #1
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Slightly faster method of doing TOLOWER that saves an
> instruction.
>
> Also replace the hard coded 5-byte no with .p2align 4. On builds with
> CET enabled this misaligned entry to strcasecmp.
>
> geometric_mean(N=40) of all benchmarks New / Original: .894
>
> All string/memory tests pass.
> ---
> Geomtric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, New Time / Old Time
>      1,      1,      1,      127,               0.903
>      2,      2,      2,      127,               0.905
>      3,      3,      3,      127,               0.877
>      4,      4,      4,      127,               0.888
>      5,      5,      5,      127,               0.901
>      6,      6,      6,      127,               0.954
>      7,      7,      7,      127,               0.932
>      8,      0,      0,      127,               0.918
>      9,      1,      1,      127,               0.914
>     10,      2,      2,      127,               0.877
>     11,      3,      3,      127,               0.909
>     12,      4,      4,      127,               0.876
>     13,      5,      5,      127,               0.886
>     14,      6,      6,      127,               0.914
>     15,      7,      7,      127,               0.939
>      4,      0,      0,      127,               0.963
>      4,      0,      0,      254,               0.943
>      8,      0,      0,      254,               0.927
>     16,      0,      0,      127,               0.876
>     16,      0,      0,      254,               0.865
>     32,      0,      0,      127,               0.865
>     32,      0,      0,      254,               0.862
>     64,      0,      0,      127,               0.863
>     64,      0,      0,      254,               0.896
>    128,      0,      0,      127,               0.885
>    128,      0,      0,      254,               0.882
>    256,      0,      0,      127,                0.87
>    256,      0,      0,      254,               0.869
>    512,      0,      0,      127,               0.832
>    512,      0,      0,      254,               0.848
>   1024,      0,      0,      127,               0.835
>   1024,      0,      0,      254,               0.843
>     16,      1,      2,      127,               0.914
>     16,      2,      1,      254,               0.949
>     32,      2,      4,      127,               0.955
>     32,      4,      2,      254,               1.004
>     64,      3,      6,      127,               0.844
>     64,      6,      3,      254,               0.905
>    128,      4,      0,      127,               0.889
>    128,      0,      4,      254,               0.845
>    256,      5,      2,      127,               0.929
>    256,      2,      5,      254,               0.907
>    512,      6,      4,      127,               0.837
>    512,      4,      6,      254,               0.862
>   1024,      7,      6,      127,               0.895
>   1024,      6,      7,      254,                0.89
>
>  sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
>  1 file changed, 29 insertions(+), 35 deletions(-)
>
> diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> index e2ab59c555..99d8b36f1d 100644
> --- a/sysdeps/x86_64/strcmp.S
> +++ b/sysdeps/x86_64/strcmp.S
> @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RDX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> +       .p2align 4
>  END2 (__strcasecmp)
>  # ifndef NO_NOLOCALE_ALIAS
>  weak_alias (__strcasecmp, strcasecmp)
> @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RCX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> +       .p2align 4
>  END2 (__strncasecmp)
>  # ifndef NO_NOLOCALE_ALIAS
>  weak_alias (__strncasecmp, strncasecmp)
> @@ -146,22 +144,22 @@ ENTRY (STRCMP)
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
>         .section .rodata.cst16,"aM",@progbits,16
>         .align 16
> -.Lbelowupper:
> -       .quad   0x4040404040404040
> -       .quad   0x4040404040404040
> -.Ltopupper:
> -       .quad   0x5b5b5b5b5b5b5b5b
> -       .quad   0x5b5b5b5b5b5b5b5b
> -.Ltouppermask:
> +.Llcase_min:
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +.Llcase_max:
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +.Lcase_add:
>         .quad   0x2020202020202020
>         .quad   0x2020202020202020
>         .previous
> -       movdqa  .Lbelowupper(%rip), %xmm5
> -# define UCLOW_reg %xmm5
> -       movdqa  .Ltopupper(%rip), %xmm6
> -# define UCHIGH_reg %xmm6
> -       movdqa  .Ltouppermask(%rip), %xmm7
> -# define LCQWORD_reg %xmm7
> +       movdqa  .Llcase_min(%rip), %xmm5
> +# define LCASE_MIN_reg %xmm5
> +       movdqa  .Llcase_max(%rip), %xmm6
> +# define LCASE_MAX_reg %xmm6
> +       movdqa  .Lcase_add(%rip), %xmm7
> +# define CASE_ADD_reg %xmm7
>  #endif
>         cmp     $0x30, %ecx
>         ja      LABEL(crosscache)       /* rsi: 16-byte load will cross cache line */
> @@ -172,22 +170,18 @@ ENTRY (STRCMP)
>         movhpd  8(%rdi), %xmm1
>         movhpd  8(%rsi), %xmm2
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# define TOLOWER(reg1, reg2) \
> -       movdqa  reg1, %xmm8;                                    \
> -       movdqa  UCHIGH_reg, %xmm9;                              \
> -       movdqa  reg2, %xmm10;                                   \
> -       movdqa  UCHIGH_reg, %xmm11;                             \
> -       pcmpgtb UCLOW_reg, %xmm8;                               \
> -       pcmpgtb reg1, %xmm9;                                    \
> -       pcmpgtb UCLOW_reg, %xmm10;                              \
> -       pcmpgtb reg2, %xmm11;                                   \
> -       pand    %xmm9, %xmm8;                                   \
> -       pand    %xmm11, %xmm10;                                 \
> -       pand    LCQWORD_reg, %xmm8;                             \
> -       pand    LCQWORD_reg, %xmm10;                            \
> -       por     %xmm8, reg1;                                    \
> -       por     %xmm10, reg2
> -       TOLOWER (%xmm1, %xmm2)
> +#  define TOLOWER(reg1, reg2) \
> +       movdqa  LCASE_MIN_reg, %xmm8;                                   \
> +       movdqa  LCASE_MIN_reg, %xmm9;                                   \
> +       paddb   reg1, %xmm8;                                    \
> +       paddb   reg2, %xmm9;                                    \
> +       pcmpgtb LCASE_MAX_reg, %xmm8;                           \
> +       pcmpgtb LCASE_MAX_reg, %xmm9;                           \
> +       pandn   CASE_ADD_reg, %xmm8;                                    \
> +       pandn   CASE_ADD_reg, %xmm9;                                    \
> +       paddb   %xmm8, reg1;                                    \
> +       paddb   %xmm9, reg2
> +       TOLOWER (%xmm1, %xmm2)
>  #else
>  # define TOLOWER(reg1, reg2)
>  #endif
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
Sunil Pandey May 12, 2022, 7:44 p.m. UTC | #2
On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Slightly faster method of doing TOLOWER that saves an
> > instruction.
> >
> > Also replace the hard coded 5-byte no with .p2align 4. On builds with
> > CET enabled this misaligned entry to strcasecmp.
> >
> > geometric_mean(N=40) of all benchmarks New / Original: .894
> >
> > All string/memory tests pass.
> > ---
> > Geomtric Mean N=40 runs; All functions page aligned
> > length, align1, align2, max_char, New Time / Old Time
> >      1,      1,      1,      127,               0.903
> >      2,      2,      2,      127,               0.905
> >      3,      3,      3,      127,               0.877
> >      4,      4,      4,      127,               0.888
> >      5,      5,      5,      127,               0.901
> >      6,      6,      6,      127,               0.954
> >      7,      7,      7,      127,               0.932
> >      8,      0,      0,      127,               0.918
> >      9,      1,      1,      127,               0.914
> >     10,      2,      2,      127,               0.877
> >     11,      3,      3,      127,               0.909
> >     12,      4,      4,      127,               0.876
> >     13,      5,      5,      127,               0.886
> >     14,      6,      6,      127,               0.914
> >     15,      7,      7,      127,               0.939
> >      4,      0,      0,      127,               0.963
> >      4,      0,      0,      254,               0.943
> >      8,      0,      0,      254,               0.927
> >     16,      0,      0,      127,               0.876
> >     16,      0,      0,      254,               0.865
> >     32,      0,      0,      127,               0.865
> >     32,      0,      0,      254,               0.862
> >     64,      0,      0,      127,               0.863
> >     64,      0,      0,      254,               0.896
> >    128,      0,      0,      127,               0.885
> >    128,      0,      0,      254,               0.882
> >    256,      0,      0,      127,                0.87
> >    256,      0,      0,      254,               0.869
> >    512,      0,      0,      127,               0.832
> >    512,      0,      0,      254,               0.848
> >   1024,      0,      0,      127,               0.835
> >   1024,      0,      0,      254,               0.843
> >     16,      1,      2,      127,               0.914
> >     16,      2,      1,      254,               0.949
> >     32,      2,      4,      127,               0.955
> >     32,      4,      2,      254,               1.004
> >     64,      3,      6,      127,               0.844
> >     64,      6,      3,      254,               0.905
> >    128,      4,      0,      127,               0.889
> >    128,      0,      4,      254,               0.845
> >    256,      5,      2,      127,               0.929
> >    256,      2,      5,      254,               0.907
> >    512,      6,      4,      127,               0.837
> >    512,      4,      6,      254,               0.862
> >   1024,      7,      6,      127,               0.895
> >   1024,      6,      7,      254,                0.89
> >
> >  sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
> >  1 file changed, 29 insertions(+), 35 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> > index e2ab59c555..99d8b36f1d 100644
> > --- a/sysdeps/x86_64/strcmp.S
> > +++ b/sysdeps/x86_64/strcmp.S
> > @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
> >         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
> >         mov     %fs:(%rax),%RDX_LP
> >
> > -       // XXX 5 byte should be before the function
> > -       /* 5-byte NOP.  */
> > -       .byte   0x0f,0x1f,0x44,0x00,0x00
> > +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> > +       .p2align 4
> >  END2 (__strcasecmp)
> >  # ifndef NO_NOLOCALE_ALIAS
> >  weak_alias (__strcasecmp, strcasecmp)
> > @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
> >         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
> >         mov     %fs:(%rax),%RCX_LP
> >
> > -       // XXX 5 byte should be before the function
> > -       /* 5-byte NOP.  */
> > -       .byte   0x0f,0x1f,0x44,0x00,0x00
> > +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> > +       .p2align 4
> >  END2 (__strncasecmp)
> >  # ifndef NO_NOLOCALE_ALIAS
> >  weak_alias (__strncasecmp, strncasecmp)
> > @@ -146,22 +144,22 @@ ENTRY (STRCMP)
> >  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> >         .section .rodata.cst16,"aM",@progbits,16
> >         .align 16
> > -.Lbelowupper:
> > -       .quad   0x4040404040404040
> > -       .quad   0x4040404040404040
> > -.Ltopupper:
> > -       .quad   0x5b5b5b5b5b5b5b5b
> > -       .quad   0x5b5b5b5b5b5b5b5b
> > -.Ltouppermask:
> > +.Llcase_min:
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +.Llcase_max:
> > +       .quad   0x9999999999999999
> > +       .quad   0x9999999999999999
> > +.Lcase_add:
> >         .quad   0x2020202020202020
> >         .quad   0x2020202020202020
> >         .previous
> > -       movdqa  .Lbelowupper(%rip), %xmm5
> > -# define UCLOW_reg %xmm5
> > -       movdqa  .Ltopupper(%rip), %xmm6
> > -# define UCHIGH_reg %xmm6
> > -       movdqa  .Ltouppermask(%rip), %xmm7
> > -# define LCQWORD_reg %xmm7
> > +       movdqa  .Llcase_min(%rip), %xmm5
> > +# define LCASE_MIN_reg %xmm5
> > +       movdqa  .Llcase_max(%rip), %xmm6
> > +# define LCASE_MAX_reg %xmm6
> > +       movdqa  .Lcase_add(%rip), %xmm7
> > +# define CASE_ADD_reg %xmm7
> >  #endif
> >         cmp     $0x30, %ecx
> >         ja      LABEL(crosscache)       /* rsi: 16-byte load will cross cache line */
> > @@ -172,22 +170,18 @@ ENTRY (STRCMP)
> >         movhpd  8(%rdi), %xmm1
> >         movhpd  8(%rsi), %xmm2
> >  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > -# define TOLOWER(reg1, reg2) \
> > -       movdqa  reg1, %xmm8;                                    \
> > -       movdqa  UCHIGH_reg, %xmm9;                              \
> > -       movdqa  reg2, %xmm10;                                   \
> > -       movdqa  UCHIGH_reg, %xmm11;                             \
> > -       pcmpgtb UCLOW_reg, %xmm8;                               \
> > -       pcmpgtb reg1, %xmm9;                                    \
> > -       pcmpgtb UCLOW_reg, %xmm10;                              \
> > -       pcmpgtb reg2, %xmm11;                                   \
> > -       pand    %xmm9, %xmm8;                                   \
> > -       pand    %xmm11, %xmm10;                                 \
> > -       pand    LCQWORD_reg, %xmm8;                             \
> > -       pand    LCQWORD_reg, %xmm10;                            \
> > -       por     %xmm8, reg1;                                    \
> > -       por     %xmm10, reg2
> > -       TOLOWER (%xmm1, %xmm2)
> > +#  define TOLOWER(reg1, reg2) \
> > +       movdqa  LCASE_MIN_reg, %xmm8;                                   \
> > +       movdqa  LCASE_MIN_reg, %xmm9;                                   \
> > +       paddb   reg1, %xmm8;                                    \
> > +       paddb   reg2, %xmm9;                                    \
> > +       pcmpgtb LCASE_MAX_reg, %xmm8;                           \
> > +       pcmpgtb LCASE_MAX_reg, %xmm9;                           \
> > +       pandn   CASE_ADD_reg, %xmm8;                                    \
> > +       pandn   CASE_ADD_reg, %xmm9;                                    \
> > +       paddb   %xmm8, reg1;                                    \
> > +       paddb   %xmm9, reg2
> > +       TOLOWER (%xmm1, %xmm2)
> >  #else
> >  # define TOLOWER(reg1, reg2)
> >  #endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
diff mbox series

Patch

diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index e2ab59c555..99d8b36f1d 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -75,9 +75,8 @@  ENTRY2 (__strcasecmp)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END2 (__strcasecmp)
 # ifndef NO_NOLOCALE_ALIAS
 weak_alias (__strcasecmp, strcasecmp)
@@ -94,9 +93,8 @@  ENTRY2 (__strncasecmp)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END2 (__strncasecmp)
 # ifndef NO_NOLOCALE_ALIAS
 weak_alias (__strncasecmp, strncasecmp)
@@ -146,22 +144,22 @@  ENTRY (STRCMP)
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-.Lbelowupper:
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-.Ltopupper:
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-.Ltouppermask:
+.Llcase_min:
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+.Llcase_max:
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+.Lcase_add:
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	.Lbelowupper(%rip), %xmm5
-# define UCLOW_reg %xmm5
-	movdqa	.Ltopupper(%rip), %xmm6
-# define UCHIGH_reg %xmm6
-	movdqa	.Ltouppermask(%rip), %xmm7
-# define LCQWORD_reg %xmm7
+	movdqa	.Llcase_min(%rip), %xmm5
+# define LCASE_MIN_reg %xmm5
+	movdqa	.Llcase_max(%rip), %xmm6
+# define LCASE_MAX_reg %xmm6
+	movdqa	.Lcase_add(%rip), %xmm7
+# define CASE_ADD_reg %xmm7
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
@@ -172,22 +170,18 @@  ENTRY (STRCMP)
 	movhpd	8(%rdi), %xmm1
 	movhpd	8(%rsi), %xmm2
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm8;					\
-	movdqa	UCHIGH_reg, %xmm9;				\
-	movdqa	reg2, %xmm10;					\
-	movdqa	UCHIGH_reg, %xmm11;				\
-	pcmpgtb	UCLOW_reg, %xmm8;				\
-	pcmpgtb	reg1, %xmm9;					\
-	pcmpgtb	UCLOW_reg, %xmm10;				\
-	pcmpgtb	reg2, %xmm11;					\
-	pand	%xmm9, %xmm8;					\
-	pand	%xmm11, %xmm10;					\
-	pand	LCQWORD_reg, %xmm8;				\
-	pand	LCQWORD_reg, %xmm10;				\
-	por	%xmm8, reg1;					\
-	por	%xmm10, reg2
-	TOLOWER (%xmm1, %xmm2)
+#  define TOLOWER(reg1, reg2) \
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	movdqa	LCASE_MIN_reg, %xmm9;					\
+	paddb	reg1, %xmm8;					\
+	paddb	reg2, %xmm9;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm9;				\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	pandn	CASE_ADD_reg, %xmm9;					\
+	paddb	%xmm8, reg1;					\
+	paddb	%xmm9, reg2
+	TOLOWER	(%xmm1, %xmm2)
 #else
 # define TOLOWER(reg1, reg2)
 #endif