Message ID | 20220323215734.3927131-18-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1,01/23] benchtests: Use json-lib in bench-strchr.c | expand |
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Slightly faster method of doing TOLOWER that saves an > instruction. > > Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with > CET enabled, the hard-coded NOP misaligned the entry to strcasecmp. > > geometric_mean(N=40) of all benchmarks New / Original: .920 > > All string/memory tests pass. > --- > Geometric Mean N=40 runs; All functions page aligned > length, align1, align2, max_char, New Time / Old Time > 1, 1, 1, 127, 0.914 > 2, 2, 2, 127, 0.952 > 3, 3, 3, 127, 0.924 > 4, 4, 4, 127, 0.995 > 5, 5, 5, 127, 0.985 > 6, 6, 6, 127, 1.017 > 7, 7, 7, 127, 1.031 > 8, 0, 0, 127, 0.967 > 9, 1, 1, 127, 0.969 > 10, 2, 2, 127, 0.951 > 11, 3, 3, 127, 0.938 > 12, 4, 4, 127, 0.937 > 13, 5, 5, 127, 0.967 > 14, 6, 6, 127, 0.941 > 15, 7, 7, 127, 0.951 > 4, 0, 0, 127, 0.959 > 4, 0, 0, 254, 0.98 > 8, 0, 0, 254, 0.959 > 16, 0, 0, 127, 0.895 > 16, 0, 0, 254, 0.901 > 32, 0, 0, 127, 0.85 > 32, 0, 0, 254, 0.851 > 64, 0, 0, 127, 0.897 > 64, 0, 0, 254, 0.895 > 128, 0, 0, 127, 0.944 > 128, 0, 0, 254, 0.935 > 256, 0, 0, 127, 0.922 > 256, 0, 0, 254, 0.913 > 512, 0, 0, 127, 0.921 > 512, 0, 0, 254, 0.914 > 1024, 0, 0, 127, 0.845 > 1024, 0, 0, 254, 0.84 > 16, 1, 2, 127, 0.923 > 16, 2, 1, 254, 0.955 > 32, 2, 4, 127, 0.979 > 32, 4, 2, 254, 0.957 > 64, 3, 6, 127, 0.866 > 64, 6, 3, 254, 0.849 > 128, 4, 0, 127, 0.882 > 128, 0, 4, 254, 0.876 > 256, 5, 2, 127, 0.877 > 256, 2, 5, 254, 0.882 > 512, 6, 4, 127, 0.822 > 512, 4, 6, 254, 0.862 > 1024, 7, 6, 127, 0.903 > 1024, 6, 7, 254, 0.908 > > sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++-------------- > 1 file changed, 35 insertions(+), 48 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S > index 580feb90e9..7805ae9d41 100644 > --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S > +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S > @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp)) > movq 
__libc_tsd_LOCALE@gottpoff(%rip),%rax > mov %fs:(%rax),%RDX_LP > > - // XXX 5 byte should be before the function > - /* 5-byte NOP. */ > - .byte 0x0f,0x1f,0x44,0x00,0x00 > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ > + .p2align 4 > END (GLABEL(__strcasecmp)) > /* FALLTHROUGH to strcasecmp_l. */ > #endif > @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp)) > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > mov %fs:(%rax),%RCX_LP > > - // XXX 5 byte should be before the function > - /* 5-byte NOP. */ > - .byte 0x0f,0x1f,0x44,0x00,0x00 > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ > + .p2align 4 > END (GLABEL(__strncasecmp)) > /* FALLTHROUGH to strncasecmp_l. */ > #endif > @@ -169,27 +167,22 @@ STRCMP_SSE42: > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > .section .rodata.cst16,"aM",@progbits,16 > .align 16 > -LABEL(belowupper): > - .quad 0x4040404040404040 > - .quad 0x4040404040404040 > -LABEL(topupper): > -# ifdef USE_AVX > - .quad 0x5a5a5a5a5a5a5a5a > - .quad 0x5a5a5a5a5a5a5a5a > -# else > - .quad 0x5b5b5b5b5b5b5b5b > - .quad 0x5b5b5b5b5b5b5b5b > -# endif > -LABEL(touppermask): > +LABEL(lcase_min): > + .quad 0x3f3f3f3f3f3f3f3f > + .quad 0x3f3f3f3f3f3f3f3f > +LABEL(lcase_max): > + .quad 0x9999999999999999 > + .quad 0x9999999999999999 > +LABEL(case_add): > .quad 0x2020202020202020 > .quad 0x2020202020202020 > .previous > - movdqa LABEL(belowupper)(%rip), %xmm4 > -# define UCLOW_reg %xmm4 > - movdqa LABEL(topupper)(%rip), %xmm5 > -# define UCHIGH_reg %xmm5 > - movdqa LABEL(touppermask)(%rip), %xmm6 > -# define LCQWORD_reg %xmm6 > + movdqa LABEL(lcase_min)(%rip), %xmm4 > +# define LCASE_MIN_reg %xmm4 > + movdqa LABEL(lcase_max)(%rip), %xmm5 > +# define LCASE_MAX_reg %xmm5 > + movdqa LABEL(case_add)(%rip), %xmm6 > +# define CASE_ADD_reg %xmm6 > #endif > cmp $0x30, %ecx > ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ > @@ -200,32 +193,26 @@ LABEL(touppermask): > #if defined USE_AS_STRCASECMP_L || defined 
USE_AS_STRNCASECMP_L > # ifdef USE_AVX > # define TOLOWER(reg1, reg2) \ > - vpcmpgtb UCLOW_reg, reg1, %xmm7; \ > - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ > - vpcmpgtb UCLOW_reg, reg2, %xmm9; \ > - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ > - vpandn %xmm7, %xmm8, %xmm8; \ > - vpandn %xmm9, %xmm10, %xmm10; \ > - vpand LCQWORD_reg, %xmm8, %xmm8; \ > - vpand LCQWORD_reg, %xmm10, %xmm10; \ > - vpor reg1, %xmm8, reg1; \ > - vpor reg2, %xmm10, reg2 > + vpaddb LCASE_MIN_reg, reg1, %xmm7; \ > + vpaddb LCASE_MIN_reg, reg2, %xmm8; \ > + vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ > + vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ > + vpandn CASE_ADD_reg, %xmm7, %xmm7; \ > + vpandn CASE_ADD_reg, %xmm8, %xmm8; \ > + vpaddb %xmm7, reg1, reg1; \ > + vpaddb %xmm8, reg2, reg2 > # else > # define TOLOWER(reg1, reg2) \ > - movdqa reg1, %xmm7; \ > - movdqa UCHIGH_reg, %xmm8; \ > - movdqa reg2, %xmm9; \ > - movdqa UCHIGH_reg, %xmm10; \ > - pcmpgtb UCLOW_reg, %xmm7; \ > - pcmpgtb reg1, %xmm8; \ > - pcmpgtb UCLOW_reg, %xmm9; \ > - pcmpgtb reg2, %xmm10; \ > - pand %xmm8, %xmm7; \ > - pand %xmm10, %xmm9; \ > - pand LCQWORD_reg, %xmm7; \ > - pand LCQWORD_reg, %xmm9; \ > - por %xmm7, reg1; \ > - por %xmm9, reg2 > + movdqa LCASE_MIN_reg, %xmm7; \ > + movdqa LCASE_MIN_reg, %xmm8; \ > + paddb reg1, %xmm7; \ > + paddb reg2, %xmm8; \ > + pcmpgtb LCASE_MAX_reg, %xmm7; \ > + pcmpgtb LCASE_MAX_reg, %xmm8; \ > + pandn CASE_ADD_reg, %xmm7; \ > + pandn CASE_ADD_reg, %xmm8; \ > + paddb %xmm7, reg1; \ > + paddb %xmm8, reg2 > # endif > TOLOWER (%xmm1, %xmm2) > #else > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks.
On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > Slightly faster method of doing TOLOWER that saves an > > instruction. > > > > Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with > > CET enabled, the hard-coded NOP misaligned the entry to strcasecmp. > > > > geometric_mean(N=40) of all benchmarks New / Original: .920 > > > > All string/memory tests pass. > > --- > > Geometric Mean N=40 runs; All functions page aligned > > length, align1, align2, max_char, New Time / Old Time > > 1, 1, 1, 127, 0.914 > > 2, 2, 2, 127, 0.952 > > 3, 3, 3, 127, 0.924 > > 4, 4, 4, 127, 0.995 > > 5, 5, 5, 127, 0.985 > > 6, 6, 6, 127, 1.017 > > 7, 7, 7, 127, 1.031 > > 8, 0, 0, 127, 0.967 > > 9, 1, 1, 127, 0.969 > > 10, 2, 2, 127, 0.951 > > 11, 3, 3, 127, 0.938 > > 12, 4, 4, 127, 0.937 > > 13, 5, 5, 127, 0.967 > > 14, 6, 6, 127, 0.941 > > 15, 7, 7, 127, 0.951 > > 4, 0, 0, 127, 0.959 > > 4, 0, 0, 254, 0.98 > > 8, 0, 0, 254, 0.959 > > 16, 0, 0, 127, 0.895 > > 16, 0, 0, 254, 0.901 > > 32, 0, 0, 127, 0.85 > > 32, 0, 0, 254, 0.851 > > 64, 0, 0, 127, 0.897 > > 64, 0, 0, 254, 0.895 > > 128, 0, 0, 127, 0.944 > > 128, 0, 0, 254, 0.935 > > 256, 0, 0, 127, 0.922 > > 256, 0, 0, 254, 0.913 > > 512, 0, 0, 127, 0.921 > > 512, 0, 0, 254, 0.914 > > 1024, 0, 0, 127, 0.845 > > 1024, 0, 0, 254, 0.84 > > 16, 1, 2, 127, 0.923 > > 16, 2, 1, 254, 0.955 > > 32, 2, 4, 127, 0.979 > > 32, 4, 2, 254, 0.957 > > 64, 3, 6, 127, 0.866 > > 64, 6, 3, 254, 0.849 > > 128, 4, 0, 127, 0.882 > > 128, 0, 4, 254, 0.876 > > 256, 5, 2, 127, 0.877 > > 256, 2, 5, 254, 0.882 > > 512, 6, 4, 127, 0.822 > > 512, 4, 6, 254, 0.862 > > 1024, 7, 6, 127, 0.903 > > 1024, 6, 7, 254, 0.908 > > > > sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++-------------- > > 1 file changed, 35 insertions(+), 48 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S 
b/sysdeps/x86_64/multiarch/strcmp-sse42.S > > index 580feb90e9..7805ae9d41 100644 > > --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S > > +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S > > @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp)) > > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > > mov %fs:(%rax),%RDX_LP > > > > - // XXX 5 byte should be before the function > > - /* 5-byte NOP. */ > > - .byte 0x0f,0x1f,0x44,0x00,0x00 > > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ > > + .p2align 4 > > END (GLABEL(__strcasecmp)) > > /* FALLTHROUGH to strcasecmp_l. */ > > #endif > > @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp)) > > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > > mov %fs:(%rax),%RCX_LP > > > > - // XXX 5 byte should be before the function > > - /* 5-byte NOP. */ > > - .byte 0x0f,0x1f,0x44,0x00,0x00 > > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ > > + .p2align 4 > > END (GLABEL(__strncasecmp)) > > /* FALLTHROUGH to strncasecmp_l. */ > > #endif > > @@ -169,27 +167,22 @@ STRCMP_SSE42: > > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > > .section .rodata.cst16,"aM",@progbits,16 > > .align 16 > > -LABEL(belowupper): > > - .quad 0x4040404040404040 > > - .quad 0x4040404040404040 > > -LABEL(topupper): > > -# ifdef USE_AVX > > - .quad 0x5a5a5a5a5a5a5a5a > > - .quad 0x5a5a5a5a5a5a5a5a > > -# else > > - .quad 0x5b5b5b5b5b5b5b5b > > - .quad 0x5b5b5b5b5b5b5b5b > > -# endif > > -LABEL(touppermask): > > +LABEL(lcase_min): > > + .quad 0x3f3f3f3f3f3f3f3f > > + .quad 0x3f3f3f3f3f3f3f3f > > +LABEL(lcase_max): > > + .quad 0x9999999999999999 > > + .quad 0x9999999999999999 > > +LABEL(case_add): > > .quad 0x2020202020202020 > > .quad 0x2020202020202020 > > .previous > > - movdqa LABEL(belowupper)(%rip), %xmm4 > > -# define UCLOW_reg %xmm4 > > - movdqa LABEL(topupper)(%rip), %xmm5 > > -# define UCHIGH_reg %xmm5 > > - movdqa LABEL(touppermask)(%rip), %xmm6 > > -# define LCQWORD_reg %xmm6 > > + movdqa LABEL(lcase_min)(%rip), %xmm4 > > +# 
define LCASE_MIN_reg %xmm4 > > + movdqa LABEL(lcase_max)(%rip), %xmm5 > > +# define LCASE_MAX_reg %xmm5 > > + movdqa LABEL(case_add)(%rip), %xmm6 > > +# define CASE_ADD_reg %xmm6 > > #endif > > cmp $0x30, %ecx > > ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ > > @@ -200,32 +193,26 @@ LABEL(touppermask): > > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > > # ifdef USE_AVX > > # define TOLOWER(reg1, reg2) \ > > - vpcmpgtb UCLOW_reg, reg1, %xmm7; \ > > - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ > > - vpcmpgtb UCLOW_reg, reg2, %xmm9; \ > > - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ > > - vpandn %xmm7, %xmm8, %xmm8; \ > > - vpandn %xmm9, %xmm10, %xmm10; \ > > - vpand LCQWORD_reg, %xmm8, %xmm8; \ > > - vpand LCQWORD_reg, %xmm10, %xmm10; \ > > - vpor reg1, %xmm8, reg1; \ > > - vpor reg2, %xmm10, reg2 > > + vpaddb LCASE_MIN_reg, reg1, %xmm7; \ > > + vpaddb LCASE_MIN_reg, reg2, %xmm8; \ > > + vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ > > + vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ > > + vpandn CASE_ADD_reg, %xmm7, %xmm7; \ > > + vpandn CASE_ADD_reg, %xmm8, %xmm8; \ > > + vpaddb %xmm7, reg1, reg1; \ > > + vpaddb %xmm8, reg2, reg2 > > # else > > # define TOLOWER(reg1, reg2) \ > > - movdqa reg1, %xmm7; \ > > - movdqa UCHIGH_reg, %xmm8; \ > > - movdqa reg2, %xmm9; \ > > - movdqa UCHIGH_reg, %xmm10; \ > > - pcmpgtb UCLOW_reg, %xmm7; \ > > - pcmpgtb reg1, %xmm8; \ > > - pcmpgtb UCLOW_reg, %xmm9; \ > > - pcmpgtb reg2, %xmm10; \ > > - pand %xmm8, %xmm7; \ > > - pand %xmm10, %xmm9; \ > > - pand LCQWORD_reg, %xmm7; \ > > - pand LCQWORD_reg, %xmm9; \ > > - por %xmm7, reg1; \ > > - por %xmm9, reg2 > > + movdqa LCASE_MIN_reg, %xmm7; \ > > + movdqa LCASE_MIN_reg, %xmm8; \ > > + paddb reg1, %xmm7; \ > > + paddb reg2, %xmm8; \ > > + pcmpgtb LCASE_MAX_reg, %xmm7; \ > > + pcmpgtb LCASE_MAX_reg, %xmm8; \ > > + pandn CASE_ADD_reg, %xmm7; \ > > + pandn CASE_ADD_reg, %xmm8; \ > > + paddb %xmm7, reg1; \ > > + paddb %xmm8, reg2 > > # endif > > TOLOWER (%xmm1, %xmm2) > > 
#else > > -- > > 2.25.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S index 580feb90e9..7805ae9d41 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp)) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RDX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 END (GLABEL(__strcasecmp)) /* FALLTHROUGH to strcasecmp_l. */ #endif @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp)) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RCX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 END (GLABEL(__strncasecmp)) /* FALLTHROUGH to strncasecmp_l. */ #endif @@ -169,27 +167,22 @@ STRCMP_SSE42: #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L .section .rodata.cst16,"aM",@progbits,16 .align 16 -LABEL(belowupper): - .quad 0x4040404040404040 - .quad 0x4040404040404040 -LABEL(topupper): -# ifdef USE_AVX - .quad 0x5a5a5a5a5a5a5a5a - .quad 0x5a5a5a5a5a5a5a5a -# else - .quad 0x5b5b5b5b5b5b5b5b - .quad 0x5b5b5b5b5b5b5b5b -# endif -LABEL(touppermask): +LABEL(lcase_min): + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +LABEL(lcase_max): + .quad 0x9999999999999999 + .quad 0x9999999999999999 +LABEL(case_add): .quad 0x2020202020202020 .quad 0x2020202020202020 .previous - movdqa LABEL(belowupper)(%rip), %xmm4 -# define UCLOW_reg %xmm4 - movdqa LABEL(topupper)(%rip), %xmm5 -# define UCHIGH_reg %xmm5 - movdqa LABEL(touppermask)(%rip), %xmm6 -# define LCQWORD_reg %xmm6 + movdqa LABEL(lcase_min)(%rip), %xmm4 +# define LCASE_MIN_reg %xmm4 + movdqa LABEL(lcase_max)(%rip), %xmm5 +# define LCASE_MAX_reg %xmm5 + movdqa LABEL(case_add)(%rip), %xmm6 +# define CASE_ADD_reg %xmm6 #endif cmp $0x30, %ecx ja 
LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ @@ -200,32 +193,26 @@ LABEL(touppermask): #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L # ifdef USE_AVX # define TOLOWER(reg1, reg2) \ - vpcmpgtb UCLOW_reg, reg1, %xmm7; \ - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ - vpcmpgtb UCLOW_reg, reg2, %xmm9; \ - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ - vpandn %xmm7, %xmm8, %xmm8; \ - vpandn %xmm9, %xmm10, %xmm10; \ - vpand LCQWORD_reg, %xmm8, %xmm8; \ - vpand LCQWORD_reg, %xmm10, %xmm10; \ - vpor reg1, %xmm8, reg1; \ - vpor reg2, %xmm10, reg2 + vpaddb LCASE_MIN_reg, reg1, %xmm7; \ + vpaddb LCASE_MIN_reg, reg2, %xmm8; \ + vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ + vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ + vpandn CASE_ADD_reg, %xmm7, %xmm7; \ + vpandn CASE_ADD_reg, %xmm8, %xmm8; \ + vpaddb %xmm7, reg1, reg1; \ + vpaddb %xmm8, reg2, reg2 # else # define TOLOWER(reg1, reg2) \ - movdqa reg1, %xmm7; \ - movdqa UCHIGH_reg, %xmm8; \ - movdqa reg2, %xmm9; \ - movdqa UCHIGH_reg, %xmm10; \ - pcmpgtb UCLOW_reg, %xmm7; \ - pcmpgtb reg1, %xmm8; \ - pcmpgtb UCLOW_reg, %xmm9; \ - pcmpgtb reg2, %xmm10; \ - pand %xmm8, %xmm7; \ - pand %xmm10, %xmm9; \ - pand LCQWORD_reg, %xmm7; \ - pand LCQWORD_reg, %xmm9; \ - por %xmm7, reg1; \ - por %xmm9, reg2 + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ + paddb reg2, %xmm8; \ + pcmpgtb LCASE_MAX_reg, %xmm7; \ + pcmpgtb LCASE_MAX_reg, %xmm8; \ + pandn CASE_ADD_reg, %xmm7; \ + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 # endif TOLOWER (%xmm1, %xmm2) #else