Message ID | 20150818211826.GA8700@domone |
---|---|
State | New |
Headers | show |
Hi Thanks for checking on that. Comments below: On 18-08-2015 18:18, Ondřej Bílka wrote: > Hi, > > As I told before that benchmarks should be read or they are useless so I > looked on powerpc ones. I noticed that power7 strcmp and strncmp are > about five times slower than memcmp for unaligned case. > > Thats too much so I could easily improve performance by 50% on that case by > implementing strcmp as strnlen+memcmp loop despite overhead of strnlen. > As that loop is due that overhead lot slower than aligned data it should be fixed in > assembly by changing unaligned case to follow pattern in following c > code. > [...] > + > +# include "libc-internal.h" > +int __strcmp_power7b(const char *a, const char *b) > +{ > + size_t len; > + int ret; > + len = __strnlen_power7 (a, 64); > + len = __strnlen_power7 (b, len); > + if (len != 64) > + { > + return __memcmp_power7 (a, b, len + 1); > + } > + ret = __memcmp_power7 (a, b, 64); > + if (ret) > + return ret; > + > + const char *a_old = a; > + a = PTR_ALIGN_DOWN (a + 64, 64); > + b += a - a_old; > + > + while (1) > + { > + len = __strnlen_power7 (b, 64); > + if (len != 64) > + { > + return __memcmp_power7 (a, b, len + 1); > + } > + > + ret = __memcmp_power7 (a, b, 64); > + if (ret) > + return ret; > + a+=64; > + b+=64; > + } > +} > > libc_ifunc (strcmp, > (hwcap2 & PPC_FEATURE2_ARCH_2_07) > Indeed this seems a better strategy, although I am not convinced it will have much gain by aligning the 'a' source. The strnlen do take the source alignment in consideration (aligned and unaligned will take the same path), and memcmp implementation will take the unaligned path anyway (since although 'a' is aligned, 'b' won't be). Using a similar strategy as you did: int __strcmp_power7c (const char *a, const char *b) { if (IS_ALIGN(a, 8) && IS_ALIGN(b, 8)) return __strcmp_power7 (a, b); while (1) { size_t len = __strnlen_power7 (b, 64); if (len != 64) { return __memcmp_power7 (a, b, len + 1); } int ret = __memcmp_power7 (a, b, 64); if (ret) return ret; a+=64; b+=64; } } I see simple_strcmp stupid_strcmp __strcmp_power8 __strcmp_power7 __strcmp_power7b __strcmp_power7c __strcmp_ppc Length 32, alignment 0/ 0: 43.1406 52.7188 5.39062 4.90625 24.3438 9.01562 8.8125 Length 32, alignment 0/ 4: 43.0156 52.7188 5.39062 29.5781 26.75 23.2812 32.6562 Length 32, alignment 4/ 5: 43.3906 52.9062 5.42188 29.8906 27.0312 21.3594 32.7812 Length 64, alignment 0/ 0: 77.5469 82.7969 8.5 8.59375 23.5625 11.125 13.1562 Length 64, alignment 0/ 5: 77.375 186.5 11.2188 47.4844 28.5938 24.0156 56.8906 Length 64, alignment 5/ 6: 77.7031 82.8438 11.9062 47.3125 30.7812 24.7812 56.8594 Length 128, alignment 0/ 0: 141.516 149.859 14.3594 14.4062 55.4531 16.5 25.5781 Length 128, alignment 0/ 6: 142.469 149.781 26.4531 84.6094 62.3438 40.1719 102.594 Length 128, alignment 6/ 7: 142.484 149.594 26.1562 85.9844 66.4219 48.3594 102.984 Length 256, alignment 0/ 0: 268.594 262.5 30.0312 37.0312 93.25 38.4531 43.0469 Length 256, alignment 0/ 7: 270.766 260.016 41.9688 159.188 105.594 86.0312 195.328 Length 256, alignment 7/ 8: 269.781 262.812 43.1719 158.672 106.234 84.9375 194.594 Length 512, alignment 0/ 0: 523.906 513.438 47.0781 53.3906 149.234 57.9062 72.625 Length 512, alignment 0/ 0: 523.359 513.516 47.4531 55.3906 148.781 58.7656 72.8438 Length 512, alignment 0/ 8: 524.734 516.953 49.5156 55.1094 151.344 59.0156 74.6875 Length 512, alignment 8/ 9: 523.797 512.938 74.4688 306.953 172.781 158.578 378.078 Length 1024, alignment 0/ 0: 1031.11 988.484 84.6406 100.531 263.141 104.812 138.109 Length 1024, alignment 0/ 0: 1030.39 988.297 84.3438 99.1719 263.469 103.703 138.031 Length 1024, alignment 0/ 9: 1030.23 990.984 138.531 603.812 327.781 301.75 749.344 Length 1024, alignment 9/10: 1029.92 987.844 138.953 603.641 309.672 332.953 750.312
On Wed, Aug 19, 2015 at 05:07:28PM -0300, Adhemerval Zanella wrote: > Hi > > Thanks for checking on that. Comments below: > > On 18-08-2015 18:18, Ondřej Bílka wrote: > > Hi, > > > > As I told before that benchmarks should be read or they are useless so I > > looked on powerpc ones. I noticed that power7 strcmp and strncmp are > > about five times slower than memcmp for unaligned case. > > > > Thats too much so I could easily improve performance by 50% on that case by > > implementing strcmp as strnlen+memcmp loop despite overhead of strnlen. > > As that loop is due that overhead lot slower than aligned data it should be fixed in > > assembly by changing unaligned case to follow pattern in following c > > code. > > > [...] > > + > > +# include "libc-internal.h" > > +int __strcmp_power7b(const char *a, const char *b) > > +{ > > + size_t len; > > + int ret; > > + len = __strnlen_power7 (a, 64); > > + len = __strnlen_power7 (b, len); > > + if (len != 64) > > + { > > + return __memcmp_power7 (a, b, len + 1); > > + } > > + ret = __memcmp_power7 (a, b, 64); > > + if (ret) > > + return ret; > > + > > + const char *a_old = a; > > + a = PTR_ALIGN_DOWN (a + 64, 64); > > + b += a - a_old; > > + > > + while (1) > > + { > > + len = __strnlen_power7 (b, 64); > > + if (len != 64) > > + { > > + return __memcmp_power7 (a, b, len + 1); > > + } > > + > > + ret = __memcmp_power7 (a, b, 64); > > + if (ret) > > + return ret; > > + a+=64; > > + b+=64; > > + } > > +} > > > > libc_ifunc (strcmp, > > (hwcap2 & PPC_FEATURE2_ARCH_2_07) > > > > Indeed this seems a better strategy, although I am not convinced it will have > much gain by aligning the 'a' source. The strnlen do take the source alignment > in consideration (aligned and unaligned will take the same path), and memcmp > implementation will take the unaligned path anyway (since although 'a' is > aligned, 'b' won't be). > You need that to be able to use memcmp as it could segfault by reading past end which doesn't happen on aligned case. That is unless particular memcmp guarantees it doesn't fault by reading cross-page boundary which several implementations do. > Using a similar strategy as you did: > > int __strcmp_power7c (const char *a, const char *b) > { > if (IS_ALIGN(a, 8) && IS_ALIGN(b, 8)) > return __strcmp_power7 (a, b); > > while (1) > { > size_t len = __strnlen_power7 (b, 64); > if (len != 64) > { > return __memcmp_power7 (a, b, len + 1); > } > > int ret = __memcmp_power7 (a, b, 64); > if (ret) > return ret; > a+=64; > b+=64; > } > } > And as strncmp I was tired when I wrote previous mail so implementation is following, bug was that I forgot to consider checking null in limit. int __strncmp_power7b (char *a, char *b, size_t l) { size_t len; int ret; if (l==0) return 0; l--; len = strnlen (a, l < 64 ? l : 64); len = strnlen (b, len); if (len != 64) { return memcmp (a, b, len + 1); } ret = memcmp (a, b, 64); if (ret) return ret; const char *a_old = a; a = ALIGN_DOWN (a + 64, 64); b += a - a_old; l -= a - a_old; while (1) { len = strnlen (b, l < 64 ? l : 64); if (len != 64) { return memcmp (a, b, len + 1); } ret = memcmp (a, b, 64); if (ret) return ret; a+=64; b+=64; l -= 64; } }
>> Indeed this seems a better strategy, although I am not convinced it will have >> much gain by aligning the 'a' source. The strnlen do take the source alignment >> in consideration (aligned and unaligned will take the same path), and memcmp >> implementation will take the unaligned path anyway (since although 'a' is >> aligned, 'b' won't be). >> > You need that to be able to use memcmp as it could segfault by reading > past end which doesn't happen on aligned case. That is unless > particular memcmp guarantees it doesn't fault by reading cross-page > boundary which several implementations do. POWER7 memcmp won't issue any unaligned load, the 'unaligned' patch will issue aligned loads with some shift logic. The snippet I used did not show any issue in string tests. > > >> Using a similar strategy as you did: >> >> int __strcmp_power7c (const char *a, const char *b) >> { >> if (IS_ALIGN(a, 8) && IS_ALIGN(b, 8)) >> return __strcmp_power7 (a, b); >> >> while (1) >> { >> size_t len = __strnlen_power7 (b, 64); >> if (len != 64) >> { >> return __memcmp_power7 (a, b, len + 1); >> } >> >> int ret = __memcmp_power7 (a, b, 64); >> if (ret) >> return ret; >> a+=64; >> b+=64; >> } >> } >> > > And as strncmp I was tired when I wrote previous mail so implementation > is following, bug was that I forgot to consider checking null in limit. > > int __strncmp_power7b (char *a, char *b, size_t l) > { > size_t len; > int ret; > if (l==0) > return 0; > l--; > len = strnlen (a, l < 64 ? l : 64); > len = strnlen (b, len); > if (len != 64) > { > return memcmp (a, b, len + 1); > } > ret = memcmp (a, b, 64); > if (ret) > return ret; > > const char *a_old = a; > a = ALIGN_DOWN (a + 64, 64); > b += a - a_old; > l -= a - a_old; > while (1) > { > len = strnlen (b, l < 64 ? l : 64); > if (len != 64) > { > return memcmp (a, b, len + 1); > } > > ret = memcmp (a, b, 64); > if (ret) > return ret; > a+=64; > b+=64; > l -= 64; > } > } >
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 364385b..bbf6ee6 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -308,6 +318,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcmp, hwcap & PPC_FEATURE_HAS_VSX, __strcmp_power7) + IFUNC_IMPL_ADD (array, i, strcmp, + hwcap & PPC_FEATURE_HAS_VSX, + __strcmp_power7b) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ppc)) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c index b45ba1f..fd7a1b9 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c @@ -20,10 +20,47 @@ # include <string.h> # include <shlib-compat.h> # include "init-arch.h" - extern __typeof (strcmp) __strcmp_ppc attribute_hidden; extern __typeof (strcmp) __strcmp_power7 attribute_hidden; extern __typeof (strcmp) __strcmp_power8 attribute_hidden; +extern __typeof (strnlen) __strnlen_power7 attribute_hidden; +extern __typeof (memcmp) __memcmp_power7 attribute_hidden; +extern __typeof (strcmp) __strcmp_power7 attribute_hidden; + +# include "libc-internal.h" +int __strcmp_power7b(const char *a, const char *b) +{ + size_t len; + int ret; + len = __strnlen_power7 (a, 64); + len = __strnlen_power7 (b, len); + if (len != 64) + { + return __memcmp_power7 (a, b, len + 1); + } + ret = __memcmp_power7 (a, b, 64); + if (ret) + return ret; + + const char *a_old = a; + a = PTR_ALIGN_DOWN (a + 64, 64); + b += a - a_old; + + while (1) + { + len = __strnlen_power7 (b, 64); + if (len != 64) + { + return __memcmp_power7 (a, b, len + 1); + } + + ret = __memcmp_power7 (a, b, 64); + if (ret) + return ret; + a+=64; + b+=64; + } +} libc_ifunc (strcmp, (hwcap2 & PPC_FEATURE2_ARCH_2_07)