Message ID | 20220518172635.291119-6-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v10,1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked | expand |
andOn Wed, May 18, 2022 at 10:26 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Unroll slightly and enforce good instruction scheduling. This improves > performance on out-of-order machines. The unrolling allows for > pipelined multiplies. > > As well, as an optional sysdep, reorder the operations and prevent > reassosiation for better scheduling and higher ILP. This commit > only adds the barrier for x86, although it should be either no > change or a win for any architecture. > > Unrolling further started to induce slowdowns for sizes [0, 4] > but can help the loop so if larger sizes are the target further > unrolling can be beneficial. > > Results for _dl_new_hash > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz > > Time as Geometric Mean of N=30 runs > Geometric of all benchmark New / Old: 0.674 > type, length, New Time, Old Time, New Time / Old Time > fixed, 0, 2.865, 2.72, 1.053 > fixed, 1, 3.567, 2.489, 1.433 > fixed, 2, 2.577, 3.649, 0.706 > fixed, 3, 3.644, 5.983, 0.609 > fixed, 4, 4.211, 6.833, 0.616 > fixed, 5, 4.741, 9.372, 0.506 > fixed, 6, 5.415, 9.561, 0.566 > fixed, 7, 6.649, 10.789, 0.616 > fixed, 8, 8.081, 11.808, 0.684 > fixed, 9, 8.427, 12.935, 0.651 > fixed, 10, 8.673, 14.134, 0.614 > fixed, 11, 10.69, 15.408, 0.694 > fixed, 12, 10.789, 16.982, 0.635 > fixed, 13, 12.169, 18.411, 0.661 > fixed, 14, 12.659, 19.914, 0.636 > fixed, 15, 13.526, 21.541, 0.628 > fixed, 16, 14.211, 23.088, 0.616 > fixed, 32, 29.412, 52.722, 0.558 > fixed, 64, 65.41, 142.351, 0.459 > fixed, 128, 138.505, 295.625, 0.469 > fixed, 256, 291.707, 601.983, 0.485 > random, 2, 12.698, 12.849, 0.988 > random, 4, 16.065, 15.857, 1.013 > random, 8, 19.564, 21.105, 0.927 > random, 16, 23.919, 26.823, 0.892 > random, 32, 31.987, 39.591, 0.808 > random, 64, 49.282, 71.487, 0.689 > random, 128, 82.23, 145.364, 0.566 > random, 256, 152.209, 298.434, 0.51 > > Co-authored-by: Alexander Monakov <amonakov@ispras.ru> > --- > benchtests/bench-dl-new-hash.c | 3 +- > elf/{dl-new-hash.h => simple-dl-new-hash.h} | 20 ++-- > elf/tst-dl-hash.c | 1 + > sysdeps/generic/dl-new-hash.h | 111 ++++++++++++++++++++ > sysdeps/x86/dl-new-hash.h | 24 +++++ > 5 files changed, 146 insertions(+), 13 deletions(-) > rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%) > create mode 100644 sysdeps/generic/dl-new-hash.h > create mode 100644 sysdeps/x86/dl-new-hash.h > > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c > index 3c8a1d5a82..040fa7ce01 100644 > --- a/benchtests/bench-dl-new-hash.c > +++ b/benchtests/bench-dl-new-hash.c > @@ -16,7 +16,8 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <elf/dl-new-hash.h> > +#include <dl-new-hash.h> > +#include <elf/simple-dl-new-hash.h> > #define TEST_FUNC(x, y) _dl_new_hash (x) > #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x) > > diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h > similarity index 75% > rename from elf/dl-new-hash.h > rename to elf/simple-dl-new-hash.h > index 8641bb4196..1437b1bd36 100644 > --- a/elf/dl-new-hash.h > +++ b/elf/simple-dl-new-hash.h > @@ -1,4 +1,4 @@ > -/* _dl_new_hash for elf symbol lookup > +/* __simple_dl_new_hash for testing true elf symbol lookup. > Copyright (C) 2022 Free Software Foundation, Inc. > This file is part of the GNU C Library. > > @@ -16,16 +16,16 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#ifndef _DL_NEW_HASH_H > -#define _DL_NEW_HASH_H 1 > +#ifndef _SIMPLE_DL_NEW_HASH_H > +#define _SIMPLE_DL_NEW_HASH_H 1 > > #include <stdint.h> > -/* For __always_inline. */ > -#include <sys/cdefs.h> > > -static __always_inline uint32_t > +/* For testing/benchmarking purposes. Real implementation in > + sysdeps/generic/dl-new-hash.h. */ > +static uint32_t > __attribute__ ((unused)) > -_dl_new_hash (const char *s) > +__simple_dl_new_hash (const char *s) > { > uint32_t h = 5381; > for (unsigned char c = *s; c != '\0'; c = *++s) > @@ -33,8 +33,4 @@ _dl_new_hash (const char *s) > return h; > } > > -/* For testing/benchmarking purposes. */ > -#define __simple_dl_new_hash _dl_new_hash > - > - > -#endif /* dl-new-hash.h */ > +#endif /* simple-dl-new-hash.h */ > diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c > index 8697eb73a0..b21766c63d 100644 > --- a/elf/tst-dl-hash.c > +++ b/elf/tst-dl-hash.c > @@ -18,6 +18,7 @@ > > > #include <simple-dl-hash.h> > +#include <simple-dl-new-hash.h> > #include <dl-hash.h> > #include <dl-new-hash.h> > #include <support/support.h> > diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h > new file mode 100644 > index 0000000000..1faf309c97 > --- /dev/null > +++ b/sysdeps/generic/dl-new-hash.h > @@ -0,0 +1,111 @@ > +/* _dl_new_hash for elf symbol lookup > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _DL_NEW_HASH_H > +#define _DL_NEW_HASH_H 1 > + > +#include <stdint.h> > +/* For __always_inline. */ > +#include <sys/cdefs.h> > +/* For __glibc_unlikely. */ > +#include <sys/cdefs.h> > + > +/* The simplest implementation of _dl_new_hash is: > + > + _dl_new_hash (const char *s) > + { > + uint32_t h = 5381; > + for (unsigned char c = *s; c != '\0'; c = *++s) > + h = h * 33 + c; > + return h; > + } > + > + We can get better performance by slightly unrolling the loop to > + pipeline the multiples, which gcc cannot easily do due to > + dependencies across iterations. > + > + As well, as an architecture specific option we add asm statements > + to explicitly specify order of operations and prevent reassociation > + of instructions that lengthens the loop carried dependency. This > + may have no affect as the compiler may have ordered instructions > + the same way without it but in testing this has not been the case > + for GCC. Improving GCC to reliably schedule instructions ideally > + cannot be easily done. > + > + Architecture(s) that use the reassociation barries are: > + x86 > + > + Note it is very unlikely the reassociation barriers would > + de-optimize performance on any architecture and with an imperfect > + compiler it may help performance, especially on out-of-order cpus, > + so it is suggested that the respective maintainers add them. > + > + architecture maintainers are encouraged to benchmark this with > + __asm_reassociation_barrier defined to __asm__ like it is in x86. > +*/ > + > + > +#ifndef __asm_reassociation_barrier > +# define __asm_reassociation_barrier(...) > +#endif > + > +static __always_inline uint32_t > +__attribute__ ((unused)) > +_dl_new_hash (const char *str) > +{ > + const unsigned char *s = (const unsigned char *) str; > + unsigned int h = 5381; > + unsigned int c0, c1; > + for (;;) > + { > + c0 = s[0]; > + /* Since hashed string is normally not empty, this is unlikely on the > + first iteration of the loop. */ > + if (__glibc_unlikely (c0 == 0)) > + return h; > + > + c1 = s[1]; > + if (c1 == 0) > + { > + /* Ideal computational order is: > + c0 += h; > + h *= 32; > + h += c0; */ > + c0 += h; > + __asm_reassociation_barrier("" : "+r"(h) : "r"(c0)); > + h = h * 32 + c0; > + return h; > + } > + > + /* Ideal computational order is: > + c1 += c0; > + h *= 33 * 33; > + c0 *= 32; > + c1 += c0; > + h += c1; */ > + c1 += c0; > + __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0)); > + h *= 33 * 33; > + c1 += c0 * 32; > + __asm_reassociation_barrier("" : "+r"(c1)); > + h += c1; > + s += 2; > + } > +} > + > +#endif /* dl-new-hash.h */ > diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h > new file mode 100644 > index 0000000000..ce8fb5a838 > --- /dev/null > +++ b/sysdeps/x86/dl-new-hash.h > @@ -0,0 +1,24 @@ > +/* _dl_new_hash for elf symbol lookup > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifdef __asm_reassociation_barrier > +# error "__asm_reassociation_barrier should never already be defined." > +#endif > + > +#define __asm_reassociation_barrier __asm__ > +#include <sysdeps/generic/dl-new-hash.h> > -- > 2.34.1 > Should the new _dl_new_hash be placed in sysdeps/x86/dl-new-hash.h and leave the generic one unchanged?
On Wed, May 18, 2022 at 12:32 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > andOn Wed, May 18, 2022 at 10:26 AM Noah Goldstein > <goldstein.w.n@gmail.com> wrote: > > > > Unroll slightly and enforce good instruction scheduling. This improves > > performance on out-of-order machines. The unrolling allows for > > pipelined multiplies. > > > > As well, as an optional sysdep, reorder the operations and prevent > > reassosiation for better scheduling and higher ILP. This commit > > only adds the barrier for x86, although it should be either no > > change or a win for any architecture. > > > > Unrolling further started to induce slowdowns for sizes [0, 4] > > but can help the loop so if larger sizes are the target further > > unrolling can be beneficial. > > > > Results for _dl_new_hash > > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz > > > > Time as Geometric Mean of N=30 runs > > Geometric of all benchmark New / Old: 0.674 > > type, length, New Time, Old Time, New Time / Old Time > > fixed, 0, 2.865, 2.72, 1.053 > > fixed, 1, 3.567, 2.489, 1.433 > > fixed, 2, 2.577, 3.649, 0.706 > > fixed, 3, 3.644, 5.983, 0.609 > > fixed, 4, 4.211, 6.833, 0.616 > > fixed, 5, 4.741, 9.372, 0.506 > > fixed, 6, 5.415, 9.561, 0.566 > > fixed, 7, 6.649, 10.789, 0.616 > > fixed, 8, 8.081, 11.808, 0.684 > > fixed, 9, 8.427, 12.935, 0.651 > > fixed, 10, 8.673, 14.134, 0.614 > > fixed, 11, 10.69, 15.408, 0.694 > > fixed, 12, 10.789, 16.982, 0.635 > > fixed, 13, 12.169, 18.411, 0.661 > > fixed, 14, 12.659, 19.914, 0.636 > > fixed, 15, 13.526, 21.541, 0.628 > > fixed, 16, 14.211, 23.088, 0.616 > > fixed, 32, 29.412, 52.722, 0.558 > > fixed, 64, 65.41, 142.351, 0.459 > > fixed, 128, 138.505, 295.625, 0.469 > > fixed, 256, 291.707, 601.983, 0.485 > > random, 2, 12.698, 12.849, 0.988 > > random, 4, 16.065, 15.857, 1.013 > > random, 8, 19.564, 21.105, 0.927 > > random, 16, 23.919, 26.823, 0.892 > > random, 32, 31.987, 39.591, 0.808 > > random, 64, 49.282, 71.487, 0.689 > > random, 128, 82.23, 145.364, 0.566 > > random, 256, 152.209, 298.434, 0.51 > > > > Co-authored-by: Alexander Monakov <amonakov@ispras.ru> > > --- > > benchtests/bench-dl-new-hash.c | 3 +- > > elf/{dl-new-hash.h => simple-dl-new-hash.h} | 20 ++-- > > elf/tst-dl-hash.c | 1 + > > sysdeps/generic/dl-new-hash.h | 111 ++++++++++++++++++++ > > sysdeps/x86/dl-new-hash.h | 24 +++++ > > 5 files changed, 146 insertions(+), 13 deletions(-) > > rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%) > > create mode 100644 sysdeps/generic/dl-new-hash.h > > create mode 100644 sysdeps/x86/dl-new-hash.h > > > > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c > > index 3c8a1d5a82..040fa7ce01 100644 > > --- a/benchtests/bench-dl-new-hash.c > > +++ b/benchtests/bench-dl-new-hash.c > > @@ -16,7 +16,8 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#include <elf/dl-new-hash.h> > > +#include <dl-new-hash.h> > > +#include <elf/simple-dl-new-hash.h> > > #define TEST_FUNC(x, y) _dl_new_hash (x) > > #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x) > > > > diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h > > similarity index 75% > > rename from elf/dl-new-hash.h > > rename to elf/simple-dl-new-hash.h > > index 8641bb4196..1437b1bd36 100644 > > --- a/elf/dl-new-hash.h > > +++ b/elf/simple-dl-new-hash.h > > @@ -1,4 +1,4 @@ > > -/* _dl_new_hash for elf symbol lookup > > +/* __simple_dl_new_hash for testing true elf symbol lookup. > > Copyright (C) 2022 Free Software Foundation, Inc. > > This file is part of the GNU C Library. > > > > @@ -16,16 +16,16 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#ifndef _DL_NEW_HASH_H > > -#define _DL_NEW_HASH_H 1 > > +#ifndef _SIMPLE_DL_NEW_HASH_H > > +#define _SIMPLE_DL_NEW_HASH_H 1 > > > > #include <stdint.h> > > -/* For __always_inline. */ > > -#include <sys/cdefs.h> > > > > -static __always_inline uint32_t > > +/* For testing/benchmarking purposes. Real implementation in > > + sysdeps/generic/dl-new-hash.h. */ > > +static uint32_t > > __attribute__ ((unused)) > > -_dl_new_hash (const char *s) > > +__simple_dl_new_hash (const char *s) > > { > > uint32_t h = 5381; > > for (unsigned char c = *s; c != '\0'; c = *++s) > > @@ -33,8 +33,4 @@ _dl_new_hash (const char *s) > > return h; > > } > > > > -/* For testing/benchmarking purposes. */ > > -#define __simple_dl_new_hash _dl_new_hash > > - > > - > > -#endif /* dl-new-hash.h */ > > +#endif /* simple-dl-new-hash.h */ > > diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c > > index 8697eb73a0..b21766c63d 100644 > > --- a/elf/tst-dl-hash.c > > +++ b/elf/tst-dl-hash.c > > @@ -18,6 +18,7 @@ > > > > > > #include <simple-dl-hash.h> > > +#include <simple-dl-new-hash.h> > > #include <dl-hash.h> > > #include <dl-new-hash.h> > > #include <support/support.h> > > diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h > > new file mode 100644 > > index 0000000000..1faf309c97 > > --- /dev/null > > +++ b/sysdeps/generic/dl-new-hash.h > > @@ -0,0 +1,111 @@ > > +/* _dl_new_hash for elf symbol lookup > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _DL_NEW_HASH_H > > +#define _DL_NEW_HASH_H 1 > > + > > +#include <stdint.h> > > +/* For __always_inline. */ > > +#include <sys/cdefs.h> > > +/* For __glibc_unlikely. */ > > +#include <sys/cdefs.h> > > + > > +/* The simplest implementation of _dl_new_hash is: > > + > > + _dl_new_hash (const char *s) > > + { > > + uint32_t h = 5381; > > + for (unsigned char c = *s; c != '\0'; c = *++s) > > + h = h * 33 + c; > > + return h; > > + } > > + > > + We can get better performance by slightly unrolling the loop to > > + pipeline the multiples, which gcc cannot easily do due to > > + dependencies across iterations. > > + > > + As well, as an architecture specific option we add asm statements > > + to explicitly specify order of operations and prevent reassociation > > + of instructions that lengthens the loop carried dependency. This > > + may have no affect as the compiler may have ordered instructions > > + the same way without it but in testing this has not been the case > > + for GCC. Improving GCC to reliably schedule instructions ideally > > + cannot be easily done. > > + > > + Architecture(s) that use the reassociation barries are: > > + x86 > > + > > + Note it is very unlikely the reassociation barriers would > > + de-optimize performance on any architecture and with an imperfect > > + compiler it may help performance, especially on out-of-order cpus, > > + so it is suggested that the respective maintainers add them. > > + > > + architecture maintainers are encouraged to benchmark this with > > + __asm_reassociation_barrier defined to __asm__ like it is in x86. > > +*/ > > + > > + > > +#ifndef __asm_reassociation_barrier > > +# define __asm_reassociation_barrier(...) > > +#endif > > + > > +static __always_inline uint32_t > > +__attribute__ ((unused)) > > +_dl_new_hash (const char *str) > > +{ > > + const unsigned char *s = (const unsigned char *) str; > > + unsigned int h = 5381; > > + unsigned int c0, c1; > > + for (;;) > > + { > > + c0 = s[0]; > > + /* Since hashed string is normally not empty, this is unlikely on the > > + first iteration of the loop. */ > > + if (__glibc_unlikely (c0 == 0)) > > + return h; > > + > > + c1 = s[1]; > > + if (c1 == 0) > > + { > > + /* Ideal computational order is: > > + c0 += h; > > + h *= 32; > > + h += c0; */ > > + c0 += h; > > + __asm_reassociation_barrier("" : "+r"(h) : "r"(c0)); > > + h = h * 32 + c0; > > + return h; > > + } > > + > > + /* Ideal computational order is: > > + c1 += c0; > > + h *= 33 * 33; > > + c0 *= 32; > > + c1 += c0; > > + h += c1; */ > > + c1 += c0; > > + __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0)); > > + h *= 33 * 33; > > + c1 += c0 * 32; > > + __asm_reassociation_barrier("" : "+r"(c1)); > > + h += c1; > > + s += 2; > > + } > > +} > > + > > +#endif /* dl-new-hash.h */ > > diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h > > new file mode 100644 > > index 0000000000..ce8fb5a838 > > --- /dev/null > > +++ b/sysdeps/x86/dl-new-hash.h > > @@ -0,0 +1,24 @@ > > +/* _dl_new_hash for elf symbol lookup > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifdef __asm_reassociation_barrier > > +# error "__asm_reassociation_barrier should never already be defined." > > +#endif > > + > > +#define __asm_reassociation_barrier __asm__ > > +#include <sysdeps/generic/dl-new-hash.h> > > -- > > 2.34.1 > > > > Should the new _dl_new_hash be placed in sysdeps/x86/dl-new-hash.h > and leave the generic one unchanged? I think the expectation is the generic code is going to be a win across the board. The sysdep aspect of the perf were the asm barriers. But again, I've only benchmarks on x86. > > -- > H.J.
On 18/05/2022 23:02, H.J. Lu via Libc-alpha wrote: > andOn Wed, May 18, 2022 at 10:26 AM Noah Goldstein > <goldstein.w.n@gmail.com> wrote: >> >> Unroll slightly and enforce good instruction scheduling. This improves >> performance on out-of-order machines. The unrolling allows for >> pipelined multiplies. >> >> As well, as an optional sysdep, reorder the operations and prevent >> reassosiation for better scheduling and higher ILP. This commit >> only adds the barrier for x86, although it should be either no >> change or a win for any architecture. >> >> Unrolling further started to induce slowdowns for sizes [0, 4] >> but can help the loop so if larger sizes are the target further >> unrolling can be beneficial. >> >> Results for _dl_new_hash >> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz >> >> Time as Geometric Mean of N=30 runs >> Geometric of all benchmark New / Old: 0.674 >> type, length, New Time, Old Time, New Time / Old Time >> fixed, 0, 2.865, 2.72, 1.053 >> fixed, 1, 3.567, 2.489, 1.433 >> fixed, 2, 2.577, 3.649, 0.706 >> fixed, 3, 3.644, 5.983, 0.609 >> fixed, 4, 4.211, 6.833, 0.616 >> fixed, 5, 4.741, 9.372, 0.506 >> fixed, 6, 5.415, 9.561, 0.566 >> fixed, 7, 6.649, 10.789, 0.616 >> fixed, 8, 8.081, 11.808, 0.684 >> fixed, 9, 8.427, 12.935, 0.651 >> fixed, 10, 8.673, 14.134, 0.614 >> fixed, 11, 10.69, 15.408, 0.694 >> fixed, 12, 10.789, 16.982, 0.635 >> fixed, 13, 12.169, 18.411, 0.661 >> fixed, 14, 12.659, 19.914, 0.636 >> fixed, 15, 13.526, 21.541, 0.628 >> fixed, 16, 14.211, 23.088, 0.616 >> fixed, 32, 29.412, 52.722, 0.558 >> fixed, 64, 65.41, 142.351, 0.459 >> fixed, 128, 138.505, 295.625, 0.469 >> fixed, 256, 291.707, 601.983, 0.485 >> random, 2, 12.698, 12.849, 0.988 >> random, 4, 16.065, 15.857, 1.013 >> random, 8, 19.564, 21.105, 0.927 >> random, 16, 23.919, 26.823, 0.892 >> random, 32, 31.987, 39.591, 0.808 >> random, 64, 49.282, 71.487, 0.689 >> random, 128, 82.23, 145.364, 0.566 >> random, 256, 152.209, 298.434, 0.51 >> >> Co-authored-by: Alexander Monakov <amonakov@ispras.ru> >> --- >> benchtests/bench-dl-new-hash.c | 3 +- >> elf/{dl-new-hash.h => simple-dl-new-hash.h} | 20 ++-- >> elf/tst-dl-hash.c | 1 + >> sysdeps/generic/dl-new-hash.h | 111 ++++++++++++++++++++ >> sysdeps/x86/dl-new-hash.h | 24 +++++ >> 5 files changed, 146 insertions(+), 13 deletions(-) >> rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%) >> create mode 100644 sysdeps/generic/dl-new-hash.h >> create mode 100644 sysdeps/x86/dl-new-hash.h >> >> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c >> index 3c8a1d5a82..040fa7ce01 100644 >> --- a/benchtests/bench-dl-new-hash.c >> +++ b/benchtests/bench-dl-new-hash.c >> @@ -16,7 +16,8 @@ >> License along with the GNU C Library; if not, see >> <https://www.gnu.org/licenses/>. */ >> >> -#include <elf/dl-new-hash.h> >> +#include <dl-new-hash.h> >> +#include <elf/simple-dl-new-hash.h> >> #define TEST_FUNC(x, y) _dl_new_hash (x) >> #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x) >> >> diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h >> similarity index 75% >> rename from elf/dl-new-hash.h >> rename to elf/simple-dl-new-hash.h >> index 8641bb4196..1437b1bd36 100644 >> --- a/elf/dl-new-hash.h >> +++ b/elf/simple-dl-new-hash.h >> @@ -1,4 +1,4 @@ >> -/* _dl_new_hash for elf symbol lookup >> +/* __simple_dl_new_hash for testing true elf symbol lookup. >> Copyright (C) 2022 Free Software Foundation, Inc. >> This file is part of the GNU C Library. >> >> @@ -16,16 +16,16 @@ >> License along with the GNU C Library; if not, see >> <https://www.gnu.org/licenses/>. */ >> >> -#ifndef _DL_NEW_HASH_H >> -#define _DL_NEW_HASH_H 1 >> +#ifndef _SIMPLE_DL_NEW_HASH_H >> +#define _SIMPLE_DL_NEW_HASH_H 1 >> >> #include <stdint.h> >> -/* For __always_inline. */ >> -#include <sys/cdefs.h> >> >> -static __always_inline uint32_t >> +/* For testing/benchmarking purposes. Real implementation in >> + sysdeps/generic/dl-new-hash.h. */ >> +static uint32_t >> __attribute__ ((unused)) >> -_dl_new_hash (const char *s) >> +__simple_dl_new_hash (const char *s) >> { >> uint32_t h = 5381; >> for (unsigned char c = *s; c != '\0'; c = *++s) >> @@ -33,8 +33,4 @@ _dl_new_hash (const char *s) >> return h; >> } >> >> -/* For testing/benchmarking purposes. */ >> -#define __simple_dl_new_hash _dl_new_hash >> - >> - >> -#endif /* dl-new-hash.h */ >> +#endif /* simple-dl-new-hash.h */ >> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c >> index 8697eb73a0..b21766c63d 100644 >> --- a/elf/tst-dl-hash.c >> +++ b/elf/tst-dl-hash.c >> @@ -18,6 +18,7 @@ >> >> >> #include <simple-dl-hash.h> >> +#include <simple-dl-new-hash.h> >> #include <dl-hash.h> >> #include <dl-new-hash.h> >> #include <support/support.h> >> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h >> new file mode 100644 >> index 0000000000..1faf309c97 >> --- /dev/null >> +++ b/sysdeps/generic/dl-new-hash.h >> @@ -0,0 +1,111 @@ >> +/* _dl_new_hash for elf symbol lookup >> + Copyright (C) 2022 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <https://www.gnu.org/licenses/>. */ >> + >> +#ifndef _DL_NEW_HASH_H >> +#define _DL_NEW_HASH_H 1 >> + >> +#include <stdint.h> >> +/* For __always_inline. */ >> +#include <sys/cdefs.h> >> +/* For __glibc_unlikely. */ >> +#include <sys/cdefs.h> >> + >> +/* The simplest implementation of _dl_new_hash is: >> + >> + _dl_new_hash (const char *s) >> + { >> + uint32_t h = 5381; >> + for (unsigned char c = *s; c != '\0'; c = *++s) >> + h = h * 33 + c; >> + return h; >> + } >> + >> + We can get better performance by slightly unrolling the loop to >> + pipeline the multiples, which gcc cannot easily do due to >> + dependencies across iterations. >> + >> + As well, as an architecture specific option we add asm statements >> + to explicitly specify order of operations and prevent reassociation >> + of instructions that lengthens the loop carried dependency. This >> + may have no affect as the compiler may have ordered instructions >> + the same way without it but in testing this has not been the case >> + for GCC. Improving GCC to reliably schedule instructions ideally >> + cannot be easily done. >> + >> + Architecture(s) that use the reassociation barries are: >> + x86 >> + >> + Note it is very unlikely the reassociation barriers would >> + de-optimize performance on any architecture and with an imperfect >> + compiler it may help performance, especially on out-of-order cpus, >> + so it is suggested that the respective maintainers add them. >> + >> + architecture maintainers are encouraged to benchmark this with >> + __asm_reassociation_barrier defined to __asm__ like it is in x86. >> +*/ >> + >> + >> +#ifndef __asm_reassociation_barrier >> +# define __asm_reassociation_barrier(...) >> +#endif >> + >> +static __always_inline uint32_t >> +__attribute__ ((unused)) >> +_dl_new_hash (const char *str) >> +{ >> + const unsigned char *s = (const unsigned char *) str; >> + unsigned int h = 5381; >> + unsigned int c0, c1; >> + for (;;) >> + { >> + c0 = s[0]; >> + /* Since hashed string is normally not empty, this is unlikely on the >> + first iteration of the loop. */ >> + if (__glibc_unlikely (c0 == 0)) >> + return h; >> + >> + c1 = s[1]; >> + if (c1 == 0) >> + { >> + /* Ideal computational order is: >> + c0 += h; >> + h *= 32; >> + h += c0; */ >> + c0 += h; >> + __asm_reassociation_barrier("" : "+r"(h) : "r"(c0)); >> + h = h * 32 + c0; >> + return h; >> + } >> + >> + /* Ideal computational order is: >> + c1 += c0; >> + h *= 33 * 33; >> + c0 *= 32; >> + c1 += c0; >> + h += c1; */ >> + c1 += c0; >> + __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0)); >> + h *= 33 * 33; >> + c1 += c0 * 32; >> + __asm_reassociation_barrier("" : "+r"(c1)); >> + h += c1; >> + s += 2; >> + } >> +} >> + >> +#endif /* dl-new-hash.h */ >> diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h >> new file mode 100644 >> index 0000000000..ce8fb5a838 >> --- /dev/null >> +++ b/sysdeps/x86/dl-new-hash.h >> @@ -0,0 +1,24 @@ >> +/* _dl_new_hash for elf symbol lookup >> + Copyright (C) 2022 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <https://www.gnu.org/licenses/>. */ >> + >> +#ifdef __asm_reassociation_barrier >> +# error "__asm_reassociation_barrier should never already be defined." >> +#endif >> + >> +#define __asm_reassociation_barrier __asm__ >> +#include <sysdeps/generic/dl-new-hash.h> >> -- >> 2.34.1 >> > > Should the new _dl_new_hash be placed in sysdeps/x86/dl-new-hash.h > and leave the generic one unchanged? > There are 3 implementations: the reference one in elf/dl-new-hash.h that's retained for verification, the optimized one in sysdeps-generic/dl-new-hash.h that is suitable for all architectures and the micro-optimized one with optimized schedule for x86, giving it that little bit more. Siddhesh
On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote: > Unroll slightly and enforce good instruction scheduling. This improves > performance on out-of-order machines. The unrolling allows for > pipelined multiplies. > > As well, as an optional sysdep, reorder the operations and prevent > reassosiation for better scheduling and higher ILP. This commit > only adds the barrier for x86, although it should be either no > change or a win for any architecture. > > Unrolling further started to induce slowdowns for sizes [0, 4] > but can help the loop so if larger sizes are the target further > unrolling can be beneficial. > > Results for _dl_new_hash > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz > > Time as Geometric Mean of N=30 runs > Geometric of all benchmark New / Old: 0.674 > type, length, New Time, Old Time, New Time / Old Time > fixed, 0, 2.865, 2.72, 1.053 > fixed, 1, 3.567, 2.489, 1.433 > fixed, 2, 2.577, 3.649, 0.706 > fixed, 3, 3.644, 5.983, 0.609 > fixed, 4, 4.211, 6.833, 0.616 > fixed, 5, 4.741, 9.372, 0.506 > fixed, 6, 5.415, 9.561, 0.566 > fixed, 7, 6.649, 10.789, 0.616 > fixed, 8, 8.081, 11.808, 0.684 > fixed, 9, 8.427, 12.935, 0.651 > fixed, 10, 8.673, 14.134, 0.614 > fixed, 11, 10.69, 15.408, 0.694 > fixed, 12, 10.789, 16.982, 0.635 > fixed, 13, 12.169, 18.411, 0.661 > fixed, 14, 12.659, 19.914, 0.636 > fixed, 15, 13.526, 21.541, 0.628 > fixed, 16, 14.211, 23.088, 0.616 > fixed, 32, 29.412, 52.722, 0.558 > fixed, 64, 65.41, 142.351, 0.459 > fixed, 128, 138.505, 295.625, 0.469 > fixed, 256, 291.707, 601.983, 0.485 > random, 2, 12.698, 12.849, 0.988 > random, 4, 16.065, 15.857, 1.013 > random, 8, 19.564, 21.105, 0.927 > random, 16, 23.919, 26.823, 0.892 > random, 32, 31.987, 39.591, 0.808 > random, 64, 49.282, 71.487, 0.689 > random, 128, 82.23, 145.364, 0.566 > random, 256, 152.209, 298.434, 0.51 > > Co-authored-by: Alexander Monakov <amonakov@ispras.ru> > --- > benchtests/bench-dl-new-hash.c | 3 +- > elf/{dl-new-hash.h => simple-dl-new-hash.h} | 20 ++-- > elf/tst-dl-hash.c | 1 + > sysdeps/generic/dl-new-hash.h | 111 ++++++++++++++++++++ > sysdeps/x86/dl-new-hash.h | 24 +++++ > 5 files changed, 146 insertions(+), 13 deletions(-) > rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%) > create mode 100644 sysdeps/generic/dl-new-hash.h > create mode 100644 sysdeps/x86/dl-new-hash.h Mostly OK, just minor nits to fix below. > > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c > index 3c8a1d5a82..040fa7ce01 100644 > --- a/benchtests/bench-dl-new-hash.c > +++ b/benchtests/bench-dl-new-hash.c > @@ -16,7 +16,8 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <elf/dl-new-hash.h> > +#include <dl-new-hash.h> > +#include <elf/simple-dl-new-hash.h> > #define TEST_FUNC(x, y) _dl_new_hash (x) > #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x) OK. > > diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h > similarity index 75% > rename from elf/dl-new-hash.h > rename to elf/simple-dl-new-hash.h > index 8641bb4196..1437b1bd36 100644 > --- a/elf/dl-new-hash.h > +++ b/elf/simple-dl-new-hash.h > @@ -1,4 +1,4 @@ > -/* _dl_new_hash for elf symbol lookup > +/* __simple_dl_new_hash for testing true elf symbol lookup. > Copyright (C) 2022 Free Software Foundation, Inc. > This file is part of the GNU C Library. > > @@ -16,16 +16,16 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#ifndef _DL_NEW_HASH_H > -#define _DL_NEW_HASH_H 1 > +#ifndef _SIMPLE_DL_NEW_HASH_H > +#define _SIMPLE_DL_NEW_HASH_H 1 > > #include <stdint.h> > -/* For __always_inline. */ > -#include <sys/cdefs.h> > > -static __always_inline uint32_t > +/* For testing/benchmarking purposes. Real implementation in > + sysdeps/generic/dl-new-hash.h. */ > +static uint32_t > __attribute__ ((unused)) > -_dl_new_hash (const char *s) > +__simple_dl_new_hash (const char *s) > { > uint32_t h = 5381; > for (unsigned char c = *s; c != '\0'; c = *++s) > @@ -33,8 +33,4 @@ _dl_new_hash (const char *s) > return h; > } > > -/* For testing/benchmarking purposes. */ > -#define __simple_dl_new_hash _dl_new_hash > - > - > -#endif /* dl-new-hash.h */ > +#endif /* simple-dl-new-hash.h */ > diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c > index 8697eb73a0..b21766c63d 100644 > --- a/elf/tst-dl-hash.c > +++ b/elf/tst-dl-hash.c > @@ -18,6 +18,7 @@ > > > #include <simple-dl-hash.h> > +#include <simple-dl-new-hash.h> > #include <dl-hash.h> > #include <dl-new-hash.h> > #include <support/support.h> > diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h > new file mode 100644 > index 0000000000..1faf309c97 > --- /dev/null > +++ b/sysdeps/generic/dl-new-hash.h > @@ -0,0 +1,111 @@ > +/* _dl_new_hash for elf symbol lookup > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _DL_NEW_HASH_H > +#define _DL_NEW_HASH_H 1 > + > +#include <stdint.h> > +/* For __always_inline. */ > +#include <sys/cdefs.h> > +/* For __glibc_unlikely. */ > +#include <sys/cdefs.h> Duplicate, but you already know this. > + > +/* The simplest implementation of _dl_new_hash is: > + > + _dl_new_hash (const char *s) > + { > + uint32_t h = 5381; > + for (unsigned char c = *s; c != '\0'; c = *++s) > + h = h * 33 + c; > + return h; > + } > + > + We can get better performance by slightly unrolling the loop to > + pipeline the multiples, which gcc cannot easily do due to > + dependencies across iterations. > + > + As well, as an architecture specific option we add asm statements > + to explicitly specify order of operations and prevent reassociation > + of instructions that lengthens the loop carried dependency. This > + may have no affect as the compiler may have ordered instructions > + the same way without it but in testing this has not been the case > + for GCC. Improving GCC to reliably schedule instructions ideally > + cannot be easily done. > + > + Architecture(s) that use the reassociation barries are: barriers > + x86 > + > + Note it is very unlikely the reassociation barriers would > + de-optimize performance on any architecture and with an imperfect > + compiler it may help performance, especially on out-of-order cpus, > + so it is suggested that the respective maintainers add them. > + > + architecture maintainers are encouraged to benchmark this with Architecture > + __asm_reassociation_barrier defined to __asm__ like it is in x86. > +*/ > + > + > +#ifndef __asm_reassociation_barrier > +# define __asm_reassociation_barrier(...) > +#endif > + > +static __always_inline uint32_t > +__attribute__ ((unused)) > +_dl_new_hash (const char *str) > +{ > + const unsigned char *s = (const unsigned char *) str; > + unsigned int h = 5381; > + unsigned int c0, c1; > + for (;;) > + { > + c0 = s[0]; > + /* Since hashed string is normally not empty, this is unlikely on the > + first iteration of the loop. */ > + if (__glibc_unlikely (c0 == 0)) > + return h; > + > + c1 = s[1]; > + if (c1 == 0) > + { > + /* Ideal computational order is: > + c0 += h; > + h *= 32; > + h += c0; */ > + c0 += h; > + __asm_reassociation_barrier("" : "+r"(h) : "r"(c0)); > + h = h * 32 + c0; > + return h; > + } > + > + /* Ideal computational order is: > + c1 += c0; > + h *= 33 * 33; > + c0 *= 32; > + c1 += c0; > + h += c1; */ > + c1 += c0; > + __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0)); > + h *= 33 * 33; > + c1 += c0 * 32; > + __asm_reassociation_barrier("" : "+r"(c1)); > + h += c1; > + s += 2; > + } > +} > + OK. > +#endif /* dl-new-hash.h */ > diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h > new file mode 100644 > index 0000000000..ce8fb5a838 > --- /dev/null > +++ b/sysdeps/x86/dl-new-hash.h > @@ -0,0 +1,24 @@ > +/* _dl_new_hash for elf symbol lookup > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifdef __asm_reassociation_barrier > +# error "__asm_reassociation_barrier should never already be defined." > +#endif > + > +#define __asm_reassociation_barrier __asm__ > +#include <sysdeps/generic/dl-new-hash.h> OK.
On Thu, May 19, 2022 at 10:55 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote: > > On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote: > > Unroll slightly and enforce good instruction scheduling. This improves > > performance on out-of-order machines. The unrolling allows for > > pipelined multiplies. > > > > As well, as an optional sysdep, reorder the operations and prevent > > reassosiation for better scheduling and higher ILP. This commit > > only adds the barrier for x86, although it should be either no > > change or a win for any architecture. > > > > Unrolling further started to induce slowdowns for sizes [0, 4] > > but can help the loop so if larger sizes are the target further > > unrolling can be beneficial. > > > > Results for _dl_new_hash > > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz > > > > Time as Geometric Mean of N=30 runs > > Geometric of all benchmark New / Old: 0.674 > > type, length, New Time, Old Time, New Time / Old Time > > fixed, 0, 2.865, 2.72, 1.053 > > fixed, 1, 3.567, 2.489, 1.433 > > fixed, 2, 2.577, 3.649, 0.706 > > fixed, 3, 3.644, 5.983, 0.609 > > fixed, 4, 4.211, 6.833, 0.616 > > fixed, 5, 4.741, 9.372, 0.506 > > fixed, 6, 5.415, 9.561, 0.566 > > fixed, 7, 6.649, 10.789, 0.616 > > fixed, 8, 8.081, 11.808, 0.684 > > fixed, 9, 8.427, 12.935, 0.651 > > fixed, 10, 8.673, 14.134, 0.614 > > fixed, 11, 10.69, 15.408, 0.694 > > fixed, 12, 10.789, 16.982, 0.635 > > fixed, 13, 12.169, 18.411, 0.661 > > fixed, 14, 12.659, 19.914, 0.636 > > fixed, 15, 13.526, 21.541, 0.628 > > fixed, 16, 14.211, 23.088, 0.616 > > fixed, 32, 29.412, 52.722, 0.558 > > fixed, 64, 65.41, 142.351, 0.459 > > fixed, 128, 138.505, 295.625, 0.469 > > fixed, 256, 291.707, 601.983, 0.485 > > random, 2, 12.698, 12.849, 0.988 > > random, 4, 16.065, 15.857, 1.013 > > random, 8, 19.564, 21.105, 0.927 > > random, 16, 23.919, 26.823, 0.892 > > random, 32, 31.987, 39.591, 0.808 > > random, 64, 49.282, 71.487, 0.689 > > random, 128, 82.23, 145.364, 0.566 > > random, 256, 152.209, 298.434, 0.51 > > > > Co-authored-by: Alexander Monakov <amonakov@ispras.ru> > > --- > > benchtests/bench-dl-new-hash.c | 3 +- > > elf/{dl-new-hash.h => simple-dl-new-hash.h} | 20 ++-- > > elf/tst-dl-hash.c | 1 + > > sysdeps/generic/dl-new-hash.h | 111 ++++++++++++++++++++ > > sysdeps/x86/dl-new-hash.h | 24 +++++ > > 5 files changed, 146 insertions(+), 13 deletions(-) > > rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%) > > create mode 100644 sysdeps/generic/dl-new-hash.h > > create mode 100644 sysdeps/x86/dl-new-hash.h > > Mostly OK, just minor nits to fix below. > > > > > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c > > index 3c8a1d5a82..040fa7ce01 100644 > > --- a/benchtests/bench-dl-new-hash.c > > +++ b/benchtests/bench-dl-new-hash.c > > @@ -16,7 +16,8 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#include <elf/dl-new-hash.h> > > +#include <dl-new-hash.h> > > +#include <elf/simple-dl-new-hash.h> > > #define TEST_FUNC(x, y) _dl_new_hash (x) > > #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x) > > OK. > > > > > diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h > > similarity index 75% > > rename from elf/dl-new-hash.h > > rename to elf/simple-dl-new-hash.h > > index 8641bb4196..1437b1bd36 100644 > > --- a/elf/dl-new-hash.h > > +++ b/elf/simple-dl-new-hash.h > > @@ -1,4 +1,4 @@ > > -/* _dl_new_hash for elf symbol lookup > > +/* __simple_dl_new_hash for testing true elf symbol lookup. > > Copyright (C) 2022 Free Software Foundation, Inc. > > This file is part of the GNU C Library. > > > > @@ -16,16 +16,16 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#ifndef _DL_NEW_HASH_H > > -#define _DL_NEW_HASH_H 1 > > +#ifndef _SIMPLE_DL_NEW_HASH_H > > +#define _SIMPLE_DL_NEW_HASH_H 1 > > > > #include <stdint.h> > > -/* For __always_inline. */ > > -#include <sys/cdefs.h> > > > > -static __always_inline uint32_t > > +/* For testing/benchmarking purposes. Real implementation in > > + sysdeps/generic/dl-new-hash.h. */ > > +static uint32_t > > __attribute__ ((unused)) > > -_dl_new_hash (const char *s) > > +__simple_dl_new_hash (const char *s) > > { > > uint32_t h = 5381; > > for (unsigned char c = *s; c != '\0'; c = *++s) > > @@ -33,8 +33,4 @@ _dl_new_hash (const char *s) > > return h; > > } > > > > -/* For testing/benchmarking purposes. */ > > -#define __simple_dl_new_hash _dl_new_hash > > - > > - > > -#endif /* dl-new-hash.h */ > > +#endif /* simple-dl-new-hash.h */ > > diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c > > index 8697eb73a0..b21766c63d 100644 > > --- a/elf/tst-dl-hash.c > > +++ b/elf/tst-dl-hash.c > > @@ -18,6 +18,7 @@ > > > > > > #include <simple-dl-hash.h> > > +#include <simple-dl-new-hash.h> > > #include <dl-hash.h> > > #include <dl-new-hash.h> > > #include <support/support.h> > > diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h > > new file mode 100644 > > index 0000000000..1faf309c97 > > --- /dev/null > > +++ b/sysdeps/generic/dl-new-hash.h > > @@ -0,0 +1,111 @@ > > +/* _dl_new_hash for elf symbol lookup > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef _DL_NEW_HASH_H > > +#define _DL_NEW_HASH_H 1 > > + > > +#include <stdint.h> > > +/* For __always_inline. */ > > +#include <sys/cdefs.h> > > +/* For __glibc_unlikely. */ > > +#include <sys/cdefs.h> > > Duplicate, but you already know this. > > > + > > +/* The simplest implementation of _dl_new_hash is: > > + > > + _dl_new_hash (const char *s) > > + { > > + uint32_t h = 5381; > > + for (unsigned char c = *s; c != '\0'; c = *++s) > > + h = h * 33 + c; > > + return h; > > + } > > + > > + We can get better performance by slightly unrolling the loop to > > + pipeline the multiples, which gcc cannot easily do due to > > + dependencies across iterations. > > + > > + As well, as an architecture specific option we add asm statements > > + to explicitly specify order of operations and prevent reassociation > > + of instructions that lengthens the loop carried dependency. This > > + may have no affect as the compiler may have ordered instructions > > + the same way without it but in testing this has not been the case > > + for GCC. Improving GCC to reliably schedule instructions ideally > > + cannot be easily done. > > + > > + Architecture(s) that use the reassociation barries are: > > barriers Fixed in V11. > > > + x86 > > + > > + Note it is very unlikely the reassociation barriers would > > + de-optimize performance on any architecture and with an imperfect > > + compiler it may help performance, especially on out-of-order cpus, > > + so it is suggested that the respective maintainers add them. > > + > > + architecture maintainers are encouraged to benchmark this with > > Architecture Fixed in V11. > > > + __asm_reassociation_barrier defined to __asm__ like it is in x86. > > +*/ > > + > > + > > +#ifndef __asm_reassociation_barrier > > +# define __asm_reassociation_barrier(...) > > +#endif > > + > > +static __always_inline uint32_t > > +__attribute__ ((unused)) > > +_dl_new_hash (const char *str) > > +{ > > + const unsigned char *s = (const unsigned char *) str; > > + unsigned int h = 5381; > > + unsigned int c0, c1; > > + for (;;) > > + { > > + c0 = s[0]; > > + /* Since hashed string is normally not empty, this is unlikely on the > > + first iteration of the loop. */ > > + if (__glibc_unlikely (c0 == 0)) > > + return h; > > + > > + c1 = s[1]; > > + if (c1 == 0) > > + { > > + /* Ideal computational order is: > > + c0 += h; > > + h *= 32; > > + h += c0; */ > > + c0 += h; > > + __asm_reassociation_barrier("" : "+r"(h) : "r"(c0)); > > + h = h * 32 + c0; > > + return h; > > + } > > + > > + /* Ideal computational order is: > > + c1 += c0; > > + h *= 33 * 33; > > + c0 *= 32; > > + c1 += c0; > > + h += c1; */ > > + c1 += c0; > > + __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0)); > > + h *= 33 * 33; > > + c1 += c0 * 32; > > + __asm_reassociation_barrier("" : "+r"(c1)); > > + h += c1; > > + s += 2; > > + } > > +} > > + > > OK. > > > +#endif /* dl-new-hash.h */ > > diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h > > new file mode 100644 > > index 0000000000..ce8fb5a838 > > --- /dev/null > > +++ b/sysdeps/x86/dl-new-hash.h > > @@ -0,0 +1,24 @@ > > +/* _dl_new_hash for elf symbol lookup > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifdef __asm_reassociation_barrier > > +# error "__asm_reassociation_barrier should never already be defined." > > +#endif > > + > > +#define __asm_reassociation_barrier __asm__ > > +#include <sysdeps/generic/dl-new-hash.h> > > OK.
diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c index 3c8a1d5a82..040fa7ce01 100644 --- a/benchtests/bench-dl-new-hash.c +++ b/benchtests/bench-dl-new-hash.c @@ -16,7 +16,8 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include <elf/dl-new-hash.h> +#include <dl-new-hash.h> +#include <elf/simple-dl-new-hash.h> #define TEST_FUNC(x, y) _dl_new_hash (x) #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x) diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h similarity index 75% rename from elf/dl-new-hash.h rename to elf/simple-dl-new-hash.h index 8641bb4196..1437b1bd36 100644 --- a/elf/dl-new-hash.h +++ b/elf/simple-dl-new-hash.h @@ -1,4 +1,4 @@ -/* _dl_new_hash for elf symbol lookup +/* __simple_dl_new_hash for testing true elf symbol lookup. Copyright (C) 2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,16 +16,16 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#ifndef _DL_NEW_HASH_H -#define _DL_NEW_HASH_H 1 +#ifndef _SIMPLE_DL_NEW_HASH_H +#define _SIMPLE_DL_NEW_HASH_H 1 #include <stdint.h> -/* For __always_inline. */ -#include <sys/cdefs.h> -static __always_inline uint32_t +/* For testing/benchmarking purposes. Real implementation in + sysdeps/generic/dl-new-hash.h. */ +static uint32_t __attribute__ ((unused)) -_dl_new_hash (const char *s) +__simple_dl_new_hash (const char *s) { uint32_t h = 5381; for (unsigned char c = *s; c != '\0'; c = *++s) @@ -33,8 +33,4 @@ _dl_new_hash (const char *s) return h; } -/* For testing/benchmarking purposes. */ -#define __simple_dl_new_hash _dl_new_hash - - -#endif /* dl-new-hash.h */ +#endif /* simple-dl-new-hash.h */ diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c index 8697eb73a0..b21766c63d 100644 --- a/elf/tst-dl-hash.c +++ b/elf/tst-dl-hash.c @@ -18,6 +18,7 @@ #include <simple-dl-hash.h> +#include <simple-dl-new-hash.h> #include <dl-hash.h> #include <dl-new-hash.h> #include <support/support.h> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h new file mode 100644 index 0000000000..1faf309c97 --- /dev/null +++ b/sysdeps/generic/dl-new-hash.h @@ -0,0 +1,111 @@ +/* _dl_new_hash for elf symbol lookup + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _DL_NEW_HASH_H +#define _DL_NEW_HASH_H 1 + +#include <stdint.h> +/* For __always_inline. */ +#include <sys/cdefs.h> +/* For __glibc_unlikely. */ +#include <sys/cdefs.h> + +/* The simplest implementation of _dl_new_hash is: + + _dl_new_hash (const char *s) + { + uint32_t h = 5381; + for (unsigned char c = *s; c != '\0'; c = *++s) + h = h * 33 + c; + return h; + } + + We can get better performance by slightly unrolling the loop to + pipeline the multiples, which gcc cannot easily do due to + dependencies across iterations. + + As well, as an architecture specific option we add asm statements + to explicitly specify order of operations and prevent reassociation + of instructions that lengthens the loop carried dependency. This + may have no affect as the compiler may have ordered instructions + the same way without it but in testing this has not been the case + for GCC. Improving GCC to reliably schedule instructions ideally + cannot be easily done. + + Architecture(s) that use the reassociation barries are: + x86 + + Note it is very unlikely the reassociation barriers would + de-optimize performance on any architecture and with an imperfect + compiler it may help performance, especially on out-of-order cpus, + so it is suggested that the respective maintainers add them. + + architecture maintainers are encouraged to benchmark this with + __asm_reassociation_barrier defined to __asm__ like it is in x86. +*/ + + +#ifndef __asm_reassociation_barrier +# define __asm_reassociation_barrier(...) +#endif + +static __always_inline uint32_t +__attribute__ ((unused)) +_dl_new_hash (const char *str) +{ + const unsigned char *s = (const unsigned char *) str; + unsigned int h = 5381; + unsigned int c0, c1; + for (;;) + { + c0 = s[0]; + /* Since hashed string is normally not empty, this is unlikely on the + first iteration of the loop. */ + if (__glibc_unlikely (c0 == 0)) + return h; + + c1 = s[1]; + if (c1 == 0) + { + /* Ideal computational order is: + c0 += h; + h *= 32; + h += c0; */ + c0 += h; + __asm_reassociation_barrier("" : "+r"(h) : "r"(c0)); + h = h * 32 + c0; + return h; + } + + /* Ideal computational order is: + c1 += c0; + h *= 33 * 33; + c0 *= 32; + c1 += c0; + h += c1; */ + c1 += c0; + __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0)); + h *= 33 * 33; + c1 += c0 * 32; + __asm_reassociation_barrier("" : "+r"(c1)); + h += c1; + s += 2; + } +} + +#endif /* dl-new-hash.h */ diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h new file mode 100644 index 0000000000..ce8fb5a838 --- /dev/null +++ b/sysdeps/x86/dl-new-hash.h @@ -0,0 +1,24 @@ +/* _dl_new_hash for elf symbol lookup + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifdef __asm_reassociation_barrier +# error "__asm_reassociation_barrier should never already be defined." +#endif + +#define __asm_reassociation_barrier __asm__ +#include <sysdeps/generic/dl-new-hash.h>