diff mbox series

[v6,6/6] elf: Optimize _dl_new_hash in dl-new-hash.h

Message ID 20220510150441.20948-6-goldstein.w.n@gmail.com
State New
Headers show
Series [v6,1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked | expand

Commit Message

Noah Goldstein May 10, 2022, 3:04 p.m. UTC
Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. Note the unrolling allows
for pipelined multiplies which helps a bit, but most of the gain
is from enforcing better instruction scheduling for more ILP.
Unrolling further started to induce slowdowns for sizes [0, 4]
but can help the loop so if larger sizes are the target further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=25 runs
Geometric of all benchmark New / Old: 0.791
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    0.641,    0.658,               0.974
 fixed,      1,    1.888,    1.883,               1.003
 fixed,      2,    2.712,    2.833,               0.957
 fixed,      3,    3.314,    3.739,               0.886
 fixed,      4,    4.316,    4.866,               0.887
 fixed,      5,     5.16,    5.966,               0.865
 fixed,      6,    5.986,    7.241,               0.827
 fixed,      7,    7.264,    8.435,               0.861
 fixed,      8,    8.052,    9.846,               0.818
 fixed,      9,    9.369,   11.316,               0.828
 fixed,     10,   10.256,   12.925,               0.794
 fixed,     11,   12.191,   14.546,               0.838
 fixed,     12,   12.667,    15.92,               0.796
 fixed,     13,   14.442,   17.465,               0.827
 fixed,     14,   14.808,   18.981,                0.78
 fixed,     15,   16.244,   20.565,                0.79
 fixed,     16,   17.166,   22.044,               0.779
 fixed,     32,   35.447,   50.558,               0.701
 fixed,     64,   86.479,  134.529,               0.643
 fixed,    128,  155.453,  287.527,               0.541
 fixed,    256,   302.57,   593.64,                0.51
random,      2,   11.168,    10.61,               1.053
random,      4,   13.308,    13.53,               0.984
random,      8,   16.579,   19.437,               0.853
random,     16,   21.292,   24.776,               0.859
random,     32,    30.56,   35.906,               0.851
random,     64,   49.249,   68.577,               0.718
random,    128,   81.845,  140.664,               0.582
random,    256,  152.517,  292.204,               0.522

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 elf/dl-new-hash.h | 50 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 5 deletions(-)

Comments

H.J. Lu May 10, 2022, 3:29 p.m. UTC | #1
On Tue, May 10, 2022 at 8:04 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. Note the unrolling allows
> for pipelined multiplies which helps a bit, but most of the gain
> is from enforcing better instruction scheduling for more ILP.
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
>
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>
> Time as Geometric Mean of N=25 runs
> Geometric of all benchmark New / Old: 0.791
>   type, length, New Time, Old Time, New Time / Old Time
>  fixed,      0,    0.641,    0.658,               0.974
>  fixed,      1,    1.888,    1.883,               1.003
>  fixed,      2,    2.712,    2.833,               0.957
>  fixed,      3,    3.314,    3.739,               0.886
>  fixed,      4,    4.316,    4.866,               0.887
>  fixed,      5,     5.16,    5.966,               0.865
>  fixed,      6,    5.986,    7.241,               0.827
>  fixed,      7,    7.264,    8.435,               0.861
>  fixed,      8,    8.052,    9.846,               0.818
>  fixed,      9,    9.369,   11.316,               0.828
>  fixed,     10,   10.256,   12.925,               0.794
>  fixed,     11,   12.191,   14.546,               0.838
>  fixed,     12,   12.667,    15.92,               0.796
>  fixed,     13,   14.442,   17.465,               0.827
>  fixed,     14,   14.808,   18.981,                0.78
>  fixed,     15,   16.244,   20.565,                0.79
>  fixed,     16,   17.166,   22.044,               0.779
>  fixed,     32,   35.447,   50.558,               0.701
>  fixed,     64,   86.479,  134.529,               0.643
>  fixed,    128,  155.453,  287.527,               0.541
>  fixed,    256,   302.57,   593.64,                0.51
> random,      2,   11.168,    10.61,               1.053
> random,      4,   13.308,    13.53,               0.984
> random,      8,   16.579,   19.437,               0.853
> random,     16,   21.292,   24.776,               0.859
> random,     32,    30.56,   35.906,               0.851
> random,     64,   49.249,   68.577,               0.718
> random,    128,   81.845,  140.664,               0.582
> random,    256,  152.517,  292.204,               0.522
>
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>  elf/dl-new-hash.h | 50 ++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 45 insertions(+), 5 deletions(-)
>
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> index 40d88c81f9..cacbeec289 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/dl-new-hash.h
> @@ -20,15 +20,55 @@
>  #define _DL_NEW_HASH_H 1
>
>  #include <stdint.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
>
>  static inline uint32_t
>  __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +_dl_new_hash (const char *signed_s)
>  {
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> +  const unsigned char *s = (const unsigned char *) signed_s;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = (unsigned int) *s;

I don't think it is safe for strictly aligned targets.

> +      /* Unlikely length zero string so evens will be slightly less
> +        common.  */
> +      if (__glibc_unlikely (c0 == 0))
> +       {
> +         return h;
> +       }
> +
> +      c1 = (unsigned int) *(s + 1);
> +      if (c1 == 0)
> +       {
> +         c0 += h;
> +         /* Ideal instruction scheduling is:
> +            c0 += h;
> +            h *= 32;
> +            h += c0;
> +            The asm statement ensures the compiler can't mess that up.  */
> +         asm("" : "+r"(h) : "r"(c0));
> +         h = h * 32 + c0;
> +      return h;
> +       }
> +
> +      /* Ideal instruction scheduling is:
> +        c1 += c0;
> +        h *= 33 * 33;
> +        c0 *= 32;
> +        c1 += c0;
> +        h  += c1;
> +        The asm statements ensures the compiler can't mess that up.  */
> +      c1 += c0;
> +      asm("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      asm("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
>  }
>
>
> --
> 2.34.1
>
H.J. Lu May 10, 2022, 3:31 p.m. UTC | #2
On Tue, May 10, 2022 at 8:29 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, May 10, 2022 at 8:04 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. Note the unrolling allows
> > for pipelined multiplies which helps a bit, but most of the gain
> > is from enforcing better instruction scheduling for more ILP.
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=25 runs
> > Geometric of all benchmark New / Old: 0.791
> >   type, length, New Time, Old Time, New Time / Old Time
> >  fixed,      0,    0.641,    0.658,               0.974
> >  fixed,      1,    1.888,    1.883,               1.003
> >  fixed,      2,    2.712,    2.833,               0.957
> >  fixed,      3,    3.314,    3.739,               0.886
> >  fixed,      4,    4.316,    4.866,               0.887
> >  fixed,      5,     5.16,    5.966,               0.865
> >  fixed,      6,    5.986,    7.241,               0.827
> >  fixed,      7,    7.264,    8.435,               0.861
> >  fixed,      8,    8.052,    9.846,               0.818
> >  fixed,      9,    9.369,   11.316,               0.828
> >  fixed,     10,   10.256,   12.925,               0.794
> >  fixed,     11,   12.191,   14.546,               0.838
> >  fixed,     12,   12.667,    15.92,               0.796
> >  fixed,     13,   14.442,   17.465,               0.827
> >  fixed,     14,   14.808,   18.981,                0.78
> >  fixed,     15,   16.244,   20.565,                0.79
> >  fixed,     16,   17.166,   22.044,               0.779
> >  fixed,     32,   35.447,   50.558,               0.701
> >  fixed,     64,   86.479,  134.529,               0.643
> >  fixed,    128,  155.453,  287.527,               0.541
> >  fixed,    256,   302.57,   593.64,                0.51
> > random,      2,   11.168,    10.61,               1.053
> > random,      4,   13.308,    13.53,               0.984
> > random,      8,   16.579,   19.437,               0.853
> > random,     16,   21.292,   24.776,               0.859
> > random,     32,    30.56,   35.906,               0.851
> > random,     64,   49.249,   68.577,               0.718
> > random,    128,   81.845,  140.664,               0.582
> > random,    256,  152.517,  292.204,               0.522
> >
> > Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> > ---
> >  elf/dl-new-hash.h | 50 ++++++++++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 45 insertions(+), 5 deletions(-)
> >
> > diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> > index 40d88c81f9..cacbeec289 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/dl-new-hash.h
> > @@ -20,15 +20,55 @@
> >  #define _DL_NEW_HASH_H 1
> >
> >  #include <stdint.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
> >
> >  static inline uint32_t
> >  __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +_dl_new_hash (const char *signed_s)
> >  {
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
> > +  const unsigned char *s = (const unsigned char *) signed_s;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = (unsigned int) *s;
>
> I don't think it is safe for strictly aligned targets.

Never mind.  I didn't read it properly.

> > +      /* Unlikely length zero string so evens will be slightly less
> > +        common.  */
> > +      if (__glibc_unlikely (c0 == 0))
> > +       {
> > +         return h;
> > +       }
> > +
> > +      c1 = (unsigned int) *(s + 1);
> > +      if (c1 == 0)
> > +       {
> > +         c0 += h;
> > +         /* Ideal instruction scheduling is:
> > +            c0 += h;
> > +            h *= 32;
> > +            h += c0;
> > +            The asm statement ensures the compiler can't mess that up.  */
> > +         asm("" : "+r"(h) : "r"(c0));
> > +         h = h * 32 + c0;
> > +      return h;
> > +       }
> > +
> > +      /* Ideal instruction scheduling is:
> > +        c1 += c0;
> > +        h *= 33 * 33;
> > +        c0 *= 32;
> > +        c1 += c0;
> > +        h  += c1;
> > +        The asm statements ensures the compiler can't mess that up.  */
> > +      c1 += c0;
> > +      asm("" : "+r"(c1), "+r"(c0));
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      asm("" : "+r"(c1));
> > +      h += c1;
> > +      s += 2;
> > +    }
> >  }
> >
> >
> > --
> > 2.34.1
> >
>
>
> --
> H.J.
Alexander Monakov May 10, 2022, 4:49 p.m. UTC | #3
On Tue, 10 May 2022, Noah Goldstein wrote:

> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. Note the unrolling allows
> for pipelined multiplies which helps a bit, but most of the gain
> is from enforcing better instruction scheduling for more ILP.
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
> 
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

It seems benchmark figures are unchanged from the first iteration of this
patch, did the revision not affect them?

(more comments below)

> --- a/elf/dl-new-hash.h
> +++ b/elf/dl-new-hash.h
> @@ -20,15 +20,55 @@
>  #define _DL_NEW_HASH_H 1
>  
>  #include <stdint.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
>  
>  static inline uint32_t
>  __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +_dl_new_hash (const char *signed_s)

This is technically inaccurate (whether plain 'char' is signed depends on the
target), so if you're revising this further I'd suggest to change this too to
e.g. 'str'.

>  {
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;

I think it would be nice to retain this loop as a comment to indicate what this
function is supposed to implement.

> +  const unsigned char *s = (const unsigned char *) signed_s;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = (unsigned int) *s;

Surprised to see an explicit cast where plain assignment with implicit type
conversion is doing the obvious thing. Is it really necessary?

> +      /* Unlikely length zero string so evens will be slightly less
> +	 common.  */

I had trouble understanding this comment. I'd suggest dropping it or rephrasing
like 'Since hashed string is normally not empty, this is unlikely on the first
iteration of the loop'.

> +      if (__glibc_unlikely (c0 == 0))
> +	{
> +	  return h;
> +	}

Braces look unnecessary.

> +
> +      c1 = (unsigned int) *(s + 1);

Again unnecessary explicit cast here (c1 = s[1] might be easier to read).
Alternatively, you could use 'c1 = *s++' here and above and drop explicit
s += 2 below, I expect resulting assembly to be the same.

> +      if (c1 == 0)
> +	{
> +	  c0 += h;
> +	  /* Ideal instruction scheduling is:
> +	     c0 += h;
> +	     h *= 32;
> +	     h += c0;
> +	     The asm statement ensures the compiler can't mess that up.  */

The main concern here is preventing reassociation from 'h = h*32 + (c0 + h)'
to 'h = (h*32 + h) + c0', not scheduling. We're using an empty asm to break
up a sequence of additions.

Also note that this leads to a return, so only saves a couple cycles once per
call, not inside the loop.

> +	  asm("" : "+r"(h) : "r"(c0));
> +	  h = h * 32 + c0;
> +      return h;

Wrong indentation here.

> +	}
> +
> +      /* Ideal instruction scheduling is:
> +	 c1 += c0;
> +	 h *= 33 * 33;
> +	 c0 *= 32;
> +	 c1 += c0;
> +	 h  += c1;
> +	 The asm statements ensures the compiler can't mess that up.  */

As above, we are placing empty asms mainly as reassociation barriers.

> +      c1 += c0;
> +      asm("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      asm("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
>  }

Alexander
Noah Goldstein May 10, 2022, 5:17 p.m. UTC | #4
On Tue, May 10, 2022 at 11:49 AM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Tue, 10 May 2022, Noah Goldstein wrote:
>
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. Note the unrolling allows
> > for pipelined multiplies which helps a bit, but most of the gain
> > is from enforcing better instruction scheduling for more ILP.
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>
> It seems benchmark figures are unchanged from the first iteration of this
> patch, did the revision not affect them?

Didn't rerun, will do for V7.
>
> (more comments below)
>
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/dl-new-hash.h
> > @@ -20,15 +20,55 @@
> >  #define _DL_NEW_HASH_H 1
> >
> >  #include <stdint.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
> >
> >  static inline uint32_t
> >  __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +_dl_new_hash (const char *signed_s)
>
> This is technically inaccurate (whether plain 'char' is signed depends on the
> target), so if you're revising this further I'd suggest to change this too to
> e.g. 'str'.

Will fix for V7.
>
> >  {
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
>
> I think it would be nice to retain this loop as a comment to indicate what this
> function is supposed to implement.

Will fix for V7.
>
> > +  const unsigned char *s = (const unsigned char *) signed_s;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = (unsigned int) *s;
>
> Surprised to see an explicit cast where plain assignment with implicit type
> conversion is doing the obvious thing. Is it really necessary?

It's not, I just tend to err on the side of explicitness.

>
> > +      /* Unlikely length zero string so evens will be slightly less
> > +      common.  */
>
> I had trouble understanding this comment. I'd suggest dropping it or rephrasing
> like 'Since hashed string is normally not empty, this is unlikely on the first
> iteration of the loop'.

Will fix for V7.
>
> > +      if (__glibc_unlikely (c0 == 0))
> > +     {
> > +       return h;
> > +     }
>
> Braces look unnecessary.

Will fix v7.
>
> > +
> > +      c1 = (unsigned int) *(s + 1);
>
> Again unnecessary explicit cast here (c1 = s[1] might be easier to read).
> Alternatively, you could use 'c1 = *s++' here and above and drop explicit
> s += 2 below, I expect resulting assembly to be the same.

I generally think using `[]` access makes stylistic sense when incrementing
an index, and `*(s + N)` makes sense when incrementing a pointer.

I'm generally in favor of leaving the casts/access as is, but it's not a hill
worth dying on.

LMK if it's important to you for V7 (will be a few hours to rerun benchmarks).

>
> > +      if (c1 == 0)
> > +     {
> > +       c0 += h;
> > +       /* Ideal instruction scheduling is:
> > +          c0 += h;
> > +          h *= 32;
> > +          h += c0;
> > +          The asm statement ensures the compiler can't mess that up.  */
>
> The main concern here is preventing reassociation from 'h = h*32 + (c0 + h)'
> to 'h = (h*32 + h) + c0', not scheduling. We're using an empty asm to break
> up a sequence of additions.

Well, the reason (h * 32 + h) + c0 is worse is that it creates scheduling
dependencies, no? It seems like if the comment is "to avoid reassociation",
the natural question is why reassociation matters. Saying it's explicitly
for scheduling answers that.

I'll update the comment in V7 to mention both.
>
> Also note that this leads to a return, so only saves a couple cycles once per
> call, not inside the loop.
>
> > +       asm("" : "+r"(h) : "r"(c0));
> > +       h = h * 32 + c0;
> > +      return h;
>
> Wrong indentation here.

Will fix V7.
>
> > +     }
> > +
> > +      /* Ideal instruction scheduling is:
> > +      c1 += c0;
> > +      h *= 33 * 33;
> > +      c0 *= 32;
> > +      c1 += c0;
> > +      h  += c1;
> > +      The asm statements ensures the compiler can't mess that up.  */
>
> As above, we are placing empty asms mainly as reassociation barriers.

Will update to mention both. It really is a missed optimization in GCC's
instruction scheduling pass, though.
>
> > +      c1 += c0;
> > +      asm("" : "+r"(c1), "+r"(c0));
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      asm("" : "+r"(c1));
> > +      h += c1;
> > +      s += 2;
> > +    }
> >  }
>
> Alexander
Alexander Monakov May 10, 2022, 5:40 p.m. UTC | #5
On Tue, 10 May 2022, Noah Goldstein wrote:
> > > +
> > > +      c1 = (unsigned int) *(s + 1);
> >
> > Again unnecessary explicit cast here (c1 = s[1] might be easier to read).
> > Alternatively, you could use 'c1 = *s++' here and above and drop explicit
> > s += 2 below, I expect resulting assembly to be the same.
> 
> generally think user `[]` access makes stylistic sense when incrementing
> an index and *(s1 + N) makes sense when incrementing a pointer.
> 
> I'm generally in favor of leaving the casts/access as is but its not a hill
> worth dying on.
> 
> LMK if its important to you for V7 (will be a few hours to rerun benchmarks).

Thanks for the explanation. I'm not an active contributor, so I don't mind.

Alexander
diff mbox series

Patch

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 40d88c81f9..cacbeec289 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,15 +20,55 @@ 
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
 
 static inline uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+_dl_new_hash (const char *signed_s)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  const unsigned char *s = (const unsigned char *) signed_s;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = (unsigned int) *s;
+      /* Unlikely length zero string so evens will be slightly less
+	 common.  */
+      if (__glibc_unlikely (c0 == 0))
+	{
+	  return h;
+	}
+
+      c1 = (unsigned int) *(s + 1);
+      if (c1 == 0)
+	{
+	  c0 += h;
+	  /* Ideal instruction scheduling is:
+	     c0 += h;
+	     h *= 32;
+	     h += c0;
+	     The asm statement ensures the compiler can't mess that up.  */
+	  asm("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+      return h;
+	}
+
+      /* Ideal instruction scheduling is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;
+	 The asm statements ensures the compiler can't mess that up.  */
+      c1 += c0;
+      asm("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      asm("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
 }