diff mbox series

[2/2] util: add util function buffer_zero_avx512()

Message ID 1581580379-54109-3-git-send-email-robert.hu@linux.intel.com
State New
Headers show
Series Add AVX512F optimization option and buffer_zero_avx512() | expand

Commit Message

Robert Hoo Feb. 13, 2020, 7:52 a.m. UTC
And initialize buffer_is_zero() with it, when Intel AVX512F is
available on host.

This function utilizes Intel AVX512 fundamental instructions which
perform over previous AVX2 instructions.

Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
---
 include/qemu/cpuid.h |  3 +++
 util/bufferiszero.c  | 56 +++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 56 insertions(+), 3 deletions(-)

Comments

Paolo Bonzini Feb. 13, 2020, 10:30 a.m. UTC | #1
On 13/02/20 08:52, Robert Hoo wrote:
> +
> +}
> +#pragma GCC pop_options
> +#endif
> +
> +
>  /* Note that for test_buffer_is_zero_next_accel, the most preferred
>   * ISA must have the least significant bit.
>   */
> -#define CACHE_AVX2    1
> -#define CACHE_SSE4    2
> -#define CACHE_SSE2    4
> +#define CACHE_AVX512F 1
> +#define CACHE_AVX2    2
> +#define CACHE_SSE4    4
> +#define CACHE_SSE2    6

This should be 8, not 6.

Paolo

>  
>  /* Make sure that these variables are appropriately initialized when
>   * SSE2 is enabled on the compiler command-line, but the compiler is
> @@ -226,6 +268,11 @@ static void init_accel(unsigned cache)
>          fn = buffer_zero_avx2;
>      }
>  #endif
> +#ifdef CONFIG_AVX512F_OPT
> +    if (cache & CACHE_AVX512F) {
> +        fn = buffer_zero_avx512;
> +    }
> +#endif
>      buffer_accel = fn;
>  }
>  
> @@ -255,6 +302,9 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
>              if ((bv & 6) == 6 && (b & bit_AVX2)) {
>                  cache |= CACHE_AVX2;
>              }
> +            if ((bv & 6) == 6 && (b & bit_AVX512F)) {
> +                cache |= CACHE_AVX512F;
> +            }
>          }
Robert Hoo Feb. 13, 2020, 11:58 a.m. UTC | #2
On Thu, 2020-02-13 at 11:30 +0100, Paolo Bonzini wrote:
> On 13/02/20 08:52, Robert Hoo wrote:
> > +
> > +}
> > +#pragma GCC pop_options
> > +#endif
> > +
> > +
> >  /* Note that for test_buffer_is_zero_next_accel, the most
> > preferred
> >   * ISA must have the least significant bit.
> >   */
> > -#define CACHE_AVX2    1
> > -#define CACHE_SSE4    2
> > -#define CACHE_SSE2    4
> > +#define CACHE_AVX512F 1
> > +#define CACHE_AVX2    2
> > +#define CACHE_SSE4    4
> > +#define CACHE_SSE2    6
> 
> This should be 8, not 6.
> 
> Paolo

Thanks Paolo, going to fix it in v2.
> 
> >  
> >  /* Make sure that these variables are appropriately initialized
> > when
> >   * SSE2 is enabled on the compiler command-line, but the compiler
> > is
> > @@ -226,6 +268,11 @@ static void init_accel(unsigned cache)
> >          fn = buffer_zero_avx2;
> >      }
> >  #endif
> > +#ifdef CONFIG_AVX512F_OPT
> > +    if (cache & CACHE_AVX512F) {
> > +        fn = buffer_zero_avx512;
> > +    }
> > +#endif
> >      buffer_accel = fn;
> >  }
> >  
> > @@ -255,6 +302,9 @@ static void __attribute__((constructor))
> > init_cpuid_cache(void)
> >              if ((bv & 6) == 6 && (b & bit_AVX2)) {
> >                  cache |= CACHE_AVX2;
> >              }
> > +            if ((bv & 6) == 6 && (b & bit_AVX512F)) {
> > +                cache |= CACHE_AVX512F;
> > +            }
> >          }
> 
>
Richard Henderson Feb. 13, 2020, 6:20 p.m. UTC | #3
On 2/12/20 11:52 PM, Robert Hoo wrote:
> And initialize buffer_is_zero() with it, when Intel AVX512F is
> available on host.
> 
> This function utilizes Intel AVX512 fundamental instructions which
> perform over previous AVX2 instructions.

Is it not still true that any AVX512 insn will cause the entire cpu package,
not just the current core, to drop frequency by 20%?

As far as I know one should only use the 512-bit instructions when you can
overcome that frequency drop, which seems unlikely in this case.  That said...


> +    if (unlikely(len < 64)) { /*buff less than 512 bits, unlikely*/
> +        return buffer_zero_int(buf, len);
> +    }

First, len < 64 has been eliminated already in select_accel_fn.
Second, len < 256 is not handled properly by the code below...


> +    /* Begin with an unaligned head of 64 bytes.  */
> +    t = _mm512_loadu_si512(buf);
> +    p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
> +    e = (__m512i *)(((uintptr_t)buf + len) & -64);
> +
> +    /* Loop over 64-byte aligned blocks of 256.  */
> +    while (p < e) {
> +        __builtin_prefetch(p);
> +        if (unlikely(_mm512_test_epi64_mask(t, t))) {
> +            return false;
> +        }
> +        t = p[-4] | p[-3] | p[-2] | p[-1];
> +        p += 4;
> +    }
> +
> +    t |= _mm512_loadu_si512(buf + len - 4 * 64);
> +    t |= _mm512_loadu_si512(buf + len - 3 * 64);
> +    t |= _mm512_loadu_si512(buf + len - 2 * 64);
> +    t |= _mm512_loadu_si512(buf + len - 1 * 64);

... because this final sequence loads 256 bytes.

Rather than make a second test vs 256 in buffer_zero_avx512, I wonder if it
would be better to have select_accel_fn do the job.  Have a global variable
buffer_accel_size alongside buffer_accel so there's only one branch
(mis)predict to worry about.

FWIW, something that the compiler should do, but doesn't currently, is use
vpternlogq to perform a 3-input OR.  Something like

    /* 0xfe -> orABC */
    t = _mm512_ternarylogic_epi64(t, p[-4], p[-3], 0xfe);
    t = _mm512_ternarylogic_epi64(t, p[-2], p[-1], 0xfe);


r~
Robert Hoo Feb. 24, 2020, 7:07 a.m. UTC | #4
Thanks Richard:-)
Sorry for late reply.
On Thu, 2020-02-13 at 10:20 -0800, Richard Henderson wrote:
> On 2/12/20 11:52 PM, Robert Hoo wrote:
> > And initialize buffer_is_zero() with it, when Intel AVX512F is
> > available on host.
> > 
> > This function utilizes Intel AVX512 fundamental instructions which
> > perform over previous AVX2 instructions.
> 
> Is it not still true that any AVX512 insn will cause the entire cpu
> package,
> not just the current core, to drop frequency by 20%?
> 
> As far as I know one should only use the 512-bit instructions when
> you can
> overcome that frequency drop, which seems unlikely in this
> case.  That said...
> I don't think so. AVX512 has been applied in various places.
> > +    if (unlikely(len < 64)) { /*buff less than 512 bits,
> > unlikely*/
> > +        return buffer_zero_int(buf, len);
> > +    }
> 
> First, len < 64 has been eliminated already in select_accel_fn.
> Second, len < 256 is not handled properly by the code below...
> 
Right. I'm going to fix this in v2.
> 
> > +    /* Begin with an unaligned head of 64 bytes.  */
> > +    t = _mm512_loadu_si512(buf);
> > +    p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
> > +    e = (__m512i *)(((uintptr_t)buf + len) & -64);
> > +
> > +    /* Loop over 64-byte aligned blocks of 256.  */
> > +    while (p < e) {
> > +        __builtin_prefetch(p);
> > +        if (unlikely(_mm512_test_epi64_mask(t, t))) {
> > +            return false;
> > +        }
> > +        t = p[-4] | p[-3] | p[-2] | p[-1];
> > +        p += 4;
> > +    }
> > +
> > +    t |= _mm512_loadu_si512(buf + len - 4 * 64);
> > +    t |= _mm512_loadu_si512(buf + len - 3 * 64);
> > +    t |= _mm512_loadu_si512(buf + len - 2 * 64);
> > +    t |= _mm512_loadu_si512(buf + len - 1 * 64);
> 
> ... because this final sequence loads 256 bytes.
> 
> Rather than make a second test vs 256 in buffer_zero_avx512, I wonder
> if it
> would be better to have select_accel_fn do the job.  Have a global
> variable
> buffer_accel_size alongside buffer_accel so there's only one branch
> (mis)predict to worry about.
> 
Thanks Richard, very enlightening!
Inspired by your suggestion, I'm thinking go further: use immediate
rather than a global variable, so that saves 1 memory(/cache) access. 

#ifdef CONFIG_AVX512F_OPT   
#define OPTIMIZE_LEN    256
#else
#define OPTIMIZE_LEN    64
#endif
> FWIW, something that the compiler should do, but doesn't currently,
> is use
> vpternlogq to perform a 3-input OR.  Something like
> 
>     /* 0xfe -> orABC */
>     t = _mm512_ternarylogic_epi64(t, p[-4], p[-3], 0xfe);
>     t = _mm512_ternarylogic_epi64(t, p[-2], p[-1], 0xfe);
> 
Very enlightening. Yes, seems compiler doesn't do this.
I tried explicitly use this, however, looks it will have more
instructions generated, and unit test shows it performs less than then
conventional code.
Let me keep the conventional code for this moment, will ask around and
dig further outside this patch.

> 
> r~
Richard Henderson Feb. 24, 2020, 4:13 p.m. UTC | #5
On 2/23/20 11:07 PM, Robert Hoo wrote:
> Inspired by your suggestion, I'm thinking go further: use immediate
> rather than a global variable, so that saves 1 memory(/cache) access. 
> 
> #ifdef CONFIG_AVX512F_OPT   
> #define OPTIMIZE_LEN    256
> #else
> #define OPTIMIZE_LEN    64
> #endif

With that, the testing in tests/test-bufferiszero.c, looping through the
implementations, is invalidated.  Because once you start compiling for avx512,
you're no longer testing sse2 et al with the same inputs.

IF we want to change the length to suit avx512, we would want to change it
unconditionally.  And then you could also tidy up avx2 to avoid the extra
comparisons there.


r~
Robert Hoo Feb. 25, 2020, 7:34 a.m. UTC | #6
On Mon, 2020-02-24 at 08:13 -0800, Richard Henderson wrote:
> On 2/23/20 11:07 PM, Robert Hoo wrote:
> > Inspired by your suggestion, I'm thinking go further: use immediate
> > rather than a global variable, so that saves 1 memory(/cache)
> > access. 
> > 
> > #ifdef CONFIG_AVX512F_OPT   
> > #define OPTIMIZE_LEN    256
> > #else
> > #define OPTIMIZE_LEN    64
> > #endif
> 
> With that, the testing in tests/test-bufferiszero.c, looping through
> the
> implementations, is invalidated.  Because once you start compiling
> for avx512,
> you're no longer testing sse2 et al with the same inputs.
> 
Right. Thanks pointing out. I didn't noticed that.
More precisely, it would cause no longer testing sse2 et al with < 256
length.

> IF we want to change the length to suit avx512, we would want to
> change it
> unconditionally.  And then you could also tidy up avx2 to avoid the
> extra
> comparisons there.
Considering the length's dependency on sse2/sse4/avx2/avx512 and the
algorithms, as well as future's possible changes, additions, I'd rather
roll back to your original suggestion, use a companion variable with
each accel_fn(). How do you like it?

> 
> 
> r~
Richard Henderson Feb. 25, 2020, 3:29 p.m. UTC | #7
On 2/24/20 11:34 PM, Robert Hoo wrote:
> Considering the length's dependency on sse2/sse4/avx2/avx512 and the
> algorithms, as well as future's possible changes, additions, I'd rather
> roll back to your original suggestion, use a companion variable with
> each accel_fn(). How do you like it?

How do I like it?

With a modification to init_accel() so that the function and the minimum length
are selected at the same time.


r~
diff mbox series

Patch

diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h
index 6930170..09fc245 100644
--- a/include/qemu/cpuid.h
+++ b/include/qemu/cpuid.h
@@ -45,6 +45,9 @@ 
 #ifndef bit_AVX2
 #define bit_AVX2        (1 << 5)
 #endif
+#ifndef bit_AVX512F
+#define bit_AVX512F        (1 << 16)
+#endif
 #ifndef bit_BMI2
 #define bit_BMI2        (1 << 8)
 #endif
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index bfb2605..cbb854a 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -187,12 +187,54 @@  buffer_zero_avx2(const void *buf, size_t len)
 #pragma GCC pop_options
 #endif /* CONFIG_AVX2_OPT */
 
+#ifdef CONFIG_AVX512F_OPT
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#include <immintrin.h>
+
+static bool
+buffer_zero_avx512(const void *buf, size_t len)
+{
+    __m512i t;
+    __m512i *p, *e;
+
+    if (unlikely(len < 64)) { /*buff less than 512 bits, unlikely*/
+        return buffer_zero_int(buf, len);
+    }
+    /* Begin with an unaligned head of 64 bytes.  */
+    t = _mm512_loadu_si512(buf);
+    p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
+    e = (__m512i *)(((uintptr_t)buf + len) & -64);
+
+    /* Loop over 64-byte aligned blocks of 256.  */
+    while (p < e) {
+        __builtin_prefetch(p);
+        if (unlikely(_mm512_test_epi64_mask(t, t))) {
+            return false;
+        }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    }
+
+    t |= _mm512_loadu_si512(buf + len - 4 * 64);
+    t |= _mm512_loadu_si512(buf + len - 3 * 64);
+    t |= _mm512_loadu_si512(buf + len - 2 * 64);
+    t |= _mm512_loadu_si512(buf + len - 1 * 64);
+
+    return !_mm512_test_epi64_mask(t, t);
+
+}
+#pragma GCC pop_options
+#endif
+
+
 /* Note that for test_buffer_is_zero_next_accel, the most preferred
  * ISA must have the least significant bit.
  */
-#define CACHE_AVX2    1
-#define CACHE_SSE4    2
-#define CACHE_SSE2    4
+#define CACHE_AVX512F 1
+#define CACHE_AVX2    2
+#define CACHE_SSE4    4
+#define CACHE_SSE2    6
 
 /* Make sure that these variables are appropriately initialized when
  * SSE2 is enabled on the compiler command-line, but the compiler is
@@ -226,6 +268,11 @@  static void init_accel(unsigned cache)
         fn = buffer_zero_avx2;
     }
 #endif
+#ifdef CONFIG_AVX512F_OPT
+    if (cache & CACHE_AVX512F) {
+        fn = buffer_zero_avx512;
+    }
+#endif
     buffer_accel = fn;
 }
 
@@ -255,6 +302,9 @@  static void __attribute__((constructor)) init_cpuid_cache(void)
             if ((bv & 6) == 6 && (b & bit_AVX2)) {
                 cache |= CACHE_AVX2;
             }
+            if ((bv & 6) == 6 && (b & bit_AVX512F)) {
+                cache |= CACHE_AVX512F;
+            }
         }
     }
     cpuid_cache = cache;