Message ID | 20230601123332.3297404-2-ardb@kernel.org |
---|---|
State | New |
Headers | show |
Series | Implement PMULL using host intrinsics | expand |
On Thu, 1 Jun 2023 at 13:33, Ard Biesheuvel <ardb@kernel.org> wrote: > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > --- > host/include/i386/host/cpuinfo.h | 1 + > target/arm/tcg/vec_helper.c | 26 +++++++++++++++++++- > util/cpuinfo-i386.c | 1 + > 3 files changed, 27 insertions(+), 1 deletion(-) > > diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h > index 073d0a426f31487d..cf4ced844760d28f 100644 > --- a/host/include/i386/host/cpuinfo.h > +++ b/host/include/i386/host/cpuinfo.h > @@ -27,6 +27,7 @@ > #define CPUINFO_ATOMIC_VMOVDQA (1u << 16) > #define CPUINFO_ATOMIC_VMOVDQU (1u << 17) > #define CPUINFO_AES (1u << 18) > +#define CPUINFO_PMULL (1u << 19) > > /* Initialized with a constructor. */ > extern unsigned cpuinfo; > diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c > index f59d3b26eacf08f8..fb422627588439b3 100644 > --- a/target/arm/tcg/vec_helper.c > +++ b/target/arm/tcg/vec_helper.c > @@ -25,6 +25,14 @@ > #include "qemu/int128.h" > #include "vec_internal.h" > > +#ifdef __x86_64__ > +#include "host/cpuinfo.h" > +#include <wmmintrin.h> > +#define TARGET_PMULL __attribute__((__target__("pclmul"))) > +#else > +#define TARGET_PMULL > +#endif > + > /* > * Data for expanding active predicate bits to bytes, for byte elements. > * > @@ -2010,12 +2018,28 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) > * Because of the lanes are not accessed in strict columns, > * this probably cannot be turned into a generic helper. 
> */ > -void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) > +void TARGET_PMULL HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) > { > intptr_t i, j, opr_sz = simd_oprsz(desc); > intptr_t hi = simd_data(desc); > uint64_t *d = vd, *n = vn, *m = vm; > > +#ifdef __x86_64__ > + if (cpuinfo & CPUINFO_PMULL) { > + switch (hi) { > + case 0: > + *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x0); > + break; > + case 1: > + *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x11); > + break; > + default: > + g_assert_not_reached(); > + } > + return; > + } > +#endif This needs to cope with the input vectors being more than just 128 bits wide, I think. Also you probably still need the clear_tail() to clear any high bits of the register. thanks -- PMM
On Thu, 1 Jun 2023 at 15:01, Peter Maydell <peter.maydell@linaro.org> wrote: > > On Thu, 1 Jun 2023 at 13:33, Ard Biesheuvel <ardb@kernel.org> wrote: > > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > > --- > > host/include/i386/host/cpuinfo.h | 1 + > > target/arm/tcg/vec_helper.c | 26 +++++++++++++++++++- > > util/cpuinfo-i386.c | 1 + > > 3 files changed, 27 insertions(+), 1 deletion(-) > > > > diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h > > index 073d0a426f31487d..cf4ced844760d28f 100644 > > --- a/host/include/i386/host/cpuinfo.h > > +++ b/host/include/i386/host/cpuinfo.h > > @@ -27,6 +27,7 @@ > > #define CPUINFO_ATOMIC_VMOVDQA (1u << 16) > > #define CPUINFO_ATOMIC_VMOVDQU (1u << 17) > > #define CPUINFO_AES (1u << 18) > > +#define CPUINFO_PMULL (1u << 19) > > > > /* Initialized with a constructor. */ > > extern unsigned cpuinfo; > > diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c > > index f59d3b26eacf08f8..fb422627588439b3 100644 > > --- a/target/arm/tcg/vec_helper.c > > +++ b/target/arm/tcg/vec_helper.c > > @@ -25,6 +25,14 @@ > > #include "qemu/int128.h" > > #include "vec_internal.h" > > > > +#ifdef __x86_64__ > > +#include "host/cpuinfo.h" > > +#include <wmmintrin.h> > > +#define TARGET_PMULL __attribute__((__target__("pclmul"))) > > +#else > > +#define TARGET_PMULL > > +#endif > > + > > /* > > * Data for expanding active predicate bits to bytes, for byte elements. > > * > > @@ -2010,12 +2018,28 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) > > * Because of the lanes are not accessed in strict columns, > > * this probably cannot be turned into a generic helper. 
> > */ > > -void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) > > +void TARGET_PMULL HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) > > { > > intptr_t i, j, opr_sz = simd_oprsz(desc); > > intptr_t hi = simd_data(desc); > > uint64_t *d = vd, *n = vn, *m = vm; > > > > +#ifdef __x86_64__ > > + if (cpuinfo & CPUINFO_PMULL) { > > + switch (hi) { > > + case 0: > > + *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x0); > > + break; > > + case 1: > > + *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x11); > > + break; > > + default: > > + g_assert_not_reached(); > > + } > > + return; > > + } > > +#endif > > This needs to cope with the input vectors being more than > just 128 bits wide, I think. Also you probably still > need the clear_tail() to clear any high bits of the register. > Ah yes, I missed that completely.
diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
index 073d0a426f31487d..cf4ced844760d28f 100644
--- a/host/include/i386/host/cpuinfo.h
+++ b/host/include/i386/host/cpuinfo.h
@@ -27,6 +27,7 @@
 #define CPUINFO_ATOMIC_VMOVDQA (1u << 16)
 #define CPUINFO_ATOMIC_VMOVDQU (1u << 17)
 #define CPUINFO_AES (1u << 18)
+#define CPUINFO_PMULL (1u << 19)
 
 /* Initialized with a constructor. */
 extern unsigned cpuinfo;
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index f59d3b26eacf08f8..fb422627588439b3 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -25,6 +25,14 @@
 #include "qemu/int128.h"
 #include "vec_internal.h"
 
+#ifdef __x86_64__
+#include "host/cpuinfo.h"
+#include <wmmintrin.h>
+#define TARGET_PMULL __attribute__((__target__("pclmul")))
+#else
+#define TARGET_PMULL
+#endif
+
 /*
  * Data for expanding active predicate bits to bytes, for byte elements.
  *
@@ -2010,12 +2018,28 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
  * Because of the lanes are not accessed in strict columns,
  * this probably cannot be turned into a generic helper.
  */
-void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
+void TARGET_PMULL HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     intptr_t i, j, opr_sz = simd_oprsz(desc);
     intptr_t hi = simd_data(desc);
     uint64_t *d = vd, *n = vn, *m = vm;
 
+#ifdef __x86_64__
+    if (cpuinfo & CPUINFO_PMULL) {
+        switch (hi) {
+        case 0:
+            *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x0);
+            break;
+        case 1:
+            *(__m128i *)vd = _mm_clmulepi64_si128(*(__m128i *)vm, *(__m128i *)vn, 0x11);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        return;
+    }
+#endif
+
     for (i = 0; i < opr_sz / 8; i += 2) {
         uint64_t nn = n[i + hi];
         uint64_t mm = m[i + hi];
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
index 3043f066c0182dc8..8930e13451201a64 100644
--- a/util/cpuinfo-i386.c
+++ b/util/cpuinfo-i386.c
@@ -40,6 +40,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
         info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
         info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
         info |= (c & bit_AES ? CPUINFO_AES : 0);
+        info |= (c & bit_PCLMULQDQ ? CPUINFO_PMULL : 0);
 
         /* For AVX features, we must check available and usable. */
         if ((c & bit_AVX) && (c & bit_OSXSAVE)) {
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 host/include/i386/host/cpuinfo.h |  1 +
 target/arm/tcg/vec_helper.c      | 26 +++++++++++++++++++-
 util/cpuinfo-i386.c              |  1 +
 3 files changed, 27 insertions(+), 1 deletion(-)