Message ID | 50b726a8-5857-3cd1-0d3b-a08e0e13fdf9@us.ibm.com |
---|---|
State | New |
Headers | show |
Series | [v2,rs6000] PR89338, PR89339: Fix compat vector intrinsics for BE and 32-bit | expand |
ping. On 02/19/2019 03:03 PM, Paul Clarke wrote: > Test FAILS: sse2-cvtpd2dq-1, sse2-cvtpd2ps, sse2-cvttpd2dq on powerpc64 > (big-endian). > > _mm_cvtpd_epi32, _mm_cvtpd_ps, _mm_cvttpd_epi32: Type conversion from > vector doubleword type to vector word type leaves the results in even > lanes in big endian mode. > > Test FAILS: sse-cvtss2si-1, sse-cvtss2si-2, sse-movmskb-1 on powerpc > (32-bit big-endian). > > Incorrect type for interpreting the result from mfvsrd instruction leads > to incorrect results. Also, mfvsrd instruction only works as expected in > 64-bit mode or for 32-bit quantities in 32-bit mode. A more general, > if slower, solution is needed for 32-bit mode. > > 2019-02-19 Paul A. Clarke <pc@us.ibm.com> > > [gcc] > > * config/rs6000/emmintrin.h (_mm_cvtpd_epi32): Fix big endian. > (_mm_cvtpd_ps): Likewise. > (_mm_cvttpd_epi32): Likewise. > > PR89338 > * config/rs6000/xmmintrin.h (_mm_cvtss_f32): Fix type mismatch. > (_mm_cvt_ss2si): Fix type mismatch and 32-bit. > > PR89339 > * config/rs6000/xmmintrin.h (_mm_movemask_pi8): Fix 32-bit. > > --- > v2: more elegant solution for the 32-bit mode fix in _mm_movemask_pi8, > as suggested by Segher. 
> > Index: gcc/config/rs6000/emmintrin.h > =================================================================== > diff --git a/trunk/gcc/config/rs6000/emmintrin.h b/trunk/gcc/config/rs6000/emmintrin.h > --- a/trunk/gcc/config/rs6000/emmintrin.h (revision 268997) > +++ b/trunk/gcc/config/rs6000/emmintrin.h (working copy) > @@ -887,7 +887,11 @@ _mm_cvtpd_epi32 (__m128d __A) > : ); > > #ifdef _ARCH_PWR8 > +#ifdef __LITTLE_ENDIAN__ > temp = vec_mergeo (temp, temp); > +#else > + temp = vec_mergee (temp, temp); > +#endif > result = (__v4si) vec_vpkudum ((__vector long long) temp, > (__vector long long) vzero); > #else > @@ -922,7 +926,11 @@ _mm_cvtpd_ps (__m128d __A) > : ); > > #ifdef _ARCH_PWR8 > +#ifdef __LITTLE_ENDIAN__ > temp = vec_mergeo (temp, temp); > +#else > + temp = vec_mergee (temp, temp); > +#endif > result = (__v4sf) vec_vpkudum ((__vector long long) temp, > (__vector long long) vzero); > #else > @@ -951,7 +959,11 @@ _mm_cvttpd_epi32 (__m128d __A) > : ); > > #ifdef _ARCH_PWR8 > +#ifdef __LITTLE_ENDIAN__ > temp = vec_mergeo (temp, temp); > +#else > + temp = vec_mergee (temp, temp); > +#endif > result = (__v4si) vec_vpkudum ((__vector long long) temp, > (__vector long long) vzero); > #else > Index: gcc/config/rs6000/xmmintrin.h > =================================================================== > diff --git a/trunk/gcc/config/rs6000/xmmintrin.h b/trunk/gcc/config/rs6000/xmmintrin.h > --- a/trunk/gcc/config/rs6000/xmmintrin.h (revision 268997) > +++ b/trunk/gcc/config/rs6000/xmmintrin.h (working copy) > @@ -905,7 +905,7 @@ _mm_cvtss_f32 (__m128 __A) > extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > _mm_cvtss_si32 (__m128 __A) > { > - __m64 res = 0; > + int res; > #ifdef _ARCH_PWR8 > double dtmp; > __asm__( > @@ -938,8 +938,8 @@ _mm_cvt_ss2si (__m128 __A) > extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > _mm_cvtss_si64 (__m128 __A) > { > - __m64 res = 0; > -#ifdef _ARCH_PWR8 
> + long long res; > +#if defined (_ARCH_PWR8) && defined (__powerpc64__) > double dtmp; > __asm__( > #ifdef __LITTLE_ENDIAN__ > @@ -1577,6 +1577,7 @@ _m_pminub (__m64 __A, __m64 __B) > extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > _mm_movemask_pi8 (__m64 __A) > { > +#ifdef __powerpc64__ > unsigned long long p = > #ifdef __LITTLE_ENDIAN__ > 0x0008101820283038UL; // permute control for sign bits > @@ -1584,6 +1585,12 @@ _mm_movemask_pi8 (__m64 __A) > 0x3830282018100800UL; // permute control for sign bits > #endif > return __builtin_bpermd (p, __A); > +#else > + unsigned int mask = 0x20283038UL; > + unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf; > + unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf; > + return (r2 << 4) | r1; > +#endif > } > > extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) >
Hi Paul,

On Tue, Feb 19, 2019 at 03:03:58PM -0600, Paul Clarke wrote:
> Test FAILS: sse2-cvtpd2dq-1, sse2-cvtpd2ps, sse2-cvttpd2dq on powerpc64
> (big-endian).
>
> _mm_cvtpd_epi32, _mm_cvtpd_ps, _mm_cvttpd_epi32: Type conversion from
> vector doubleword type to vector word type leaves the results in even
> lanes in big endian mode.
>
> Test FAILS: sse-cvtss2si-1, sse-cvtss2si-2, sse-movmskb-1 on powerpc
> (32-bit big-endian).
>
> Incorrect type for interpreting the result from mfvsrd instruction leads
> to incorrect results. Also, mfvsrd instruction only works as expected in
> 64-bit mode or for 32-bit quantities in 32-bit mode. A more general,
> if slower, solution is needed for 32-bit mode.

Sorry for not reviewing this before. Thanks for the ping :-)

> 2019-02-19 Paul A. Clarke <pc@us.ibm.com>
>
> [gcc]
>
> * config/rs6000/emmintrin.h (_mm_cvtpd_epi32): Fix big endian.
> (_mm_cvtpd_ps): Likewise.
> (_mm_cvttpd_epi32): Likewise.
>
> PR89338

This should be PR target/89338

> * config/rs6000/xmmintrin.h (_mm_cvtss_f32): Fix type mismatch.
> (_mm_cvt_ss2si): Fix type mismatch and 32-bit.
>
> PR89339
> * config/rs6000/xmmintrin.h (_mm_movemask_pi8): Fix 32-bit.

Okay for trunk with those corrected. Thanks!

Segher
Index: gcc/config/rs6000/emmintrin.h =================================================================== diff --git a/trunk/gcc/config/rs6000/emmintrin.h b/trunk/gcc/config/rs6000/emmintrin.h --- a/trunk/gcc/config/rs6000/emmintrin.h (revision 268997) +++ b/trunk/gcc/config/rs6000/emmintrin.h (working copy) @@ -887,7 +887,11 @@ _mm_cvtpd_epi32 (__m128d __A) : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -922,7 +926,11 @@ _mm_cvtpd_ps (__m128d __A) : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4sf) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -951,7 +959,11 @@ _mm_cvttpd_epi32 (__m128d __A) : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else Index: gcc/config/rs6000/xmmintrin.h =================================================================== diff --git a/trunk/gcc/config/rs6000/xmmintrin.h b/trunk/gcc/config/rs6000/xmmintrin.h --- a/trunk/gcc/config/rs6000/xmmintrin.h (revision 268997) +++ b/trunk/gcc/config/rs6000/xmmintrin.h (working copy) @@ -905,7 +905,7 @@ _mm_cvtss_f32 (__m128 __A) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si32 (__m128 __A) { - __m64 res = 0; + int res; #ifdef _ARCH_PWR8 double dtmp; __asm__( @@ -938,8 +938,8 @@ _mm_cvt_ss2si (__m128 __A) extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si64 (__m128 __A) { - __m64 res = 0; -#ifdef _ARCH_PWR8 + long long res; +#if defined (_ARCH_PWR8) && defined (__powerpc64__) double dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ @@ 
-1577,6 +1577,7 @@ _m_pminub (__m64 __A, __m64 __B) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pi8 (__m64 __A) { +#ifdef __powerpc64__ unsigned long long p = #ifdef __LITTLE_ENDIAN__ 0x0008101820283038UL; // permute control for sign bits @@ -1584,6 +1585,12 @@ _mm_movemask_pi8 (__m64 __A) 0x3830282018100800UL; // permute control for sign bits #endif return __builtin_bpermd (p, __A); +#else + unsigned int mask = 0x20283038UL; + unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf; + unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf; + return (r2 << 4) | r1; +#endif } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))