diff mbox series

[1/3,rs6000] x86-compat vector intrinsics fixes for BE, 32bit

Message ID fd7571d1-1ce6-be2c-b474-116485a9674c@us.ibm.com
State New
Headers show
Series x86-compat vector intrinsics fixes for BE, 32bit | expand

Commit Message

Paul A. Clarke Dec. 4, 2018, 2:59 p.m. UTC
Fix general endian and 32-bit mode issues found in the
compatibility implementations of the x86 vector intrinsics when running the
associated test suite tests.  (The tests had been inadvertently made to PASS
without actually running the test code.  A later patch fixes this issue.)

In a few cases, the opportunity was taken to update the vector API used in
the implementations to the preferred functions from the
OpenPOWER 64-Bit ELF V2 ABI Specification.

[gcc]

2018-12-03  Paul A. Clarke  <pc@us.ibm.com>

	PR target/88316
	* config/rs6000/mmintrin.h (_mm_unpackhi_pi8): Fix for big-endian.
	(_mm_unpacklo_pi8): Likewise.
	(_mm_mulhi_pi16): Likewise.
	(_mm_packs_pi16): Fix for big-endian. Use preferred API.
	(_mm_packs_pi32): Likewise.
	(_mm_packs_pu16): Likewise.
	* config/rs6000/xmmintrin.h (_mm_cvtss_si32): Fix for big-endian.
	(_mm_cvtss_si64): Likewise.
	(_mm_cvtpi32x2_ps): Likewise.
	(_mm_shuffle_ps): Likewise.
	(_mm_movemask_pi8): Likewise.
	(_mm_mulhi_pu16): Likewise.
	(_mm_sad_pu8): Likewise.
	(_mm_cvtpu16_ps): Fix for big-endian. Use preferred API.
	(_mm_cvtpu8_ps): Likewise.
	* config/rs6000/emmintrin.h (_mm_movemask_pd): Fix for big-endian.
	(_mm_mul_epu32): Likewise.
	(_mm_bsrli_si128): Likewise.
	(_mm_movemask_epi8): Likewise.
	(_mm_shufflehi_epi16): Likewise.
	(_mm_shufflelo_epi16): Likewise.
	(_mm_shuffle_epi32): Likewise.
	* config/rs6000/pmmintrin.h (_mm_hadd_ps): Fix for big-endian.
	(_mm_hsub_ps): Likewise.
	* config/rs6000/mmintrin.h (_mm_cmpeq_pi8): Fix for 32-bit mode.


--
PC

Comments

Segher Boessenkool Dec. 4, 2018, 8:16 p.m. UTC | #1
Hi!

On Tue, Dec 04, 2018 at 08:59:03AM -0600, Paul Clarke wrote:
> Fix general endian and 32-bit mode issues found in the
> compatibility implementations of the x86 vector intrinsics when running the
> associated test suite tests.  (The tests had been inadvertently made to PASS
> without actually running the test code.  A later patch fixes this issue.)
> 
> In a few cases, the opportunity was taken to update the vector API used in
> the implementations to the preferred functions from the
> OpenPOWER 64-Bit ELF V2 ABI Specification.
> 
> [gcc]
> 
> 2018-12-03  Paul A. Clarke  <pc@us.ibm.com>
> 
> 	PR target/88316
> 	* config/rs6000/mmintrin.h (_mm_unpackhi_pi8): Fix for big-endian.
> 	(_mm_unpacklo_pi8): Likewise.
> 	(_mm_mulhi_pi16): Likewise.
> 	(_mm_packs_pi16): Fix for big-endian. Use preferred API.
> 	(_mm_packs_pi32): Likewise.
> 	(_mm_packs_pu16): Likewise.
> 	* config/rs6000/xmmintrin.h (_mm_cvtss_si32): Fix for big-endian.
> 	(_mm_cvtss_si64): Likewise.
> 	(_mm_cvtpi32x2_ps): Likewise.
> 	(_mm_shuffle_ps): Likewise.
> 	(_mm_movemask_pi8): Likewise.
> 	(_mm_mulhi_pu16): Likewise.
> 	(_mm_sad_pu8): Likewise.
> 	(_mm_sad_pu8): Likewise.
> 	(_mm_cvtpu16_ps): Fix for big-endian. Use preferred API.
> 	(_mm_cvtpu8_ps): Likewise.
> 	* config/rs6000/emmintrin.h (_mm_movemask_pd): Fix for big-endian.
> 	(_mm_mul_epu32): Likewise.
> 	(_mm_bsrli_si128): Likewise.
> 	(_mm_movemask_epi8): Likewise.
> 	(_mm_shufflehi_epi16): Likewise.
> 	(_mm_shufflelo_epi16): Likewise.
> 	(_mm_shuffle_epi32): Likewise.
> 	* config/rs6000/pmmintrin.h (_mm_hadd_ps): Fix for big-endian.
> 	(_mm_sub_ps): Likewise.
> 	* config/rs6000/mmintrin.h (_mm_cmpeq_pi8): Fix for 32-bit mode.


> @@ -1612,7 +1608,8 @@ _mm_bsrli_si128 (__m128i __A, const int __N)
>    const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
>  
>    if (__N < 16)
> -    if (__builtin_constant_p(__N))
> +    if (__builtin_constant_p(__N) &&
> +        __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

Please just use __LITTLE_ENDIAN__, as the rest of these files already does.
(More times in this patch; also BIG).

Okay for trunk with that fixed.  Thanks!

Do you have new testcases, too?  Or is all this caught by existing
testcases?


Segher
Paul A. Clarke Dec. 4, 2018, 8:33 p.m. UTC | #2
On 12/04/2018 02:16 PM, Segher Boessenkool wrote:
> Hi!
> 
> On Tue, Dec 04, 2018 at 08:59:03AM -0600, Paul Clarke wrote:
>> Fix general endian and 32-bit mode issues found in the
>> compatibility implementations of the x86 vector intrinsics when running the
>> associated test suite tests.  (The tests had been inadvertently made to PASS
>> without actually running the test code.  A later patch fixes this issue.)
>>
>> In a few cases, the opportunity was taken to update the vector API used in
>> the implementations to the preferred functions from the
>> OpenPOWER 64-Bit ELF V2 ABI Specification.
>>
>> [gcc]
>>
>> 2018-12-03  Paul A. Clarke  <pc@us.ibm.com>
>>
>> 	PR target/88316
>> 	* config/rs6000/mmintrin.h (_mm_unpackhi_pi8): Fix for big-endian.
>> 	(_mm_unpacklo_pi8): Likewise.
>> 	(_mm_mulhi_pi16): Likewise.
>> 	(_mm_packs_pi16): Fix for big-endian. Use preferred API.
>> 	(_mm_packs_pi32): Likewise.
>> 	(_mm_packs_pu16): Likewise.
>> 	* config/rs6000/xmmintrin.h (_mm_cvtss_si32): Fix for big-endian.
>> 	(_mm_cvtss_si64): Likewise.
>> 	(_mm_cvtpi32x2_ps): Likewise.
>> 	(_mm_shuffle_ps): Likewise.
>> 	(_mm_movemask_pi8): Likewise.
>> 	(_mm_mulhi_pu16): Likewise.
>> 	(_mm_sad_pu8): Likewise.
>> 	(_mm_sad_pu8): Likewise.
>> 	(_mm_cvtpu16_ps): Fix for big-endian. Use preferred API.
>> 	(_mm_cvtpu8_ps): Likewise.
>> 	* config/rs6000/emmintrin.h (_mm_movemask_pd): Fix for big-endian.
>> 	(_mm_mul_epu32): Likewise.
>> 	(_mm_bsrli_si128): Likewise.
>> 	(_mm_movemask_epi8): Likewise.
>> 	(_mm_shufflehi_epi16): Likewise.
>> 	(_mm_shufflelo_epi16): Likewise.
>> 	(_mm_shuffle_epi32): Likewise.
>> 	* config/rs6000/pmmintrin.h (_mm_hadd_ps): Fix for big-endian.
>> 	(_mm_sub_ps): Likewise.
>> 	* config/rs6000/mmintrin.h (_mm_cmpeq_pi8): Fix for 32-bit mode.
> 
> 
>> @@ -1612,7 +1608,8 @@ _mm_bsrli_si128 (__m128i __A, const int __N)
>>    const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
>>  
>>    if (__N < 16)
>> -    if (__builtin_constant_p(__N))
>> +    if (__builtin_constant_p(__N) &&
>> +        __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
> 
> Please just use __LITTLE_ENDIAN__, as the rest of these files already does.
> (More times in this patch; also BIG).

OK.  I was using the ORDER macros based on the GCC documentation at https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html, which mentions them and does not mention the shorter ENDIAN boolean macros.

> Okay for trunk with that fixed.  Thanks!

Thanks for the review!

> Do you have new testcases, too?  Or is all this caught by existing
> testcases?

Same testcases.  They catch a lot more bugs when they actually run.

PC
Segher Boessenkool Dec. 4, 2018, 11 p.m. UTC | #3
On Tue, Dec 04, 2018 at 02:33:59PM -0600, Paul Clarke wrote:
> On 12/04/2018 02:16 PM, Segher Boessenkool wrote:
> > On Tue, Dec 04, 2018 at 08:59:03AM -0600, Paul Clarke wrote:
> >> @@ -1612,7 +1608,8 @@ _mm_bsrli_si128 (__m128i __A, const int __N)
> >>    const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
> >>  
> >>    if (__N < 16)
> >> -    if (__builtin_constant_p(__N))
> >> +    if (__builtin_constant_p(__N) &&
> >> +        __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
> > 
> > Please just use __LITTLE_ENDIAN__, as the rest of these files already does.
> > (More times in this patch; also BIG).
> 
> OK.  I was using the ORDER macros based on the GCC documentation at https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html, which mentions them and does not mention the shorter ENDIAN boolean macros.

Those are PowerPC ABI, or at least, GCC has defined them in rs6000.c since
forever and a bit.  The *mm headers already used them, so let's stick to
this one style please.

> > Do you have new testcases, too?  Or is all this caught by existing
> > testcases?
> 
> Same testcases.  They catch a lot more bugs when they actually run.

:-)


Segher
diff mbox series

Patch

Index: gcc/config/rs6000/emmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/emmintrin.h b/trunk/gcc/config/rs6000/emmintrin.h
--- a/trunk/gcc/config/rs6000/emmintrin.h	(revision 266157)
+++ b/trunk/gcc/config/rs6000/emmintrin.h	(working copy)
@@ -1237,7 +1237,7 @@  _mm_movemask_pd (__m128d  __A)
 #ifdef __LITTLE_ENDIAN__
 	0x80800040, 0x80808080, 0x80808080, 0x80808080
 #elif __BIG_ENDIAN__
-      0x80808080, 0x80808080, 0x80808080, 0x80800040
+      0x80808080, 0x80808080, 0x80808080, 0x80804000
 #endif
     };
 
@@ -1483,12 +1483,8 @@  _mm_mul_epu32 (__m128i __A, __m128i __B)
 #endif
   return (__m128i) result;
 #else
-#ifdef __LITTLE_ENDIAN__
   return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
-#elif __BIG_ENDIAN__
-  return (__m128i) vec_mulo ((__v4su)__A, (__v4su)__B);
 #endif
-#endif
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1612,7 +1608,8 @@  _mm_bsrli_si128 (__m128i __A, const int __N)
   const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 
   if (__N < 16)
-    if (__builtin_constant_p(__N))
+    if (__builtin_constant_p(__N) &&
+        __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
       /* Would like to use Vector Shift Left Double by Octet
 	 Immediate here to use the immediate form and avoid
 	 load of __N * 8 value into a separate VR.  */
@@ -1620,7 +1617,11 @@  _mm_bsrli_si128 (__m128i __A, const int __N)
     else
       {
 	__v16qu shift = vec_splats((unsigned char)(__N*8));
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 	result = vec_sro ((__v16qu)__A, shift);
+#else
+	result = vec_slo ((__v16qu)__A, shift);
+#endif
       }
   else
     result = zeros;
@@ -2026,13 +2027,8 @@  _mm_movemask_epi8 (__m128i __A)
   __vector unsigned long long result;
   static const __vector unsigned char perm_mask =
     {
-#ifdef __LITTLE_ENDIAN__
 	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
 	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
-#elif __BIG_ENDIAN__
-	0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
-	0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
-#endif
     };
 
   result = ((__vector unsigned long long)
@@ -2078,34 +2074,23 @@  _mm_shufflehi_epi16 (__m128i __A, const int __mask
 #ifdef __LITTLE_ENDIAN__
 	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
 #elif __BIG_ENDIAN__
-	      0x0607, 0x0405, 0x0203, 0x0001
+	      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
 #endif
     };
   __v2du pmask =
 #ifdef __LITTLE_ENDIAN__
-      { 0x1716151413121110UL,  0x1f1e1d1c1b1a1918UL};
+      { 0x1716151413121110UL,  0UL};
 #elif __BIG_ENDIAN__
-      { 0x1011121314151617UL,  0x18191a1b1c1d1e1fUL};
+      { 0x1011121314151617UL,  0UL};
 #endif
   __m64_union t;
   __v2du a, r;
 
-#ifdef __LITTLE_ENDIAN__
   t.as_short[0] = permute_selectors[element_selector_98];
   t.as_short[1] = permute_selectors[element_selector_BA];
   t.as_short[2] = permute_selectors[element_selector_DC];
   t.as_short[3] = permute_selectors[element_selector_FE];
-#elif __BIG_ENDIAN__
-  t.as_short[3] = permute_selectors[element_selector_98];
-  t.as_short[2] = permute_selectors[element_selector_BA];
-  t.as_short[1] = permute_selectors[element_selector_DC];
-  t.as_short[0] = permute_selectors[element_selector_FE];
-#endif
-#ifdef __LITTLE_ENDIAN__
   pmask[1] = t.as_m64;
-#elif __BIG_ENDIAN__
-  pmask[0] = t.as_m64;
-#endif
   a = (__v2du)__A;
   r = vec_perm (a, a, (__vector unsigned char)pmask);
   return (__m128i) r;
@@ -2122,30 +2107,23 @@  _mm_shufflelo_epi16 (__m128i __A, const int __mask
     {
 #ifdef __LITTLE_ENDIAN__
 	      0x0100, 0x0302, 0x0504, 0x0706
-#elif __BIG_ENDIAN__
-	      0x0e0f, 0x0c0d, 0x0a0b, 0x0809
+#else
+	      0x0001, 0x0203, 0x0405, 0x0607
 #endif
     };
-  __v2du pmask = { 0x1011121314151617UL,  0x1f1e1d1c1b1a1918UL};
+  __v2du pmask =
+#ifdef __LITTLE_ENDIAN__
+                 { 0UL,  0x1f1e1d1c1b1a1918UL};
+#else
+                 { 0UL,  0x18191a1b1c1d1e1fUL};
+#endif
   __m64_union t;
   __v2du a, r;
-
-#ifdef __LITTLE_ENDIAN__
   t.as_short[0] = permute_selectors[element_selector_10];
   t.as_short[1] = permute_selectors[element_selector_32];
   t.as_short[2] = permute_selectors[element_selector_54];
   t.as_short[3] = permute_selectors[element_selector_76];
-#elif __BIG_ENDIAN__
-  t.as_short[3] = permute_selectors[element_selector_10];
-  t.as_short[2] = permute_selectors[element_selector_32];
-  t.as_short[1] = permute_selectors[element_selector_54];
-  t.as_short[0] = permute_selectors[element_selector_76];
-#endif
-#ifdef __LITTLE_ENDIAN__
   pmask[0] = t.as_m64;
-#elif __BIG_ENDIAN__
-  pmask[1] = t.as_m64;
-#endif
   a = (__v2du)__A;
   r = vec_perm (a, a, (__vector unsigned char)pmask);
   return (__m128i) r;
@@ -2163,22 +2141,15 @@  _mm_shuffle_epi32 (__m128i __A, const int __mask)
 #ifdef __LITTLE_ENDIAN__
 	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
 #elif __BIG_ENDIAN__
-      0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
+      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
 #endif
     };
   __v4su t;
 
-#ifdef __LITTLE_ENDIAN__
   t[0] = permute_selectors[element_selector_10];
   t[1] = permute_selectors[element_selector_32];
   t[2] = permute_selectors[element_selector_54] + 0x10101010;
   t[3] = permute_selectors[element_selector_76] + 0x10101010;
-#elif __BIG_ENDIAN__
-  t[3] = permute_selectors[element_selector_10] + 0x10101010;
-  t[2] = permute_selectors[element_selector_32] + 0x10101010;
-  t[1] = permute_selectors[element_selector_54];
-  t[0] = permute_selectors[element_selector_76];
-#endif
   return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
 }
 
Index: gcc/config/rs6000/mmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/mmintrin.h b/trunk/gcc/config/rs6000/mmintrin.h
--- a/trunk/gcc/config/rs6000/mmintrin.h	(revision 266157)
+++ b/trunk/gcc/config/rs6000/mmintrin.h	(working copy)
@@ -172,8 +172,13 @@  _mm_packs_pi16 (__m64 __m1, __m64 __m2)
   __vector signed short vm1;
   __vector signed char vresult;
 
-  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
-  vresult = vec_vpkshss (vm1, vm1);
+  vm1 = (__vector signed short) (__vector unsigned long long)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+        { __m1, __m2 };
+#else
+        { __m2, __m1 };
+#endif
+  vresult = vec_packs (vm1, vm1);
   return (__m64) ((__vector long long) vresult)[0];
 }
 
@@ -192,8 +197,13 @@  _mm_packs_pi32 (__m64 __m1, __m64 __m2)
   __vector signed int vm1;
   __vector signed short vresult;
 
-  vm1 = (__vector signed int) (__vector unsigned long long) { __m2, __m1 };
-  vresult = vec_vpkswss (vm1, vm1);
+  vm1 = (__vector signed int) (__vector unsigned long long)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+        { __m1, __m2 };
+#else
+        { __m2, __m1 };
+#endif
+  vresult = vec_packs (vm1, vm1);
   return (__m64) ((__vector long long) vresult)[0];
 }
 
@@ -209,12 +219,19 @@  _m_packssdw (__m64 __m1, __m64 __m2)
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
 {
-  __vector signed short vm1;
-  __vector unsigned char vresult;
-
-  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
-  vresult = vec_vpkshus (vm1, vm1);
-  return (__m64) ((__vector long long) vresult)[0];
+  __vector unsigned char r;
+  __vector signed short vm1 = (__vector signed short) (__vector long long)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+        { __m1, __m2 };
+#else
+        { __m2, __m1 };
+#endif
+  const __vector signed short __zero = { 0 };
+  __vector __bool short __select = vec_cmplt (vm1, __zero);
+  r = vec_packs ((vector unsigned short) vm1, (vector unsigned short) vm1);
+  __vector __bool char packsel = vec_pack (__select, __select);
+  r = vec_sel (r, (const vector unsigned char) __zero, packsel);
+  return (__m64) ((__vector long long) r)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -235,7 +252,7 @@  _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
   a = (__vector unsigned char)vec_splats (__m1);
   b = (__vector unsigned char)vec_splats (__m2);
   c = vec_mergel (a, b);
-  return (__m64) ((__vector long long) c)[0];
+  return (__m64) ((__vector long long) c)[1];
 #else
   __m64_union m1, m2, res;
 
@@ -316,7 +333,7 @@  _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
   a = (__vector unsigned char)vec_splats (__m1);
   b = (__vector unsigned char)vec_splats (__m2);
   c = vec_mergel (a, b);
-  return (__m64) ((__vector long long) c)[1];
+  return (__m64) ((__vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -710,7 +727,7 @@  _mm_setzero_si64 (void)
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
 {
-#ifdef _ARCH_PWR6
+#if defined(_ARCH_PWR6) && defined(__powerpc64__)
   __m64 res;
   __asm__(
       "cmpb %0,%1,%2;\n"
@@ -1084,8 +1101,13 @@  _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
   __vector signed short c;
   __vector signed int w0, w1;
   __vector unsigned char xform1 = {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
       0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
+#endif
     };
 
   a = (__vector signed short)vec_splats (__m1);
Index: gcc/config/rs6000/pmmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/pmmintrin.h b/trunk/gcc/config/rs6000/pmmintrin.h
--- a/trunk/gcc/config/rs6000/pmmintrin.h	(revision 266157)
+++ b/trunk/gcc/config/rs6000/pmmintrin.h	(working copy)
@@ -75,18 +75,16 @@  extern __inline __m128 __attribute__((__gnu_inline
 _mm_hadd_ps (__m128 __X, __m128 __Y)
 {
   __vector unsigned char xform2 = {
-    #ifdef __LITTLE_ENDIAN__
-      0x00, 0x01, 0x02, 0x03,  0x08, 0x09, 0x0A, 0x0B,  0x10, 0x11, 0x12, 0x13,  0x18, 0x19, 0x1A, 0x1B
-    #elif __BIG_ENDIAN__
-      0x14, 0x15, 0x16, 0x17,  0x1C, 0x1D, 0x1E, 0x1F,  0x04, 0x05, 0x06, 0x07,  0x0C, 0x0D, 0x0E, 0x0F
-    #endif
+      0x00, 0x01, 0x02, 0x03,
+      0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13,
+      0x18, 0x19, 0x1A, 0x1B
     };
   __vector unsigned char xform1 = {
-    #ifdef __LITTLE_ENDIAN__
-      0x04, 0x05, 0x06, 0x07,  0x0C, 0x0D, 0x0E, 0x0F,  0x14, 0x15, 0x16, 0x17,  0x1C, 0x1D, 0x1E, 0x1F
-    #elif __BIG_ENDIAN__
-      0x10, 0x11, 0x12, 0x13,  0x18, 0x19, 0x1A, 0x1B,  0x00, 0x01, 0x02, 0x03,  0x08, 0x09, 0x0A, 0x0B
-    #endif
+      0x04, 0x05, 0x06, 0x07,
+      0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17,
+      0x1C, 0x1D, 0x1E, 0x1F
     };
   return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
 			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
@@ -96,18 +94,16 @@  extern __inline __m128 __attribute__((__gnu_inline
 _mm_hsub_ps (__m128 __X, __m128 __Y)
 {
   __vector unsigned char xform2 = {
-    #ifdef __LITTLE_ENDIAN__
-      0x00, 0x01, 0x02, 0x03,  0x08, 0x09, 0x0A, 0x0B,  0x10, 0x11, 0x12, 0x13,  0x18, 0x19, 0x1A, 0x1B
-    #elif __BIG_ENDIAN__
-      0x14, 0x15, 0x16, 0x17,  0x1C, 0x1D, 0x1E, 0x1F,  0x04, 0x05, 0x06, 0x07,  0x0C, 0x0D, 0x0E, 0x0F
-    #endif
+      0x00, 0x01, 0x02, 0x03,
+      0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13,
+      0x18, 0x19, 0x1A, 0x1B
     };
   __vector unsigned char xform1 = {
-    #ifdef __LITTLE_ENDIAN__
-      0x04, 0x05, 0x06, 0x07,  0x0C, 0x0D, 0x0E, 0x0F,  0x14, 0x15, 0x16, 0x17,  0x1C, 0x1D, 0x1E, 0x1F
-    #elif __BIG_ENDIAN__
-      0x10, 0x11, 0x12, 0x13,  0x18, 0x19, 0x1A, 0x1B,  0x00, 0x01, 0x02, 0x03,  0x08, 0x09, 0x0A, 0x0B
-    #endif
+      0x04, 0x05, 0x06, 0x07,
+      0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17,
+      0x1C, 0x1D, 0x1E, 0x1F
     };
   return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
 			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
Index: gcc/config/rs6000/xmmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/xmmintrin.h b/trunk/gcc/config/rs6000/xmmintrin.h
--- a/trunk/gcc/config/rs6000/xmmintrin.h	(revision 266157)
+++ b/trunk/gcc/config/rs6000/xmmintrin.h	(working copy)
@@ -907,17 +907,17 @@  _mm_cvtss_si32 (__m128 __A)
 {
   __m64 res = 0;
 #ifdef _ARCH_PWR8
-  __m128 vtmp;
   double dtmp;
   __asm__(
-      "xxsldwi %x1,%x3,%x3,3;\n"
-      "xscvspdp %x2,%x1;\n"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
       "fctiw  %2,%2;\n"
-      "mfvsrd  %0,%x2;\n"
-      : "=r" (res),
-      	"=&wa" (vtmp),
+      "mfvsrd  %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
         "=f" (dtmp)
-      : "wa" (__A)
       : );
 #else
   res = __builtin_rint(__A[0]);
@@ -940,17 +940,17 @@  _mm_cvtss_si64 (__m128 __A)
 {
   __m64 res = 0;
 #ifdef _ARCH_PWR8
-  __m128 vtmp;
   double dtmp;
   __asm__(
-      "xxsldwi %x1,%x3,%x3,3;\n"
-      "xscvspdp %x2,%x1;\n"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
       "fctid  %2,%2;\n"
-      "mfvsrd  %0,%x2;\n"
-      : "=r" (res),
-        "=&wa" (vtmp),
+      "mfvsrd  %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
         "=f" (dtmp)
-      : "wa" (__A)
       : );
 #else
   res = __builtin_llrint(__A[0]);
@@ -1148,7 +1148,12 @@  _mm_cvtpu16_ps (__m64 __A)
   __vector float vf1;
 
   vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
-  vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
+  vi4 = (__vector unsigned int) vec_mergel
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+                                           (vs8, zero);
+#else
+                                           (zero, vs8);
+#endif
   vf1 = (__vector float) vec_ctf (vi4, 0);
 
   return (__m128) vf1;
@@ -1184,9 +1189,15 @@  _mm_cvtpu8_ps (__m64  __A)
   __vector float vf1;
 
   vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
-  vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
-  vi4 = (__vector unsigned int) vec_vmrghh (vs8,
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
+  vi4 = (__vector unsigned int) vec_mergeh (vs8,
 					    (__vector unsigned short) zero);
+#else
+  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
+  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
+                                            vs8);
+#endif
   vf1 = (__vector float) vec_ctf (vi4, 0);
 
   return (__m128) vf1;
@@ -1199,7 +1210,7 @@  _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
   __vector signed int vi4;
   __vector float vf4;
 
-  vi4 = (__vector signed int) (__vector unsigned long long) { __B, __A };
+  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
   vf4 = (__vector float) vec_ctf (vi4, 0);
   return (__m128) vf4;
 }
@@ -1250,22 +1261,15 @@  _mm_shuffle_ps (__m128  __A, __m128  __B, int cons
 #ifdef __LITTLE_ENDIAN__
       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
 #elif __BIG_ENDIAN__
-      0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
+      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
 #endif
     };
   __vector unsigned int t;
 
-#ifdef __LITTLE_ENDIAN__
   t[0] = permute_selectors[element_selector_10];
   t[1] = permute_selectors[element_selector_32];
   t[2] = permute_selectors[element_selector_54] + 0x10101010;
   t[3] = permute_selectors[element_selector_76] + 0x10101010;
-#elif __BIG_ENDIAN__
-  t[3] = permute_selectors[element_selector_10] + 0x10101010;
-  t[2] = permute_selectors[element_selector_32] + 0x10101010;
-  t[1] = permute_selectors[element_selector_54];
-  t[0] = permute_selectors[element_selector_76];
-#endif
   return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
 }
 
@@ -1573,8 +1577,12 @@  _m_pminub (__m64 __A, __m64 __B)
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movemask_pi8 (__m64 __A)
 {
-  unsigned long long p = 0x0008101820283038UL; // permute control for sign bits
-
+  unsigned long long p =
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+                         0x0008101820283038UL; // permute control for sign bits
+#else
+                         0x3830282018100800UL; // permute control for sign bits
+#endif
   return __builtin_bpermd (p, __A);
 }
 
@@ -1593,8 +1601,13 @@  _mm_mulhi_pu16 (__m64 __A, __m64 __B)
   __vector unsigned short c;
   __vector unsigned int w0, w1;
   __vector unsigned char xform1 = {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
       0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
+#endif
     };
 
   a = (__vector unsigned short)vec_splats (__A);
@@ -1725,7 +1738,7 @@  _mm_sad_pu8 (__m64  __A, __m64  __B)
   __vector signed int vsum;
   const __vector unsigned int zero =
     { 0, 0, 0, 0 };
-  unsigned short result;
+  __m64_union result = {0};
 
   a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
   b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
@@ -1738,8 +1751,8 @@  _mm_sad_pu8 (__m64  __A, __m64  __B)
   vsum = vec_sums (vsum, (__vector signed int) zero);
   /* The sum is in the right most 32-bits of the vector result.
      Transfer to a GPR and truncate to 16 bits.  */
-  result = vsum[3];
-  return (result);
+  result.as_short[0] = vsum[3];
+  return result.as_m64;
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))