===================================================================
@@ -1237,7 +1237,7 @@ _mm_movemask_pd (__m128d __A)
#ifdef __LITTLE_ENDIAN__
0x80800040, 0x80808080, 0x80808080, 0x80808080
#elif __BIG_ENDIAN__
- 0x80808080, 0x80808080, 0x80808080, 0x80800040
+ 0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
};
@@ -1483,12 +1483,8 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
#endif
return (__m128i) result;
#else
-#ifdef __LITTLE_ENDIAN__
return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
-#elif __BIG_ENDIAN__
- return (__m128i) vec_mulo ((__v4su)__A, (__v4su)__B);
#endif
-#endif
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
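
vec_mule is now used for both byte orders.  For reference, a scalar sketch of
the Intel-defined _mm_mul_epu32 behaviour being emulated (illustrative only;
the helper below is hypothetical and not part of the patch):

/* Multiply the low unsigned 32-bit element of each 64-bit lane, keeping
   the full 64-bit products.  */
static void
mul_epu32_ref (unsigned long long dst[2],
               const unsigned int a[4], const unsigned int b[4])
{
  dst[0] = (unsigned long long) a[0] * b[0];  /* x86 elements 0 and 2 */
  dst[1] = (unsigned long long) a[2] * b[2];
}
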
@@ -1612,7 +1608,8 @@ _mm_bsrli_si128 (__m128i __A, const int __N)
const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
if (__N < 16)
- if (__builtin_constant_p(__N))
+ if (__builtin_constant_p(__N) &&
+ __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
/* Would like to use Vector Shift Left Double by Octet
Immediate here to use the immediate form and avoid
load of __N * 8 value into a separate VR. */
@@ -1620,7 +1617,11 @@ _mm_bsrli_si128 (__m128i __A, const int __N)
else
{
__v16qu shift = vec_splats((unsigned char)(__N*8));
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
result = vec_sro ((__v16qu)__A, shift);
+#else
+ result = vec_slo ((__v16qu)__A, shift);
+#endif
}
else
result = zeros;
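
The constant-__N path is now limited to little-endian, and the variable path
picks vec_sro or vec_slo by byte order, since the same x86 byte shift needs
opposite octet-shift directions in the two layouts.  A scalar sketch of the
intended result (illustrative only, hypothetical helper):

/* Shift the 128-bit value right by n bytes; byte 0 is the least
   significant byte, as on x86.  */
static void
bsrli_si128_ref (unsigned char dst[16], const unsigned char src[16], int n)
{
  for (int i = 0; i < 16; i++)
    dst[i] = (i + n < 16) ? src[i + n] : 0;
}
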
@@ -2026,13 +2027,8 @@ _mm_movemask_epi8 (__m128i __A)
__vector unsigned long long result;
static const __vector unsigned char perm_mask =
{
-#ifdef __LITTLE_ENDIAN__
0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
-#elif __BIG_ENDIAN__
- 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
- 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
-#endif
};
result = ((__vector unsigned long long)
@@ -2078,34 +2074,23 @@ _mm_shufflehi_epi16 (__m128i __A, const int __mask
#ifdef __LITTLE_ENDIAN__
0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#elif __BIG_ENDIAN__
- 0x0607, 0x0405, 0x0203, 0x0001
+ 0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
};
__v2du pmask =
#ifdef __LITTLE_ENDIAN__
- { 0x1716151413121110UL, 0x1f1e1d1c1b1a1918UL};
+ { 0x1716151413121110UL, 0UL};
#elif __BIG_ENDIAN__
- { 0x1011121314151617UL, 0x18191a1b1c1d1e1fUL};
+ { 0x1011121314151617UL, 0UL};
#endif
__m64_union t;
__v2du a, r;
-#ifdef __LITTLE_ENDIAN__
t.as_short[0] = permute_selectors[element_selector_98];
t.as_short[1] = permute_selectors[element_selector_BA];
t.as_short[2] = permute_selectors[element_selector_DC];
t.as_short[3] = permute_selectors[element_selector_FE];
-#elif __BIG_ENDIAN__
- t.as_short[3] = permute_selectors[element_selector_98];
- t.as_short[2] = permute_selectors[element_selector_BA];
- t.as_short[1] = permute_selectors[element_selector_DC];
- t.as_short[0] = permute_selectors[element_selector_FE];
-#endif
-#ifdef __LITTLE_ENDIAN__
pmask[1] = t.as_m64;
-#elif __BIG_ENDIAN__
- pmask[0] = t.as_m64;
-#endif
a = (__v2du)__A;
r = vec_perm (a, a, (__vector unsigned char)pmask);
return (__m128i) r;
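
The selector table and pmask now encode each byte order directly: the first
doubleword of pmask is an identity move of bytes 0-7 (indices 0x10-0x17 name
bytes 0-7 of the second vec_perm operand, which is again __A), and the second
doubleword is filled from the table.  For reference, the Intel semantics being
modelled (illustrative sketch, hypothetical helper):

/* The low four 16-bit elements pass through; each high element is chosen
   from the high half of the source by a 2-bit field of the mask.  */
static void
shufflehi_epi16_ref (unsigned short dst[8], const unsigned short src[8],
                     int mask)
{
  for (int i = 0; i < 4; i++)
    dst[i] = src[i];
  for (int i = 0; i < 4; i++)
    dst[4 + i] = src[4 + ((mask >> (2 * i)) & 3)];
}
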
@@ -2122,30 +2107,23 @@ _mm_shufflelo_epi16 (__m128i __A, const int __mask
{
#ifdef __LITTLE_ENDIAN__
0x0100, 0x0302, 0x0504, 0x0706
-#elif __BIG_ENDIAN__
- 0x0e0f, 0x0c0d, 0x0a0b, 0x0809
+#else
+ 0x0001, 0x0203, 0x0405, 0x0607
#endif
};
- __v2du pmask = { 0x1011121314151617UL, 0x1f1e1d1c1b1a1918UL};
+ __v2du pmask =
+#ifdef __LITTLE_ENDIAN__
+ { 0UL, 0x1f1e1d1c1b1a1918UL};
+#else
+ { 0UL, 0x18191a1b1c1d1e1fUL};
+#endif
__m64_union t;
__v2du a, r;
-
-#ifdef __LITTLE_ENDIAN__
t.as_short[0] = permute_selectors[element_selector_10];
t.as_short[1] = permute_selectors[element_selector_32];
t.as_short[2] = permute_selectors[element_selector_54];
t.as_short[3] = permute_selectors[element_selector_76];
-#elif __BIG_ENDIAN__
- t.as_short[3] = permute_selectors[element_selector_10];
- t.as_short[2] = permute_selectors[element_selector_32];
- t.as_short[1] = permute_selectors[element_selector_54];
- t.as_short[0] = permute_selectors[element_selector_76];
-#endif
-#ifdef __LITTLE_ENDIAN__
pmask[0] = t.as_m64;
-#elif __BIG_ENDIAN__
- pmask[1] = t.as_m64;
-#endif
a = (__v2du)__A;
r = vec_perm (a, a, (__vector unsigned char)pmask);
return (__m128i) r;
@@ -2163,22 +2141,15 @@ _mm_shuffle_epi32 (__m128i __A, const int __mask)
#ifdef __LITTLE_ENDIAN__
0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#elif __BIG_ENDIAN__
- 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
+ 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
};
__v4su t;
-#ifdef __LITTLE_ENDIAN__
t[0] = permute_selectors[element_selector_10];
t[1] = permute_selectors[element_selector_32];
t[2] = permute_selectors[element_selector_54] + 0x10101010;
t[3] = permute_selectors[element_selector_76] + 0x10101010;
-#elif __BIG_ENDIAN__
- t[3] = permute_selectors[element_selector_10] + 0x10101010;
- t[2] = permute_selectors[element_selector_32] + 0x10101010;
- t[1] = permute_selectors[element_selector_54];
- t[0] = permute_selectors[element_selector_76];
-#endif
return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
}
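
The 0x10101010 bias moves the upper two selectors into the 0x10-0x1F range,
which addresses the second vec_perm operand; both operands are __A, so the
same data is selected either way.  A scalar sketch of the Intel semantics
(illustrative only, hypothetical helper):

/* Every 32-bit element of the result is picked from the source by a
   2-bit field of the mask.  */
static void
shuffle_epi32_ref (unsigned int dst[4], const unsigned int src[4], int mask)
{
  for (int i = 0; i < 4; i++)
    dst[i] = src[(mask >> (2 * i)) & 3];
}
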
===================================================================
@@ -172,8 +172,13 @@ _mm_packs_pi16 (__m64 __m1, __m64 __m2)
__vector signed short vm1;
__vector signed char vresult;
- vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
- vresult = vec_vpkshss (vm1, vm1);
+ vm1 = (__vector signed short) (__vector unsigned long long)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ { __m1, __m2 };
+#else
+ { __m2, __m1 };
+#endif
+ vresult = vec_packs (vm1, vm1);
return (__m64) ((__vector long long) vresult)[0];
}
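
vec_vpkshss is replaced by the generic vec_packs, and the order of the two
doublewords in the concatenation now follows the byte order.  Per-lane
reference for the signed saturation (illustrative sketch, hypothetical helper):

/* Each signed 16-bit value is clamped to the signed 8-bit range.  */
static signed char
packs_pi16_ref (short x)
{
  if (x < -128)
    return -128;
  if (x > 127)
    return 127;
  return (signed char) x;
}
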
@@ -192,8 +197,13 @@ _mm_packs_pi32 (__m64 __m1, __m64 __m2)
__vector signed int vm1;
__vector signed short vresult;
- vm1 = (__vector signed int) (__vector unsigned long long) { __m2, __m1 };
- vresult = vec_vpkswss (vm1, vm1);
+ vm1 = (__vector signed int) (__vector unsigned long long)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ { __m1, __m2 };
+#else
+ { __m2, __m1 };
+#endif
+ vresult = vec_packs (vm1, vm1);
return (__m64) ((__vector long long) vresult)[0];
}
@@ -209,12 +219,19 @@ _m_packssdw (__m64 __m1, __m64 __m2)
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
- __vector signed short vm1;
- __vector unsigned char vresult;
-
- vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
- vresult = vec_vpkshus (vm1, vm1);
- return (__m64) ((__vector long long) vresult)[0];
+ __vector unsigned char r;
+ __vector signed short vm1 = (__vector signed short) (__vector long long)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ { __m1, __m2 };
+#else
+ { __m2, __m1 };
+#endif
+ const __vector signed short __zero = { 0 };
+ __vector __bool short __select = vec_cmplt (vm1, __zero);
+ r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1);
+ __vector __bool char packsel = vec_pack (__select, __select);
+ r = vec_sel (r, (const __vector unsigned char) __zero, packsel);
+ return (__m64) ((__vector long long) r)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
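
The rewrite packs with unsigned saturation and then zeroes the lanes that were
negative: a plain unsigned pack would clamp -1 (0xFFFF) to 0xFF, while the x86
instruction produces 0 for any negative input.  Per-lane reference
(illustrative sketch, hypothetical helper):

/* Signed 16-bit inputs are clamped to the unsigned 8-bit range, so
   negative values become 0.  */
static unsigned char
packs_pu16_ref (short x)
{
  if (x < 0)
    return 0;
  if (x > 255)
    return 255;
  return (unsigned char) x;
}
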
@@ -235,7 +252,7 @@ _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
a = (__vector unsigned char)vec_splats (__m1);
b = (__vector unsigned char)vec_splats (__m2);
c = vec_mergel (a, b);
- return (__m64) ((__vector long long) c)[0];
+ return (__m64) ((__vector long long) c)[1];
#else
__m64_union m1, m2, res;
@@ -316,7 +333,7 @@ _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
a = (__vector unsigned char)vec_splats (__m1);
b = (__vector unsigned char)vec_splats (__m2);
c = vec_mergel (a, b);
- return (__m64) ((__vector long long) c)[1];
+ return (__m64) ((__vector long long) c)[0];
#else
__m64_union m1, m2, res;
@@ -710,7 +727,7 @@ _mm_setzero_si64 (void)
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
-#ifdef _ARCH_PWR6
+#if defined(_ARCH_PWR6) && defined(__powerpc64__)
__m64 res;
__asm__(
"cmpb %0,%1,%2;\n"
@@ -1084,8 +1101,13 @@ _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
__vector signed short c;
__vector signed int w0, w1;
__vector unsigned char xform1 = {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
+#else
+ 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
+ 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
+#endif
};
a = (__vector signed short)vec_splats (__m1);
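
The permute constant gathers the high halfword of each 32-bit product produced
by the even/odd multiplies.  Per-lane reference (illustrative sketch,
hypothetical helper):

/* Each lane receives the high 16 bits of the signed 32-bit product.  */
static short
mulhi_pi16_ref (short a, short b)
{
  return (short) (((int) a * (int) b) >> 16);
}
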
===================================================================
@@ -75,18 +75,16 @@ extern __inline __m128 __attribute__((__gnu_inline
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
__vector unsigned char xform2 = {
- #ifdef __LITTLE_ENDIAN__
- 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B
- #elif __BIG_ENDIAN__
- 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F
- #endif
+ 0x00, 0x01, 0x02, 0x03,
+ 0x08, 0x09, 0x0A, 0x0B,
+ 0x10, 0x11, 0x12, 0x13,
+ 0x18, 0x19, 0x1A, 0x1B
};
__vector unsigned char xform1 = {
- #ifdef __LITTLE_ENDIAN__
- 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
- #elif __BIG_ENDIAN__
- 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B
- #endif
+ 0x04, 0x05, 0x06, 0x07,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x14, 0x15, 0x16, 0x17,
+ 0x1C, 0x1D, 0x1E, 0x1F
};
return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
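
One constant now serves both byte orders because the permute moves whole
4-byte elements: xform2 gathers {X[0], X[2], Y[0], Y[2]} and xform1 gathers
{X[1], X[3], Y[1], Y[3]}, and their sum is the horizontal add.  Reference
(illustrative sketch, hypothetical helper):

/* Pairwise sums of X fill the low half of the result, pairwise sums of Y
   the high half.  */
static void
hadd_ps_ref (float dst[4], const float x[4], const float y[4])
{
  dst[0] = x[0] + x[1];
  dst[1] = x[2] + x[3];
  dst[2] = y[0] + y[1];
  dst[3] = y[2] + y[3];
}
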
@@ -96,18 +94,16 @@ extern __inline __m128 __attribute__((__gnu_inline
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
__vector unsigned char xform2 = {
- #ifdef __LITTLE_ENDIAN__
- 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B
- #elif __BIG_ENDIAN__
- 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F
- #endif
+ 0x00, 0x01, 0x02, 0x03,
+ 0x08, 0x09, 0x0A, 0x0B,
+ 0x10, 0x11, 0x12, 0x13,
+ 0x18, 0x19, 0x1A, 0x1B
};
__vector unsigned char xform1 = {
- #ifdef __LITTLE_ENDIAN__
- 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
- #elif __BIG_ENDIAN__
- 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B
- #endif
+ 0x04, 0x05, 0x06, 0x07,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x14, 0x15, 0x16, 0x17,
+ 0x1C, 0x1D, 0x1E, 0x1F
};
return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
===================================================================
@@ -907,17 +907,17 @@ _mm_cvtss_si32 (__m128 __A)
{
__m64 res = 0;
#ifdef _ARCH_PWR8
- __m128 vtmp;
double dtmp;
__asm__(
- "xxsldwi %x1,%x3,%x3,3;\n"
- "xscvspdp %x2,%x1;\n"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+ "xscvspdp %x2,%x0;\n"
"fctiw %2,%2;\n"
- "mfvsrd %0,%x2;\n"
- : "=r" (res),
- "=&wa" (vtmp),
+ "mfvsrd %1,%x2;\n"
+ : "+wa" (__A),
+ "=r" (res),
"=f" (dtmp)
- : "wa" (__A)
: );
#else
res = __builtin_rint(__A[0]);
@@ -940,17 +940,17 @@ _mm_cvtss_si64 (__m128 __A)
{
__m64 res = 0;
#ifdef _ARCH_PWR8
- __m128 vtmp;
double dtmp;
__asm__(
- "xxsldwi %x1,%x3,%x3,3;\n"
- "xscvspdp %x2,%x1;\n"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+ "xscvspdp %x2,%x0;\n"
"fctid %2,%2;\n"
- "mfvsrd %0,%x2;\n"
- : "=r" (res),
- "=&wa" (vtmp),
+ "mfvsrd %1,%x2;\n"
+ : "+wa" (__A),
+ "=r" (res),
"=f" (dtmp)
- : "wa" (__A)
: );
#else
res = __builtin_llrint(__A[0]);
@@ -1148,7 +1148,12 @@ _mm_cvtpu16_ps (__m64 __A)
__vector float vf1;
vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
- vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
+ vi4 = (__vector unsigned int) vec_mergel
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ (vs8, zero);
+#else
+ (zero, vs8);
+#endif
vf1 = (__vector float) vec_ctf (vi4, 0);
return (__m128) vf1;
@@ -1184,9 +1189,15 @@ _mm_cvtpu8_ps (__m64 __A)
__vector float vf1;
vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
- vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
- vi4 = (__vector unsigned int) vec_vmrghh (vs8,
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
+ vi4 = (__vector unsigned int) vec_mergeh (vs8,
(__vector unsigned short) zero);
+#else
+ vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
+ vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
+ vs8);
+#endif
vf1 = (__vector float) vec_ctf (vi4, 0);
return (__m128) vf1;
@@ -1199,7 +1210,7 @@ _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
__vector signed int vi4;
__vector float vf4;
- vi4 = (__vector signed int) (__vector unsigned long long) { __B, __A };
+ vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
vf4 = (__vector float) vec_ctf (vi4, 0);
return (__m128) vf4;
}
@@ -1250,22 +1261,15 @@ _mm_shuffle_ps (__m128 __A, __m128 __B, int cons
#ifdef __LITTLE_ENDIAN__
0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#elif __BIG_ENDIAN__
- 0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
+ 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
};
__vector unsigned int t;
-#ifdef __LITTLE_ENDIAN__
t[0] = permute_selectors[element_selector_10];
t[1] = permute_selectors[element_selector_32];
t[2] = permute_selectors[element_selector_54] + 0x10101010;
t[3] = permute_selectors[element_selector_76] + 0x10101010;
-#elif __BIG_ENDIAN__
- t[3] = permute_selectors[element_selector_10] + 0x10101010;
- t[2] = permute_selectors[element_selector_32] + 0x10101010;
- t[1] = permute_selectors[element_selector_54];
- t[0] = permute_selectors[element_selector_76];
-#endif
return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
}
@@ -1573,8 +1577,12 @@ _m_pminub (__m64 __A, __m64 __B)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
- unsigned long long p = 0x0008101820283038UL; // permute control for sign bits
-
+ unsigned long long p =
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ 0x0008101820283038UL; // permute control for sign bits
+#else
+ 0x3830282018100800UL; // permute control for sign bits
+#endif
return __builtin_bpermd (p, __A);
}
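
__builtin_bpermd gathers the eight bits of __A whose positions appear as
byte-sized indices in p, using the Power ISA bit numbering (bit 0 is the most
significant); the big-endian control is the byte-reversed constant because the
byte order of the __m64 operand is reversed.  Reference (illustrative sketch,
hypothetical helper):

/* Bit i of the result is the sign bit of byte i of the operand, byte 0
   being the least significant.  */
static int
movemask_pi8_ref (unsigned long long m)
{
  int r = 0;
  for (int i = 0; i < 8; i++)
    r |= (int) ((m >> (8 * i + 7)) & 1) << i;
  return r;
}
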
@@ -1593,8 +1601,13 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
__vector unsigned short c;
__vector unsigned int w0, w1;
__vector unsigned char xform1 = {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
+#else
+ 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
+ 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
+#endif
};
a = (__vector unsigned short)vec_splats (__A);
@@ -1725,7 +1738,7 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
__vector signed int vsum;
const __vector unsigned int zero =
{ 0, 0, 0, 0 };
- unsigned short result;
+ __m64_union result = {0};
a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
@@ -1738,8 +1751,8 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
vsum = vec_sums (vsum, (__vector signed int) zero);
/* The sum is in the right most 32-bits of the vector result.
Transfer to a GPR and truncate to 16 bits. */
- result = vsum[3];
- return (result);
+ result.as_short[0] = vsum[3];
+ return result.as_m64;
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
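
The sum is now returned through a zero-initialised __m64_union, so the
halfword written through as_short[0] is the only nonzero part of the returned
__m64; in the x86 definition the sum sits in the low 16 bits and the upper 48
bits are zero.  Reference (illustrative sketch, hypothetical helper):

/* Sum of absolute byte differences, returned in the low 16 bits.  */
static unsigned long long
sad_pu8_ref (const unsigned char a[8], const unsigned char b[8])
{
  unsigned int sum = 0;
  for (int i = 0; i < 8; i++)
    sum += (a[i] > b[i]) ? (a[i] - b[i]) : (b[i] - a[i]);
  return sum;  /* at most 8 * 255, fits in 16 bits */
}
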