@@ -1212,16 +1212,18 @@ (define_expand "altivec_vmrghw"
(use (match_operand:V4SI 2 "register_operand"))]
"VECTOR_MEM_ALTIVEC_P (V4SImode)"
{
- rtx (*fun) (rtx, rtx, rtx);
- fun = BYTES_BIG_ENDIAN ? gen_altivec_vmrghw_direct_v4si
- : gen_altivec_vmrglw_direct_v4si;
- if (!BYTES_BIG_ENDIAN)
- std::swap (operands[1], operands[2]);
- emit_insn (fun (operands[0], operands[1], operands[2]));
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_altivec_vmrghw_direct_v4si_be (operands[0],
+ operands[1],
+ operands[2]));
+ else
+ emit_insn (gen_altivec_vmrglw_direct_v4si_le (operands[0],
+ operands[2],
+ operands[1]));
DONE;
})
-(define_insn "altivec_vmrghw_direct_<mode>"
+(define_insn "altivec_vmrghw_direct_<mode>_be"
[(set (match_operand:VSX_W 0 "register_operand" "=wa,v")
(vec_select:VSX_W
(vec_concat:<VS_double>
@@ -1229,7 +1231,21 @@ (define_insn "altivec_vmrghw_direct_<mode>"
(match_operand:VSX_W 2 "register_operand" "wa,v"))
(parallel [(const_int 0) (const_int 4)
(const_int 1) (const_int 5)])))]
- "TARGET_ALTIVEC"
+ "TARGET_ALTIVEC && BYTES_BIG_ENDIAN"
+ "@
+ xxmrghw %x0,%x1,%x2
+ vmrghw %0,%1,%2"
+ [(set_attr "type" "vecperm")])
+
+(define_insn "altivec_vmrghw_direct_<mode>_le"
+ [(set (match_operand:VSX_W 0 "register_operand" "=wa,v")
+ (vec_select:VSX_W
+ (vec_concat:<VS_double>
+ (match_operand:VSX_W 2 "register_operand" "wa,v")
+ (match_operand:VSX_W 1 "register_operand" "wa,v"))
+ (parallel [(const_int 2) (const_int 6)
+ (const_int 3) (const_int 7)])))]
+ "TARGET_ALTIVEC && !BYTES_BIG_ENDIAN"
"@
xxmrghw %x0,%x1,%x2
vmrghw %0,%1,%2"
@@ -1318,16 +1334,18 @@ (define_expand "altivec_vmrglw"
(use (match_operand:V4SI 2 "register_operand"))]
"VECTOR_MEM_ALTIVEC_P (V4SImode)"
{
- rtx (*fun) (rtx, rtx, rtx);
- fun = BYTES_BIG_ENDIAN ? gen_altivec_vmrglw_direct_v4si
- : gen_altivec_vmrghw_direct_v4si;
- if (!BYTES_BIG_ENDIAN)
- std::swap (operands[1], operands[2]);
- emit_insn (fun (operands[0], operands[1], operands[2]));
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_altivec_vmrglw_direct_v4si_be (operands[0],
+ operands[1],
+ operands[2]));
+ else
+ emit_insn (gen_altivec_vmrghw_direct_v4si_le (operands[0],
+ operands[2],
+ operands[1]));
DONE;
})
-(define_insn "altivec_vmrglw_direct_<mode>"
+(define_insn "altivec_vmrglw_direct_<mode>_be"
[(set (match_operand:VSX_W 0 "register_operand" "=wa,v")
(vec_select:VSX_W
(vec_concat:<VS_double>
@@ -1335,7 +1353,21 @@ (define_insn "altivec_vmrglw_direct_<mode>"
(match_operand:VSX_W 2 "register_operand" "wa,v"))
(parallel [(const_int 2) (const_int 6)
(const_int 3) (const_int 7)])))]
- "TARGET_ALTIVEC"
+ "TARGET_ALTIVEC && BYTES_BIG_ENDIAN"
+ "@
+ xxmrglw %x0,%x1,%x2
+ vmrglw %0,%1,%2"
+ [(set_attr "type" "vecperm")])
+
+(define_insn "altivec_vmrglw_direct_<mode>_le"
+ [(set (match_operand:VSX_W 0 "register_operand" "=wa,v")
+ (vec_select:VSX_W
+ (vec_concat:<VS_double>
+ (match_operand:VSX_W 2 "register_operand" "wa,v")
+ (match_operand:VSX_W 1 "register_operand" "wa,v"))
+ (parallel [(const_int 0) (const_int 4)
+ (const_int 1) (const_int 5)])))]
+ "TARGET_ALTIVEC && !BYTES_BIG_ENDIAN"
"@
xxmrglw %x0,%x1,%x2
vmrglw %0,%1,%2"
@@ -3861,13 +3893,13 @@ (define_expand "vec_widen_umult_hi_v8hi"
{
emit_insn (gen_altivec_vmuleuh (ve, operands[1], operands[2]));
emit_insn (gen_altivec_vmulouh (vo, operands[1], operands[2]));
- emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo));
+ emit_insn (gen_altivec_vmrghw (operands[0], ve, vo));
}
else
{
emit_insn (gen_altivec_vmulouh (ve, operands[1], operands[2]));
emit_insn (gen_altivec_vmuleuh (vo, operands[1], operands[2]));
- emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], vo, ve));
+ emit_insn (gen_altivec_vmrglw (operands[0], ve, vo));
}
DONE;
})
@@ -3886,13 +3918,13 @@ (define_expand "vec_widen_umult_lo_v8hi"
{
emit_insn (gen_altivec_vmuleuh (ve, operands[1], operands[2]));
emit_insn (gen_altivec_vmulouh (vo, operands[1], operands[2]));
- emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], ve, vo));
+ emit_insn (gen_altivec_vmrglw (operands[0], ve, vo));
}
else
{
emit_insn (gen_altivec_vmulouh (ve, operands[1], operands[2]));
emit_insn (gen_altivec_vmuleuh (vo, operands[1], operands[2]));
- emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], vo, ve));
+ emit_insn (gen_altivec_vmrghw (operands[0], ve, vo));
}
DONE;
})
@@ -3911,13 +3943,13 @@ (define_expand "vec_widen_smult_hi_v8hi"
{
emit_insn (gen_altivec_vmulesh (ve, operands[1], operands[2]));
emit_insn (gen_altivec_vmulosh (vo, operands[1], operands[2]));
- emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo));
+ emit_insn (gen_altivec_vmrghw (operands[0], ve, vo));
}
else
{
emit_insn (gen_altivec_vmulosh (ve, operands[1], operands[2]));
emit_insn (gen_altivec_vmulesh (vo, operands[1], operands[2]));
- emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], vo, ve));
+ emit_insn (gen_altivec_vmrglw (operands[0], ve, vo));
}
DONE;
})
@@ -3936,13 +3968,13 @@ (define_expand "vec_widen_smult_lo_v8hi"
{
emit_insn (gen_altivec_vmulesh (ve, operands[1], operands[2]));
emit_insn (gen_altivec_vmulosh (vo, operands[1], operands[2]));
- emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], ve, vo));
+ emit_insn (gen_altivec_vmrglw (operands[0], ve, vo));
}
else
{
emit_insn (gen_altivec_vmulosh (ve, operands[1], operands[2]));
emit_insn (gen_altivec_vmulesh (vo, operands[1], operands[2]));
- emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], vo, ve));
+ emit_insn (gen_altivec_vmrghw (operands[0], ve, vo));
}
DONE;
})
@@ -23450,8 +23450,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
: CODE_FOR_altivec_vmrglh_direct,
{0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23}},
{OPTION_MASK_ALTIVEC,
- BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si
- : CODE_FOR_altivec_vmrglw_direct_v4si,
+ BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si_be
+ : CODE_FOR_altivec_vmrglw_direct_v4si_le,
{0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}},
{OPTION_MASK_ALTIVEC,
BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglb_direct
@@ -23462,8 +23462,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
: CODE_FOR_altivec_vmrghh_direct,
{8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31}},
{OPTION_MASK_ALTIVEC,
- BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglw_direct_v4si
- : CODE_FOR_altivec_vmrghw_direct_v4si,
+ BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglw_direct_v4si_be
+ : CODE_FOR_altivec_vmrghw_direct_v4si_le,
{8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}},
{OPTION_MASK_P8_VECTOR,
BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgew_v4sf_direct
@@ -4798,12 +4798,14 @@ (define_expand "vsx_xxmrghw_<mode>"
(const_int 1) (const_int 5)])))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
{
- rtx (*fun) (rtx, rtx, rtx);
- fun = BYTES_BIG_ENDIAN ? gen_altivec_vmrghw_direct_<mode>
- : gen_altivec_vmrglw_direct_<mode>;
- if (!BYTES_BIG_ENDIAN)
- std::swap (operands[1], operands[2]);
- emit_insn (fun (operands[0], operands[1], operands[2]));
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_altivec_vmrghw_direct_v4si_be (operands[0],
+ operands[1],
+ operands[2]));
+ else
+ emit_insn (gen_altivec_vmrglw_direct_v4si_le (operands[0],
+ operands[2],
+ operands[1]));
DONE;
}
[(set_attr "type" "vecperm")])
@@ -4818,12 +4820,14 @@ (define_expand "vsx_xxmrglw_<mode>"
(const_int 3) (const_int 7)])))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
{
- rtx (*fun) (rtx, rtx, rtx);
- fun = BYTES_BIG_ENDIAN ? gen_altivec_vmrglw_direct_<mode>
- : gen_altivec_vmrghw_direct_<mode>;
- if (!BYTES_BIG_ENDIAN)
- std::swap (operands[1], operands[2]);
- emit_insn (fun (operands[0], operands[1], operands[2]));
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_altivec_vmrglw_direct_v4si_be (operands[0],
+ operands[1],
+ operands[2]));
+ else
+ emit_insn (gen_altivec_vmrghw_direct_v4si_le (operands[0],
+ operands[2],
+ operands[1]));
DONE;
}
[(set_attr "type" "vecperm")])
new file mode 100644
@@ -0,0 +1,119 @@
+/* { dg-options "-O -fno-tree-forwprop -maltivec" } */
+/* { dg-require-effective-target vmx_hw } */
+/* { dg-do run } */
+
+typedef __attribute__ ((altivec (vector__))) unsigned native_simd_type;
+
+union
+{
+ native_simd_type V;
+ int R[4];
+} store_le_vec;
+
+struct S
+{
+ S () = default;
+ S (unsigned B0)
+ {
+ native_simd_type val{B0};
+ m_simd = val;
+ }
+ void store_le (unsigned int out[])
+ {
+ store_le_vec.V = m_simd;
+ unsigned int x0 = store_le_vec.R[0];
+ __builtin_memcpy (out, &x0, 4);
+ }
+ S rotl (unsigned int r)
+ {
+ native_simd_type rot{r};
+ return __builtin_vec_rl (m_simd, rot);
+ }
+ void operator+= (S other)
+ {
+ m_simd = __builtin_vec_add (m_simd, other.m_simd);
+ }
+ void operator^= (S other)
+ {
+ m_simd = __builtin_vec_xor (m_simd, other.m_simd);
+ }
+ static void transpose (S &B0, S B1, S B2, S B3)
+ {
+ native_simd_type T0 = __builtin_vec_mergeh (B0.m_simd, B2.m_simd);
+ native_simd_type T1 = __builtin_vec_mergeh (B1.m_simd, B3.m_simd);
+ native_simd_type T2 = __builtin_vec_mergel (B0.m_simd, B2.m_simd);
+ native_simd_type T3 = __builtin_vec_mergel (B1.m_simd, B3.m_simd);
+ B0 = __builtin_vec_mergeh (T0, T1);
+ B3 = __builtin_vec_mergel (T2, T3);
+ }
+ S (native_simd_type x) : m_simd (x) {}
+ native_simd_type m_simd;
+};
+
+void
+foo (unsigned int output[], unsigned state[])
+{
+ S R00 = state[0];
+ S R01 = state[0];
+ S R02 = state[2];
+ S R03 = state[0];
+ S R05 = state[5];
+ S R06 = state[6];
+ S R07 = state[7];
+ S R08 = state[8];
+ S R09 = state[9];
+ S R10 = state[10];
+ S R11 = state[11];
+ S R12 = state[12];
+ S R13 = state[13];
+ S R14 = state[4];
+ S R15 = state[15];
+ for (int r = 0; r != 10; ++r)
+ {
+ R09 += R13;
+ R11 += R15;
+ R05 ^= R09;
+ R06 ^= R10;
+ R07 ^= R11;
+ R07 = R07.rotl (7);
+ R00 += R05;
+ R01 += R06;
+ R02 += R07;
+ R15 ^= R00;
+ R12 ^= R01;
+ R13 ^= R02;
+ R00 += R05;
+ R01 += R06;
+ R02 += R07;
+ R15 ^= R00;
+ R12 = R12.rotl (8);
+ R13 = R13.rotl (8);
+ R10 += R15;
+ R11 += R12;
+ R08 += R13;
+ R09 += R14;
+ R05 ^= R10;
+ R06 ^= R11;
+ R07 ^= R08;
+ R05 = R05.rotl (7);
+ R06 = R06.rotl (7);
+ R07 = R07.rotl (7);
+ }
+ R00 += state[0];
+ S::transpose (R00, R01, R02, R03);
+ R00.store_le (output);
+}
+
+unsigned int res[1];
+unsigned main_state[]{1634760805, 60878, 2036477234, 6,
+ 0, 825562964, 1471091955, 1346092787,
+ 506976774, 4197066702, 518848283, 118491664,
+ 0, 0, 0, 0};
+int
+main ()
+{
+ foo (res, main_state);
+ if (res[0] != 0x41fcef98)
+ __builtin_abort ();
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p9vector_hw } */
+/* Force vectorization with -fno-vect-cost-model to have vector unpack
+ which exposes the issue in PR115355. */
+/* { dg-options "-O2 -mdejagnu-cpu=power9 -fno-vect-cost-model" } */
+
+/* Verify it runs successfully. */
+
+__attribute__((noipa))
+void setToIdentityGOOD(unsigned long long *mVec, unsigned int mLen)
+{
+ #pragma GCC novector
+ for (unsigned int i = 0; i < mLen; i++)
+ mVec[i] = i;
+}
+
+__attribute__((noipa))
+void setToIdentityBAD(unsigned long long *mVec, unsigned int mLen)
+{
+ for (unsigned int i = 0; i < mLen; i++)
+ mVec[i] = i;
+}
+
+unsigned long long vec1[100];
+unsigned long long vec2[100];
+
+int main()
+{
+ unsigned int l = 29;
+ setToIdentityGOOD (vec1, 29);
+ setToIdentityBAD (vec2, 29);
+
+ if (__builtin_memcmp (vec1, vec2, l * sizeof (vec1[0])) != 0)
+ __builtin_abort ();
+
+ return 0;
+}