@@ -134,10 +134,8 @@ (define_c_enum "unspec"
UNSPEC_VMULWLUH
UNSPEC_VMULWHSH
UNSPEC_VMULWLSH
- UNSPEC_VUPKHUB
- UNSPEC_VUPKHUH
- UNSPEC_VUPKLUB
- UNSPEC_VUPKLUH
+ UNSPEC_VUPKHU
+ UNSPEC_VUPKLU
UNSPEC_VPERMSI
UNSPEC_VPERMHI
UNSPEC_INTERHI
@@ -3885,143 +3883,45 @@ (define_insn "xxeval"
[(set_attr "type" "vecsimple")
(set_attr "prefixed" "yes")])
-(define_expand "vec_unpacku_hi_v16qi"
- [(set (match_operand:V8HI 0 "register_operand" "=v")
- (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")]
- UNSPEC_VUPKHUB))]
- "TARGET_ALTIVEC"
-{
- rtx vzero = gen_reg_rtx (V8HImode);
- rtx mask = gen_reg_rtx (V16QImode);
- rtvec v = rtvec_alloc (16);
- bool be = BYTES_BIG_ENDIAN;
-
- emit_insn (gen_altivec_vspltish (vzero, const0_rtx));
-
- RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 7);
- RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 0 : 16);
- RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 16 : 6);
- RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 1 : 16);
- RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 5);
- RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 2 : 16);
- RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 16 : 4);
- RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 3 : 16);
- RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 3);
- RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 4 : 16);
- RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 16 : 2);
- RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 5 : 16);
- RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 1);
- RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 6 : 16);
- RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 16 : 0);
- RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 7 : 16);
-
- emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
- emit_insn (gen_vperm_v16qiv8hi (operands[0], operands[1], vzero, mask));
- DONE;
-})
-
-(define_expand "vec_unpacku_hi_v8hi"
- [(set (match_operand:V4SI 0 "register_operand" "=v")
- (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")]
- UNSPEC_VUPKHUH))]
+(define_expand "vec_unpacku_hi_<VP_small_lc>"
+ [(set (match_operand:VP 0 "register_operand" "=v")
+ (unspec:VP [(match_operand:<VP_small> 1 "register_operand" "v")]
+ UNSPEC_VUPKHU))]
"TARGET_ALTIVEC"
{
- rtx vzero = gen_reg_rtx (V4SImode);
- rtx mask = gen_reg_rtx (V16QImode);
- rtvec v = rtvec_alloc (16);
- bool be = BYTES_BIG_ENDIAN;
+ rtx vzero = gen_reg_rtx (<VP_small>mode);
+ emit_insn (gen_altivec_vspltis<VU_char> (vzero, const0_rtx));
- emit_insn (gen_altivec_vspltisw (vzero, const0_rtx));
-
- RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 7);
- RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 17 : 6);
- RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 0 : 17);
- RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 1 : 16);
- RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 5);
- RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 17 : 4);
- RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 2 : 17);
- RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 3 : 16);
- RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 3);
- RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 17 : 2);
- RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 4 : 17);
- RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 5 : 16);
- RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 1);
- RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 17 : 0);
- RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 6 : 17);
- RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 7 : 16);
-
- emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
- emit_insn (gen_vperm_v8hiv4si (operands[0], operands[1], vzero, mask));
- DONE;
-})
+ rtx res = gen_reg_rtx (<VP_small>mode);
+ rtx op1 = operands[1];
-(define_expand "vec_unpacku_lo_v16qi"
- [(set (match_operand:V8HI 0 "register_operand" "=v")
- (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")]
- UNSPEC_VUPKLUB))]
- "TARGET_ALTIVEC"
-{
- rtx vzero = gen_reg_rtx (V8HImode);
- rtx mask = gen_reg_rtx (V16QImode);
- rtvec v = rtvec_alloc (16);
- bool be = BYTES_BIG_ENDIAN;
-
- emit_insn (gen_altivec_vspltish (vzero, const0_rtx));
-
- RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 15);
- RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 8 : 16);
- RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 16 : 14);
- RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 9 : 16);
- RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 13);
- RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 10 : 16);
- RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 16 : 12);
- RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 11 : 16);
- RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 11);
- RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 12 : 16);
- RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 16 : 10);
- RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 13 : 16);
- RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 9);
- RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 14 : 16);
- RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 16 : 8);
- RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 15 : 16);
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_altivec_vmrgh<VU_char> (res, vzero, op1));
+ else
+ emit_insn (gen_altivec_vmrgl<VU_char> (res, op1, vzero));
- emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
- emit_insn (gen_vperm_v16qiv8hi (operands[0], operands[1], vzero, mask));
+ emit_insn (gen_move_insn (operands[0], gen_lowpart (<MODE>mode, res)));
DONE;
})
-(define_expand "vec_unpacku_lo_v8hi"
- [(set (match_operand:V4SI 0 "register_operand" "=v")
- (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")]
- UNSPEC_VUPKLUH))]
+(define_expand "vec_unpacku_lo_<VP_small_lc>"
+ [(set (match_operand:VP 0 "register_operand" "=v")
+ (unspec:VP [(match_operand:<VP_small> 1 "register_operand" "v")]
+ UNSPEC_VUPKLU))]
"TARGET_ALTIVEC"
{
- rtx vzero = gen_reg_rtx (V4SImode);
- rtx mask = gen_reg_rtx (V16QImode);
- rtvec v = rtvec_alloc (16);
- bool be = BYTES_BIG_ENDIAN;
+ rtx vzero = gen_reg_rtx (<VP_small>mode);
+ emit_insn (gen_altivec_vspltis<VU_char> (vzero, const0_rtx));
- emit_insn (gen_altivec_vspltisw (vzero, const0_rtx));
-
- RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 15);
- RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 17 : 14);
- RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 8 : 17);
- RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 9 : 16);
- RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 13);
- RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 17 : 12);
- RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 10 : 17);
- RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 11 : 16);
- RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 11);
- RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 17 : 10);
- RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 12 : 17);
- RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 13 : 16);
- RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 9);
- RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 17 : 8);
- RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 14 : 17);
- RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 15 : 16);
+ rtx res = gen_reg_rtx (<VP_small>mode);
+ rtx op1 = operands[1];
- emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
- emit_insn (gen_vperm_v8hiv4si (operands[0], operands[1], vzero, mask));
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_altivec_vmrgl<VU_char> (res, vzero, op1));
+ else
+ emit_insn (gen_altivec_vmrgh<VU_char> (res, op1, vzero));
+
+ emit_insn (gen_move_insn (operands[0], gen_lowpart (<MODE>mode, res)));
DONE;
})
new file mode 100644
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+/* { dg-options "-maltivec -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if unpack vectorization succeeds for type signed/unsigned
+ short and char. */
+
+#include "unpack-vectorize-1.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvupkhsb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvupklsb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvupkhsh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvupklsh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmrghb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmrglb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmrghh\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvmrglh\M} 2 } } */
new file mode 100644
@@ -0,0 +1,14 @@
+#include "unpack-vectorize.h"
+
+DEF_ARR (si)
+DEF_ARR (ui)
+DEF_ARR (sh)
+DEF_ARR (uh)
+DEF_ARR (sc)
+DEF_ARR (uc)
+
+TEST1 (sh, si)
+TEST1 (uh, ui)
+TEST1 (sc, sh)
+TEST1 (uc, uh)
+
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-mdejagnu-cpu=power7 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if unsigned int unpack vectorization succeeds. V2DImode is
+ supported since Power7 so guard it under Power7 and up. */
+
+#include "unpack-vectorize-2.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-assembler-times {\mxxmrghw\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxmrglw\M} 1 } } */
new file mode 100644
@@ -0,0 +1,7 @@
+#include "unpack-vectorize.h"
+
+DEF_ARR (ui)
+DEF_ARR (ull)
+
+TEST1 (ui, ull)
+
new file mode 100644
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if signed int unpack vectorization succeeds. */
+
+#include "unpack-vectorize-3.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvupkhsw\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvupklsw\M} 1 } } */
new file mode 100644
@@ -0,0 +1,7 @@
+#include "unpack-vectorize.h"
+
+DEF_ARR (si)
+DEF_ARR (sll)
+
+TEST1 (si, sll)
+
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vmx_hw } */
+/* { dg-options "-maltivec -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "unpack-vectorize-1.h"
+
+/* Test if unpack vectorization cases on signed/unsigned short and char
+ run successfully. */
+
+CHECK1 (sh, si)
+CHECK1 (uh, ui)
+CHECK1 (sc, sh)
+CHECK1 (uc, uh)
+
+int
+main ()
+{
+ check1_sh_si ();
+ check1_uh_ui ();
+ check1_sc_sh ();
+ check1_uc_uh ();
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-options "-mdejagnu-cpu=power7 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "unpack-vectorize-2.h"
+
+/* Test if unpack vectorization cases on unsigned int run successfully. */
+
+CHECK1 (ui, ull)
+
+int
+main ()
+{
+ check1_ui_ull ();
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "unpack-vectorize-3.h"
+
+/* Test if unpack vectorization cases on signed int run successfully. */
+
+CHECK1 (si, sll)
+
+int
+main ()
+{
+ check1_si_sll ();
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,42 @@
+typedef signed long long sll;
+typedef unsigned long long ull;
+typedef signed int si;
+typedef unsigned int ui;
+typedef signed short sh;
+typedef unsigned short uh;
+typedef signed char sc;
+typedef unsigned char uc;
+
+#ifndef ALIGN
+#define ALIGN 32
+#endif
+
+#define ALIGN_ATTR __attribute__((__aligned__(ALIGN)))
+
+#define N 128
+
+#define DEF_ARR(TYPE) \
+ TYPE TYPE##_a[N] ALIGN_ATTR; \
+ TYPE TYPE##_b[N] ALIGN_ATTR; \
+ TYPE TYPE##_c[N] ALIGN_ATTR;
+
+#define TEST1(NTYPE, WTYPE) \
+ __attribute__((noipa)) void test1_##NTYPE##_##WTYPE() { \
+ for (int i = 0; i < N; i++) \
+ WTYPE##_c[i] = NTYPE##_a[i] + NTYPE##_b[i]; \
+ }
+
+#define CHECK1(NTYPE, WTYPE) \
+ __attribute__((noipa, optimize(0))) void check1_##NTYPE##_##WTYPE() { \
+ for (int i = 0; i < N; i++) { \
+ NTYPE##_a[i] = 2 * i * sizeof(NTYPE) + 10; \
+ NTYPE##_b[i] = 7 * i * sizeof(NTYPE) / 5 - 10; \
+ } \
+ test1_##NTYPE##_##WTYPE(); \
+ for (int i = 0; i < N; i++) { \
+ WTYPE exp = NTYPE##_a[i] + NTYPE##_b[i]; \
+ if (WTYPE##_c[i] != exp) \
+ __builtin_abort(); \
+ } \
+ }
+