Message ID | 1323378383-9824-7-git-send-email-rth@redhat.com
---|---
State | New
On 8 December 2011 21:06, Richard Henderson <rth@redhat.com> wrote:
> ---
>  gcc/config/arm/arm-protos.h           |    3 +
>  gcc/config/arm/arm.c                  |  527 ++++++++++++++++++++++++++++++++-
>  gcc/config/arm/neon.md                |   59 ++++
>  gcc/config/arm/vec-common.md          |   26 ++
>  gcc/testsuite/lib/target-supports.exp |    9 +-
>  5 files changed, 620 insertions(+), 4 deletions(-)

I haven't been following the vector permute work in great detail and I
must say I haven't read this patch series in great detail yet.

For Neon a further optimization to consider might be to use the vext
instruction, which could achieve permute masks that are monotonically
increasing constants? While I expect the latency for a vext or vtbl
instruction to be about the same (your mileage might vary depending on
the core), using vext gives us the freedom of not needing a register
for the permute mask:

a = vec_shuffle (b, c, mask) where mask is { n + 7, n + 6, n + 5,
n + 4, n + 3, n + 2, n + 1, n } could just be vext.8 A, B, C, #n

If the mask being provided is a reverse of the mask above, it's
probably not worth it.

Additionally, can we also detect rotate rights? Unless of course
there's a different interface:

a = vec_shuffle (vec, {0, 7, 6, 5, 4, 3, 2, 1}) => vext.8 a, vec, vec, #1

Masks doing rotate lefts are probably not worth it in this regard.

Ramana
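As a concrete illustration of the case Ramana describes, here is a minimal sketch using the GCC 4.7 __builtin_shuffle interface that this series targets (the function name and mask values are hypothetical). The constant mask selects one contiguous window across the two inputs, so on NEON it could in principle be emitted as a single vext.8 with an immediate instead of a vtbl with the mask held in a register:

```c
/* Hypothetical example: a contiguous-window shuffle that a VEXT
   recognizer could map to a single "vext.8 ..., #1".  */
typedef unsigned char v8qi __attribute__ ((vector_size (8)));

v8qi
take_window (v8qi b, v8qi c)
{
  const v8qi mask = { 1, 2, 3, 4, 5, 6, 7, 8 };   /* b[1..7] followed by c[0].  */
  return __builtin_shuffle (b, c, mask);
}
```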
On Fri, Dec 09, 2011 at 06:02:21PM +0000, Ramana Radhakrishnan wrote:
> On 8 December 2011 21:06, Richard Henderson <rth@redhat.com> wrote:
> > ---
> >  gcc/config/arm/arm-protos.h           |    3 +
> >  gcc/config/arm/arm.c                  |  527 ++++++++++++++++++++++++++++++++-
> >  gcc/config/arm/neon.md                |   59 ++++
> >  gcc/config/arm/vec-common.md          |   26 ++
> >  gcc/testsuite/lib/target-supports.exp |    9 +-
> >  5 files changed, 620 insertions(+), 4 deletions(-)
>
> I haven't been following the vector permute work in great detail and
> I must say I haven't read this patch series in great detail yet.
>
> For Neon a further optimization to consider might be to use the vext
> instruction, which could achieve permute masks that are monotonically
> increasing constants? While I expect the latency for a vext or vtbl
> instruction to be about the same (your mileage might vary depending on
> the core), using vext gives us the freedom of not needing a register
> for the permute mask:
>
> a = vec_shuffle (b, c, mask) where mask is { n + 7, n + 6, n + 5,
> n + 4, n + 3, n + 2, n + 1, n } could just be vext.8 A, B, C, #n
>
> If the mask being provided is a reverse of the mask above, it's
> probably not worth it.
>
> Additionally, can we also detect rotate rights? Unless of course
> there's a different interface:
>
> a = vec_shuffle (vec, {0, 7, 6, 5, 4, 3, 2, 1}) => vext.8 a, vec, vec, #1
>
> Masks doing rotate lefts are probably not worth it in this regard.

Richard and I were discussing this last night on IRC, and it is
certainly possible.  Somebody would just have to write a predicate to
recognize the case.

We do wonder how frequently it will occur, and whether people doing
this would just use the whole vector shift instead of shuffle.
On 12/09/2011 10:02 AM, Ramana Radhakrishnan wrote:
> For Neon a further optimization to consider might be to use the vext
> instruction, which could achieve permute masks that are monotonically
> increasing constants? While I expect the latency for a vext or vtbl
> instruction to be about the same (your mileage might vary depending on
> the core), using vext gives us the freedom of not needing a register
> for the permute mask:
>
> a = vec_shuffle (b, c, mask) where mask is { n + 7, n + 6, n + 5,
> n + 4, n + 3, n + 2, n + 1, n } could just be vext.8 A, B, C, #n

Good to know.  I missed that one in my reading of the manual.

> Additionally, can we also detect rotate rights? Unless of course
> there's a different interface:
>
> a = vec_shuffle (vec, {0, 7, 6, 5, 4, 3, 2, 1}) => vext.8 a, vec, vec, #1

Certainly we can.


r~
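A rough sketch of the recognizer being discussed, modelled on the arm_evpc_neon_* routines in the patch below. The gen_neon_vext* builder names, the element-count immediate, and the decision to punt on big-endian are assumptions here and would need checking against neon.md before anything like this could be used:

```c
/* Sketch only: recognize a contiguous-window selector { n, n+1, ... }
   and emit a single VEXT.  Builder names are assumed, not verified.  */
static bool
arm_evpc_neon_vext (struct expand_vec_perm_d *d)
{
  unsigned int i, nelt = d->nelt;
  unsigned int location = d->perm[0];
  unsigned int mask = d->one_vector_p ? nelt - 1 : 2 * nelt - 1;
  rtx (*gen) (rtx, rtx, rtx, rtx);

  /* As with VTBL, the indexing is little-endian; punt on big-endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  /* The selector must be { location, location + 1, ... }, taken modulo
     the number of input elements.  */
  for (i = 1; i < nelt; i++)
    if (d->perm[i] != ((location + i) & mask))
      return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  switch (d->vmode)
    {
    case V16QImode: gen = gen_neon_vextv16qi; break;
    case V8QImode:  gen = gen_neon_vextv8qi;  break;
    case V8HImode:  gen = gen_neon_vextv8hi;  break;
    case V4HImode:  gen = gen_neon_vextv4hi;  break;
    case V4SImode:  gen = gen_neon_vextv4si;  break;
    case V2SImode:  gen = gen_neon_vextv2si;  break;
    default:
      return false;
    }

  /* The immediate is the starting element, as for the vext intrinsics.  */
  emit_insn (gen (d->target, d->op0, d->op1, GEN_INT (location)));
  return true;
}
```

The rotate-right case falls out of the same test: with a single input vector the window wraps modulo nelt, which is exactly vext.8 a, vec, vec, #n. Such a routine would slot into arm_expand_vec_perm_const_1 alongside the other arm_evpc_neon_* checks, ahead of the vtbl fallback.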
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 296550a..8c3e412 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -244,4 +244,7 @@ extern const struct tune_params *current_tune; extern int vfp3_const_double_for_fract_bits (rtx); #endif /* RTX_CODE */ +extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel); +extern bool arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel); + #endif /* ! GCC_ARM_PROTOS_H */ diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 65b4e9d..0395a41 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -267,6 +267,9 @@ static unsigned int arm_autovectorize_vector_sizes (void); static int arm_default_branch_cost (bool, bool); static int arm_cortex_a5_branch_cost (bool, bool); +static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode, + const unsigned char *sel); + /* Table of machine attributes. */ static const struct attribute_spec arm_attribute_table[] = @@ -604,6 +607,10 @@ static const struct attribute_spec arm_attribute_table[] = #define TARGET_PREFERRED_RENAME_CLASS \ arm_preferred_rename_class +#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK +#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \ + arm_vectorize_vec_perm_const_ok + struct gcc_target targetm = TARGET_INITIALIZER; /* Obstack for minipool constant handling. */ @@ -25064,6 +25071,524 @@ vfp3_const_double_for_fract_bits (rtx operand) } return 0; } + +#define MAX_VECT_LEN 16 -#include "gt-arm.h" +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + enum machine_mode vmode; + unsigned char nelt; + bool one_vector_p; + bool testing_p; +}; + +/* Generate a variable permutation. */ + +static void +arm_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel) +{ + enum machine_mode vmode = GET_MODE (target); + bool one_vector_p = rtx_equal_p (op0, op1); + + gcc_checking_assert (vmode == V8QImode || vmode == V16QImode); + gcc_checking_assert (GET_MODE (op0) == vmode); + gcc_checking_assert (GET_MODE (op1) == vmode); + gcc_checking_assert (GET_MODE (sel) == vmode); + gcc_checking_assert (TARGET_NEON); + + if (one_vector_p) + { + if (vmode == V8QImode) + emit_insn (gen_neon_vtbl1v8qi (target, op0, sel)); + else + emit_insn (gen_neon_vtbl1v16qi (target, op0, sel)); + } + else + { + enum machine_mode mode1, mode2; + rtx pair, part; + + if (vmode == V8QImode) + mode1 = DImode, mode2 = TImode; + else + mode1 = TImode, mode2 = OImode; + + pair = gen_reg_rtx (mode2); + emit_insn (gen_rtx_CLOBBER (VOIDmode, pair)); + + part = simplify_gen_subreg (mode1, pair, mode2, + subreg_lowpart_offset (mode1, mode2)); + emit_move_insn (part, gen_lowpart (mode1, op0)); + + part = simplify_gen_subreg (mode1, pair, mode2, + subreg_highpart_offset (mode1, mode2)); + emit_move_insn (part, gen_lowpart (mode1, op1)); + + if (vmode == V8QImode) + emit_insn (gen_neon_vtbl2v8qi (target, pair, sel)); + else + emit_insn (gen_neon_vtbl2v16qi (target, pair, sel)); + } +} + +void +arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) +{ + enum machine_mode vmode = GET_MODE (target); + unsigned int i, nelt = GET_MODE_NUNITS (vmode); + bool one_vector_p = rtx_equal_p (op0, op1); + rtx rmask[MAX_VECT_LEN], mask; + + /* TODO: ARM's VTBL indexing is little-endian. In order to handle GCC's + numbering of elements for big-endian, we must reverse the order. */ + gcc_checking_assert (!BYTES_BIG_ENDIAN); + + /* The VTBL instruction does not use a modulo index, so we must take care + of that ourselves. 
*/ + mask = GEN_INT (one_vector_p ? nelt - 1 : 2 * nelt - 1); + for (i = 0; i < nelt; ++i) + rmask[i] = mask; + mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rmask)); + sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN); + + arm_expand_vec_perm_1 (target, op0, op1, sel); +} + +/* Generate or test for an insn that supports a constant permutation. */ + +/* Recognize patterns for the VUZP insns. */ + +static bool +arm_evpc_neon_vuzp (struct expand_vec_perm_d *d) +{ + unsigned int i, odd, mask, nelt = d->nelt; + rtx out0, out1, in0, in1, x; + rtx (*gen)(rtx, rtx, rtx, rtx); + + if (GET_MODE_UNIT_SIZE (d->vmode) >= 8) + return false; + + /* Note that these are little-endian tests. Adjust for big-endian later. */ + if (d->perm[0] == 0) + odd = 0; + else if (d->perm[0] == 1) + odd = 1; + else + return false; + mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1); + + for (i = 0; i < nelt; i++) + { + unsigned elt = (i * 2 + odd) & mask; + if (d->perm[i] != elt) + return false; + } + + /* Success! */ + if (d->testing_p) + return true; + + switch (d->vmode) + { + case V16QImode: gen = gen_neon_vuzpv16qi_internal; break; + case V8QImode: gen = gen_neon_vuzpv8qi_internal; break; + case V8HImode: gen = gen_neon_vuzpv8hi_internal; break; + case V4HImode: gen = gen_neon_vuzpv4hi_internal; break; + case V4SImode: gen = gen_neon_vuzpv4si_internal; break; + case V2SImode: gen = gen_neon_vuzpv2si_internal; break; + case V2SFmode: gen = gen_neon_vuzpv2sf_internal; break; + case V4SFmode: gen = gen_neon_vuzpv4sf_internal; break; + default: + gcc_unreachable (); + } + + in0 = d->op0; + in1 = d->op1; + if (BYTES_BIG_ENDIAN) + { + x = in0, in0 = in1, in1 = x; + odd = !odd; + } + + out0 = d->target; + out1 = gen_reg_rtx (d->vmode); + if (odd) + x = out0, out0 = out1, out1 = x; + + emit_insn (gen (out0, in0, in1, out1)); + return true; +} + +/* Recognize patterns for the VZIP insns. */ + +static bool +arm_evpc_neon_vzip (struct expand_vec_perm_d *d) +{ + unsigned int i, high, mask, nelt = d->nelt; + rtx out0, out1, in0, in1, x; + rtx (*gen)(rtx, rtx, rtx, rtx); + + if (GET_MODE_UNIT_SIZE (d->vmode) >= 8) + return false; + + /* Note that these are little-endian tests. Adjust for big-endian later. */ + high = nelt / 2; + if (d->perm[0] == high) + ; + else if (d->perm[0] == 0) + high = 0; + else + return false; + mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1); + + for (i = 0; i < nelt / 2; i++) + { + unsigned elt = (i + high) & mask; + if (d->perm[i * 2] != elt) + return false; + elt = (elt + nelt) & mask; + if (d->perm[i * 2 + 1] != elt) + return false; + } + + /* Success! */ + if (d->testing_p) + return true; + + switch (d->vmode) + { + case V16QImode: gen = gen_neon_vzipv16qi_internal; break; + case V8QImode: gen = gen_neon_vzipv8qi_internal; break; + case V8HImode: gen = gen_neon_vzipv8hi_internal; break; + case V4HImode: gen = gen_neon_vzipv4hi_internal; break; + case V4SImode: gen = gen_neon_vzipv4si_internal; break; + case V2SImode: gen = gen_neon_vzipv2si_internal; break; + case V2SFmode: gen = gen_neon_vzipv2sf_internal; break; + case V4SFmode: gen = gen_neon_vzipv4sf_internal; break; + default: + gcc_unreachable (); + } + + in0 = d->op0; + in1 = d->op1; + if (BYTES_BIG_ENDIAN) + { + x = in0, in0 = in1, in1 = x; + high = !high; + } + + out0 = d->target; + out1 = gen_reg_rtx (d->vmode); + if (high) + x = out0, out0 = out1, out1 = x; + + emit_insn (gen (out0, in0, in1, out1)); + return true; +} + +/* Recognize patterns for the VREV insns. 
*/ + +static bool +arm_evpc_neon_vrev (struct expand_vec_perm_d *d) +{ + unsigned int i, j, diff, nelt = d->nelt; + rtx (*gen)(rtx, rtx, rtx); + + if (!d->one_vector_p) + return false; + + diff = d->perm[0]; + switch (diff) + { + case 7: + switch (d->vmode) + { + case V16QImode: gen = gen_neon_vrev64v16qi; break; + case V8QImode: gen = gen_neon_vrev64v8qi; break; + default: + return false; + } + break; + case 3: + switch (d->vmode) + { + case V16QImode: gen = gen_neon_vrev32v16qi; break; + case V8QImode: gen = gen_neon_vrev32v8qi; break; + case V8HImode: gen = gen_neon_vrev64v8hi; break; + case V4HImode: gen = gen_neon_vrev64v4hi; break; + default: + return false; + } + break; + case 1: + switch (d->vmode) + { + case V16QImode: gen = gen_neon_vrev16v16qi; break; + case V8QImode: gen = gen_neon_vrev16v8qi; break; + case V8HImode: gen = gen_neon_vrev32v8hi; break; + case V4HImode: gen = gen_neon_vrev32v4hi; break; + case V4SImode: gen = gen_neon_vrev64v4si; break; + case V2SImode: gen = gen_neon_vrev64v2si; break; + case V4SFmode: gen = gen_neon_vrev64v4sf; break; + case V2SFmode: gen = gen_neon_vrev64v2sf; break; + default: + return false; + } + break; + default: + return false; + } + + for (i = 0; i < nelt; i += diff) + for (j = 0; j <= diff; j += 1) + if (d->perm[i + j] != i + diff - j) + return false; + + /* Success! */ + if (d->testing_p) + return true; + + /* ??? The third operand is an artifact of the builtin infrastructure + and is ignored by the actual instruction. */ + emit_insn (gen (d->target, d->op0, const0_rtx)); + return true; +} + +/* Recognize patterns for the VTRN insns. */ + +static bool +arm_evpc_neon_vtrn (struct expand_vec_perm_d *d) +{ + unsigned int i, odd, nelt = d->nelt; + rtx out0, out1, in0, in1, x; + rtx (*gen)(rtx, rtx, rtx, rtx); + + if (d->one_vector_p) + return false; + if (GET_MODE_UNIT_SIZE (d->vmode) >= 8) + return false; + + /* Note that these are little-endian tests. Adjust for big-endian later. */ + if (d->perm[0] == 0) + odd = 0; + else if (d->perm[0] == 1) + odd = 1; + else + return false; + + for (i = 0; i < nelt; i += 2) + { + if (d->perm[i] != i + odd) + return false; + if (d->perm[i + 1] != i + nelt + odd) + return false; + } + + /* Success! */ + if (d->testing_p) + return true; + + switch (d->vmode) + { + case V16QImode: gen = gen_neon_vtrnv16qi_internal; break; + case V8QImode: gen = gen_neon_vtrnv8qi_internal; break; + case V8HImode: gen = gen_neon_vtrnv8hi_internal; break; + case V4HImode: gen = gen_neon_vtrnv4hi_internal; break; + case V4SImode: gen = gen_neon_vtrnv4si_internal; break; + case V2SImode: gen = gen_neon_vtrnv2si_internal; break; + case V2SFmode: gen = gen_neon_vtrnv2sf_internal; break; + case V4SFmode: gen = gen_neon_vtrnv4sf_internal; break; + default: + gcc_unreachable (); + } + + in0 = d->op0; + in1 = d->op1; + if (BYTES_BIG_ENDIAN) + { + x = in0, in0 = in1, in1 = x; + odd = !odd; + } + + out0 = d->target; + out1 = gen_reg_rtx (d->vmode); + if (odd) + x = out0, out0 = out1, out1 = x; + + emit_insn (gen (out0, in0, in1, out1)); + return true; +} + +/* The NEON VTBL instruction is a fully variable permuation that's even + stronger than what we expose via VEC_PERM_EXPR. What it doesn't do + is mask the index operand as VEC_PERM_EXPR requires. Therefore we + can do slightly better by expanding this as a constant where we don't + have to apply a mask. 
*/ + +static bool +arm_evpc_neon_vtbl (struct expand_vec_perm_d *d) +{ + rtx rperm[MAX_VECT_LEN], sel; + enum machine_mode vmode = d->vmode; + unsigned int i, nelt = d->nelt; + /* TODO: ARM's VTBL indexing is little-endian. In order to handle GCC's + numbering of elements for big-endian, we must reverse the order. */ + if (BYTES_BIG_ENDIAN) + return false; + + if (d->testing_p) + return true; + + /* Generic code will try constant permutation twice. Once with the + original mode and again with the elements lowered to QImode. + So wait and don't do the selector expansion ourselves. */ + if (vmode != V8QImode && vmode != V16QImode) + return false; + + for (i = 0; i < nelt; ++i) + rperm[i] = GEN_INT (d->perm[i]); + sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); + sel = force_reg (vmode, sel); + + arm_expand_vec_perm_1 (d->target, d->op0, d->op1, sel); + return true; +} + +static bool +arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) +{ + /* The pattern matching functions above are written to look for a small + number to begin the sequence (0, 1, N/2). If we begin with an index + from the second operand, we can swap the operands. */ + if (d->perm[0] >= d->nelt) + { + unsigned i, nelt = d->nelt; + rtx x; + + for (i = 0; i < nelt; ++i) + d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1); + + x = d->op0; + d->op0 = d->op1; + d->op1 = x; + } + + if (TARGET_NEON) + { + if (arm_evpc_neon_vuzp (d)) + return true; + if (arm_evpc_neon_vzip (d)) + return true; + if (arm_evpc_neon_vrev (d)) + return true; + if (arm_evpc_neon_vtrn (d)) + return true; + return arm_evpc_neon_vtbl (d); + } + return false; +} + +/* Expand a vec_perm_const pattern. */ + +bool +arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel) +{ + struct expand_vec_perm_d d; + int i, nelt, which; + + d.target = target; + d.op0 = op0; + d.op1 = op1; + + d.vmode = GET_MODE (target); + gcc_assert (VECTOR_MODE_P (d.vmode)); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = false; + + for (i = which = 0; i < nelt; ++i) + { + rtx e = XVECEXP (sel, 0, i); + int ei = INTVAL (e) & (2 * nelt - 1); + which |= (ei < nelt ? 1 : 2); + d.perm[i] = ei; + } + + switch (which) + { + default: + gcc_unreachable(); + + case 3: + d.one_vector_p = false; + if (!rtx_equal_p (op0, op1)) + break; + + /* The elements of PERM do not suggest that only the first operand + is used, but both operands are identical. Allow easier matching + of the permutation by folding the permutation into the single + input vector. */ + /* FALLTHRU */ + case 2: + for (i = 0; i < nelt; ++i) + d.perm[i] &= nelt - 1; + d.op0 = op1; + d.one_vector_p = true; + break; + + case 1: + d.op1 = op0; + d.one_vector_p = true; + break; + } + + return arm_expand_vec_perm_const_1 (&d); +} + +/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK. */ + +static bool +arm_vectorize_vec_perm_const_ok (enum machine_mode vmode, + const unsigned char *sel) +{ + struct expand_vec_perm_d d; + unsigned int i, nelt, which; + bool ret; + + d.vmode = vmode; + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = true; + memcpy (d.perm, sel, nelt); + + /* Categorize the set of elements in the selector. */ + for (i = which = 0; i < nelt; ++i) + { + unsigned char e = d.perm[i]; + gcc_assert (e < 2 * nelt); + which |= (e < nelt ? 1 : 2); + } + + /* For all elements from second vector, fold the elements to first. */ + if (which == 2) + for (i = 0; i < nelt; ++i) + d.perm[i] -= nelt; + + /* Check whether the mask can be applied to the vector type. 
*/ + d.one_vector_p = (which != 3); + + d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); + d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); + if (!d.one_vector_p) + d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); + + start_sequence (); + ret = arm_expand_vec_perm_const_1 (&d); + end_sequence (); + + return ret; +} + + +#include "gt-arm.h" diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 94e0a5f..bd68d39 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -3876,6 +3876,65 @@ [(set_attr "neon_type" "neon_bp_3cycle")] ) +;; These two are used by the vec_perm infrastructure for V16QImode. +(define_insn_and_split "neon_vtbl1v16qi" + [(set (match_operand:V16QI 0 "s_register_operand" "=w") + (unspec:V16QI [(match_operand:V16QI 1 "s_register_operand" "w") + (match_operand:V16QI 2 "s_register_operand" "w")] + UNSPEC_VTBL))] + "TARGET_NEON" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op0, op1, op2, part0, part2; + unsigned ofs; + + op0 = operands[0]; + op1 = gen_lowpart (TImode, operands[1]); + op2 = operands[2]; + + ofs = subreg_lowpart_offset (V8QImode, V16QImode); + part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs); + part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs); + emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2)); + + ofs = subreg_highpart_offset (V8QImode, V16QImode); + part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs); + part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs); + emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2)); + DONE; +}) + +(define_insn_and_split "neon_vtbl2v16qi" + [(set (match_operand:V16QI 0 "s_register_operand" "=w") + (unspec:V16QI [(match_operand:OI 1 "s_register_operand" "w") + (match_operand:V16QI 2 "s_register_operand" "w")] + UNSPEC_VTBL))] + "TARGET_NEON" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rtx op0, op1, op2, part0, part2; + unsigned ofs; + + op0 = operands[0]; + op1 = operands[1]; + op2 = operands[2]; + + ofs = subreg_lowpart_offset (V8QImode, V16QImode); + part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs); + part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs); + emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2)); + + ofs = subreg_highpart_offset (V8QImode, V16QImode); + part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs); + part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs); + emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2)); + DONE; +}) + (define_insn "neon_vtbx1v8qi" [(set (match_operand:V8QI 0 "s_register_operand" "=w") (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0") diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index c27c414..eb29900 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -108,3 +108,29 @@ || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))" { }) + +(define_expand "vec_perm_const<mode>" + [(match_operand:VALL 0 "s_register_operand" "") + (match_operand:VALL 1 "s_register_operand" "") + (match_operand:VALL 2 "s_register_operand" "") + (match_operand:<V_cmp_result> 3 "" "")] + "TARGET_NEON + || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))" +{ + if (arm_expand_vec_perm_const (operands[0], operands[1], + operands[2], operands[3])) + DONE; + else + FAIL; +}) + +(define_expand "vec_perm<mode>" + [(match_operand:VE 0 "s_register_operand" "") + (match_operand:VE 1 "s_register_operand" "") + (match_operand:VE 2 "s_register_operand" "") + (match_operand:VE 3 "s_register_operand" "")] + "TARGET_NEON && 
!BYTES_BIG_ENDIAN" +{ + arm_expand_vec_perm (operands[0], operands[1], operands[2], operands[3]); + DONE; +}) diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 78223af..d99a0b3 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -2725,7 +2725,8 @@ proc check_effective_target_vect_perm { } { verbose "check_effective_target_vect_perm: using cached result" 2 } else { set et_vect_perm_saved 0 - if { [istarget powerpc*-*-*] + if { [is-effective-target arm_neon_ok] + || [istarget powerpc*-*-*] || [istarget spu-*-*] || [istarget i?86-*-*] || [istarget x86_64-*-*] } { @@ -2748,7 +2749,8 @@ proc check_effective_target_vect_perm_byte { } { verbose "check_effective_target_vect_perm_byte: using cached result" 2 } else { set et_vect_perm_byte_saved 0 - if { [istarget powerpc*-*-*] + if { [is-effective-target arm_neon_ok] + || [istarget powerpc*-*-*] || [istarget spu-*-*] } { set et_vect_perm_byte_saved 1 } @@ -2769,7 +2771,8 @@ proc check_effective_target_vect_perm_short { } { verbose "check_effective_target_vect_perm_short: using cached result" 2 } else { set et_vect_perm_short_saved 0 - if { [istarget powerpc*-*-*] + if { [is-effective-target arm_neon_ok] + || [istarget powerpc*-*-*] || [istarget spu-*-*] } { set et_vect_perm_short_saved 1 }
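For reference, a small hypothetical example (not from the patch's testsuite) of the kind of constant permutation the new vec_perm_const<mode> expander matches directly, here via arm_evpc_neon_vzip, and which the target-supports.exp changes above assume the NEON back end can now handle:

```c
/* Hypothetical usage: interleave the low halves of two V4SI vectors.
   The constant selector { 0, 4, 1, 5 } is the vzip.32 pattern matched
   by arm_evpc_neon_vzip.  */
typedef int v4si __attribute__ ((vector_size (16)));

v4si
zip_lo (v4si a, v4si b)
{
  const v4si sel = { 0, 4, 1, 5 };
  return __builtin_shuffle (a, b, sel);
}
```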