Message ID | CAOvf_xzkGVO6t_TWkLA=bqrf4JGpYW+0aNgGUm6Ababa5NjFig@mail.gmail.com |
---|---|
State | New |
Headers | show |
On Thu, Nov 20, 2014 at 02:36:26PM +0300, Evgeny Stupachenko wrote: > + /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than general > + shuffles. */ I think switch (d->vmode) would be more readable. > + op = gen_reg_rtx (d->vmode); > + t = gen_reg_rtx (V4DImode); > + emit_insn (gen_pack (op, dop0, dop1)); > + emit_insn (gen_avx2_permv4di_1 (t, gen_lowpart (V4DImode, op), > const0_rtx, Too long line, wrap it? Will leave the rest to Uros. Jakub
On Thu, Nov 20, 2014 at 12:36 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > Hi, > > The patch expand even/odd permutation using: > "and, and, pack" in odd case > "shift, shift, pack" in even case > > instead of current "pshufb, pshufb, or" or big set of unpack insns. > > AVX2/CORE bootstrap and make check passed. > expensive tests are in progress > > Is it ok for trunk? > > Evgeny > > 2014-11-20 Evgeny Stupachenko <evstupac@gmail.com> > > gcc/testsuite > PR target/60451 > * gcc.target/i386/pr60451.c: New. > > gcc/ > PR target/60451 > * config/i386/i386.c (expand_vec_perm_even_odd_pack): New. > (expand_vec_perm_even_odd_1): Add new expand for SSE cases, > replace with for AVX2 cases. > (ix86_expand_vec_perm_const_1): Add new expand. OK with a couple of small adjustments below. Thanks, Uros. > +/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even > + and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands > + with two "and" and "pack" or two "shift" and "pack" insns. We should > + have already failed all two instruction sequences. */ > + > +static bool > +expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) > +{ > + rtx op, dop0, dop1, t, rperm[16]; > + unsigned i, odd, c, s, nelt = d->nelt; > + bool end_perm = false; > + machine_mode half_mode; > + rtx (*gen_and) (rtx, rtx, rtx); > + rtx (*gen_pack) (rtx, rtx, rtx); > + rtx (*gen_shift) (rtx, rtx, rtx); > + > + /* Required for "pack". */ > + if (!TARGET_SSE4_2 || d->one_operand_p) > + return false; > + > + /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than general > + shuffles. */ > + if (d->vmode == V8HImode) Use switch, as proposed by Jakub. 
> + { > + c = 0xffff; > + s = 16; > + half_mode = V4SImode; > + gen_and = gen_andv4si3; > + gen_pack = gen_sse4_1_packusdw; > + gen_shift = gen_lshrv4si3; > + } > + else if (d->vmode == V16QImode) > + { > + c = 0xff; > + s = 8; > + half_mode = V8HImode; > + gen_and = gen_andv8hi3; > + gen_pack = gen_sse2_packuswb; > + gen_shift = gen_lshrv8hi3; > + } > + else if (d->vmode == V16HImode) > + { > + c = 0xffff; > + s = 16; > + half_mode = V8SImode; > + gen_and = gen_andv8si3; > + gen_pack = gen_avx2_packusdw; > + gen_shift = gen_lshrv8si3; > + end_perm = true; > + } > + else if (d->vmode == V32QImode) > + { > + c = 0xff; > + s = 8; > + half_mode = V16HImode; > + gen_and = gen_andv16hi3; > + gen_pack = gen_avx2_packuswb; > + gen_shift = gen_lshrv16hi3; > + end_perm = true; > + } > + else > + return false; > + > + /* Check that permutation is even or odd. */ > + odd = d->perm[0]; > + if (odd != 0 && odd != 1) if (odd > 1) > + return false; > + > + for (i = 1; i < nelt; ++i) > + if (d->perm[i] != 2 * i + odd) > + return false; > + > + if (d->testing_p) > + return true; > + > + dop0 = gen_reg_rtx (half_mode); > + dop1 = gen_reg_rtx (half_mode); > + if (odd == 0) > + { > + for (i = 0; i < nelt / 2; rperm[i++] = GEN_INT (c)); Please write above as: for (i = 0; i < nelt / 2; i++) rperm[i] = GEN_INT (c); > + t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm)); > + t = force_reg (half_mode, t); > + emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); > + emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); > + } > + else > + { > + emit_insn (gen_shift (dop0, > + gen_lowpart (half_mode, d->op0), > + GEN_INT (s))); > + emit_insn (gen_shift (dop1, > + gen_lowpart (half_mode, d->op1), > + GEN_INT (s))); > + } > + /* In AVX2 for 256 bit case we need to permute pack result. 
*/ > + if (TARGET_AVX2 && end_perm) > + { > + op = gen_reg_rtx (d->vmode); > + t = gen_reg_rtx (V4DImode); > + emit_insn (gen_pack (op, dop0, dop1)); > + emit_insn (gen_avx2_permv4di_1 (t, gen_lowpart (V4DImode, op), > const0_rtx, > + const2_rtx, const1_rtx, GEN_INT (3))); > + emit_move_insn (d->target, gen_lowpart (d->vmode, t)); > + } > + else > + emit_insn (gen_pack (d->target, dop0, dop1)); > + > + return true; > +} > + > /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even > and extract-odd permutations. */ > > @@ -48393,6 +48503,8 @@ expand_vec_perm_even_odd_1 (struct > expand_vec_perm_d *d, unsigned odd) > gcc_unreachable (); > > case V8HImode: > + if (TARGET_SSE4_2) > + return expand_vec_perm_even_odd_pack (d); > if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) "else if" in the above line, to be consistent with else below. > return expand_vec_perm_pshufb2 (d); > else > @@ -48416,6 +48528,8 @@ expand_vec_perm_even_odd_1 (struct > expand_vec_perm_d *d, unsigned odd) > break; > > case V16QImode: > + if (TARGET_SSE4_2) > + return expand_vec_perm_even_odd_pack (d); > if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) "else if" in the above line. > return expand_vec_perm_pshufb2 (d); > else > @@ -48441,7 +48555,7 @@ expand_vec_perm_even_odd_1 (struct > expand_vec_perm_d *d, unsigned odd) > > case V16HImode: > case V32QImode: > - return expand_vec_perm_vpshufb2_vpermq_even_odd (d); > + return expand_vec_perm_even_odd_pack (d); > > case V4DImode: > if (!TARGET_AVX2) > @@ -48814,6 +48928,9 @@ ix86_expand_vec_perm_const_1 (struct > expand_vec_perm_d *d) > > /* Try sequences of three instructions. 
*/ > > + if (expand_vec_perm_even_odd_pack (d)) > + return true; > + > if (expand_vec_perm_2vperm2f128_vshuf (d)) > return true; > > diff --git a/gcc/testsuite/gcc.target/i386/pr60451.c > b/gcc/testsuite/gcc.target/i386/pr60451.c > new file mode 100644 > index 0000000..29f019d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr60451.c > @@ -0,0 +1,14 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target sse4 } */ > +/* { dg-options "-O2 -ftree-vectorize -msse4.2" } */ > + > +void > +foo (unsigned char *a, unsigned char *b, unsigned char *c, int size) > +{ > + int i; > + > + for (i = 0; i < size; i++) > + a[i] = (unsigned char) ((unsigned int)1 + b[i] * c[i] * 117); > +} > + > +/* { dg-final { scan-assembler "packuswb|vpunpck" } } */
On 11/20/2014 12:36 PM, Evgeny Stupachenko wrote: > + /* Required for "pack". */ > + if (!TARGET_SSE4_2 || d->one_operand_p) > + return false; Why the SSE4_2 check here when... > + > + /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than general > + shuffles. */ > + if (d->vmode == V8HImode) > + { > + c = 0xffff; > + s = 16; > + half_mode = V4SImode; > + gen_and = gen_andv4si3; > + gen_pack = gen_sse4_1_packusdw; ... it's SSE4_1 here, > + gen_shift = gen_lshrv4si3; > + } > + else if (d->vmode == V16QImode) > + { > + c = 0xff; > + s = 8; > + half_mode = V8HImode; > + gen_and = gen_andv8hi3; > + gen_pack = gen_sse2_packuswb; ... and SSE2 here? r~
diff --git a/gcc/testsuite/gcc.target/i386/pr60451.c b/gcc/testsuite/gcc.target/i386/pr60451.c new file mode 100644 index 0000000..29f019d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr60451.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target sse4 } */ +/* { dg-options "-O2 -ftree-vectorize -msse4.2" } */ + +void +foo (unsigned char *a, unsigned char *b, unsigned char *c, int size) +{ + int i; + + for (i = 0; i < size; i++) + a[i] = (unsigned char) ((unsigned int)1 + b[i] * c[i] * 117); +} + +/* { dg-final { scan-assembler "packuswb|vpunpck" } } */