Message ID | 20220804042839.49603-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | [RFC:] Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift with a constant. | expand |
On Thu, Aug 4, 2022 at 6:29 AM liuhongt via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > For neg, the patch creates a vec_init as [ a, -a, a, -a, ... ] and no > vec_step is needed to update vectorized iv since vf is always multiple > of 2 (negative * negative is positive). > > For shift, the patch creates a vec_init as [ a, a >> c, a >> 2*c, ..] > and vec_step as [ c * nunits, c * nunits, c * nunits, ... ], vectorized iv is > updated as vec_def = vec_init >>/<< vec_step. > > For mul, the patch creates a vec_init as [ a, a * c, a * pow(c, 2), ..] > and vec_step as [ pow(c,nunits), pow(c,nunits),...] iv is updated as vec_def = > vec_init * vec_step. > > The patch handles nonlinear iv for > 1. Integer type only, floating point is not handled. > 2. No slp_node. > 3. iv_loop should be same as vector loop, not nested loop. > 4. No UB is created, for mul, no UB overflow for pow (c, vf), for > shift, shift count should be less than type precision. > > Bootstrapped and regression tested on x86_64-pc-linux-gnu{-m32,}. > There are some cases observed in SPEC2017, but no big performance impact. > > Any comments? Looks good overall - a few comments inline. Also can you please add SLP support? I've tried hard to fill in gaps where SLP support is missing since my goal is still to get rid of non-SLP. > gcc/ChangeLog: > > PR tree-optimization/103144 > * tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function. > (vect_analyze_scalar_cycles_1): Detect nonlinear iv by upper function. > (vect_create_nonlinear_iv_init): New function. > (vect_create_nonlinear_iv_step): Ditto > (vect_create_nonlinear_iv_vec_step): Ditto > (vect_update_nonlinear_iv): Ditto > (vectorizable_nonlinear_induction): Ditto. > (vectorizable_induction): Call > vectorizable_nonlinear_induction when induction_type is not > vect_step_op_add. > * tree-vectorizer.h (enum vect_induction_op_type): New enum. > (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro. 
> > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr103144-mul-1.c: New test. > * gcc.target/i386/pr103144-mul-2.c: New test. > * gcc.target/i386/pr103144-neg-1.c: New test. > * gcc.target/i386/pr103144-neg-2.c: New test. > * gcc.target/i386/pr103144-shift-1.c: New test. > * gcc.target/i386/pr103144-shift-2.c: New test. > --- > .../gcc.target/i386/pr103144-mul-1.c | 25 + > .../gcc.target/i386/pr103144-mul-2.c | 43 ++ > .../gcc.target/i386/pr103144-neg-1.c | 25 + > .../gcc.target/i386/pr103144-neg-2.c | 36 ++ > .../gcc.target/i386/pr103144-shift-1.c | 34 + > .../gcc.target/i386/pr103144-shift-2.c | 61 ++ > gcc/tree-vect-loop.cc | 604 +++++++++++++++++- > gcc/tree-vectorizer.h | 11 + > 8 files changed, 834 insertions(+), 5 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c > > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c > new file mode 100644 > index 00000000000..2357541d95d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > + > +#define N 10000 > +void > +foo_mul (int* a, int b) > +{ > + for (int i = 0; i != N; i++) > + { > + a[i] = b; > + b *= 3; > + } > +} > + > +void > +foo_mul_const (int* a) > +{ > + int b = 1; > + for (int i = 0; i != N; i++) > + { > + a[i] = b; > + b *= 3; > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c 
b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c > new file mode 100644 > index 00000000000..4ea53e44658 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c > @@ -0,0 +1,43 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ > +/* { dg-require-effective-target avx2 } */ > + > +#include "avx2-check.h" > +#include <string.h> > +#include "pr103144-mul-1.c" > + > +typedef int v8si __attribute__((vector_size(32))); > + > +void > +avx2_test (void) > +{ > + int* epi32_exp = (int*) malloc (N * sizeof (int)); > + int* epi32_dst = (int*) malloc (N * sizeof (int)); > + > + __builtin_memset (epi32_exp, 0, N * sizeof (int)); > + int b = 8; > + v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 }; > + > + for (int i = 0; i != N / 8; i++) > + { > + memcpy (epi32_exp + i * 8, &init, 32); > + init *= 6561; > + } > + > + foo_mul (epi32_dst, b); > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > + __builtin_abort (); > + > + init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 }; > + for (int i = 0; i != N / 8; i++) > + { > + memcpy (epi32_exp + i * 8, &init, 32); > + init *= 6561; > + } > + > + foo_mul_const (epi32_dst); > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > + __builtin_abort (); > + > + return; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c > new file mode 100644 > index 00000000000..63a3331563d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c > @@ -0,0 +1,25 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > + > +#define N 10000 > +void > +foo_neg (int* a, int b) > +{ > + for (int i = 0; i != N; 
i++) > + { > + a[i] = b; > + b = -b; > + } > +} > + > +void > +foo_neg_const (int* a) > +{ > + int b = 1; > + for (int i = 0; i != N; i++) > + { > + a[i] = b; > + b = -b; > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c > new file mode 100644 > index 00000000000..13bc61cdce7 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c > @@ -0,0 +1,36 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ > +/* { dg-require-effective-target avx2 } */ > + > +#include "avx2-check.h" > +#include <string.h> > +#include "pr103144-neg-1.c" > + > +void > +avx2_test (void) > +{ > + int* epi32_exp = (int*) malloc (N * sizeof (int)); > + int* epi32_dst = (int*) malloc (N * sizeof (int)); > + long long* epi64_exp = (long long*) malloc (N * sizeof (int)); > + > + __builtin_memset (epi32_exp, 0, N * sizeof (int)); > + int b = 100; > + > + for (int i = 0; i != N / 2; i++) > + epi64_exp[i] = ((long long) b) | (((long long) -b) << 32); > + > + memcpy (epi32_exp, epi64_exp, N * sizeof (int)); > + foo_neg (epi32_dst, b); > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > + __builtin_abort (); > + > + for (int i = 0; i != N / 2; i++) > + epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32); > + > + memcpy (epi32_exp, epi64_exp, N * sizeof (int)); > + foo_neg_const (epi32_dst); > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > + __builtin_abort (); > + > + return; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c > new file mode 100644 > index 00000000000..51531495d60 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c > @@ -0,0 +1,34 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details 
-mprefer-vector-width=256" } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" } } */ > + > +#define N 10000 > +void > +foo_shl (int* a, int b) > +{ > + for (int i = 0; i != N; i++) > + { > + a[i] = b; > + b <<= 1; > + } > +} > + > +void > +foo_ashr (int* a, int b) > +{ > + for (int i = 0; i != N; i++) > + { > + a[i] = b; > + b >>= 1; > + } > +} > + > +void > +foo_lshr (unsigned int* a, unsigned int b) > +{ > + for (int i = 0; i != N; i++) > + { > + a[i] = b; > + b >>= 1U; > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c > new file mode 100644 > index 00000000000..02deda6d1ab > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c > @@ -0,0 +1,61 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ > +/* { dg-require-effective-target avx2 } */ > + > +#include "avx2-check.h" > +#include <string.h> > +#include "pr103144-shift-1.c" > + > +typedef int v8si __attribute__((vector_size(32))); > +typedef unsigned int v8usi __attribute__((vector_size(32))); > + > +void > +avx2_test (void) > +{ > + int* epi32_exp = (int*) malloc (N * sizeof (int)); > + int* epi32_dst = (int*) malloc (N * sizeof (int)); > + unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int)); > + unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int)); > + > + > + __builtin_memset (epi32_exp, 0, N * sizeof (int)); > + int b = 8; > + v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 7 }; > + > + for (int i = 0; i != N / 8; i++) > + { > + memcpy (epi32_exp + i * 8, &init, 32); > + init <<= 8; > + } > + > + foo_shl (epi32_dst, b); > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > + __builtin_abort (); > + > + b = 11111111; > + init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 }; > + for (int 
i = 0; i != N / 8; i++) > + { > + memcpy (epi32_exp + i * 8, &init, 32); > + init >>= 8; > + } > + > + foo_ashr (epi32_dst, b); > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > + __builtin_abort (); > + > + __builtin_memset (epu32_exp, 0, N * sizeof (int)); > + unsigned int c = 11111111; > + v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U }; > + for (int i = 0; i != N / 8; i++) > + { > + memcpy (epu32_exp + i * 8, &initu, 32); > + initu >>= 8U; > + } > + > + foo_lshr (epu32_dst, c); > + if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0) > + __builtin_abort (); > + > + return; > +} > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 2257b29a652..fc2a1584048 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -425,6 +425,98 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, > return true; > } > > +static bool > +vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info, > + gphi* loop_phi_node, tree *init, tree *step) > +{ > + tree init_expr, ev_expr, result, op1, op2; > + basic_block bb1, bb2; > + gimple* def; > + imm_use_iterator imm_iter; > + use_operand_p use_p; > + > + > + gcc_assert (gimple_code (loop_phi_node) == GIMPLE_PHI); gcc_assert (is_a <gphi *> (loop_phi_node)); is shorter > + > + if (gimple_phi_num_args (loop_phi_node) != 2) > + return false; > + > + init_expr = PHI_ARG_DEF (loop_phi_node, 0); > + ev_expr = PHI_ARG_DEF (loop_phi_node, 1); I think you should use init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node,loop_preheader_edge (loop)); and loop_latch_edge (loop) for ev_expr, you can't rely on them being arg 0 / 1. > + > + /* Only support nonlinear induction for integer. */ > + if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr))) > + return false; > + > + bb1 = gimple_phi_arg_edge (loop_phi_node, 0)->src; > + bb2 = gimple_phi_arg_edge (loop_phi_node, 1)->src; Likewise. 
Use preheader/latch edge. > + > + /* One outside the loop, the other inside the loop. */ > + if (!(flow_bb_inside_loop_p (loop, bb1) ^ flow_bb_inside_loop_p (loop, bb2))) > + return false; > + > + if (flow_bb_inside_loop_p (loop, bb1)) > + std::swap (init_expr, ev_expr); and those two should then go away. > + > + if (TREE_CODE (ev_expr) != SSA_NAME > + || !has_single_use (ev_expr) A comment might be nice - this makes sure we only see post-increment uses. But that also means for(;;) { b = -b; a[i] = b; } is not vectorized? I think it should be possible to relax this. > + || !(def = SSA_NAME_DEF_STMT (ev_expr)) def is never NULL so a cheaper way to write this is || ((def = SSA_NAME_DEF_STMT (ev_expr)), true) > + || !is_gimple_assign (def)) > + return false; > + > + *init = init_expr; > + result = PHI_RESULT (loop_phi_node); > + > + /* final_value_replacement_loop should handle outside loop use. */ > + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, result) > + { > + gimple *use_stmt; > + use_stmt = USE_STMT (use_p); > + if (is_gimple_debug (use_stmt)) > + continue; > + if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) > + return false; > + } not sure if we need to bother here - ideally vectorizable_live_operation would give up instead (but I suppose the regular IV / induction analysis gives up here as well?) 
> + > + enum tree_code t_code = gimple_assign_rhs_code (def); > + switch (t_code) > + { > + case NEGATE_EXPR: > + if (gimple_assign_rhs1 (def) != result) > + return false; > + *step = build_int_cst (TREE_TYPE (init_expr), -1); > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg; > + break; > + > + case RSHIFT_EXPR: > + case LSHIFT_EXPR: > + case MULT_EXPR: > + op1 = gimple_assign_rhs1 (def); > + op2 = gimple_assign_rhs2 (def); > + if (TREE_CODE (op2) != INTEGER_CST > + || op1 != result) > + return false; > + *step = op2; > + break; > + > + default: > + return false; > + } > + > + if (t_code == LSHIFT_EXPR) > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl; > + else if (t_code == RSHIFT_EXPR) > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr; > + /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */ > + else if (t_code == MULT_EXPR) > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul; the above can at least go into the combined switch case > + STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init; Seeing this - did you check whether you handle prologue peeling correctly? The same issue might show up with a vectorized epilogue. I think you can force a peeled prologue with storing unaligned and -fno-vect-cost-model (that IIRC will simply optimize for the larger number of aligned memory ops) > + STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step; > + > + return true; > +} > + > /* Return true if PHI, described by STMT_INFO, is the inner PHI in > what we are assuming is a double reduction. 
For example, given > a structure like this: > @@ -512,11 +604,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop, > = evolution_part_in_loop_num (access_fn, loop->num); > } > > - if (!access_fn > - || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) > - || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step) > - || (LOOP_VINFO_LOOP (loop_vinfo) != loop > - && TREE_CODE (step) != INTEGER_CST)) > + if ((!access_fn > + || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) > + || !vect_is_simple_iv_evolution (loop->num, access_fn, > + &init, &step) > + || (LOOP_VINFO_LOOP (loop_vinfo) != loop > + && TREE_CODE (step) != INTEGER_CST)) > + /* Only handle nonlinear iv for same loop. */ > + && (!vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, > + phi, &init, &step) > + || LOOP_VINFO_LOOP (loop_vinfo) != loop)) since you only handle inner loop nonlinear IVs you should probably swap the two checks? > { > worklist.safe_push (stmt_vinfo); > continue; > @@ -8229,6 +8326,496 @@ vect_can_vectorize_without_simd_p (code_helper code) > && vect_can_vectorize_without_simd_p (tree_code (code))); > } > > +/* Create vector init for vectorized iv. */ > +static tree > +vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, > + tree step_expr, poly_uint64 nunits, > + tree vectype, > + enum vect_induction_op_type induction_type) > +{ > + unsigned HOST_WIDE_INT const_nunits; > + tree vec_shift, vec_init, new_name; > + unsigned i; > + > + /* iv_loop is the loop to be vectorized. Create: > + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */ > + new_name = init_expr; > + switch (induction_type) > + { > + case vect_step_op_shr: > + case vect_step_op_shl: > + /* Build the Initial value from shift_expr. 
*/ > + vec_init = gimple_build_vector_from_val (stmts, > + vectype, > + new_name); > + vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype, > + build_zero_cst (TREE_TYPE (step_expr)), > + step_expr); There might be a more canonical way to build the series expr - Richard? > + vec_init = gimple_build (stmts, > + (induction_type == vect_step_op_shr > + ? RSHIFT_EXPR : LSHIFT_EXPR), > + vectype, vec_init, vec_shift); > + break; > + > + case vect_step_op_neg: > + { > + vec_init = gimple_build_vector_from_val (stmts, > + vectype, > + new_name); > + tree vec_neg = gimple_build (stmts, NEGATE_EXPR, > + vectype, vec_init); > + /* The encoding has 2 interleaved stepped patterns. */ > + vec_perm_builder sel (nunits, 2, 3); > + sel.quick_grow (6); > + for (i = 0; i < 3; i++) > + { > + sel[2 * i] = i; > + sel[2 * i + 1] = i + nunits; > + } > + vec_perm_indices indices (sel, 2, nunits); > + tree perm_mask_even > + = vect_gen_perm_mask_checked (vectype, indices); > + vec_init = gimple_build (stmts, VEC_PERM_EXPR, > + vectype, > + vec_init, vec_neg, > + perm_mask_even); > + } > + break; > + > + case vect_step_op_mul: > + { > + gcc_assert (nunits.is_constant (&const_nunits)); > + vec_init = gimple_build_vector_from_val (stmts, > + vectype, > + new_name); > + tree_vector_builder elts (vectype, const_nunits, 1); > + tree elt_step = build_one_cst (TREE_TYPE (step_expr)); > + > + elts.quick_push (elt_step); > + for (i = 1; i < const_nunits; i++) > + { > + /* Create: new_name_i = new_name + step_expr. */ > + elt_step = gimple_build (stmts, MULT_EXPR, > + TREE_TYPE (step_expr), > + elt_step, step_expr); You probably should perform this multiplication in an unsigned type to avoid overflow. > + elts.quick_push (elt_step); > + } > + /* Create a vector from [new_name_0, new_name_1, ..., > + new_name_nunits-1]. 
*/ > + tree vec_mul = gimple_build_vector (stmts, &elts); > + vec_init = gimple_build (stmts, MULT_EXPR, vectype, > + vec_init, vec_mul); > + } > + break; > + > + default: > + gcc_unreachable (); > + } > + > + return vec_init; > +} > + > +/* Create vector step for vectorized iv. */ > +static tree > +vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr, > + poly_uint64 vf, > + enum vect_induction_op_type induction_type) > +{ > + tree expr = build_int_cst (TREE_TYPE (step_expr), vf); > + tree new_name = NULL; > + /* Step should be pow (step, vf) for mult induction. */ > + if (induction_type == vect_step_op_mul) > + { > + gcc_assert (vf.is_constant ()); > + wide_int begin = wi::to_wide (step_expr); > + > + for (unsigned i = 0; i != vf.to_constant () - 1; i++) > + begin = wi::mul (begin, wi::to_wide (step_expr)); > + > + new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin); > + } > + else if (induction_type == vect_step_op_neg) > + /* Do nothing. */ > + ; > + else > + new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr), > + expr, step_expr); > + return new_name; > +} > + > +static tree > +vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo, > + stmt_vec_info stmt_info, > + tree new_name, tree vectype, > + enum vect_induction_op_type induction_type) > +{ > + /* No step is needed for neg induction. */ > + if (induction_type == vect_step_op_neg) > + return NULL; > + > + tree t = unshare_expr (new_name); > + gcc_assert (CONSTANT_CLASS_P (new_name) > + || TREE_CODE (new_name) == SSA_NAME); > + tree new_vec = build_vector_from_val (vectype, t); > + tree vec_step = vect_init_vector (loop_vinfo, stmt_info, > + new_vec, vectype, NULL); > + return vec_step; > +} > + > +/* Update vectorized iv with vect_step, induc_def is init. 
*/ > +static tree > +vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype, > + tree induc_def, tree vec_step, > + enum vect_induction_op_type induction_type) > +{ > + tree vec_def = induc_def; > + switch (induction_type) > + { > + case vect_step_op_mul: > + vec_def = gimple_build (stmts, MULT_EXPR, vectype, > + vec_def, vec_step); > + break; > + > + case vect_step_op_shr: > + vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype, > + vec_def, vec_step); > + break; > + > + case vect_step_op_shl: > + vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype, > + vec_def, vec_step); > + break; > + case vect_step_op_neg: > + vec_def = induc_def; > + /* Do nothing. */ > + break; > + default: > + gcc_unreachable (); > + } > + > + return vec_def; > + > +} > +/* Function vectorizable_induction > + > + Check if STMT_INFO performs an nonlinear induction computation that can be > + vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create > + a vectorized phi to replace it, put it in VEC_STMT, and add it to the same > + basic block. > + Return true if STMT_INFO is vectorizable in this way. 
*/ > + > +static bool > +vectorizable_nonlinear_induction (loop_vec_info loop_vinfo, > + stmt_vec_info stmt_info, > + gimple **vec_stmt, slp_tree slp_node, > + stmt_vector_for_cost *cost_vec) > +{ > + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + unsigned ncopies; > + bool nested_in_vect_loop = false; > + class loop *iv_loop; > + tree vec_def; > + edge pe = loop_preheader_edge (loop); > + basic_block new_bb; > + tree vec_init, vec_step; > + tree new_name; > + gimple *new_stmt; > + gphi *induction_phi; > + tree induc_def, vec_dest; > + tree init_expr, step_expr; > + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); > + unsigned i; > + gimple_stmt_iterator si; > + > + gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); > + > + tree vectype = STMT_VINFO_VECTYPE (stmt_info); > + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); > + enum vect_induction_op_type induction_type > + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); > + > + gcc_assert (induction_type > vect_step_op_add); > + > + if (slp_node) > + ncopies = 1; > + else > + ncopies = vect_get_num_copies (loop_vinfo, vectype); > + gcc_assert (ncopies >= 1); > + > + /* FORNOW. Only handle nonlinear induction in the same loop. 
*/ > + if (nested_in_vect_loop_p (loop, stmt_info)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "nonlinear induction in nested loop.\n"); > + return false; > + } > + > + iv_loop = loop; > + gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); > + > + if (slp_node) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "SLP induction not supported for nonlinear" > + " induction.\n"); > + return false; > + } > + > + if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Peel for alignment not supported for nonlinear" > + " induction.\n"); Note LOOP_VINFO_MASK_SKIP_NITERS doesn't capture all cases of prologue peeling. I _think_ that you eventually could check LOOP_VINFO_EPILOGUE_P (is this a vectorized epilogue) and LOOP_VINFO_PEELING_FOR_ALIGNMENT here instead. But do you see any difficulties in supporting this? > + return false; > + } > + > + if (FLOAT_TYPE_P (vectype)) It's always better to state what is supported "positively", so please use if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype))) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "floating point nonlinear induction vectorization" > + " not supported.\n"); > + return false; > + } > + > + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); > + init_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info); > + gcc_assert (step_expr != NULL_TREE && init_expr != NULL); > + /* step_expr should be aligned with init_expr, > + i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */ > + step_expr = fold_convert (TREE_TYPE (vectype), step_expr); > + gcc_assert (TREE_CODE (init_expr) == INTEGER_CST > + || (TYPE_CANONICAL (TREE_TYPE (vectype)) > + == TYPE_CANONICAL (TREE_TYPE (init_expr)))); use types_compatible_p (...) instead of comparing TYPE_CANONICAL. 
A small enhancement would be to support different signedness (use tree_nop_conversion_p then). > + gcc_assert (TREE_CODE (step_expr) == INTEGER_CST); > + init_expr = fold_convert (TREE_TYPE (vectype), init_expr); above you asserted that the conversion is only necessary for constants but then fold_convert will also generate a tree NOP_EXPR for some types_compatible_p types. So maybe only do this for INTEGER_CST init_expr or use init_expr = gimple_convert (...) and insert required stmts on the preheader. > + > + switch (induction_type) > + { > + case vect_step_op_neg: > + if (TREE_CODE (init_expr) != INTEGER_CST > + && TREE_CODE (init_expr) != REAL_CST) > + { > + /* Check for backend support of NEGATE_EXPR and vec_perm. */ > + if (!directly_supported_p (NEGATE_EXPR, vectype)) > + return false; > + > + /* The encoding has 2 interleaved stepped patterns. */ > + vec_perm_builder sel (nunits, 2, 3); > + machine_mode mode = TYPE_MODE (vectype); > + sel.quick_grow (6); > + for (i = 0; i < 3; i++) > + { > + sel[i * 2] = i; > + sel[i * 2 + 1] = i + nunits; > + } > + vec_perm_indices indices (sel, 2, nunits); > + if (!can_vec_perm_const_p (mode, mode, indices)) > + return false; > + } > + break; > + > + case vect_step_op_mul: > + { > + /* Check for backend support of MULT_EXPR. */ > + if (!directly_supported_p (MULT_EXPR, vectype)) > + return false; > + > + /* ?? How to construct vector step for variable number vector. > + [ 1, step, pow (step, 2), pow (step, 4), .. ]. */ > + if (!nunits.is_constant ()) > + return false; > + > + /* Make sure there's no overflow or overflow is defined. */ > + if (!TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (init_expr))) > + break; Alternatively you could perform the vector IV updates in an unsigned type? 
> + wi::overflow_type overflow; > + wide_int begin = wi::to_wide (step_expr); > + > + for (unsigned i = 0; i != nunits.to_constant () - 1; i++) > + { > + begin = wi::mul (begin, wi::to_wide (step_expr), > + TYPE_SIGN (TREE_TYPE (init_expr)), > + &overflow); > + if (overflow) > + return false; > + } > + } > + break; > + > + case vect_step_op_shr: > + /* Check for backend support of RSHIFT_EXPR. */ > + if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector)) > + return false; > + > + /* Don't shift more than type precision. */ > + if (!tree_fits_uhwi_p (step_expr) > + || maybe_ge (nunits * tree_to_uhwi (step_expr), > + TYPE_PRECISION (TREE_TYPE (init_expr)))) > + return false; why's that necessary? can we do a MIN (vector_step, { prec-1, prec-1, prec-1 ... }) to fix things up? The shift IVs degenerate quickly with a large niter (or VF). > + break; > + > + case vect_step_op_shl: > + /* Check for backend support of RSHIFT_EXPR. */ > + if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector)) > + return false; > + > + /* Don't shift more than type precision. */ > + if (!tree_fits_uhwi_p (step_expr) > + || maybe_ge (nunits * tree_to_uhwi (step_expr), > + TYPE_PRECISION (TREE_TYPE (init_expr)))) > + return false; Likewise. > + > + break; > + > + default: > + gcc_unreachable (); > + } > + > + if (!vec_stmt) /* transformation not required. */ > + { > + unsigned inside_cost = 0, prologue_cost = 0; > + /* loop cost for vec_loop. Neg induction doesn't have any > + inside_cost. */ > + inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, > + stmt_info, 0, vect_body); > + > + /* loop cost for vec_loop. Neg induction doesn't have any > + inside_cost. */ > + if (induction_type == vect_step_op_neg) > + inside_cost = 0; > + > + /* prologue cost for vec_init and vec_step. 
*/ > + prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, > + stmt_info, 0, vect_prologue); > + > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "vect_model_induction_cost: inside_cost = %d, " > + "prologue_cost = %d. \n", inside_cost, > + prologue_cost); > + > + STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; > + DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction"); > + return true; > + } > + > + /* Transform. */ > + > + /* Compute a vector variable, initialized with the first VF values of > + the induction variable. E.g., for an iv with IV_PHI='X' and > + evolution S, for a vector of 4 units, we want to compute: > + [X, X + S, X + 2*S, X + 3*S]. */ > + > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); > + > + pe = loop_preheader_edge (iv_loop); > + /* Find the first insertion point in the BB. */ > + basic_block bb = gimple_bb (phi); > + si = gsi_after_labels (bb); > + > + init_expr = vect_phi_initial_value (phi); > + > + gimple_seq stmts = NULL; > + vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr, > + step_expr, nunits, vectype, > + induction_type); > + if (stmts) > + { > + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); > + gcc_assert (!new_bb); > + } > + > + stmts = NULL; > + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr, > + vf, induction_type); > + if (stmts) > + { > + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); > + gcc_assert (!new_bb); > + } > + > + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, > + new_name, vectype, > + induction_type); > + /* Create the following def-use cycle: > + loop prolog: > + vec_init = ... > + vec_step = ... > + loop: > + vec_iv = PHI <vec_init, vec_loop> > + ... > + STMT > + ... > + vec_loop = vec_iv + vec_step; */ > + > + /* Create the induction-phi that defines the induction-operand. 
*/ > + vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); > + induction_phi = create_phi_node (vec_dest, iv_loop->header); > + induc_def = PHI_RESULT (induction_phi); > + > + /* Create the iv update inside the loop. */ > + stmts = NULL; > + vec_def = vect_update_nonlinear_iv (&stmts, vectype, > + induc_def, vec_step, > + induction_type); > + > + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); > + new_stmt = SSA_NAME_DEF_STMT (vec_def); > + > + /* Set the arguments of the phi node: */ > + add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); > + add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), > + UNKNOWN_LOCATION); > + > + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi); > + *vec_stmt = induction_phi; > + > + /* In case that vectorization factor (VF) is bigger than the number > + of elements that we can fit in a vectype (nunits), we have to generate > + more than one vector stmt - i.e - we need to "unroll" the > + vector stmt by a factor VF/nunits. For more details see documentation > + in vectorizable_operation. */ > + > + if (ncopies > 1) > + { > + stmts = NULL; > + /* FORNOW. This restriction should be relaxed. */ > + gcc_assert (!nested_in_vect_loop); > + > + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr, > + nunits, induction_type); > + > + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, > + new_name, vectype, > + induction_type); are these not the same as created above? > + vec_def = induc_def; > + for (i = 1; i < ncopies; i++) > + { > + /* vec_i = vec_prev + vec_step. */ > + stmts = NULL; > + vec_def = vect_update_nonlinear_iv (&stmts, vectype, > + vec_def, vec_step, > + induction_type); I think you need to update the PHI latch definition to the last vec_i? 
> + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); > + new_stmt = SSA_NAME_DEF_STMT (vec_def); > + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); > + } > + } > + > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "transform induction: created def-use cycle: %G%G", > + induction_phi, SSA_NAME_DEF_STMT (vec_def)); > + > + return true; > +} > + > /* Function vectorizable_induction > > Check if STMT_INFO performs an induction computation that can be vectorized. > @@ -8259,6 +8846,8 @@ vectorizable_induction (loop_vec_info loop_vinfo, > unsigned i; > tree expr; > gimple_stmt_iterator si; > + enum vect_induction_op_type induction_type > + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); > > gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); > if (!phi) > @@ -8271,6 +8860,11 @@ vectorizable_induction (loop_vec_info loop_vinfo, > if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) > return false; > > + /* Handle nonlinear induction in a separate place. */ > + if (induction_type != vect_step_op_add) > + return vectorizable_nonlinear_induction (loop_vinfo, stmt_info, > + vec_stmt, slp_node, cost_vec); > + > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index e5fdc9e0a14..0c7f1a32070 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -68,6 +68,15 @@ enum vect_def_type { > vect_unknown_def_type > }; > > +/* Define operation type of linear/non-linear induction variable. */ > +enum vect_induction_op_type { > + vect_step_op_add = 0, > + vect_step_op_neg, > + vect_step_op_mul, > + vect_step_op_shl, > + vect_step_op_shr > +}; > + > /* Define type of reduction. */ > enum vect_reduction_type { > TREE_CODE_REDUCTION, > @@ -1188,6 +1197,7 @@ public: > the version here. 
*/ > tree loop_phi_evolution_base_unchanged; > tree loop_phi_evolution_part; > + enum vect_induction_op_type loop_phi_evolution_type; > > /* Used for various bookkeeping purposes, generally holding a pointer to > some other stmt S that is in some way "related" to this stmt. > @@ -1421,6 +1431,7 @@ struct gather_scatter_info { > ((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S)) > #define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged > #define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part > +#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type > #define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist > #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type > #define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code > -- > 2.18.1 >
Richard Biener <richard.guenther@gmail.com> writes: >> +/* Create vector init for vectorized iv. */ >> +static tree >> +vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, >> + tree step_expr, poly_uint64 nunits, >> + tree vectype, >> + enum vect_induction_op_type induction_type) >> +{ >> + unsigned HOST_WIDE_INT const_nunits; >> + tree vec_shift, vec_init, new_name; >> + unsigned i; >> + >> + /* iv_loop is the loop to be vectorized. Create: >> + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */ >> + new_name = init_expr; >> + switch (induction_type) >> + { >> + case vect_step_op_shr: >> + case vect_step_op_shl: >> + /* Build the Initial value from shift_expr. */ >> + vec_init = gimple_build_vector_from_val (stmts, >> + vectype, >> + new_name); >> + vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype, >> + build_zero_cst (TREE_TYPE (step_expr)), >> + step_expr); > > There might be a more canonical way to build the series expr - Richard? build_vec_series is shorter if step_expr is known to be a constant. The above looks good for the general case. Thanks, Richard
On Thu, Aug 4, 2022 at 4:19 PM Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Thu, Aug 4, 2022 at 6:29 AM liuhongt via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > For neg, the patch creates a vec_init as [ a, -a, a, -a, ... ] and no > > vec_step is needed to update vectorized iv since vf is always multiple > > of 2 (negative * negative is positive). > > > > For shift, the patch creates a vec_init as [ a, a >> c, a >> 2*c, ..] > > as vec_step as [ c * nunits, c * nunits, c * nunits, ... ], vectorized iv is > > updated as vec_def = vec_init >>/<< vec_step. > > > > For mul, the patch creates a vec_init as [ a, a * c, a * pow(c, 2), ..] > > as vec_step as [ pow(c,nunits), pow(c,nunits),...] iv is updated as vec_def = > > vec_init * vec_step. > > > > The patch handles nonlinear iv for > > 1. Integer type only, floating point is not handled. > > 2. No slp_node. > > 3. iv_loop should be same as vector loop, not nested loop. > > 4. No UB is created; for mul, no UB overflow for pow (c, vf), for > > shift, shift count should be less than type precision. > > > > Bootstrapped and regression tested on x86_64-pc-linux-gnu{-m32,}. > > There are some cases observed in SPEC2017, but no big performance impact. > > > > Any comments? > > Looks good overall - a few comments inline. Also can you please add > SLP support? > I've tried hard to fill in gaps where SLP support is missing since my > goal is still to get > rid of non-SLP. > > > gcc/ChangeLog: > > > > PR tree-optimization/103144 > > * tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function. > > (vect_analyze_scalar_cycles_1): Detect nonlinear iv by upper function. > > (vect_create_nonlinear_iv_init): New function. > > (vect_create_nonlinear_iv_step): Ditto > > (vect_create_nonlinear_iv_vec_step): Ditto > > (vect_update_nonlinear_iv): Ditto > > (vectorizable_nonlinear_induction): Ditto. 
> > (vectorizable_induction): Call > > vectorizable_nonlinear_induction when induction_type is not > > vect_step_op_add. > > * tree-vectorizer.h (enum vect_induction_op_type): New enum. > > (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/pr103144-mul-1.c: New test. > > * gcc.target/i386/pr103144-mul-2.c: New test. > > * gcc.target/i386/pr103144-neg-1.c: New test. > > * gcc.target/i386/pr103144-neg-2.c: New test. > > * gcc.target/i386/pr103144-shift-1.c: New test. > > * gcc.target/i386/pr103144-shift-2.c: New test. > > --- > > .../gcc.target/i386/pr103144-mul-1.c | 25 + > > .../gcc.target/i386/pr103144-mul-2.c | 43 ++ > > .../gcc.target/i386/pr103144-neg-1.c | 25 + > > .../gcc.target/i386/pr103144-neg-2.c | 36 ++ > > .../gcc.target/i386/pr103144-shift-1.c | 34 + > > .../gcc.target/i386/pr103144-shift-2.c | 61 ++ > > gcc/tree-vect-loop.cc | 604 +++++++++++++++++- > > gcc/tree-vectorizer.h | 11 + > > 8 files changed, 834 insertions(+), 5 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c > > new file mode 100644 > > index 00000000000..2357541d95d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c > > @@ -0,0 +1,25 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > > + > > +#define N 10000 > > +void 
> > +foo_mul (int* a, int b) > > +{ > > + for (int i = 0; i != N; i++) > > + { > > + a[i] = b; > > + b *= 3; > > + } > > +} > > + > > +void > > +foo_mul_const (int* a) > > +{ > > + int b = 1; > > + for (int i = 0; i != N; i++) > > + { > > + a[i] = b; > > + b *= 3; > > + } > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c > > new file mode 100644 > > index 00000000000..4ea53e44658 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c > > @@ -0,0 +1,43 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ > > +/* { dg-require-effective-target avx2 } */ > > + > > +#include "avx2-check.h" > > +#include <string.h> > > +#include "pr103144-mul-1.c" > > + > > +typedef int v8si __attribute__((vector_size(32))); > > + > > +void > > +avx2_test (void) > > +{ > > + int* epi32_exp = (int*) malloc (N * sizeof (int)); > > + int* epi32_dst = (int*) malloc (N * sizeof (int)); > > + > > + __builtin_memset (epi32_exp, 0, N * sizeof (int)); > > + int b = 8; > > + v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 }; > > + > > + for (int i = 0; i != N / 8; i++) > > + { > > + memcpy (epi32_exp + i * 8, &init, 32); > > + init *= 6561; > > + } > > + > > + foo_mul (epi32_dst, b); > > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > > + __builtin_abort (); > > + > > + init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 }; > > + for (int i = 0; i != N / 8; i++) > > + { > > + memcpy (epi32_exp + i * 8, &init, 32); > > + init *= 6561; > > + } > > + > > + foo_mul_const (epi32_dst); > > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > > + __builtin_abort (); > > + > > + return; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c > > new file mode 100644 > > 
index 00000000000..63a3331563d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c > > @@ -0,0 +1,25 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > > + > > +#define N 10000 > > +void > > +foo_neg (int* a, int b) > > +{ > > + for (int i = 0; i != N; i++) > > + { > > + a[i] = b; > > + b = -b; > > + } > > +} > > + > > +void > > +foo_neg_const (int* a) > > +{ > > + int b = 1; > > + for (int i = 0; i != N; i++) > > + { > > + a[i] = b; > > + b = -b; > > + } > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c > > new file mode 100644 > > index 00000000000..13bc61cdce7 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c > > @@ -0,0 +1,36 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ > > +/* { dg-require-effective-target avx2 } */ > > + > > +#include "avx2-check.h" > > +#include <string.h> > > +#include "pr103144-neg-1.c" > > + > > +void > > +avx2_test (void) > > +{ > > + int* epi32_exp = (int*) malloc (N * sizeof (int)); > > + int* epi32_dst = (int*) malloc (N * sizeof (int)); > > + long long* epi64_exp = (long long*) malloc (N * sizeof (int)); > > + > > + __builtin_memset (epi32_exp, 0, N * sizeof (int)); > > + int b = 100; > > + > > + for (int i = 0; i != N / 2; i++) > > + epi64_exp[i] = ((long long) b) | (((long long) -b) << 32); > > + > > + memcpy (epi32_exp, epi64_exp, N * sizeof (int)); > > + foo_neg (epi32_dst, b); > > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > > + __builtin_abort (); > > + > > + for (int i = 0; i != N / 2; i++) > > + epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32); > > + > > + memcpy (epi32_exp, epi64_exp, N * 
sizeof (int)); > > + foo_neg_const (epi32_dst); > > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > > + __builtin_abort (); > > + > > + return; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c > > new file mode 100644 > > index 00000000000..51531495d60 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c > > @@ -0,0 +1,34 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" } } */ > > + > > +#define N 10000 > > +void > > +foo_shl (int* a, int b) > > +{ > > + for (int i = 0; i != N; i++) > > + { > > + a[i] = b; > > + b <<= 1; > > + } > > +} > > + > > +void > > +foo_ashr (int* a, int b) > > +{ > > + for (int i = 0; i != N; i++) > > + { > > + a[i] = b; > > + b >>= 1; > > + } > > +} > > + > > +void > > +foo_lshr (unsigned int* a, unsigned int b) > > +{ > > + for (int i = 0; i != N; i++) > > + { > > + a[i] = b; > > + b >>= 1U; > > + } > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c > > new file mode 100644 > > index 00000000000..02deda6d1ab > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c > > @@ -0,0 +1,61 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ > > +/* { dg-require-effective-target avx2 } */ > > + > > +#include "avx2-check.h" > > +#include <string.h> > > +#include "pr103144-shift-1.c" > > + > > +typedef int v8si __attribute__((vector_size(32))); > > +typedef unsigned int v8usi __attribute__((vector_size(32))); > > + > > +void > > +avx2_test (void) > > +{ > > + int* epi32_exp = (int*) malloc (N * sizeof (int)); > > + int* epi32_dst = (int*) malloc (N * sizeof (int)); > 
> + unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int)); > > + unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int)); > > + > > + > > + __builtin_memset (epi32_exp, 0, N * sizeof (int)); > > + int b = 8; > > + v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 7 }; > > + > > + for (int i = 0; i != N / 8; i++) > > + { > > + memcpy (epi32_exp + i * 8, &init, 32); > > + init <<= 8; > > + } > > + > > + foo_shl (epi32_dst, b); > > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > > + __builtin_abort (); > > + > > + b = 11111111; > > + init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 }; > > + for (int i = 0; i != N / 8; i++) > > + { > > + memcpy (epi32_exp + i * 8, &init, 32); > > + init >>= 8; > > + } > > + > > + foo_ashr (epi32_dst, b); > > + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) > > + __builtin_abort (); > > + > > + __builtin_memset (epu32_exp, 0, N * sizeof (int)); > > + unsigned int c = 11111111; > > + v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U }; > > + for (int i = 0; i != N / 8; i++) > > + { > > + memcpy (epu32_exp + i * 8, &initu, 32); > > + initu >>= 8U; > > + } > > + > > + foo_lshr (epu32_dst, c); > > + if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0) > > + __builtin_abort (); > > + > > + return; > > +} > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index 2257b29a652..fc2a1584048 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -425,6 +425,98 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, > > return true; > > } > > > > +static bool > > +vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info, > > + gphi* loop_phi_node, tree *init, tree *step) > > +{ > > + tree init_expr, ev_expr, result, op1, op2; > > + basic_block bb1, bb2; > > + 
gimple* def; > > + imm_use_iterator imm_iter; > > + use_operand_p use_p; > > + > > + > > + gcc_assert (gimple_code (loop_phi_node) == GIMPLE_PHI); > > gcc_assert (is_a <gphi *> (loop_phi_node)); > > is shorter > > > + > > + if (gimple_phi_num_args (loop_phi_node) != 2) > > + return false; > > + > > + init_expr = PHI_ARG_DEF (loop_phi_node, 0); > > + ev_expr = PHI_ARG_DEF (loop_phi_node, 1); > > I think you should use > > init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node,loop_preheader_edge (loop)); > and loop_latch_edge (loop) for ev_expr, you can't rely on them being arg 0 / 1. > > > + > > + /* Only support nonlinear induction for integer. */ > > + if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr))) > > + return false; > > + > > + bb1 = gimple_phi_arg_edge (loop_phi_node, 0)->src; > > + bb2 = gimple_phi_arg_edge (loop_phi_node, 1)->src; > > Likewise. Use preheader/latch edge. > > > + > > + /* One outside the loop, the other inside the loop. */ > > + if (!(flow_bb_inside_loop_p (loop, bb1) ^ flow_bb_inside_loop_p (loop, bb2))) > > + return false; > > + > > + if (flow_bb_inside_loop_p (loop, bb1)) > > + std::swap (init_expr, ev_expr); > > and those two should then go away. > > > + > > + if (TREE_CODE (ev_expr) != SSA_NAME > > + || !has_single_use (ev_expr) > > A comment might be nice - this makes sure we only > see post-increment uses. But that also means > > for(;;) > { > b = -b; > a[i] = b; > } > > is not vectorized? I think it should be possible to relax > this. > > > + || !(def = SSA_NAME_DEF_STMT (ev_expr)) > > def is never NULL so a cheaper way to write this is > > || ((def = SSA_NAME_DEF_STMT (ev_expr)), true) > > > + || !is_gimple_assign (def)) > > + return false; > > + > > + *init = init_expr; > > + result = PHI_RESULT (loop_phi_node); > > + > > + /* final_value_replacement_loop should handle outside loop use. 
*/ > > + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, result) > > + { > > + gimple *use_stmt; > > + use_stmt = USE_STMT (use_p); > > + if (is_gimple_debug (use_stmt)) > > + continue; > > + if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) > > + return false; > > + } > > not sure if we need to bother here - ideally vectorizable_live_operation would > give up instead (but I suppose the regular IV / induction analysis gives up > here as well?) > > > + > > + enum tree_code t_code = gimple_assign_rhs_code (def); > > + switch (t_code) > > + { > > + case NEGATE_EXPR: > > + if (gimple_assign_rhs1 (def) != result) > > + return false; > > + *step = build_int_cst (TREE_TYPE (init_expr), -1); > > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg; > > + break; > > + > > + case RSHIFT_EXPR: > > + case LSHIFT_EXPR: > > + case MULT_EXPR: > > + op1 = gimple_assign_rhs1 (def); > > + op2 = gimple_assign_rhs2 (def); > > + if (TREE_CODE (op2) != INTEGER_CST > > + || op1 != result) > > + return false; > > + *step = op2; > > + break; > > + > > + default: > > + return false; > > + } > > + > > + if (t_code == LSHIFT_EXPR) > > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl; > > + else if (t_code == RSHIFT_EXPR) > > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr; > > + /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */ > > + else if (t_code == MULT_EXPR) > > + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul; > > the above can at least go into the combined switch case > > > + STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init; > > Seeing this - did you check whether you handle prologue peeling correctly? The > same issue might show up with a vectorized epilogue. I think you can force a > peeled prologue with storing unaligned and -fno-vect-cost-model (that IIRC will > simply optimize for the larger number of aligned memory ops) Oh, I forgot to update vect_update_ivs_after_vectorizer. 
> > > + STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step; > > + > > + return true; > > +} > > + > > /* Return true if PHI, described by STMT_INFO, is the inner PHI in > > what we are assuming is a double reduction. For example, given > > a structure like this: > > @@ -512,11 +604,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop, > > = evolution_part_in_loop_num (access_fn, loop->num); > > } > > > > - if (!access_fn > > - || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) > > - || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step) > > - || (LOOP_VINFO_LOOP (loop_vinfo) != loop > > - && TREE_CODE (step) != INTEGER_CST)) > > + if ((!access_fn > > + || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) > > + || !vect_is_simple_iv_evolution (loop->num, access_fn, > > + &init, &step) > > + || (LOOP_VINFO_LOOP (loop_vinfo) != loop > > + && TREE_CODE (step) != INTEGER_CST)) > > + /* Only handle nonlinear iv for same loop. */ > > + && (!vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, > > + phi, &init, &step) > > + || LOOP_VINFO_LOOP (loop_vinfo) != loop)) > > since you only handle inner loop nonlinear IVs you should probably > swap the two checks? > > > { > > worklist.safe_push (stmt_vinfo); > > continue; > > @@ -8229,6 +8326,496 @@ vect_can_vectorize_without_simd_p (code_helper code) > > && vect_can_vectorize_without_simd_p (tree_code (code))); > > } > > > > +/* Create vector init for vectorized iv. */ > > +static tree > > +vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, > > + tree step_expr, poly_uint64 nunits, > > + tree vectype, > > + enum vect_induction_op_type induction_type) > > +{ > > + unsigned HOST_WIDE_INT const_nunits; > > + tree vec_shift, vec_init, new_name; > > + unsigned i; > > + > > + /* iv_loop is the loop to be vectorized. Create: > > + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). 
*/ > > + new_name = init_expr; > > + switch (induction_type) > > + { > > + case vect_step_op_shr: > > + case vect_step_op_shl: > > + /* Build the Initial value from shift_expr. */ > > + vec_init = gimple_build_vector_from_val (stmts, > > + vectype, > > + new_name); > > + vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype, > > + build_zero_cst (TREE_TYPE (step_expr)), > > + step_expr); > > There might be a more canonical way to build the series expr - Richard? > > > + vec_init = gimple_build (stmts, > > + (induction_type == vect_step_op_shr > > + ? RSHIFT_EXPR : LSHIFT_EXPR), > > + vectype, vec_init, vec_shift); > > + break; > > + > > + case vect_step_op_neg: > > + { > > + vec_init = gimple_build_vector_from_val (stmts, > > + vectype, > > + new_name); > > + tree vec_neg = gimple_build (stmts, NEGATE_EXPR, > > + vectype, vec_init); > > + /* The encoding has 2 interleaved stepped patterns. */ > > + vec_perm_builder sel (nunits, 2, 3); > > + sel.quick_grow (6); > > + for (i = 0; i < 3; i++) > > + { > > + sel[2 * i] = i; > > + sel[2 * i + 1] = i + nunits; > > + } > > + vec_perm_indices indices (sel, 2, nunits); > > + tree perm_mask_even > > + = vect_gen_perm_mask_checked (vectype, indices); > > + vec_init = gimple_build (stmts, VEC_PERM_EXPR, > > + vectype, > > + vec_init, vec_neg, > > + perm_mask_even); > > + } > > + break; > > + > > + case vect_step_op_mul: > > + { > > + gcc_assert (nunits.is_constant (&const_nunits)); > > + vec_init = gimple_build_vector_from_val (stmts, > > + vectype, > > + new_name); > > + tree_vector_builder elts (vectype, const_nunits, 1); > > + tree elt_step = build_one_cst (TREE_TYPE (step_expr)); > > + > > + elts.quick_push (elt_step); > > + for (i = 1; i < const_nunits; i++) > > + { > > + /* Create: new_name_i = new_name + step_expr. */ > > + elt_step = gimple_build (stmts, MULT_EXPR, > > + TREE_TYPE (step_expr), > > + elt_step, step_expr); > > You probably should perform this multiplication in an unsigned type to avoid > overflow. 
> > > + elts.quick_push (elt_step); > > + } > > + /* Create a vector from [new_name_0, new_name_1, ..., > > + new_name_nunits-1]. */ > > + tree vec_mul = gimple_build_vector (stmts, &elts); > > + vec_init = gimple_build (stmts, MULT_EXPR, vectype, > > + vec_init, vec_mul); > > + } > > + break; > > + > > + default: > > + gcc_unreachable (); > > + } > > + > > + return vec_init; > > +} > > + > > +/* Create vector step for vectorized iv. */ > > +static tree > > +vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr, > > + poly_uint64 vf, > > + enum vect_induction_op_type induction_type) > > +{ > > + tree expr = build_int_cst (TREE_TYPE (step_expr), vf); > > + tree new_name = NULL; > > + /* Step should be pow (step, vf) for mult induction. */ > > + if (induction_type == vect_step_op_mul) > > + { > > + gcc_assert (vf.is_constant ()); > > + wide_int begin = wi::to_wide (step_expr); > > + > > + for (unsigned i = 0; i != vf.to_constant () - 1; i++) > > + begin = wi::mul (begin, wi::to_wide (step_expr)); > > + > > + new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin); > > + } > > + else if (induction_type == vect_step_op_neg) > > + /* Do nothing. */ > > + ; > > + else > > + new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr), > > + expr, step_expr); > > + return new_name; > > +} > > + > > +static tree > > +vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo, > > + stmt_vec_info stmt_info, > > + tree new_name, tree vectype, > > + enum vect_induction_op_type induction_type) > > +{ > > + /* No step is needed for neg induction. 
*/ > > + if (induction_type == vect_step_op_neg) > > + return NULL; > > + > > + tree t = unshare_expr (new_name); > > + gcc_assert (CONSTANT_CLASS_P (new_name) > > + || TREE_CODE (new_name) == SSA_NAME); > > + tree new_vec = build_vector_from_val (vectype, t); > > + tree vec_step = vect_init_vector (loop_vinfo, stmt_info, > > + new_vec, vectype, NULL); > > + return vec_step; > > +} > > + > > +/* Update vectorized iv with vect_step, induc_def is init. */ > > +static tree > > +vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype, > > + tree induc_def, tree vec_step, > > + enum vect_induction_op_type induction_type) > > +{ > > + tree vec_def = induc_def; > > + switch (induction_type) > > + { > > + case vect_step_op_mul: > > + vec_def = gimple_build (stmts, MULT_EXPR, vectype, > > + vec_def, vec_step); > > + break; > > + > > + case vect_step_op_shr: > > + vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype, > > + vec_def, vec_step); > > + break; > > + > > + case vect_step_op_shl: > > + vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype, > > + vec_def, vec_step); > > + break; > > + case vect_step_op_neg: > > + vec_def = induc_def; > > + /* Do nothing. */ > > + break; > > + default: > > + gcc_unreachable (); > > + } > > + > > + return vec_def; > > + > > +} > > +/* Function vectorizable_induction > > + > > + Check if STMT_INFO performs an nonlinear induction computation that can be > > + vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create > > + a vectorized phi to replace it, put it in VEC_STMT, and add it to the same > > + basic block. > > + Return true if STMT_INFO is vectorizable in this way. 
*/ > > + > > +static bool > > +vectorizable_nonlinear_induction (loop_vec_info loop_vinfo, > > + stmt_vec_info stmt_info, > > + gimple **vec_stmt, slp_tree slp_node, > > + stmt_vector_for_cost *cost_vec) > > +{ > > + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > > + unsigned ncopies; > > + bool nested_in_vect_loop = false; > > + class loop *iv_loop; > > + tree vec_def; > > + edge pe = loop_preheader_edge (loop); > > + basic_block new_bb; > > + tree vec_init, vec_step; > > + tree new_name; > > + gimple *new_stmt; > > + gphi *induction_phi; > > + tree induc_def, vec_dest; > > + tree init_expr, step_expr; > > + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); > > + unsigned i; > > + gimple_stmt_iterator si; > > + > > + gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); > > + > > + tree vectype = STMT_VINFO_VECTYPE (stmt_info); > > + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); > > + enum vect_induction_op_type induction_type > > + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); > > + > > + gcc_assert (induction_type > vect_step_op_add); > > + > > + if (slp_node) > > + ncopies = 1; > > + else > > + ncopies = vect_get_num_copies (loop_vinfo, vectype); > > + gcc_assert (ncopies >= 1); > > + > > + /* FORNOW. Only handle nonlinear induction in the same loop. 
*/ > > + if (nested_in_vect_loop_p (loop, stmt_info)) > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > + "nonlinear induction in nested loop.\n"); > > + return false; > > + } > > + > > + iv_loop = loop; > > + gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); > > + > > + if (slp_node) > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > + "SLP induction not supported for nonlinear" > > + " induction.\n"); > > + return false; > > + } > > + > > + if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > + "Peel for alignment not supported for nonlinear" > > + " induction.\n"); > > Note LOOP_VINFO_MASK_SKIP_NITERS doesn't capture all cases of > prologue peeling. I _think_ that you eventually could check > LOOP_VINFO_EPILOGUE_P (is this a vectorized epilouge) and > LOOP_VINFO_PEELING_FOR_ALIGNMENT here instead. But do you > see any difficulties in supporting this? > > > + return false; > > + } > > + > > + if (FLOAT_TYPE_P (vectype)) > > It's always better to state what is supported "positively", so please use > > if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype))) > > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > + "floating point nonlinear induction vectorization" > > + " not supported.\n"); > > + return false; > > + } > > + > > + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); > > + init_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info); > > + gcc_assert (step_expr != NULL_TREE && init_expr != NULL); > > + /* step_expr should be aligned with init_expr, > > + .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. 
*/ > > + step_expr = fold_convert (TREE_TYPE (vectype), step_expr); > > + gcc_assert (TREE_CODE (init_expr) == INTEGER_CST > > + || (TYPE_CANONICAL (TREE_TYPE (vectype)) > > + == TYPE_CANONICAL (TREE_TYPE (init_expr)))); > > use types_compatible_p (...) instead of comparing TYPE_CANONICAL. > A small enhancement would be to support different signedness > (use tree_nop_conversion_p then). > > > + gcc_assert (TREE_CODE (step_expr) == INTEGER_CST); > > + init_expr = fold_convert (TREE_TYPE (vectype), init_expr); > > above you asserted that the conversion is only necessary for constants > but then fold_convert will also generate a tree NOP_EXPR for > some types_compatible_p types. So maybe only do this for INTEGER_CST > init_expr or use init_expr = gimple_convert (...) and insert required stmts > on the preheader. > > > + > > + switch (induction_type) > > + { > > + case vect_step_op_neg: > > + if (TREE_CODE (init_expr) != INTEGER_CST > > + && TREE_CODE (init_expr) != REAL_CST) > > + { > > + /* Check for backend support of NEGATE_EXPR and vec_perm. */ > > + if (!directly_supported_p (NEGATE_EXPR, vectype)) > > + return false; > > + > > + /* The encoding has 2 interleaved stepped patterns. */ > > + vec_perm_builder sel (nunits, 2, 3); > > + machine_mode mode = TYPE_MODE (vectype); > > + sel.quick_grow (6); > > + for (i = 0; i < 3; i++) > > + { > > + sel[i * 2] = i; > > + sel[i * 2 + 1] = i + nunits; > > + } > > + vec_perm_indices indices (sel, 2, nunits); > > + if (!can_vec_perm_const_p (mode, mode, indices)) > > + return false; > > + } > > + break; > > + > > + case vect_step_op_mul: > > + { > > + /* Check for backend support of MULT_EXPR. */ > > + if (!directly_supported_p (MULT_EXPR, vectype)) > > + return false; > > + > > + /* ?? How to construct vector step for variable number vector. > > + [ 1, step, pow (step, 2), pow (step, 4), .. ]. */ > > + if (!nunits.is_constant ()) > > + return false; > > + > > + /* Make sure there's no overflow or overflow is defined. 
*/ > > + if (!TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (init_expr))) > > + break; > > Alternatively you could perform the vector IV updates in an unsigned type? > > > + wi::overflow_type overflow; > > + wide_int begin = wi::to_wide (step_expr); > > + > > + for (unsigned i = 0; i != nunits.to_constant () - 1; i++) > > + { > > + begin = wi::mul (begin, wi::to_wide (step_expr), > > + TYPE_SIGN (TREE_TYPE (init_expr)), > > + &overflow); > > + if (overflow) > > + return false; > > + } > > + } > > + break; > > + > > + case vect_step_op_shr: > > + /* Check for backend support of RSHIFT_EXPR. */ > > + if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector)) > > + return false; > > + > > + /* Don't shift more than type precision. */ > > + if (!tree_fits_uhwi_p (step_expr) > > + || maybe_ge (nunits * tree_to_uhwi (step_expr), > > + TYPE_PRECISION (TREE_TYPE (init_expr)))) > > + return false; > > why's that necessary? can we do a MIN (vector_step, { prec-1, prec-1, > prec-1 ... }) > to fix things up? The shift IVs degenerate quickly with a large niter (or VF). > > > + break; > > + > > + case vect_step_op_shl: > > + /* Check for backend support of RSHIFT_EXPR. */ > > + if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector)) > > + return false; > > + > > + /* Don't shift more than type precision. */ > > + if (!tree_fits_uhwi_p (step_expr) > > + || maybe_ge (nunits * tree_to_uhwi (step_expr), > > + TYPE_PRECISION (TREE_TYPE (init_expr)))) > > + return false; > > Likewise. > > > + > > + break; > > + > > + default: > > + gcc_unreachable (); > > + } > > + > > + if (!vec_stmt) /* transformation not required. */ > > + { > > + unsigned inside_cost = 0, prologue_cost = 0; > > + /* loop cost for vec_loop. Neg induction doesn't have any > > + inside_cost. */ > > + inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, > > + stmt_info, 0, vect_body); > > + > > + /* loop cost for vec_loop. Neg induction doesn't have any > > + inside_cost. 
*/ > > + if (induction_type == vect_step_op_neg) > > + inside_cost = 0; > > + > > + /* prologue cost for vec_init and vec_step. */ > > + prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, > > + stmt_info, 0, vect_prologue); > > + > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "vect_model_induction_cost: inside_cost = %d, " > > + "prologue_cost = %d. \n", inside_cost, > > + prologue_cost); > > + > > + STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; > > + DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction"); > > + return true; > > + } > > + > > + /* Transform. */ > > + > > + /* Compute a vector variable, initialized with the first VF values of > > + the induction variable. E.g., for an iv with IV_PHI='X' and > > + evolution S, for a vector of 4 units, we want to compute: > > + [X, X + S, X + 2*S, X + 3*S]. */ > > + > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); > > + > > + pe = loop_preheader_edge (iv_loop); > > + /* Find the first insertion point in the BB. */ > > + basic_block bb = gimple_bb (phi); > > + si = gsi_after_labels (bb); > > + > > + init_expr = vect_phi_initial_value (phi); > > + > > + gimple_seq stmts = NULL; > > + vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr, > > + step_expr, nunits, vectype, > > + induction_type); > > + if (stmts) > > + { > > + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); > > + gcc_assert (!new_bb); > > + } > > + > > + stmts = NULL; > > + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr, > > + vf, induction_type); > > + if (stmts) > > + { > > + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); > > + gcc_assert (!new_bb); > > + } > > + > > + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, > > + new_name, vectype, > > + induction_type); > > + /* Create the following def-use cycle: > > + loop prolog: > > + vec_init = ... > > + vec_step = ... 
> > + loop: > > + vec_iv = PHI <vec_init, vec_loop> > > + ... > > + STMT > > + ... > > + vec_loop = vec_iv + vec_step; */ > > + > > + /* Create the induction-phi that defines the induction-operand. */ > > + vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); > > + induction_phi = create_phi_node (vec_dest, iv_loop->header); > > + induc_def = PHI_RESULT (induction_phi); > > + > > + /* Create the iv update inside the loop. */ > > + stmts = NULL; > > + vec_def = vect_update_nonlinear_iv (&stmts, vectype, > > + induc_def, vec_step, > > + induction_type); > > + > > + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); > > + new_stmt = SSA_NAME_DEF_STMT (vec_def); > > + > > + /* Set the arguments of the phi node: */ > > + add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); > > + add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), > > + UNKNOWN_LOCATION); > > + > > + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi); > > + *vec_stmt = induction_phi; > > + > > + /* In case that vectorization factor (VF) is bigger than the number > > + of elements that we can fit in a vectype (nunits), we have to generate > > + more than one vector stmt - i.e - we need to "unroll" the > > + vector stmt by a factor VF/nunits. For more details see documentation > > + in vectorizable_operation. */ > > + > > + if (ncopies > 1) > > + { > > + stmts = NULL; > > + /* FORNOW. This restriction should be relaxed. */ > > + gcc_assert (!nested_in_vect_loop); > > + > > + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr, > > + nunits, induction_type); > > + > > + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, > > + new_name, vectype, > > + induction_type); > > are these not the same as created above? > > > + vec_def = induc_def; > > + for (i = 1; i < ncopies; i++) > > + { > > + /* vec_i = vec_prev + vec_step. 
*/ > > + stmts = NULL; > > + vec_def = vect_update_nonlinear_iv (&stmts, vectype, > > + vec_def, vec_step, > > + induction_type); > > I think you need to update the PHI latch definition to the last vec_i? > > > + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); > > + new_stmt = SSA_NAME_DEF_STMT (vec_def); > > + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); > > + } > > + } > > + > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "transform induction: created def-use cycle: %G%G", > > + induction_phi, SSA_NAME_DEF_STMT (vec_def)); > > + > > + return true; > > +} > > + > > /* Function vectorizable_induction > > > > Check if STMT_INFO performs an induction computation that can be vectorized. > > @@ -8259,6 +8846,8 @@ vectorizable_induction (loop_vec_info loop_vinfo, > > unsigned i; > > tree expr; > > gimple_stmt_iterator si; > > + enum vect_induction_op_type induction_type > > + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); > > > > gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); > > if (!phi) > > @@ -8271,6 +8860,11 @@ vectorizable_induction (loop_vec_info loop_vinfo, > > if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) > > return false; > > > > + /* Handle nonlinear induction in a separate place. */ > > + if (induction_type != vect_step_op_add) > > + return vectorizable_nonlinear_induction (loop_vinfo, stmt_info, > > + vec_stmt, slp_node, cost_vec); > > + > > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > > poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); > > > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > > index e5fdc9e0a14..0c7f1a32070 100644 > > --- a/gcc/tree-vectorizer.h > > +++ b/gcc/tree-vectorizer.h > > @@ -68,6 +68,15 @@ enum vect_def_type { > > vect_unknown_def_type > > }; > > > > +/* Define operation type of linear/non-linear induction variable. 
*/ > > +enum vect_induction_op_type { > > + vect_step_op_add = 0, > > + vect_step_op_neg, > > + vect_step_op_mul, > > + vect_step_op_shl, > > + vect_step_op_shr > > +}; > > + > > /* Define type of reduction. */ > > enum vect_reduction_type { > > TREE_CODE_REDUCTION, > > @@ -1188,6 +1197,7 @@ public: > > the version here. */ > > tree loop_phi_evolution_base_unchanged; > > tree loop_phi_evolution_part; > > + enum vect_induction_op_type loop_phi_evolution_type; > > > > /* Used for various bookkeeping purposes, generally holding a pointer to > > some other stmt S that is in some way "related" to this stmt. > > @@ -1421,6 +1431,7 @@ struct gather_scatter_info { > > ((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S)) > > #define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged > > #define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part > > +#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type > > #define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist > > #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type > > #define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code > > -- > > 2.18.1 > > -- BR, Hongtao
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c new file mode 100644 index 00000000000..2357541d95d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ + +#define N 10000 +void +foo_mul (int* a, int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b *= 3; + } +} + +void +foo_mul_const (int* a) +{ + int b = 1; + for (int i = 0; i != N; i++) + { + a[i] = b; + b *= 3; + } +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c new file mode 100644 index 00000000000..4ea53e44658 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c @@ -0,0 +1,43 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx2 } */ + +#include "avx2-check.h" +#include <string.h> +#include "pr103144-mul-1.c" + +typedef int v8si __attribute__((vector_size(32))); + +void +avx2_test (void) +{ + int* epi32_exp = (int*) malloc (N * sizeof (int)); + int* epi32_dst = (int*) malloc (N * sizeof (int)); + + __builtin_memset (epi32_exp, 0, N * sizeof (int)); + int b = 8; + v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 }; + + for (int i = 0; i != N / 8; i++) + { + memcpy (epi32_exp + i * 8, &init, 32); + init *= 6561; + } + + foo_mul (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 }; + for (int i = 0; i != N / 8; i++) + { + memcpy (epi32_exp + i * 8, &init, 32); + init *= 6561; + } + + foo_mul_const (epi32_dst); + if (__builtin_memcmp 
(epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + return; +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c new file mode 100644 index 00000000000..63a3331563d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ + +#define N 10000 +void +foo_neg (int* a, int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b = -b; + } +} + +void +foo_neg_const (int* a) +{ + int b = 1; + for (int i = 0; i != N; i++) + { + a[i] = b; + b = -b; + } +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c new file mode 100644 index 00000000000..13bc61cdce7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c @@ -0,0 +1,36 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx2 } */ + +#include "avx2-check.h" +#include <string.h> +#include "pr103144-neg-1.c" + +void +avx2_test (void) +{ + int* epi32_exp = (int*) malloc (N * sizeof (int)); + int* epi32_dst = (int*) malloc (N * sizeof (int)); + long long* epi64_exp = (long long*) malloc (N * sizeof (int)); + + __builtin_memset (epi32_exp, 0, N * sizeof (int)); + int b = 100; + + for (int i = 0; i != N / 2; i++) + epi64_exp[i] = ((long long) b) | (((long long) -b) << 32); + + memcpy (epi32_exp, epi64_exp, N * sizeof (int)); + foo_neg (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + for (int i = 0; i != N / 2; i++) + epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32); + + memcpy (epi32_exp, epi64_exp, N * sizeof (int)); + foo_neg_const 
(epi32_dst); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + return; +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c new file mode 100644 index 00000000000..51531495d60 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" } } */ + +#define N 10000 +void +foo_shl (int* a, int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b <<= 1; + } +} + +void +foo_ashr (int* a, int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b >>= 1; + } +} + +void +foo_lshr (unsigned int* a, unsigned int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b >>= 1U; + } +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c new file mode 100644 index 00000000000..02deda6d1ab --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c @@ -0,0 +1,61 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx2 } */ + +#include "avx2-check.h" +#include <string.h> +#include "pr103144-shift-1.c" + +typedef int v8si __attribute__((vector_size(32))); +typedef unsigned int v8usi __attribute__((vector_size(32))); + +void +avx2_test (void) +{ + int* epi32_exp = (int*) malloc (N * sizeof (int)); + int* epi32_dst = (int*) malloc (N * sizeof (int)); + unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int)); + unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int)); + + + __builtin_memset (epi32_exp, 0, N * sizeof (int)); + int b = 8; + v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 
7 }; + + for (int i = 0; i != N / 8; i++) + { + memcpy (epi32_exp + i * 8, &init, 32); + init <<= 8; + } + + foo_shl (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + b = 11111111; + init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 }; + for (int i = 0; i != N / 8; i++) + { + memcpy (epi32_exp + i * 8, &init, 32); + init >>= 8; + } + + foo_ashr (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + __builtin_memset (epu32_exp, 0, N * sizeof (int)); + unsigned int c = 11111111; + v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U }; + for (int i = 0; i != N / 8; i++) + { + memcpy (epu32_exp + i * 8, &initu, 32); + initu >>= 8U; + } + + foo_lshr (epu32_dst, c); + if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + return; +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 2257b29a652..fc2a1584048 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -425,6 +425,98 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, return true; } +static bool +vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info, + gphi* loop_phi_node, tree *init, tree *step) +{ + tree init_expr, ev_expr, result, op1, op2; + basic_block bb1, bb2; + gimple* def; + imm_use_iterator imm_iter; + use_operand_p use_p; + + + gcc_assert (gimple_code (loop_phi_node) == GIMPLE_PHI); + + if (gimple_phi_num_args (loop_phi_node) != 2) + return false; + + init_expr = PHI_ARG_DEF (loop_phi_node, 0); + ev_expr = PHI_ARG_DEF (loop_phi_node, 1); + + /* Only support nonlinear induction for integer. 
*/ + if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr))) + return false; + + bb1 = gimple_phi_arg_edge (loop_phi_node, 0)->src; + bb2 = gimple_phi_arg_edge (loop_phi_node, 1)->src; + + /* One outside the loop, the other inside the loop. */ + if (!(flow_bb_inside_loop_p (loop, bb1) ^ flow_bb_inside_loop_p (loop, bb2))) + return false; + + if (flow_bb_inside_loop_p (loop, bb1)) + std::swap (init_expr, ev_expr); + + if (TREE_CODE (ev_expr) != SSA_NAME + || !has_single_use (ev_expr) + || !(def = SSA_NAME_DEF_STMT (ev_expr)) + || !is_gimple_assign (def)) + return false; + + *init = init_expr; + result = PHI_RESULT (loop_phi_node); + + /* final_value_replacement_loop should handle outside loop use. */ + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, result) + { + gimple *use_stmt; + use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) + continue; + if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) + return false; + } + + enum tree_code t_code = gimple_assign_rhs_code (def); + switch (t_code) + { + case NEGATE_EXPR: + if (gimple_assign_rhs1 (def) != result) + return false; + *step = build_int_cst (TREE_TYPE (init_expr), -1); + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg; + break; + + case RSHIFT_EXPR: + case LSHIFT_EXPR: + case MULT_EXPR: + op1 = gimple_assign_rhs1 (def); + op2 = gimple_assign_rhs2 (def); + if (TREE_CODE (op2) != INTEGER_CST + || op1 != result) + return false; + *step = op2; + break; + + default: + return false; + } + + if (t_code == LSHIFT_EXPR) + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl; + else if (t_code == RSHIFT_EXPR) + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr; + /* MULT_EXPR maps to vect_step_op_mul; NEGATE_EXPR was already + classified as vect_step_op_neg above. 
*/ + else if (t_code == MULT_EXPR) + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul; + + STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init; + STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step; + + return true; +} + /* Return true if PHI, described by STMT_INFO, is the inner PHI in what we are assuming is a double reduction. For example, given a structure like this: @@ -512,11 +604,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop, = evolution_part_in_loop_num (access_fn, loop->num); } - if (!access_fn - || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) - || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step) - || (LOOP_VINFO_LOOP (loop_vinfo) != loop - && TREE_CODE (step) != INTEGER_CST)) + if ((!access_fn + || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) + || !vect_is_simple_iv_evolution (loop->num, access_fn, + &init, &step) + || (LOOP_VINFO_LOOP (loop_vinfo) != loop + && TREE_CODE (step) != INTEGER_CST)) + /* Only handle nonlinear iv for same loop. */ + && (!vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, + phi, &init, &step) + || LOOP_VINFO_LOOP (loop_vinfo) != loop)) { worklist.safe_push (stmt_vinfo); continue; @@ -8229,6 +8326,496 @@ vect_can_vectorize_without_simd_p (code_helper code) && vect_can_vectorize_without_simd_p (tree_code (code))); } +/* Create vector init for vectorized iv. */ +static tree +vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, + tree step_expr, poly_uint64 nunits, + tree vectype, + enum vect_induction_op_type induction_type) +{ + unsigned HOST_WIDE_INT const_nunits; + tree vec_shift, vec_init, new_name; + unsigned i; + + /* iv_loop is the loop to be vectorized. Create: + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */ + new_name = init_expr; + switch (induction_type) + { + case vect_step_op_shr: + case vect_step_op_shl: + /* Build the Initial value from shift_expr. 
*/ + vec_init = gimple_build_vector_from_val (stmts, + vectype, + new_name); + vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype, + build_zero_cst (TREE_TYPE (step_expr)), + step_expr); + vec_init = gimple_build (stmts, + (induction_type == vect_step_op_shr + ? RSHIFT_EXPR : LSHIFT_EXPR), + vectype, vec_init, vec_shift); + break; + + case vect_step_op_neg: + { + vec_init = gimple_build_vector_from_val (stmts, + vectype, + new_name); + tree vec_neg = gimple_build (stmts, NEGATE_EXPR, + vectype, vec_init); + /* The encoding has 2 interleaved stepped patterns. */ + vec_perm_builder sel (nunits, 2, 3); + sel.quick_grow (6); + for (i = 0; i < 3; i++) + { + sel[2 * i] = i; + sel[2 * i + 1] = i + nunits; + } + vec_perm_indices indices (sel, 2, nunits); + tree perm_mask_even + = vect_gen_perm_mask_checked (vectype, indices); + vec_init = gimple_build (stmts, VEC_PERM_EXPR, + vectype, + vec_init, vec_neg, + perm_mask_even); + } + break; + + case vect_step_op_mul: + { + gcc_assert (nunits.is_constant (&const_nunits)); + vec_init = gimple_build_vector_from_val (stmts, + vectype, + new_name); + tree_vector_builder elts (vectype, const_nunits, 1); + tree elt_step = build_one_cst (TREE_TYPE (step_expr)); + + elts.quick_push (elt_step); + for (i = 1; i < const_nunits; i++) + { + /* Create: elt_step_i = elt_step_{i-1} * step_expr. */ + elt_step = gimple_build (stmts, MULT_EXPR, + TREE_TYPE (step_expr), + elt_step, step_expr); + elts.quick_push (elt_step); + } + /* Create a vector from [new_name_0, new_name_1, ..., + new_name_nunits-1]. */ + tree vec_mul = gimple_build_vector (stmts, &elts); + vec_init = gimple_build (stmts, MULT_EXPR, vectype, + vec_init, vec_mul); + } + break; + + default: + gcc_unreachable (); + } + + return vec_init; +} + +/* Create vector step for vectorized iv. 
*/ +static tree +vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr, + poly_uint64 vf, + enum vect_induction_op_type induction_type) +{ + tree expr = build_int_cst (TREE_TYPE (step_expr), vf); + tree new_name = NULL; + /* Step should be pow (step, vf) for mult induction. */ + if (induction_type == vect_step_op_mul) + { + gcc_assert (vf.is_constant ()); + wide_int begin = wi::to_wide (step_expr); + + for (unsigned i = 0; i != vf.to_constant () - 1; i++) + begin = wi::mul (begin, wi::to_wide (step_expr)); + + new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin); + } + else if (induction_type == vect_step_op_neg) + /* Do nothing. */ + ; + else + new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr), + expr, step_expr); + return new_name; +} + +static tree +vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo, + stmt_vec_info stmt_info, + tree new_name, tree vectype, + enum vect_induction_op_type induction_type) +{ + /* No step is needed for neg induction. */ + if (induction_type == vect_step_op_neg) + return NULL; + + tree t = unshare_expr (new_name); + gcc_assert (CONSTANT_CLASS_P (new_name) + || TREE_CODE (new_name) == SSA_NAME); + tree new_vec = build_vector_from_val (vectype, t); + tree vec_step = vect_init_vector (loop_vinfo, stmt_info, + new_vec, vectype, NULL); + return vec_step; +} + +/* Update vectorized iv with vect_step, induc_def is init. 
*/ +static tree +vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype, + tree induc_def, tree vec_step, + enum vect_induction_op_type induction_type) +{ + tree vec_def = induc_def; + switch (induction_type) + { + case vect_step_op_mul: + vec_def = gimple_build (stmts, MULT_EXPR, vectype, + vec_def, vec_step); + break; + + case vect_step_op_shr: + vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype, + vec_def, vec_step); + break; + + case vect_step_op_shl: + vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype, + vec_def, vec_step); + break; + case vect_step_op_neg: + vec_def = induc_def; + /* Do nothing. */ + break; + default: + gcc_unreachable (); + } + + return vec_def; + +} +/* Function vectorizable_nonlinear_induction + + Check if STMT_INFO performs a nonlinear induction computation that can be + vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create + a vectorized phi to replace it, put it in VEC_STMT, and add it to the same + basic block. + Return true if STMT_INFO is vectorizable in this way. 
*/ + +static bool +vectorizable_nonlinear_induction (loop_vec_info loop_vinfo, + stmt_vec_info stmt_info, + gimple **vec_stmt, slp_tree slp_node, + stmt_vector_for_cost *cost_vec) +{ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + unsigned ncopies; + bool nested_in_vect_loop = false; + class loop *iv_loop; + tree vec_def; + edge pe = loop_preheader_edge (loop); + basic_block new_bb; + tree vec_init, vec_step; + tree new_name; + gimple *new_stmt; + gphi *induction_phi; + tree induc_def, vec_dest; + tree init_expr, step_expr; + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + unsigned i; + gimple_stmt_iterator si; + + gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); + + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); + enum vect_induction_op_type induction_type + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); + + gcc_assert (induction_type > vect_step_op_add); + + if (slp_node) + ncopies = 1; + else + ncopies = vect_get_num_copies (loop_vinfo, vectype); + gcc_assert (ncopies >= 1); + + /* FORNOW. Only handle nonlinear induction in the same loop. 
*/ + if (nested_in_vect_loop_p (loop, stmt_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "nonlinear induction in nested loop.\n"); + return false; + } + + iv_loop = loop; + gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); + + if (slp_node) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "SLP induction not supported for nonlinear" + " induction.\n"); + return false; + } + + if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Peel for alignment not supported for nonlinear" + " induction.\n"); + return false; + } + + if (FLOAT_TYPE_P (vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "floating point nonlinear induction vectorization" + " not supported.\n"); + return false; + } + + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); + init_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info); + gcc_assert (step_expr != NULL_TREE && init_expr != NULL); + /* step_expr should be aligned with init_expr, + i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */ + step_expr = fold_convert (TREE_TYPE (vectype), step_expr); + gcc_assert (TREE_CODE (init_expr) == INTEGER_CST + || (TYPE_CANONICAL (TREE_TYPE (vectype)) + == TYPE_CANONICAL (TREE_TYPE (init_expr)))); + gcc_assert (TREE_CODE (step_expr) == INTEGER_CST); + init_expr = fold_convert (TREE_TYPE (vectype), init_expr); + + switch (induction_type) + { + case vect_step_op_neg: + if (TREE_CODE (init_expr) != INTEGER_CST + && TREE_CODE (init_expr) != REAL_CST) + { + /* Check for backend support of NEGATE_EXPR and vec_perm. */ + if (!directly_supported_p (NEGATE_EXPR, vectype)) + return false; + + /* The encoding has 2 interleaved stepped patterns. 
*/ + vec_perm_builder sel (nunits, 2, 3); + machine_mode mode = TYPE_MODE (vectype); + sel.quick_grow (6); + for (i = 0; i < 3; i++) + { + sel[i * 2] = i; + sel[i * 2 + 1] = i + nunits; + } + vec_perm_indices indices (sel, 2, nunits); + if (!can_vec_perm_const_p (mode, mode, indices)) + return false; + } + break; + + case vect_step_op_mul: + { + /* Check for backend support of MULT_EXPR. */ + if (!directly_supported_p (MULT_EXPR, vectype)) + return false; + + /* ?? How to construct vector step for variable number vector. + [ 1, step, pow (step, 2), pow (step, 4), .. ]. */ + if (!nunits.is_constant ()) + return false; + + /* Make sure there's no overflow or overflow is defined. */ + if (!TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (init_expr))) + break; + + wi::overflow_type overflow; + wide_int begin = wi::to_wide (step_expr); + + for (unsigned i = 0; i != nunits.to_constant () - 1; i++) + { + begin = wi::mul (begin, wi::to_wide (step_expr), + TYPE_SIGN (TREE_TYPE (init_expr)), + &overflow); + if (overflow) + return false; + } + } + break; + + case vect_step_op_shr: + /* Check for backend support of RSHIFT_EXPR. */ + if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector)) + return false; + + /* Don't shift more than type precision. */ + if (!tree_fits_uhwi_p (step_expr) + || maybe_ge (nunits * tree_to_uhwi (step_expr), + TYPE_PRECISION (TREE_TYPE (init_expr)))) + return false; + break; + + case vect_step_op_shl: + /* Check for backend support of LSHIFT_EXPR. */ + if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector)) + return false; + + /* Don't shift more than type precision. */ + if (!tree_fits_uhwi_p (step_expr) + || maybe_ge (nunits * tree_to_uhwi (step_expr), + TYPE_PRECISION (TREE_TYPE (init_expr)))) + return false; + + break; + + default: + gcc_unreachable (); + } + + if (!vec_stmt) /* transformation not required. */ + { + unsigned inside_cost = 0, prologue_cost = 0; + /* loop cost for vec_loop. Neg induction doesn't have any + inside_cost. 
*/ + inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, + stmt_info, 0, vect_body); + + /* loop cost for vec_loop. Neg induction doesn't have any + inside_cost. */ + if (induction_type == vect_step_op_neg) + inside_cost = 0; + + /* prologue cost for vec_init and vec_step. */ + prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, + stmt_info, 0, vect_prologue); + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vect_model_induction_cost: inside_cost = %d, " + "prologue_cost = %d. \n", inside_cost, + prologue_cost); + + STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; + DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction"); + return true; + } + + /* Transform. */ + + /* Compute a vector variable, initialized with the first VF values of + the induction variable. E.g., for an iv with IV_PHI='X' and + evolution S, for a vector of 4 units, we want to compute: + [X, X + S, X + 2*S, X + 3*S]. */ + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); + + pe = loop_preheader_edge (iv_loop); + /* Find the first insertion point in the BB. */ + basic_block bb = gimple_bb (phi); + si = gsi_after_labels (bb); + + init_expr = vect_phi_initial_value (phi); + + gimple_seq stmts = NULL; + vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr, + step_expr, nunits, vectype, + induction_type); + if (stmts) + { + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); + gcc_assert (!new_bb); + } + + stmts = NULL; + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr, + vf, induction_type); + if (stmts) + { + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); + gcc_assert (!new_bb); + } + + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, + new_name, vectype, + induction_type); + /* Create the following def-use cycle: + loop prolog: + vec_init = ... + vec_step = ... + loop: + vec_iv = PHI <vec_init, vec_loop> + ... + STMT + ... 
+ vec_loop = vec_iv + vec_step; */ + + /* Create the induction-phi that defines the induction-operand. */ + vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); + induction_phi = create_phi_node (vec_dest, iv_loop->header); + induc_def = PHI_RESULT (induction_phi); + + /* Create the iv update inside the loop. */ + stmts = NULL; + vec_def = vect_update_nonlinear_iv (&stmts, vectype, + induc_def, vec_step, + induction_type); + + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); + new_stmt = SSA_NAME_DEF_STMT (vec_def); + + /* Set the arguments of the phi node: */ + add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); + add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), + UNKNOWN_LOCATION); + + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi); + *vec_stmt = induction_phi; + + /* In case that vectorization factor (VF) is bigger than the number + of elements that we can fit in a vectype (nunits), we have to generate + more than one vector stmt - i.e - we need to "unroll" the + vector stmt by a factor VF/nunits. For more details see documentation + in vectorizable_operation. */ + + if (ncopies > 1) + { + stmts = NULL; + /* FORNOW. This restriction should be relaxed. */ + gcc_assert (!nested_in_vect_loop); + + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr, + nunits, induction_type); + + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, + new_name, vectype, + induction_type); + vec_def = induc_def; + for (i = 1; i < ncopies; i++) + { + /* vec_i = vec_prev + vec_step. 
*/ + stmts = NULL; + vec_def = vect_update_nonlinear_iv (&stmts, vectype, + vec_def, vec_step, + induction_type); + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); + new_stmt = SSA_NAME_DEF_STMT (vec_def); + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + } + } + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "transform induction: created def-use cycle: %G%G", + induction_phi, SSA_NAME_DEF_STMT (vec_def)); + + return true; +} + /* Function vectorizable_induction Check if STMT_INFO performs an induction computation that can be vectorized. @@ -8259,6 +8846,8 @@ vectorizable_induction (loop_vec_info loop_vinfo, unsigned i; tree expr; gimple_stmt_iterator si; + enum vect_induction_op_type induction_type + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); if (!phi) @@ -8271,6 +8860,11 @@ vectorizable_induction (loop_vec_info loop_vinfo, if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) return false; + /* Handle nonlinear induction in a separate place. */ + if (induction_type != vect_step_op_add) + return vectorizable_nonlinear_induction (loop_vinfo, stmt_info, + vec_stmt, slp_node, cost_vec); + tree vectype = STMT_VINFO_VECTYPE (stmt_info); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index e5fdc9e0a14..0c7f1a32070 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -68,6 +68,15 @@ enum vect_def_type { vect_unknown_def_type }; +/* Define operation type of linear/non-linear induction variable. */ +enum vect_induction_op_type { + vect_step_op_add = 0, + vect_step_op_neg, + vect_step_op_mul, + vect_step_op_shl, + vect_step_op_shr +}; + /* Define type of reduction. */ enum vect_reduction_type { TREE_CODE_REDUCTION, @@ -1188,6 +1197,7 @@ public: the version here. 
*/ tree loop_phi_evolution_base_unchanged; tree loop_phi_evolution_part; + enum vect_induction_op_type loop_phi_evolution_type; /* Used for various bookkeeping purposes, generally holding a pointer to some other stmt S that is in some way "related" to this stmt. @@ -1421,6 +1431,7 @@ struct gather_scatter_info { ((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S)) #define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged #define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part +#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type #define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type #define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code