@@ -529,6 +529,65 @@ vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
return false;
}
+/* Returns true if Phi is a first-order recurrence. A first-order
+ recurrence is a non-reduction recurrence relation in which the value of
+ the recurrence in the current loop iteration equals a value defined in
+ the previous iteration. */
+
+static bool
+vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
+ gphi *phi)
+{
+ /* Ensure the phi node is in the loop header and has two incoming values. */
+ if (gimple_bb (phi) != loop->header || gimple_phi_num_args (phi) != 2)
+ return false;
+
+ /* Ensure the loop has a preheader and a single latch block. The loop
+ vectorizer will need the latch to set up the next iteration of the loop. */
+ edge preheader = loop_preheader_edge (loop);
+ edge latch = loop_latch_edge (loop);
+ if (!preheader || !latch)
+ return false;
+
+ /* Ensure the phi node's incoming blocks are the loop preheader and latch. */
+ if (!PHI_ARG_DEF_FROM_EDGE (phi, preheader)
+ || !PHI_ARG_DEF_FROM_EDGE (phi, latch))
+ return false;
+
+ /* Get the previous value. The previous value comes from the latch edge while
+ the initial value comes form the preheader edge. */
+ gimple *previous = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, latch));
+ if (!previous)
+ return false;
+
+ /* Ensure every use_stmt of the phi node is dominated by the previous value.
+ The dominance requirement ensures the loop vectorizer will not need to
+ vectorize the initial value prior to the first iteration of the loop. */
+ gimple *use_stmt;
+ imm_use_iterator imm_iter;
+ FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_phi_result (phi))
+ if (use_stmt)
+ if (!dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt),
+ gimple_bb (previous)))
+ return false;
+
+ /* First-order recurrence autovectorization needs shuffle vector. */
+ tree scalar_type = TREE_TYPE (PHI_RESULT (phi));
+ tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
+ /* First-order recurrence autovectorization needs to handle permutation
+ with indices = [nunits-1, nunits, nunits+1, ...]. */
+ machine_mode mode = TYPE_MODE (vectype);
+ poly_int64 nunits = GET_MODE_NUNITS (mode);
+ vec_perm_builder sel (nunits, 1, 3);
+ for (int i = 0; i < 3; ++i)
+ sel.quick_push (nunits - 1 + i);
+ vec_perm_indices indices (sel, 1, nunits * 2);
+ if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
+ return false;
+
+ return true;
+}
+
/* Function vect_analyze_scalar_cycles_1.
Examine the cross iteration def-use cycles of scalar variables
@@ -666,6 +725,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
}
}
}
+ else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
+ STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
else
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1808,9 +1869,13 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
gcc_assert (stmt_info);
+ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
+ ok = vectorizable_dep_phi (loop_vinfo, stmt_info, NULL, NULL);
+
if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
|| STMT_VINFO_LIVE_P (stmt_info))
- && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
+ && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
+ && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
/* A scalar-dependence cycle that we don't support. */
return opt_result::failure_at (phi,
"not vectorized:"
@@ -8267,6 +8332,118 @@ vectorizable_phi (vec_info *,
return true;
}
+/* Vectorizes Dep PHIs. */
+
+bool
+vectorizable_dep_phi (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+ gimple **vec_stmt, slp_tree slp_node)
+{
+ if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
+ return false;
+
+ gphi *phi = as_a<gphi *> (stmt_info->stmt);
+
+ /* So far we only support first-order recurrence auto-vectorization. */
+ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
+ return false;
+
+ if (!vec_stmt) /* transformation not required. */
+ {
+ /* So far we don't support first-order recurrence for SLP
+ * auto-vectorization. */
+ if (slp_node)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "do not support first-order recurrence"
+ "autovectorization for SLP node\n");
+ return false;
+ }
+ STMT_VINFO_TYPE (stmt_info) = dep_phi_info_type;
+ return true;
+ }
+
+ /* This is the second phase of vectorizing first-order rececurrences. An
+ overview of the transformation is described below. Suppose we have the
+ following loop.
+
+ int32_t t = 0;
+ for (int i = 0; i < n; ++i)
+ {
+ b[i] = a[i] - t;
+ t = a[i];
+ }
+
+ There is a first-order recurrence on "a". For this loop, the shorthand
+ scalar IR looks like:
+
+ scalar.preheader:
+ init = a[-1]
+ br loop.body
+
+ scalar.body:
+ i = PHI <0(scalar.preheader), i+1(scalar.body)>
+ _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
+ _1 = a[i]
+ b[i] = _1 - _2
+ br cond, scalar.body, ...
+
+ In this example, _2 is a recurrence because it's value depends on the
+ previous iteration. In the first phase of vectorization, we created a
+ temporary value for _2. We now complete the vectorization and produce the
+ shorthand vector IR shown below (VF = 4).
+
+ vector.preheader:
+ vect_init = vect_cst(..., ..., ..., a[-1])
+ br vector.body
+
+ vector.body
+ i = PHI <0(vector.preheader), i+4(vector.body)>
+ vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
+ vect_2 = a[i, i+1, i+2, i+3];
+ vect_3 = vector(vect_1(3), vect_2(0, 1, 2))
+ b[i, i+1, i+2, i+3] = vect_2 - vect_3
+ br cond, vector.body, middle.block
+
+ middle.block:
+ x = vect_2(3)
+ br scalar.preheader
+
+ scalar.ph:
+ s_init = PHI <x(middle.block), a[-1], otherwise>
+ br scalar.body
+
+ After execution completes the vector loop, we extract the next value of
+ the recurrence (x) to use as the initial value in the scalar loop. */
+
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ tree scalar_dest = gimple_phi_result (stmt_info->stmt);
+ basic_block bb = gimple_bb (stmt_info->stmt);
+ tree vec_dest
+ = vect_get_new_vect_var (vectype, vect_simple_var, "vec_recur_");
+ auto_vec<tree> vec_oprnds;
+ tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
+
+ tree vec_init = build_vector_from_val (vectype, preheader);
+ vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
+ vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
+ !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
+ gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
+ /* Create the vectorized first-order PHI node. */
+ gphi *new_phi = create_phi_node (vec_dest, bb);
+ add_phi_arg (new_phi, vec_oprnds[0], loop_latch_edge (loop),
+ UNKNOWN_LOCATION);
+ add_phi_arg (new_phi, vec_init, loop_preheader_edge (loop), UNKNOWN_LOCATION);
+ if (slp_node)
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
+ else
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+ return true;
+}
+
/* Return true if VECTYPE represents a vector that requires lowering
by the vector lowering pass. */
@@ -10476,6 +10653,29 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
epilogue_vinfo->shared->save_datarefs ();
}
+/* Return true if the stmt is the first statement that uses the result
+ of a first-order recurrence phi node. */
+
+static gphi *
+vect_use_first_order_phi_result_p (auto_vec<gphi *> &first_order_phi_vec,
+ gimple *stmt)
+{
+ for (unsigned int i = 0; i < first_order_phi_vec.length (); ++i)
+ {
+ gphi *phi = first_order_phi_vec[i];
+ tree phi_result = PHI_RESULT (phi);
+ gimple *first_use;
+ use_operand_p use_p;
+ if (!single_imm_use (phi_result, &use_p, &first_use))
+ continue;
+
+ if (first_use == stmt)
+ return phi;
+ }
+
+ return NULL;
+}
+
/* Function vect_transform_loop.
The analysis phase has determined that the loop is vectorizable.
@@ -10624,6 +10824,31 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
basic_block bb = bbs[i];
stmt_vec_info stmt_info;
+ /* It's used to save the first-order recurrence phi node.
+ Consider the following case:
+ # t_19 = PHI <_4(6), 0(5)>
+ ...
+ _4 = *_3;
+ ...
+ _6 = _4 - t_19;
+
+ The vectorization sequence should be:
+ 1. Vectorize _4 = *_3;
+ 2. Vectorize # t_19 = PHI <_4(6), 0(5)>
+ 3, Vectorize _6 = _4 - t_19;
+
+ Flow:
+ 1. So we save the first-order recurrence PHI in first_order_phi_vec
+ and skip vectorizing it tentatively when iterating phi statement
+ (save t_19 = PHI <_4(6), 0(5)>).
+ 2. Vectorize all statements when iterating all statements
+ until we reach the statement who is using the result of
+ first-order recurrence PHI (vectorize _4 = *_3;).
+ 3. Stop iterating and return to vectorize the PHI saved
+ in first_order_phi_vec (# t_19 = PHI <_4(6), 0(5)>).
+ 4. Conintue iterating and vectorize the residual statements. */
+ auto_vec<gphi *> first_order_phi_vec;
+
for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
gsi_next (&si))
{
@@ -10677,9 +10902,16 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
- || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
+ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
+ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
&& ! PURE_SLP_STMT (stmt_info))
maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
+
+ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
+ {
+ first_order_phi_vec.safe_push (phi);
+ continue;
+ }
}
for (gimple_stmt_iterator si = gsi_start_bb (bb);
@@ -10698,6 +10930,18 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
/* Ignore vector stmts created in the outer loop. */
stmt_info = loop_vinfo->lookup_stmt (stmt);
+ gphi *first_order_phi
+ = vect_use_first_order_phi_result_p (first_order_phi_vec, stmt);
+ if (first_order_phi)
+ {
+ stmt_vec_info first_order_stmt_info
+ = loop_vinfo->lookup_stmt (first_order_phi);
+ gimple_stmt_iterator first_order_si
+ = gsi_for_stmt (first_order_phi);
+ vect_transform_loop_stmt (loop_vinfo, first_order_stmt_info,
+ &first_order_si, NULL);
+ }
+
/* vector stmts created in the outer-loop during vectorization of
stmts in an inner-loop may not have a stmt_info, and do not
need to be vectorized. */
@@ -1503,6 +1503,41 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
while (ncopies--)
vec_oprnds->quick_push (vop);
}
+ else if (dt == vect_first_order_recurrence)
+ {
+ def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
+ gcc_assert (STMT_VINFO_VEC_STMTS (def_stmt_info).length () == ncopies);
+ tree vector_type;
+
+ if (vectype)
+ vector_type = vectype;
+ else
+ vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
+
+ /* Insert shufflevector to for first-order recurrence autovectorization.
+ result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]. */
+ machine_mode mode = TYPE_MODE (vector_type);
+ poly_int64 nunits = GET_MODE_NUNITS (mode);
+ vec_perm_builder sel (nunits, 1, 3);
+ for (int i = 0; i < 3; ++i)
+ sel.quick_push (nunits - 1 + i);
+ vec_perm_indices indices (sel, 1, nunits * 2);
+ tree perm = vec_perm_indices_to_tree (vector_type, indices);
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+ for (unsigned i = 0; i < ncopies; ++i)
+ {
+ gphi *phi = as_a<gphi *> (STMT_VINFO_VEC_STMTS (def_stmt_info)[i]);
+ tree latch = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
+ tree recur = gimple_phi_result (phi);
+ gassign *assign
+ = gimple_build_assign (recur, VEC_PERM_EXPR, recur, latch, perm);
+ gimple_assign_set_lhs (assign, recur);
+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt_vinfo->stmt);
+ vect_finish_stmt_generation (vinfo, stmt_vinfo, assign, &gsi);
+ vec_oprnds->quick_push (recur);
+ }
+ }
else
{
def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
@@ -11404,6 +11439,12 @@ vect_transform_stmt (vec_info *vinfo,
gcc_assert (done);
break;
+ case dep_phi_info_type:
+ done = vectorizable_dep_phi (as_a <loop_vec_info> (vinfo),
+ stmt_info, &vec_stmt, slp_node);
+ gcc_assert (done);
+ break;
+
case phi_info_type:
done = vectorizable_phi (vinfo, stmt_info, &vec_stmt, slp_node, NULL);
gcc_assert (done);
@@ -11804,6 +11845,9 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
case vect_nested_cycle:
dump_printf (MSG_NOTE, "nested cycle\n");
break;
+ case vect_first_order_recurrence:
+ dump_printf (MSG_NOTE, "first order recurrence\n");
+ break;
case vect_unknown_def_type:
dump_printf (MSG_NOTE, "unknown\n");
break;
@@ -11852,7 +11896,8 @@ vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
|| *dt == vect_induction_def
|| *dt == vect_reduction_def
|| *dt == vect_double_reduction_def
- || *dt == vect_nested_cycle)
+ || *dt == vect_nested_cycle
+ || *dt == vect_first_order_recurrence)
{
*vectype = STMT_VINFO_VECTYPE (def_stmt_info);
gcc_assert (*vectype != NULL_TREE);
@@ -65,6 +65,7 @@ enum vect_def_type {
vect_reduction_def,
vect_double_reduction_def,
vect_nested_cycle,
+ vect_first_order_recurrence,
vect_unknown_def_type
};
@@ -1027,6 +1028,7 @@ enum stmt_vec_info_type {
cycle_phi_info_type,
lc_phi_info_type,
phi_info_type,
+ dep_phi_info_type,
loop_exit_ctrl_vec_info_type
};
@@ -2331,6 +2333,8 @@ extern bool vectorizable_lc_phi (loop_vec_info, stmt_vec_info,
gimple **, slp_tree);
extern bool vectorizable_phi (vec_info *, stmt_vec_info, gimple **, slp_tree,
stmt_vector_for_cost *);
+extern bool vectorizable_dep_phi (loop_vec_info, stmt_vec_info,
+ gimple **, slp_tree);
extern bool vect_emulated_vector_p (tree);
extern bool vect_can_vectorize_without_simd_p (tree_code);
extern bool vect_can_vectorize_without_simd_p (code_helper);
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> Hi, After fixing previous ICE. I add full implementation (insert permutation to get correct result.) The gimple IR is correct now I think: # t_21 = PHI <_4(6), t_12(9)> # i_22 = PHI <i_17(6), 0(9)> # vectp_a.6_26 = PHI <vectp_a.6_25(6), a_14(D)(9)> # vect_vec_recur_.9_9 = PHI <vect__4.8_19(6), vect_cst__18(9)> # vectp_b.11_7 = PHI <vectp_b.11_30(6), b_15(D)(9)> # curr_cnt_36 = PHI <next_cnt_35(6), _32(9)> # loop_len_20 = PHI <next_len_34(6), _32(9)> _38 = .WHILE_LEN (loop_len_20, 32, POLY_INT_CST [4, 4]); while_len_37 = _38; _1 = (long unsigned int) i_22; _2 = _1 * 4; _3 = a_14(D) + _2; vect__4.8_19 = .LEN_LOAD (vectp_a.6_26, 32B, loop_len_20, 0); _4 = *_3; _5 = b_15(D) + _2; vect_vec_recur_.9_9 = VEC_PERM_EXPR <vect_vec_recur_.9_9, vect__4.8_19, { POLY_INT_CST [3, 4], POLY_INT_CST [4, 4], POLY_INT_CST [5, 4], ... }>; But I encounter another ICE: 0x169e0e7 process_bb ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:7498 0x16a09af do_rpo_vn(function*, edge_def*, bitmap_head*, bool, bool, vn_lookup_kind) ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8109 0x16a0fe7 do_rpo_vn(function*, edge_def*, bitmap_head*) ../../../riscv-gcc/gcc/tree-ssa-sccvn.cc:8205 0x179b7db execute ../../../riscv-gcc/gcc/tree-vectorizer.cc:1365 Could you help me with this? After fixing this ICE, I think the loop vectorizer can run correctly. Maybe you can test is in X86 or ARM after fixing this ICE. Thanks. --- gcc/tree-vect-loop.cc | 248 ++++++++++++++++++++++++++++++++++++++++- gcc/tree-vect-stmts.cc | 47 +++++++- gcc/tree-vectorizer.h | 4 + 3 files changed, 296 insertions(+), 3 deletions(-)