@@ -996,6 +996,16 @@ precompute_register_parameters (int num_actuals, struct arg_data *args,
pop_temp_slots ();
}
+  /* If the argument value is a PARALLEL (an aggregate spread over several
+     registers), store it into a temporary stack slot so that it can be
+     handled as an ordinary MEM value below.  */
+ if (GET_CODE (args[i].value) == PARALLEL)
+ {
+ tree type = TREE_TYPE (args[i].tree_value);
+ int size = int_size_in_bytes (type);
+ rtx mem = assign_stack_temp (BLKmode, size);
+ emit_group_store (mem, args[i].value, type, size);
+ args[i].value = mem;
+ }
+
/* If we are to promote the function arg to a wider mode,
do it now. */
@@ -3789,7 +3789,7 @@ expand_value_return (rtx val)
tree decl = DECL_RESULT (current_function_decl);
rtx return_reg = DECL_RTL (decl);
- if (return_reg != val)
+ if (!rtx_equal_p (return_reg, val))
{
tree funtype = TREE_TYPE (current_function_decl);
tree type = TREE_TYPE (decl);
@@ -3862,6 +3862,10 @@ expand_return (tree retval)
been stored into it, so we don't have to do anything special. */
if (TREE_CODE (retval_rhs) == RESULT_DECL)
expand_value_return (result_rtl);
+  /* The return value has been scalarized by fsra: DECL_RTL of the returned
+     variable is already DECL_RESULT's RTL, so there is nothing left to copy.  */
+ else if (DECL_RTL_SET_P (retval_rhs)
+ && rtx_equal_p (result_rtl, DECL_RTL (retval_rhs)))
+ expand_null_return_1 ();
/* If the result is an aggregate that is being returned in one (or more)
registers, load the registers here. */
@@ -3483,6 +3483,440 @@ expand_ACCESS_WITH_SIZE (internal_fn, gcall *stmt)
expand_assignment (lhs, ref_to_obj, false);
}
+/* For the register series REGS, compute which registers are touched at
+   position {BITPOS, BITSIZE}.  The results are stored into START_INDEX,
+   END_INDEX, LEFT_BITS and RIGHT_BITS.
+   Return the first register covering the position, or NULL_RTX if the
+   access cannot be mapped onto the registers.  */
+
+static rtx
+query_position_in_series (rtx regs, HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+ int &start_index, int &end_index,
+ HOST_WIDE_INT &left_bits, HOST_WIDE_INT &right_bits)
+{
+ if (GET_CODE (regs) == PARALLEL)
+ {
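+      /* A null register in element 0 of the PARALLEL indicates that the
+         value goes both on the stack and in registers; skip such an entry.  */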
+ int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
+ for (; cur_index < XVECLEN (regs, 0); cur_index++)
+ {
+ rtx slot = XVECEXP (regs, 0, cur_index);
+ HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
+ machine_mode mode = GET_MODE (XEXP (slot, 0));
+ HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
+ if (off <= bitpos && off + size > bitpos)
+ {
+ start_index = cur_index;
+ left_bits = bitpos - off;
+ }
+ if (off + size >= bitpos + bitsize)
+ {
+ end_index = cur_index;
+ right_bits = off + size - (bitpos + bitsize);
+ break;
+ }
+ }
+      /* The access does not hit any incoming register, e.g. it only touches
+         padding.  */
+ if (start_index < 0 || end_index < start_index)
+ return NULL_RTX;
+ return XEXP (XVECEXP (regs, 0, start_index), 0);
+ }
+
+ /* Complex concat. */
+ if (GET_CODE (regs) == CONCAT && COMPLEX_MODE_P (GET_MODE (regs)))
+ {
+ machine_mode mode = GET_MODE (XEXP (regs, 0));
+ HOST_WIDE_INT mode_size = GET_MODE_BITSIZE (mode).to_constant ();
+ if ((bitsize == mode_size && (bitpos == 0 || bitpos == mode_size))
+ || (bitsize == mode_size * 2 && bitpos == 0))
+ {
+ start_index = end_index = bitpos == 0 ? 0 : 1;
+ left_bits = right_bits = 0;
+ if (bitsize == mode_size)
+ return XEXP (regs, start_index);
+
+ /* bitsize == mode_size * 2 */
+ rtx reg = gen_reg_rtx (GET_MODE (regs));
+ emit_move_insn (reg, regs);
+ return reg;
+ }
+
+      /* Taking bits from the middle of a complex value is not handled.  */
+ return NULL_RTX;
+ }
+
+  /* Contiguous registers.  */
+ if (REG_P (regs) && GET_MODE (regs) == BLKmode)
+ {
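+      /* A BLKmode hard register here is assumed to stand for a block of
+         consecutive word-sized registers starting at REGNO (REGS).  */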
+ HOST_WIDE_INT end_bits = bitpos + bitsize - 1;
+ start_index = bitpos / BITS_PER_WORD;
+ left_bits = bitpos % BITS_PER_WORD;
+ end_index = end_bits / BITS_PER_WORD;
+ right_bits = BITS_PER_WORD - 1 - (end_bits % BITS_PER_WORD);
+ return gen_rtx_REG (word_mode, REGNO (regs) + start_index);
+ }
+
+ /* Only one incoming register. */
+ if (REG_P (regs))
+ {
+ start_index = end_index = 0;
+ left_bits = bitpos;
+ machine_mode mode = GET_MODE (regs);
+ right_bits = GET_MODE_BITSIZE (mode).to_constant () - bitpos - bitsize;
+ return regs;
+ }
+
+ return NULL_RTX;
+}
+
+/* For an access on ARG at {BITPOS, BITSIZE}, compute an RTX expression
+   for the access and assign it to LHS.  ARG is an aggregate parameter of
+   the function that is passed in registers.
+   Return true if an effective way to perform the assignment to LHS is
+   found, otherwise return false.  */
+
+static bool
+assign_from_regs (tree arg, HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+ bool reversep, tree lhs)
+{
+ rtx regs = DECL_INCOMING_RTL (arg);
+ if (MEM_P (regs))
+ return false;
+ int start_index = -1;
+ int end_index = -1;
+ HOST_WIDE_INT left_bits = 0;
+ HOST_WIDE_INT right_bits = 0;
+
+  /* Use the pseudo from DECL_RTL instead of a single incoming hard register.  */
+ if (REG_P (regs))
+ {
+ rtx arg_rtx = arg->decl_with_rtl.rtl;
+ if (GET_MODE (regs) != BLKmode)
+ regs = arg_rtx;
+
+      /* Small aggregate: load the whole value into one word_mode pseudo.  */
+ else if (int_size_in_bytes (TREE_TYPE (arg)) < UNITS_PER_WORD)
+ {
+ regs = gen_reg_rtx (word_mode);
+ emit_move_insn (regs, adjust_address (arg_rtx, word_mode, 0));
+ }
+ }
+
+ rtx reg1st = query_position_in_series (regs, bitpos, bitsize, start_index,
+ end_index, left_bits, right_bits);
+ if (!reg1st)
+ return false;
+ machine_mode mode = word_mode;
+
+ tree type = TREE_TYPE (lhs);
+ machine_mode expr_mode = TYPE_MODE (type);
+
+ /* Just need one reg for the access. */
+ if (end_index == start_index)
+ {
+ rtx reg = reg1st;
+      /* Prefer a pseudo over a hard register.  */
+ if (HARD_REGISTER_P (reg))
+ {
+ rtx tmp = gen_reg_rtx (GET_MODE (reg));
+ emit_move_insn (tmp, reg);
+ reg = tmp;
+ }
+
+ /* Whole register. */
+ if (left_bits == 0 && right_bits == 0)
+ {
+ if (reversep)
+ reg = flip_storage_order (GET_MODE (reg), reg);
+ if (GET_MODE (reg) != expr_mode)
+ reg = gen_lowpart (expr_mode, reg);
+
+ rtx dest = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ emit_move_insn (dest, reg);
+ return true;
+ }
+
+      /* left_bits != 0 or right_bits != 0: the access needs only a
+         bit-field part of the register; extract it.  */
+ scalar_int_mode imode;
+ if (!int_mode_for_mode (expr_mode).exists (&imode))
+ return false;
+
+ bool sgn = TYPE_UNSIGNED (type);
+ rtx res = extract_bit_field (reg, bitsize, left_bits, sgn, NULL_RTX,
+ imode, imode, reversep, NULL);
+
+ if (GET_MODE (res) != imode)
+ res = gen_lowpart (imode, res);
+
+ if (expr_mode != imode)
+ {
+ /* expr_mode != imode, e.g. SF != SI. */
+ rtx tmp = gen_reg_rtx (imode);
+ emit_move_insn (tmp, res);
+ res = gen_lowpart (expr_mode, tmp);
+ }
+
+ rtx dest = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ emit_move_insn (dest, res);
+ return true;
+ }
+
+  /* A partial-register access that crosses multiple registers is not handled.  */
+ if (left_bits != 0 || right_bits != 0 || bitsize % BITS_PER_WORD)
+ return false;
+
+  /* The access covers multiple registers.  */
+ int len = end_index - start_index + 1;
+ if (!VECTOR_MODE_P (expr_mode))
+ {
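+      /* Build a PARALLEL describing the registers that are touched,
+         copying each source register into a pseudo and rebasing the byte
+         offsets so that they start at zero.  */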
+ rtvec vec = rtvec_alloc (len);
+ int off0 = 0;
+ if (GET_CODE (regs) == PARALLEL)
+ off0 = INTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
+ for (int i = 0; i < len; i++)
+ {
+ int index = start_index + i;
+ rtx src;
+ rtx off;
+ if (REG_P (regs))
+ {
+ src = gen_rtx_REG (mode, REGNO (regs) + index);
+ off = GEN_INT (UNITS_PER_WORD * i);
+ }
+ else
+ {
+ rtx e = XVECEXP (regs, 0, index);
+ src = XEXP (e, 0);
+ off = GEN_INT (INTVAL (XEXP (e, 1)) - off0);
+ }
+ rtx tmp = gen_reg_rtx (GET_MODE (src));
+ emit_move_insn (tmp, src);
+ RTVEC_ELT (vec, i) = alloc_EXPR_LIST (REG_DEP_TRUE, tmp, off);
+ }
+ rtx res = gen_rtx_PARALLEL (expr_mode, vec);
+
+      /* BLKmode LHS: record the PARALLEL of registers, e.g. [DI, DI, DI ...],
+         as its DECL_RTL.  */
+ if (expr_mode == BLKmode)
+ lhs->decl_with_rtl.rtl = res; // SET_DECL_RTL
+
+      /* For cases like "reg:TI = [DI, DI]".  */
+ else
+ {
+ rtx dest = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ emit_group_store (dest, res, type, bitsize / BITS_PER_UNIT);
+ }
+ return true;
+ }
+
+  /* For a vector LHS, check for a suitable vector mode and element mode.  */
+ if (GET_CODE (regs) == PARALLEL)
+ mode = GET_MODE (XEXP (XVECEXP (regs, 0, start_index), 0));
+ scalar_mode smode = as_a<scalar_mode> (mode);
+ machine_mode vmode;
+ if (!related_vector_mode (expr_mode, smode, len).exists (&vmode))
+ return false;
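+  /* Check that the target can initialize a VMODE vector from SMODE
+     elements.  */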
+ insn_code icode = convert_optab_handler (vec_init_optab, vmode, smode);
+ if (icode == CODE_FOR_nothing)
+ return false;
+
+ rtvec vec = rtvec_alloc (len);
+ for (int i = 0; i < len; i++)
+ {
+ int index = start_index + i;
+ rtx src = REG_P (regs) ? gen_rtx_REG (mode, REGNO (regs) + index)
+ : XEXP (XVECEXP (regs, 0, index), 0);
+ RTVEC_ELT (vec, i) = src;
+ }
+
+ rtx res = gen_reg_rtx (vmode);
+ emit_insn (GEN_FCN (icode) (res, gen_rtx_PARALLEL (vmode, vec)));
+ if (expr_mode != vmode)
+ res = gen_lowpart (expr_mode, res);
+ rtx dest = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ emit_move_insn (dest, res);
+ return true;
+}
+
+/* External function, used below when building the MEM_REF fallback.  */
+tree reference_alias_ptr_type (tree);
+
+/* Expand the IFN_ARG_PARTS function:
+   LHS = .ARG_PARTS (INCOMING_ARG, BIT_OFFSET, BIT_SIZE, REVERSEP).  */
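+/* For illustration (hypothetical GIMPLE): "_1 = .ARG_PARTS (arg, 128, 32, 0);"
+   asks for the 32 bits at bit offset 128 of the incoming ARG; when ARG is
+   passed in registers, the value is read from them directly.  */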
+
+static void
+expand_ARG_PARTS (internal_fn, gcall *stmt)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ tree arg = gimple_call_arg (stmt, 0);
+ HOST_WIDE_INT offset = tree_to_shwi (gimple_call_arg (stmt, 1));
+ HOST_WIDE_INT size = tree_to_shwi (gimple_call_arg (stmt, 2));
+ int reversep = tree_to_shwi (gimple_call_arg (stmt, 3));
+ if (assign_from_regs (arg, offset, size, reversep, lhs))
+ return;
+
+  /* Bit-field access (in memory, or crossing registers).  */
+ tree type = TREE_TYPE (lhs);
+ if ((INTEGRAL_TYPE_P (type) && !type_has_mode_precision_p (type))
+ || offset % BITS_PER_UNIT != 0 || size % BITS_PER_UNIT != 0)
+ {
+ machine_mode mode = TYPE_MODE (type);
+ rtx src
+ = expand_expr_real (arg, NULL, VOIDmode, EXPAND_NORMAL, NULL, true);
+ src = extract_bit_field (src, size, offset, TYPE_UNSIGNED (type), NULL,
+ mode, mode, reversep, NULL);
+ rtx dest = expand_expr (lhs, NULL, VOIDmode, EXPAND_WRITE);
+ if (GET_CODE (dest) == SUBREG && SUBREG_PROMOTED_VAR_P (dest))
+ convert_move (SUBREG_REG (dest), src, SUBREG_PROMOTED_SIGN (dest));
+ else
+ emit_move_insn (dest, src);
+ return;
+ }
+
+  /* Fall back to the original expansion through a MEM_REF.  */
+ gcc_assert (offset % BITS_PER_UNIT == 0 && size % BITS_PER_UNIT == 0);
+ tree base = build_fold_addr_expr (arg);
+ tree atype = reference_alias_ptr_type (arg);
+ tree off = build_int_cst (atype, offset / BITS_PER_UNIT);
+ location_t loc = EXPR_LOCATION (arg);
+ tree rhs = fold_build2_loc (loc, MEM_REF, type, base, off);
+ REF_REVERSE_STORAGE_ORDER (rhs) = reversep;
+ expand_assignment (lhs, rhs, false);
+}
+
+/* REGS contains registers (e.g. for a function return value).  Compute
+   which register(s) are touched at {BITPOS, BITSIZE}, assign RHS to
+   it (or them), and return true.  REV indicates the aggregate's storage
+   order.  */
+
+static bool
+assign_to_regs (rtx regs, HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize, bool rev,
+ tree rhs)
+{
+  /* Store SRC into a register (or part of one).  */
+ auto store_into_reg
+ = [rev] (rtx dest, HOST_WIDE_INT size, HOST_WIDE_INT pos, rtx src) -> bool {
+ machine_mode mode = GET_MODE (dest);
+ if (MEM_P (src) && GET_MODE (src) != mode)
+ PUT_MODE (src, mode);
+ if (known_eq (size, GET_MODE_BITSIZE (mode)))
+ emit_move_insn (dest, gen_lowpart (mode, src));
+ else
+ store_bit_field (dest, size, pos, 0, 0, mode, src, rev, false);
+ return true;
+ };
+
+ rtx src = expand_expr (rhs, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+ gcc_assert (src);
+
+ /* Just one register. */
+ if (REG_P (regs))
+ return store_into_reg (regs, bitsize, bitpos, src);
+
+ gcc_assert (GET_CODE (regs) == PARALLEL);
+
+ int start_index = -1;
+ int end_index = -1;
+ HOST_WIDE_INT left_bits = 0;
+ HOST_WIDE_INT right_bits = 0;
+ rtx reg1st = query_position_in_series (regs, bitpos, bitsize, start_index,
+ end_index, left_bits, right_bits);
+
+ gcc_assert (reg1st);
+
+  /* The access hits only one register.  */
+ if (end_index == start_index)
+ return store_into_reg (reg1st, bitsize, left_bits, src);
+
+  /* The access hits multiple registers.  */
+ machine_mode src_mode = GET_MODE (src);
+
+  /* SRC is neither a BLKmode value nor in a register.  */
+ if (src_mode != BLKmode && !REG_P (src))
+ {
+      /* Copy SRC into a stack temporary so that extract_bit_field can take
+         pieces of it below.  */
+ poly_int64 size = GET_MODE_SIZE (src_mode);
+ rtx mem = assign_stack_temp (src_mode, size);
+ emit_move_insn (mem, src);
+ src = mem;
+ }
+
+ int off0 = INTVAL (XEXP (XVECEXP (regs, 0, start_index), 1)) * BITS_PER_UNIT;
+ /* Need to set bits in the first register. */
+ if (left_bits != 0)
+ {
+ rtx reg = reg1st;
+ machine_mode mode = GET_MODE (reg);
+ int size = GET_MODE_BITSIZE (mode).to_constant ();
+ size -= left_bits;
+ rtx elt
+ = extract_bit_field (src, size, 0, 1, NULL, mode, mode, false, NULL);
+ store_into_reg (reg, size, left_bits, elt);
+
+ off0 += left_bits;
+ start_index++;
+ }
+
+ /* Need to set bits in the last register. */
+ if (right_bits != 0)
+ {
+ rtx e = XVECEXP (regs, 0, end_index);
+ rtx reg = XEXP (e, 0);
+ machine_mode mode = GET_MODE (reg);
+ int size = GET_MODE_BITSIZE (mode).to_constant ();
+ size -= right_bits;
+ int off = INTVAL (XEXP (e, 1)) * BITS_PER_UNIT - off0;
+ rtx elt
+ = extract_bit_field (src, size, off, 1, NULL, mode, mode, false, NULL);
+ store_into_reg (reg, size, 0, elt);
+
+ end_index--;
+ }
+
+  /* Assign the whole registers that the access fully covers.  */
+ for (int i = start_index; i <= end_index; i++)
+ {
+ rtx e = XVECEXP (regs, 0, i);
+ rtx reg = XEXP (e, 0);
+ int off = INTVAL (XEXP (e, 1)) * BITS_PER_UNIT - off0;
+ machine_mode mode = GET_MODE (reg);
+ int size = GET_MODE_BITSIZE (mode).to_constant ();
+ rtx elt
+ = extract_bit_field (src, size, off, 1, NULL, mode, mode, false, NULL);
+ emit_move_insn (reg, elt);
+ }
+ return true;
+}
+
+/* Expand the IFN_SET_RET_PARTS function:
+   .SET_RET_PARTS (RET_BASE, BIT_OFFSET, BIT_SIZE, SRC).
+   e.g. .SET_RET_PARTS (D.2774, 0, 8, SRC); return D.2774;  */
+
+static void
+expand_SET_RET_PARTS (internal_fn, gcall *stmt)
+{
+ HOST_WIDE_INT offset = tree_to_shwi (gimple_call_arg (stmt, 1));
+ HOST_WIDE_INT size = tree_to_shwi (gimple_call_arg (stmt, 2));
+ tree rhs = gimple_call_arg (stmt, 3);
+
+ tree decl = DECL_RESULT (current_function_decl);
+  rtx dest_regs = decl->decl_with_rtl.rtl; // DECL_RTL (decl)
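+  /* If the result is currently a single pseudo (or a user-declared register
+     variable) while the value is actually returned in several registers,
+     switch to a group of pseudos mirroring crtl->return_rtx.  */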
+ if (GET_CODE (dest_regs) != PARALLEL
+ && (REG_P (dest_regs) ? REGNO (dest_regs) >= FIRST_PSEUDO_REGISTER
+ : DECL_REGISTER (decl))
+ && GET_CODE (crtl->return_rtx) == PARALLEL)
+ {
+ /* Outgoing registers are in crtl->return_rtx. */
+ dest_regs = gen_group_rtx (crtl->return_rtx);
+ decl->decl_with_rtl.rtl = dest_regs;
+ }
+
+  /* Make the returned variable use the outgoing pseudos as well.  */
+ tree base = gimple_call_arg (stmt, 0);
+ base->decl_with_rtl.rtl = dest_regs; // SET_DECL_RTL
+
+ bool rev = TYPE_REVERSE_STORAGE_ORDER (TREE_TYPE (decl));
+ assign_to_regs (dest_regs, offset, size, rev, rhs);
+}
+
/* The size of an OpenACC compute dimension. */
static void
@@ -520,6 +520,12 @@ DEF_INTERNAL_FN (DEFERRED_INIT, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
2nd argument. */
DEF_INTERNAL_FN (ACCESS_WITH_SIZE, ECF_PURE | ECF_LEAF | ECF_NOTHROW, NULL)
+/* A function to extract an element from a function's incoming registers.  */
+DEF_INTERNAL_FN (ARG_PARTS, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
+
+/* A function to set the outgoing registers for a returned aggregate.  */
+DEF_INTERNAL_FN (SET_RET_PARTS, ECF_LEAF | ECF_NOTHROW, NULL)
+
/* DIM_SIZE and DIM_POS return the size of a particular compute
dimension and the executing thread's position within that
dimension. DIM_POS is pure (and not const) so that it isn't
@@ -448,6 +448,7 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_harden_conditional_branches);
NEXT_PASS (pass_harden_compares);
NEXT_PASS (pass_warn_access, /*early=*/false);
+ NEXT_PASS (pass_sra_final);
NEXT_PASS (pass_cleanup_cfg_post_optimizing);
NEXT_PASS (pass_warn_function_noreturn);
@@ -366,6 +366,7 @@ extern gimple_opt_pass *make_pass_early_tree_profile (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_cleanup_eh (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_sra (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_sra_early (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_sra_final (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_tail_recursion (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_tail_calls (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_musttail (gcc::context *ctxt);
@@ -21,14 +21,16 @@ along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
/* This file implements Scalar Reduction of Aggregates (SRA). SRA is run
- twice, once in the early stages of compilation (early SRA) and once in the
- late stages (late SRA). The aim of both is to turn references to scalar
- parts of aggregates into uses of independent scalar variables.
+   three times: once in the early stages of compilation (early SRA), once in
+   the late stages (late SRA), and once just before expansion (final SRA).
+   The aim of all three is to turn references to scalar parts of aggregates
+   into uses of independent scalar variables.
- The two passes are nearly identical, the only difference is that early SRA
+   The three passes are nearly identical; one difference is that early SRA
does not scalarize unions which are used as the result in a GIMPLE_RETURN
statement because together with inlining this can lead to weird type
- conversions.
+   conversions.  The third pass (final SRA) focuses on parameters and return
+   values; it is mainly useful when parameters and return values are passed
+   in registers.
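+
+   For example (illustrative), for "double f (S arg) { return arg.a[3]; }"
+   where S is an aggregate passed in floating-point registers, final SRA
+   lets ARG.a[3] be read directly from its incoming register instead of
+   first spilling ARG to the stack.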
Both passes operate in four stages:
@@ -104,6 +106,7 @@ along with GCC; see the file COPYING3. If not see
/* Enumeration of all aggregate reductions we can do. */
enum sra_mode { SRA_MODE_EARLY_IPA, /* early call regularization */
SRA_MODE_EARLY_INTRA, /* early intraprocedural SRA */
+ SRA_MODE_FINAL_INTRA, /* final gimple intraprocedural SRA */
SRA_MODE_INTRA }; /* late intraprocedural SRA */
/* Global variable describing which aggregate reduction we are performing at
@@ -1269,6 +1272,12 @@ build_access_from_expr_1 (tree expr, gimple *stmt, bool write)
return NULL;
}
+ if (TREE_CODE (expr) == TARGET_MEM_REF)
+ {
+ disqualify_base_of_expr (expr, "variable offset on base.");
+ return NULL;
+ }
+
struct access *ret = NULL;
bool partial_ref;
@@ -1549,7 +1558,8 @@ build_accesses_from_assign (gimple *stmt)
}
if (lacc && racc
- && (sra_mode == SRA_MODE_EARLY_INTRA || sra_mode == SRA_MODE_INTRA)
+ && (sra_mode == SRA_MODE_EARLY_INTRA || sra_mode == SRA_MODE_INTRA
+ || sra_mode == SRA_MODE_FINAL_INTRA)
&& !lacc->grp_unscalarizable_region
&& !racc->grp_unscalarizable_region
&& AGGREGATE_TYPE_P (TREE_TYPE (lhs))
@@ -2261,6 +2271,26 @@ find_var_candidates (void)
parm = DECL_CHAIN (parm))
ret |= maybe_add_sra_candidate (parm);
+  /* fsra only cares about parameters and return values.  */
+ if (sra_mode == SRA_MODE_FINAL_INTRA)
+ {
+ tree res = DECL_RESULT (current_function_decl);
+ if (!res || !AGGREGATE_TYPE_P (TREE_TYPE (res))
+ || aggregate_value_p (res, current_function_decl))
+ return ret;
+
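+      /* The remaining candidates are the variables fed to GIMPLE_RETURN
+         statements; find them through the predecessors of the exit block.  */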
+ edge_iterator ei;
+ edge e;
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+ if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
+ {
+ tree val = gimple_return_retval (r);
+ if (val && VAR_P (val))
+ ret |= maybe_add_sra_candidate (val);
+ }
+ return ret;
+ }
+
FOR_EACH_LOCAL_DECL (cfun, i, var)
{
if (!VAR_P (var))
@@ -2771,6 +2801,33 @@ expr_with_var_bounded_array_refs_p (tree expr)
return false;
}
+/* Analyze whether the access ROOT should be replaced in the fsra pass.  */
+static bool
+fsra_analyze (struct access *root)
+{
+ gcc_assert (sra_mode == SRA_MODE_FINAL_INTRA);
+
+ bool scalar = is_gimple_reg_type (root->type);
+ if (scalar)
+ {
+      /* A parameter is scalarizable even if it is never written.  */
+ if (TREE_CODE (root->base) == PARM_DECL)
+ return root->grp_scalar_read || root->grp_assignment_read;
+
+      /* A return value is scalarizable even if it is never read.  */
+ else
+ return root->grp_scalar_write || root->grp_assignment_write;
+ }
+
+  /* fsra can also handle an aggregate assignment, e.g. for "*p = arg;".  */
+ if (TREE_CODE (root->base) == PARM_DECL)
+ return !root->grp_scalar_read && root->grp_assignment_read;
+
+  /* For cases like "D.3988 = *pa_2(D); return D.3988;".  */
+ return !root->grp_scalar_write && root->grp_assignment_write
+ && root->base == root->expr;
+}
+
/* Analyze the subtree of accesses rooted in ROOT, scheduling replacements when
both seeming beneficial and when ALLOW_REPLACEMENTS allows it. If TOTALLY
is set, we are totally scalarizing the aggregate. Also set all sorts of
@@ -2903,6 +2960,22 @@ analyze_access_subtree (struct access *root, struct access *parent,
sth_created = true;
hole = false;
}
+
+ /* Check if fsra is going to handle the access. */
+ else if (allow_replacements && sra_mode == SRA_MODE_FINAL_INTRA
+ && !root->first_child && fsra_analyze (root))
+ {
+ root->grp_to_be_replaced = 1;
+ if (scalar || TREE_CODE (root->base) == PARM_DECL)
+ root->replacement_decl = create_access_replacement (root);
+
+      /* For an assignment to the whole 'return' variable, use DECL_RESULT
+         directly as the replacement.  */
+ else
+ root->replacement_decl = DECL_RESULT (current_function_decl);
+ sth_created = true;
+ hole = false;
+ }
+
else
{
if (allow_replacements
@@ -3159,6 +3232,13 @@ propagate_subaccesses_from_rhs (struct access *lacc, struct access *racc)
return ret;
}
+  /* In fsra, every access that assigns to the 'return' variable (even one
+     without a replacement) becomes a store to the outgoing registers, so
+     there is no need to propagate RACC's children to the LHS ('return').  */
+ if (sra_mode == SRA_MODE_FINAL_INTRA && lacc
+ && TREE_CODE (lacc->base) != PARM_DECL)
+ return ret;
+
for (rchild = racc->first_child; rchild; rchild = rchild->next_sibling)
{
struct access *new_acc = NULL;
@@ -3872,7 +3952,8 @@ generate_subtree_copies (struct access *access, tree agg,
HOST_WIDE_INT top_offset,
HOST_WIDE_INT start_offset, HOST_WIDE_INT chunk_size,
gimple_stmt_iterator *gsi, bool write,
- bool insert_after, location_t loc)
+ bool insert_after, location_t loc,
+ bool fsra_arg = false)
{
/* Never write anything into constant pool decls. See PR70602. */
if (!write && constant_decl_p (agg))
@@ -3887,7 +3968,7 @@ generate_subtree_copies (struct access *access, tree agg,
|| access->offset + access->size > start_offset))
{
tree expr, repl = get_access_replacement (access);
- gassign *stmt;
+ gimple *stmt;
expr = build_ref_for_model (loc, agg, access->offset - top_offset,
access, gsi, insert_after);
@@ -3899,7 +3980,21 @@ generate_subtree_copies (struct access *access, tree agg,
!insert_after,
insert_after ? GSI_NEW_STMT
: GSI_SAME_STMT);
- stmt = gimple_build_assign (repl, expr);
+	  /* When assigning part of the incoming argument to REPL, generate
+	     an ARG_PARTS call instead of a plain assignment.  */
+ if (fsra_arg
+ && (!access->grp_write || is_gimple_reg_type (access->type))
+ && (access->grp_scalar_read || access->grp_assignment_read))
+ {
+ stmt = gimple_build_call_internal (
+ IFN_ARG_PARTS, 4, access->base,
+ wide_int_to_tree (sizetype, access->offset),
+ wide_int_to_tree (sizetype, access->size),
+ wide_int_to_tree (sizetype, access->reverse));
+ gimple_call_set_lhs (stmt, repl);
+ }
+ else
+ stmt = gimple_build_assign (repl, expr);
}
else
{
@@ -3909,7 +4004,15 @@ generate_subtree_copies (struct access *access, tree agg,
!insert_after,
insert_after ? GSI_NEW_STMT
: GSI_SAME_STMT);
- stmt = gimple_build_assign (expr, repl);
+	  /* Generate .SET_RET_PARTS when assigning to the 'return' variable.  */
+ if (sra_mode == SRA_MODE_FINAL_INTRA && VAR_P (access->base)
+ && (access->grp_scalar_write || access->grp_assignment_write))
+ stmt = gimple_build_call_internal (
+ IFN_SET_RET_PARTS, 4, access->base,
+ wide_int_to_tree (sizetype, access->offset),
+ wide_int_to_tree (sizetype, access->size), repl);
+ else
+ stmt = gimple_build_assign (expr, repl);
}
gimple_set_location (stmt, loc);
@@ -3940,7 +4043,7 @@ generate_subtree_copies (struct access *access, tree agg,
if (access->first_child)
generate_subtree_copies (access->first_child, agg, top_offset,
start_offset, chunk_size, gsi,
- write, insert_after, loc);
+ write, insert_after, loc, fsra_arg);
access = access->next_sibling;
}
@@ -4718,6 +4821,30 @@ sra_modify_assign (gimple *stmt, gimple_stmt_iterator *gsi)
gsi_insert_before (gsi, ds, GSI_SAME_STMT);
}
+  /* In fsra, even without a replacement, IFN SET_RET_PARTS should be used
+     in place of an assignment to the 'return' variable.  */
+ auto use_set_ret_parts
+ = [stmt, &orig_gsi, &gsi] (struct access *lacc, tree rhs) -> bool {
+    /* Change the assignment to SET_RET_PARTS if it has not been replaced
+       yet.  */
+ if (sra_mode == SRA_MODE_FINAL_INTRA && lacc
+ && TREE_CODE (lacc->base) != PARM_DECL && !lacc->replacement_decl)
+ {
+ tree base = lacc->base;
+ tree offset = wide_int_to_tree (sizetype, lacc->offset);
+ tree size = wide_int_to_tree (sizetype, lacc->size);
+ gimple *new_stmt = gimple_build_call_internal (IFN_SET_RET_PARTS, 4,
+ base, offset, size, rhs);
+ gsi_insert_before (&orig_gsi, new_stmt, GSI_SAME_STMT);
+ update_stmt (new_stmt);
+ unlink_stmt_vdef (stmt);
+ gsi_remove (gsi_stmt (*gsi) == stmt ? gsi : &orig_gsi, true);
+ release_defs (stmt);
+ sra_stats.deleted++;
+ return true;
+ }
+ return false;
+ };
+
/* From this point on, the function deals with assignments in between
aggregates when at least one has scalar reductions of some of its
components. There are three possible scenarios: Both the LHS and RHS have
@@ -4787,6 +4914,9 @@ sra_modify_assign (gimple *stmt, gimple_stmt_iterator *gsi)
gcc_assert (stmt == gsi_stmt (orig_gsi));
}
+ if (use_set_ret_parts (lacc, rhs))
+ return SRA_AM_REMOVED;
+
return modify_this_stmt ? SRA_AM_MODIFIED : SRA_AM_NONE;
}
else
@@ -4823,6 +4953,9 @@ sra_modify_assign (gimple *stmt, gimple_stmt_iterator *gsi)
sra_stats.deleted++;
return SRA_AM_REMOVED;
}
+
+ if (use_set_ret_parts (lacc, rhs))
+ return SRA_AM_REMOVED;
}
else
{
@@ -4866,6 +4999,9 @@ sra_modify_assign (gimple *stmt, gimple_stmt_iterator *gsi)
return SRA_AM_REMOVED;
}
}
+
+ if (use_set_ret_parts (lacc, rhs))
+ return SRA_AM_REMOVED;
}
return SRA_AM_NONE;
@@ -5070,7 +5206,8 @@ initialize_parameter_reductions (void)
access;
access = access->next_grp)
generate_subtree_copies (access, parm, 0, 0, 0, &gsi, true, true,
- EXPR_LOCATION (parm));
+ EXPR_LOCATION (parm),
+ sra_mode == SRA_MODE_FINAL_INTRA);
}
seq = gsi_seq (gsi);
@@ -5134,6 +5271,14 @@ late_intra_sra (void)
return perform_intra_sra ();
}
+/* Perform "final" SRA just before expansion.  */
+static unsigned int
+final_intra_sra (void)
+{
+ sra_mode = SRA_MODE_FINAL_INTRA;
+ return perform_intra_sra ();
+}
+
static bool
gate_intra_sra (void)
@@ -5217,6 +5362,44 @@ make_pass_sra (gcc::context *ctxt)
return new pass_sra (ctxt);
}
+namespace
+{
+const pass_data pass_data_sra_final = {
+ GIMPLE_PASS, /* type */
+ "fsra", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_TREE_SRA, /* tv_id */
+ (PROP_cfg | PROP_ssa), /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_sra_final : public gimple_opt_pass
+{
+public:
+ pass_sra_final (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_sra_final, ctxt)
+ {
+ }
+
+ /* opt_pass methods: */
+ bool gate (function *) final override { return gate_intra_sra (); }
+ unsigned int execute (function *) final override
+ {
+ return final_intra_sra ();
+ }
+
+}; // class pass_sra_final
+
+} // namespace
+
+gimple_opt_pass *
+make_pass_sra_final (gcc::context *ctxt)
+{
+ return new pass_sra_final (ctxt);
+}
/* If type T cannot be totally scalarized, return false. Otherwise return true
and push to the vector within PC offsets and lengths of all padding in the
@@ -5,7 +5,8 @@
// Test that a zero-width bit field in an otherwise homogeneous aggregate
// generates a psabi warning and passes arguments in GPRs.
-// { dg-final { scan-assembler-times {\mstd\M} 4 } }
+// { dg-final { scan-assembler-times {\mstd\M} 4 {target { ! has_arch_pwr8 } } } }
+// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 {target { has_arch_pwr8 } } } }
struct a_thing
{
new file mode 100644
@@ -0,0 +1,76 @@
+/* { dg-do run } */
+/* { dg-require-effective-target hard_float } */
+/* { dg-options "-O2 -save-temps" } */
+
+typedef struct DF
+{
+ double a[4];
+ short s1;
+ short s2;
+ short s3;
+ short s4;
+} DF;
+typedef struct SF
+{
+ float a[4];
+ int i1;
+ int i2;
+} SF;
+
+/* { dg-final { scan-assembler-times {\mmtvsrd|mtvsrws\M} 3 {target { lp64 && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlwz\M} {target { lp64 && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlhz\M} {target { lp64 && has_arch_pwr8 } } } } */
+
+#define NOIPA __attribute__ ((noipa))
+
+short NOIPA
+foo_hi (DF a, int flag)
+{
+ if (flag == 2)
+ return a.s2 + a.s3;
+ return 0;
+}
+int NOIPA
+foo_si (SF a, int flag)
+{
+ if (flag == 2)
+ return a.i2 + a.i1;
+ return 0;
+}
+double NOIPA
+foo_df (DF arg, int flag)
+{
+ if (flag == 2)
+ return arg.a[3];
+ else
+ return 0.0;
+}
+float NOIPA
+foo_sf (SF arg, int flag)
+{
+ if (flag == 2)
+ return arg.a[2];
+ return 0;
+}
+float NOIPA
+foo_sf1 (SF arg, int flag)
+{
+ if (flag == 2)
+ return arg.a[1];
+ return 0;
+}
+
+DF gdf = {{1.0, 2.0, 3.0, 4.0}, 1, 2, 3, 4};
+SF gsf = {{1.0f, 2.0f, 3.0f, 4.0f}, 1, 2};
+
+int
+main ()
+{
+ if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 4.0
+ && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
+ __builtin_abort ();
+ if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
+ && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
+ __builtin_abort ();
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,74 @@
+/* { dg-do run } */
+/* { dg-require-effective-target hard_float } */
+/* { dg-options "-O2 -save-temps" } */
+
+/* { dg-final { scan-assembler-times {\mmtvsrd|mtvsrws\M} 5 {target { lp64 && { has_arch_pwr8 && be } } } } } */
+/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 4 {target { lp64 && { has_arch_pwr8 && be } } } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrd|mtvsrws\M} 3 {target { lp64 && { has_arch_pwr8 && le } } } } } */
+/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 {target { lp64 && { has_arch_pwr8 && le } } } } } */
+/* { dg-final { scan-assembler-times {\mfadds\M} 2 {target { lp64 && has_arch_pwr8 } } } } */
+
+#define NOIPA __attribute__ ((noipa))
+typedef struct X
+{
+ float x;
+ float y;
+} X;
+
+float NOIPA
+fooX (X y)
+{
+ y.x += 1;
+ return y.x + y.y;
+}
+
+typedef struct Y
+{
+ double a[4];
+ long l;
+} Y;
+
+double NOIPA
+fooY (Y arg)
+{
+ return arg.a[3];
+}
+
+typedef struct Z
+{
+ float a[4];
+ short l;
+} Z;
+
+float NOIPA
+fooZ (Z arg)
+{
+ return arg.a[3];
+}
+
+float NOIPA
+fooZ2 (Z arg)
+{
+ return arg.a[2];
+}
+
+X x = {1.0f, 2.0f};
+Y y = {1.0, 2.0, 3.0, 4.0, 1};
+Z z = {1.0f, 2.0f, 3.0f, 4.0f, 1};
+int
+main ()
+{
+ if (fooX (x) != 4.0f)
+ __builtin_abort ();
+
+ if (fooY (y) != 4.0)
+ __builtin_abort ();
+
+ if (fooZ (z) != 4.0f)
+ __builtin_abort ();
+
+ if (fooZ2 (z) != 3.0f)
+ __builtin_abort ();
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target hard_float } */
+/* { dg-options "-O2" } */
+
+/* generate lfd for fun1, fun4 and fun5 */
+/* { dg-final { scan-assembler-times {\mlfd\M} 12 {target { lp64 && powerpc_elfv2 } } } } */
+
+/* generate stfd for fun2 */
+/* { dg-final { scan-assembler-times {\mstfd\M} 4 {target { lp64 && powerpc_elfv2 } } } } */
+
+/* no other load/store */
+/* { dg-final { scan-assembler-not {\m[p?l|st]d\M} {target { lp64 && powerpc_elfv2 } } } } */
+/* { dg-final { scan-assembler-not {\m[l|st]xvd2x\M} {target { lp64 && powerpc_elfv2 } } } } */
+/* { dg-final { scan-assembler-not {\m[p?l|st]xv\M} {target { lp64 && powerpc_elfv2 } } } } */
+
+typedef struct { double a[4]; } A;
+typedef struct { A a; double b; } B;
+typedef struct { double c; A a;} C;
+
+A fun1 (const A *pa) { return *pa; }
+void fun2 (const A a, A *pa) { *pa = a; }
+A fun3 (const A a) { return a; }
+A fun4 (const B *b) { return b->a; }
+A fun5 (const C *b) { return b->a; }
+A fun6 (const B b) { return b.a; }
+B fun7 (const A a) { B b; b.a = a; b.b = 0; return b; }
+C fun8 (const A a) { C c; c.a = a; c.c = 0; return c; }
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-require-effective-target hard_float } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {\mfmr\M} 3 {target { { lp64 && powerpc_elfv2 } && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mxscvspdpn\M} {target { { lp64 && powerpc_elfv2 } && has_arch_pwr8 } } } } */
+
+struct foo1
+{
+ float x;
+ float y;
+};
+
+struct foo1
+blah1 (struct foo1 y)
+{
+ struct foo1 x;
+
+ x.x = y.y;
+ x.y = y.x;
+
+ return x;
+}