@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+ extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+ extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
@@ -294,7 +294,31 @@
(define_insn_and_split "*movoo"
[(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
- (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+ (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+ "TARGET_MMA
+ && (gpc_reg_operand (operands[0], OOmode)
+ || gpc_reg_operand (operands[1], OOmode))"
+;; Every alternative below emits "#", so the insn is always split once reload has completed.
+ "@
+ #
+ #
+ #"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ rs6000_split_multireg_move (operands[0], operands[1]);
+ DONE;
+}
+ [(set_attr "type" "vecload,vecstore,veclogical")
+ (set_attr "length" "*,*,8")])
+;; NOTE(review): no "max_prefixed_insns" attribute ("2,2,*") is set on this pattern -- confirm that is intended.
+
+
+(define_insn_and_split "*movoo1"
+ [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
+ (unspec [
+ (match_operand:OO 1 "input_operand" "ZwO,wa,wa")
+ ] UNSPEC_LXVP))]
"TARGET_MMA
&& (gpc_reg_operand (operands[0], OOmode)
|| gpc_reg_operand (operands[1], OOmode))"
new file mode 100644
@@ -0,0 +1,708 @@
+/* Subroutines used to fuse adjacent loads/stores into
+   paired memory accesses for TARGET_POWER10 and TARGET_VSX.
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_LIST
+#define INCLUDE_TYPE_TRAITS
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "df.h"
+#include "rtl-iter.h"
+#include "rtl-ssa.h"
+#include "rtl-ssa/internals.h"
+#include "rtl-ssa/internals.inl"
+#include "cfgcleanup.h"
+#include "tree-pass.h"
+#include "pair-fusion.h"
+
+using namespace rtl_ssa;
+
+// rs6000 implementation of the pair_fusion target hooks.
+struct rs6000_pair_fusion : public pair_fusion
+{
+  // Hooks whose bodies are defined out of line.
+  void change_existing_multword_mode (rtx_insn *insn) final override;
+  bool pair_operand_mode_ok_p (machine_mode mode) final override;
+  rtx gen_pair (rtx *pats, rtx, bool load_p) final override;
+  bool fuseable_store_p (insn_info *i1, insn_info *i2) final override;
+  bool fuseable_load_p (insn_info *insn) final override;
+  void set_multiword_subreg (insn_info *i1, insn_info *i2,
+                             bool load_p) final override;
+  void modify_new_rtx_insn (insn_info *first, obstack_watermark *attempt,
+                            insn_change **pair_change,
+                            auto_vec <insn_change *> &changes) final override;
+
+  // Hooks that unconditionally answer false.
+  bool fpsimd_op_p (rtx, machine_mode, bool) final override
+  {
+    return false;
+  }
+
+  bool pair_mem_insn_p (rtx_insn *, bool &) final override
+  {
+    return false;
+  }
+
+  bool should_handle_writeback (enum writeback_type) final override
+  {
+    return false;
+  }
+
+  // Hooks that unconditionally answer true.
+  bool pair_mem_ok_with_policy (rtx, bool) final override
+  {
+    return true;
+  }
+
+  bool pair_reg_operand_ok_p (bool, rtx, machine_mode) final override
+  {
+    return true;
+  }
+
+  bool track_loads_p () final override
+  {
+    return true;
+  }
+
+  bool track_stores_p () final override
+  {
+    return true;
+  }
+
+  bool pair_mem_in_range_p (HOST_WIDE_INT) final override
+  {
+    return true;
+  }
+
+  // The alias check limit is fixed at zero.
+  int pair_mem_alias_check_limit () final override
+  {
+    return 0;
+  }
+
+  // Neither hook below produces an rtx: both return NULL_RTX.
+  rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) final override
+  {
+    return NULL_RTX;
+  }
+
+  rtx destructure_pair (rtx_def **, rtx, bool) final override
+  {
+    return NULL_RTX;
+  }
+};
+
+bool
+rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
+{
+  return ALTIVEC_OR_VSX_VECTOR_MODE (mode);  // Altivec/VSX vector modes only.
+}
+
+// If INSN is a single SET whose source or destination is an OOmode MEM,
+// emit (set DEST (unspec [SRC] UNSPEC_LXVP)) after it and delete INSN.
+void
+rs6000_pair_fusion::change_existing_multword_mode (rtx_insn *insn)
+{
+  rtx set = single_set (insn);
+  if (!set)  // Not a single SET; bail out instead of dereferencing null.
+    return;
+  rtx src = SET_SRC (set);
+  rtx dest = SET_DEST (set);
+  if (!((MEM_P (src) && GET_MODE (src) == OOmode)
+        || (MEM_P (dest) && GET_MODE (dest) == OOmode)))
+    return;
+
+  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (1, src),
+                               UNSPEC_LXVP);
+  rtx_insn *new_insn = emit_insn_after (gen_rtx_SET (dest, unspec), insn);
+  set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
+  df_insn_rescan (new_insn);
+  df_insn_delete (insn);
+  remove_insn (insn);
+  insn->set_deleted ();
+}
+
+// Remove every non-phi use of SET from the SSA use lists; for a use that
+// belongs to a phi, recurse into that phi instead.  A null SET (as
+// produced by a failed dyn_cast in the caller) is ignored.
+// NOTE(review): phi cycles are not detected here -- confirm termination.
+static void
+update_change (set_info *set)
+{
+  if (!set || !set->has_any_uses ())
+    return;
+
+  auto *use = *set->all_uses ().begin ();
+  while (use)
+    {
+      // Read the successor before USE is possibly removed below.
+      auto *next_use = use->next_use ();
+      if (use->is_in_phi ())
+        update_change (use->phi ());
+      else
+        crtl->ssa->remove_use (use);
+      use = next_use;
+    }
+}
+
+// Remove the uses of every definition that CHANGES replaces, then make
+// sure (*PAIR_CHANGE)->new_defs has an entry for each register written by
+// FIRST's pattern: a missing one gets a fresh set_info attached to FIRST,
+// inserted using watermark *ATTEMPT and recorded in the temporary defs.
+void
+rs6000_pair_fusion::modify_new_rtx_insn (insn_info *first,
+                                         obstack_watermark *attempt,
+                                         insn_change **pair_change,
+                                         auto_vec<insn_change *> &changes)
+{
+  for (insn_change *change : changes)
+    for (auto def : change->old_defs ())
+      // dyn_cast yields null when DEF is not a set_info; skip those.
+      if (auto set = dyn_cast<set_info *> (def))
+        update_change (set);
+
+  auto &new_defs = (*pair_change)->new_defs;
+  vec_rtx_properties properties;
+  properties.add_insn (first->rtl (), true);
+
+  // Build up the new list of definitions.
+  for (rtx_obj_reference ref : properties.refs ())
+    {
+      // Reads, and writes new_defs already covers, need nothing.
+      if (!ref.is_write () || find_access (new_defs, ref.regno))
+        continue;
+
+      auto *set = crtl->ssa->allocate<set_info> (first,
+                                                 full_register (ref.regno));
+      if (!set)
+        continue;
+      new_defs = insert_access (*attempt, set, new_defs);
+      crtl->ssa->get_m_temp_defs ().safe_push (set);
+    }
+}
+
+// Call df_insn_rescan on every insn that uses a register defined by I1,
+// so that the dataflow information picks up their rewritten operands.
+static void
+set_rescan_load (insn_info *i1)
+{
+  for (auto def : i1->defs ())
+    {
+      auto set = dyn_cast<set_info *> (def);
+      if (!set)  // DEF is not a set_info, so there are no uses to visit.
+        continue;
+      for (auto use : set->all_uses ())
+        {
+          // A phi use has no owning insn; insn () is only valid otherwise.
+          insn_info *info = use->is_in_phi () ? nullptr : use->insn ();
+          if (info && info->rtl ())
+            df_insn_rescan (info->rtl ());
+        }
+    }
+}
+
+// Rescan (df_insn_rescan) the insn defining each register that INSN uses.
+// Stops and returns false at the first use that has no definition, whose
+// definition comes from an artificial insn, or whose defining insn is not
+// a single set; returns true once every use has been visited.
+static bool
+set_rescan_store (insn_info *insn)
+{
+  for (auto use : insn->uses ())
+    {
+      auto def = use->def ();
+      if (!def || def->insn ()->is_artificial ())
+        return false;
+
+      insn_info *def_insn = def->insn ();
+      bool rescannable = (def_insn
+                          && def_insn->rtl ()
+                          && def_insn->is_real ());
+      if (!rescannable)
+        continue;
+
+      rtx_insn *def_rtl = def_insn->rtl ();
+      if (single_set (def_rtl) == NULL_RTX)
+        return false;
+      df_insn_rescan (def_rtl);
+    }
+  return true;
+}
+
+// Return true if the store INSN is a feasible fusion candidate: every
+// register it uses must have a definition, none of them may be defined by
+// an artificial insn, and each real defining insn must be a single set
+// that is neither a load nor a store.
+static bool
+feasible_store_p (insn_info *insn)
+{
+  for (auto use : insn->uses ())
+    {
+      auto def = use->def ();
+      // A use without a definition cannot be analysed; reject it rather
+      // than dereferencing null (set_rescan_store does the same).
+      if (!def || def->insn ()->is_artificial ())
+        return false;
+
+      insn_info *def_insn = def->insn ();
+      if (!def_insn->rtl () || !def_insn->is_real ())
+        continue;
+
+      rtx set = single_set (def_insn->rtl ());
+      if (set == NULL_RTX)
+        return false;
+
+      // Reject a defining load: it could itself be a fused load, and this
+      // avoids an already existing subreg (reg:OO R) offset.
+      if (MEM_P (SET_SRC (set)))
+        return false;
+
+      // Reject a defining store.
+      if (MEM_P (SET_DEST (set)))
+        return false;
+    }
+  return true;
+}
+
+// Return true if the stores I1 and I2 may be fused into a store pair.
+// I1 must store a plain register that has a single dominating definition
+// and at most one use; the two stores must not share their source rtx;
+// I2 must not store a subreg; and both must pass feasible_store_p.
+bool
+rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2)
+{
+  rtx_insn *insn1 = i1->rtl ();
+  rtx_insn *insn2 = i2->rtl ();
+  rtx src_exp = SET_SRC (PATTERN (insn1));
+  rtx insn2_src_exp = SET_SRC (PATTERN (insn2));
+
+  // REG_P already rules out a SUBREG source for insn1, so that case
+  // needs no separate test below.
+  if (!REG_P (src_exp)
+      || !crtl->ssa->single_dominating_def (REGNO (src_exp)))
+    return false;
+
+  // This is done as def instruction could be a fused load and
+  // to avoid already existing subreg (reg:OO R) offset.
+  if (DF_REG_USE_COUNT (REGNO (src_exp)) > 1)
+    return false;
+
+  // Return false if src of insn1 and src of insn2 are same.
+  if (src_exp == insn2_src_exp)
+    return false;
+
+  // Return false if src of insn2 is subreg.
+  if (GET_CODE (insn2_src_exp) == SUBREG)
+    return false;
+
+  // Finally, the insns that define the inputs of each store must be
+  // acceptable as well.
+  if (!feasible_store_p (i1))
+    return false;
+
+  if (!feasible_store_p (i2))
+    return false;
+
+  return true;
+}
+
+// In the insns defining the registers used by store I1, rewrite each definition of I1's stored register using SRC (byte offset REGOFF when the definition is already a SUBREG).
+static void
+set_store_subreg (insn_info *i1, rtx src, int regoff)
+{
+ for (auto use: i1->uses ())
+ {
+ auto def = use->def ();
+ if (!def) // No definition: stop processing the remaining uses as well.
+ return;
+
+ insn_info *info = def->insn ();
+
+ if (info->is_artificial ()) // Artificial definition: stop here too.
+ return;
+
+ if (info && info->is_real ())
+ {
+ rtx_insn *rtl_insn = info->rtl ();
+ rtx set = single_set (rtl_insn);
+ if (set == NULL_RTX) // Defining insn is not a single set: give up.
+ return;
+ df_ref ref;
+ FOR_EACH_INSN_DEF (ref, rtl_insn)
+ {
+ rtx src_exp = SET_SRC (PATTERN (i1->rtl ())); // Register stored by I1.
+ if (REG_P (src_exp) && DF_REF_REGNO (ref) == REGNO (src_exp))
+ {
+ rtx *loc = DF_REF_LOC (ref);
+ if (GET_CODE (*loc) == SUBREG)
+ {
+ rtx src1 = simplify_gen_subreg (GET_MODE (*loc), // Keep the mode of the existing subreg.
+ SUBREG_REG (src), // NOTE(review): assumes SRC is itself a SUBREG -- confirm.
+ OOmode,
+ regoff);
+ *loc = copy_rtx (src1);
+ }
+ else
+ *loc = copy_rtx (src); // Plain register definition: substitute a copy of SRC.
+ }
+ }
+ }
+ }
+}
+
+// Return true iff the load I1 may be fused: its destination must be a
+// plain register for which the SSA form reports a single dominating
+// definition.
+bool
+rs6000_pair_fusion::fuseable_load_p (insn_info *i1)
+{
+  rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
+
+  if (!REG_P (dest_exp))
+    return false;
+  if (!crtl->ssa->single_dominating_def (REGNO (dest_exp)))
+    return false;
+  return true;
+}
+
+// For every df definition in I1 of the register I1 currently sets,
+// store NEW_DEST_EXP at the definition's location.
+static void
+propagate_insn (insn_info *i1, rtx new_dest_exp)
+{
+  rtx_insn *rtl = i1->rtl ();
+  df_ref ref;
+  FOR_EACH_INSN_DEF (ref, rtl)
+    {
+      // Re-read on each iteration, exactly as the destination stands now.
+      rtx dest_exp = SET_DEST (PATTERN (rtl));
+      if (!REG_P (dest_exp) || DF_REF_REGNO (ref) != REGNO (dest_exp))
+        continue;
+      *DF_REF_LOC (ref) = new_dest_exp;
+    }
+}
+
+// Make a new OOmode pseudo carrying OLD_DEST's register bookkeeping fields.
+static rtx
+new_reg_rtx (rtx old_dest)
+{
+  rtx pair_reg = gen_reg_rtx (OOmode);
+  REG_ATTRS (pair_reg) = REG_ATTRS (old_dest);
+  REG_POINTER (pair_reg) = REG_POINTER (old_dest);
+  REG_USERVAR_P (pair_reg) = REG_USERVAR_P (old_dest);
+  ORIGINAL_REGNO (pair_reg) = ORIGINAL_REGNO (old_dest);
+  max_regno = max_reg_num ();
+  return pair_reg;
+}
+
+// In every non-debug insn that uses a register defined by I1, replace the
+// uses of I1's destination register with SRC.  A use already wrapped in
+// a SUBREG is rewritten through insn_propagation; if that fails the
+// failure is dumped and the function returns early.
+static void
+set_load_subreg (insn_info *i1, rtx src)
+{
+  rtx old_dest = SET_DEST (single_set (i1->rtl ()));
+
+  for (auto def : i1->defs ())
+    {
+      auto def_set = dyn_cast<set_info *> (def);
+      if (!def_set)  // DEF is not a set_info: it has no uses to rewrite.
+        continue;
+      for (auto use : def_set->nondebug_insn_uses ())
+        {
+          insn_info *info = use->insn ();
+          if (!info || !info->rtl ())
+            continue;
+          rtx_insn *rtl_insn = info->rtl ();
+          df_ref ref;
+          FOR_EACH_INSN_USE (ref, rtl_insn)
+            {
+              rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
+              if (!REG_P (dest_exp)
+                  || DF_REF_REGNO (ref) != REGNO (dest_exp))
+                continue;
+              rtx *loc = DF_REF_LOC (ref);
+              insn_propagation prop (rtl_insn, old_dest, src);
+              if (GET_CODE (*loc) != SUBREG)
+                {
+                  *loc = copy_rtx (src);
+                  continue;
+                }
+              if (prop.apply_to_pattern (loc))
+                continue;
+
+              if (dump_file != NULL)
+                {
+                  fprintf (dump_file, "Cannot propagate insn \n");
+                  print_rtl_single (dump_file, rtl_insn);
+                }
+              return;
+            }
+        }
+    }
+}
+
+// For the store pair I1/I2, retag I1's stored register as OOmode and rewrite
+// the definitions feeding I1 and I2 as subregs of that OOmode register.
+static void
+set_multiword_subreg_store (insn_info *i1, insn_info *i2)
+{
+ rtx_insn *insn1 = i1->rtl ();
+ rtx_insn *insn2 = i2->rtl ();
+ rtx body = PATTERN (insn1);
+ rtx src_exp = SET_SRC (body);
+ rtx insn2_body = PATTERN (insn2);
+ rtx insn2_dest_exp = SET_DEST (insn2_body);
+ machine_mode mode = GET_MODE (src_exp); // Mode of I1's source, read before it is retagged below.
+ int regoff;
+ rtx src;
+ rtx addr = XEXP (insn2_dest_exp, 0); // Address of I2's destination MEM.
+
+ PUT_MODE_RAW (src_exp, OOmode); // Changes the mode of I1's source rtx in place.
+ if (GET_CODE (addr) == PLUS
+ && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
+ regoff = 16; // I2's address is base plus a constant offset.
+ else
+ regoff = 0; // Any other address form.
+
+ src = simplify_gen_subreg (mode,
+ src_exp, GET_MODE (src_exp),
+ regoff);
+
+ set_store_subreg (i1, src, regoff); // Definitions feeding I1 use byte offset REGOFF.
+
+ int regoff1 = 0;
+ rtx src1;
+
+ src1 = simplify_gen_subreg (mode,
+ src_exp, GET_MODE (src_exp),
+ regoff1);
+
+ set_store_subreg (i2, src1, regoff1); // Definitions feeding I2 always use byte offset 0.
+ set_rescan_store (i1); // The bool results of both rescans are ignored.
+ set_rescan_store (i2);
+ df_insn_rescan (insn1);
+}
+
+// For the load pair I1/I2, retag I1's destination register as OOmode and make
+// the users of I2 and I1 read the subregs at byte offsets 0 and 16 of it.
+static void
+set_multiword_subreg_load (insn_info *i1, insn_info *i2)
+{
+ rtx_insn *insn1 = i1->rtl ();
+ rtx body = PATTERN (insn1);
+ rtx dest_exp = SET_DEST (body);
+ machine_mode mode = GET_MODE (dest_exp); // Mode of I1's destination, read before it is retagged.
+ PUT_MODE_RAW (dest_exp, OOmode); // Changes the mode of I1's destination rtx in place.
+
+ int regoff = 0;
+ rtx src;
+
+ src = simplify_gen_subreg (mode,
+ dest_exp, GET_MODE (dest_exp),
+ regoff);
+
+ set_load_subreg (i2, src); // Users of I2's result read byte offset 0.
+
+ int regoff1;
+ rtx src1;
+
+ regoff1 = 16;
+ src1 = simplify_gen_subreg (mode,
+ dest_exp, GET_MODE (dest_exp),
+ regoff1);
+ set_load_subreg (i1, src1); // Users of I1's result read byte offset 16.
+
+ set_rescan_load (i1);
+ set_rescan_load (i2);
+ df_insn_rescan (insn1);
+}
+
+// Variant of the load rewrite for pairs I1/I2 whose users already contain a SUBREG:
+// a fresh OOmode pseudo is created and both loads and their users are redirected to it.
+static void
+set_multiword_existing_subreg (insn_info *i1, insn_info *i2)
+{
+ rtx_insn *insn1 = i1->rtl ();
+ rtx body = PATTERN (insn1);
+ rtx dest_exp = SET_DEST (body);
+ machine_mode mode = GET_MODE (dest_exp);
+ int regoff1;
+ regoff1 = 16;
+ rtx new_dest_exp = new_reg_rtx (dest_exp); // New OOmode pseudo modelled on I1's destination.
+
+ rtx src = simplify_gen_subreg (mode,
+ new_dest_exp,
+ OOmode,
+ regoff1);
+
+ set_load_subreg (i1, src); // Users of I1's result read byte offset 16 of the new pseudo.
+ propagate_insn (i1, new_dest_exp); // I1 itself now defines the new pseudo.
+
+ int regoff = 0;
+ rtx sset = single_set (i2->rtl ());
+ rtx insn2_dest_exp = SET_DEST (sset);
+ machine_mode insn2_mode = GET_MODE (insn2_dest_exp);
+
+ src = simplify_gen_subreg (insn2_mode,
+ new_dest_exp,
+ OOmode,
+ regoff);
+
+ set_load_subreg (i2, src); // Users of I2's result read byte offset 0 of the new pseudo.
+ propagate_insn (i2, new_dest_exp);
+
+ auto attempt = crtl->ssa->new_change_attempt ();
+ resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) };
+ auto *set = crtl->ssa->allocate<set_info> (i1, resource);
+ if (set)
+ {
+ auto def = find_access (i1->defs (), REGNO (new_dest_exp));
+ if (!def) // Record the new pseudo among I1's definitions only if it is absent.
+ i1->defs() = insert_access (attempt, set, i1->defs());
+ }
+
+ set_rescan_load (i1);
+ set_rescan_load (i2);
+ df_insn_rescan (insn1);
+}
+
+// Return true iff some non-debug insn uses the register set by I1
+// through a location that is already a SUBREG.
+static bool
+use_has_subreg_p (insn_info *i1)
+{
+  for (auto def : i1->defs ())
+    {
+      auto set = dyn_cast<set_info *> (def);
+      if (!set)  // DEF is not a set_info: there are no uses to inspect.
+        continue;
+
+      for (auto use : set->nondebug_insn_uses ())
+        {
+          insn_info *info = use->insn ();
+          if (!info || !info->rtl ())
+            continue;
+
+          df_ref ref;
+          FOR_EACH_INSN_USE (ref, info->rtl ())
+            {
+              rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
+              if (REG_P (dest_exp)
+                  && DF_REF_REGNO (ref) == REGNO (dest_exp)
+                  && GET_CODE (*DF_REF_LOC (ref)) == SUBREG)
+                return true;
+            }
+        }
+    }
+  return false;
+}
+
+// Dispatch the subreg rewrite for the pair I1/I2.  LOAD_P is true for a
+// load pair and false for a store pair; loads whose users already contain
+// a SUBREG take the existing-subreg path.
+void
+rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2,
+                                          bool load_p)
+{
+  if (!load_p)
+    {
+      set_multiword_subreg_store (i1, i2);
+      return;
+    }
+
+  bool i1_subreg_p = use_has_subreg_p (i1);
+  bool i2_subreg_p = use_has_subreg_p (i2);
+  if (i1_subreg_p || i2_subreg_p)
+    set_multiword_existing_subreg (i1, i2);
+  else
+    set_multiword_subreg_load (i1, i2);
+}
+
+// Build (set DEST (unspec [SRC] UNSPEC_LXVP)) from the first pattern in
+// PATS after forcing both of its operands to OOmode in place.  The rtx
+// argument is unused; LOAD_P only selects the text written to the dump.
+rtx
+rs6000_pair_fusion::gen_pair (rtx *pats, rtx, bool load_p)
+{
+  rtx first_pat = pats[0];
+  rtx src_exp = SET_SRC (first_pat);
+  rtx dest_exp = SET_DEST (first_pat);
+  PUT_MODE_RAW (src_exp, OOmode);
+  PUT_MODE_RAW (dest_exp, OOmode);
+
+  rtvec srcs = gen_rtvec (1, src_exp);
+  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest_exp), srcs, UNSPEC_LXVP);
+  rtx pair_set = gen_rtx_SET (dest_exp, unspec);
+  if (!dump_file)
+    return pair_set;
+
+  fprintf (dump_file, load_p ? "lxv with lxvp " : "stxv with stxvp ");
+  print_rtl_single (dump_file, pair_set);
+  return pair_set;
+}
+
+const pass_data pass_data_mem_fusion = /* Static description of the "mem_fusion" RTL pass.  */
+{
+ RTL_PASS, /* type */
+ "mem_fusion", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_NONE, /* tv_id: no dedicated timevar is used */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_mem_fusion : public rtl_opt_pass
+{
+public:
+  pass_mem_fusion (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_mem_fusion, ctxt)
+  {}
+
+  opt_pass *clone () override { return new pass_mem_fusion (m_ctxt); }
+
+  // opt_pass methods.  Gate: optimizing, with both VSX and Power10 enabled.
+  bool gate (function *) final override
+  {
+    return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
+  }
+
+  unsigned int execute (function *) final override
+  {
+    rs6000_pair_fusion pass;
+    pass.run ();
+    return 0;
+  }
+}; // class pass_mem_fusion
+
+rtl_opt_pass *
+make_pass_mem_fusion (gcc::context *ctxt)
+{
+  return new pass_mem_fusion (ctxt);
+}
@@ -28,7 +28,9 @@ along with GCC; see the file COPYING3. If not see
The power8 does not have instructions that automaticaly do the byte swaps
for loads and stores. */
INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
-
+ /* Pass to fuse lxv/stxv instructions that access adjacent memory
+ into lxvp/stxvp instructions. */
+ INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion);
/* Pass to do the PCREL_OPT optimization that combines the load of an
external symbol's address along with a single load or store using that
address as a base register. */
@@ -343,6 +343,7 @@ namespace gcc { class context; }
class rtl_opt_pass;
extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
+extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *);
extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
extern bool rs6000_quadword_masked_address_p (const_rtx exp);
@@ -27428,7 +27428,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
reg_mode = word_mode;
reg_mode_size = GET_MODE_SIZE (reg_mode);
- gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
+ gcc_assert (mode == OOmode
+ || reg_mode_size * nregs == GET_MODE_SIZE (mode));
/* TDmode residing in FP registers is special, since the ISA requires that
the lower-numbered word of a register pair is always the most significant
@@ -27475,6 +27476,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
if (MEM_P (dst))
{
+ rtx addr = XEXP (dst, 0);
+ rtx opnd1 = NULL_RTX;
+ if (addr && GET_CODE (addr) == PLUS)
+ opnd1 = XEXP (addr,1);
+
unsigned offset = 0;
unsigned size = GET_MODE_SIZE (reg_mode);
@@ -27488,7 +27494,13 @@ rs6000_split_multireg_move (rtx dst, rtx src)
{
unsigned subreg
= WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
- rtx dst2 = adjust_address (dst, reg_mode, offset);
+ rtx dst2 = dst;
+
+ if ((GET_CODE (addr) != PLUS
+ || (opnd1 && CONST_INT_P(opnd1))))
+ dst2 = adjust_address (dst, reg_mode, offset);
+ else
+ PUT_MODE_RAW (dst, reg_mode);
rtx src2 = gen_rtx_REG (reg_mode, reg + subreg);
offset += size;
emit_insn (gen_rtx_SET (dst2, src2));
@@ -27499,15 +27511,25 @@ rs6000_split_multireg_move (rtx dst, rtx src)
if (MEM_P (src))
{
+ rtx addr = XEXP (src, 0);
+ rtx opnd1 = NULL_RTX;
+ if (addr && GET_CODE (addr) == PLUS)
+ opnd1 = XEXP (addr,1);
+
unsigned offset = 0;
unsigned size = GET_MODE_SIZE (reg_mode);
- for (int i = 0; i < nregs; i += reg_mode_nregs)
+ for (int i = nregs-1; i >= 0; i -= reg_mode_nregs)
{
unsigned subreg
= WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg);
- rtx src2 = adjust_address (src, reg_mode, offset);
+ rtx src2 = src;
+
+ if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1))))
+ src2 = adjust_address (src, reg_mode, offset);
+ else
+ PUT_MODE_RAW (src2, reg_mode);
offset += size;
emit_insn (gen_rtx_SET (dst2, src2));
}
@@ -27515,7 +27537,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
/* If we are writing an accumulator register, we have to
prime it after we've written it. */
if (TARGET_MMA
- && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
+ && REG_P (dst) && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
emit_insn (gen_mma_xxmtacc (dst, dst));
return;
@@ -27608,9 +27630,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
{
for (i = nregs - 1; i >= 0; i--)
{
- rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
- rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
- emit_insn (gen_rtx_SET (dst_i, src_i));
+ if (REG_P (dst) && REG_P (src))
+ {
+ rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
+ rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
+ emit_insn (gen_rtx_SET (dst_i, src_i));
+ }
}
}
else
@@ -27625,7 +27650,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
/* If we are writing an accumulator register, we have to
prime it after we've written it. */
if (TARGET_MMA
- && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
+ && REG_P (dst) && GET_MODE (dst) == XOmode
+ && FP_REGNO_P (REGNO (dst)))
emit_insn (gen_mma_xxmtacc (dst, dst));
}
else
@@ -27682,7 +27708,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
/* If the base register we are using to address memory is
also a destination reg, then change that register last. */
- if (REG_P (breg)
+ if (REG_P (dst) && REG_P (breg)
&& REGNO (breg) >= REGNO (dst)
&& REGNO (breg) < REGNO (dst) + nregs)
j = REGNO (breg) - REGNO (dst);
@@ -27780,9 +27806,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
/* XO/OO are opaque so cannot use subregs. */
if (mode == OOmode || mode == XOmode )
{
- rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
- rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
- emit_insn (gen_rtx_SET (dst_i, src_i));
+ if (REG_P (dst) && REG_P (src))
+ {
+ rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
+ rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
+ emit_insn (gen_rtx_SET (dst_i, src_i));
+ }
}
else
emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode,
@@ -27800,7 +27829,9 @@ rs6000_split_multireg_move (rtx dst, rtx src)
if (restore_basereg != NULL_RTX)
emit_insn (restore_basereg);
}
+ return;
}
+
/* Return true if the peephole2 can combine a load involving a combination of
an addis instruction and a load with an offset that can be fused together on
@@ -159,6 +159,7 @@
UNSPEC_XXSPLTIW_CONST
UNSPEC_FMAX
UNSPEC_FMIN
+ UNSPEC_LXVP
])
;;
@@ -35,6 +35,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
$(COMPILE) $<
$(POSTCOMPILE)
+rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+
rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
$(COMPILE) $<
$(POSTCOMPILE)
@@ -312,9 +312,9 @@ static int
encode_lfs (lfs_fields fields)
{
int size_log2 = exact_log2 (fields.size);
- gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
- return ((int)fields.load_p << 3)
- | ((int)fields.fpsimd_p << 2)
+ gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9); // So size_log2 - 2 is 0..7 and fits in bits 0-2.
+ return ((int)fields.load_p << 4) // Bit 4: load_p.
+ | ((int)fields.fpsimd_p << 3) // Bit 3: fpsimd_p.
| (size_log2 - 2);
}
@@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields)
static lfs_fields
decode_lfs (int lfs)
{
- bool load_p = (lfs & (1 << 3));
- bool fpsimd_p = (lfs & (1 << 2));
+ bool load_p = (lfs & (1 << 4));
+ bool fpsimd_p = (lfs & (1 << 3));
- unsigned size = 1U << ((lfs & 3) + 2);
+ unsigned size = 1U << ((lfs & 7) + 2); // Size field is bits 0-2, as written by encode_lfs.
return { load_p, fpsimd_p, size };
}
@@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
if (MEM_VOLATILE_P (mem))
return;
+ if (load_p && !m_pass->fuseable_load_p (insn))
+ return;
+
// Ignore writeback accesses if the hook says to do so.
if (!m_pass->should_handle_writeback (writeback_type::EXISTING)
&& GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
@@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
}
rtx reg_notes = combine_reg_notes (first, second, load_p);
-
+ m_pass->set_multiword_subreg (i1, i2, load_p);
rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p);
insn_change *pair_change = nullptr;
auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
@@ -1833,6 +1836,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
pair_change->new_defs = merge_access_arrays (attempt,
input_defs[0],
input_defs[1]);
+ m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes);
gcc_assert (pair_change->new_defs.is_valid ());
pair_change->new_uses
@@ -2405,6 +2409,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
reg_ops[i] = XEXP (pats[i], !load_p);
}
+ if (!load_p && !m_pass->fuseable_store_p (i1, i2))
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
+ insns[0]->uid (), insns[1]->uid ());
+ return false;
+ }
+
if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
{
if (dump_file)
@@ -2997,6 +3010,8 @@ void pair_fusion::process_block (bb_info *bb)
if (GET_CODE (pat) != SET)
continue;
+ change_existing_multword_mode (rti);
+
if (track_stores && MEM_P (XEXP (pat, 0)))
bb_state.track_access (insn, false, XEXP (pat, 0));
else if (track_loads && MEM_P (XEXP (pat, 1)))
@@ -26,8 +26,11 @@ namespace rtl_ssa {
class insn_info;
class insn_range_info;
class bb_info;
+ class insn_change;
}
+class obstack_watermark;
+
// Information about a potential base candidate, used in try_fuse_pair.
// There may be zero, one, or two viable RTL bases for a given pair.
struct base_cand
@@ -142,6 +145,19 @@ struct pair_fusion {
// true iff INSN is a load pair.
virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0;
+  // Hook letting a target rewrite an existing multiword-mode load or store
+  // insn into its unspec form.  The default implementation does nothing.
+  virtual void change_existing_multword_mode (rtx_insn *) {}
+
+  // Hook called with the insn FIRST, the watermark ATTEMPT, the pair
+  // change PAIR_CHANGE and the pending CHANGES, so that a target can
+  // adjust the definitions of the new pair.  Does nothing by default.
+  virtual void modify_new_rtx_insn (rtl_ssa::insn_info *,
+                                    obstack_watermark *,
+                                    rtl_ssa::insn_change **,
+                                    auto_vec<rtl_ssa::insn_change *> &)
+  {}
+
// Return true if we should track loads.
virtual bool track_loads_p ()
{
@@ -171,6 +187,24 @@ struct pair_fusion {
virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
rtx regs[2], bool load_p) = 0;
+  // Given insn_info pair I1 and I2, sets subreg with multiword registers
+  // so that the allocators assign register pairs.  LOAD_P is true iff
+  // the pair is a load.  The default implementation does nothing.
+  virtual void set_multiword_subreg (rtl_ssa::insn_info *,
+                                     rtl_ssa::insn_info *,
+                                     bool) {}
+
+  // Given the store insns I1 and I2, return true if it is feasible to
+  // fuse them into a store pair.  Targets without extra restrictions
+  // inherit the default, which accepts every pair.
+  virtual bool fuseable_store_p (rtl_ssa::insn_info *,
+                                 rtl_ssa::insn_info *) { return true; }
+
+  // Given the load insn INFO, return true if it may take part in a load
+  // pair.  Targets without extra restrictions inherit the default, which
+  // accepts every load.
+  virtual bool fuseable_load_p (rtl_ssa::insn_info *) { return true; }
+
void process_block (rtl_ssa::bb_info *bb);
rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2],
const rtl_ssa::insn_range_info
@@ -222,6 +222,13 @@ public:
template<typename T, typename... Ts>
T *change_alloc (obstack_watermark &wm, Ts... args);
+ auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; } // Returns a mutable reference to m_temp_defs.
+
+ template<typename T, typename... Ts>
+ T *allocate (Ts... args);
+
+ void remove_use (use_info *);
+
private:
class bb_phi_info;
class build_info;
@@ -231,9 +238,6 @@ private:
// allocate_temp during its lifetime.
obstack_watermark temp_watermark () { return &m_temp_obstack; }
- template<typename T, typename... Ts>
- T *allocate (Ts... args);
-
template<typename T, typename... Ts>
T *allocate_temp (Ts... args);
@@ -269,7 +273,6 @@ private:
static void insert_use_after (use_info *, use_info *);
void add_use (use_info *);
- void remove_use (use_info *);
insn_info::order_node *need_order_node (insn_info *);
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <altivec.h>
+
+void
+foo2 ()
+{
+ __vector_quad *dst1; /* NOTE(review): dst1, dst2, src and ptr are never initialized -- confirm this is intended for a compile-only test.  */
+ __vector_quad *dst2;
+ vector unsigned char src;
+ __vector_quad acc;
+ vector unsigned char *ptr;
+ __builtin_mma_xvf32ger(&acc, src, ptr[0]); /* Reads the adjacent elements ptr[0] and ptr[1].  */
+ __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+ *dst1 = acc;
+ __builtin_mma_xvf32ger(&acc, src, ptr[2]); /* Reads the adjacent elements ptr[2] and ptr[3].  */
+ __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
+ *dst2 = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <altivec.h>
+
+void
+foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src)
+{
+ __vector_quad acc; /* Written by the two builtins below, then stored to *dst.  */
+ __builtin_mma_xvf32ger(&acc, src, ptr[0]); /* Reads the adjacent elements ptr[0] and ptr[1].  */
+ __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+ *dst = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
@@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
dst[13] = acc;
}
-/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
-/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
/* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
/* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
/* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */