@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+ extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+ extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
new file mode 100644
@@ -0,0 +1,651 @@
+/* Subroutines used to fuse adjacent loads/stores into
+ paired memory accesses for TARGET_POWER10 and TARGET_VSX.
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_LIST
+#define INCLUDE_TYPE_TRAITS
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "df.h"
+#include "rtl.h"
+#include "rtl-iter.h"
+#include "rtl-ssa.h"
+#include "tree-pass.h"
+#include "ordered-hash-map.h"
+#include "pair-fusion.h"
+
+using namespace rtl_ssa;
+
+// rs6000 implementation of the pair_fusion hooks.  Most hooks return a
+// fixed answer inline; pair_operand_mode_ok_p, gen_pair, fuseable_store_p,
+// fuseable_load_p and set_multiword_subreg are defined out of line.
+struct rs6000_pair_fusion : public pair_fusion
+{
+  // Unconditionally false.
+  bool fpsimd_op_p (rtx, machine_mode, bool) override final
+  { return false; }
+
+  // Unconditionally false; the bool reference argument is left untouched.
+  bool pair_mem_insn_p (rtx_insn *, bool &) override final
+  { return false; }
+
+  // Unconditionally true.
+  bool pair_mem_ok_with_policy (rtx, bool) override final
+  { return true; }
+
+  bool pair_operand_mode_ok_p (machine_mode mode) override final;
+
+  rtx gen_pair (rtx *pats, rtx writeback, bool load_p) override final;
+
+  // Unconditionally true.
+  bool pair_reg_operand_ok_p (bool, rtx, machine_mode) override final
+  { return true; }
+
+  // The alias check limit is fixed at 0.
+  int pair_mem_alias_check_limit () override final
+  { return 0; }
+
+  // No kind of writeback is handled.
+  bool should_handle_writeback (enum writeback) override final
+  { return false; }
+
+  // Loads are tracked.
+  bool track_loads_p () override final
+  { return true; }
+
+  // Stores are tracked.
+  bool track_stores_p () override final
+  { return true; }
+
+  // Every offset is accepted.
+  bool pair_mem_in_range_p (HOST_WIDE_INT) override final
+  { return true; }
+
+  // Never produces a pattern.
+  rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) override final
+  { return NULL_RTX; }
+
+  // Never produces a pattern.
+  rtx destructure_pair (rtx_def **, rtx, bool) override final
+  { return NULL_RTX; }
+
+  // Worked example.  Five V16QI loads ({vsx_movv16qi_64bit}) off the same
+  // base register, shown in insn-chain order (MEM attributes omitted):
+  //
+  //   (insn 31 62 32 2 (set (reg:V16QI 177)
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ]) (const_int 64)))))
+  //   (insn 32 31 16 2 (set (reg:V16QI 178)
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ]) (const_int 80)))))
+  //   (insn 16 32 21 2 (set (reg:V16QI 159)
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ]) (const_int 16)))))
+  //   (insn 21 16 22 2 (set (reg:V16QI 165)
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ]) (const_int 32)))))
+  //   (insn 22 21 37 2 (set (reg:V16QI 166)
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ]) (const_int 48)))))
+  //
+  // In the failure case insn 22 and insn 31 are merged, which breaks the
+  // code.  This hook handles that case: it answers false whenever
+  // *I1 > *I2 and true otherwise.
+  bool should_handle_unordered_insns (insn_info *i1,
+                                      insn_info *i2) override final
+  { return !(*i1 > *i2); }
+
+  bool fuseable_store_p (insn_info *i1, insn_info *i2) override final;
+
+  bool fuseable_load_p (insn_info *insn) override final;
+
+  void set_multiword_subreg (insn_info *i1, insn_info *i2,
+                             bool load_p) override final;
+};
+
+// MODE is acceptable as a pair operand mode iff
+// ALTIVEC_OR_VSX_VECTOR_MODE holds for it.
+bool
+rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
+{
+  const bool vector_mode_p = ALTIVEC_OR_VSX_VECTOR_MODE (mode);
+  return vector_mode_p;
+}
+
+// Call df_insn_rescan on the UNSPEC instructions that use the registers
+// defined by load INFO (their operands have been rewritten), and mark
+// each such use as a live-out use.
+//
+// The walk stops at the first real use insn that is not a single_set
+// or whose SET_SRC is not an UNSPEC.
+static void
+set_rescan_load (insn_info *info)
+{
+  for (auto def : info->defs ())
+    {
+      // dyn_cast gives null when DEF is not a set_info; there are then
+      // no uses to walk.
+      auto def_set = dyn_cast<set_info *> (def);
+      if (!def_set)
+        continue;
+
+      for (auto use : def_set->nondebug_insn_uses ())
+        {
+          // Only real instructions have an rtx_insn to rescan.
+          insn_info *use_insn = use->insn ();
+          if (!use_insn || !use_insn->is_real ())
+            continue;
+
+          rtx_insn *rtl_insn = use_insn->rtl ();
+          rtx use_set = single_set (rtl_insn);
+          if (use_set == NULL_RTX)
+            return;
+
+          if (GET_CODE (SET_SRC (use_set)) != UNSPEC)
+            return;
+
+          use->set_is_live_out_use (true);
+          df_insn_rescan (rtl_insn);
+        }
+    }
+}
+
+// Call df_insn_rescan on the real instructions that define the values
+// used by store INSN (their operands have been rewritten).
+//
+// Return false as soon as a def comes from an artificial instruction or
+// from a real instruction that is not a single_set; otherwise return
+// true.
+static bool
+set_rescan_store (insn_info *insn)
+{
+  for (auto use : insn->uses ())
+    {
+      // A use may have no def; check before dereferencing.
+      auto def = use->def ();
+      if (!def || !def->insn ())
+        continue;
+
+      insn_info *def_insn = def->insn ();
+      if (def_insn->is_artificial ())
+        return false;
+
+      if (!def_insn->is_real () || !def_insn->rtl ())
+        continue;
+
+      rtx_insn *rtl_insn = def_insn->rtl ();
+      if (single_set (rtl_insn) == NULL_RTX)
+        return false;
+
+      df_insn_rescan (rtl_insn);
+    }
+
+  return true;
+}
+
+// Def instructions already reached by feasible_store_p (rtx_insn *, bool).
+// NOTE(review): entries are only ever added; nothing in this file clears
+// the map, so they persist across queries -- confirm this is intended.
+static ordered_hash_map<rtx_insn *, bool> insn_map;
+
+// Follow the df UD chains of INSN's uses, recursing into every def
+// instruction that has not been reached before.
+//
+// Return false if a def instruction's SET_SRC differs in mode from
+// INSN's SET_SRC and has OOmode, or if IMMEDIATE_DEP is still true and a
+// same-mode def instruction's SET_SRC is a MEM (i.e. the def is a load),
+// or if the recursive check of a def instruction fails.  Otherwise
+// return true.  IMMEDIATE_DEP is dropped once a def with a different
+// mode has been seen.
+static bool
+feasible_store_p (rtx_insn *insn, bool immediate_dep)
+{
+  df_ref use;
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  rtx insn_set = single_set (insn);
+
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+
+      if (!def_link || !def_link->ref
+          || DF_REF_IS_ARTIFICIAL (def_link->ref))
+        continue;
+
+      rtx_insn *first_def_insn = DF_REF_INSN (def_link->ref);
+
+      if (first_def_insn == NULL)
+        continue;
+
+      if (first_def_insn == insn)
+        return true;
+
+      for (; def_link && def_link->ref; def_link = def_link->next)
+        {
+          // Look at the instruction of the def this link refers to, not
+          // only at the first def in the chain.  Artificial refs have
+          // no instruction.
+          if (DF_REF_IS_ARTIFICIAL (def_link->ref))
+            continue;
+
+          rtx_insn *def_insn = DF_REF_INSN (def_link->ref);
+          if (def_insn == NULL)
+            continue;
+
+          rtx set = single_set (def_insn);
+          if (set == NULL_RTX || insn_set == NULL_RTX)
+            continue;
+
+          if (GET_MODE (SET_SRC (set)) != GET_MODE (SET_SRC (insn_set)))
+            {
+              if (GET_MODE (SET_SRC (set)) == OOmode)
+                return false;
+
+              immediate_dep = false;
+            }
+          else if (immediate_dep && MEM_P (SET_SRC (set)))
+            return false;
+
+          // A def instruction that was reached before ends the walk.
+          if (insn_map.get (def_insn))
+            return true;
+
+          insn_map.put (def_insn, true);
+
+          if (!feasible_store_p (def_insn, immediate_dep))
+            return false;
+        }
+    }
+  return true;
+}
+
+// Check whether store INSN is feasible for fusion by inspecting the
+// instructions that define the values it uses.
+//
+// Return false if a use has no def, or if any def instruction is
+// artificial, is not a single_set, is a load or a store, has a PARALLEL
+// pattern, has a CONST_VECTOR, UNSPEC or ternary-class source, or fails
+// the recursive feasible_store_p (rtx_insn *, bool) check.  Otherwise
+// return true.
+static bool
+feasible_store_p (insn_info *insn)
+{
+  for (auto use : insn->uses ())
+    {
+      // Check for a missing def before dereferencing it; be
+      // conservative when there is nothing to analyze.
+      auto def = use->def ();
+      if (!def || !def->insn ())
+        return false;
+
+      insn_info *def_insn = def->insn ();
+      if (def_insn->is_artificial ())
+        return false;
+
+      if (!def_insn->is_real () || !def_insn->rtl ())
+        continue;
+
+      rtx_insn *rtl_insn = def_insn->rtl ();
+      rtx set = single_set (rtl_insn);
+
+      if (set == NULL_RTX)
+        return false;
+
+      // Return false if dependent def is load.
+      if (MEM_P (SET_SRC (set)))
+        return false;
+
+      // Return false if dependent def is store.
+      if (MEM_P (SET_DEST (set)))
+        return false;
+
+      // Return false if dependent def is parallel.
+      if (GET_CODE (PATTERN (rtl_insn)) == PARALLEL)
+        return false;
+
+      rtx_code code = GET_CODE (SET_SRC (set));
+
+      // Return false if dependent def is CONST_VECTOR or UNSPEC.
+      if (code == CONST_VECTOR || code == UNSPEC)
+        return false;
+
+      // Recursively check whether a dependent instruction is a load.
+      // This runs before the ternary check because the recursion
+      // records the instructions it reaches in insn_map.
+      if (!feasible_store_p (rtl_insn, true))
+        return false;
+
+      if (GET_RTX_CLASS (code) == RTX_TERNARY)
+        return false;
+    }
+  return true;
+}
+
+// Check whether the stores I1 and I2 can be fused.  Return true if
+// fuseable, otherwise false.
+//
+// The pair is rejected when the source of I1 is a register whose df def
+// and use counts differ or whose use count exceeds one, when both
+// stores have the same source, when the source of I1 is a SUBREG or has
+// TImode or TFmode, or when feasible_store_p rejects either store.
+bool
+rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2)
+{
+  rtx_insn *insn1 = i1->rtl ();
+  rtx_insn *insn2 = i2->rtl ();
+
+  rtx src_exp = SET_SRC (PATTERN (insn1));
+  rtx insn2_src_exp = SET_SRC (PATTERN (insn2));
+
+  // Return false if def and use count are not same.
+  if (REG_P (src_exp)
+      && (DF_REG_DEF_COUNT (REGNO (src_exp))
+            != DF_REG_USE_COUNT (REGNO (src_exp))
+          || DF_REG_USE_COUNT (REGNO (src_exp)) > 1))
+    return false;
+
+  // Return false if src of insn1 and src of insn2 are the same.  Compare
+  // structurally: two equal operands need not be the same rtx object.
+  if (rtx_equal_p (src_exp, insn2_src_exp))
+    return false;
+
+  // Return false if src of insn1 is subreg.
+  if (GET_CODE (src_exp) == SUBREG)
+    return false;
+
+  // Return false if src of insn1 is TImode or TFmode.
+  if (GET_MODE (src_exp) == TImode || GET_MODE (src_exp) == TFmode)
+    return false;
+
+  if (!feasible_store_p (i1))
+    return false;
+
+  if (!feasible_store_p (i2))
+    return false;
+
+  return true;
+}
+
+// For every use in store INSN, walk the df chain attached to that use
+// and overwrite each linked reference location whose mode equals the
+// mode of INSN's single_set source with a copy of SRC.  Uses whose chain
+// is empty or starts with an artificial reference are skipped.
+static void
+set_store_subreg (rtx_insn *insn, rtx src)
+{
+  rtx stored_value = SET_SRC (single_set (insn));
+  df_insn_info *df_info = DF_INSN_INFO_GET (insn);
+  df_ref use;
+
+  FOR_EACH_INSN_INFO_USE (use, df_info)
+    {
+      struct df_link *head = DF_REF_CHAIN (use);
+
+      if (!head || !head->ref || DF_REF_IS_ARTIFICIAL (head->ref))
+        continue;
+
+      for (struct df_link *link = head; link && link->ref; link = link->next)
+        {
+          rtx *loc = DF_REF_LOC (link->ref);
+
+          if (GET_MODE (*loc) == GET_MODE (stored_value))
+            *loc = copy_rtx (src);
+        }
+    }
+}
+
+// Build the store-pair (stxvp) pattern from the SET rtx I1: both the
+// source and the destination of I1 are switched to OOmode in place and
+// a fresh SET of the two is returned.  The new SET is written to the
+// dump file when one is open.
+static rtx
+rs6000_gen_store_pair (rtx i1)
+{
+  rtx value = SET_SRC (i1);
+  rtx target = SET_DEST (i1);
+
+  PUT_MODE_RAW (value, OOmode);
+  PUT_MODE_RAW (target, OOmode);
+
+  rtx pair = gen_rtx_SET (target, value);
+  if (!dump_file)
+    return pair;
+
+  fprintf (dump_file, "Replacing stxv with stxvp \n");
+  print_rtl_single (dump_file, pair);
+  return pair;
+}
+
+// Check whether load INFO can be fused.
+// Return true if every real non-debug use of the registers it defines
+// is a single_set whose source is an UNSPEC, otherwise false.
+// Adjacent load pair fusion with 256 bit OOmode is seen and valid with
+// use of load in UNSPEC.  That is why this check is added.
+bool
+rs6000_pair_fusion::fuseable_load_p (insn_info *info)
+{
+  for (auto def : info->defs ())
+    {
+      // dyn_cast gives null when DEF is not a set_info; be conservative
+      // rather than dereference it.
+      auto def_set = dyn_cast<set_info *> (def);
+      if (!def_set)
+        return false;
+
+      for (auto use : def_set->nondebug_insn_uses ())
+        {
+          // Check for null before querying the using instruction.
+          insn_info *use_insn = use->insn ();
+          if (!use_insn || use_insn->is_artificial ())
+            return false;
+
+          if (!use_insn->is_real ())
+            continue;
+
+          rtx use_set = single_set (use_insn->rtl ());
+          if (use_set == NULL_RTX)
+            return false;
+
+          if (GET_CODE (SET_SRC (use_set)) != UNSPEC)
+            return false;
+        }
+    }
+  return true;
+}
+
+// For every def in load INSN, walk the df chain attached to that def
+// and overwrite each linked reference location with a copy of SRC.
+// Defs whose chain is empty or starts with an artificial reference are
+// skipped.
+static void
+set_load_subreg (rtx_insn *insn, rtx src)
+{
+  df_insn_info *df_info = DF_INSN_INFO_GET (insn);
+  df_ref def;
+
+  FOR_EACH_INSN_INFO_DEF (def, df_info)
+    {
+      struct df_link *head = DF_REF_CHAIN (def);
+
+      if (!head || !head->ref || DF_REF_IS_ARTIFICIAL (head->ref))
+        continue;
+
+      for (struct df_link *link = head; link && link->ref; link = link->next)
+        *DF_REF_LOC (link->ref) = copy_rtx (src);
+    }
+}
+
+// Rewrite the registers feeding the store pair I1/I2 as subregs of one
+// OOmode register so that registers are generated in pairs.
+//
+// The source of I1 is switched to OOmode in place.  The defs reaching
+// I1 receive a subreg of it in the original mode at byte offset 16 when
+// the address of I2's destination is a PLUS with a CONST_INT second
+// operand, and at offset 0 otherwise; the defs reaching I2 always
+// receive the subreg at offset 0.  The affected instructions are then
+// passed to df_insn_rescan.
+static void
+set_multiword_subreg_store (insn_info *i1, insn_info *i2)
+{
+  rtx_insn *first = i1->rtl ();
+  rtx_insn *second = i2->rtl ();
+  rtx pair_reg = SET_SRC (PATTERN (first));
+  rtx second_addr = XEXP (SET_DEST (PATTERN (second)), 0);
+
+  // Remember the element mode before PAIR_REG is widened.
+  machine_mode elt_mode = GET_MODE (pair_reg);
+  PUT_MODE_RAW (pair_reg, OOmode);
+
+  const bool const_offset_p = (GET_CODE (second_addr) == PLUS
+                               && XEXP (second_addr, 1)
+                               && CONST_INT_P (XEXP (second_addr, 1)));
+  int first_off = const_offset_p ? 16 : 0;
+
+  rtx first_part = simplify_gen_subreg (elt_mode, pair_reg,
+                                        GET_MODE (pair_reg), first_off);
+  set_store_subreg (first, first_part);
+
+  rtx second_part = simplify_gen_subreg (elt_mode, pair_reg,
+                                         GET_MODE (pair_reg), 0);
+  set_store_subreg (second, second_part);
+
+  set_rescan_store (i1);
+  set_rescan_store (i2);
+  df_insn_rescan (first);
+}
+
+// Rewrite the users of the load pair I1/I2 to read subregs of one
+// OOmode register so that registers are generated in pairs.
+//
+// The destination of I1 is switched to OOmode in place.  The uses of
+// I2's def receive a subreg of it in the original mode at byte offset
+// 0; the uses of I1's def receive the subreg at offset 16 when the
+// address of I2's source is a PLUS with a CONST_INT second operand, and
+// at offset 0 otherwise.  The affected instructions are then passed to
+// df_insn_rescan.
+static void
+set_multiword_subreg_load (insn_info *i1, insn_info *i2)
+{
+  rtx_insn *first = i1->rtl ();
+  rtx_insn *second = i2->rtl ();
+  rtx pair_reg = SET_DEST (PATTERN (first));
+  rtx second_pat = PATTERN (second);
+
+  // Remember the element mode before PAIR_REG is widened.
+  machine_mode elt_mode = GET_MODE (pair_reg);
+  PUT_MODE_RAW (pair_reg, OOmode);
+
+  rtx second_mem = SET_SRC (second_pat);
+
+  rtx low_part = simplify_gen_subreg (elt_mode, pair_reg,
+                                      GET_MODE (pair_reg), 0);
+  set_load_subreg (second, low_part);
+
+  rtx second_addr = XEXP (second_mem, 0);
+  const bool const_offset_p = (GET_CODE (second_addr) == PLUS
+                               && XEXP (second_addr, 1)
+                               && CONST_INT_P (XEXP (second_addr, 1)));
+  int first_off = const_offset_p ? 16 : 0;
+
+  rtx first_part = simplify_gen_subreg (elt_mode, pair_reg,
+                                        GET_MODE (pair_reg), first_off);
+  set_load_subreg (first, first_part);
+
+  set_rescan_load (i1);
+  set_rescan_load (i2);
+  df_insn_rescan (first);
+}
+
+// Set subregs for an OOmode pair so that sequential registers are
+// generated for the pair I1, I2.  LOAD_P selects the load variant when
+// true and the store variant when false.
+void
+rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2,
+                                          bool load_p)
+{
+  void (*rewrite) (insn_info *, insn_info *)
+    = load_p ? set_multiword_subreg_load : set_multiword_subreg_store;
+
+  rewrite (i1, i2);
+}
+
+// Build the load-pair pattern from the SET rtx I1: both the source and
+// the destination of I1 are switched to OOmode in place and a fresh SET
+// of the two is returned.  The new SET is written to the dump file when
+// one is open.
+static rtx
+rs6000_gen_load_pair (rtx i1)
+{
+  rtx value = SET_SRC (i1);
+  rtx target = SET_DEST (i1);
+
+  PUT_MODE_RAW (value, OOmode);
+  PUT_MODE_RAW (target, OOmode);
+
+  rtx pair = gen_rtx_SET (target, value);
+  if (!dump_file)
+    return pair;
+
+  fprintf (dump_file, "lxv with lxvp ");
+  print_rtl_single (dump_file, pair);
+  return pair;
+}
+
+// Generate the pair pattern from PATS[0] only.  The load-pair generator
+// is used when LOAD_P is true or WRITEBACK is non-null; the store-pair
+// generator is used otherwise.
+rtx
+rs6000_pair_fusion::gen_pair (rtx *pats, rtx writeback, bool load_p)
+{
+  const bool use_load_form = load_p || writeback;
+
+  return (use_load_form
+          ? rs6000_gen_load_pair (pats[0])
+          : rs6000_gen_store_pair (pats[0]));
+}
+
+// Descriptor handed to the rtl_opt_pass constructor by pass_mem_fusion.
+// The pass is named "mem_fusion", requires, provides and destroys no
+// properties, and requests TODO_df_finish when it finishes.
+const pass_data pass_data_mem_fusion =
+{
+ RTL_PASS, /* type */
+ "mem_fusion", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_NONE, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+// RTL pass that sets up the df def-use and use-def chains and then runs
+// rs6000_pair_fusion.
+class pass_mem_fusion : public rtl_opt_pass
+{
+public:
+  pass_mem_fusion (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_mem_fusion, ctxt)
+  {}
+
+  opt_pass *clone () override { return new pass_mem_fusion (m_ctxt); }
+
+  /* opt_pass methods: */
+
+  // Run only when optimizing and both TARGET_VSX and TARGET_POWER10
+  // hold.  Marked as an override like the other opt_pass methods so the
+  // compiler checks the signature against the base class.
+  bool gate (function *) final override
+  {
+    return optimize > 0 && TARGET_VSX && TARGET_POWER10;
+  }
+
+  unsigned int execute (function *) final override
+  {
+    /* We use DF data flow because we change location rtx
+       which is easier to find and modify.
+       We use mix of rtl-ssa def-use and DF data flow
+       where it is easier.  */
+    df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+    df_analyze ();
+    df_set_flags (DF_DEFER_INSN_RESCAN);
+
+    rs6000_pair_fusion pass;
+    pass.run ();
+    return 0;
+  }
+}; // class pass_mem_fusion
+
+// Factory for the mem_fusion pass: returns a newly allocated
+// pass_mem_fusion bound to context CTXT.
+rtl_opt_pass *
+make_pass_mem_fusion (gcc::context *ctxt)
+{
+  rtl_opt_pass *pass = new pass_mem_fusion (ctxt);
+  return pass;
+}
@@ -28,7 +28,9 @@ along with GCC; see the file COPYING3. If not see
The power8 does not have instructions that automaticaly do the byte swaps
for loads and stores. */
INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
-
+ /* Pass to replace lxv/stxv instructions that access adjacent memory
+ addresses with lxvp/stxvp instructions. */
+ INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion);
/* Pass to do the PCREL_OPT optimization that combines the load of an
external symbol's address along with a single load or store using that
address as a base register. */
@@ -343,6 +343,7 @@ namespace gcc { class context; }
class rtl_opt_pass;
extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
+extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *);
extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
extern bool rs6000_quadword_masked_address_p (const_rtx exp);
@@ -35,6 +35,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
$(COMPILE) $<
$(POSTCOMPILE)
+rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+
rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
$(COMPILE) $<
$(POSTCOMPILE)
@@ -312,9 +312,9 @@ static int
encode_lfs (lfs_fields fields)
{
int size_log2 = exact_log2 (fields.size);
- gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
- return ((int)fields.load_p << 3)
- | ((int)fields.fpsimd_p << 2)
+ gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9);
+ // Layout: size_log2 - 2 (0..7) occupies bits 0-2, fpsimd_p is bit 3
+ // and load_p is bit 4.
+ return ((int)fields.load_p << 4)
+ | ((int)fields.fpsimd_p << 3)
| (size_log2 - 2);
}
@@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields)
static lfs_fields
decode_lfs (int lfs)
{
- bool load_p = (lfs & (1 << 3));
- bool fpsimd_p = (lfs & (1 << 2));
- unsigned size = 1U << ((lfs & 3) + 2);
+ // Inverse of encode_lfs: load_p is bit 4, fpsimd_p is bit 3 and the
+ // low three bits hold log2 (size) - 2.  The size mask must cover all
+ // three bits, otherwise sizes of 64 bytes and above decode wrongly.
+ bool load_p = (lfs & (1 << 4));
+ bool fpsimd_p = (lfs & (1 << 3));
+ unsigned size = 1U << ((lfs & 7) + 2);
return { load_p, fpsimd_p, size };
}
@@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
if (MEM_VOLATILE_P (mem))
return;
+ if (load_p && !m_pass->fuseable_load_p (insn))
+ return;
+
// Ignore writeback accesses if the hook says to do so.
if (!m_pass->should_handle_writeback (writeback::EXISTING)
&& GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
@@ -1821,6 +1824,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
rtx reg_notes = combine_reg_notes (first, second, load_p);
+ m_pass->set_multiword_subreg (first, second, load_p);
rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p);
insn_change *pair_change = nullptr;
auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
@@ -2383,6 +2387,9 @@ bool
pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
insn_info *i1, insn_info *i2)
{
+ if (!m_pass->should_handle_unordered_insns (i1, i2))
+ return false;
+
if (dump_file)
fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
load_p, i1->uid (), i2->uid ());
@@ -2411,6 +2418,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
reg_ops[i] = XEXP (pats[i], !load_p);
}
+ if (!load_p && !m_pass->fuseable_store_p (i1, i2))
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
+ insns[0]->uid (), insns[1]->uid ());
+ return false;
+ }
+
if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
{
if (dump_file)
@@ -171,6 +171,26 @@ struct pair_fusion {
virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
rtx regs[2], bool load_p) = 0;
+ // Given insn_info pair I1 and I2, sets subreg with multiword registers
+ // to assign register pairs by allocators.
+ // LOAD_P is true iff the pair is a load.
+ virtual void set_multiword_subreg (rtl_ssa::insn_info *i1, rtl_ssa::insn_info *i2,
+ bool load_p) = 0;
+
+ // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
+ // store mem pairs.
+ // Return true if feasible to perform store mem pairs otherwise false.
+ virtual bool fuseable_store_p (rtl_ssa::insn_info *i1, rtl_ssa::insn_info *i2) = 0;
+
+ // Given insn_info INFO for a load, checks whether it is feasible to fuse
+ // it into a load mem pair.
+ // Return true if feasible to perform load mem pairs otherwise false.
+ virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0;
+
+ // Given insn_info pair I1 and I2, return true if offsets are in order.
+ virtual bool should_handle_unordered_insns (rtl_ssa::insn_info *i1,
+ rtl_ssa::insn_info *i2) = 0;
+
void process_block (rtl_ssa::bb_info *bb);
rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2],
const rtl_ssa::insn_range_info
@@ -379,6 +379,7 @@ public:
//
// This routine is only meaningful when def () is nonnull.
bool is_last_use () const;
+ void set_is_live_out_use (bool value) { m_is_live_out_use = value; }
// Print a description of def () to PP.
void print_def (pretty_printer *pp) const;
@@ -430,7 +431,6 @@ private:
void record_reference (rtx_obj_reference, bool);
void set_insn (insn_info *);
void set_def (set_info *set) { m_def = set; }
- void set_is_live_out_use (bool value) { m_is_live_out_use = value; }
void copy_prev_from (use_info *);
void copy_next_from (use_info *);
void set_last_use (use_info *);
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <altivec.h>
+
+/* Compile-only test body: builds two accumulators with
+ __builtin_mma_xvf32ger / __builtin_mma_xvf32gerpp from ptr[0..1] and
+ ptr[2..3] and stores them through dst1 and dst2.
+ NOTE(review): dst1, dst2, src and ptr are locals that are read without
+ ever being initialized -- confirm this is intentional for the test. */
+void
+foo2 ()
+{
+ __vector_quad *dst1;
+ __vector_quad *dst2;
+ vector unsigned char src;
+ __vector_quad acc;
+ vector unsigned char *ptr;
+ __builtin_mma_xvf32ger(&acc, src, ptr[0]);
+ __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+ *dst1 = acc;
+ __builtin_mma_xvf32ger(&acc, src, ptr[2]);
+ __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
+ *dst2 = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <altivec.h>
+
+/* Compile-only test body: builds one accumulator with
+ __builtin_mma_xvf32ger / __builtin_mma_xvf32gerpp from the adjacent
+ vectors ptr[0] and ptr[1] and stores it to *dst. */
+void
+foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src)
+{
+ __vector_quad acc;
+ __builtin_mma_xvf32ger(&acc, src, ptr[0]);
+ __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+ *dst = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
@@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
dst[13] = acc;
}
-/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
-/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
/* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
/* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
/* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */