Message ID | 47ed562a-379c-4cf3-979e-7ce8962808b4@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [rs6000,middle-end] v6: Add implementation for different targets for pair mem fusion | expand |
Ajit Agarwal <aagarwa1@linux.ibm.com> writes: > Hello All: > > This version of the patch relaxes store fusion for more use cases. > > Common infrastructure using generic code for pair mem fusion of different > targets. > > rs6000 target specific code implements virtual functions defined by generic code. > > Target specific code is added in rs6000-mem-fusion.cc. > > Bootstrapped and regtested on powerpc64-linux-gnu. > > Thanks & Regards > Ajit > > > rs6000, middle-end: Add implementation for different targets for pair mem fusion > > Common infrastructure using generic code for pair mem fusion of different > targets. > > rs6000 target specific code implements virtual functions defined by generic code. > > Target specific code is added in rs6000-mem-fusion.cc. > > 2024-07-02 Ajit Kumar Agarwal <aagarwa1@linux.ibm.com> > > gcc/ChangeLog: > > * config/rs6000/rs6000-passes.def: New mem fusion pass > before pass_early_remat. > * pair-fusion.h: Add additional pure virtual function > required for rs6000 target implementation. > * pair-fusion.cc: Use of virtual functions for additional > virtual function added for rs6000 target. > * config/rs6000/rs6000-mem-fusion.cc: Add new pass. > Add target specific implementation for generic pure virtual > functions. > * config/rs6000/mma.md: Modify movoo machine description. > Add new machine description movoo1. > * config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move > to expand movoo machine description for all constraints. > * config.gcc: Add new object file. > * config/rs6000/rs6000-protos.h: Add new prototype for mem > fusion pass. > * config/rs6000/t-rs6000: Add new rule. > * rtl-ssa/functions.h: Move out allocate function from private > to public and add get_m_temp_defs function. > > gcc/testsuite/ChangeLog: > > * g++.target/powerpc/mem-fusion.C: New test. > * g++.target/powerpc/mem-fusion-1.C: New test. > * gcc.target/powerpc/mma-builtin-1.c: Modify test. 
> --- > gcc/config.gcc | 2 + > gcc/config/rs6000/mma.md | 26 +- > gcc/config/rs6000/rs6000-mem-fusion.cc | 708 ++++++++++++++++++ > gcc/config/rs6000/rs6000-passes.def | 4 +- > gcc/config/rs6000/rs6000-protos.h | 1 + > gcc/config/rs6000/rs6000.cc | 57 +- > gcc/config/rs6000/rs6000.md | 1 + > gcc/config/rs6000/t-rs6000 | 5 + > gcc/pair-fusion.cc | 27 +- > gcc/pair-fusion.h | 34 + > gcc/rtl-ssa/functions.h | 11 +- > .../g++.target/powerpc/mem-fusion-1.C | 22 + > gcc/testsuite/g++.target/powerpc/mem-fusion.C | 15 + > .../gcc.target/powerpc/mma-builtin-1.c | 4 +- > 14 files changed, 890 insertions(+), 27 deletions(-) > create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc > create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C > create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C > > diff --git a/gcc/config.gcc b/gcc/config.gcc > index bc45615741b..12f79a78177 100644 > --- a/gcc/config.gcc > +++ b/gcc/config.gcc > @@ -524,6 +524,7 @@ powerpc*-*-*) > extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" > extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" > extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o" > + extra_objs="${extra_objs} rs6000-mem-fusion.o" > extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" > extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h" > extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h" > @@ -560,6 +561,7 @@ rs6000*-*-*) > extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt" > extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" > extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" > + extra_objs="${extra_objs} rs6000-mem-fusion.o" > target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc" > target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc" > ;; > diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md > index 
04e2d0066df..88413926a02 100644 > --- a/gcc/config/rs6000/mma.md > +++ b/gcc/config/rs6000/mma.md > @@ -294,7 +294,31 @@ > > (define_insn_and_split "*movoo" > [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa") > - (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))] > + (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))] > + "TARGET_MMA > + && (gpc_reg_operand (operands[0], OOmode) > + || gpc_reg_operand (operands[1], OOmode))" > +;; "" > + "@ > + # > + # > + #" > + "&& reload_completed" > + [(const_int 0)] > +{ > + rs6000_split_multireg_move (operands[0], operands[1]); > + DONE; > +} > + [(set_attr "type" "vecload,vecstore,veclogical") > + (set_attr "length" "*,*,8")]) > +;; (set_attr "max_prefixed_insns" "2,2,*")]) > + > + > +(define_insn_and_split "*movoo1" > + [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa") > + (unspec [ > + (match_operand:OO 1 "input_operand" "ZwO,wa,wa") > + ] UNSPEC_LXVP))] > "TARGET_MMA > && (gpc_reg_operand (operands[0], OOmode) > || gpc_reg_operand (operands[1], OOmode))" > diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc b/gcc/config/rs6000/rs6000-mem-fusion.cc > new file mode 100644 > index 00000000000..b63b6f31001 > --- /dev/null > +++ b/gcc/config/rs6000/rs6000-mem-fusion.cc > @@ -0,0 +1,708 @@ > +/* Subroutines used to perform adjacent load/store into > + paired memory accesses for TARGET_POWER10 and TARGET_VSX. > + > + Copyright (C) 2024 Free Software Foundation, Inc. > + > + This file is part of GCC. > + > + GCC is free software; you can redistribute it and/or modify it > + under the terms of the GNU General Public License as published > + by the Free Software Foundation; either version 3, or (at your > + option) any later version. > + > + GCC is distributed in the hope that it will be useful, but WITHOUT > + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY > + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public > + License for more details. 
> + > + You should have received a copy of the GNU General Public License > + along with GCC; see the file COPYING3. If not see > + <http://www.gnu.org/licenses/>. */ > + > +#define INCLUDE_ALGORITHM > +#define INCLUDE_FUNCTIONAL > +#define INCLUDE_LIST > +#define INCLUDE_TYPE_TRAITS > +#include "config.h" > +#include "system.h" > +#include "coretypes.h" > +#include "backend.h" > +#include "rtl.h" > +#include "df.h" > +#include "rtl-iter.h" > +#include "rtl-ssa.h" > +#include "rtl-ssa/internals.h" > +#include "rtl-ssa/internals.inl" > +#include "cfgcleanup.h" > +#include "tree-pass.h" > +#include "pair-fusion.h" > + > +using namespace rtl_ssa; > + > +struct rs6000_pair_fusion : public pair_fusion > +{ > + bool fpsimd_op_p (rtx , machine_mode , bool) override final > + { > + return false; > + } > + > + bool pair_mem_insn_p (rtx_insn *, bool &) override final > + { > + return false; > + } > + > + void change_existing_multword_mode (rtx_insn *insn) override final; > + > + bool pair_mem_ok_with_policy (rtx, bool) override final > + { > + return true; > + } > + > + bool pair_operand_mode_ok_p (machine_mode mode) override final; > + > + rtx gen_pair (rtx *pats, rtx, bool load_p) override final; > + > + bool pair_reg_operand_ok_p (bool, rtx, machine_mode) override final > + { > + return true; > + } > + > + int pair_mem_alias_check_limit () override final > + { > + return 0; > + } > + > + bool should_handle_writeback (enum writeback_type) override final > + { > + return false; > + } > + > + bool track_loads_p () override final > + { > + return true; > + } > + > + bool track_stores_p () override final > + { > + return true; > + } > + > + bool pair_mem_in_range_p (HOST_WIDE_INT) override final > + { > + return true; > + } > + > + rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) override final > + { > + return NULL_RTX; > + } > + > + rtx destructure_pair (rtx_def **, rtx, bool) override final > + { > + return NULL_RTX; > + } > + > + bool fuseable_store_p (insn_info *i1, 
insn_info *i2) override final; > + > + bool fuseable_load_p (insn_info *insn) override final; > + > + void set_multiword_subreg (insn_info *i1, insn_info *i2, > + bool load_p) override final; > + > + void modify_new_rtx_insn (insn_info *first, obstack_watermark *attempt, > + insn_change **pair_change, > + auto_vec <insn_change *> &changes) override final; > +}; > + > +bool > +rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode) > +{ > + return (ALTIVEC_OR_VSX_VECTOR_MODE (mode)); > +} > + > +void > +rs6000_pair_fusion::change_existing_multword_mode (rtx_insn *insn) > +{ > + rtx set = single_set (insn); > + rtx src = SET_SRC (set); > + rtx dest = SET_DEST (set); > + rtx copy = NULL_RTX; > + > + if ((MEM_P (src) && GET_MODE (src) == OOmode) > + || (MEM_P (dest) && GET_MODE (dest) == OOmode)) > + { > + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), > + gen_rtvec (1, src), > + UNSPEC_LXVP); > + copy = gen_rtx_SET (dest, unspec); > + rtx_insn *new_insn = emit_insn_after (copy, insn); > + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn)); > + df_insn_rescan (new_insn); > + df_insn_delete (insn); > + remove_insn (insn); > + insn->set_deleted (); > + } > +} > + > +static void > +update_change (set_info *set) > +{ > + if (!set->has_any_uses ()) > + return; > + > + auto *use = *set->all_uses ().begin (); > + do > + { > + auto *next_use = use->next_use (); > + if (use->is_in_phi ()) > + { > + update_change (use->phi ()); > + } > + else > + { > + crtl->ssa->remove_use (use); > + } > + use = next_use; > + } > + while (use); > +} > + > +void > +rs6000_pair_fusion::modify_new_rtx_insn (insn_info *first, > + obstack_watermark *attempt, > + insn_change **pair_change, > + auto_vec<insn_change *> &changes) > +{ > + for (insn_change *change : changes) > + for (auto def : change->old_defs ()) > + { > + auto set = dyn_cast<set_info *> (def); > + update_change (set); > + } > + > + auto &new_defs = (*pair_change)->new_defs; > + vec_rtx_properties properties; > + 
properties.add_insn (first->rtl (), true); > + // Build up the new list of definitions. > + for (rtx_obj_reference ref : properties.refs ()) > + if (ref.is_write ()) > + { > + auto *set = crtl->ssa->allocate<set_info> (first, > + full_register (ref.regno)); > + if (set) > + { > + auto def = find_access (new_defs, ref.regno); > + if (!def) > + { > + new_defs = insert_access (*attempt, set, > + new_defs); > + auto &m_temp_defs = crtl->ssa->get_m_temp_defs (); > + m_temp_defs.safe_push (set); > + } > + } > + } > +} > + > +// df_insn_rescan dependent instruction where operands > +// are reversed given insn_info INFO. > +static void > +set_rescan_load (insn_info *i1) > +{ > + for (auto def : i1->defs ()) > + { > + auto set = dyn_cast<set_info *> (def); > + for (auto use : set->all_uses ()) > + { > + insn_info *info = use->insn (); > + if (info && info->rtl ()) > + { > + rtx_insn *rtl_insn = info->rtl (); > + df_insn_rescan (rtl_insn); > + } > + } > + } > +} > + > +// df_insn_rescan the def instruction where operands are reversed given INSN. > +static bool > +set_rescan_store (insn_info *insn) > +{ > + for (auto use : insn->uses()) > + { > + auto def = use->def (); > + > + if (!def) > + return false; > + > + if (def->insn ()->is_artificial ()) > + return false; > + > + if (def->insn () && def->insn ()->rtl () > + && def->insn()->is_real ()) > + { > + rtx_insn *rtl_insn = def->insn ()->rtl (); > + rtx set = single_set (rtl_insn); > + > + if (set == NULL_RTX) > + return false; > + df_insn_rescan (rtl_insn); > + } > + } > + return true; > +} > + > +// Check for feasibility of store to be fuseable or not. Return true if > +// feasible otherwise false. 
> +static bool > +feasible_store_p (insn_info *insn) > +{ > + for (auto use : insn->uses ()) > + { > + auto def = use->def (); > + > + if (def->insn ()->is_artificial ()) > + return false; > + > + if (def->insn () && def->insn ()->rtl () > + && def->insn()->is_real ()) > + { > + rtx_insn *rtl_insn = def->insn ()->rtl (); > + rtx set = single_set (rtl_insn); > + > + if (set == NULL_RTX) > + return false; > + > + // Return false if dependent def is load. > + // This is done as def instruction could be a fused load and > + // to avoid already existing subreg (reg:OO R) offset. > + if (rtl_insn && MEM_P (SET_SRC (set))) > + return false; > + > + // Return false if dependent def is store. > + if (rtl_insn && MEM_P (SET_DEST (set))) > + return false; I don't understand these tests. It might help to turn it around and say: what sort of cases do you want to handle? > + } > + } > + return true; > +} > + > +// Check if store can be fuseable or not. Return true if fuseable otherwise > +// false. > +bool > +rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2) > +{ > + rtx_insn *insn1 = i1->rtl (); > + rtx_insn *insn2 = i2->rtl (); > + rtx body = PATTERN (insn1); > + rtx src_exp = SET_SRC (body); > + rtx insn2_body = PATTERN (insn2); > + rtx insn2_src_exp = SET_SRC (insn2_body); > + > + if (!(REG_P (src_exp) > + && crtl->ssa->single_dominating_def (REGNO (src_exp)))) > + return false; > + > + // This is done as def instruction could be a fused load and > + // to avoid already existing subreg (reg:OO R) offset. > + if (DF_REG_USE_COUNT (REGNO (src_exp)) > 1) > + return false; > + > + // Return false if src of insn1 and src of insn2 are same. > + if (src_exp == insn2_src_exp) > + return false; > + > + // Return false if src of insn1 is subreg. > + if (GET_CODE (src_exp) == SUBREG) > + return false; This can't be true after the REG_P check above. > + > + // Return false if src of insn2 is subreg. 
> + if (GET_CODE (insn2_src_exp) == SUBREG) > + return false; Shouldn't the tests for i1 and i2 be symmetrical, with i2 also requiring a single dominating definition? > + > + if (!feasible_store_p (i1)) > + return false;; > + > + if (!feasible_store_p (i2)) > + return false; > + > + return true; > +} > + > +// Set subreg for def of store INSN given rtx SRC instruction. > +static void > +set_store_subreg (insn_info *i1, rtx src, int regoff) > +{ > + for (auto use: i1->uses ()) > + { > + auto def = use->def (); > + if (!def) > + return; > + > + insn_info *info = def->insn (); > + > + if (info->is_artificial ()) > + return; > + > + if (info && info->is_real ()) > + { > + rtx_insn *rtl_insn = info->rtl (); > + rtx set = single_set (rtl_insn); > + if (set == NULL_RTX) > + return; > + df_ref ref; > + FOR_EACH_INSN_DEF (ref, rtl_insn) > + { > + rtx src_exp = SET_SRC (PATTERN (i1->rtl ())); > + if (REG_P (src_exp) && DF_REF_REGNO (ref) == REGNO (src_exp)) > + { > + rtx *loc = DF_REF_LOC (ref); > + if (GET_CODE (*loc) == SUBREG) > + { > + rtx src1 = simplify_gen_subreg (GET_MODE (*loc), > + SUBREG_REG (src), > + OOmode, > + regoff); > + *loc = copy_rtx (src1); > + } > + else > + *loc = copy_rtx (src); > + } > + } > + } > + } > +} > + > +// Check whether load can be fusable or not. > +// Return true if fuseable otherwise false. > +bool > +rs6000_pair_fusion::fuseable_load_p (insn_info *i1) > +{ > + rtx_insn *insn = i1->rtl (); > + rtx body = PATTERN (insn); > + rtx dest_exp = SET_DEST (body); > + > + if (!(REG_P (dest_exp) > + && crtl->ssa->single_dominating_def (REGNO (dest_exp)))) > + return false; > + return true; > +} > + > +// Propagate insn I1 with new rtx NEW_DEST_EXP. 
> +static void > +propagate_insn (insn_info *i1, rtx new_dest_exp) > +{ > + df_ref ref; > + FOR_EACH_INSN_DEF (ref, i1->rtl()) > + { > + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); > + if (REG_P (dest_exp) > + && DF_REF_REGNO (ref) == REGNO (dest_exp)) > + { > + rtx *loc = DF_REF_LOC (ref); > + *loc = new_dest_exp; > + } > + } > +} > + > +// Generate new reg rtx with copy of OLD_DEST for OOmode pair. > +static rtx > +new_reg_rtx (rtx old_dest) > +{ > + rtx new_dest_exp = gen_reg_rtx (OOmode); > + ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest); > + REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest); > + REG_POINTER (new_dest_exp) = REG_POINTER (old_dest); > + REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest); > + max_regno = max_reg_num (); > + return new_dest_exp; > +} > + > +// Set subreg with use of INSN given SRC rtx instruction. > +static void > +set_load_subreg (insn_info *i1, rtx src) > +{ > + rtx set = single_set (i1->rtl()); > + rtx old_dest = SET_DEST (set); > + > + for (auto def : i1->defs ()) > + { > + auto set = dyn_cast<set_info *> (def); > + for (auto use : set->nondebug_insn_uses ()) > + { > + insn_info *info = use->insn (); > + if (!info || !info->rtl ()) > + continue; > + > + rtx_insn *rtl_insn = info->rtl (); > + df_ref ref; > + > + FOR_EACH_INSN_USE (ref, rtl_insn) > + { > + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); > + if (REG_P (dest_exp) > + && DF_REF_REGNO (ref) == REGNO (dest_exp)) > + { > + rtx *loc = DF_REF_LOC (ref); > + insn_propagation prop (rtl_insn, old_dest, src); > + if (GET_CODE (*loc) == SUBREG) > + { > + if (!prop.apply_to_pattern (loc)) > + { > + if (dump_file != NULL) > + { > + fprintf (dump_file, > + "Cannot propagate insn \n"); > + print_rtl_single (dump_file, rtl_insn); > + } > + return; > + } > + } > + else > + *loc = copy_rtx (src); > + } > + } > + } > + } > +} > + > +// Set subreg for OO mode store pair to generate registers in pairs > +// given insn_info I1 and I2. 
> +static void > +set_multiword_subreg_store (insn_info *i1, insn_info *i2) > +{ > + rtx_insn *insn1 = i1->rtl (); > + rtx_insn *insn2 = i2->rtl (); > + rtx body = PATTERN (insn1); > + rtx src_exp = SET_SRC (body); > + rtx insn2_body = PATTERN (insn2); > + rtx insn2_dest_exp = SET_DEST (insn2_body); > + machine_mode mode = GET_MODE (src_exp); > + int regoff; > + rtx src; > + rtx addr = XEXP (insn2_dest_exp, 0); > + > + PUT_MODE_RAW (src_exp, OOmode); > + if (GET_CODE (addr) == PLUS > + && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1))) > + regoff = 16; > + else > + regoff = 0; > + > + src = simplify_gen_subreg (mode, > + src_exp, GET_MODE (src_exp), > + regoff); > + > + set_store_subreg (i1, src, regoff); > + > + int regoff1 = 0; > + rtx src1; > + > + src1 = simplify_gen_subreg (mode, > + src_exp, GET_MODE (src_exp), > + regoff1); > + > + set_store_subreg (i2, src1, regoff1); > + set_rescan_store (i1); > + set_rescan_store (i2); > + df_insn_rescan (insn1); > +} > + > +// Set subreg for OO mode pair load to generate registers in pairs given > +// insn_info I2 and I2. > +static void > +set_multiword_subreg_load (insn_info *i1, insn_info *i2) > +{ > + rtx_insn *insn1 = i1->rtl (); > + rtx body = PATTERN (insn1); > + rtx dest_exp = SET_DEST (body); > + machine_mode mode = GET_MODE (dest_exp); > + PUT_MODE_RAW (dest_exp, OOmode); > + > + int regoff = 0; > + rtx src; > + > + src = simplify_gen_subreg (mode, > + dest_exp, GET_MODE (dest_exp), > + regoff); > + > + set_load_subreg (i2, src); > + > + int regoff1; > + rtx src1; > + > + regoff1 = 16; > + src1 = simplify_gen_subreg (mode, > + dest_exp, GET_MODE (dest_exp), > + regoff1); > + set_load_subreg (i1, src1); > + > + set_rescan_load (i1); > + set_rescan_load (i2); > + df_insn_rescan (insn1); > +} > + > +// Set subreg for OO mode pair load for existing subreg rtx to generate > +// registers in pairs given insn_info I2 and I2. 
> +static void > +set_multiword_existing_subreg (insn_info *i1, insn_info *i2) > +{ > + rtx_insn *insn1 = i1->rtl (); > + rtx body = PATTERN (insn1); > + rtx dest_exp = SET_DEST (body); > + machine_mode mode = GET_MODE (dest_exp); > + int regoff1; > + regoff1 = 16; > + rtx new_dest_exp = new_reg_rtx (dest_exp); > + > + rtx src = simplify_gen_subreg (mode, > + new_dest_exp, > + OOmode, > + regoff1); > + > + set_load_subreg (i1, src); > + propagate_insn (i1, new_dest_exp); > + > + int regoff = 0; > + rtx sset = single_set (i2->rtl ()); > + rtx insn2_dest_exp = SET_DEST (sset); > + machine_mode insn2_mode = GET_MODE (insn2_dest_exp); > + > + src = simplify_gen_subreg (insn2_mode, > + new_dest_exp, > + OOmode, > + regoff); > + > + set_load_subreg (i2, src); > + propagate_insn (i2, new_dest_exp); > + > + auto attempt = crtl->ssa->new_change_attempt (); > + resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) }; > + auto *set = crtl->ssa->allocate<set_info> (i1, resource); > + if (set) > + { > + auto def = find_access (i1->defs (), REGNO (new_dest_exp)); > + if (!def) > + i1->defs() = insert_access (attempt, set, i1->defs()); > + } > + > + set_rescan_load (i1); > + set_rescan_load (i2); > + df_insn_rescan (insn1); > +} > + > +// Return true iff insn I1 has already existing subreg. 
> +static bool > +use_has_subreg_p (insn_info *i1) > +{ > + for (auto def : i1->defs ()) > + { > + auto set = dyn_cast<set_info *> (def); > + for (auto use : set->nondebug_insn_uses ()) > + { > + insn_info *info = use->insn (); > + if (info && info->rtl ()) > + { > + rtx_insn *rtl_insn = info->rtl (); > + df_ref ref; > + FOR_EACH_INSN_USE (ref, rtl_insn) > + { > + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); > + if (REG_P (dest_exp) > + && DF_REF_REGNO (ref) == REGNO (dest_exp)) > + { > + rtx *loc = DF_REF_LOC (ref); > + if (GET_CODE (*loc) == SUBREG) > + return true; > + } > + } > + } > + } > + } > + return false; > +} > + > +// Set subreg for OO mode pair to generate sequential registers given > +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false > +// if store insn. > +void > +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2, > + bool load_p) > +{ > + if (load_p) > + { > + bool i1_subreg_p = use_has_subreg_p (i1); > + bool i2_subreg_p = use_has_subreg_p (i2); > + > + if (i1_subreg_p || i2_subreg_p) > + set_multiword_existing_subreg (i1, i2); > + else > + set_multiword_subreg_load (i1, i2); I don't understand this. Why do we have both set_multiword_existing_subreg and set_multiword_subreg_load? i1_subreg_p and i2_subreg_p are logically independent of one another (since i1 and i2 were separate instructions until now). So "i1_subreg_p || i2_subreg_p" implies that set_multiword_existing_subreg can handle i1s that have no existing subreg (used when i2_subreg_p) and that it can handle i2s that have no existing subreg (used when i1_subreg_p). So doesn't this mean that set_multiword_existing_subreg can handle everything? IMO, the way the update should work is that: (a) all references to the old registers should be updated via insn_propagation (regardless of whether the old references involved subregs). (b) those updates should be part of the same insn_change group as the change to the load itself. 
For stores, definitions of the stored register can probably be handled directly using df_refs, but there too, the updates should IMO be part of the same insn_change group as the change to the store itself. In both cases, it's the: crtl->ssa->change_insns (changes); in pair_fusion_bb_info::fuse_pair that should be responsible for updating the rtl-ssa IR. The changes that the pass wants to make should be described as insn_changes and passed to change_insns. The reason for funneling all changes through change_insns is that it allows rtl-ssa to maintain more complex datastructures. Clients aren't supposed to manually update the datastructures piecemeal. Thanks, Richard > + } > + else > + set_multiword_subreg_store (i1, i2); > +} > + > +rtx > +rs6000_pair_fusion::gen_pair (rtx *pats, rtx, bool load_p) > +{ > + rtx i1 = pats[0]; > + rtx src_exp = SET_SRC (i1); > + rtx dest_exp = SET_DEST (i1); > + PUT_MODE_RAW (src_exp, OOmode); > + PUT_MODE_RAW (dest_exp, OOmode); > + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest_exp), > + gen_rtvec (1, src_exp), > + UNSPEC_LXVP); > + rtx set = gen_rtx_SET (dest_exp, unspec); > + if (dump_file) > + { > + if (load_p) > + fprintf (dump_file, "lxv with lxvp "); > + else > + fprintf (dump_file, "stxv with stxvp "); > + print_rtl_single (dump_file, set); > + } > + return set; > +} > + > +const pass_data pass_data_mem_fusion = > +{ > + RTL_PASS, /* type */ > + "mem_fusion", /* name */ > + OPTGROUP_NONE, /* optinfo_flags */ > + TV_NONE, /* tv_id */ > + 0, /* properties_required */ > + 0, /* properties_provided */ > + 0, /* properties_destroyed */ > + 0, /* todo_flags_start */ > + TODO_df_finish, /* todo_flags_finish */ > +}; > + > +class pass_mem_fusion : public rtl_opt_pass > +{ > +public: > + pass_mem_fusion (gcc::context *ctxt) > + : rtl_opt_pass (pass_data_mem_fusion, ctxt) > + {} > + > + opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);} > + > + /* opt_pass methods: */ > + bool gate (function *) > + { > + return (optimize > 
0 && TARGET_VSX && TARGET_POWER10); > + } > + > + unsigned int execute (function *) final override > + { > + rs6000_pair_fusion pass; > + pass.run (); > + return 0; > + } > +}; // class pass_mem_fusion > + > +rtl_opt_pass * > +make_pass_mem_fusion (gcc::context *ctxt) > +{ > + return new pass_mem_fusion (ctxt); > +} > diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def > index 46a0d0b8c56..0b48f57014d 100644 > --- a/gcc/config/rs6000/rs6000-passes.def > +++ b/gcc/config/rs6000/rs6000-passes.def > @@ -28,7 +28,9 @@ along with GCC; see the file COPYING3. If not see > The power8 does not have instructions that automaticaly do the byte swaps > for loads and stores. */ > INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps); > - > + /* Pass to replace adjacent memory addresses lxv/stxv instruction with > + lxvp/stxvp instruction. */ > + INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion); > /* Pass to do the PCREL_OPT optimization that combines the load of an > external symbol's address along with a single load or store using that > address as a base register. 
*/ > diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h > index 09a57a806fa..1412b31c2eb 100644 > --- a/gcc/config/rs6000/rs6000-protos.h > +++ b/gcc/config/rs6000/rs6000-protos.h > @@ -343,6 +343,7 @@ namespace gcc { class context; } > class rtl_opt_pass; > > extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *); > +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *); > extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *); > extern bool rs6000_sum_of_two_registers_p (const_rtx expr); > extern bool rs6000_quadword_masked_address_p (const_rtx exp); > diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc > index 58553ff66f4..6da4e70973d 100644 > --- a/gcc/config/rs6000/rs6000.cc > +++ b/gcc/config/rs6000/rs6000.cc > @@ -27428,7 +27428,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) > reg_mode = word_mode; > reg_mode_size = GET_MODE_SIZE (reg_mode); > > - gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode)); > + gcc_assert (mode == OOmode > + || reg_mode_size * nregs == GET_MODE_SIZE (mode)); > > /* TDmode residing in FP registers is special, since the ISA requires that > the lower-numbered word of a register pair is always the most significant > @@ -27475,6 +27476,11 @@ rs6000_split_multireg_move (rtx dst, rtx src) > int reg_mode_nregs = hard_regno_nregs (reg, reg_mode); > if (MEM_P (dst)) > { > + rtx addr = XEXP (dst, 0); > + rtx opnd1 = NULL_RTX; > + if (addr && GET_CODE (addr) == PLUS) > + opnd1 = XEXP (addr,1); > + > unsigned offset = 0; > unsigned size = GET_MODE_SIZE (reg_mode); > > @@ -27488,7 +27494,13 @@ rs6000_split_multireg_move (rtx dst, rtx src) > { > unsigned subreg > = WORDS_BIG_ENDIAN ? 
i : (nregs - reg_mode_nregs - i); > - rtx dst2 = adjust_address (dst, reg_mode, offset); > + rtx dst2 = dst; > + > + if ((GET_CODE (addr) != PLUS > + || (opnd1 && CONST_INT_P(opnd1)))) > + dst2 = adjust_address (dst, reg_mode, offset); > + else > + PUT_MODE_RAW (dst, reg_mode); > rtx src2 = gen_rtx_REG (reg_mode, reg + subreg); > offset += size; > emit_insn (gen_rtx_SET (dst2, src2)); > @@ -27499,15 +27511,25 @@ rs6000_split_multireg_move (rtx dst, rtx src) > > if (MEM_P (src)) > { > + rtx addr = XEXP (src, 0); > + rtx opnd1 = NULL_RTX; > + if (addr && GET_CODE (addr) == PLUS) > + opnd1 = XEXP (addr,1); > + > unsigned offset = 0; > unsigned size = GET_MODE_SIZE (reg_mode); > > - for (int i = 0; i < nregs; i += reg_mode_nregs) > + for (int i = nregs-1; i >= 0; i -= reg_mode_nregs) > { > unsigned subreg > = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); > rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg); > - rtx src2 = adjust_address (src, reg_mode, offset); > + rtx src2 = src; > + > + if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1)))) > + src2 = adjust_address (src, reg_mode, offset); > + else > + PUT_MODE_RAW (src2, reg_mode); > offset += size; > emit_insn (gen_rtx_SET (dst2, src2)); > } > @@ -27515,7 +27537,7 @@ rs6000_split_multireg_move (rtx dst, rtx src) > /* If we are writing an accumulator register, we have to > prime it after we've written it. 
*/ > if (TARGET_MMA > - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) > + && REG_P (dst) && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) > emit_insn (gen_mma_xxmtacc (dst, dst)); > > return; > @@ -27608,9 +27630,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) > { > for (i = nregs - 1; i >= 0; i--) > { > - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); > - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); > - emit_insn (gen_rtx_SET (dst_i, src_i)); > + if (REG_P (dst) && REG_P (src)) > + { > + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); > + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); > + emit_insn (gen_rtx_SET (dst_i, src_i)); > + } > } > } > else > @@ -27625,7 +27650,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) > /* If we are writing an accumulator register, we have to > prime it after we've written it. */ > if (TARGET_MMA > - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) > + && REG_P (dst) && GET_MODE (dst) == XOmode > + && FP_REGNO_P (REGNO (dst))) > emit_insn (gen_mma_xxmtacc (dst, dst)); > } > else > @@ -27682,7 +27708,7 @@ rs6000_split_multireg_move (rtx dst, rtx src) > > /* If the base register we are using to address memory is > also a destination reg, then change that register last. */ > - if (REG_P (breg) > + if (REG_P (dst) && REG_P (breg) > && REGNO (breg) >= REGNO (dst) > && REGNO (breg) < REGNO (dst) + nregs) > j = REGNO (breg) - REGNO (dst); > @@ -27780,9 +27806,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) > /* XO/OO are opaque so cannot use subregs. 
*/ > if (mode == OOmode || mode == XOmode ) > { > - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); > - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); > - emit_insn (gen_rtx_SET (dst_i, src_i)); > + if (REG_P (dst) && REG_P (src)) > + { > + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); > + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); > + emit_insn (gen_rtx_SET (dst_i, src_i)); > + } > } > else > emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode, > @@ -27800,7 +27829,9 @@ rs6000_split_multireg_move (rtx dst, rtx src) > if (restore_basereg != NULL_RTX) > emit_insn (restore_basereg); > } > + return; > } > + > > /* Return true if the peephole2 can combine a load involving a combination of > an addis instruction and a load with an offset that can be fused together on > diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md > index a5d20594789..2106e1a1fed 100644 > --- a/gcc/config/rs6000/rs6000.md > +++ b/gcc/config/rs6000/rs6000.md > @@ -159,6 +159,7 @@ > UNSPEC_XXSPLTIW_CONST > UNSPEC_FMAX > UNSPEC_FMIN > + UNSPEC_LXVP > ]) > > ;; > diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 > index b3ce09d523b..df9b3a35b66 100644 > --- a/gcc/config/rs6000/t-rs6000 > +++ b/gcc/config/rs6000/t-rs6000 > @@ -35,6 +35,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc > $(COMPILE) $< > $(POSTCOMPILE) > > +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc > + $(COMPILE) $< > + $(POSTCOMPILE) > + > + > rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc > $(COMPILE) $< > $(POSTCOMPILE) > diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc > index 31d2c21c88f..ff77a0bc8c6 100644 > --- a/gcc/pair-fusion.cc > +++ b/gcc/pair-fusion.cc > @@ -312,9 +312,9 @@ static int > encode_lfs (lfs_fields fields) > { > int size_log2 = exact_log2 (fields.size); > - gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4); > - return ((int)fields.load_p << 3) > - | ((int)fields.fpsimd_p << 2) > + 
gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9); > + return ((int)fields.load_p << 4) > + | ((int)fields.fpsimd_p << 3) > | (size_log2 - 2); > } > > @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields) > static lfs_fields > decode_lfs (int lfs) > { > - bool load_p = (lfs & (1 << 3)); > - bool fpsimd_p = (lfs & (1 << 2)); > + bool load_p = (lfs & (1 << 4)); > + bool fpsimd_p = (lfs & (1 << 3)); > unsigned size = 1U << ((lfs & 3) + 2); > return { load_p, fpsimd_p, size }; > } > @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem) > if (MEM_VOLATILE_P (mem)) > return; > > + if (load_p && !m_pass->fuseable_load_p (insn)) > + return; > + > // Ignore writeback accesses if the hook says to do so. > if (!m_pass->should_handle_writeback (writeback_type::EXISTING) > && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC) > @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, > } > > rtx reg_notes = combine_reg_notes (first, second, load_p); > - > + m_pass->set_multiword_subreg (i1, i2, load_p); > rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p); > insn_change *pair_change = nullptr; > auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) { > @@ -1833,6 +1836,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, > pair_change->new_defs = merge_access_arrays (attempt, > input_defs[0], > input_defs[1]); > + m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes); > gcc_assert (pair_change->new_defs.is_valid ()); > > pair_change->new_uses > @@ -2405,6 +2409,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size, > reg_ops[i] = XEXP (pats[i], !load_p); > } > > + if (!load_p && !m_pass->fuseable_store_p (i1, i2)) > + { > + if (dump_file) > + fprintf (dump_file, > + "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n", > + insns[0]->uid (), insns[1]->uid ()); > + return false; > + } > + > if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1])) > { 
> if (dump_file) > @@ -2997,6 +3010,8 @@ void pair_fusion::process_block (bb_info *bb) > if (GET_CODE (pat) != SET) > continue; > > + change_existing_multword_mode (rti); > + > if (track_stores && MEM_P (XEXP (pat, 0))) > bb_state.track_access (insn, false, XEXP (pat, 0)); > else if (track_loads && MEM_P (XEXP (pat, 1))) > diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h > index 45e4edceecb..756357db794 100644 > --- a/gcc/pair-fusion.h > +++ b/gcc/pair-fusion.h > @@ -26,8 +26,11 @@ namespace rtl_ssa { > class insn_info; > class insn_range_info; > class bb_info; > + class insn_change; > } > > +class obstack_watermark; > + > // Information about a potential base candidate, used in try_fuse_pair. > // There may be zero, one, or two viable RTL bases for a given pair. > struct base_cand > @@ -142,6 +145,19 @@ struct pair_fusion { > // true iff INSN is a load pair. > virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0; > > + // Given INSN change multiword mode load and store to respective > + // unspec instruction. > + virtual void change_existing_multword_mode (rtx_insn *insn) = 0; > + > + // Given INSN and watermark ATTEMPT and PAIR_CHANGE sets the > + // new rtx with INSN. Remove all uses of definition that are > + // removed given CHANGES. > + virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first, > + obstack_watermark *attempt, > + rtl_ssa::insn_change **pair_change, > + auto_vec<rtl_ssa::insn_change *> &changes) > + = 0; > + > // Return true if we should track loads. > virtual bool track_loads_p () > { > @@ -171,6 +187,24 @@ struct pair_fusion { > virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem, > rtx regs[2], bool load_p) = 0; > > + // Given insn_info pair I1 and I2, sets subreg with multiword registers > + // to assign register pairs by allocators. > + // LOAD_P is true iff the pair is a load. 
> + virtual void set_multiword_subreg (rtl_ssa::insn_info *i1, > + rtl_ssa::insn_info *i2, > + bool load_p) = 0; > + > + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform > + // store mem pairs. > + // Return true if feasible to perform store mem pairs otherwise false. > + virtual bool fuseable_store_p (rtl_ssa::insn_info *i1, > + rtl_ssa::insn_info *i2) = 0; > + > + // Given insn_info INFO, checks if the load is feasible to fuse into > + // load mem pairs. > + // Return true if feasible to perform load mem pairs otherwise false. > + virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0; > + > void process_block (rtl_ssa::bb_info *bb); > rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2], > const rtl_ssa::insn_range_info > diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h > index e2134621723..d5c5b80f8aa 100644 > --- a/gcc/rtl-ssa/functions.h > +++ b/gcc/rtl-ssa/functions.h > @@ -222,6 +222,13 @@ public: > template<typename T, typename... Ts> > T *change_alloc (obstack_watermark &wm, Ts... args); > > + auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; } > + > + template<typename T, typename... Ts> > + T *allocate (Ts... args); > + > + void remove_use (use_info *); > + > private: > class bb_phi_info; > class build_info; > @@ -231,9 +238,6 @@ private: > // allocate_temp during its lifetime. > obstack_watermark temp_watermark () { return &m_temp_obstack; } > > - template<typename T, typename... Ts> > - T *allocate (Ts... 
args); > > @@ -269,7 +273,6 @@ private: > static void insert_use_after (use_info *, use_info *); > > void add_use (use_info *); > - void remove_use (use_info *); > > insn_info::order_node *need_order_node (insn_info *); > > diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C > new file mode 100644 > index 00000000000..d10ff0cdf36 > --- /dev/null > +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target power10_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ > + > +#include <altivec.h> > + > +void > +foo2 () > +{ > + __vector_quad *dst1; > + __vector_quad *dst2; > + vector unsigned char src; > + __vector_quad acc; > + vector unsigned char *ptr; > + __builtin_mma_xvf32ger(&acc, src, ptr[0]); > + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); > + *dst1 = acc; > + __builtin_mma_xvf32ger(&acc, src, ptr[2]); > + __builtin_mma_xvf32gerpp(&acc, src, ptr[3]); > + *dst2 = acc; > +} > +/* { dg-final { scan-assembler {\mlxvp\M} } } */ > diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C b/gcc/testsuite/g++.target/powerpc/mem-fusion.C > new file mode 100644 > index 00000000000..c523572cf3c > --- /dev/null > +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target power10_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ > + > +#include <altivec.h> > + > +void > +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src) > +{ > + __vector_quad acc; > + __builtin_mma_xvf32ger(&acc, src, ptr[0]); > + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); > + *dst = acc; > +} > +/* { dg-final { scan-assembler {\mlxvp\M} } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c > index 69ee826e1be..ae29127f954 100644 > --- 
a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c > +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c > @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec) > dst[13] = acc; > } > > -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */ > -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */ > +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */ > +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */ > /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */ > /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */ > /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */
Hello Richard: On 18/07/24 1:17 am, Richard Sandiford wrote: > Ajit Agarwal <aagarwa1@linux.ibm.com> writes: >> Hello All: >> >> This version of the patch relaxes store fusion for more use cases. >> >> Common infrastructure using generic code for pair mem fusion of different >> targets. >> >> rs6000 target specific code implements virtual functions defined by generic code. >> >> Target specific code is added in rs6000-mem-fusion.cc. >> >> Bootstrapped and regtested on powerpc64-linux-gnu. >> >> Thanks & Regards >> Ajit >> >> >> rs6000, middle-end: Add implementation for different targets for pair mem fusion >> >> Common infrastructure using generic code for pair mem fusion of different >> targets. >> >> rs6000 target specific code implements virtual functions defined by generic code. >> >> Target specific code is added in rs6000-mem-fusion.cc. >> >> 2024-07-02 Ajit Kumar Agarwal <aagarwa1@linux.ibm.com> >> >> gcc/ChangeLog: >> >> * config/rs6000/rs6000-passes.def: New mem fusion pass >> before pass_early_remat. >> * pair-fusion.h: Add additional pure virtual function >> required for rs6000 target implementation. >> * pair-fusion.cc: Use of virtual functions for additional >> virtual function added for rs6000 target. >> * config/rs6000/rs6000-mem-fusion.cc: Add new pass. >> Add target specific implementation for generic pure virtual >> functions. >> * config/rs6000/mma.md: Modify movoo machine description. >> Add new machine description movoo1. >> * config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move >> to expand movoo machine description for all constraints. >> * config.gcc: Add new object file. >> * config/rs6000/rs6000-protos.h: Add new prototype for mem >> fusion pass. >> * config/rs6000/t-rs6000: Add new rule. >> * rtl-ssa/functions.h: Move out allocate function from private >> to public and add get_m_temp_defs function. >> >> gcc/testsuite/ChangeLog: >> >> * g++.target/powerpc/mem-fusion.C: New test. >> * g++.target/powerpc/mem-fusion-1.C: New test. 
>> * gcc.target/powerpc/mma-builtin-1.c: Modify test. >> --- >> gcc/config.gcc | 2 + >> gcc/config/rs6000/mma.md | 26 +- >> gcc/config/rs6000/rs6000-mem-fusion.cc | 708 ++++++++++++++++++ >> gcc/config/rs6000/rs6000-passes.def | 4 +- >> gcc/config/rs6000/rs6000-protos.h | 1 + >> gcc/config/rs6000/rs6000.cc | 57 +- >> gcc/config/rs6000/rs6000.md | 1 + >> gcc/config/rs6000/t-rs6000 | 5 + >> gcc/pair-fusion.cc | 27 +- >> gcc/pair-fusion.h | 34 + >> gcc/rtl-ssa/functions.h | 11 +- >> .../g++.target/powerpc/mem-fusion-1.C | 22 + >> gcc/testsuite/g++.target/powerpc/mem-fusion.C | 15 + >> .../gcc.target/powerpc/mma-builtin-1.c | 4 +- >> 14 files changed, 890 insertions(+), 27 deletions(-) >> create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc >> create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C >> create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C >> >> diff --git a/gcc/config.gcc b/gcc/config.gcc >> index bc45615741b..12f79a78177 100644 >> --- a/gcc/config.gcc >> +++ b/gcc/config.gcc >> @@ -524,6 +524,7 @@ powerpc*-*-*) >> extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" >> extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" >> extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o" >> + extra_objs="${extra_objs} rs6000-mem-fusion.o" >> extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" >> extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h" >> extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h" >> @@ -560,6 +561,7 @@ rs6000*-*-*) >> extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt" >> extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" >> extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" >> + extra_objs="${extra_objs} rs6000-mem-fusion.o" >> target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc" >> target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc" >> ;; >> diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md >> index 04e2d0066df..88413926a02 100644 >> --- a/gcc/config/rs6000/mma.md >> +++ b/gcc/config/rs6000/mma.md >> @@ -294,7 +294,31 @@ >> >> (define_insn_and_split "*movoo" >> [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa") >> - (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))] >> + (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))] >> + "TARGET_MMA >> + && (gpc_reg_operand (operands[0], OOmode) >> + || gpc_reg_operand (operands[1], OOmode))" >> +;; "" >> + "@ >> + # >> + # >> + #" >> + "&& reload_completed" >> + [(const_int 0)] >> +{ >> + rs6000_split_multireg_move (operands[0], operands[1]); >> + DONE; >> +} >> + [(set_attr "type" "vecload,vecstore,veclogical") >> + (set_attr "length" "*,*,8")]) >> +;; (set_attr "max_prefixed_insns" "2,2,*")]) >> + >> + >> +(define_insn_and_split "*movoo1" >> + [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa") >> + (unspec [ >> + (match_operand:OO 1 "input_operand" "ZwO,wa,wa") >> + ] UNSPEC_LXVP))] >> "TARGET_MMA >> && (gpc_reg_operand (operands[0], OOmode) >> || gpc_reg_operand (operands[1], OOmode))" >> diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc b/gcc/config/rs6000/rs6000-mem-fusion.cc >> new file mode 100644 >> index 00000000000..b63b6f31001 >> --- /dev/null >> +++ b/gcc/config/rs6000/rs6000-mem-fusion.cc >> @@ -0,0 +1,708 @@ >> +/* Subroutines used to perform adjacent load/store into >> + paired memory accesses for TARGET_POWER10 and TARGET_VSX. >> + >> + Copyright (C) 2024 Free Software Foundation, Inc. >> + >> + This file is part of GCC. >> + >> + GCC is free software; you can redistribute it and/or modify it >> + under the terms of the GNU General Public License as published >> + by the Free Software Foundation; either version 3, or (at your >> + option) any later version. 
>> + >> + GCC is distributed in the hope that it will be useful, but WITHOUT >> + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY >> + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public >> + License for more details. >> + >> + You should have received a copy of the GNU General Public License >> + along with GCC; see the file COPYING3. If not see >> + <http://www.gnu.org/licenses/>. */ >> + >> +#define INCLUDE_ALGORITHM >> +#define INCLUDE_FUNCTIONAL >> +#define INCLUDE_LIST >> +#define INCLUDE_TYPE_TRAITS >> +#include "config.h" >> +#include "system.h" >> +#include "coretypes.h" >> +#include "backend.h" >> +#include "rtl.h" >> +#include "df.h" >> +#include "rtl-iter.h" >> +#include "rtl-ssa.h" >> +#include "rtl-ssa/internals.h" >> +#include "rtl-ssa/internals.inl" >> +#include "cfgcleanup.h" >> +#include "tree-pass.h" >> +#include "pair-fusion.h" >> + >> +using namespace rtl_ssa; >> + >> +struct rs6000_pair_fusion : public pair_fusion >> +{ >> + bool fpsimd_op_p (rtx , machine_mode , bool) override final >> + { >> + return false; >> + } >> + >> + bool pair_mem_insn_p (rtx_insn *, bool &) override final >> + { >> + return false; >> + } >> + >> + void change_existing_multword_mode (rtx_insn *insn) override final; >> + >> + bool pair_mem_ok_with_policy (rtx, bool) override final >> + { >> + return true; >> + } >> + >> + bool pair_operand_mode_ok_p (machine_mode mode) override final; >> + >> + rtx gen_pair (rtx *pats, rtx, bool load_p) override final; >> + >> + bool pair_reg_operand_ok_p (bool, rtx, machine_mode) override final >> + { >> + return true; >> + } >> + >> + int pair_mem_alias_check_limit () override final >> + { >> + return 0; >> + } >> + >> + bool should_handle_writeback (enum writeback_type) override final >> + { >> + return false; >> + } >> + >> + bool track_loads_p () override final >> + { >> + return true; >> + } >> + >> + bool track_stores_p () override final >> + { >> + return true; >> + } >> + >> + bool 
pair_mem_in_range_p (HOST_WIDE_INT) override final >> + { >> + return true; >> + } >> + >> + rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) override final >> + { >> + return NULL_RTX; >> + } >> + >> + rtx destructure_pair (rtx_def **, rtx, bool) override final >> + { >> + return NULL_RTX; >> + } >> + >> + bool fuseable_store_p (insn_info *i1, insn_info *i2) override final; >> + >> + bool fuseable_load_p (insn_info *insn) override final; >> + >> + void set_multiword_subreg (insn_info *i1, insn_info *i2, >> + bool load_p) override final; >> + >> + void modify_new_rtx_insn (insn_info *first, obstack_watermark *attempt, >> + insn_change **pair_change, >> + auto_vec <insn_change *> &changes) override final; >> +}; >> + >> +bool >> +rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode) >> +{ >> + return (ALTIVEC_OR_VSX_VECTOR_MODE (mode)); >> +} >> + >> +void >> +rs6000_pair_fusion::change_existing_multword_mode (rtx_insn *insn) >> +{ >> + rtx set = single_set (insn); >> + rtx src = SET_SRC (set); >> + rtx dest = SET_DEST (set); >> + rtx copy = NULL_RTX; >> + >> + if ((MEM_P (src) && GET_MODE (src) == OOmode) >> + || (MEM_P (dest) && GET_MODE (dest) == OOmode)) >> + { >> + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), >> + gen_rtvec (1, src), >> + UNSPEC_LXVP); >> + copy = gen_rtx_SET (dest, unspec); >> + rtx_insn *new_insn = emit_insn_after (copy, insn); >> + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn)); >> + df_insn_rescan (new_insn); >> + df_insn_delete (insn); >> + remove_insn (insn); >> + insn->set_deleted (); >> + } >> +} >> + >> +static void >> +update_change (set_info *set) >> +{ >> + if (!set->has_any_uses ()) >> + return; >> + >> + auto *use = *set->all_uses ().begin (); >> + do >> + { >> + auto *next_use = use->next_use (); >> + if (use->is_in_phi ()) >> + { >> + update_change (use->phi ()); >> + } >> + else >> + { >> + crtl->ssa->remove_use (use); >> + } >> + use = next_use; >> + } >> + while (use); >> +} >> + >> +void >> 
+rs6000_pair_fusion::modify_new_rtx_insn (insn_info *first, >> + obstack_watermark *attempt, >> + insn_change **pair_change, >> + auto_vec<insn_change *> &changes) >> +{ >> + for (insn_change *change : changes) >> + for (auto def : change->old_defs ()) >> + { >> + auto set = dyn_cast<set_info *> (def); >> + update_change (set); >> + } >> + >> + auto &new_defs = (*pair_change)->new_defs; >> + vec_rtx_properties properties; >> + properties.add_insn (first->rtl (), true); >> + // Build up the new list of definitions. >> + for (rtx_obj_reference ref : properties.refs ()) >> + if (ref.is_write ()) >> + { >> + auto *set = crtl->ssa->allocate<set_info> (first, >> + full_register (ref.regno)); >> + if (set) >> + { >> + auto def = find_access (new_defs, ref.regno); >> + if (!def) >> + { >> + new_defs = insert_access (*attempt, set, >> + new_defs); >> + auto &m_temp_defs = crtl->ssa->get_m_temp_defs (); >> + m_temp_defs.safe_push (set); >> + } >> + } >> + } >> +} >> + >> +// df_insn_rescan dependent instruction where operands >> +// are reversed given insn_info INFO. >> +static void >> +set_rescan_load (insn_info *i1) >> +{ >> + for (auto def : i1->defs ()) >> + { >> + auto set = dyn_cast<set_info *> (def); >> + for (auto use : set->all_uses ()) >> + { >> + insn_info *info = use->insn (); >> + if (info && info->rtl ()) >> + { >> + rtx_insn *rtl_insn = info->rtl (); >> + df_insn_rescan (rtl_insn); >> + } >> + } >> + } >> +} >> + >> +// df_insn_rescan the def instruction where operands are reversed given INSN. 
>> +static bool >> +set_rescan_store (insn_info *insn) >> +{ >> + for (auto use : insn->uses()) >> + { >> + auto def = use->def (); >> + >> + if (!def) >> + return false; >> + >> + if (def->insn ()->is_artificial ()) >> + return false; >> + >> + if (def->insn () && def->insn ()->rtl () >> + && def->insn()->is_real ()) >> + { >> + rtx_insn *rtl_insn = def->insn ()->rtl (); >> + rtx set = single_set (rtl_insn); >> + >> + if (set == NULL_RTX) >> + return false; >> + df_insn_rescan (rtl_insn); >> + } >> + } >> + return true; >> +} >> + >> +// Check for feasibility of store to be fuseable or not. Return true if >> +// feasible otherwise false. >> +static bool >> +feasible_store_p (insn_info *insn) >> +{ >> + for (auto use : insn->uses ()) >> + { >> + auto def = use->def (); >> + >> + if (def->insn ()->is_artificial ()) >> + return false; >> + >> + if (def->insn () && def->insn ()->rtl () >> + && def->insn()->is_real ()) >> + { >> + rtx_insn *rtl_insn = def->insn ()->rtl (); >> + rtx set = single_set (rtl_insn); >> + >> + if (set == NULL_RTX) >> + return false; >> + >> + // Return false if dependent def is load. >> + // This is done as def instruction could be a fused load and >> + // to avoid already existing subreg (reg:OO R) offset. >> + if (rtl_insn && MEM_P (SET_SRC (set))) >> + return false; >> + >> + // Return false if dependent def is store. >> + if (rtl_insn && MEM_P (SET_DEST (set))) >> + return false; > > I don't understand these tests. It might help to turn it around and > say: what sort of cases do you want to handle? > >> + } >> + } >> + return true; >> +} >> + >> +// Check if store can be fuseable or not. Return true if fuseable otherwise >> +// false. 
>> +bool >> +rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2) >> +{ >> + rtx_insn *insn1 = i1->rtl (); >> + rtx_insn *insn2 = i2->rtl (); >> + rtx body = PATTERN (insn1); >> + rtx src_exp = SET_SRC (body); >> + rtx insn2_body = PATTERN (insn2); >> + rtx insn2_src_exp = SET_SRC (insn2_body); >> + >> + if (!(REG_P (src_exp) >> + && crtl->ssa->single_dominating_def (REGNO (src_exp)))) >> + return false; >> + >> + // This is done as def instruction could be a fused load and >> + // to avoid already existing subreg (reg:OO R) offset. >> + if (DF_REG_USE_COUNT (REGNO (src_exp)) > 1) >> + return false; >> + >> + // Return false if src of insn1 and src of insn2 are same. >> + if (src_exp == insn2_src_exp) >> + return false; >> + >> + // Return false if src of insn1 is subreg. >> + if (GET_CODE (src_exp) == SUBREG) >> + return false; > > This can't be true after the REG_P check above. > I will make this change. >> + >> + // Return false if src of insn2 is subreg. >> + if (GET_CODE (insn2_src_exp) == SUBREG) >> + return false; > > Shouldn't the tests for i1 and i2 be symmetrical, with i2 also > requiring a single dominating definition? > I will make this change. >> + >> + if (!feasible_store_p (i1)) >> + return false;; >> + >> + if (!feasible_store_p (i2)) >> + return false; >> + >> + return true; >> +} >> + >> +// Set subreg for def of store INSN given rtx SRC instruction. 
>> +static void >> +set_store_subreg (insn_info *i1, rtx src, int regoff) >> +{ >> + for (auto use: i1->uses ()) >> + { >> + auto def = use->def (); >> + if (!def) >> + return; >> + >> + insn_info *info = def->insn (); >> + >> + if (info->is_artificial ()) >> + return; >> + >> + if (info && info->is_real ()) >> + { >> + rtx_insn *rtl_insn = info->rtl (); >> + rtx set = single_set (rtl_insn); >> + if (set == NULL_RTX) >> + return; >> + df_ref ref; >> + FOR_EACH_INSN_DEF (ref, rtl_insn) >> + { >> + rtx src_exp = SET_SRC (PATTERN (i1->rtl ())); >> + if (REG_P (src_exp) && DF_REF_REGNO (ref) == REGNO (src_exp)) >> + { >> + rtx *loc = DF_REF_LOC (ref); >> + if (GET_CODE (*loc) == SUBREG) >> + { >> + rtx src1 = simplify_gen_subreg (GET_MODE (*loc), >> + SUBREG_REG (src), >> + OOmode, >> + regoff); >> + *loc = copy_rtx (src1); >> + } >> + else >> + *loc = copy_rtx (src); >> + } >> + } >> + } >> + } >> +} >> + >> +// Check whether load can be fusable or not. >> +// Return true if fuseable otherwise false. >> +bool >> +rs6000_pair_fusion::fuseable_load_p (insn_info *i1) >> +{ >> + rtx_insn *insn = i1->rtl (); >> + rtx body = PATTERN (insn); >> + rtx dest_exp = SET_DEST (body); >> + >> + if (!(REG_P (dest_exp) >> + && crtl->ssa->single_dominating_def (REGNO (dest_exp)))) >> + return false; >> + return true; >> +} >> + >> +// Propagate insn I1 with new rtx NEW_DEST_EXP. >> +static void >> +propagate_insn (insn_info *i1, rtx new_dest_exp) >> +{ >> + df_ref ref; >> + FOR_EACH_INSN_DEF (ref, i1->rtl()) >> + { >> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); >> + if (REG_P (dest_exp) >> + && DF_REF_REGNO (ref) == REGNO (dest_exp)) >> + { >> + rtx *loc = DF_REF_LOC (ref); >> + *loc = new_dest_exp; >> + } >> + } >> +} >> + >> +// Generate new reg rtx with copy of OLD_DEST for OOmode pair. 
>> +static rtx >> +new_reg_rtx (rtx old_dest) >> +{ >> + rtx new_dest_exp = gen_reg_rtx (OOmode); >> + ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest); >> + REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest); >> + REG_POINTER (new_dest_exp) = REG_POINTER (old_dest); >> + REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest); >> + max_regno = max_reg_num (); >> + return new_dest_exp; >> +} >> + >> +// Set subreg with use of INSN given SRC rtx instruction. >> +static void >> +set_load_subreg (insn_info *i1, rtx src) >> +{ >> + rtx set = single_set (i1->rtl()); >> + rtx old_dest = SET_DEST (set); >> + >> + for (auto def : i1->defs ()) >> + { >> + auto set = dyn_cast<set_info *> (def); >> + for (auto use : set->nondebug_insn_uses ()) >> + { >> + insn_info *info = use->insn (); >> + if (!info || !info->rtl ()) >> + continue; >> + >> + rtx_insn *rtl_insn = info->rtl (); >> + df_ref ref; >> + >> + FOR_EACH_INSN_USE (ref, rtl_insn) >> + { >> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); >> + if (REG_P (dest_exp) >> + && DF_REF_REGNO (ref) == REGNO (dest_exp)) >> + { >> + rtx *loc = DF_REF_LOC (ref); >> + insn_propagation prop (rtl_insn, old_dest, src); >> + if (GET_CODE (*loc) == SUBREG) >> + { >> + if (!prop.apply_to_pattern (loc)) >> + { >> + if (dump_file != NULL) >> + { >> + fprintf (dump_file, >> + "Cannot propagate insn \n"); >> + print_rtl_single (dump_file, rtl_insn); >> + } >> + return; >> + } >> + } >> + else >> + *loc = copy_rtx (src); >> + } >> + } >> + } >> + } >> +} >> + >> +// Set subreg for OO mode store pair to generate registers in pairs >> +// given insn_info I1 and I2. 
>> +static void >> +set_multiword_subreg_store (insn_info *i1, insn_info *i2) >> +{ >> + rtx_insn *insn1 = i1->rtl (); >> + rtx_insn *insn2 = i2->rtl (); >> + rtx body = PATTERN (insn1); >> + rtx src_exp = SET_SRC (body); >> + rtx insn2_body = PATTERN (insn2); >> + rtx insn2_dest_exp = SET_DEST (insn2_body); >> + machine_mode mode = GET_MODE (src_exp); >> + int regoff; >> + rtx src; >> + rtx addr = XEXP (insn2_dest_exp, 0); >> + >> + PUT_MODE_RAW (src_exp, OOmode); >> + if (GET_CODE (addr) == PLUS >> + && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1))) >> + regoff = 16; >> + else >> + regoff = 0; >> + >> + src = simplify_gen_subreg (mode, >> + src_exp, GET_MODE (src_exp), >> + regoff); >> + >> + set_store_subreg (i1, src, regoff); >> + >> + int regoff1 = 0; >> + rtx src1; >> + >> + src1 = simplify_gen_subreg (mode, >> + src_exp, GET_MODE (src_exp), >> + regoff1); >> + >> + set_store_subreg (i2, src1, regoff1); >> + set_rescan_store (i1); >> + set_rescan_store (i2); >> + df_insn_rescan (insn1); >> +} >> + >> +// Set subreg for OO mode pair load to generate registers in pairs given >> +// insn_info I1 and I2. >> +static void >> +set_multiword_subreg_load (insn_info *i1, insn_info *i2) >> +{ >> + rtx_insn *insn1 = i1->rtl (); >> + rtx body = PATTERN (insn1); >> + rtx dest_exp = SET_DEST (body); >> + machine_mode mode = GET_MODE (dest_exp); >> + PUT_MODE_RAW (dest_exp, OOmode); >> + >> + int regoff = 0; >> + rtx src; >> + >> + src = simplify_gen_subreg (mode, >> + dest_exp, GET_MODE (dest_exp), >> + regoff); >> + >> + set_load_subreg (i2, src); >> + >> + int regoff1; >> + rtx src1; >> + >> + regoff1 = 16; >> + src1 = simplify_gen_subreg (mode, >> + dest_exp, GET_MODE (dest_exp), >> + regoff1); >> + set_load_subreg (i1, src1); >> + >> + set_rescan_load (i1); >> + set_rescan_load (i2); >> + df_insn_rescan (insn1); >> +} >> + >> +// Set subreg for OO mode pair load for existing subreg rtx to generate >> +// registers in pairs given insn_info I1 and I2. 
>> +static void >> +set_multiword_existing_subreg (insn_info *i1, insn_info *i2) >> +{ >> + rtx_insn *insn1 = i1->rtl (); >> + rtx body = PATTERN (insn1); >> + rtx dest_exp = SET_DEST (body); >> + machine_mode mode = GET_MODE (dest_exp); >> + int regoff1; >> + regoff1 = 16; >> + rtx new_dest_exp = new_reg_rtx (dest_exp); >> + >> + rtx src = simplify_gen_subreg (mode, >> + new_dest_exp, >> + OOmode, >> + regoff1); >> + >> + set_load_subreg (i1, src); >> + propagate_insn (i1, new_dest_exp); >> + >> + int regoff = 0; >> + rtx sset = single_set (i2->rtl ()); >> + rtx insn2_dest_exp = SET_DEST (sset); >> + machine_mode insn2_mode = GET_MODE (insn2_dest_exp); >> + >> + src = simplify_gen_subreg (insn2_mode, >> + new_dest_exp, >> + OOmode, >> + regoff); >> + >> + set_load_subreg (i2, src); >> + propagate_insn (i2, new_dest_exp); >> + >> + auto attempt = crtl->ssa->new_change_attempt (); >> + resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) }; >> + auto *set = crtl->ssa->allocate<set_info> (i1, resource); >> + if (set) >> + { >> + auto def = find_access (i1->defs (), REGNO (new_dest_exp)); >> + if (!def) >> + i1->defs() = insert_access (attempt, set, i1->defs()); >> + } >> + >> + set_rescan_load (i1); >> + set_rescan_load (i2); >> + df_insn_rescan (insn1); >> +} >> + >> +// Return true iff insn I1 has already existing subreg. 
>> +static bool >> +use_has_subreg_p (insn_info *i1) >> +{ >> + for (auto def : i1->defs ()) >> + { >> + auto set = dyn_cast<set_info *> (def); >> + for (auto use : set->nondebug_insn_uses ()) >> + { >> + insn_info *info = use->insn (); >> + if (info && info->rtl ()) >> + { >> + rtx_insn *rtl_insn = info->rtl (); >> + df_ref ref; >> + FOR_EACH_INSN_USE (ref, rtl_insn) >> + { >> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); >> + if (REG_P (dest_exp) >> + && DF_REF_REGNO (ref) == REGNO (dest_exp)) >> + { >> + rtx *loc = DF_REF_LOC (ref); >> + if (GET_CODE (*loc) == SUBREG) >> + return true; >> + } >> + } >> + } >> + } >> + } >> + return false; >> +} >> + >> +// Set subreg for OO mode pair to generate sequential registers given >> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false >> +// if store insn. >> +void >> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2, >> + bool load_p) >> +{ >> + if (load_p) >> + { >> + bool i1_subreg_p = use_has_subreg_p (i1); >> + bool i2_subreg_p = use_has_subreg_p (i2); >> + >> + if (i1_subreg_p || i2_subreg_p) >> + set_multiword_existing_subreg (i1, i2); >> + else >> + set_multiword_subreg_load (i1, i2); > > I don't understand this. Why do we have both set_multiword_existing_subreg > and set_multiword_subreg_load? i1_subreg_p and i2_subreg_p are logically > independent of one another (since i1 and i2 were separate instructions > until now). So "i1_subreg_p || i2_subreg_p" implies that > set_multiword_existing_subreg can handle i1s that have no existing > subreg (used when i2_subreg_p) and that it can handle i2s that have no > existing subreg (used when i1_subreg_p). So doesn't this mean that > set_multiword_existing_subreg can handle everything? > I will make the following change. 
if (load_p) { bool i1_subreg_p = use_has_subreg_p (i1); bool i2_subreg_p = use_has_subreg_p (i2); if (!i1_subreg_p && !i2_subreg_p) set_multiword_subreg_load (i1, i2); else set_multiword_existing_subreg (i1, i2); } Is this okay? > IMO, the way the update should work is that: > > (a) all references to the old registers should be updated via > insn_propagation (regardless of whether the old references > involved subregs). > > (b) those updates should be part of the same insn_change group as > the change to the load itself. > > For stores, definitions of the stored register can probably be handled > directly using df_refs, but there too, the updates should IMO be part > of the same insn_change group as the change to the store itself. > > In both cases, it's the: > > crtl->ssa->change_insns (changes); > > in pair_fusion_bb_info::fuse_pair that should be responsible for > updating the rtl-ssa IR. The changes that the pass wants to make > should be described as insn_changes and passed to change_insns. > > The reason for funneling all changes through change_insns is that > it allows rtl-ssa to maintain more complex datastructures. Clients > aren't supposed to manually update the datastructures piecemeal. > I am afraid I am not following this. Would you mind elaborating on it? Sorry about that. 
> Thanks, > Richard > Thanks & Regards Ajit >> + } >> + else >> + set_multiword_subreg_store (i1, i2); >> +} >> + >> +rtx >> +rs6000_pair_fusion::gen_pair (rtx *pats, rtx, bool load_p) >> +{ >> + rtx i1 = pats[0]; >> + rtx src_exp = SET_SRC (i1); >> + rtx dest_exp = SET_DEST (i1); >> + PUT_MODE_RAW (src_exp, OOmode); >> + PUT_MODE_RAW (dest_exp, OOmode); >> + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest_exp), >> + gen_rtvec (1, src_exp), >> + UNSPEC_LXVP); >> + rtx set = gen_rtx_SET (dest_exp, unspec); >> + if (dump_file) >> + { >> + if (load_p) >> + fprintf (dump_file, "lxv with lxvp "); >> + else >> + fprintf (dump_file, "stxv with stxvp "); >> + print_rtl_single (dump_file, set); >> + } >> + return set; >> +} >> + >> +const pass_data pass_data_mem_fusion = >> +{ >> + RTL_PASS, /* type */ >> + "mem_fusion", /* name */ >> + OPTGROUP_NONE, /* optinfo_flags */ >> + TV_NONE, /* tv_id */ >> + 0, /* properties_required */ >> + 0, /* properties_provided */ >> + 0, /* properties_destroyed */ >> + 0, /* todo_flags_start */ >> + TODO_df_finish, /* todo_flags_finish */ >> +}; >> + >> +class pass_mem_fusion : public rtl_opt_pass >> +{ >> +public: >> + pass_mem_fusion (gcc::context *ctxt) >> + : rtl_opt_pass (pass_data_mem_fusion, ctxt) >> + {} >> + >> + opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);} >> + >> + /* opt_pass methods: */ >> + bool gate (function *) >> + { >> + return (optimize > 0 && TARGET_VSX && TARGET_POWER10); >> + } >> + >> + unsigned int execute (function *) final override >> + { >> + rs6000_pair_fusion pass; >> + pass.run (); >> + return 0; >> + } >> +}; // class pass_mem_fusion >> + >> +rtl_opt_pass * >> +make_pass_mem_fusion (gcc::context *ctxt) >> +{ >> + return new pass_mem_fusion (ctxt); >> +} >> diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def >> index 46a0d0b8c56..0b48f57014d 100644 >> --- a/gcc/config/rs6000/rs6000-passes.def >> +++ b/gcc/config/rs6000/rs6000-passes.def >> @@ -28,7 +28,9 @@ 
along with GCC; see the file COPYING3. If not see >> The power8 does not have instructions that automaticaly do the byte swaps >> for loads and stores. */ >> INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps); >> - >> + /* Pass to replace adjacent memory addresses lxv/stxv instruction with >> + lxvp/stxvp instruction. */ >> + INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion); >> /* Pass to do the PCREL_OPT optimization that combines the load of an >> external symbol's address along with a single load or store using that >> address as a base register. */ >> diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h >> index 09a57a806fa..1412b31c2eb 100644 >> --- a/gcc/config/rs6000/rs6000-protos.h >> +++ b/gcc/config/rs6000/rs6000-protos.h >> @@ -343,6 +343,7 @@ namespace gcc { class context; } >> class rtl_opt_pass; >> >> extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *); >> +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *); >> extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *); >> extern bool rs6000_sum_of_two_registers_p (const_rtx expr); >> extern bool rs6000_quadword_masked_address_p (const_rtx exp); >> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >> index 58553ff66f4..6da4e70973d 100644 >> --- a/gcc/config/rs6000/rs6000.cc >> +++ b/gcc/config/rs6000/rs6000.cc >> @@ -27428,7 +27428,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> reg_mode = word_mode; >> reg_mode_size = GET_MODE_SIZE (reg_mode); >> >> - gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode)); >> + gcc_assert (mode == OOmode >> + || reg_mode_size * nregs == GET_MODE_SIZE (mode)); >> >> /* TDmode residing in FP registers is special, since the ISA requires that >> the lower-numbered word of a register pair is always the most significant >> @@ -27475,6 +27476,11 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> int reg_mode_nregs = hard_regno_nregs (reg, reg_mode); >> if (MEM_P (dst)) >> { >> + rtx 
addr = XEXP (dst, 0); >> + rtx opnd1 = NULL_RTX; >> + if (addr && GET_CODE (addr) == PLUS) >> + opnd1 = XEXP (addr,1); >> + >> unsigned offset = 0; >> unsigned size = GET_MODE_SIZE (reg_mode); >> >> @@ -27488,7 +27494,13 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> { >> unsigned subreg >> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); >> - rtx dst2 = adjust_address (dst, reg_mode, offset); >> + rtx dst2 = dst; >> + >> + if ((GET_CODE (addr) != PLUS >> + || (opnd1 && CONST_INT_P(opnd1)))) >> + dst2 = adjust_address (dst, reg_mode, offset); >> + else >> + PUT_MODE_RAW (dst, reg_mode); >> rtx src2 = gen_rtx_REG (reg_mode, reg + subreg); >> offset += size; >> emit_insn (gen_rtx_SET (dst2, src2)); >> @@ -27499,15 +27511,25 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> >> if (MEM_P (src)) >> { >> + rtx addr = XEXP (src, 0); >> + rtx opnd1 = NULL_RTX; >> + if (addr && GET_CODE (addr) == PLUS) >> + opnd1 = XEXP (addr,1); >> + >> unsigned offset = 0; >> unsigned size = GET_MODE_SIZE (reg_mode); >> >> - for (int i = 0; i < nregs; i += reg_mode_nregs) >> + for (int i = nregs-1; i >= 0; i -= reg_mode_nregs) >> { >> unsigned subreg >> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); >> rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg); >> - rtx src2 = adjust_address (src, reg_mode, offset); >> + rtx src2 = src; >> + >> + if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1)))) >> + src2 = adjust_address (src, reg_mode, offset); >> + else >> + PUT_MODE_RAW (src2, reg_mode); >> offset += size; >> emit_insn (gen_rtx_SET (dst2, src2)); >> } >> @@ -27515,7 +27537,7 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> /* If we are writing an accumulator register, we have to >> prime it after we've written it. 
*/ >> if (TARGET_MMA >> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) >> + && REG_P (dst) && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) >> emit_insn (gen_mma_xxmtacc (dst, dst)); >> >> return; >> @@ -27608,9 +27630,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> { >> for (i = nregs - 1; i >= 0; i--) >> { >> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); >> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); >> - emit_insn (gen_rtx_SET (dst_i, src_i)); >> + if (REG_P (dst) && REG_P (src)) >> + { >> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); >> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); >> + emit_insn (gen_rtx_SET (dst_i, src_i)); >> + } >> } >> } >> else >> @@ -27625,7 +27650,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> /* If we are writing an accumulator register, we have to >> prime it after we've written it. */ >> if (TARGET_MMA >> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) >> + && REG_P (dst) && GET_MODE (dst) == XOmode >> + && FP_REGNO_P (REGNO (dst))) >> emit_insn (gen_mma_xxmtacc (dst, dst)); >> } >> else >> @@ -27682,7 +27708,7 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> >> /* If the base register we are using to address memory is >> also a destination reg, then change that register last. */ >> - if (REG_P (breg) >> + if (REG_P (dst) && REG_P (breg) >> && REGNO (breg) >= REGNO (dst) >> && REGNO (breg) < REGNO (dst) + nregs) >> j = REGNO (breg) - REGNO (dst); >> @@ -27780,9 +27806,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> /* XO/OO are opaque so cannot use subregs. 
*/ >> if (mode == OOmode || mode == XOmode ) >> { >> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); >> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); >> - emit_insn (gen_rtx_SET (dst_i, src_i)); >> + if (REG_P (dst) && REG_P (src)) >> + { >> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); >> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); >> + emit_insn (gen_rtx_SET (dst_i, src_i)); >> + } >> } >> else >> emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode, >> @@ -27800,7 +27829,9 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> if (restore_basereg != NULL_RTX) >> emit_insn (restore_basereg); >> } >> + return; >> } >> + >> >> /* Return true if the peephole2 can combine a load involving a combination of >> an addis instruction and a load with an offset that can be fused together on >> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md >> index a5d20594789..2106e1a1fed 100644 >> --- a/gcc/config/rs6000/rs6000.md >> +++ b/gcc/config/rs6000/rs6000.md >> @@ -159,6 +159,7 @@ >> UNSPEC_XXSPLTIW_CONST >> UNSPEC_FMAX >> UNSPEC_FMIN >> + UNSPEC_LXVP >> ]) >> >> ;; >> diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 >> index b3ce09d523b..df9b3a35b66 100644 >> --- a/gcc/config/rs6000/t-rs6000 >> +++ b/gcc/config/rs6000/t-rs6000 >> @@ -35,6 +35,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc >> $(COMPILE) $< >> $(POSTCOMPILE) >> >> +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc >> + $(COMPILE) $< >> + $(POSTCOMPILE) >> + >> + >> rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc >> $(COMPILE) $< >> $(POSTCOMPILE) >> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc >> index 31d2c21c88f..ff77a0bc8c6 100644 >> --- a/gcc/pair-fusion.cc >> +++ b/gcc/pair-fusion.cc >> @@ -312,9 +312,9 @@ static int >> encode_lfs (lfs_fields fields) >> { >> int size_log2 = exact_log2 (fields.size); >> - gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4); >> - return 
((int)fields.load_p << 3) >> - | ((int)fields.fpsimd_p << 2) >> + gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9); >> + return ((int)fields.load_p << 4) >> + | ((int)fields.fpsimd_p << 3) >> | (size_log2 - 2); >> } >> >> @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields) >> static lfs_fields >> decode_lfs (int lfs) >> { >> - bool load_p = (lfs & (1 << 3)); >> - bool fpsimd_p = (lfs & (1 << 2)); >> + bool load_p = (lfs & (1 << 4)); >> + bool fpsimd_p = (lfs & (1 << 3)); >> unsigned size = 1U << ((lfs & 3) + 2); >> return { load_p, fpsimd_p, size }; >> } >> @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem) >> if (MEM_VOLATILE_P (mem)) >> return; >> >> + if (load_p && !m_pass->fuseable_load_p (insn)) >> + return; >> + >> // Ignore writeback accesses if the hook says to do so. >> if (!m_pass->should_handle_writeback (writeback_type::EXISTING) >> && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC) >> @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, >> } >> >> rtx reg_notes = combine_reg_notes (first, second, load_p); >> - >> + m_pass->set_multiword_subreg (i1, i2, load_p); >> rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p); >> insn_change *pair_change = nullptr; >> auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) { >> @@ -1833,6 +1836,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, >> pair_change->new_defs = merge_access_arrays (attempt, >> input_defs[0], >> input_defs[1]); >> + m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes); >> gcc_assert (pair_change->new_defs.is_valid ()); >> >> pair_change->new_uses >> @@ -2405,6 +2409,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size, >> reg_ops[i] = XEXP (pats[i], !load_p); >> } >> >> + if (!load_p && !m_pass->fuseable_store_p (i1, i2)) >> + { >> + if (dump_file) >> + fprintf (dump_file, >> + "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n", >> + insns[0]->uid 
(), insns[1]->uid ()); >> + return false; >> + } >> + >> if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1])) >> { >> if (dump_file) >> @@ -2997,6 +3010,8 @@ void pair_fusion::process_block (bb_info *bb) >> if (GET_CODE (pat) != SET) >> continue; >> >> + change_existing_multword_mode (rti); >> + >> if (track_stores && MEM_P (XEXP (pat, 0))) >> bb_state.track_access (insn, false, XEXP (pat, 0)); >> else if (track_loads && MEM_P (XEXP (pat, 1))) >> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h >> index 45e4edceecb..756357db794 100644 >> --- a/gcc/pair-fusion.h >> +++ b/gcc/pair-fusion.h >> @@ -26,8 +26,11 @@ namespace rtl_ssa { >> class insn_info; >> class insn_range_info; >> class bb_info; >> + class insn_change; >> } >> >> +class obstack_watermark; >> + >> // Information about a potential base candidate, used in try_fuse_pair. >> // There may be zero, one, or two viable RTL bases for a given pair. >> struct base_cand >> @@ -142,6 +145,19 @@ struct pair_fusion { >> // true iff INSN is a load pair. >> virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0; >> >> + // Given INSN change multiword mode load and store to respective >> + // unspec instruction. >> + virtual void change_existing_multword_mode (rtx_insn *insn) = 0; >> + >> + // Given INSN and watermark ATTEMPT and PAIR_CHANGE sets the >> + // new rtx with INSN. Remove all uses of definition that are >> + // removed given CHANGES. >> + virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first, >> + obstack_watermark *attempt, >> + rtl_ssa::insn_change **pair_change, >> + auto_vec<rtl_ssa::insn_change *> &changes) >> + = 0; >> + >> // Return true if we should track loads. >> virtual bool track_loads_p () >> { >> @@ -171,6 +187,24 @@ struct pair_fusion { >> virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem, >> rtx regs[2], bool load_p) = 0; >> >> + // Given insn_info pair I1 and I2, sets subreg with multiword registers >> + // to assign register pairs by allocators. 
>> + // LOAD_P is true iff the pair is a load. >> + virtual void set_multiword_subreg (rtl_ssa::insn_info *i1, >> + rtl_ssa::insn_info *i2, >> + bool load_p) = 0; >> + >> + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform >> + // store mem pairs. >> + // Return true if feasible to perform store mem pairs otherwise false. >> + virtual bool fuseable_store_p (rtl_ssa::insn_info *i1, >> + rtl_ssa::insn_info *i2) = 0; >> + >> + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform >> + // load mem pairs. >> + // Return true if feasible to perform load mem pairs otherwise false. >> + virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0; >> + >> void process_block (rtl_ssa::bb_info *bb); >> rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2], >> const rtl_ssa::insn_range_info >> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h >> index e2134621723..d5c5b80f8aa 100644 >> --- a/gcc/rtl-ssa/functions.h >> +++ b/gcc/rtl-ssa/functions.h >> @@ -222,6 +222,13 @@ public: >> template<typename T, typename... Ts> >> T *change_alloc (obstack_watermark &wm, Ts... args); >> >> + auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; } >> + >> + template<typename T, typename... Ts> >> + T *allocate (Ts... args); >> + >> + void remove_use (use_info *); >> + >> private: >> class bb_phi_info; >> class build_info; >> @@ -231,9 +238,6 @@ private: >> // allocate_temp during its lifetime. >> obstack_watermark temp_watermark () { return &m_temp_obstack; } >> >> - template<typename T, typename... Ts> >> - T *allocate (Ts... args); >> - >> template<typename T, typename... Ts> >> T *allocate_temp (Ts... 
args); >> >> @@ -269,7 +273,6 @@ private: >> static void insert_use_after (use_info *, use_info *); >> >> void add_use (use_info *); >> - void remove_use (use_info *); >> >> insn_info::order_node *need_order_node (insn_info *); >> >> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C >> new file mode 100644 >> index 00000000000..d10ff0cdf36 >> --- /dev/null >> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C >> @@ -0,0 +1,22 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target power10_ok } */ >> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ >> + >> +#include <altivec.h> >> + >> +void >> +foo2 () >> +{ >> + __vector_quad *dst1; >> + __vector_quad *dst2; >> + vector unsigned char src; >> + __vector_quad acc; >> + vector unsigned char *ptr; >> + __builtin_mma_xvf32ger(&acc, src, ptr[0]); >> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); >> + *dst1 = acc; >> + __builtin_mma_xvf32ger(&acc, src, ptr[2]); >> + __builtin_mma_xvf32gerpp(&acc, src, ptr[3]); >> + *dst2 = acc; >> +} >> +/* { dg-final { scan-assembler {\mlxvp\M} } } */ >> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C b/gcc/testsuite/g++.target/powerpc/mem-fusion.C >> new file mode 100644 >> index 00000000000..c523572cf3c >> --- /dev/null >> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C >> @@ -0,0 +1,15 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target power10_ok } */ >> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ >> + >> +#include <altivec.h> >> + >> +void >> +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src) >> +{ >> + __vector_quad acc; >> + __builtin_mma_xvf32ger(&acc, src, ptr[0]); >> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); >> + *dst = acc; >> +} >> +/* { dg-final { scan-assembler {\mlxvp\M} } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >> index 69ee826e1be..ae29127f954 
100644 >> --- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >> +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >> @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec) >> dst[13] = acc; >> } >> >> -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */ >> -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */ >> +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */ >> +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */ >> /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */ >> /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */ >> /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */
Ajit Agarwal <aagarwa1@linux.ibm.com> writes: > [...] >>> +// Set subreg for OO mode pair to generate sequential registers given >>> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false >>> +// if store insn. >>> +void >>> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2, >>> + bool load_p) >>> +{ >>> + if (load_p) >>> + { >>> + bool i1_subreg_p = use_has_subreg_p (i1); >>> + bool i2_subreg_p = use_has_subreg_p (i2); >>> + >>> + if (i1_subreg_p || i2_subreg_p) >>> + set_multiword_existing_subreg (i1, i2); >>> + else >>> + set_multiword_subreg_load (i1, i2); >> >> I don't understand this. Why do we have both set_multiword_existing_subreg >> and set_multiword_subreg_load? i1_subreg_p and i2_subreg_p are logically >> independent of one another (since i1 and i2 were separate instructions >> until now). So "i1_subreg_p || i2_subreg_p" implies that >> set_multiword_existing_subreg can handle i1s that have no existing >> subreg (used when i2_subreg_p) and that it can handle i2s that have no >> existing subreg (used when i1_subreg_p). So doesn't this mean that >> set_multiword_existing_subreg can handle everything? >> > > I will make the following change. > if (load_p) > { > bool i1_subreg_p = use_has_subreg_p (i1); > bool i2_subreg_p = use_has_subreg_p (i2); > > if (!i1_subreg_p && !i2_subreg_p) > set_multiword_subreg_load (i1, i2); > else > set_multiword_existing_subreg (i1, i2); > } > > Is this okay. That's the same thing though: it's just replacing a ? A : B with !a ? B : A. >> IMO, the way the update should work is that: >> >> (a) all references to the old registers should be updated via >> insn_propagation (regardless of whether the old references >> involved subregs). >> >> (b) those updates should be part of the same insn_change group as >> the change to the load itself. 
>> >> For stores, definitions of the stored register can probably be handled >> directly using df_refs, but there too, the updates should IMO be part >> of the same insn_change group as the change to the store itself. >> >> In both cases, it's the: >> >> crtl->ssa->change_insns (changes); >> >> in pair_fusion_bb_info::fuse_pair that should be responsible for >> updating the rtl-ssa IR. The changes that the pass wants to make >> should be described as insn_changes and passed to change_insns. >> >> The reason for funneling all changes through change_insns is that >> it allows rtl-ssa to maintain more complex datastructures. Clients >> aren't supposed to manually update the datastructures piecemeal. >> > > I am afraid I am not getting this. Would you mind elaborating this. > Sorry for that. See how fwprop.cc makes changes. It: - creates an insn_change for each change that it wants to make - uses insn_propagation to replace the old value with the new value - sets the new_uses of the insn_change to reflect the effect of the propagation (in this case, replacing the old 128-bit value with a 256-bit value) - uses change_insn to commit the change The process would be similar here. Thanks, Richard
Hello Richard: On 18/07/24 2:04 pm, Ajit Agarwal wrote: > Hello Richard: > > On 18/07/24 1:17 am, Richard Sandiford wrote: >> Ajit Agarwal <aagarwa1@linux.ibm.com> writes: >>> Hello All: >>> >>> This version of patch relaxes store fusion for more use cases. >>> >>> Common infrastructure using generic code for pair mem fusion of different >>> targets. >>> >>> rs6000 target specific code implement virtual functions defined by generic code. >>> >>> Target specific code are added in rs6000-mem-fusion.cc. >>> >>> Bootstrapped and regtested on powerpc64-linux-gnu. >>> >>> Thanks & Regards >>> Ajit >>> >>> >>> rs6000, middle-end: Add implementation for different targets for pair mem fusion >>> >>> Common infrastructure using generic code for pair mem fusion of different >>> targets. >>> >>> rs6000 target specific code implement virtual functions defined by generic code. >>> >>> Target specific code are added in rs6000-mem-fusion.cc. >>> >>> 2024-07-02 Ajit Kumar Agarwal <aagarwa1@linux.ibm.com> >>> >>> gcc/ChangeLog: >>> >>> * config/rs6000/rs6000-passes.def: New mem fusion pass >>> before pass_early_remat. >>> * pair-fusion.h: Add additional pure virtual function >>> required for rs6000 target implementation. >>> * pair-fusion.cc: Use of virtual functions for additional >>> virtual function addded for rs6000 target. >>> * config/rs6000/rs6000-mem-fusion.cc: Add new pass. >>> Add target specific implementation for generic pure virtual >>> functions. >>> * config/rs6000/mma.md: Modify movoo machine description. >>> Add new machine description movoo1. >>> * config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move >>> to expand movoo machine description for all constraints. >>> * config.gcc: Add new object file. >>> * config/rs6000/rs6000-protos.h: Add new prototype for mem >>> fusion pass. >>> * config/rs6000/t-rs6000: Add new rule. >>> * rtl-ssa/functions.h: Move out allocate function from private >>> to public and add get_m_temp_defs function. 
>>> >>> gcc/testsuite/ChangeLog: >>> >>> * g++.target/powerpc/mem-fusion.C: New test. >>> * g++.target/powerpc/mem-fusion-1.C: New test. >>> * gcc.target/powerpc/mma-builtin-1.c: Modify test. >>> --- >>> gcc/config.gcc | 2 + >>> gcc/config/rs6000/mma.md | 26 +- >>> gcc/config/rs6000/rs6000-mem-fusion.cc | 708 ++++++++++++++++++ >>> gcc/config/rs6000/rs6000-passes.def | 4 +- >>> gcc/config/rs6000/rs6000-protos.h | 1 + >>> gcc/config/rs6000/rs6000.cc | 57 +- >>> gcc/config/rs6000/rs6000.md | 1 + >>> gcc/config/rs6000/t-rs6000 | 5 + >>> gcc/pair-fusion.cc | 27 +- >>> gcc/pair-fusion.h | 34 + >>> gcc/rtl-ssa/functions.h | 11 +- >>> .../g++.target/powerpc/mem-fusion-1.C | 22 + >>> gcc/testsuite/g++.target/powerpc/mem-fusion.C | 15 + >>> .../gcc.target/powerpc/mma-builtin-1.c | 4 +- >>> 14 files changed, 890 insertions(+), 27 deletions(-) >>> create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc >>> create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C >>> create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C >>> >>> diff --git a/gcc/config.gcc b/gcc/config.gcc >>> index bc45615741b..12f79a78177 100644 >>> --- a/gcc/config.gcc >>> +++ b/gcc/config.gcc >>> @@ -524,6 +524,7 @@ powerpc*-*-*) >>> extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" >>> extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" >>> extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o" >>> + extra_objs="${extra_objs} rs6000-mem-fusion.o" >>> extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" >>> extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h" >>> extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h" >>> @@ -560,6 +561,7 @@ rs6000*-*-*) >>> extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt" >>> extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" >>> extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" >>> + extra_objs="${extra_objs} rs6000-mem-fusion.o" >>> 
target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc" >>> target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc" >>> ;; >>> diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md >>> index 04e2d0066df..88413926a02 100644 >>> --- a/gcc/config/rs6000/mma.md >>> +++ b/gcc/config/rs6000/mma.md >>> @@ -294,7 +294,31 @@ >>> >>> (define_insn_and_split "*movoo" >>> [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa") >>> - (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))] >>> + (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))] >>> + "TARGET_MMA >>> + && (gpc_reg_operand (operands[0], OOmode) >>> + || gpc_reg_operand (operands[1], OOmode))" >>> +;; "" >>> + "@ >>> + # >>> + # >>> + #" >>> + "&& reload_completed" >>> + [(const_int 0)] >>> +{ >>> + rs6000_split_multireg_move (operands[0], operands[1]); >>> + DONE; >>> +} >>> + [(set_attr "type" "vecload,vecstore,veclogical") >>> + (set_attr "length" "*,*,8")]) >>> +;; (set_attr "max_prefixed_insns" "2,2,*")]) >>> + >>> + >>> +(define_insn_and_split "*movoo1" >>> + [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa") >>> + (unspec [ >>> + (match_operand:OO 1 "input_operand" "ZwO,wa,wa") >>> + ] UNSPEC_LXVP))] >>> "TARGET_MMA >>> && (gpc_reg_operand (operands[0], OOmode) >>> || gpc_reg_operand (operands[1], OOmode))" >>> diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc b/gcc/config/rs6000/rs6000-mem-fusion.cc >>> new file mode 100644 >>> index 00000000000..b63b6f31001 >>> --- /dev/null >>> +++ b/gcc/config/rs6000/rs6000-mem-fusion.cc >>> @@ -0,0 +1,708 @@ >>> +/* Subroutines used to perform adjacent load/store into >>> + paired memory accesses for TARGET_POWER10 and TARGET_VSX. >>> + >>> + Copyright (C) 2024 Free Software Foundation, Inc. >>> + >>> + This file is part of GCC. 
>>> + >>> + GCC is free software; you can redistribute it and/or modify it >>> + under the terms of the GNU General Public License as published >>> + by the Free Software Foundation; either version 3, or (at your >>> + option) any later version. >>> + >>> + GCC is distributed in the hope that it will be useful, but WITHOUT >>> + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY >>> + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public >>> + License for more details. >>> + >>> + You should have received a copy of the GNU General Public License >>> + along with GCC; see the file COPYING3. If not see >>> + <http://www.gnu.org/licenses/>. */ >>> + >>> +#define INCLUDE_ALGORITHM >>> +#define INCLUDE_FUNCTIONAL >>> +#define INCLUDE_LIST >>> +#define INCLUDE_TYPE_TRAITS >>> +#include "config.h" >>> +#include "system.h" >>> +#include "coretypes.h" >>> +#include "backend.h" >>> +#include "rtl.h" >>> +#include "df.h" >>> +#include "rtl-iter.h" >>> +#include "rtl-ssa.h" >>> +#include "rtl-ssa/internals.h" >>> +#include "rtl-ssa/internals.inl" >>> +#include "cfgcleanup.h" >>> +#include "tree-pass.h" >>> +#include "pair-fusion.h" >>> + >>> +using namespace rtl_ssa; >>> + >>> +struct rs6000_pair_fusion : public pair_fusion >>> +{ >>> + bool fpsimd_op_p (rtx , machine_mode , bool) override final >>> + { >>> + return false; >>> + } >>> + >>> + bool pair_mem_insn_p (rtx_insn *, bool &) override final >>> + { >>> + return false; >>> + } >>> + >>> + void change_existing_multword_mode (rtx_insn *insn) override final; >>> + >>> + bool pair_mem_ok_with_policy (rtx, bool) override final >>> + { >>> + return true; >>> + } >>> + >>> + bool pair_operand_mode_ok_p (machine_mode mode) override final; >>> + >>> + rtx gen_pair (rtx *pats, rtx, bool load_p) override final; >>> + >>> + bool pair_reg_operand_ok_p (bool, rtx, machine_mode) override final >>> + { >>> + return true; >>> + } >>> + >>> + int pair_mem_alias_check_limit () override final >>> + { >>> + return 
0; >>> + } >>> + >>> + bool should_handle_writeback (enum writeback_type) override final >>> + { >>> + return false; >>> + } >>> + >>> + bool track_loads_p () override final >>> + { >>> + return true; >>> + } >>> + >>> + bool track_stores_p () override final >>> + { >>> + return true; >>> + } >>> + >>> + bool pair_mem_in_range_p (HOST_WIDE_INT) override final >>> + { >>> + return true; >>> + } >>> + >>> + rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) override final >>> + { >>> + return NULL_RTX; >>> + } >>> + >>> + rtx destructure_pair (rtx_def **, rtx, bool) override final >>> + { >>> + return NULL_RTX; >>> + } >>> + >>> + bool fuseable_store_p (insn_info *i1, insn_info *i2) override final; >>> + >>> + bool fuseable_load_p (insn_info *insn) override final; >>> + >>> + void set_multiword_subreg (insn_info *i1, insn_info *i2, >>> + bool load_p) override final; >>> + >>> + void modify_new_rtx_insn (insn_info *first, obstack_watermark *attempt, >>> + insn_change **pair_change, >>> + auto_vec <insn_change *> &changes) override final; >>> +}; >>> + >>> +bool >>> +rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode) >>> +{ >>> + return (ALTIVEC_OR_VSX_VECTOR_MODE (mode)); >>> +} >>> + >>> +void >>> +rs6000_pair_fusion::change_existing_multword_mode (rtx_insn *insn) >>> +{ >>> + rtx set = single_set (insn); >>> + rtx src = SET_SRC (set); >>> + rtx dest = SET_DEST (set); >>> + rtx copy = NULL_RTX; >>> + >>> + if ((MEM_P (src) && GET_MODE (src) == OOmode) >>> + || (MEM_P (dest) && GET_MODE (dest) == OOmode)) >>> + { >>> + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), >>> + gen_rtvec (1, src), >>> + UNSPEC_LXVP); >>> + copy = gen_rtx_SET (dest, unspec); >>> + rtx_insn *new_insn = emit_insn_after (copy, insn); >>> + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn)); >>> + df_insn_rescan (new_insn); >>> + df_insn_delete (insn); >>> + remove_insn (insn); >>> + insn->set_deleted (); >>> + } >>> +} >>> + >>> +static void >>> +update_change (set_info *set) 
>>> +{ >>> + if (!set->has_any_uses ()) >>> + return; >>> + >>> + auto *use = *set->all_uses ().begin (); >>> + do >>> + { >>> + auto *next_use = use->next_use (); >>> + if (use->is_in_phi ()) >>> + { >>> + update_change (use->phi ()); >>> + } >>> + else >>> + { >>> + crtl->ssa->remove_use (use); >>> + } >>> + use = next_use; >>> + } >>> + while (use); >>> +} >>> + >>> +void >>> +rs6000_pair_fusion::modify_new_rtx_insn (insn_info *first, >>> + obstack_watermark *attempt, >>> + insn_change **pair_change, >>> + auto_vec<insn_change *> &changes) >>> +{ >>> + for (insn_change *change : changes) >>> + for (auto def : change->old_defs ()) >>> + { >>> + auto set = dyn_cast<set_info *> (def); >>> + update_change (set); >>> + } >>> + >>> + auto &new_defs = (*pair_change)->new_defs; >>> + vec_rtx_properties properties; >>> + properties.add_insn (first->rtl (), true); >>> + // Build up the new list of definitions. >>> + for (rtx_obj_reference ref : properties.refs ()) >>> + if (ref.is_write ()) >>> + { >>> + auto *set = crtl->ssa->allocate<set_info> (first, >>> + full_register (ref.regno)); >>> + if (set) >>> + { >>> + auto def = find_access (new_defs, ref.regno); >>> + if (!def) >>> + { >>> + new_defs = insert_access (*attempt, set, >>> + new_defs); >>> + auto &m_temp_defs = crtl->ssa->get_m_temp_defs (); >>> + m_temp_defs.safe_push (set); >>> + } >>> + } >>> + } >>> +} >>> + >>> +// df_insn_rescan dependent instruction where operands >>> +// are reversed given insn_info INFO. >>> +static void >>> +set_rescan_load (insn_info *i1) >>> +{ >>> + for (auto def : i1->defs ()) >>> + { >>> + auto set = dyn_cast<set_info *> (def); >>> + for (auto use : set->all_uses ()) >>> + { >>> + insn_info *info = use->insn (); >>> + if (info && info->rtl ()) >>> + { >>> + rtx_insn *rtl_insn = info->rtl (); >>> + df_insn_rescan (rtl_insn); >>> + } >>> + } >>> + } >>> +} >>> + >>> +// df_insn_rescan the def instruction where operands are reversed given INSN. 
>>> +static bool >>> +set_rescan_store (insn_info *insn) >>> +{ >>> + for (auto use : insn->uses()) >>> + { >>> + auto def = use->def (); >>> + >>> + if (!def) >>> + return false; >>> + >>> + if (def->insn ()->is_artificial ()) >>> + return false; >>> + >>> + if (def->insn () && def->insn ()->rtl () >>> + && def->insn()->is_real ()) >>> + { >>> + rtx_insn *rtl_insn = def->insn ()->rtl (); >>> + rtx set = single_set (rtl_insn); >>> + >>> + if (set == NULL_RTX) >>> + return false; >>> + df_insn_rescan (rtl_insn); >>> + } >>> + } >>> + return true; >>> +} >>> + >>> +// Check for feasibility of store to be fuseable or not. Return true if >>> +// feasible otherwise false. >>> +static bool >>> +feasible_store_p (insn_info *insn) >>> +{ >>> + for (auto use : insn->uses ()) >>> + { >>> + auto def = use->def (); >>> + >>> + if (def->insn ()->is_artificial ()) >>> + return false; >>> + >>> + if (def->insn () && def->insn ()->rtl () >>> + && def->insn()->is_real ()) >>> + { >>> + rtx_insn *rtl_insn = def->insn ()->rtl (); >>> + rtx set = single_set (rtl_insn); >>> + >>> + if (set == NULL_RTX) >>> + return false; >>> + >>> + // Return false if dependent def is load. >>> + // This is done as def instruction could be a fused load and >>> + // to avoid already existing subreg (reg:OO R) offset. >>> + if (rtl_insn && MEM_P (SET_SRC (set))) >>> + return false; >>> + >>> + // Return false if dependent def is store. >>> + if (rtl_insn && MEM_P (SET_DEST (set))) >>> + return false; >> >> I don't understand these tests. It might help to turn it around and >> say: what sort of cases do you want to handle? If the def instruction is Load and store which already are fused then I bail out with store fusion. >> >>> + } >>> + } >>> + return true; >>> +} >>> + >>> +// Check if store can be fuseable or not. Return true if fuseable otherwise >>> +// false. 
>>> +bool >>> +rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2) >>> +{ >>> + rtx_insn *insn1 = i1->rtl (); >>> + rtx_insn *insn2 = i2->rtl (); >>> + rtx body = PATTERN (insn1); >>> + rtx src_exp = SET_SRC (body); >>> + rtx insn2_body = PATTERN (insn2); >>> + rtx insn2_src_exp = SET_SRC (insn2_body); >>> + >>> + if (!(REG_P (src_exp) >>> + && crtl->ssa->single_dominating_def (REGNO (src_exp)))) >>> + return false; >>> + >>> + // This is done as def instruction could be a fused load and >>> + // to avoid already existing subreg (reg:OO R) offset. >>> + if (DF_REG_USE_COUNT (REGNO (src_exp)) > 1) >>> + return false; >>> + >>> + // Return false if src of insn1 and src of insn2 are same. >>> + if (src_exp == insn2_src_exp) >>> + return false; >>> + >>> + // Return false if src of insn1 is subreg. >>> + if (GET_CODE (src_exp) == SUBREG) >>> + return false; >> >> This can't be true after the REG_P check above. >> > > I will make this change. > I have made these changes and send a separate patch. >>> + >>> + // Return false if src of insn2 is subreg. >>> + if (GET_CODE (insn2_src_exp) == SUBREG) >>> + return false; >> >> Shouldn't the tests for i1 and i2 be symmetrical, with i2 also >> requiring a single dominating definition? >> > > I will make this change. > I have made changes in separate patch. >>> + >>> + if (!feasible_store_p (i1)) >>> + return false;; >>> + >>> + if (!feasible_store_p (i2)) >>> + return false; >>> + >>> + return true; >>> +} >>> + >>> +// Set subreg for def of store INSN given rtx SRC instruction. 
>>> +static void >>> +set_store_subreg (insn_info *i1, rtx src, int regoff) >>> +{ >>> + for (auto use: i1->uses ()) >>> + { >>> + auto def = use->def (); >>> + if (!def) >>> + return; >>> + >>> + insn_info *info = def->insn (); >>> + >>> + if (info->is_artificial ()) >>> + return; >>> + >>> + if (info && info->is_real ()) >>> + { >>> + rtx_insn *rtl_insn = info->rtl (); >>> + rtx set = single_set (rtl_insn); >>> + if (set == NULL_RTX) >>> + return; >>> + df_ref ref; >>> + FOR_EACH_INSN_DEF (ref, rtl_insn) >>> + { >>> + rtx src_exp = SET_SRC (PATTERN (i1->rtl ())); >>> + if (REG_P (src_exp) && DF_REF_REGNO (ref) == REGNO (src_exp)) >>> + { >>> + rtx *loc = DF_REF_LOC (ref); >>> + if (GET_CODE (*loc) == SUBREG) >>> + { >>> + rtx src1 = simplify_gen_subreg (GET_MODE (*loc), >>> + SUBREG_REG (src), >>> + OOmode, >>> + regoff); >>> + *loc = copy_rtx (src1); >>> + } >>> + else >>> + *loc = copy_rtx (src); >>> + } >>> + } >>> + } >>> + } >>> +} >>> + >>> +// Check whether load can be fusable or not. >>> +// Return true if fuseable otherwise false. >>> +bool >>> +rs6000_pair_fusion::fuseable_load_p (insn_info *i1) >>> +{ >>> + rtx_insn *insn = i1->rtl (); >>> + rtx body = PATTERN (insn); >>> + rtx dest_exp = SET_DEST (body); >>> + >>> + if (!(REG_P (dest_exp) >>> + && crtl->ssa->single_dominating_def (REGNO (dest_exp)))) >>> + return false; >>> + return true; >>> +} >>> + >>> +// Propagate insn I1 with new rtx NEW_DEST_EXP. >>> +static void >>> +propagate_insn (insn_info *i1, rtx new_dest_exp) >>> +{ >>> + df_ref ref; >>> + FOR_EACH_INSN_DEF (ref, i1->rtl()) >>> + { >>> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); >>> + if (REG_P (dest_exp) >>> + && DF_REF_REGNO (ref) == REGNO (dest_exp)) >>> + { >>> + rtx *loc = DF_REF_LOC (ref); >>> + *loc = new_dest_exp; >>> + } >>> + } >>> +} >>> + >>> +// Generate new reg rtx with copy of OLD_DEST for OOmode pair. 
>>> +static rtx >>> +new_reg_rtx (rtx old_dest) >>> +{ >>> + rtx new_dest_exp = gen_reg_rtx (OOmode); >>> + ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest); >>> + REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest); >>> + REG_POINTER (new_dest_exp) = REG_POINTER (old_dest); >>> + REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest); >>> + max_regno = max_reg_num (); >>> + return new_dest_exp; >>> +} >>> + >>> +// Set subreg with use of INSN given SRC rtx instruction. >>> +static void >>> +set_load_subreg (insn_info *i1, rtx src) >>> +{ >>> + rtx set = single_set (i1->rtl()); >>> + rtx old_dest = SET_DEST (set); >>> + >>> + for (auto def : i1->defs ()) >>> + { >>> + auto set = dyn_cast<set_info *> (def); >>> + for (auto use : set->nondebug_insn_uses ()) >>> + { >>> + insn_info *info = use->insn (); >>> + if (!info || !info->rtl ()) >>> + continue; >>> + >>> + rtx_insn *rtl_insn = info->rtl (); >>> + df_ref ref; >>> + >>> + FOR_EACH_INSN_USE (ref, rtl_insn) >>> + { >>> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); >>> + if (REG_P (dest_exp) >>> + && DF_REF_REGNO (ref) == REGNO (dest_exp)) >>> + { >>> + rtx *loc = DF_REF_LOC (ref); >>> + insn_propagation prop (rtl_insn, old_dest, src); >>> + if (GET_CODE (*loc) == SUBREG) >>> + { >>> + if (!prop.apply_to_pattern (loc)) >>> + { >>> + if (dump_file != NULL) >>> + { >>> + fprintf (dump_file, >>> + "Cannot propagate insn \n"); >>> + print_rtl_single (dump_file, rtl_insn); >>> + } >>> + return; >>> + } >>> + } >>> + else >>> + *loc = copy_rtx (src); >>> + } >>> + } >>> + } >>> + } >>> +} >>> + >>> +// Set subreg for OO mode store pair to generate registers in pairs >>> +// given insn_info I1 and I2. 
>>> +static void >>> +set_multiword_subreg_store (insn_info *i1, insn_info *i2) >>> +{ >>> + rtx_insn *insn1 = i1->rtl (); >>> + rtx_insn *insn2 = i2->rtl (); >>> + rtx body = PATTERN (insn1); >>> + rtx src_exp = SET_SRC (body); >>> + rtx insn2_body = PATTERN (insn2); >>> + rtx insn2_dest_exp = SET_DEST (insn2_body); >>> + machine_mode mode = GET_MODE (src_exp); >>> + int regoff; >>> + rtx src; >>> + rtx addr = XEXP (insn2_dest_exp, 0); >>> + >>> + PUT_MODE_RAW (src_exp, OOmode); >>> + if (GET_CODE (addr) == PLUS >>> + && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1))) >>> + regoff = 16; >>> + else >>> + regoff = 0; >>> + >>> + src = simplify_gen_subreg (mode, >>> + src_exp, GET_MODE (src_exp), >>> + regoff); >>> + >>> + set_store_subreg (i1, src, regoff); >>> + >>> + int regoff1 = 0; >>> + rtx src1; >>> + >>> + src1 = simplify_gen_subreg (mode, >>> + src_exp, GET_MODE (src_exp), >>> + regoff1); >>> + >>> + set_store_subreg (i2, src1, regoff1); >>> + set_rescan_store (i1); >>> + set_rescan_store (i2); >>> + df_insn_rescan (insn1); >>> +} >>> + >>> +// Set subreg for OO mode pair load to generate registers in pairs given >>> +// insn_info I2 and I2. 
>>> +static void >>> +set_multiword_subreg_load (insn_info *i1, insn_info *i2) >>> +{ >>> + rtx_insn *insn1 = i1->rtl (); >>> + rtx body = PATTERN (insn1); >>> + rtx dest_exp = SET_DEST (body); >>> + machine_mode mode = GET_MODE (dest_exp); >>> + PUT_MODE_RAW (dest_exp, OOmode); >>> + >>> + int regoff = 0; >>> + rtx src; >>> + >>> + src = simplify_gen_subreg (mode, >>> + dest_exp, GET_MODE (dest_exp), >>> + regoff); >>> + >>> + set_load_subreg (i2, src); >>> + >>> + int regoff1; >>> + rtx src1; >>> + >>> + regoff1 = 16; >>> + src1 = simplify_gen_subreg (mode, >>> + dest_exp, GET_MODE (dest_exp), >>> + regoff1); >>> + set_load_subreg (i1, src1); >>> + >>> + set_rescan_load (i1); >>> + set_rescan_load (i2); >>> + df_insn_rescan (insn1); >>> +} >>> + >>> +// Set subreg for OO mode pair load for existing subreg rtx to generate >>> +// registers in pairs given insn_info I2 and I2. >>> +static void >>> +set_multiword_existing_subreg (insn_info *i1, insn_info *i2) >>> +{ >>> + rtx_insn *insn1 = i1->rtl (); >>> + rtx body = PATTERN (insn1); >>> + rtx dest_exp = SET_DEST (body); >>> + machine_mode mode = GET_MODE (dest_exp); >>> + int regoff1; >>> + regoff1 = 16; >>> + rtx new_dest_exp = new_reg_rtx (dest_exp); >>> + >>> + rtx src = simplify_gen_subreg (mode, >>> + new_dest_exp, >>> + OOmode, >>> + regoff1); >>> + >>> + set_load_subreg (i1, src); >>> + propagate_insn (i1, new_dest_exp); >>> + >>> + int regoff = 0; >>> + rtx sset = single_set (i2->rtl ()); >>> + rtx insn2_dest_exp = SET_DEST (sset); >>> + machine_mode insn2_mode = GET_MODE (insn2_dest_exp); >>> + >>> + src = simplify_gen_subreg (insn2_mode, >>> + new_dest_exp, >>> + OOmode, >>> + regoff); >>> + >>> + set_load_subreg (i2, src); >>> + propagate_insn (i2, new_dest_exp); >>> + >>> + auto attempt = crtl->ssa->new_change_attempt (); >>> + resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) }; >>> + auto *set = crtl->ssa->allocate<set_info> (i1, resource); >>> + if (set) >>> + { >>> + auto def 
= find_access (i1->defs (), REGNO (new_dest_exp)); >>> + if (!def) >>> + i1->defs() = insert_access (attempt, set, i1->defs()); >>> + } >>> + >>> + set_rescan_load (i1); >>> + set_rescan_load (i2); >>> + df_insn_rescan (insn1); >>> +} >>> + >>> +// Return true iff insn I1 has already existing subreg. >>> +static bool >>> +use_has_subreg_p (insn_info *i1) >>> +{ >>> + for (auto def : i1->defs ()) >>> + { >>> + auto set = dyn_cast<set_info *> (def); >>> + for (auto use : set->nondebug_insn_uses ()) >>> + { >>> + insn_info *info = use->insn (); >>> + if (info && info->rtl ()) >>> + { >>> + rtx_insn *rtl_insn = info->rtl (); >>> + df_ref ref; >>> + FOR_EACH_INSN_USE (ref, rtl_insn) >>> + { >>> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); >>> + if (REG_P (dest_exp) >>> + && DF_REF_REGNO (ref) == REGNO (dest_exp)) >>> + { >>> + rtx *loc = DF_REF_LOC (ref); >>> + if (GET_CODE (*loc) == SUBREG) >>> + return true; >>> + } >>> + } >>> + } >>> + } >>> + } >>> + return false; >>> +} >>> + >>> +// Set subreg for OO mode pair to generate sequential registers given >>> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false >>> +// if store insn. >>> +void >>> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2, >>> + bool load_p) >>> +{ >>> + if (load_p) >>> + { >>> + bool i1_subreg_p = use_has_subreg_p (i1); >>> + bool i2_subreg_p = use_has_subreg_p (i2); >>> + >>> + if (i1_subreg_p || i2_subreg_p) >>> + set_multiword_existing_subreg (i1, i2); >>> + else >>> + set_multiword_subreg_load (i1, i2); >> >> I don't understand this. Why do we have both set_multiword_existing_subreg >> and set_multiword_subreg_load? i1_subreg_p and i2_subreg_p are logically >> independent of one another (since i1 and i2 were separate instructions >> until now). 
So "i1_subreg_p || i2_subreg_p" implies that >> set_multiword_existing_subreg can handle i1s that have no existing >> subreg (used when i2_subreg_p) and that it can handle i2s that have no >> existing subreg (used when i1_subreg_p). So doesn't this mean that >> set_multiword_existing_subreg can handle everything? >> > > I will make the following change. > if (load_p) > { > bool i1_subreg_p = use_has_subreg_p (i1); > bool i2_subreg_p = use_has_subreg_p (i2); > > if (!i1_subreg_p && !i2_subreg_p) > set_multiword_subreg_load (i1, i2); > else > set_multiword_existing_subreg (i1, i2); > } > > Is this okay? > I have made these changes. > >> IMO, the way the update should work is that: >> >> (a) all references to the old registers should be updated via >> insn_propagation (regardless of whether the old references >> involved subregs). >> >> (b) those updates should be part of the same insn_change group as >> the change to the load itself. >> >> For stores, definitions of the stored register can probably be handled >> directly using df_refs, but there too, the updates should IMO be part >> of the same insn_change group as the change to the store itself. >> >> In both cases, it's the: >> >> crtl->ssa->change_insns (changes); >> >> in pair_fusion_bb_info::fuse_pair that should be responsible for >> updating the rtl-ssa IR. The changes that the pass wants to make >> should be described as insn_changes and passed to change_insns. >> >> The reason for funneling all changes through change_insns is that >> it allows rtl-ssa to maintain more complex data structures. Clients >> aren't supposed to manually update the data structures piecemeal. >> > > I am afraid I do not follow this. Would you mind elaborating on it? > Sorry about that. > I have now updated the references to all uses (for loads) and all defs (for stores) in the same change. I will send a separate patch with these changes. 
>> Thanks, >> Richard >> > Thanks & Regards Ajit > Thanks & Regards > Ajit > >>> + } >>> + else >>> + set_multiword_subreg_store (i1, i2); >>> +} >>> + >>> +rtx >>> +rs6000_pair_fusion::gen_pair (rtx *pats, rtx, bool load_p) >>> +{ >>> + rtx i1 = pats[0]; >>> + rtx src_exp = SET_SRC (i1); >>> + rtx dest_exp = SET_DEST (i1); >>> + PUT_MODE_RAW (src_exp, OOmode); >>> + PUT_MODE_RAW (dest_exp, OOmode); >>> + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest_exp), >>> + gen_rtvec (1, src_exp), >>> + UNSPEC_LXVP); >>> + rtx set = gen_rtx_SET (dest_exp, unspec); >>> + if (dump_file) >>> + { >>> + if (load_p) >>> + fprintf (dump_file, "lxv with lxvp "); >>> + else >>> + fprintf (dump_file, "stxv with stxvp "); >>> + print_rtl_single (dump_file, set); >>> + } >>> + return set; >>> +} >>> + >>> +const pass_data pass_data_mem_fusion = >>> +{ >>> + RTL_PASS, /* type */ >>> + "mem_fusion", /* name */ >>> + OPTGROUP_NONE, /* optinfo_flags */ >>> + TV_NONE, /* tv_id */ >>> + 0, /* properties_required */ >>> + 0, /* properties_provided */ >>> + 0, /* properties_destroyed */ >>> + 0, /* todo_flags_start */ >>> + TODO_df_finish, /* todo_flags_finish */ >>> +}; >>> + >>> +class pass_mem_fusion : public rtl_opt_pass >>> +{ >>> +public: >>> + pass_mem_fusion (gcc::context *ctxt) >>> + : rtl_opt_pass (pass_data_mem_fusion, ctxt) >>> + {} >>> + >>> + opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);} >>> + >>> + /* opt_pass methods: */ >>> + bool gate (function *) >>> + { >>> + return (optimize > 0 && TARGET_VSX && TARGET_POWER10); >>> + } >>> + >>> + unsigned int execute (function *) final override >>> + { >>> + rs6000_pair_fusion pass; >>> + pass.run (); >>> + return 0; >>> + } >>> +}; // class pass_mem_fusion >>> + >>> +rtl_opt_pass * >>> +make_pass_mem_fusion (gcc::context *ctxt) >>> +{ >>> + return new pass_mem_fusion (ctxt); >>> +} >>> diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def >>> index 46a0d0b8c56..0b48f57014d 100644 >>> --- 
a/gcc/config/rs6000/rs6000-passes.def >>> +++ b/gcc/config/rs6000/rs6000-passes.def >>> @@ -28,7 +28,9 @@ along with GCC; see the file COPYING3. If not see >>> The power8 does not have instructions that automaticaly do the byte swaps >>> for loads and stores. */ >>> INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps); >>> - >>> + /* Pass to replace adjacent memory addresses lxv/stxv instruction with >>> + lxvp/stxvp instruction. */ >>> + INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion); >>> /* Pass to do the PCREL_OPT optimization that combines the load of an >>> external symbol's address along with a single load or store using that >>> address as a base register. */ >>> diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h >>> index 09a57a806fa..1412b31c2eb 100644 >>> --- a/gcc/config/rs6000/rs6000-protos.h >>> +++ b/gcc/config/rs6000/rs6000-protos.h >>> @@ -343,6 +343,7 @@ namespace gcc { class context; } >>> class rtl_opt_pass; >>> >>> extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *); >>> +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *); >>> extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *); >>> extern bool rs6000_sum_of_two_registers_p (const_rtx expr); >>> extern bool rs6000_quadword_masked_address_p (const_rtx exp); >>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >>> index 58553ff66f4..6da4e70973d 100644 >>> --- a/gcc/config/rs6000/rs6000.cc >>> +++ b/gcc/config/rs6000/rs6000.cc >>> @@ -27428,7 +27428,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) >>> reg_mode = word_mode; >>> reg_mode_size = GET_MODE_SIZE (reg_mode); >>> >>> - gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode)); >>> + gcc_assert (mode == OOmode >>> + || reg_mode_size * nregs == GET_MODE_SIZE (mode)); >>> >>> /* TDmode residing in FP registers is special, since the ISA requires that >>> the lower-numbered word of a register pair is always the most significant >>> @@ -27475,6 +27476,11 
@@ rs6000_split_multireg_move (rtx dst, rtx src) >>> int reg_mode_nregs = hard_regno_nregs (reg, reg_mode); >>> if (MEM_P (dst)) >>> { >>> + rtx addr = XEXP (dst, 0); >>> + rtx opnd1 = NULL_RTX; >>> + if (addr && GET_CODE (addr) == PLUS) >>> + opnd1 = XEXP (addr,1); >>> + >>> unsigned offset = 0; >>> unsigned size = GET_MODE_SIZE (reg_mode); >>> >>> @@ -27488,7 +27494,13 @@ rs6000_split_multireg_move (rtx dst, rtx src) >>> { >>> unsigned subreg >>> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); >>> - rtx dst2 = adjust_address (dst, reg_mode, offset); >>> + rtx dst2 = dst; >>> + >>> + if ((GET_CODE (addr) != PLUS >>> + || (opnd1 && CONST_INT_P(opnd1)))) >>> + dst2 = adjust_address (dst, reg_mode, offset); >>> + else >>> + PUT_MODE_RAW (dst, reg_mode); >>> rtx src2 = gen_rtx_REG (reg_mode, reg + subreg); >>> offset += size; >>> emit_insn (gen_rtx_SET (dst2, src2)); >>> @@ -27499,15 +27511,25 @@ rs6000_split_multireg_move (rtx dst, rtx src) >>> >>> if (MEM_P (src)) >>> { >>> + rtx addr = XEXP (src, 0); >>> + rtx opnd1 = NULL_RTX; >>> + if (addr && GET_CODE (addr) == PLUS) >>> + opnd1 = XEXP (addr,1); >>> + >>> unsigned offset = 0; >>> unsigned size = GET_MODE_SIZE (reg_mode); >>> >>> - for (int i = 0; i < nregs; i += reg_mode_nregs) >>> + for (int i = nregs-1; i >= 0; i -= reg_mode_nregs) >>> { >>> unsigned subreg >>> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); >>> rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg); >>> - rtx src2 = adjust_address (src, reg_mode, offset); >>> + rtx src2 = src; >>> + >>> + if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1)))) >>> + src2 = adjust_address (src, reg_mode, offset); >>> + else >>> + PUT_MODE_RAW (src2, reg_mode); >>> offset += size; >>> emit_insn (gen_rtx_SET (dst2, src2)); >>> } >>> @@ -27515,7 +27537,7 @@ rs6000_split_multireg_move (rtx dst, rtx src) >>> /* If we are writing an accumulator register, we have to >>> prime it after we've written it. 
*/ >>> if (TARGET_MMA >>> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) >>> + && REG_P (dst) && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) >>> emit_insn (gen_mma_xxmtacc (dst, dst)); >>> >>> return; >>> @@ -27608,9 +27630,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) >>> { >>> for (i = nregs - 1; i >= 0; i--) >>> { >>> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); >>> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); >>> - emit_insn (gen_rtx_SET (dst_i, src_i)); >>> + if (REG_P (dst) && REG_P (src)) >>> + { >>> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); >>> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); >>> + emit_insn (gen_rtx_SET (dst_i, src_i)); >>> + } >>> } >>> } >>> else >>> @@ -27625,7 +27650,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) >>> /* If we are writing an accumulator register, we have to >>> prime it after we've written it. */ >>> if (TARGET_MMA >>> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) >>> + && REG_P (dst) && GET_MODE (dst) == XOmode >>> + && FP_REGNO_P (REGNO (dst))) >>> emit_insn (gen_mma_xxmtacc (dst, dst)); >>> } >>> else >>> @@ -27682,7 +27708,7 @@ rs6000_split_multireg_move (rtx dst, rtx src) >>> >>> /* If the base register we are using to address memory is >>> also a destination reg, then change that register last. */ >>> - if (REG_P (breg) >>> + if (REG_P (dst) && REG_P (breg) >>> && REGNO (breg) >= REGNO (dst) >>> && REGNO (breg) < REGNO (dst) + nregs) >>> j = REGNO (breg) - REGNO (dst); >>> @@ -27780,9 +27806,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) >>> /* XO/OO are opaque so cannot use subregs. 
*/ >>> if (mode == OOmode || mode == XOmode ) >>> { >>> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); >>> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); >>> - emit_insn (gen_rtx_SET (dst_i, src_i)); >>> + if (REG_P (dst) && REG_P (src)) >>> + { >>> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); >>> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); >>> + emit_insn (gen_rtx_SET (dst_i, src_i)); >>> + } >>> } >>> else >>> emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode, >>> @@ -27800,7 +27829,9 @@ rs6000_split_multireg_move (rtx dst, rtx src) >>> if (restore_basereg != NULL_RTX) >>> emit_insn (restore_basereg); >>> } >>> + return; >>> } >>> + >>> >>> /* Return true if the peephole2 can combine a load involving a combination of >>> an addis instruction and a load with an offset that can be fused together on >>> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md >>> index a5d20594789..2106e1a1fed 100644 >>> --- a/gcc/config/rs6000/rs6000.md >>> +++ b/gcc/config/rs6000/rs6000.md >>> @@ -159,6 +159,7 @@ >>> UNSPEC_XXSPLTIW_CONST >>> UNSPEC_FMAX >>> UNSPEC_FMIN >>> + UNSPEC_LXVP >>> ]) >>> >>> ;; >>> diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 >>> index b3ce09d523b..df9b3a35b66 100644 >>> --- a/gcc/config/rs6000/t-rs6000 >>> +++ b/gcc/config/rs6000/t-rs6000 >>> @@ -35,6 +35,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc >>> $(COMPILE) $< >>> $(POSTCOMPILE) >>> >>> +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc >>> + $(COMPILE) $< >>> + $(POSTCOMPILE) >>> + >>> + >>> rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc >>> $(COMPILE) $< >>> $(POSTCOMPILE) >>> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc >>> index 31d2c21c88f..ff77a0bc8c6 100644 >>> --- a/gcc/pair-fusion.cc >>> +++ b/gcc/pair-fusion.cc >>> @@ -312,9 +312,9 @@ static int >>> encode_lfs (lfs_fields fields) >>> { >>> int size_log2 = exact_log2 (fields.size); >>> - 
gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4); >>> - return ((int)fields.load_p << 3) >>> - | ((int)fields.fpsimd_p << 2) >>> + gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9); >>> + return ((int)fields.load_p << 4) >>> + | ((int)fields.fpsimd_p << 3) >>> | (size_log2 - 2); >>> } >>> >>> @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields) >>> static lfs_fields >>> decode_lfs (int lfs) >>> { >>> - bool load_p = (lfs & (1 << 3)); >>> - bool fpsimd_p = (lfs & (1 << 2)); >>> + bool load_p = (lfs & (1 << 4)); >>> + bool fpsimd_p = (lfs & (1 << 3)); >>> unsigned size = 1U << ((lfs & 3) + 2); >>> return { load_p, fpsimd_p, size }; >>> } >>> @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem) >>> if (MEM_VOLATILE_P (mem)) >>> return; >>> >>> + if (load_p && !m_pass->fuseable_load_p (insn)) >>> + return; >>> + >>> // Ignore writeback accesses if the hook says to do so. >>> if (!m_pass->should_handle_writeback (writeback_type::EXISTING) >>> && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC) >>> @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, >>> } >>> >>> rtx reg_notes = combine_reg_notes (first, second, load_p); >>> - >>> + m_pass->set_multiword_subreg (i1, i2, load_p); >>> rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p); >>> insn_change *pair_change = nullptr; >>> auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) { >>> @@ -1833,6 +1836,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, >>> pair_change->new_defs = merge_access_arrays (attempt, >>> input_defs[0], >>> input_defs[1]); >>> + m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes); >>> gcc_assert (pair_change->new_defs.is_valid ()); >>> >>> pair_change->new_uses >>> @@ -2405,6 +2409,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size, >>> reg_ops[i] = XEXP (pats[i], !load_p); >>> } >>> >>> + if (!load_p && !m_pass->fuseable_store_p (i1, i2)) >>> + { >>> + if 
(dump_file) >>> + fprintf (dump_file, >>> + "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n", >>> + insns[0]->uid (), insns[1]->uid ()); >>> + return false; >>> + } >>> + >>> if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1])) >>> { >>> if (dump_file) >>> @@ -2997,6 +3010,8 @@ void pair_fusion::process_block (bb_info *bb) >>> if (GET_CODE (pat) != SET) >>> continue; >>> >>> + change_existing_multword_mode (rti); >>> + >>> if (track_stores && MEM_P (XEXP (pat, 0))) >>> bb_state.track_access (insn, false, XEXP (pat, 0)); >>> else if (track_loads && MEM_P (XEXP (pat, 1))) >>> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h >>> index 45e4edceecb..756357db794 100644 >>> --- a/gcc/pair-fusion.h >>> +++ b/gcc/pair-fusion.h >>> @@ -26,8 +26,11 @@ namespace rtl_ssa { >>> class insn_info; >>> class insn_range_info; >>> class bb_info; >>> + class insn_change; >>> } >>> >>> +class obstack_watermark; >>> + >>> // Information about a potential base candidate, used in try_fuse_pair. >>> // There may be zero, one, or two viable RTL bases for a given pair. >>> struct base_cand >>> @@ -142,6 +145,19 @@ struct pair_fusion { >>> // true iff INSN is a load pair. >>> virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0; >>> >>> + // Given INSN change multiword mode load and store to respective >>> + // unspec instruction. >>> + virtual void change_existing_multword_mode (rtx_insn *insn) = 0; >>> + >>> + // Given INSN and watermark ATTEMPT and PAIR_CHANGE sets the >>> + // new rtx with INSN. Remove all uses of definition that are >>> + // removed given CHANGES. >>> + virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first, >>> + obstack_watermark *attempt, >>> + rtl_ssa::insn_change **pair_change, >>> + auto_vec<rtl_ssa::insn_change *> &changes) >>> + = 0; >>> + >>> // Return true if we should track loads. 
>>> virtual bool track_loads_p () >>> { >>> @@ -171,6 +187,24 @@ struct pair_fusion { >>> virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem, >>> rtx regs[2], bool load_p) = 0; >>> >>> + // Given insn_info pair I1 and I2, sets subreg with multiword registers >>> + // to assign register pairs by allocators. >>> + // LOAD_P is true iff the pair is a load. >>> + virtual void set_multiword_subreg (rtl_ssa::insn_info *i1, >>> + rtl_ssa::insn_info *i2, >>> + bool load_p) = 0; >>> + >>> + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform >>> + // store mem pairs. >>> + // Return true if feasible to perform store mem pairs otherwise false. >>> + virtual bool fuseable_store_p (rtl_ssa::insn_info *i1, >>> + rtl_ssa::insn_info *i2) = 0; >>> + >>> + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform >>> + // load mem pairs. >>> + // Return true if feasible to perform load mem pairs otherwise false. >>> + virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0; >>> + >>> void process_block (rtl_ssa::bb_info *bb); >>> rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2], >>> const rtl_ssa::insn_range_info >>> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h >>> index e2134621723..d5c5b80f8aa 100644 >>> --- a/gcc/rtl-ssa/functions.h >>> +++ b/gcc/rtl-ssa/functions.h >>> @@ -222,6 +222,13 @@ public: >>> template<typename T, typename... Ts> >>> T *change_alloc (obstack_watermark &wm, Ts... args); >>> >>> + auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; } >>> + >>> + template<typename T, typename... Ts> >>> + T *allocate (Ts... args); >>> + >>> + void remove_use (use_info *); >>> + >>> private: >>> class bb_phi_info; >>> class build_info; >>> @@ -231,9 +238,6 @@ private: >>> // allocate_temp during its lifetime. >>> obstack_watermark temp_watermark () { return &m_temp_obstack; } >>> >>> - template<typename T, typename... Ts> >>> - T *allocate (Ts... 
args); >>> - >>> template<typename T, typename... Ts> >>> T *allocate_temp (Ts... args); >>> >>> @@ -269,7 +273,6 @@ private: >>> static void insert_use_after (use_info *, use_info *); >>> >>> void add_use (use_info *); >>> - void remove_use (use_info *); >>> >>> insn_info::order_node *need_order_node (insn_info *); >>> >>> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C >>> new file mode 100644 >>> index 00000000000..d10ff0cdf36 >>> --- /dev/null >>> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C >>> @@ -0,0 +1,22 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-require-effective-target power10_ok } */ >>> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ >>> + >>> +#include <altivec.h> >>> + >>> +void >>> +foo2 () >>> +{ >>> + __vector_quad *dst1; >>> + __vector_quad *dst2; >>> + vector unsigned char src; >>> + __vector_quad acc; >>> + vector unsigned char *ptr; >>> + __builtin_mma_xvf32ger(&acc, src, ptr[0]); >>> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); >>> + *dst1 = acc; >>> + __builtin_mma_xvf32ger(&acc, src, ptr[2]); >>> + __builtin_mma_xvf32gerpp(&acc, src, ptr[3]); >>> + *dst2 = acc; >>> +} >>> +/* { dg-final { scan-assembler {\mlxvp\M} } } */ >>> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C b/gcc/testsuite/g++.target/powerpc/mem-fusion.C >>> new file mode 100644 >>> index 00000000000..c523572cf3c >>> --- /dev/null >>> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C >>> @@ -0,0 +1,15 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-require-effective-target power10_ok } */ >>> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ >>> + >>> +#include <altivec.h> >>> + >>> +void >>> +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src) >>> +{ >>> + __vector_quad acc; >>> + __builtin_mma_xvf32ger(&acc, src, ptr[0]); >>> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); >>> + *dst = acc; >>> +} >>> +/* { dg-final { scan-assembler {\mlxvp\M} } } */ >>> diff 
--git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >>> index 69ee826e1be..ae29127f954 100644 >>> --- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >>> +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >>> @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec) >>> dst[13] = acc; >>> } >>> >>> -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */ >>> -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */ >>> +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */ >>> +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */ >>> /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */ >>> /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */ >>> /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */
Hello Richard: On 18/07/24 4:44 pm, Richard Sandiford wrote: > Ajit Agarwal <aagarwa1@linux.ibm.com> writes: >> [...] >>>> +// Set subreg for OO mode pair to generate sequential registers given >>>> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false >>>> +// if store insn. >>>> +void >>>> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2, >>>> + bool load_p) >>>> +{ >>>> + if (load_p) >>>> + { >>>> + bool i1_subreg_p = use_has_subreg_p (i1); >>>> + bool i2_subreg_p = use_has_subreg_p (i2); >>>> + >>>> + if (i1_subreg_p || i2_subreg_p) >>>> + set_multiword_existing_subreg (i1, i2); >>>> + else >>>> + set_multiword_subreg_load (i1, i2); >>> >>> I don't understand this. Why do we have both set_multiword_existing_subreg >>> and set_multiword_subreg_load? i1_subreg_p and i2_subreg_p are logically >>> independent of one another (since i1 and i2 were separate instructions >>> until now). So "i1_subreg_p || i2_subreg_p" implies that >>> set_multiword_existing_subreg can handle i1s that have no existing >>> subreg (used when i2_subreg_p) and that it can handle i2s that have no >>> existing subreg (used when i1_subreg_p). So doesn't this mean that >>> set_multiword_existing_subreg can handle everything? >>> >> >> I will make the following change: >> if (load_p) >> { >> bool i1_subreg_p = use_has_subreg_p (i1); >> bool i2_subreg_p = use_has_subreg_p (i2); >> >> if (!i1_subreg_p && !i2_subreg_p) >> set_multiword_subreg_load (i1, i2); >> else >> set_multiword_existing_subreg (i1, i2); >> } >> >> Is this okay? > > That's the same thing though: it's just replacing a ? A : B with !a ? B : A. > Addressed in v7 of the patch. >>> IMO, the way the update should work is that: >>> >>> (a) all references to the old registers should be updated via >>> insn_propagation (regardless of whether the old references >>> involved subregs). >>> >>> (b) those updates should be part of the same insn_change group as >>> the change to the load itself.
>>> >>> For stores, definitions of the stored register can probably be handled >>> directly using df_refs, but there too, the updates should IMO be part >>> of the same insn_change group as the change to the store itself. >>> >>> In both cases, it's the: >>> >>> crtl->ssa->change_insns (changes); >>> >>> in pair_fusion_bb_info::fuse_pair that should be responsible for >>> updating the rtl-ssa IR. The changes that the pass wants to make >>> should be described as insn_changes and passed to change_insns. >>> >>> The reason for funneling all changes through change_insns is that >>> it allows rtl-ssa to maintain more complex data structures. Clients >>> aren't supposed to manually update the data structures piecemeal. >>> >> >> I am afraid I am not getting this. Would you mind elaborating on this? >> Sorry about that. > > See how fwprop.cc makes changes. It: > > - creates an insn_change for each change that it wants to make > > - uses insn_propagation to replace the old value with the new value > > - sets the new_uses of the insn_change to reflect the effect > of the propagation (in this case, replacing the old 128-bit > value with a 256-bit value) > > - uses change_insn to commit the change > > The process would be similar here. > Addressed in v7 of the patch. > Thanks, > Richard Thanks & Regards Ajit
diff --git a/gcc/config.gcc b/gcc/config.gcc index bc45615741b..12f79a78177 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -524,6 +524,7 @@ powerpc*-*-*) extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o" + extra_objs="${extra_objs} rs6000-mem-fusion.o" extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h" extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h" @@ -560,6 +561,7 @@ rs6000*-*-*) extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt" extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" + extra_objs="${extra_objs} rs6000-mem-fusion.o" target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc" target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc" ;; diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md index 04e2d0066df..88413926a02 100644 --- a/gcc/config/rs6000/mma.md +++ b/gcc/config/rs6000/mma.md @@ -294,7 +294,31 @@ (define_insn_and_split "*movoo" [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa") - (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))] + (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))] + "TARGET_MMA + && (gpc_reg_operand (operands[0], OOmode) + || gpc_reg_operand (operands[1], OOmode))" +;; "" + "@ + # + # + #" + "&& reload_completed" + [(const_int 0)] +{ + rs6000_split_multireg_move (operands[0], operands[1]); + DONE; +} + [(set_attr "type" "vecload,vecstore,veclogical") + (set_attr "length" "*,*,8")]) +;; (set_attr "max_prefixed_insns" "2,2,*")]) + + +(define_insn_and_split "*movoo1" + [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa") + (unspec [ + (match_operand:OO 1 "input_operand" "ZwO,wa,wa") + ] UNSPEC_LXVP))] 
"TARGET_MMA && (gpc_reg_operand (operands[0], OOmode) || gpc_reg_operand (operands[1], OOmode))" diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc b/gcc/config/rs6000/rs6000-mem-fusion.cc new file mode 100644 index 00000000000..b63b6f31001 --- /dev/null +++ b/gcc/config/rs6000/rs6000-mem-fusion.cc @@ -0,0 +1,708 @@ +/* Subroutines used to perform adjacent load/store into + paired memory accesses for TARGET_POWER10 and TARGET_VSX. + + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. 
*/ + +#define INCLUDE_ALGORITHM +#define INCLUDE_FUNCTIONAL +#define INCLUDE_LIST +#define INCLUDE_TYPE_TRAITS +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "df.h" +#include "rtl-iter.h" +#include "rtl-ssa.h" +#include "rtl-ssa/internals.h" +#include "rtl-ssa/internals.inl" +#include "cfgcleanup.h" +#include "tree-pass.h" +#include "pair-fusion.h" + +using namespace rtl_ssa; + +struct rs6000_pair_fusion : public pair_fusion +{ + bool fpsimd_op_p (rtx , machine_mode , bool) override final + { + return false; + } + + bool pair_mem_insn_p (rtx_insn *, bool &) override final + { + return false; + } + + void change_existing_multword_mode (rtx_insn *insn) override final; + + bool pair_mem_ok_with_policy (rtx, bool) override final + { + return true; + } + + bool pair_operand_mode_ok_p (machine_mode mode) override final; + + rtx gen_pair (rtx *pats, rtx, bool load_p) override final; + + bool pair_reg_operand_ok_p (bool, rtx, machine_mode) override final + { + return true; + } + + int pair_mem_alias_check_limit () override final + { + return 0; + } + + bool should_handle_writeback (enum writeback_type) override final + { + return false; + } + + bool track_loads_p () override final + { + return true; + } + + bool track_stores_p () override final + { + return true; + } + + bool pair_mem_in_range_p (HOST_WIDE_INT) override final + { + return true; + } + + rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) override final + { + return NULL_RTX; + } + + rtx destructure_pair (rtx_def **, rtx, bool) override final + { + return NULL_RTX; + } + + bool fuseable_store_p (insn_info *i1, insn_info *i2) override final; + + bool fuseable_load_p (insn_info *insn) override final; + + void set_multiword_subreg (insn_info *i1, insn_info *i2, + bool load_p) override final; + + void modify_new_rtx_insn (insn_info *first, obstack_watermark *attempt, + insn_change **pair_change, + auto_vec <insn_change *> &changes) override 
final; +}; + +bool +rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode) +{ + return (ALTIVEC_OR_VSX_VECTOR_MODE (mode)); +} + +void +rs6000_pair_fusion::change_existing_multword_mode (rtx_insn *insn) +{ + rtx set = single_set (insn); + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + rtx copy = NULL_RTX; + + if ((MEM_P (src) && GET_MODE (src) == OOmode) + || (MEM_P (dest) && GET_MODE (dest) == OOmode)) + { + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), + gen_rtvec (1, src), + UNSPEC_LXVP); + copy = gen_rtx_SET (dest, unspec); + rtx_insn *new_insn = emit_insn_after (copy, insn); + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn)); + df_insn_rescan (new_insn); + df_insn_delete (insn); + remove_insn (insn); + insn->set_deleted (); + } +} + +static void +update_change (set_info *set) +{ + if (!set->has_any_uses ()) + return; + + auto *use = *set->all_uses ().begin (); + do + { + auto *next_use = use->next_use (); + if (use->is_in_phi ()) + { + update_change (use->phi ()); + } + else + { + crtl->ssa->remove_use (use); + } + use = next_use; + } + while (use); +} + +void +rs6000_pair_fusion::modify_new_rtx_insn (insn_info *first, + obstack_watermark *attempt, + insn_change **pair_change, + auto_vec<insn_change *> &changes) +{ + for (insn_change *change : changes) + for (auto def : change->old_defs ()) + { + auto set = dyn_cast<set_info *> (def); + update_change (set); + } + + auto &new_defs = (*pair_change)->new_defs; + vec_rtx_properties properties; + properties.add_insn (first->rtl (), true); + // Build up the new list of definitions. 
+ for (rtx_obj_reference ref : properties.refs ()) + if (ref.is_write ()) + { + auto *set = crtl->ssa->allocate<set_info> (first, + full_register (ref.regno)); + if (set) + { + auto def = find_access (new_defs, ref.regno); + if (!def) + { + new_defs = insert_access (*attempt, set, + new_defs); + auto &m_temp_defs = crtl->ssa->get_m_temp_defs (); + m_temp_defs.safe_push (set); + } + } + } +} + +// df_insn_rescan dependent instruction where operands +// are reversed given insn_info INFO. +static void +set_rescan_load (insn_info *i1) +{ + for (auto def : i1->defs ()) + { + auto set = dyn_cast<set_info *> (def); + for (auto use : set->all_uses ()) + { + insn_info *info = use->insn (); + if (info && info->rtl ()) + { + rtx_insn *rtl_insn = info->rtl (); + df_insn_rescan (rtl_insn); + } + } + } +} + +// df_insn_rescan the def instruction where operands are reversed given INSN. +static bool +set_rescan_store (insn_info *insn) +{ + for (auto use : insn->uses()) + { + auto def = use->def (); + + if (!def) + return false; + + if (def->insn ()->is_artificial ()) + return false; + + if (def->insn () && def->insn ()->rtl () + && def->insn()->is_real ()) + { + rtx_insn *rtl_insn = def->insn ()->rtl (); + rtx set = single_set (rtl_insn); + + if (set == NULL_RTX) + return false; + df_insn_rescan (rtl_insn); + } + } + return true; +} + +// Check for feasibility of store to be fuseable or not. Return true if +// feasible otherwise false. +static bool +feasible_store_p (insn_info *insn) +{ + for (auto use : insn->uses ()) + { + auto def = use->def (); + + if (def->insn ()->is_artificial ()) + return false; + + if (def->insn () && def->insn ()->rtl () + && def->insn()->is_real ()) + { + rtx_insn *rtl_insn = def->insn ()->rtl (); + rtx set = single_set (rtl_insn); + + if (set == NULL_RTX) + return false; + + // Return false if dependent def is load. + // This is done as def instruction could be a fused load and + // to avoid already existing subreg (reg:OO R) offset. 
+ if (rtl_insn && MEM_P (SET_SRC (set))) + return false; + + // Return false if dependent def is store. + if (rtl_insn && MEM_P (SET_DEST (set))) + return false; + } + } + return true; +} + +// Check if store can be fuseable or not. Return true if fuseable otherwise +// false. +bool +rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2) +{ + rtx_insn *insn1 = i1->rtl (); + rtx_insn *insn2 = i2->rtl (); + rtx body = PATTERN (insn1); + rtx src_exp = SET_SRC (body); + rtx insn2_body = PATTERN (insn2); + rtx insn2_src_exp = SET_SRC (insn2_body); + + if (!(REG_P (src_exp) + && crtl->ssa->single_dominating_def (REGNO (src_exp)))) + return false; + + // This is done as def instruction could be a fused load and + // to avoid already existing subreg (reg:OO R) offset. + if (DF_REG_USE_COUNT (REGNO (src_exp)) > 1) + return false; + + // Return false if src of insn1 and src of insn2 are same. + if (src_exp == insn2_src_exp) + return false; + + // Return false if src of insn1 is subreg. + if (GET_CODE (src_exp) == SUBREG) + return false; + + // Return false if src of insn2 is subreg. + if (GET_CODE (insn2_src_exp) == SUBREG) + return false; + + if (!feasible_store_p (i1)) + return false;; + + if (!feasible_store_p (i2)) + return false; + + return true; +} + +// Set subreg for def of store INSN given rtx SRC instruction. 
+static void +set_store_subreg (insn_info *i1, rtx src, int regoff) +{ + for (auto use: i1->uses ()) + { + auto def = use->def (); + if (!def) + return; + + insn_info *info = def->insn (); + + if (info->is_artificial ()) + return; + + if (info && info->is_real ()) + { + rtx_insn *rtl_insn = info->rtl (); + rtx set = single_set (rtl_insn); + if (set == NULL_RTX) + return; + df_ref ref; + FOR_EACH_INSN_DEF (ref, rtl_insn) + { + rtx src_exp = SET_SRC (PATTERN (i1->rtl ())); + if (REG_P (src_exp) && DF_REF_REGNO (ref) == REGNO (src_exp)) + { + rtx *loc = DF_REF_LOC (ref); + if (GET_CODE (*loc) == SUBREG) + { + rtx src1 = simplify_gen_subreg (GET_MODE (*loc), + SUBREG_REG (src), + OOmode, + regoff); + *loc = copy_rtx (src1); + } + else + *loc = copy_rtx (src); + } + } + } + } +} + +// Check whether load can be fusable or not. +// Return true if fuseable otherwise false. +bool +rs6000_pair_fusion::fuseable_load_p (insn_info *i1) +{ + rtx_insn *insn = i1->rtl (); + rtx body = PATTERN (insn); + rtx dest_exp = SET_DEST (body); + + if (!(REG_P (dest_exp) + && crtl->ssa->single_dominating_def (REGNO (dest_exp)))) + return false; + return true; +} + +// Propagate insn I1 with new rtx NEW_DEST_EXP. +static void +propagate_insn (insn_info *i1, rtx new_dest_exp) +{ + df_ref ref; + FOR_EACH_INSN_DEF (ref, i1->rtl()) + { + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); + if (REG_P (dest_exp) + && DF_REF_REGNO (ref) == REGNO (dest_exp)) + { + rtx *loc = DF_REF_LOC (ref); + *loc = new_dest_exp; + } + } +} + +// Generate new reg rtx with copy of OLD_DEST for OOmode pair. 
+static rtx +new_reg_rtx (rtx old_dest) +{ + rtx new_dest_exp = gen_reg_rtx (OOmode); + ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest); + REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest); + REG_POINTER (new_dest_exp) = REG_POINTER (old_dest); + REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest); + max_regno = max_reg_num (); + return new_dest_exp; +} + +// Set subreg with use of INSN given SRC rtx instruction. +static void +set_load_subreg (insn_info *i1, rtx src) +{ + rtx set = single_set (i1->rtl()); + rtx old_dest = SET_DEST (set); + + for (auto def : i1->defs ()) + { + auto set = dyn_cast<set_info *> (def); + for (auto use : set->nondebug_insn_uses ()) + { + insn_info *info = use->insn (); + if (!info || !info->rtl ()) + continue; + + rtx_insn *rtl_insn = info->rtl (); + df_ref ref; + + FOR_EACH_INSN_USE (ref, rtl_insn) + { + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); + if (REG_P (dest_exp) + && DF_REF_REGNO (ref) == REGNO (dest_exp)) + { + rtx *loc = DF_REF_LOC (ref); + insn_propagation prop (rtl_insn, old_dest, src); + if (GET_CODE (*loc) == SUBREG) + { + if (!prop.apply_to_pattern (loc)) + { + if (dump_file != NULL) + { + fprintf (dump_file, + "Cannot propagate insn \n"); + print_rtl_single (dump_file, rtl_insn); + } + return; + } + } + else + *loc = copy_rtx (src); + } + } + } + } +} + +// Set subreg for OO mode store pair to generate registers in pairs +// given insn_info I1 and I2. 
+static void +set_multiword_subreg_store (insn_info *i1, insn_info *i2) +{ + rtx_insn *insn1 = i1->rtl (); + rtx_insn *insn2 = i2->rtl (); + rtx body = PATTERN (insn1); + rtx src_exp = SET_SRC (body); + rtx insn2_body = PATTERN (insn2); + rtx insn2_dest_exp = SET_DEST (insn2_body); + machine_mode mode = GET_MODE (src_exp); + int regoff; + rtx src; + rtx addr = XEXP (insn2_dest_exp, 0); + + PUT_MODE_RAW (src_exp, OOmode); + if (GET_CODE (addr) == PLUS + && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1))) + regoff = 16; + else + regoff = 0; + + src = simplify_gen_subreg (mode, + src_exp, GET_MODE (src_exp), + regoff); + + set_store_subreg (i1, src, regoff); + + int regoff1 = 0; + rtx src1; + + src1 = simplify_gen_subreg (mode, + src_exp, GET_MODE (src_exp), + regoff1); + + set_store_subreg (i2, src1, regoff1); + set_rescan_store (i1); + set_rescan_store (i2); + df_insn_rescan (insn1); +} + +// Set subreg for OO mode pair load to generate registers in pairs given +// insn_info I2 and I2. +static void +set_multiword_subreg_load (insn_info *i1, insn_info *i2) +{ + rtx_insn *insn1 = i1->rtl (); + rtx body = PATTERN (insn1); + rtx dest_exp = SET_DEST (body); + machine_mode mode = GET_MODE (dest_exp); + PUT_MODE_RAW (dest_exp, OOmode); + + int regoff = 0; + rtx src; + + src = simplify_gen_subreg (mode, + dest_exp, GET_MODE (dest_exp), + regoff); + + set_load_subreg (i2, src); + + int regoff1; + rtx src1; + + regoff1 = 16; + src1 = simplify_gen_subreg (mode, + dest_exp, GET_MODE (dest_exp), + regoff1); + set_load_subreg (i1, src1); + + set_rescan_load (i1); + set_rescan_load (i2); + df_insn_rescan (insn1); +} + +// Set subreg for OO mode pair load for existing subreg rtx to generate +// registers in pairs given insn_info I2 and I2. 
+static void +set_multiword_existing_subreg (insn_info *i1, insn_info *i2) +{ + rtx_insn *insn1 = i1->rtl (); + rtx body = PATTERN (insn1); + rtx dest_exp = SET_DEST (body); + machine_mode mode = GET_MODE (dest_exp); + int regoff1; + regoff1 = 16; + rtx new_dest_exp = new_reg_rtx (dest_exp); + + rtx src = simplify_gen_subreg (mode, + new_dest_exp, + OOmode, + regoff1); + + set_load_subreg (i1, src); + propagate_insn (i1, new_dest_exp); + + int regoff = 0; + rtx sset = single_set (i2->rtl ()); + rtx insn2_dest_exp = SET_DEST (sset); + machine_mode insn2_mode = GET_MODE (insn2_dest_exp); + + src = simplify_gen_subreg (insn2_mode, + new_dest_exp, + OOmode, + regoff); + + set_load_subreg (i2, src); + propagate_insn (i2, new_dest_exp); + + auto attempt = crtl->ssa->new_change_attempt (); + resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) }; + auto *set = crtl->ssa->allocate<set_info> (i1, resource); + if (set) + { + auto def = find_access (i1->defs (), REGNO (new_dest_exp)); + if (!def) + i1->defs() = insert_access (attempt, set, i1->defs()); + } + + set_rescan_load (i1); + set_rescan_load (i2); + df_insn_rescan (insn1); +} + +// Return true iff insn I1 has already existing subreg. +static bool +use_has_subreg_p (insn_info *i1) +{ + for (auto def : i1->defs ()) + { + auto set = dyn_cast<set_info *> (def); + for (auto use : set->nondebug_insn_uses ()) + { + insn_info *info = use->insn (); + if (info && info->rtl ()) + { + rtx_insn *rtl_insn = info->rtl (); + df_ref ref; + FOR_EACH_INSN_USE (ref, rtl_insn) + { + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); + if (REG_P (dest_exp) + && DF_REF_REGNO (ref) == REGNO (dest_exp)) + { + rtx *loc = DF_REF_LOC (ref); + if (GET_CODE (*loc) == SUBREG) + return true; + } + } + } + } + } + return false; +} + +// Set subreg for OO mode pair to generate sequential registers given +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false +// if store insn. 
+void +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2, + bool load_p) +{ + if (load_p) + { + bool i1_subreg_p = use_has_subreg_p (i1); + bool i2_subreg_p = use_has_subreg_p (i2); + + if (i1_subreg_p || i2_subreg_p) + set_multiword_existing_subreg (i1, i2); + else + set_multiword_subreg_load (i1, i2); + } + else + set_multiword_subreg_store (i1, i2); +} + +rtx +rs6000_pair_fusion::gen_pair (rtx *pats, rtx, bool load_p) +{ + rtx i1 = pats[0]; + rtx src_exp = SET_SRC (i1); + rtx dest_exp = SET_DEST (i1); + PUT_MODE_RAW (src_exp, OOmode); + PUT_MODE_RAW (dest_exp, OOmode); + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest_exp), + gen_rtvec (1, src_exp), + UNSPEC_LXVP); + rtx set = gen_rtx_SET (dest_exp, unspec); + if (dump_file) + { + if (load_p) + fprintf (dump_file, "lxv with lxvp "); + else + fprintf (dump_file, "stxv with stxvp "); + print_rtl_single (dump_file, set); + } + return set; +} + +const pass_data pass_data_mem_fusion = +{ + RTL_PASS, /* type */ + "mem_fusion", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_NONE, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_mem_fusion : public rtl_opt_pass +{ +public: + pass_mem_fusion (gcc::context *ctxt) + : rtl_opt_pass (pass_data_mem_fusion, ctxt) + {} + + opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);} + + /* opt_pass methods: */ + bool gate (function *) + { + return (optimize > 0 && TARGET_VSX && TARGET_POWER10); + } + + unsigned int execute (function *) final override + { + rs6000_pair_fusion pass; + pass.run (); + return 0; + } +}; // class pass_mem_fusion + +rtl_opt_pass * +make_pass_mem_fusion (gcc::context *ctxt) +{ + return new pass_mem_fusion (ctxt); +} diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def index 46a0d0b8c56..0b48f57014d 100644 --- a/gcc/config/rs6000/rs6000-passes.def 
+++ b/gcc/config/rs6000/rs6000-passes.def @@ -28,7 +28,9 @@ along with GCC; see the file COPYING3. If not see The power8 does not have instructions that automaticaly do the byte swaps for loads and stores. */ INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps); - + /* Pass to replace adjacent memory addresses lxv/stxv instruction with + lxvp/stxvp instruction. */ + INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion); /* Pass to do the PCREL_OPT optimization that combines the load of an external symbol's address along with a single load or store using that address as a base register. */ diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 09a57a806fa..1412b31c2eb 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -343,6 +343,7 @@ namespace gcc { class context; } class rtl_opt_pass; extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *); +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *); extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *); extern bool rs6000_sum_of_two_registers_p (const_rtx expr); extern bool rs6000_quadword_masked_address_p (const_rtx exp); diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 58553ff66f4..6da4e70973d 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -27428,7 +27428,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) reg_mode = word_mode; reg_mode_size = GET_MODE_SIZE (reg_mode); - gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode)); + gcc_assert (mode == OOmode + || reg_mode_size * nregs == GET_MODE_SIZE (mode)); /* TDmode residing in FP registers is special, since the ISA requires that the lower-numbered word of a register pair is always the most significant @@ -27475,6 +27476,11 @@ rs6000_split_multireg_move (rtx dst, rtx src) int reg_mode_nregs = hard_regno_nregs (reg, reg_mode); if (MEM_P (dst)) { + rtx addr = XEXP (dst, 0); + rtx opnd1 = NULL_RTX; + if (addr && 
GET_CODE (addr) == PLUS) + opnd1 = XEXP (addr,1); + unsigned offset = 0; unsigned size = GET_MODE_SIZE (reg_mode); @@ -27488,7 +27494,13 @@ rs6000_split_multireg_move (rtx dst, rtx src) { unsigned subreg = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); - rtx dst2 = adjust_address (dst, reg_mode, offset); + rtx dst2 = dst; + + if ((GET_CODE (addr) != PLUS + || (opnd1 && CONST_INT_P(opnd1)))) + dst2 = adjust_address (dst, reg_mode, offset); + else + PUT_MODE_RAW (dst, reg_mode); rtx src2 = gen_rtx_REG (reg_mode, reg + subreg); offset += size; emit_insn (gen_rtx_SET (dst2, src2)); @@ -27499,15 +27511,25 @@ rs6000_split_multireg_move (rtx dst, rtx src) if (MEM_P (src)) { + rtx addr = XEXP (src, 0); + rtx opnd1 = NULL_RTX; + if (addr && GET_CODE (addr) == PLUS) + opnd1 = XEXP (addr,1); + unsigned offset = 0; unsigned size = GET_MODE_SIZE (reg_mode); - for (int i = 0; i < nregs; i += reg_mode_nregs) + for (int i = nregs-1; i >= 0; i -= reg_mode_nregs) { unsigned subreg = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg); - rtx src2 = adjust_address (src, reg_mode, offset); + rtx src2 = src; + + if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1)))) + src2 = adjust_address (src, reg_mode, offset); + else + PUT_MODE_RAW (src2, reg_mode); offset += size; emit_insn (gen_rtx_SET (dst2, src2)); } @@ -27515,7 +27537,7 @@ rs6000_split_multireg_move (rtx dst, rtx src) /* If we are writing an accumulator register, we have to prime it after we've written it. 
*/ if (TARGET_MMA - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) + && REG_P (dst) && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) emit_insn (gen_mma_xxmtacc (dst, dst)); return; @@ -27608,9 +27630,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) { for (i = nregs - 1; i >= 0; i--) { - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); - emit_insn (gen_rtx_SET (dst_i, src_i)); + if (REG_P (dst) && REG_P (src)) + { + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); + emit_insn (gen_rtx_SET (dst_i, src_i)); + } } } else @@ -27625,7 +27650,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) /* If we are writing an accumulator register, we have to prime it after we've written it. */ if (TARGET_MMA - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) + && REG_P (dst) && GET_MODE (dst) == XOmode + && FP_REGNO_P (REGNO (dst))) emit_insn (gen_mma_xxmtacc (dst, dst)); } else @@ -27682,7 +27708,7 @@ rs6000_split_multireg_move (rtx dst, rtx src) /* If the base register we are using to address memory is also a destination reg, then change that register last. */ - if (REG_P (breg) + if (REG_P (dst) && REG_P (breg) && REGNO (breg) >= REGNO (dst) && REGNO (breg) < REGNO (dst) + nregs) j = REGNO (breg) - REGNO (dst); @@ -27780,9 +27806,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) /* XO/OO are opaque so cannot use subregs. 
*/ if (mode == OOmode || mode == XOmode ) { - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); - emit_insn (gen_rtx_SET (dst_i, src_i)); + if (REG_P (dst) && REG_P (src)) + { + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); + emit_insn (gen_rtx_SET (dst_i, src_i)); + } } else emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode, @@ -27800,7 +27829,9 @@ rs6000_split_multireg_move (rtx dst, rtx src) if (restore_basereg != NULL_RTX) emit_insn (restore_basereg); } + return; } + /* Return true if the peephole2 can combine a load involving a combination of an addis instruction and a load with an offset that can be fused together on diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index a5d20594789..2106e1a1fed 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -159,6 +159,7 @@ UNSPEC_XXSPLTIW_CONST UNSPEC_FMAX UNSPEC_FMIN + UNSPEC_LXVP ]) ;; diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index b3ce09d523b..df9b3a35b66 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -35,6 +35,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc $(COMPILE) $< $(POSTCOMPILE) +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc + $(COMPILE) $< + $(POSTCOMPILE) + + rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc $(COMPILE) $< $(POSTCOMPILE) diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc index 31d2c21c88f..ff77a0bc8c6 100644 --- a/gcc/pair-fusion.cc +++ b/gcc/pair-fusion.cc @@ -312,9 +312,9 @@ static int encode_lfs (lfs_fields fields) { int size_log2 = exact_log2 (fields.size); - gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4); - return ((int)fields.load_p << 3) - | ((int)fields.fpsimd_p << 2) + gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9); + return ((int)fields.load_p << 4) + | ((int)fields.fpsimd_p << 3) | (size_log2 
- 2); } @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields) static lfs_fields decode_lfs (int lfs) { - bool load_p = (lfs & (1 << 3)); - bool fpsimd_p = (lfs & (1 << 2)); + bool load_p = (lfs & (1 << 4)); + bool fpsimd_p = (lfs & (1 << 3)); unsigned size = 1U << ((lfs & 3) + 2); return { load_p, fpsimd_p, size }; } @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem) if (MEM_VOLATILE_P (mem)) return; + if (load_p && !m_pass->fuseable_load_p (insn)) + return; + // Ignore writeback accesses if the hook says to do so. if (!m_pass->should_handle_writeback (writeback_type::EXISTING) && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC) @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, } rtx reg_notes = combine_reg_notes (first, second, load_p); - + m_pass->set_multiword_subreg (i1, i2, load_p); rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p); insn_change *pair_change = nullptr; auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) { @@ -1833,6 +1836,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, pair_change->new_defs = merge_access_arrays (attempt, input_defs[0], input_defs[1]); + m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes); gcc_assert (pair_change->new_defs.is_valid ()); pair_change->new_uses @@ -2405,6 +2409,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size, reg_ops[i] = XEXP (pats[i], !load_p); } + if (!load_p && !m_pass->fuseable_store_p (i1, i2)) + { + if (dump_file) + fprintf (dump_file, + "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n", + insns[0]->uid (), insns[1]->uid ()); + return false; + } + if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1])) { if (dump_file) @@ -2997,6 +3010,8 @@ void pair_fusion::process_block (bb_info *bb) if (GET_CODE (pat) != SET) continue; + change_existing_multword_mode (rti); + if (track_stores && MEM_P (XEXP (pat, 0))) bb_state.track_access (insn, false, XEXP (pat, 
0)); else if (track_loads && MEM_P (XEXP (pat, 1))) diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h index 45e4edceecb..756357db794 100644 --- a/gcc/pair-fusion.h +++ b/gcc/pair-fusion.h @@ -26,8 +26,11 @@ namespace rtl_ssa { class insn_info; class insn_range_info; class bb_info; + class insn_change; } +class obstack_watermark; + // Information about a potential base candidate, used in try_fuse_pair. // There may be zero, one, or two viable RTL bases for a given pair. struct base_cand @@ -142,6 +145,19 @@ struct pair_fusion { // true iff INSN is a load pair. virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0; + // Given INSN change multiword mode load and store to respective + // unspec instruction. + virtual void change_existing_multword_mode (rtx_insn *insn) = 0; + + // Given INSN and watermark ATTEMPT and PAIR_CHANGE sets the + // new rtx with INSN. Remove all uses of definition that are + // removed given CHANGES. + virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first, + obstack_watermark *attempt, + rtl_ssa::insn_change **pair_change, + auto_vec<rtl_ssa::insn_change *> &changes) + = 0; + // Return true if we should track loads. virtual bool track_loads_p () { @@ -171,6 +187,24 @@ struct pair_fusion { virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem, rtx regs[2], bool load_p) = 0; + // Given insn_info pair I1 and I2, sets subreg with multiword registers + // to assign register pairs by allocators. + // LOAD_P is true iff the pair is a load. + virtual void set_multiword_subreg (rtl_ssa::insn_info *i1, + rtl_ssa::insn_info *i2, + bool load_p) = 0; + + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform + // store mem pairs. + // Return true if feasible to perform store mem pairs otherwise false. + virtual bool fuseable_store_p (rtl_ssa::insn_info *i1, + rtl_ssa::insn_info *i2) = 0; + + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform + // load mem pairs. 
+ // Return true if feasible to perform load mem pairs otherwise false. + virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0; + void process_block (rtl_ssa::bb_info *bb); rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2], const rtl_ssa::insn_range_info diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h index e2134621723..d5c5b80f8aa 100644 --- a/gcc/rtl-ssa/functions.h +++ b/gcc/rtl-ssa/functions.h @@ -222,6 +222,13 @@ public: template<typename T, typename... Ts> T *change_alloc (obstack_watermark &wm, Ts... args); + auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; } + + template<typename T, typename... Ts> + T *allocate (Ts... args); + + void remove_use (use_info *); + private: class bb_phi_info; class build_info; @@ -231,9 +238,6 @@ private: // allocate_temp during its lifetime. obstack_watermark temp_watermark () { return &m_temp_obstack; } - template<typename T, typename... Ts> - T *allocate (Ts... args); - template<typename T, typename... Ts> T *allocate_temp (Ts... 
args); @@ -269,7 +273,6 @@ private: static void insert_use_after (use_info *, use_info *); void add_use (use_info *); - void remove_use (use_info *); insn_info::order_node *need_order_node (insn_info *); diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C new file mode 100644 index 00000000000..d10ff0cdf36 --- /dev/null +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +#include <altivec.h> + +void +foo2 () +{ + __vector_quad *dst1; + __vector_quad *dst2; + vector unsigned char src; + __vector_quad acc; + vector unsigned char *ptr; + __builtin_mma_xvf32ger(&acc, src, ptr[0]); + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); + *dst1 = acc; + __builtin_mma_xvf32ger(&acc, src, ptr[2]); + __builtin_mma_xvf32gerpp(&acc, src, ptr[3]); + *dst2 = acc; +} +/* { dg-final { scan-assembler {\mlxvp\M} } } */ diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C b/gcc/testsuite/g++.target/powerpc/mem-fusion.C new file mode 100644 index 00000000000..c523572cf3c --- /dev/null +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +#include <altivec.h> + +void +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src) +{ + __vector_quad acc; + __builtin_mma_xvf32ger(&acc, src, ptr[0]); + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); + *dst = acc; +} +/* { dg-final { scan-assembler {\mlxvp\M} } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c index 69ee826e1be..ae29127f954 100644 --- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad 
*src, vec_t *vec) dst[13] = acc; } -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */ -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */ +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */ +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */ /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */ /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */ /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */