diff mbox series

[rs6000,middle-end] v7: Add implementation for different targets for pair mem fusion

Message ID d50b7ef6-bfa9-42ae-ac78-be3bc33f114d@linux.ibm.com
State New
Headers show
Series [rs6000,middle-end] v7: Add implementation for different targets for pair mem fusion | expand

Commit Message

Ajit Agarwal July 19, 2024, 9:16 a.m. UTC
Hello Richard:

All comments are addressed.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

2024-07-19  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>

gcc/ChangeLog:

	* config/rs6000/rs6000-passes.def: New mem fusion pass
	before pass_early_remat.
	* pair-fusion.h: Add additional pure virtual function
	required for rs6000 target implementation.
	* pair-fusion.cc: Use of virtual functions for additional
	virtual function addded for rs6000 target.
	* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
	Add target specific implementation for generic pure virtual
	functions.
	* config/rs6000/mma.md: Modify movoo machine description.
	Add new machine description movoo1.
	* config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
	to expand movoo machine description for all constraints.
	* config.gcc: Add new object file.
	* config/rs6000/rs6000-protos.h: Add new prototype for mem
	fusion pass.
	* config/rs6000/t-rs6000: Add new rule.
	* rtl-ssa/functions.h: Move out allocate function from private
	to public and add get_m_temp_defs function.

gcc/testsuite/ChangeLog:

	* g++.target/powerpc/mem-fusion.C: New test.
	* g++.target/powerpc/mem-fusion-1.C: New test.
	* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc                                |   2 +
 gcc/config/rs6000/mma.md                      |  26 +-
 gcc/config/rs6000/rs6000-mem-fusion.cc        | 746 ++++++++++++++++++
 gcc/config/rs6000/rs6000-passes.def           |   4 +-
 gcc/config/rs6000/rs6000-protos.h             |   1 +
 gcc/config/rs6000/rs6000.cc                   |  58 +-
 gcc/config/rs6000/rs6000.md                   |   1 +
 gcc/config/rs6000/t-rs6000                    |   5 +
 gcc/pair-fusion.cc                            |  32 +-
 gcc/pair-fusion.h                             |  48 ++
 gcc/rtl-ssa/functions.h                       |  11 +-
 .../g++.target/powerpc/mem-fusion-1.C         |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c        |   4 +-
 14 files changed, 946 insertions(+), 29 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

Comments

Richard Sandiford Aug. 12, 2024, noon UTC | #1
Ajit Agarwal <aagarwa1@linux.ibm.com> writes:
> [...]
> +static void
> +update_change (set_info *set)
> +{
> +  if (!set->has_any_uses ())
> +    return;
> +
> +  auto *use = *set->all_uses ().begin ();
> +  do
> +    {
> +      auto *next_use = use->next_use ();
> +      if (use->is_in_phi ())
> +	{
> +	  update_change (use->phi ());
> +	}
> +      else
> +	{
> +	  crtl->ssa->remove_use (use);

This isn't right.  AFAICT it's simply removing uses from a debug
instruction even though the debug instruction pattern still refers
to the register.

Instead, the debug instruction needs to be updated to refer to the
correct half of the new double-width load result.  If the update fails
for any reason, the debug insn should be "reset" (i.e. changed to

      INSN_VAR_LOCATION_LOC (use_rtl) = gen_rtx_UNKNOWN_VAR_LOC ();

see insn_combination::substitute_debug_use in late-combine.cc for
an example).

> +	}
> +      use = next_use;
> +    }
> +  while (use);
> +}
> [...]
> +// Generate new reg rtx with copy of OLD_DEST for OOmode pair.
> +static rtx
> +new_reg_rtx (rtx old_dest)
> +{
> +  rtx new_dest_exp = gen_reg_rtx (OOmode);
> +  ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest);
> +  REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest);
> +  REG_POINTER (new_dest_exp) = REG_POINTER (old_dest);
> +  REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest);
> +  max_regno = max_reg_num ();
> +  return new_dest_exp;
> +}

The new register is a different size and mode from OLD_DEST, so it isn't
appropriate to copy across this much information.  The caller should just
use gen_reg_rtx directly.

> +
> +// Set subreg with use of INSN given SRC rtx instruction.
> +static void
> +set_load_subreg (insn_info *i1, rtx src)
> +{
> +  rtx set = single_set (i1->rtl());
> +  rtx old_dest = SET_DEST (set);
> +
> +  for (auto def : i1->defs ())
> +    {
> +      auto set = dyn_cast<set_info *> (def);
> +      for (auto use : set->nondebug_insn_uses ())
> +	{
> +	  insn_info *info = use->insn ();
> +	  if (!info || !info->rtl ())
> +	    continue;
> +
> +	  rtx_insn *rtl_insn = info->rtl ();
> +	  df_ref ref;
> +
> +	  FOR_EACH_INSN_USE (ref, rtl_insn)
> +	    {
> +	      rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
> +	      if (REG_P (dest_exp)
> +		  && DF_REF_REGNO (ref) == REGNO (dest_exp))
> +		{
> +		  rtx *loc = DF_REF_LOC (ref);
> +		  insn_propagation prop (rtl_insn, old_dest, src);
> +		  if (GET_CODE (*loc) == SUBREG)
> +		    {
> +		      if (!prop.apply_to_pattern (loc))
> +			{
> +			  if (dump_file != NULL)
> +			    {
> +			      fprintf (dump_file,
> +				       "Cannot propagate insn \n");
> +			      print_rtl_single (dump_file, rtl_insn);

We can't simply ignore the failure, since it would leave the instruction
referring to the wrong register.  We either need to assert that the failure
can't happen (dangerous) or bail out of the load fusion if propagation fails.

> +			    }
> +			  return;
> +			}
> +		    }
> +		  else
> +		    *loc = copy_rtx (src);
> +		}
> +	    }
> +	}

We shouldn't have two ways of doing this.  The above FOR_EACH_INSN_USE
loop should be replaced by something like:

      insn_propagation prop (rtl_insn, old_dest, src);
      if (!prop.apply_to_pattern (loc))
        ...

so that the substitution is done in one step, and only done using
insn_propagation.

Like I mentioned before, this change needs to be described as an
insn_change as well, so that rtl-ssa framework will see it.  This is
important because later insn movement decisions require the def-use
information to be up-to-date.

> +    }
> +}
> +
> +// Set subreg for OO mode store pair to generate registers in pairs
> +// given insn_info I1 and I2.
> +static void
> +set_multiword_subreg_store (insn_info *i1, insn_info *i2)
> +{
> +  rtx_insn *insn1 = i1->rtl ();
> +  rtx_insn *insn2 = i2->rtl ();
> +  rtx body = PATTERN (insn1);
> +  rtx src_exp = SET_SRC (body);
> +  rtx insn2_body = PATTERN (insn2);
> +  rtx insn2_dest_exp = SET_DEST (insn2_body);
> +  machine_mode mode = GET_MODE (src_exp);
> +  int regoff;
> +  rtx src;
> +  rtx addr = XEXP (insn2_dest_exp, 0);
> +
> +  PUT_MODE_RAW (src_exp, OOmode);
> +  if (GET_CODE (addr) == PLUS
> +      && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
> +    regoff = 16;
> +  else
> +    regoff = 0;
> +
> +  src = simplify_gen_subreg (mode,
> +			     src_exp, GET_MODE (src_exp),
> +			     regoff);
> +
> +  set_store_subreg (i1, src, regoff);
> +
> +  int regoff1 = 0;
> +  rtx src1;
> +
> +  src1 = simplify_gen_subreg (mode,
> +			      src_exp, GET_MODE (src_exp),
> +			      regoff1);
> +
> +  set_store_subreg (i2, src1, regoff1);
> +  set_rescan_store (i1);
> +  set_rescan_store (i2);
> +  df_insn_rescan (insn1);
> +}
> +
> +// Return true iff insn I1 has already existing subreg.
> +static bool
> +use_has_subreg_p (insn_info *i1)
> +{
> +  for (auto def : i1->defs ())
> +    {
> +      auto set = dyn_cast<set_info *> (def);
> +      for (auto use : set->nondebug_insn_uses ())
> +	{
> +	  insn_info *info = use->insn ();
> +	  if (info && info->rtl ())
> +	    {
> +	      rtx_insn *rtl_insn = info->rtl ();
> +	      df_ref ref;
> +	      FOR_EACH_INSN_USE (ref, rtl_insn)
> +		{
> +		  rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
> +		  if (REG_P (dest_exp)
> +		      && DF_REF_REGNO (ref) == REGNO (dest_exp))
> +		    {
> +		      rtx *loc = DF_REF_LOC (ref);
> +		      if (GET_CODE (*loc) == SUBREG)
> +			return true;
> +		    }
> +		}
> +	     }
> +	}
> +    }
> +   return false;
> +}
> +
> +// Set subreg for OO mode pair load for existing subreg rtx to generate
> +// registers in pairs given insn_info I2 and I2.
> +static void
> +set_multiword_subreg_load (insn_info *i1, insn_info *i2)
> +
> +{
> +  rtx_insn *insn1 = i1->rtl ();
> +  rtx body = PATTERN (insn1);
> +  rtx dest_exp = SET_DEST (body);
> +  machine_mode mode = GET_MODE (dest_exp);
> +  int regoff1;
> +  regoff1 = 16;
> +
> +  bool subreg_p = false;
> +
> +  if (use_has_subreg_p (i1))
> +    subreg_p = true;

Like I said in the last review, I don't think we should have a separate
path for uses that involved subregs and uses that didn't.  What is currently
the "subreg_p" path should be the only path.  We should use it even if
there were no references involving subregs.

We shouldn't in general change the width of an existing pseudo register,
since it would invalidate REG_ATTRS and any debug information associated
with the register.  The rs6000 form of load fusion should always create
a fresh pseudo register for the double-width fused result.

> +
> +  rtx new_dest_exp = dest_exp;
> +
> +  if (subreg_p)
> +    new_dest_exp = new_reg_rtx (dest_exp);
> +  else
> +    PUT_MODE_RAW (new_dest_exp, OOmode);
> +
> +  rtx src = simplify_gen_subreg (mode,
> +				 new_dest_exp,
> +				 OOmode,
> +				 regoff1);
> +
> +  set_load_subreg (i1, src);
> +  if (subreg_p)
> +    propagate_insn (i1, new_dest_exp);
> +
> +  int regoff = 0;
> +  rtx sset = single_set (i2->rtl ());
> +  rtx insn2_dest_exp = SET_DEST (sset);
> +  machine_mode insn2_mode = GET_MODE (insn2_dest_exp);
> +
> +  src = simplify_gen_subreg (insn2_mode,
> +			     new_dest_exp,
> +			     OOmode,
> +			     regoff);
> +
> +  set_load_subreg (i2, src);
> +
> +  if (subreg_p)
> +    propagate_insn (i2, new_dest_exp);
> +
> +  if (subreg_p)
> +    {
> +      auto attempt = crtl->ssa->new_change_attempt ();
> +      resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) };
> +      auto *set = crtl->ssa->allocate<set_info> (i1, resource);
> +      if (set)
> +	{
> +	  auto def = find_access (i1->defs (), REGNO (new_dest_exp));
> +	  if (!def)
> +	    i1->defs() = insert_access (attempt, set, i1->defs());
> +	}

Changes to rtl-ssa insn_infos should always happen through insn_changes,
rather than by direct modification of insns.  This allows the framework
to do all the required book-keeping.  E.g. the above does not extend
the function_info::m_defs array.  (And it shouldn't extend that array
directly -- rtl-ssa should do that internally.)

We already have:

  function_info::create_set

which creates a temporary definition of something, for use in an
insn_change.  I'm not sure that it handles completely new registers,
because no-one's needed that yet, but it would be a reasonable thing
to add (in function_info::change_insns).

In summary:

- please always use a new pseudo register for the fused load result
- please always use insn_propagation to update uses of the load results
- please create an insn_change for each use, and add the changes to the
  existing list of insn_changes made by the main pair-fusion.cc code
- please don't try to change the rtl-ssa ir directly from rs6000 code;
  instead, describe the new instruction using insn_changes
- once we know that the fusion is going to go ahead, we should update
  all debug uses to refer to the new register half, in a similar way
  to late-combine's insn_combination::substitute_optional_uses
  (perhaps I should try to provide an abstraction around that).

Thanks,
Richard

> +    }
> +
> +  set_rescan_load (i1);
> +  set_rescan_load (i2);
> +  df_insn_rescan (insn1);
> +}
> +
> +// Set subreg for OO mode pair to generate sequential registers given
> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false
> +// if store insn.
> +void
> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2,
> +					  bool load_p)
> +{
> +  if (load_p)
> +    set_multiword_subreg_load (i1, i2);
> +  else
> +    set_multiword_subreg_store (i1, i2);
> +}
> +
> +rtx
> +rs6000_pair_fusion::gen_pair (rtx *pats, rtx,  bool load_p)
> +{
> +  rtx i1 = pats[0];
> +  rtx src_exp = SET_SRC (i1);
> +  rtx dest_exp = SET_DEST (i1);
> +  PUT_MODE_RAW (src_exp, OOmode);
> +  PUT_MODE_RAW (dest_exp, OOmode);
> +  rtx unspec  = gen_rtx_UNSPEC (GET_MODE (dest_exp),
> +				gen_rtvec (1, src_exp),
> +				UNSPEC_LXVP);
> +  rtx set =  gen_rtx_SET (dest_exp, unspec);
> +  if (dump_file)
> +    {
> +      if (load_p)
> +	fprintf (dump_file, "lxv with lxvp ");
> +      else
> +	fprintf (dump_file, "stxv with stxvp ");
> +      print_rtl_single (dump_file, set);
> +    }
> +  return set;
> +}
> +
> +const pass_data pass_data_mem_fusion =
> +{
> +  RTL_PASS, /* type */
> +  "mem_fusion", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_NONE, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_df_finish, /* todo_flags_finish */
> +};
> +
> +class pass_mem_fusion : public rtl_opt_pass
> +{
> +public:
> +  pass_mem_fusion (gcc::context *ctxt)
> +    : rtl_opt_pass (pass_data_mem_fusion, ctxt)
> +  {}
> +
> +  opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);}
> +
> +  /* opt_pass methods: */
> +  bool gate (function *)
> +    {
> +      return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
> +    }
> +
> +  unsigned int execute (function *) final override
> +    {
> +      rs6000_pair_fusion pass;
> +      pass.run ();
> +      return 0;
> +    }
> +}; // class pass_mem_fusion
> +
> +rtl_opt_pass *
> +make_pass_mem_fusion (gcc::context *ctxt)
> +{
> +  return new pass_mem_fusion (ctxt);
> +}
> diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def
> index 46a0d0b8c56..0b48f57014d 100644
> --- a/gcc/config/rs6000/rs6000-passes.def
> +++ b/gcc/config/rs6000/rs6000-passes.def
> @@ -28,7 +28,9 @@ along with GCC; see the file COPYING3.  If not see
>       The power8 does not have instructions that automaticaly do the byte swaps
>       for loads and stores.  */
>    INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
> -
> +  /* Pass to replace adjacent memory addresses lxv/stxv instruction with
> +     lxvp/stxvp instruction.  */
> +  INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion);
>    /* Pass to do the PCREL_OPT optimization that combines the load of an
>       external symbol's address along with a single load or store using that
>       address as a base register.  */
> diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
> index b40557a8557..5295ff66950 100644
> --- a/gcc/config/rs6000/rs6000-protos.h
> +++ b/gcc/config/rs6000/rs6000-protos.h
> @@ -342,6 +342,7 @@ namespace gcc { class context; }
>  class rtl_opt_pass;
>  
>  extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
> +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *);
>  extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
>  extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
>  extern bool rs6000_quadword_masked_address_p (const_rtx exp);
> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
> index 5ed64b1e686..449d9e446d5 100644
> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -27442,7 +27442,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>      reg_mode = word_mode;
>    reg_mode_size = GET_MODE_SIZE (reg_mode);
>  
> -  gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
> +  gcc_assert (mode == OOmode
> +	      || reg_mode_size * nregs == GET_MODE_SIZE (mode));
>  
>    /* TDmode residing in FP registers is special, since the ISA requires that
>       the lower-numbered word of a register pair is always the most significant
> @@ -27489,6 +27490,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>        int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
>        if (MEM_P (dst))
>  	{
> +	  rtx addr = XEXP (dst, 0);
> +	  rtx opnd1 = NULL_RTX;
> +	  if (addr && GET_CODE (addr) == PLUS)
> +	    opnd1 = XEXP (addr,1);
> +
>  	  unsigned offset = 0;
>  	  unsigned size = GET_MODE_SIZE (reg_mode);
>  
> @@ -27502,7 +27508,13 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>  	    {
>  	      unsigned subreg
>  		= WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
> -	      rtx dst2 = adjust_address (dst, reg_mode, offset);
> +	      rtx dst2 = dst;
> +
> +	      if ((GET_CODE (addr) != PLUS
> +		  || (opnd1 && CONST_INT_P(opnd1))))
> +		dst2 = adjust_address (dst, reg_mode, offset);
> +	      else
> +		PUT_MODE_RAW (dst, reg_mode);
>  	      rtx src2 = gen_rtx_REG (reg_mode, reg + subreg);
>  	      offset += size;
>  	      emit_insn (gen_rtx_SET (dst2, src2));
> @@ -27513,15 +27525,25 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>  
>        if (MEM_P (src))
>  	{
> +	  rtx  addr = XEXP (src, 0);
> +	  rtx opnd1 = NULL_RTX;
> +	  if (addr && GET_CODE (addr) == PLUS)
> +	    opnd1 = XEXP (addr,1);
> +
>  	  unsigned offset = 0;
>  	  unsigned size = GET_MODE_SIZE (reg_mode);
>  
> -	  for (int i = 0; i < nregs; i += reg_mode_nregs)
> +	  for (int i = nregs-1; i >= 0; i -= reg_mode_nregs)
>  	    {
>  	      unsigned subreg
>  		= WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
>  	      rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg);
> -	      rtx src2 = adjust_address (src, reg_mode, offset);
> +	      rtx src2 = src;
> +
> +	      if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1))))
> +		src2 = adjust_address (src, reg_mode, offset);
> +	      else
> +		PUT_MODE_RAW (src2, reg_mode);
>  	      offset += size;
>  	      emit_insn (gen_rtx_SET (dst2, src2));
>  	    }
> @@ -27529,7 +27551,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>  	  /* If we are writing an accumulator register, we have to
>  	     prime it after we've written it.  */
>  	  if (TARGET_MMA
> -	      && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
> +	      && REG_P (dst) && GET_MODE (dst) == XOmode
> +	      && FP_REGNO_P (REGNO (dst)))
>  	    emit_insn (gen_mma_xxmtacc (dst, dst));
>  
>  	  return;
> @@ -27622,9 +27645,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>  	{
>  	  for (i = nregs - 1; i >= 0; i--)
>  	    {
> -	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
> -	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
> -	      emit_insn (gen_rtx_SET (dst_i, src_i));
> +	      if (REG_P (dst) && REG_P (src))
> +		{
> +		  rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
> +		  rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
> +		  emit_insn (gen_rtx_SET (dst_i, src_i));
> +		}
>  	    }
>  	}
>        else
> @@ -27639,7 +27665,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>        /* If we are writing an accumulator register, we have to
>  	 prime it after we've written it.  */
>        if (TARGET_MMA
> -	  && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
> +	  && REG_P (dst) && GET_MODE (dst) == XOmode
> +	  && FP_REGNO_P (REGNO (dst)))
>  	emit_insn (gen_mma_xxmtacc (dst, dst));
>      }
>    else
> @@ -27696,7 +27723,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>  
>  	  /* If the base register we are using to address memory is
>  	     also a destination reg, then change that register last.  */
> -	  if (REG_P (breg)
> +	  if (REG_P (dst) && REG_P (breg)
>  	      && REGNO (breg) >= REGNO (dst)
>  	      && REGNO (breg) < REGNO (dst) + nregs)
>  	    j = REGNO (breg) - REGNO (dst);
> @@ -27794,9 +27821,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>  	  /* XO/OO are opaque so cannot use subregs. */
>  	  if (mode == OOmode || mode == XOmode )
>  	    {
> -	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
> -	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
> -	      emit_insn (gen_rtx_SET (dst_i, src_i));
> +	      if (REG_P (dst) &&  REG_P (src))
> +		{
> +		  rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
> +		  rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
> +		  emit_insn (gen_rtx_SET (dst_i, src_i));
> +		}
>  	    }
>  	  else
>  	    emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode,
> @@ -27814,7 +27844,9 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>        if (restore_basereg != NULL_RTX)
>  	emit_insn (restore_basereg);
>      }
> +  return;
>  }
> +
>  
>  /* Return true if the peephole2 can combine a load involving a combination of
>     an addis instruction and a load with an offset that can be fused together on
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index f59be536570..99f070ee24d 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -159,6 +159,7 @@
>     UNSPEC_XXSPLTIW_CONST
>     UNSPEC_FMAX
>     UNSPEC_FMIN
> +   UNSPEC_LXVP
>    ])
>  
>  ;;
> diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
> index 155788de40a..a052163eec4 100644
> --- a/gcc/config/rs6000/t-rs6000
> +++ b/gcc/config/rs6000/t-rs6000
> @@ -34,6 +34,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
>  	$(COMPILE) $<
>  	$(POSTCOMPILE)
>  
> +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc
> +	$(COMPILE) $<
> +	$(POSTCOMPILE)
> +
> +
>  rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
>  	$(COMPILE) $<
>  	$(POSTCOMPILE)
> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
> index 31d2c21c88f..b1521253474 100644
> --- a/gcc/pair-fusion.cc
> +++ b/gcc/pair-fusion.cc
> @@ -312,9 +312,9 @@ static int
>  encode_lfs (lfs_fields fields)
>  {
>    int size_log2 = exact_log2 (fields.size);
> -  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
> -  return ((int)fields.load_p << 3)
> -    | ((int)fields.fpsimd_p << 2)
> +  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9);
> +  return ((int)fields.load_p << 4)
> +    | ((int)fields.fpsimd_p << 3)
>      | (size_log2 - 2);
>  }
>  
> @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields)
>  static lfs_fields
>  decode_lfs (int lfs)
>  {
> -  bool load_p = (lfs & (1 << 3));
> -  bool fpsimd_p = (lfs & (1 << 2));
> +  bool load_p = (lfs & (1 << 4));
> +  bool fpsimd_p = (lfs & (1 << 3));
>    unsigned size = 1U << ((lfs & 3) + 2);
>    return { load_p, fpsimd_p, size };
>  }
> @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>    if (MEM_VOLATILE_P (mem))
>      return;
>  
> +  if (load_p && !m_pass->fuseable_load_p (insn))
> +    return;
> +
>    // Ignore writeback accesses if the hook says to do so.
>    if (!m_pass->should_handle_writeback (writeback_type::EXISTING)
>        && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
> @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>      }
>  
>    rtx reg_notes = combine_reg_notes (first, second, load_p);
> -
> +  m_pass->set_multiword_subreg (i1, i2, load_p);
>    rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p);
>    insn_change *pair_change = nullptr;
>    auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
> @@ -1828,18 +1831,18 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>        changes.safe_push (make_delete (first));
>        pair_change = make_change (second);
>        changes.safe_push (pair_change);
> -
>        pair_change->move_range = move_range;
>        pair_change->new_defs = merge_access_arrays (attempt,
>  						   input_defs[0],
>  						   input_defs[1]);
> +      m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes);
>        gcc_assert (pair_change->new_defs.is_valid ());
> -
>        pair_change->new_uses
>  	= merge_access_arrays (attempt,
>  			       drop_memory_access (input_uses[0]),
>  			       drop_memory_access (input_uses[1]));
>        gcc_assert (pair_change->new_uses.is_valid ());
> +      m_pass->set_uses (i1, i2, &pair_change, &attempt);
>        set_pair_pat (pair_change);
>      }
>    else
> @@ -1872,6 +1875,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>  						store_def,
>  						change->new_defs);
>  	      gcc_assert (change->new_defs.is_valid ());
> +	      m_pass->set_defs (i1, i2, &change, &attempt);
>  	      change->move_range = move_range;
>  	      pair_change = change;
>  	      break;
> @@ -1908,6 +1912,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>  	      change->new_defs = insert_access (attempt, new_set,
>  						change->new_defs);
>  	      gcc_assert (change->new_defs.is_valid ());
> +	      m_pass->set_defs (i1, i2, &change, &attempt);
>  	      pair_change = change;
>  	      break;
>  	    }
> @@ -2405,6 +2410,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>        reg_ops[i] = XEXP (pats[i], !load_p);
>      }
>  
> +  if (!load_p && !m_pass->fuseable_store_p (i1, i2))
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
> +		 insns[0]->uid (), insns[1]->uid ());
> +      return false;
> +    }
> +
>    if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>      {
>        if (dump_file)
> @@ -2997,6 +3011,8 @@ void pair_fusion::process_block (bb_info *bb)
>        if (GET_CODE (pat) != SET)
>  	continue;
>  
> +      change_existing_multword_mode (rti);
> +
>        if (track_stores && MEM_P (XEXP (pat, 0)))
>  	bb_state.track_access (insn, false, XEXP (pat, 0));
>        else if (track_loads && MEM_P (XEXP (pat, 1)))
> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
> index 45e4edceecb..b3af49de892 100644
> --- a/gcc/pair-fusion.h
> +++ b/gcc/pair-fusion.h
> @@ -26,8 +26,12 @@ namespace rtl_ssa {
>    class insn_info;
>    class insn_range_info;
>    class bb_info;
> +  class insn_change;
> +  class insn_range_info;
>  }
>  
> +class obstack_watermark;
> +
>  // Information about a potential base candidate, used in try_fuse_pair.
>  // There may be zero, one, or two viable RTL bases for a given pair.
>  struct base_cand
> @@ -142,6 +146,19 @@ struct pair_fusion {
>    // true iff INSN is a load pair.
>    virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0;
>  
> +  // Given INSN change multiword mode load and store to respective
> +  // unspec instruction.
> +  virtual void change_existing_multword_mode (rtx_insn *insn) = 0;
> +
> +  // Given INSN and watermark ATTEMPT and PAIR_CHANGE sets the
> +  // new rtx with INSN.  Remove all uses of definition that are
> +  // removed given CHANGES.
> +  virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first,
> +				    obstack_watermark *attempt,
> +				    rtl_ssa::insn_change **pair_change,
> +				    auto_vec<rtl_ssa::insn_change *> &changes)
> +				    = 0;
> +
>    // Return true if we should track loads.
>    virtual bool track_loads_p ()
>    {
> @@ -171,6 +188,37 @@ struct pair_fusion {
>    virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
>  					  rtx regs[2], bool load_p) = 0;
>  
> +  // Given insn_info pair I1 and I2, sets subreg with multiword registers
> +  // to assign register pairs by allocators.
> +  // LOAD_P is true iff the pair is a load.
> +  virtual void set_multiword_subreg (rtl_ssa::insn_info *i1,
> +				     rtl_ssa::insn_info *i2,
> +				     bool load_p) = 0;
> +
> +  // Given insn_info pair I1 and I2, sets uses in PAIR_CHANGE.
> +  virtual void set_uses (rtl_ssa::insn_info *i1,
> +			 rtl_ssa::insn_info *i2,
> +			 rtl_ssa::insn_change **pair_change,
> +			 obstack_watermark *attempt) = 0;
> +
> +  // Given insn_info pair I1 and I2, sets defss in PAIR_CHANGE.
> +  virtual void set_defs (rtl_ssa::insn_info *i1,
> +			 rtl_ssa::insn_info *i2,
> +			 rtl_ssa::insn_change **pair_change,
> +			 obstack_watermark *attempt) = 0;
> +
> +
> +  // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
> +  // store mem pairs.
> +  // Return true if feasible to perform store mem pairs otherwise false.
> +  virtual bool fuseable_store_p (rtl_ssa::insn_info *i1,
> +				 rtl_ssa::insn_info *i2) = 0;
> +
> +  // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
> +  // load mem pairs.
> +  // Return true if feasible to perform load mem pairs otherwise false.
> +  virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0;
> +
>    void process_block (rtl_ssa::bb_info *bb);
>    rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2],
>  					 const rtl_ssa::insn_range_info
> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h
> index 8be04f1aa96..27a732b7353 100644
> --- a/gcc/rtl-ssa/functions.h
> +++ b/gcc/rtl-ssa/functions.h
> @@ -222,6 +222,13 @@ public:
>    template<typename T, typename... Ts>
>    T *change_alloc (obstack_watermark &wm, Ts... args);
>  
> +  auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; }
> +
> +  template<typename T, typename... Ts>
> +  T *allocate (Ts... args);
> +
> +  void remove_use (use_info *);
> +
>  private:
>    class bb_phi_info;
>    class build_info;
> @@ -231,9 +238,6 @@ private:
>    // allocate_temp during its lifetime.
>    obstack_watermark temp_watermark () { return &m_temp_obstack; }
>  
> -  template<typename T, typename... Ts>
> -  T *allocate (Ts... args);
> -
>    template<typename T, typename... Ts>
>    T *allocate_temp (Ts... args);
>  
> @@ -269,7 +273,6 @@ private:
>    static void insert_use_after (use_info *, use_info *);
>  
>    void add_use (use_info *);
> -  void remove_use (use_info *);
>  
>    insn_info::order_node *need_order_node (insn_info *);
>  
> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
> new file mode 100644
> index 00000000000..d10ff0cdf36
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +#include <altivec.h>
> +	
> +void
> +foo2 ()
> +{
> +  __vector_quad *dst1;
> +  __vector_quad *dst2;
> +  vector unsigned char src;
> +  __vector_quad acc;
> +  vector unsigned char *ptr;
> +  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
> +  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
> +  *dst1 = acc;
> +  __builtin_mma_xvf32ger(&acc, src, ptr[2]);
> +  __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
> +  *dst2 = acc;
> +}
> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
> new file mode 100644
> index 00000000000..c523572cf3c
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */ 
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ 
> +
> +#include <altivec.h>
> +
> +void
> +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src)
> +{
> +  __vector_quad acc;
> +  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
> +  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
> +  *dst = acc;
> +}
> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
> index 69ee826e1be..ae29127f954 100644
> --- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
> +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
> @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
>    dst[13] = acc;
>  }
>  
> -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
> -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
> +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
> +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
>  /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
>  /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
>  /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */
Ajit Agarwal Aug. 14, 2024, 2:42 p.m. UTC | #2
Hello Richard:

On 12/08/24 5:30 pm, Richard Sandiford wrote:
> Ajit Agarwal <aagarwa1@linux.ibm.com> writes:
>> [...]
>> +static void
>> +update_change (set_info *set)
>> +{
>> +  if (!set->has_any_uses ())
>> +    return;
>> +
>> +  auto *use = *set->all_uses ().begin ();
>> +  do
>> +    {
>> +      auto *next_use = use->next_use ();
>> +      if (use->is_in_phi ())
>> +	{
>> +	  update_change (use->phi ());
>> +	}
>> +      else
>> +	{
>> +	  crtl->ssa->remove_use (use);
> 
> This isn't right.  AFAICT it's simply removing uses from a debug
> instruction even though the debug instruction pattern still refers
> to the register.
> 
> Instead, the debug instruction needs to be updated to refer to the
> correct half of the new double-width load result.  If the update fails
> for any reason, the debug insn should be "reset" (i.e. changed to
> 
>       INSN_VAR_LOCATION_LOC (use_rtl) = gen_rtx_UNKNOWN_VAR_LOC ();
> 
> see insn_combination::substitute_debug_use in late-combine.cc for
> an example).
> 

Addressed in v8 of the patch.

>> +	}
>> +      use = next_use;
>> +    }
>> +  while (use);
>> +}
>> [...]
>> +// Generate new reg rtx with copy of OLD_DEST for OOmode pair.
>> +static rtx
>> +new_reg_rtx (rtx old_dest)
>> +{
>> +  rtx new_dest_exp = gen_reg_rtx (OOmode);
>> +  ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest);
>> +  REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest);
>> +  REG_POINTER (new_dest_exp) = REG_POINTER (old_dest);
>> +  REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest);
>> +  max_regno = max_reg_num ();
>> +  return new_dest_exp;
>> +}
> 
> The new register is a different size and mode from OLD_DEST, so it isn't
> appropriate to copy across this much information.  The caller should just
> use gen_reg_rtx directly.
>

Addressed in v8 of the patch.
 
>> +
>> +// Set subreg with use of INSN given SRC rtx instruction.
>> +static void
>> +set_load_subreg (insn_info *i1, rtx src)
>> +{
>> +  rtx set = single_set (i1->rtl());
>> +  rtx old_dest = SET_DEST (set);
>> +
>> +  for (auto def : i1->defs ())
>> +    {
>> +      auto set = dyn_cast<set_info *> (def);
>> +      for (auto use : set->nondebug_insn_uses ())
>> +	{
>> +	  insn_info *info = use->insn ();
>> +	  if (!info || !info->rtl ())
>> +	    continue;
>> +
>> +	  rtx_insn *rtl_insn = info->rtl ();
>> +	  df_ref ref;
>> +
>> +	  FOR_EACH_INSN_USE (ref, rtl_insn)
>> +	    {
>> +	      rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
>> +	      if (REG_P (dest_exp)
>> +		  && DF_REF_REGNO (ref) == REGNO (dest_exp))
>> +		{
>> +		  rtx *loc = DF_REF_LOC (ref);
>> +		  insn_propagation prop (rtl_insn, old_dest, src);
>> +		  if (GET_CODE (*loc) == SUBREG)
>> +		    {
>> +		      if (!prop.apply_to_pattern (loc))
>> +			{
>> +			  if (dump_file != NULL)
>> +			    {
>> +			      fprintf (dump_file,
>> +				       "Cannot propagate insn \n");
>> +			      print_rtl_single (dump_file, rtl_insn);
> 
> We can't simply ignore the failure, since it would leave the instruction
> referring to the wrong register.  We either need to assert that the failure
> can't happen (dangerous) or bail out of the load fusion if propagation fails.
> 

Addressed in v8 of the patch.

>> +			    }
>> +			  return;
>> +			}
>> +		    }
>> +		  else
>> +		    *loc = copy_rtx (src);
>> +		}
>> +	    }
>> +	}
> 
> We shouldn't have two ways of doing this.  The above FOR_EACH_INSN_USE
> loop should be replaced by something like:
> 
>       insn_propagation prop (rtl_insn, old_dest, src);
>       if (!prop.apply_to_pattern (loc))
>         ...
> 
> so that the substitution is done in one step, and only done using
> insn_propagation.
> 
> Like I mentioned before, this change needs to be described as an
> insn_change as well, so that rtl-ssa framework will see it.  This is
> important because later insn movement decisions require the def-use
> information to be up-to-date.
> 

Addressed in v8 of the patch.

>> +    }
>> +}
>> +
>> +// Set subreg for OO mode store pair to generate registers in pairs
>> +// given insn_info I1 and I2.
>> +static void
>> +set_multiword_subreg_store (insn_info *i1, insn_info *i2)
>> +{
>> +  rtx_insn *insn1 = i1->rtl ();
>> +  rtx_insn *insn2 = i2->rtl ();
>> +  rtx body = PATTERN (insn1);
>> +  rtx src_exp = SET_SRC (body);
>> +  rtx insn2_body = PATTERN (insn2);
>> +  rtx insn2_dest_exp = SET_DEST (insn2_body);
>> +  machine_mode mode = GET_MODE (src_exp);
>> +  int regoff;
>> +  rtx src;
>> +  rtx addr = XEXP (insn2_dest_exp, 0);
>> +
>> +  PUT_MODE_RAW (src_exp, OOmode);
>> +  if (GET_CODE (addr) == PLUS
>> +      && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
>> +    regoff = 16;
>> +  else
>> +    regoff = 0;
>> +
>> +  src = simplify_gen_subreg (mode,
>> +			     src_exp, GET_MODE (src_exp),
>> +			     regoff);
>> +
>> +  set_store_subreg (i1, src, regoff);
>> +
>> +  int regoff1 = 0;
>> +  rtx src1;
>> +
>> +  src1 = simplify_gen_subreg (mode,
>> +			      src_exp, GET_MODE (src_exp),
>> +			      regoff1);
>> +
>> +  set_store_subreg (i2, src1, regoff1);
>> +  set_rescan_store (i1);
>> +  set_rescan_store (i2);
>> +  df_insn_rescan (insn1);
>> +}
>> +
>> +// Return true iff insn I1 has already existing subreg.
>> +static bool
>> +use_has_subreg_p (insn_info *i1)
>> +{
>> +  for (auto def : i1->defs ())
>> +    {
>> +      auto set = dyn_cast<set_info *> (def);
>> +      for (auto use : set->nondebug_insn_uses ())
>> +	{
>> +	  insn_info *info = use->insn ();
>> +	  if (info && info->rtl ())
>> +	    {
>> +	      rtx_insn *rtl_insn = info->rtl ();
>> +	      df_ref ref;
>> +	      FOR_EACH_INSN_USE (ref, rtl_insn)
>> +		{
>> +		  rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
>> +		  if (REG_P (dest_exp)
>> +		      && DF_REF_REGNO (ref) == REGNO (dest_exp))
>> +		    {
>> +		      rtx *loc = DF_REF_LOC (ref);
>> +		      if (GET_CODE (*loc) == SUBREG)
>> +			return true;
>> +		    }
>> +		}
>> +	     }
>> +	}
>> +    }
>> +   return false;
>> +}
>> +
>> +// Set subreg for OO mode pair load for existing subreg rtx to generate
>> +// registers in pairs given insn_info I2 and I2.
>> +static void
>> +set_multiword_subreg_load (insn_info *i1, insn_info *i2)
>> +
>> +{
>> +  rtx_insn *insn1 = i1->rtl ();
>> +  rtx body = PATTERN (insn1);
>> +  rtx dest_exp = SET_DEST (body);
>> +  machine_mode mode = GET_MODE (dest_exp);
>> +  int regoff1;
>> +  regoff1 = 16;
>> +
>> +  bool subreg_p = false;
>> +
>> +  if (use_has_subreg_p (i1))
>> +    subreg_p = true;
> 
> Like I said in the last review, I don't think we should have a separate
> path for uses that involved subregs and uses that didn't.  What is currently
> the "subreg_p" path should be the only path.  We should use it even if
> there were no references involving subregs.
> 
> We shouldn't in general change the width of an existing pseudo register,
> since it would invalidate REG_ATTRS and any debug information associated
> with the register.  The rs6000 form of load fusion should always create
> a fresh pseudo register for the double-width fused result.
>

Addressed in v8 of the patch.
 
>> +
>> +  rtx new_dest_exp = dest_exp;
>> +
>> +  if (subreg_p)
>> +    new_dest_exp = new_reg_rtx (dest_exp);
>> +  else
>> +    PUT_MODE_RAW (new_dest_exp, OOmode);
>> +
>> +  rtx src = simplify_gen_subreg (mode,
>> +				 new_dest_exp,
>> +				 OOmode,
>> +				 regoff1);
>> +
>> +  set_load_subreg (i1, src);
>> +  if (subreg_p)
>> +    propagate_insn (i1, new_dest_exp);
>> +
>> +  int regoff = 0;
>> +  rtx sset = single_set (i2->rtl ());
>> +  rtx insn2_dest_exp = SET_DEST (sset);
>> +  machine_mode insn2_mode = GET_MODE (insn2_dest_exp);
>> +
>> +  src = simplify_gen_subreg (insn2_mode,
>> +			     new_dest_exp,
>> +			     OOmode,
>> +			     regoff);
>> +
>> +  set_load_subreg (i2, src);
>> +
>> +  if (subreg_p)
>> +    propagate_insn (i2, new_dest_exp);
>> +
>> +  if (subreg_p)
>> +    {
>> +      auto attempt = crtl->ssa->new_change_attempt ();
>> +      resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) };
>> +      auto *set = crtl->ssa->allocate<set_info> (i1, resource);
>> +      if (set)
>> +	{
>> +	  auto def = find_access (i1->defs (), REGNO (new_dest_exp));
>> +	  if (!def)
>> +	    i1->defs() = insert_access (attempt, set, i1->defs());
>> +	}
> 
> Changes to rtl-ssa insn_infos should always happen through insn_changes,
> rather than by direct modification of insns.  This allows the framework
> to do all the required book-keeping.  E.g. the above does not extend
> the function_info::m_defs array.  (And it shouldn't extend that array
> directly -- rtl-ssa should do that internally.)
> 
> We already have:
> 
>   function_info::create_set
> 
> which creates a temporary definition of something, for use in an
> insn_change.  I'm not sure that it handles completely new registers,
> because no-one's needed that yet, but it would be a reasonable thing
> to add (in function_info::change_insns).
> 
> In summary:
> 
> - please always use a new pseudo register for the fused load result
> - please always use insn_propagation to update uses of the load results
> - please create an insn_change for each use, and add the changes to the
>   existing list of insn_changes made by the main pair-fusion.cc code
> - please don't try to change the rtl-ssa ir directly from rs6000 code;
>   instead, describe the new instruction using insn_changes
> - once we know that the fusion is going to go ahead, we should update
>   all debug uses to refer to the new register half, in a similar way
>   to late-combine's insn_combination::substitute_optional_uses
>   (perhaps I should try to provide an abstraction around that).
> 

Addressed in v8 of the patch.

> Thanks,
> Richard
>

Thanks & Regards
Ajit
 
>> +    }
>> +
>> +  set_rescan_load (i1);
>> +  set_rescan_load (i2);
>> +  df_insn_rescan (insn1);
>> +}
>> +
>> +// Set subreg for OO mode pair to generate sequential registers given
>> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false
>> +// if store insn.
>> +void
>> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2,
>> +					  bool load_p)
>> +{
>> +  if (load_p)
>> +    set_multiword_subreg_load (i1, i2);
>> +  else
>> +    set_multiword_subreg_store (i1, i2);
>> +}
>> +
>> +rtx
>> +rs6000_pair_fusion::gen_pair (rtx *pats, rtx,  bool load_p)
>> +{
>> +  rtx i1 = pats[0];
>> +  rtx src_exp = SET_SRC (i1);
>> +  rtx dest_exp = SET_DEST (i1);
>> +  PUT_MODE_RAW (src_exp, OOmode);
>> +  PUT_MODE_RAW (dest_exp, OOmode);
>> +  rtx unspec  = gen_rtx_UNSPEC (GET_MODE (dest_exp),
>> +				gen_rtvec (1, src_exp),
>> +				UNSPEC_LXVP);
>> +  rtx set =  gen_rtx_SET (dest_exp, unspec);
>> +  if (dump_file)
>> +    {
>> +      if (load_p)
>> +	fprintf (dump_file, "lxv with lxvp ");
>> +      else
>> +	fprintf (dump_file, "stxv with stxvp ");
>> +      print_rtl_single (dump_file, set);
>> +    }
>> +  return set;
>> +}
>> +
>> +const pass_data pass_data_mem_fusion =
>> +{
>> +  RTL_PASS, /* type */
>> +  "mem_fusion", /* name */
>> +  OPTGROUP_NONE, /* optinfo_flags */
>> +  TV_NONE, /* tv_id */
>> +  0, /* properties_required */
>> +  0, /* properties_provided */
>> +  0, /* properties_destroyed */
>> +  0, /* todo_flags_start */
>> +  TODO_df_finish, /* todo_flags_finish */
>> +};
>> +
>> +class pass_mem_fusion : public rtl_opt_pass
>> +{
>> +public:
>> +  pass_mem_fusion (gcc::context *ctxt)
>> +    : rtl_opt_pass (pass_data_mem_fusion, ctxt)
>> +  {}
>> +
>> +  opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);}
>> +
>> +  /* opt_pass methods: */
>> +  bool gate (function *)
>> +    {
>> +      return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
>> +    }
>> +
>> +  unsigned int execute (function *) final override
>> +    {
>> +      rs6000_pair_fusion pass;
>> +      pass.run ();
>> +      return 0;
>> +    }
>> +}; // class pass_mem_fusion
>> +
>> +rtl_opt_pass *
>> +make_pass_mem_fusion (gcc::context *ctxt)
>> +{
>> +  return new pass_mem_fusion (ctxt);
>> +}
>> diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def
>> index 46a0d0b8c56..0b48f57014d 100644
>> --- a/gcc/config/rs6000/rs6000-passes.def
>> +++ b/gcc/config/rs6000/rs6000-passes.def
>> @@ -28,7 +28,9 @@ along with GCC; see the file COPYING3.  If not see
>>       The power8 does not have instructions that automaticaly do the byte swaps
>>       for loads and stores.  */
>>    INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
>> -
>> +  /* Pass to replace adjacent memory addresses lxv/stxv instruction with
>> +     lxvp/stxvp instruction.  */
>> +  INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion);
>>    /* Pass to do the PCREL_OPT optimization that combines the load of an
>>       external symbol's address along with a single load or store using that
>>       address as a base register.  */
>> diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
>> index b40557a8557..5295ff66950 100644
>> --- a/gcc/config/rs6000/rs6000-protos.h
>> +++ b/gcc/config/rs6000/rs6000-protos.h
>> @@ -342,6 +342,7 @@ namespace gcc { class context; }
>>  class rtl_opt_pass;
>>  
>>  extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
>> +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *);
>>  extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
>>  extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
>>  extern bool rs6000_quadword_masked_address_p (const_rtx exp);
>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
>> index 5ed64b1e686..449d9e446d5 100644
>> --- a/gcc/config/rs6000/rs6000.cc
>> +++ b/gcc/config/rs6000/rs6000.cc
>> @@ -27442,7 +27442,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>      reg_mode = word_mode;
>>    reg_mode_size = GET_MODE_SIZE (reg_mode);
>>  
>> -  gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
>> +  gcc_assert (mode == OOmode
>> +	      || reg_mode_size * nregs == GET_MODE_SIZE (mode));
>>  
>>    /* TDmode residing in FP registers is special, since the ISA requires that
>>       the lower-numbered word of a register pair is always the most significant
>> @@ -27489,6 +27490,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>        int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
>>        if (MEM_P (dst))
>>  	{
>> +	  rtx addr = XEXP (dst, 0);
>> +	  rtx opnd1 = NULL_RTX;
>> +	  if (addr && GET_CODE (addr) == PLUS)
>> +	    opnd1 = XEXP (addr,1);
>> +
>>  	  unsigned offset = 0;
>>  	  unsigned size = GET_MODE_SIZE (reg_mode);
>>  
>> @@ -27502,7 +27508,13 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>  	    {
>>  	      unsigned subreg
>>  		= WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
>> -	      rtx dst2 = adjust_address (dst, reg_mode, offset);
>> +	      rtx dst2 = dst;
>> +
>> +	      if ((GET_CODE (addr) != PLUS
>> +		  || (opnd1 && CONST_INT_P(opnd1))))
>> +		dst2 = adjust_address (dst, reg_mode, offset);
>> +	      else
>> +		PUT_MODE_RAW (dst, reg_mode);
>>  	      rtx src2 = gen_rtx_REG (reg_mode, reg + subreg);
>>  	      offset += size;
>>  	      emit_insn (gen_rtx_SET (dst2, src2));
>> @@ -27513,15 +27525,25 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>  
>>        if (MEM_P (src))
>>  	{
>> +	  rtx  addr = XEXP (src, 0);
>> +	  rtx opnd1 = NULL_RTX;
>> +	  if (addr && GET_CODE (addr) == PLUS)
>> +	    opnd1 = XEXP (addr,1);
>> +
>>  	  unsigned offset = 0;
>>  	  unsigned size = GET_MODE_SIZE (reg_mode);
>>  
>> -	  for (int i = 0; i < nregs; i += reg_mode_nregs)
>> +	  for (int i = nregs-1; i >= 0; i -= reg_mode_nregs)
>>  	    {
>>  	      unsigned subreg
>>  		= WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
>>  	      rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg);
>> -	      rtx src2 = adjust_address (src, reg_mode, offset);
>> +	      rtx src2 = src;
>> +
>> +	      if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1))))
>> +		src2 = adjust_address (src, reg_mode, offset);
>> +	      else
>> +		PUT_MODE_RAW (src2, reg_mode);
>>  	      offset += size;
>>  	      emit_insn (gen_rtx_SET (dst2, src2));
>>  	    }
>> @@ -27529,7 +27551,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>  	  /* If we are writing an accumulator register, we have to
>>  	     prime it after we've written it.  */
>>  	  if (TARGET_MMA
>> -	      && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
>> +	      && REG_P (dst) && GET_MODE (dst) == XOmode
>> +	      && FP_REGNO_P (REGNO (dst)))
>>  	    emit_insn (gen_mma_xxmtacc (dst, dst));
>>  
>>  	  return;
>> @@ -27622,9 +27645,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>  	{
>>  	  for (i = nregs - 1; i >= 0; i--)
>>  	    {
>> -	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
>> -	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
>> -	      emit_insn (gen_rtx_SET (dst_i, src_i));
>> +	      if (REG_P (dst) && REG_P (src))
>> +		{
>> +		  rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
>> +		  rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
>> +		  emit_insn (gen_rtx_SET (dst_i, src_i));
>> +		}
>>  	    }
>>  	}
>>        else
>> @@ -27639,7 +27665,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>        /* If we are writing an accumulator register, we have to
>>  	 prime it after we've written it.  */
>>        if (TARGET_MMA
>> -	  && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
>> +	  && REG_P (dst) && GET_MODE (dst) == XOmode
>> +	  && FP_REGNO_P (REGNO (dst)))
>>  	emit_insn (gen_mma_xxmtacc (dst, dst));
>>      }
>>    else
>> @@ -27696,7 +27723,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>  
>>  	  /* If the base register we are using to address memory is
>>  	     also a destination reg, then change that register last.  */
>> -	  if (REG_P (breg)
>> +	  if (REG_P (dst) && REG_P (breg)
>>  	      && REGNO (breg) >= REGNO (dst)
>>  	      && REGNO (breg) < REGNO (dst) + nregs)
>>  	    j = REGNO (breg) - REGNO (dst);
>> @@ -27794,9 +27821,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>  	  /* XO/OO are opaque so cannot use subregs. */
>>  	  if (mode == OOmode || mode == XOmode )
>>  	    {
>> -	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
>> -	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
>> -	      emit_insn (gen_rtx_SET (dst_i, src_i));
>> +	      if (REG_P (dst) &&  REG_P (src))
>> +		{
>> +		  rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
>> +		  rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
>> +		  emit_insn (gen_rtx_SET (dst_i, src_i));
>> +		}
>>  	    }
>>  	  else
>>  	    emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode,
>> @@ -27814,7 +27844,9 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>        if (restore_basereg != NULL_RTX)
>>  	emit_insn (restore_basereg);
>>      }
>> +  return;
>>  }
>> +
>>  
>>  /* Return true if the peephole2 can combine a load involving a combination of
>>     an addis instruction and a load with an offset that can be fused together on
>> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
>> index f59be536570..99f070ee24d 100644
>> --- a/gcc/config/rs6000/rs6000.md
>> +++ b/gcc/config/rs6000/rs6000.md
>> @@ -159,6 +159,7 @@
>>     UNSPEC_XXSPLTIW_CONST
>>     UNSPEC_FMAX
>>     UNSPEC_FMIN
>> +   UNSPEC_LXVP
>>    ])
>>  
>>  ;;
>> diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
>> index 155788de40a..a052163eec4 100644
>> --- a/gcc/config/rs6000/t-rs6000
>> +++ b/gcc/config/rs6000/t-rs6000
>> @@ -34,6 +34,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
>>  	$(COMPILE) $<
>>  	$(POSTCOMPILE)
>>  
>> +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc
>> +	$(COMPILE) $<
>> +	$(POSTCOMPILE)
>> +
>> +
>>  rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
>>  	$(COMPILE) $<
>>  	$(POSTCOMPILE)
>> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
>> index 31d2c21c88f..b1521253474 100644
>> --- a/gcc/pair-fusion.cc
>> +++ b/gcc/pair-fusion.cc
>> @@ -312,9 +312,9 @@ static int
>>  encode_lfs (lfs_fields fields)
>>  {
>>    int size_log2 = exact_log2 (fields.size);
>> -  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
>> -  return ((int)fields.load_p << 3)
>> -    | ((int)fields.fpsimd_p << 2)
>> +  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9);
>> +  return ((int)fields.load_p << 4)
>> +    | ((int)fields.fpsimd_p << 3)
>>      | (size_log2 - 2);
>>  }
>>  
>> @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields)
>>  static lfs_fields
>>  decode_lfs (int lfs)
>>  {
>> -  bool load_p = (lfs & (1 << 3));
>> -  bool fpsimd_p = (lfs & (1 << 2));
>> +  bool load_p = (lfs & (1 << 4));
>> +  bool fpsimd_p = (lfs & (1 << 3));
>>    unsigned size = 1U << ((lfs & 3) + 2);
>>    return { load_p, fpsimd_p, size };
>>  }
>> @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>>    if (MEM_VOLATILE_P (mem))
>>      return;
>>  
>> +  if (load_p && !m_pass->fuseable_load_p (insn))
>> +    return;
>> +
>>    // Ignore writeback accesses if the hook says to do so.
>>    if (!m_pass->should_handle_writeback (writeback_type::EXISTING)
>>        && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
>> @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>>      }
>>  
>>    rtx reg_notes = combine_reg_notes (first, second, load_p);
>> -
>> +  m_pass->set_multiword_subreg (i1, i2, load_p);
>>    rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p);
>>    insn_change *pair_change = nullptr;
>>    auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
>> @@ -1828,18 +1831,18 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>>        changes.safe_push (make_delete (first));
>>        pair_change = make_change (second);
>>        changes.safe_push (pair_change);
>> -
>>        pair_change->move_range = move_range;
>>        pair_change->new_defs = merge_access_arrays (attempt,
>>  						   input_defs[0],
>>  						   input_defs[1]);
>> +      m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes);
>>        gcc_assert (pair_change->new_defs.is_valid ());
>> -
>>        pair_change->new_uses
>>  	= merge_access_arrays (attempt,
>>  			       drop_memory_access (input_uses[0]),
>>  			       drop_memory_access (input_uses[1]));
>>        gcc_assert (pair_change->new_uses.is_valid ());
>> +      m_pass->set_uses (i1, i2, &pair_change, &attempt);
>>        set_pair_pat (pair_change);
>>      }
>>    else
>> @@ -1872,6 +1875,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>>  						store_def,
>>  						change->new_defs);
>>  	      gcc_assert (change->new_defs.is_valid ());
>> +	      m_pass->set_defs (i1, i2, &change, &attempt);
>>  	      change->move_range = move_range;
>>  	      pair_change = change;
>>  	      break;
>> @@ -1908,6 +1912,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>>  	      change->new_defs = insert_access (attempt, new_set,
>>  						change->new_defs);
>>  	      gcc_assert (change->new_defs.is_valid ());
>> +	      m_pass->set_defs (i1, i2, &change, &attempt);
>>  	      pair_change = change;
>>  	      break;
>>  	    }
>> @@ -2405,6 +2410,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>>        reg_ops[i] = XEXP (pats[i], !load_p);
>>      }
>>  
>> +  if (!load_p && !m_pass->fuseable_store_p (i1, i2))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
>>    if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>>      {
>>        if (dump_file)
>> @@ -2997,6 +3011,8 @@ void pair_fusion::process_block (bb_info *bb)
>>        if (GET_CODE (pat) != SET)
>>  	continue;
>>  
>> +      change_existing_multword_mode (rti);
>> +
>>        if (track_stores && MEM_P (XEXP (pat, 0)))
>>  	bb_state.track_access (insn, false, XEXP (pat, 0));
>>        else if (track_loads && MEM_P (XEXP (pat, 1)))
>> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
>> index 45e4edceecb..b3af49de892 100644
>> --- a/gcc/pair-fusion.h
>> +++ b/gcc/pair-fusion.h
>> @@ -26,8 +26,12 @@ namespace rtl_ssa {
>>    class insn_info;
>>    class insn_range_info;
>>    class bb_info;
>> +  class insn_change;
>> +  class insn_range_info;
>>  }
>>  
>> +class obstack_watermark;
>> +
>>  // Information about a potential base candidate, used in try_fuse_pair.
>>  // There may be zero, one, or two viable RTL bases for a given pair.
>>  struct base_cand
>> @@ -142,6 +146,19 @@ struct pair_fusion {
>>    // true iff INSN is a load pair.
>>    virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0;
>>  
>> +  // Given INSN change multiword mode load and store to respective
>> +  // unspec instruction.
>> +  virtual void change_existing_multword_mode (rtx_insn *insn) = 0;
>> +
>> +  // Given INSN and watermark ATTEMPT and PAIR_CHANGE sets the
>> +  // new rtx with INSN.  Remove all uses of definition that are
>> +  // removed given CHANGES.
>> +  virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first,
>> +				    obstack_watermark *attempt,
>> +				    rtl_ssa::insn_change **pair_change,
>> +				    auto_vec<rtl_ssa::insn_change *> &changes)
>> +				    = 0;
>> +
>>    // Return true if we should track loads.
>>    virtual bool track_loads_p ()
>>    {
>> @@ -171,6 +188,37 @@ struct pair_fusion {
>>    virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
>>  					  rtx regs[2], bool load_p) = 0;
>>  
>> +  // Given insn_info pair I1 and I2, sets subreg with multiword registers
>> +  // to assign register pairs by allocators.
>> +  // LOAD_P is true iff the pair is a load.
>> +  virtual void set_multiword_subreg (rtl_ssa::insn_info *i1,
>> +				     rtl_ssa::insn_info *i2,
>> +				     bool load_p) = 0;
>> +
>> +  // Given insn_info pair I1 and I2, sets uses in PAIR_CHANGE.
>> +  virtual void set_uses (rtl_ssa::insn_info *i1,
>> +			 rtl_ssa::insn_info *i2,
>> +			 rtl_ssa::insn_change **pair_change,
>> +			 obstack_watermark *attempt) = 0;
>> +
>> +  // Given insn_info pair I1 and I2, sets defss in PAIR_CHANGE.
>> +  virtual void set_defs (rtl_ssa::insn_info *i1,
>> +			 rtl_ssa::insn_info *i2,
>> +			 rtl_ssa::insn_change **pair_change,
>> +			 obstack_watermark *attempt) = 0;
>> +
>> +
>> +  // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
>> +  // store mem pairs.
>> +  // Return true if feasible to perform store mem pairs otherwise false.
>> +  virtual bool fuseable_store_p (rtl_ssa::insn_info *i1,
>> +				 rtl_ssa::insn_info *i2) = 0;
>> +
>> +  // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
>> +  // load mem pairs.
>> +  // Return true if feasible to perform load mem pairs otherwise false.
>> +  virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0;
>> +
>>    void process_block (rtl_ssa::bb_info *bb);
>>    rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2],
>>  					 const rtl_ssa::insn_range_info
>> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h
>> index 8be04f1aa96..27a732b7353 100644
>> --- a/gcc/rtl-ssa/functions.h
>> +++ b/gcc/rtl-ssa/functions.h
>> @@ -222,6 +222,13 @@ public:
>>    template<typename T, typename... Ts>
>>    T *change_alloc (obstack_watermark &wm, Ts... args);
>>  
>> +  auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; }
>> +
>> +  template<typename T, typename... Ts>
>> +  T *allocate (Ts... args);
>> +
>> +  void remove_use (use_info *);
>> +
>>  private:
>>    class bb_phi_info;
>>    class build_info;
>> @@ -231,9 +238,6 @@ private:
>>    // allocate_temp during its lifetime.
>>    obstack_watermark temp_watermark () { return &m_temp_obstack; }
>>  
>> -  template<typename T, typename... Ts>
>> -  T *allocate (Ts... args);
>> -
>>    template<typename T, typename... Ts>
>>    T *allocate_temp (Ts... args);
>>  
>> @@ -269,7 +273,6 @@ private:
>>    static void insert_use_after (use_info *, use_info *);
>>  
>>    void add_use (use_info *);
>> -  void remove_use (use_info *);
>>  
>>    insn_info::order_node *need_order_node (insn_info *);
>>  
>> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>> new file mode 100644
>> index 00000000000..d10ff0cdf36
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>> @@ -0,0 +1,22 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target power10_ok } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
>> +
>> +#include <altivec.h>
>> +	
>> +void
>> +foo2 ()
>> +{
>> +  __vector_quad *dst1;
>> +  __vector_quad *dst2;
>> +  vector unsigned char src;
>> +  __vector_quad acc;
>> +  vector unsigned char *ptr;
>> +  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
>> +  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
>> +  *dst1 = acc;
>> +  __builtin_mma_xvf32ger(&acc, src, ptr[2]);
>> +  __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
>> +  *dst2 = acc;
>> +}
>> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
>> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
>> new file mode 100644
>> index 00000000000..c523572cf3c
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
>> @@ -0,0 +1,15 @@
>> +/* { dg-do compile } */ 
>> +/* { dg-require-effective-target power10_ok } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ 
>> +
>> +#include <altivec.h>
>> +
>> +void
>> +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src)
>> +{
>> +  __vector_quad acc;
>> +  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
>> +  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
>> +  *dst = acc;
>> +}
>> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>> index 69ee826e1be..ae29127f954 100644
>> --- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>> @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
>>    dst[13] = acc;
>>  }
>>  
>> -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
>> -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
>> +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
>> +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
>>  /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
>>  /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
>>  /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */
diff mbox series

Patch

diff --git a/gcc/config.gcc b/gcc/config.gcc
index bc45615741b..12f79a78177 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@  powerpc*-*-*)
 	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
 	extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
 	extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+	extra_objs="${extra_objs} rs6000-mem-fusion.o"
 	extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
 	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
 	extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@  rs6000*-*-*)
 	extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
 	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
 	extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+	extra_objs="${extra_objs} rs6000-mem-fusion.o"
 	target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc"
 	target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
 	;;
diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 04e2d0066df..88413926a02 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -294,7 +294,31 @@ 
 
 (define_insn_and_split "*movoo"
   [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
-	(match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+        (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+  "TARGET_MMA
+   && (gpc_reg_operand (operands[0], OOmode)
+       || gpc_reg_operand (operands[1], OOmode))"
+;;    ""
+  "@
+   #
+   #
+   #"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecstore,veclogical")
+   (set_attr "length" "*,*,8")])
+;;   (set_attr "max_prefixed_insns" "2,2,*")])
+
+
+(define_insn_and_split "*movoo1"
+  [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
+        (unspec [
+          (match_operand:OO 1 "input_operand" "ZwO,wa,wa")
+        ] UNSPEC_LXVP))]
   "TARGET_MMA
    && (gpc_reg_operand (operands[0], OOmode)
        || gpc_reg_operand (operands[1], OOmode))"
diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc b/gcc/config/rs6000/rs6000-mem-fusion.cc
new file mode 100644
index 00000000000..47fa4e05d20
--- /dev/null
+++ b/gcc/config/rs6000/rs6000-mem-fusion.cc
@@ -0,0 +1,746 @@ 
+/* Subroutines used to perform adjacent load/store into
+   paired memory accesses for TARGET_POWER10 and TARGET_VSX.
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_LIST
+#define INCLUDE_TYPE_TRAITS
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "df.h"
+#include "rtl-iter.h"
+#include "rtl-ssa.h"
+#include "rtl-ssa/internals.h"
+#include "rtl-ssa/internals.inl"
+#include "cfgcleanup.h"
+#include "tree-pass.h"
+#include "pair-fusion.h"
+
+using namespace rtl_ssa;
+
+struct rs6000_pair_fusion : public pair_fusion
+{
+  bool fpsimd_op_p (rtx , machine_mode , bool)  override final
+  {
+    return false;
+  }
+
+  bool pair_mem_insn_p (rtx_insn *, bool &) override final
+  {
+    return false;
+  }
+
+  void change_existing_multword_mode (rtx_insn *insn) override final;
+
+  bool pair_mem_ok_with_policy (rtx, bool) override final
+  {
+    return true;
+  }
+
+  bool pair_operand_mode_ok_p (machine_mode mode) override final;
+
+  rtx gen_pair (rtx *pats, rtx, bool load_p) override final;
+
+  bool pair_reg_operand_ok_p (bool, rtx, machine_mode) override final
+  {
+    return true;
+  }
+
+  int pair_mem_alias_check_limit () override final
+  {
+    return 0;
+  }
+
+  bool should_handle_writeback (enum writeback_type) override final
+  {
+    return false;
+  }
+
+  bool track_loads_p () override final
+  {
+    return true;
+  }
+
+  bool track_stores_p () override final
+  {
+    return true;
+  }
+
+  bool pair_mem_in_range_p (HOST_WIDE_INT) override final
+  {
+    return true;
+  }
+
+  rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) override final
+  {
+    return NULL_RTX;
+  }
+
+  rtx destructure_pair (rtx_def **, rtx, bool) override final
+  {
+    return NULL_RTX;
+  }
+
+  bool fuseable_store_p (insn_info *i1, insn_info *i2) override final;
+
+  bool fuseable_load_p (insn_info *insn) override final;
+
+  void set_multiword_subreg (insn_info *i1, insn_info *i2,
+			     bool load_p) override final;
+
+  void  set_uses (insn_info *i1, insn_info *i2,
+		  insn_change **pair_change,
+		  obstack_watermark *attempt) override final;
+
+  void  set_defs (insn_info *i1, insn_info *i2,
+		  insn_change **pair_change,
+		  obstack_watermark *attempt) override final;
+
+
+  void modify_new_rtx_insn (insn_info *first, obstack_watermark *attempt,
+			    insn_change **pair_change,
+			    auto_vec <insn_change *> &changes) override final;
+};
+
+bool
+rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
+{
+  return (ALTIVEC_OR_VSX_VECTOR_MODE (mode));
+}
+
+void
+rs6000_pair_fusion::change_existing_multword_mode (rtx_insn *insn)
+{
+  rtx set = single_set (insn);
+  rtx src = SET_SRC (set);
+  rtx dest = SET_DEST (set);
+  rtx copy = NULL_RTX;
+
+  if ((MEM_P (src) && GET_MODE (src) == OOmode)
+       || (MEM_P (dest) && GET_MODE (dest) == OOmode))
+    {
+      rtx unspec  = gen_rtx_UNSPEC (GET_MODE (dest),
+				    gen_rtvec (1, src),
+				    UNSPEC_LXVP);
+      copy =  gen_rtx_SET (dest, unspec);
+      rtx_insn *new_insn = emit_insn_after (copy, insn);
+      set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
+      df_insn_rescan (new_insn);
+      df_insn_delete (insn);
+      remove_insn (insn);
+      insn->set_deleted ();
+    }
+}
+
+static void
+update_change (set_info *set)
+{
+  if (!set->has_any_uses ())
+    return;
+
+  auto *use = *set->all_uses ().begin ();
+  do
+    {
+      auto *next_use = use->next_use ();
+      if (use->is_in_phi ())
+	{
+	  update_change (use->phi ());
+	}
+      else
+	{
+	  crtl->ssa->remove_use (use);
+	}
+      use = next_use;
+    }
+  while (use);
+}
+
+void
+rs6000_pair_fusion::modify_new_rtx_insn (insn_info *first,
+					 obstack_watermark *attempt,
+					 insn_change **pair_change,
+					 auto_vec<insn_change *> &changes)
+{
+  for (insn_change *change : changes)
+    for (auto def : change->old_defs ())
+      {
+	auto set = dyn_cast<set_info *> (def);
+	update_change (set);
+      }
+
+  auto &new_defs = (*pair_change)->new_defs;
+  vec_rtx_properties properties;
+  properties.add_insn (first->rtl (), true);
+  // Build up the new list of definitions.
+  for (rtx_obj_reference ref : properties.refs ())
+    if (ref.is_write ())
+      {
+	auto *set = crtl->ssa->allocate<set_info> (first,
+						   full_register (ref.regno));
+	if (set)
+	  {
+	    auto def = find_access (new_defs, ref.regno);
+	    if (!def)
+	      {
+		new_defs = insert_access (*attempt, set,
+					  new_defs);
+		auto &m_temp_defs = crtl->ssa->get_m_temp_defs ();
+		m_temp_defs.safe_push (set);
+	      }
+	  }
+      }
+}
+
+//Given insn I1, PAIR_CHANGE and obstack_watermark ATTEMPT sets new
+//uses in insn_change.
+static void
+set_uses_load (insn_info *i1,insn_change **pair_change,
+	       obstack_watermark *attempt)
+{
+  for (auto def : i1->defs ())
+    {
+      auto set = dyn_cast<set_info *> (def);
+      for (auto use : set->all_uses ())
+	{
+	  insn_change *change = *pair_change;
+	  change->new_uses = insert_access (*attempt, use, change->new_uses);
+	}
+    }
+}
+
+void rs6000_pair_fusion::set_uses (insn_info *i1,
+				   insn_info *i2,
+				   insn_change **pair_change,
+				   obstack_watermark *attempt)
+{
+
+  set_uses_load (i1, pair_change, attempt);
+  set_uses_load (i2, pair_change, attempt);
+}
+
+// df_insn_rescan dependent instruction where operands
+// are reversed given insn_info INFO.
+static void
+set_rescan_load (insn_info *i1)
+{
+  for (auto def : i1->defs ())
+    {
+      auto set = dyn_cast<set_info *> (def);
+      for (auto use : set->all_uses ())
+	{
+	  insn_info *info = use->insn ();
+	  if (info && info->rtl ())
+	    {
+	      rtx_insn *rtl_insn = info->rtl ();
+	      df_insn_rescan (rtl_insn);
+	    }
+	}
+    }
+}
+
+//Given insn INSN, PAIR_CHANGE and obstack_watermark ATTEMPT sets new
+//defs in insn_change.
+static bool
+set_defs_store (insn_info *insn,
+		insn_change **pair_change,
+		obstack_watermark *attempt)
+{
+  for (auto use : insn->uses())
+    {
+      auto def = use->def ();
+
+      if (!def)
+	return false;
+
+      if (def->insn ()->is_artificial ())
+	{
+	  insn_change *change = *pair_change;
+	  change->new_defs = insert_access (*attempt, def, change->new_defs);
+	}
+    }
+  return true;
+}
+
+void rs6000_pair_fusion::set_defs(insn_info *i1,
+				  insn_info *i2,
+				  insn_change **pair_change,
+				  obstack_watermark *attempt)
+{
+  set_defs_store (i1, pair_change, attempt);
+  set_defs_store (i2, pair_change, attempt);
+}
+
+// df_insn_rescan the def instruction where operands are reversed given INSN.
+static bool
+set_rescan_store (insn_info *insn)
+{
+  for (auto use : insn->uses())
+    {
+      auto def = use->def ();
+
+      if (!def)
+	return false;
+
+      if (def->insn ()->is_artificial ())
+	return false;
+
+      if (def->insn () && def->insn ()->rtl ()
+	  && def->insn()->is_real ())
+	{
+	  rtx_insn *rtl_insn = def->insn ()->rtl ();
+	  rtx set = single_set (rtl_insn);
+
+	  if (set == NULL_RTX)
+	    return false;
+	  df_insn_rescan (rtl_insn);
+	}
+    }
+  return true;
+}
+
+// Check for feasibility of store to be fuseable or not. Return true if
+// feasible otherwise false.
+static bool
+feasible_store_p (insn_info *insn)
+{
+  for (auto use : insn->uses ())
+    {
+      auto def = use->def ();
+
+      if (def->insn ()->is_artificial ())
+	return false;
+
+      if (def->insn () && def->insn ()->rtl ()
+	  && def->insn()->is_real ())
+	{
+	  rtx_insn *rtl_insn = def->insn ()->rtl ();
+	  rtx set = single_set (rtl_insn);
+
+	  if (set == NULL_RTX)
+	    return false;
+
+	  // Return false if dependent def is load.
+	  // This is done as def instruction could be a fused load and
+	  // to avoid already existing subreg (reg:OO R) offset.
+	  if (rtl_insn && MEM_P (SET_SRC (set)))
+	    return false;
+
+	  // Return false if dependent def is store.
+	  if (rtl_insn && MEM_P (SET_DEST (set)))
+	    return false;
+	}
+    }
+  return true;
+}
+
+// Check if store can be fuseable or not.  Return true if fuseable otherwise
+// false.
+bool
+rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2)
+{
+  rtx_insn *insn1 = i1->rtl ();
+  rtx_insn *insn2 = i2->rtl ();
+  rtx body = PATTERN (insn1);
+  rtx src_exp = SET_SRC (body);
+  rtx insn2_body = PATTERN (insn2);
+  rtx insn2_src_exp = SET_SRC (insn2_body);
+
+  if (!(REG_P (src_exp)
+      && crtl->ssa->single_dominating_def (REGNO (src_exp))))
+    return false;
+
+  if (!(REG_P (insn2_src_exp)
+      && crtl->ssa->single_dominating_def (REGNO (insn2_src_exp))))
+    return false;
+
+  // This is done as def instruction could be a fused load and
+  // to avoid  already existing subreg (reg:OO R) offset.
+  if (DF_REG_USE_COUNT (REGNO (src_exp)) > 1)
+    return false;
+
+  // Return false if src of insn1 and src of insn2 are same.
+  if (src_exp == insn2_src_exp)
+    return false;
+
+  if (!feasible_store_p (i1))
+    return false;;
+
+  if (!feasible_store_p (i2))
+    return false;
+
+  return true;
+}
+
+// Set subreg for def of store INSN given rtx SRC instruction.
+static void
+set_store_subreg (insn_info *i1, rtx src, int regoff)
+{
+  for (auto use: i1->uses ())
+    {
+      auto def = use->def ();
+      if (!def)
+	return;
+
+      insn_info *info = def->insn ();
+
+      if (info->is_artificial ())
+	return;
+
+      if (info && info->is_real ())
+	{
+	  rtx_insn *rtl_insn = info->rtl ();
+	  rtx set = single_set (rtl_insn);
+	  if (set == NULL_RTX)
+	    return;
+	  df_ref ref;
+	  FOR_EACH_INSN_DEF (ref, rtl_insn)
+	    {
+	      rtx src_exp = SET_SRC (PATTERN (i1->rtl ()));
+	      if (REG_P (src_exp) && DF_REF_REGNO (ref) == REGNO (src_exp))
+		{
+		  rtx *loc = DF_REF_LOC (ref);
+		  if (GET_CODE (*loc) == SUBREG)
+		    {
+		      src = simplify_gen_subreg (GET_MODE (*loc),
+						      SUBREG_REG (src),
+						      OOmode,
+						      regoff);
+		    }
+		  *loc =  copy_rtx (src);
+		}
+	    }
+	}
+    }
+}
+
+// Check whether load can be fusable or not.
+// Return true if fuseable otherwise false.
+bool
+rs6000_pair_fusion::fuseable_load_p (insn_info *i1)
+{
+  rtx_insn *insn = i1->rtl ();
+  rtx body = PATTERN (insn);
+  rtx dest_exp = SET_DEST (body);
+
+  if (!(REG_P (dest_exp)
+      && crtl->ssa->single_dominating_def (REGNO (dest_exp))))
+    return false;
+  return true;
+}
+
+// Propagate insn I1 with new rtx NEW_DEST_EXP.
+static void
+propagate_insn (insn_info *i1, rtx new_dest_exp)
+{
+  df_ref ref;
+  FOR_EACH_INSN_DEF (ref, i1->rtl())
+    {
+      rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
+      if (REG_P (dest_exp)
+	  && DF_REF_REGNO (ref) == REGNO (dest_exp))
+	{
+	  rtx *loc = DF_REF_LOC (ref);
+	  *loc = new_dest_exp;
+	}
+     }
+}
+
+// Generate new reg rtx with copy of OLD_DEST for OOmode pair.
+static rtx
+new_reg_rtx (rtx old_dest)
+{
+  rtx new_dest_exp = gen_reg_rtx (OOmode);
+  ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest);
+  REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest);
+  REG_POINTER (new_dest_exp) = REG_POINTER (old_dest);
+  REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest);
+  max_regno = max_reg_num ();
+  return new_dest_exp;
+}
+
+// Set subreg with use of INSN given SRC rtx instruction.
+static void
+set_load_subreg (insn_info *i1, rtx src)
+{
+  rtx set = single_set (i1->rtl());
+  rtx old_dest = SET_DEST (set);
+
+  for (auto def : i1->defs ())
+    {
+      auto set = dyn_cast<set_info *> (def);
+      for (auto use : set->nondebug_insn_uses ())
+	{
+	  insn_info *info = use->insn ();
+	  if (!info || !info->rtl ())
+	    continue;
+
+	  rtx_insn *rtl_insn = info->rtl ();
+	  df_ref ref;
+
+	  FOR_EACH_INSN_USE (ref, rtl_insn)
+	    {
+	      rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
+	      if (REG_P (dest_exp)
+		  && DF_REF_REGNO (ref) == REGNO (dest_exp))
+		{
+		  rtx *loc = DF_REF_LOC (ref);
+		  insn_propagation prop (rtl_insn, old_dest, src);
+		  if (GET_CODE (*loc) == SUBREG)
+		    {
+		      if (!prop.apply_to_pattern (loc))
+			{
+			  if (dump_file != NULL)
+			    {
+			      fprintf (dump_file,
+				       "Cannot propagate insn \n");
+			      print_rtl_single (dump_file, rtl_insn);
+			    }
+			  return;
+			}
+		    }
+		  else
+		    *loc = copy_rtx (src);
+		}
+	    }
+	}
+    }
+}
+
+// Set subreg for OO mode store pair to generate registers in pairs
+// given insn_info I1 and I2.
+static void
+set_multiword_subreg_store (insn_info *i1, insn_info *i2)
+{
+  rtx_insn *insn1 = i1->rtl ();
+  rtx_insn *insn2 = i2->rtl ();
+  rtx body = PATTERN (insn1);
+  rtx src_exp = SET_SRC (body);
+  rtx insn2_body = PATTERN (insn2);
+  rtx insn2_dest_exp = SET_DEST (insn2_body);
+  machine_mode mode = GET_MODE (src_exp);
+  int regoff;
+  rtx src;
+  rtx addr = XEXP (insn2_dest_exp, 0);
+
+  PUT_MODE_RAW (src_exp, OOmode);
+  if (GET_CODE (addr) == PLUS
+      && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
+    regoff = 16;
+  else
+    regoff = 0;
+
+  src = simplify_gen_subreg (mode,
+			     src_exp, GET_MODE (src_exp),
+			     regoff);
+
+  set_store_subreg (i1, src, regoff);
+
+  int regoff1 = 0;
+  rtx src1;
+
+  src1 = simplify_gen_subreg (mode,
+			      src_exp, GET_MODE (src_exp),
+			      regoff1);
+
+  set_store_subreg (i2, src1, regoff1);
+  set_rescan_store (i1);
+  set_rescan_store (i2);
+  df_insn_rescan (insn1);
+}
+
+// Return true iff insn I1 has already existing subreg.
+static bool
+use_has_subreg_p (insn_info *i1)
+{
+  for (auto def : i1->defs ())
+    {
+      auto set = dyn_cast<set_info *> (def);
+      for (auto use : set->nondebug_insn_uses ())
+	{
+	  insn_info *info = use->insn ();
+	  if (info && info->rtl ())
+	    {
+	      rtx_insn *rtl_insn = info->rtl ();
+	      df_ref ref;
+	      FOR_EACH_INSN_USE (ref, rtl_insn)
+		{
+		  rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
+		  if (REG_P (dest_exp)
+		      && DF_REF_REGNO (ref) == REGNO (dest_exp))
+		    {
+		      rtx *loc = DF_REF_LOC (ref);
+		      if (GET_CODE (*loc) == SUBREG)
+			return true;
+		    }
+		}
+	     }
+	}
+    }
+   return false;
+}
+
+// Set subreg for OO mode pair load for existing subreg rtx to generate
+// registers in pairs given insn_info I2 and I2.
+static void
+set_multiword_subreg_load (insn_info *i1, insn_info *i2)
+
+{
+  rtx_insn *insn1 = i1->rtl ();
+  rtx body = PATTERN (insn1);
+  rtx dest_exp = SET_DEST (body);
+  machine_mode mode = GET_MODE (dest_exp);
+  int regoff1;
+  regoff1 = 16;
+
+  bool subreg_p = false;
+
+  if (use_has_subreg_p (i1))
+    subreg_p = true;
+
+  rtx new_dest_exp = dest_exp;
+
+  if (subreg_p)
+    new_dest_exp = new_reg_rtx (dest_exp);
+  else
+    PUT_MODE_RAW (new_dest_exp, OOmode);
+
+  rtx src = simplify_gen_subreg (mode,
+				 new_dest_exp,
+				 OOmode,
+				 regoff1);
+
+  set_load_subreg (i1, src);
+  if (subreg_p)
+    propagate_insn (i1, new_dest_exp);
+
+  int regoff = 0;
+  rtx sset = single_set (i2->rtl ());
+  rtx insn2_dest_exp = SET_DEST (sset);
+  machine_mode insn2_mode = GET_MODE (insn2_dest_exp);
+
+  src = simplify_gen_subreg (insn2_mode,
+			     new_dest_exp,
+			     OOmode,
+			     regoff);
+
+  set_load_subreg (i2, src);
+
+  if (subreg_p)
+    propagate_insn (i2, new_dest_exp);
+
+  if (subreg_p)
+    {
+      auto attempt = crtl->ssa->new_change_attempt ();
+      resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) };
+      auto *set = crtl->ssa->allocate<set_info> (i1, resource);
+      if (set)
+	{
+	  auto def = find_access (i1->defs (), REGNO (new_dest_exp));
+	  if (!def)
+	    i1->defs() = insert_access (attempt, set, i1->defs());
+	}
+    }
+
+  set_rescan_load (i1);
+  set_rescan_load (i2);
+  df_insn_rescan (insn1);
+}
+
+// Set subreg for OO mode pair to generate sequential registers given
+// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false
+// if store insn.
+void
+rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2,
+					  bool load_p)
+{
+  if (load_p)
+    set_multiword_subreg_load (i1, i2);
+  else
+    set_multiword_subreg_store (i1, i2);
+}
+
+rtx
+rs6000_pair_fusion::gen_pair (rtx *pats, rtx,  bool load_p)
+{
+  rtx i1 = pats[0];
+  rtx src_exp = SET_SRC (i1);
+  rtx dest_exp = SET_DEST (i1);
+  PUT_MODE_RAW (src_exp, OOmode);
+  PUT_MODE_RAW (dest_exp, OOmode);
+  rtx unspec  = gen_rtx_UNSPEC (GET_MODE (dest_exp),
+				gen_rtvec (1, src_exp),
+				UNSPEC_LXVP);
+  rtx set =  gen_rtx_SET (dest_exp, unspec);
+  if (dump_file)
+    {
+      if (load_p)
+	fprintf (dump_file, "lxv with lxvp ");
+      else
+	fprintf (dump_file, "stxv with stxvp ");
+      print_rtl_single (dump_file, set);
+    }
+  return set;
+}
+
+const pass_data pass_data_mem_fusion =
+{
+  RTL_PASS, /* type */
+  "mem_fusion", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_mem_fusion : public rtl_opt_pass
+{
+public:
+  pass_mem_fusion (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_mem_fusion, ctxt)
+  {}
+
+  opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);}
+
+  /* opt_pass methods: */
+  bool gate (function *)
+    {
+      return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
+    }
+
+  unsigned int execute (function *) final override
+    {
+      rs6000_pair_fusion pass;
+      pass.run ();
+      return 0;
+    }
+}; // class pass_mem_fusion
+
+rtl_opt_pass *
+make_pass_mem_fusion (gcc::context *ctxt)
+{
+  return new pass_mem_fusion (ctxt);
+}
diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def
index 46a0d0b8c56..0b48f57014d 100644
--- a/gcc/config/rs6000/rs6000-passes.def
+++ b/gcc/config/rs6000/rs6000-passes.def
@@ -28,7 +28,9 @@  along with GCC; see the file COPYING3.  If not see
      The power8 does not have instructions that automaticaly do the byte swaps
      for loads and stores.  */
   INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
-
+  /* Pass to replace adjacent memory addresses lxv/stxv instruction with
+     lxvp/stxvp instruction.  */
+  INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion);
   /* Pass to do the PCREL_OPT optimization that combines the load of an
      external symbol's address along with a single load or store using that
      address as a base register.  */
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index b40557a8557..5295ff66950 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -342,6 +342,7 @@  namespace gcc { class context; }
 class rtl_opt_pass;
 
 extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
+extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *);
 extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
 extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
 extern bool rs6000_quadword_masked_address_p (const_rtx exp);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 5ed64b1e686..449d9e446d5 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -27442,7 +27442,8 @@  rs6000_split_multireg_move (rtx dst, rtx src)
     reg_mode = word_mode;
   reg_mode_size = GET_MODE_SIZE (reg_mode);
 
-  gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
+  gcc_assert (mode == OOmode
+	      || reg_mode_size * nregs == GET_MODE_SIZE (mode));
 
   /* TDmode residing in FP registers is special, since the ISA requires that
      the lower-numbered word of a register pair is always the most significant
@@ -27489,6 +27490,11 @@  rs6000_split_multireg_move (rtx dst, rtx src)
       int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
       if (MEM_P (dst))
 	{
+	  rtx addr = XEXP (dst, 0);
+	  rtx opnd1 = NULL_RTX;
+	  if (addr && GET_CODE (addr) == PLUS)
+	    opnd1 = XEXP (addr,1);
+
 	  unsigned offset = 0;
 	  unsigned size = GET_MODE_SIZE (reg_mode);
 
@@ -27502,7 +27508,13 @@  rs6000_split_multireg_move (rtx dst, rtx src)
 	    {
 	      unsigned subreg
 		= WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
-	      rtx dst2 = adjust_address (dst, reg_mode, offset);
+	      rtx dst2 = dst;
+
+	      if ((GET_CODE (addr) != PLUS
+		  || (opnd1 && CONST_INT_P(opnd1))))
+		dst2 = adjust_address (dst, reg_mode, offset);
+	      else
+		PUT_MODE_RAW (dst, reg_mode);
 	      rtx src2 = gen_rtx_REG (reg_mode, reg + subreg);
 	      offset += size;
 	      emit_insn (gen_rtx_SET (dst2, src2));
@@ -27513,15 +27525,25 @@  rs6000_split_multireg_move (rtx dst, rtx src)
 
       if (MEM_P (src))
 	{
+	  rtx  addr = XEXP (src, 0);
+	  rtx opnd1 = NULL_RTX;
+	  if (addr && GET_CODE (addr) == PLUS)
+	    opnd1 = XEXP (addr,1);
+
 	  unsigned offset = 0;
 	  unsigned size = GET_MODE_SIZE (reg_mode);
 
-	  for (int i = 0; i < nregs; i += reg_mode_nregs)
+	  for (int i = nregs-1; i >= 0; i -= reg_mode_nregs)
 	    {
 	      unsigned subreg
 		= WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
 	      rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg);
-	      rtx src2 = adjust_address (src, reg_mode, offset);
+	      rtx src2 = src;
+
+	      if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1))))
+		src2 = adjust_address (src, reg_mode, offset);
+	      else
+		PUT_MODE_RAW (src2, reg_mode);
 	      offset += size;
 	      emit_insn (gen_rtx_SET (dst2, src2));
 	    }
@@ -27529,7 +27551,8 @@  rs6000_split_multireg_move (rtx dst, rtx src)
 	  /* If we are writing an accumulator register, we have to
 	     prime it after we've written it.  */
 	  if (TARGET_MMA
-	      && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
+	      && REG_P (dst) && GET_MODE (dst) == XOmode
+	      && FP_REGNO_P (REGNO (dst)))
 	    emit_insn (gen_mma_xxmtacc (dst, dst));
 
 	  return;
@@ -27622,9 +27645,12 @@  rs6000_split_multireg_move (rtx dst, rtx src)
 	{
 	  for (i = nregs - 1; i >= 0; i--)
 	    {
-	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
-	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
-	      emit_insn (gen_rtx_SET (dst_i, src_i));
+	      if (REG_P (dst) && REG_P (src))
+		{
+		  rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
+		  rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
+		  emit_insn (gen_rtx_SET (dst_i, src_i));
+		}
 	    }
 	}
       else
@@ -27639,7 +27665,8 @@  rs6000_split_multireg_move (rtx dst, rtx src)
       /* If we are writing an accumulator register, we have to
 	 prime it after we've written it.  */
       if (TARGET_MMA
-	  && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
+	  && REG_P (dst) && GET_MODE (dst) == XOmode
+	  && FP_REGNO_P (REGNO (dst)))
 	emit_insn (gen_mma_xxmtacc (dst, dst));
     }
   else
@@ -27696,7 +27723,7 @@  rs6000_split_multireg_move (rtx dst, rtx src)
 
 	  /* If the base register we are using to address memory is
 	     also a destination reg, then change that register last.  */
-	  if (REG_P (breg)
+	  if (REG_P (dst) && REG_P (breg)
 	      && REGNO (breg) >= REGNO (dst)
 	      && REGNO (breg) < REGNO (dst) + nregs)
 	    j = REGNO (breg) - REGNO (dst);
@@ -27794,9 +27821,12 @@  rs6000_split_multireg_move (rtx dst, rtx src)
 	  /* XO/OO are opaque so cannot use subregs. */
 	  if (mode == OOmode || mode == XOmode )
 	    {
-	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
-	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
-	      emit_insn (gen_rtx_SET (dst_i, src_i));
+	      if (REG_P (dst) &&  REG_P (src))
+		{
+		  rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
+		  rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
+		  emit_insn (gen_rtx_SET (dst_i, src_i));
+		}
 	    }
 	  else
 	    emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode,
@@ -27814,7 +27844,9 @@  rs6000_split_multireg_move (rtx dst, rtx src)
       if (restore_basereg != NULL_RTX)
 	emit_insn (restore_basereg);
     }
+  return;
 }
+
 
 /* Return true if the peephole2 can combine a load involving a combination of
    an addis instruction and a load with an offset that can be fused together on
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index f59be536570..99f070ee24d 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -159,6 +159,7 @@ 
    UNSPEC_XXSPLTIW_CONST
    UNSPEC_FMAX
    UNSPEC_FMIN
+   UNSPEC_LXVP
   ])
 
 ;;
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index 155788de40a..a052163eec4 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -34,6 +34,11 @@  rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
 	$(COMPILE) $<
 	$(POSTCOMPILE)
 
+rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
+
 rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
 	$(COMPILE) $<
 	$(POSTCOMPILE)
diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
index 31d2c21c88f..b1521253474 100644
--- a/gcc/pair-fusion.cc
+++ b/gcc/pair-fusion.cc
@@ -312,9 +312,9 @@  static int
 encode_lfs (lfs_fields fields)
 {
   int size_log2 = exact_log2 (fields.size);
-  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
-  return ((int)fields.load_p << 3)
-    | ((int)fields.fpsimd_p << 2)
+  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9);
+  return ((int)fields.load_p << 4)
+    | ((int)fields.fpsimd_p << 3)
     | (size_log2 - 2);
 }
 
@@ -322,8 +322,8 @@  encode_lfs (lfs_fields fields)
 static lfs_fields
 decode_lfs (int lfs)
 {
-  bool load_p = (lfs & (1 << 3));
-  bool fpsimd_p = (lfs & (1 << 2));
+  bool load_p = (lfs & (1 << 4));
+  bool fpsimd_p = (lfs & (1 << 3));
   unsigned size = 1U << ((lfs & 3) + 2);
   return { load_p, fpsimd_p, size };
 }
@@ -425,6 +425,9 @@  pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
   if (MEM_VOLATILE_P (mem))
     return;
 
+  if (load_p && !m_pass->fuseable_load_p (insn))
+    return;
+
   // Ignore writeback accesses if the hook says to do so.
   if (!m_pass->should_handle_writeback (writeback_type::EXISTING)
       && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
@@ -1814,7 +1817,7 @@  pair_fusion_bb_info::fuse_pair (bool load_p,
     }
 
   rtx reg_notes = combine_reg_notes (first, second, load_p);
-
+  m_pass->set_multiword_subreg (i1, i2, load_p);
   rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p);
   insn_change *pair_change = nullptr;
   auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
@@ -1828,18 +1831,18 @@  pair_fusion_bb_info::fuse_pair (bool load_p,
       changes.safe_push (make_delete (first));
       pair_change = make_change (second);
       changes.safe_push (pair_change);
-
       pair_change->move_range = move_range;
       pair_change->new_defs = merge_access_arrays (attempt,
 						   input_defs[0],
 						   input_defs[1]);
+      m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes);
       gcc_assert (pair_change->new_defs.is_valid ());
-
       pair_change->new_uses
 	= merge_access_arrays (attempt,
 			       drop_memory_access (input_uses[0]),
 			       drop_memory_access (input_uses[1]));
       gcc_assert (pair_change->new_uses.is_valid ());
+      m_pass->set_uses (i1, i2, &pair_change, &attempt);
       set_pair_pat (pair_change);
     }
   else
@@ -1872,6 +1875,7 @@  pair_fusion_bb_info::fuse_pair (bool load_p,
 						store_def,
 						change->new_defs);
 	      gcc_assert (change->new_defs.is_valid ());
+	      m_pass->set_defs (i1, i2, &change, &attempt);
 	      change->move_range = move_range;
 	      pair_change = change;
 	      break;
@@ -1908,6 +1912,7 @@  pair_fusion_bb_info::fuse_pair (bool load_p,
 	      change->new_defs = insert_access (attempt, new_set,
 						change->new_defs);
 	      gcc_assert (change->new_defs.is_valid ());
+	      m_pass->set_defs (i1, i2, &change, &attempt);
 	      pair_change = change;
 	      break;
 	    }
@@ -2405,6 +2410,15 @@  pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
       reg_ops[i] = XEXP (pats[i], !load_p);
     }
 
+  if (!load_p && !m_pass->fuseable_store_p (i1, i2))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
   if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
     {
       if (dump_file)
@@ -2997,6 +3011,8 @@  void pair_fusion::process_block (bb_info *bb)
       if (GET_CODE (pat) != SET)
 	continue;
 
+      change_existing_multword_mode (rti);
+
       if (track_stores && MEM_P (XEXP (pat, 0)))
 	bb_state.track_access (insn, false, XEXP (pat, 0));
       else if (track_loads && MEM_P (XEXP (pat, 1)))
diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
index 45e4edceecb..b3af49de892 100644
--- a/gcc/pair-fusion.h
+++ b/gcc/pair-fusion.h
@@ -26,8 +26,12 @@  namespace rtl_ssa {
   class insn_info;
   class insn_range_info;
   class bb_info;
+  class insn_change;
+  class insn_range_info;
 }
 
+class obstack_watermark;
+
 // Information about a potential base candidate, used in try_fuse_pair.
 // There may be zero, one, or two viable RTL bases for a given pair.
 struct base_cand
@@ -142,6 +146,19 @@  struct pair_fusion {
   // true iff INSN is a load pair.
   virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0;
 
+  // Given INSN change multiword mode load and store to respective
+  // unspec instruction.
+  virtual void change_existing_multword_mode (rtx_insn *insn) = 0;
+
+  // Given INSN and watermark ATTEMPT and PAIR_CHANGE sets the
+  // new rtx with INSN.  Remove all uses of definition that are
+  // removed given CHANGES.
+  virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first,
+				    obstack_watermark *attempt,
+				    rtl_ssa::insn_change **pair_change,
+				    auto_vec<rtl_ssa::insn_change *> &changes)
+				    = 0;
+
   // Return true if we should track loads.
   virtual bool track_loads_p ()
   {
@@ -171,6 +188,37 @@  struct pair_fusion {
   virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
 					  rtx regs[2], bool load_p) = 0;
 
+  // Given insn_info pair I1 and I2, sets subreg with multiword registers
+  // to assign register pairs by allocators.
+  // LOAD_P is true iff the pair is a load.
+  virtual void set_multiword_subreg (rtl_ssa::insn_info *i1,
+				     rtl_ssa::insn_info *i2,
+				     bool load_p) = 0;
+
+  // Given insn_info pair I1 and I2, sets uses in PAIR_CHANGE.
+  virtual void set_uses (rtl_ssa::insn_info *i1,
+			 rtl_ssa::insn_info *i2,
+			 rtl_ssa::insn_change **pair_change,
+			 obstack_watermark *attempt) = 0;
+
+  // Given insn_info pair I1 and I2, sets defss in PAIR_CHANGE.
+  virtual void set_defs (rtl_ssa::insn_info *i1,
+			 rtl_ssa::insn_info *i2,
+			 rtl_ssa::insn_change **pair_change,
+			 obstack_watermark *attempt) = 0;
+
+
+  // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
+  // store mem pairs.
+  // Return true if feasible to perform store mem pairs otherwise false.
+  virtual bool fuseable_store_p (rtl_ssa::insn_info *i1,
+				 rtl_ssa::insn_info *i2) = 0;
+
+  // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
+  // load mem pairs.
+  // Return true if feasible to perform load mem pairs otherwise false.
+  virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0;
+
   void process_block (rtl_ssa::bb_info *bb);
   rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2],
 					 const rtl_ssa::insn_range_info
diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h
index 8be04f1aa96..27a732b7353 100644
--- a/gcc/rtl-ssa/functions.h
+++ b/gcc/rtl-ssa/functions.h
@@ -222,6 +222,13 @@  public:
   template<typename T, typename... Ts>
   T *change_alloc (obstack_watermark &wm, Ts... args);
 
+  auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; }
+
+  template<typename T, typename... Ts>
+  T *allocate (Ts... args);
+
+  void remove_use (use_info *);
+
 private:
   class bb_phi_info;
   class build_info;
@@ -231,9 +238,6 @@  private:
   // allocate_temp during its lifetime.
   obstack_watermark temp_watermark () { return &m_temp_obstack; }
 
-  template<typename T, typename... Ts>
-  T *allocate (Ts... args);
-
   template<typename T, typename... Ts>
   T *allocate_temp (Ts... args);
 
@@ -269,7 +273,6 @@  private:
   static void insert_use_after (use_info *, use_info *);
 
   void add_use (use_info *);
-  void remove_use (use_info *);
 
   insn_info::order_node *need_order_node (insn_info *);
 
diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
new file mode 100644
index 00000000000..d10ff0cdf36
--- /dev/null
+++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <altivec.h>
+	
+void
+foo2 ()
+{
+  __vector_quad *dst1;
+  __vector_quad *dst2;
+  vector unsigned char src;
+  __vector_quad acc;
+  vector unsigned char *ptr;
+  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
+  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+  *dst1 = acc;
+  __builtin_mma_xvf32ger(&acc, src, ptr[2]);
+  __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
+  *dst2 = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
new file mode 100644
index 00000000000..c523572cf3c
--- /dev/null
+++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */ 
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ 
+
+#include <altivec.h>
+
+void
+foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src)
+{
+  __vector_quad acc;
+  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
+  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+  *dst = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
index 69ee826e1be..ae29127f954 100644
--- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
@@ -258,8 +258,8 @@  foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
   dst[13] = acc;
 }
 
-/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
-/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
 /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
 /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
 /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */