diff mbox

[rs6000,middle-end,0/1] Add implementation for different targets for pair mem fusion

Message ID 814c71dd-0a2e-4caf-b7a7-abf148c754af@linux.ibm.com
State New
Headers show

Commit Message

Ajit Agarwal June 2, 2024, 1:08 p.m. UTC
Hello All:


All review comments have been addressed, and the patch is now split into separate rs6000 and aarch64 target changes.

This patch builds on the common, generic infrastructure for pair mem fusion
that is shared by different targets.

The rs6000 target-specific code implements the virtual functions defined by the generic code.

The target-specific code is added in rs6000-mem-fusion.cc.

Tested for powerpc64-linux-gnu.

Thanks & Regards
Ajit

rs6000, middle-end: Add implementation for different targets for pair mem fusion

This patch builds on the common, generic infrastructure for pair mem fusion
that is shared by different targets.

The rs6000 target-specific code implements the virtual functions defined by the generic code.

The target-specific code is added in rs6000-mem-fusion.cc.

2024-06-02  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>

gcc/ChangeLog:

	* config/rs6000/rs6000-passes.def: New mem fusion pass
	before pass_early_remat.
	* pair-fusion.h: Add additional pure virtual functions
	required for the rs6000 target implementation.
	* pair-fusion.cc: Call the additional virtual functions
	added for the rs6000 target.
	* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
	Add target specific implementation for generic pure virtual
	functions.
	* config.gcc: Add new object file.
	* config/rs6000/rs6000-protos.h: Add new prototype for mem
	fusion pass.
	* config/rs6000/t-rs6000: Add new rule.
	* rtl-ssa/accesses.h: Move set_is_live_out_use from private
	to public.

gcc/testsuite/ChangeLog:

	* g++.target/powerpc/mem-fusion.C: New test.
	* g++.target/powerpc/mem-fusion-1.C: New test.
	* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc                                |   2 +
 gcc/config/rs6000/rs6000-mem-fusion.cc        | 651 ++++++++++++++++++
 gcc/config/rs6000/rs6000-passes.def           |   4 +-
 gcc/config/rs6000/rs6000-protos.h             |   1 +
 gcc/config/rs6000/t-rs6000                    |   5 +
 gcc/pair-fusion.cc                            |  26 +-
 gcc/pair-fusion.h                             |  20 +
 gcc/rtl-ssa/accesses.h                        |   2 +-
 .../g++.target/powerpc/mem-fusion-1.C         |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c        |   4 +-
 11 files changed, 743 insertions(+), 9 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C
diff mbox

Patch

diff --git a/gcc/config.gcc b/gcc/config.gcc
index e500ba63e32..348308b2e93 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@  powerpc*-*-*)
 	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
 	extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
 	extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+	extra_objs="${extra_objs} rs6000-mem-fusion.o"
 	extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
 	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
 	extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@  rs6000*-*-*)
 	extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
 	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
 	extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+	extra_objs="${extra_objs} rs6000-mem-fusion.o"
 	target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc"
 	target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
 	;;
diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc b/gcc/config/rs6000/rs6000-mem-fusion.cc
new file mode 100644
index 00000000000..45795cd48c4
--- /dev/null
+++ b/gcc/config/rs6000/rs6000-mem-fusion.cc
@@ -0,0 +1,651 @@ 
+/* Subroutines used to fuse adjacent loads/stores into
+   paired memory accesses for TARGET_POWER10 and TARGET_VSX.
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_LIST
+#define INCLUDE_TYPE_TRAITS
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "df.h"
+#include "rtl.h"
+#include "rtl-iter.h"
+#include "rtl-ssa.h"
+#include "tree-pass.h"
+#include "ordered-hash-map.h"
+#include "pair-fusion.h"
+
+using namespace rtl_ssa;
+
+struct rs6000_pair_fusion : public pair_fusion
+{
+  bool fpsimd_op_p (rtx , machine_mode , bool)  override final
+  {
+    return false;
+  }
+
+  bool pair_mem_insn_p (rtx_insn *, bool &) override final
+  {
+    return false;
+  }
+
+  bool pair_mem_ok_with_policy (rtx, bool) override final
+  {
+    return true;
+  }
+
+  bool pair_operand_mode_ok_p (machine_mode mode) override final;
+
+  rtx gen_pair (rtx *pats, rtx writeback, bool load_p) override final;
+
+  bool pair_reg_operand_ok_p (bool, rtx, machine_mode) override final
+  {
+    return true;
+  }
+
+  int pair_mem_alias_check_limit () override final
+  {
+    return 0;
+  }
+
+  bool should_handle_writeback (enum writeback) override final
+  {
+    return false;
+  }
+
+  bool track_loads_p () override final
+  {
+    return true;
+  }
+
+  bool track_stores_p () override final
+  {
+    return true;
+  }
+
+  bool pair_mem_in_range_p (HOST_WIDE_INT) override final
+  {
+    return true;
+  }
+
+  rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) override final
+  {
+    return NULL_RTX;
+  }
+
+  rtx destructure_pair (rtx_def **, rtx, bool) override final
+  {
+    return NULL_RTX;
+  }
+
+  // (insn 31 62 32 2 (set (reg:V16QI 177 [ MEM <vector(8) short unsigned int>
+  // [(short unsigned int *)vectp.62_36 + 64B] ])
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ])
+  //              (const_int 64 [0x40])) [1 MEM <vector(8) short unsigned int>
+  //	 [(short unsigned int *)vectp.62_36 + 64B]+0 S16 A16]))
+  //     	 {vsx_movv16qi_64bit}
+  //   (nil))
+  // (insn 32 31 16 2 (set (reg:V16QI 178 [ MEM <vector(8) short unsigned int>
+  // [(short unsigned int *)vectp.62_36 + 80B] ])
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ])
+  //              (const_int 80 [0x50])) [1 MEM <vector(8) short unsigned int>
+  //              [(short unsigned int *)vectp.62_36 + 80B]+0 S16 A16]))
+  //              {vsx_movv16qi_64bit}
+  //   (nil))
+  // (insn 16 32 21 2 (set (reg:V16QI 159 [ MEM <vector(8) short unsigned int>
+  //  [(short unsigned int *)vectp.62_36 + 16B] ])
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ])
+  //              (const_int 16 [0x10])) [1 MEM <vector(8) short unsigned int>
+  //		       	[(short unsigned int *)vectp.62_36 + 16B]+0 S16 A16]))
+  //		      	{vsx_movv16qi_64bit}
+  //   (nil))
+  // (insn 21 16 22 2 (set (reg:V16QI 165 [ MEM <vector(8) short unsigned int>
+  // [(short unsigned int *)vectp.62_36 + 32B] ])
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ])
+  //              (const_int 32 [0x20])) [1 MEM <vector(8) short unsigned int>
+  //	 [(short unsigned int *)vectp.62_36 + 32B]+0 S16 A16]))
+  //	 {vsx_movv16qi_64bit}
+  //   (nil))
+  // (insn 22 21 37 2 (set (reg:V16QI 166 [ MEM <vector(8) short unsigned int>
+  // [(short unsigned int *)vectp.62_36 + 48B] ])
+  //      (mem:V16QI (plus:DI (reg/f:DI 121 [ vectp.62 ])
+  //              (const_int 48 [0x30])) [1 MEM <vector(8) short unsigned int>
+  //	 [(short unsigned int *)vectp.62_36 + 48B]+0 S16 A16]))
+  //	 {vsx_movv16qi_64bit}
+  //   (nil))
+  //
+  // In the failing case insn 22 and insn 31 are merged, which breaks the code.
+  // This hook rejects such pairs by returning false when *I1 > *I2.
+  bool should_handle_unordered_insns (insn_info *i1,
+				      insn_info *i2) override final
+  {
+    if (*i1 > *i2)
+      return false;
+
+    return true;
+  }
+
+  bool fuseable_store_p (insn_info *i1, insn_info *i2) override final;
+
+  bool fuseable_load_p (insn_info *insn) override final;
+
+  void set_multiword_subreg (insn_info *i1, insn_info *i2,
+			     bool load_p) override final;
+};
+
+bool
+rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
+{
+  // Accept exactly the modes that ALTIVEC_OR_VSX_VECTOR_MODE accepts.
+  return ALTIVEC_OR_VSX_VECTOR_MODE (mode);
+}
+
+// df_insn_rescan the unspec instruction where operands
+// are reversed given insn_info INFO.
+static void
+set_rescan_load (insn_info *info)
+{
+  for (auto def : info->defs())
+    {
+      auto set = dyn_cast<set_info *> (def);
+      for (auto use : set->nondebug_insn_uses ())
+	{
+	  insn_info *info = use->insn ();
+	  if (info)
+	    {
+	      rtx_insn *rtl_insn = info->rtl ();
+	      rtx set = single_set (rtl_insn);
+
+	      if (set == NULL_RTX)
+		return;
+
+	      rtx op0 = SET_SRC (set);
+	      if (GET_CODE (op0) != UNSPEC)
+		return;
+
+	      use->set_is_live_out_use (true);
+	      df_insn_rescan (rtl_insn);
+	     }
+	}
+    }
+}
+
+// df_insn_rescan the def instruction where operands are reversed given INSN.
+static bool
+set_rescan_store (insn_info *insn)
+{
+  for (auto use : insn->uses())
+    {
+      auto def = use->def ();
+
+      if (def->insn ()->is_artificial())
+	return false;
+
+      if (def->insn () && def->insn ()->rtl ()
+	  && def->insn()->is_real() )
+	{
+	  rtx_insn *rtl_insn = def->insn ()->rtl ();
+	  rtx set = single_set (rtl_insn);
+
+	  if (set == NULL_RTX)
+	    return false;
+	  df_insn_rescan (rtl_insn);
+	}
+    }
+
+  return true;
+}
+
+static ordered_hash_map<rtx_insn *, bool> insn_map; // Visited def insns.
+
+// Return false if a dependent def of INSN is a load instruction, otherwise
+// true.
+static bool
+feasible_store_p (rtx_insn *insn, bool immediate_dep)
+{
+  df_ref use;
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+
+      if (!def_link || !def_link->ref
+	  || DF_REF_IS_ARTIFICIAL (def_link->ref))
+	continue;
+
+      rtx_insn *select_insn2 = DF_REF_INSN (def_link->ref);
+
+      if (select_insn2 == NULL)
+	continue;
+
+      if (select_insn2 == insn)
+	return true;
+
+      while (def_link && def_link->ref)
+	{
+	  rtx set = single_set (select_insn2);
+	  rtx insn_set = single_set (insn);
+
+	  if (set != NULL_RTX && insn_set != NULL_RTX)
+	    {
+	      if (GET_MODE (SET_SRC (set)) != GET_MODE (SET_SRC (insn_set)))
+		{
+		  if (GET_MODE (SET_SRC (set)) == OOmode)
+		    return false;
+
+		  immediate_dep = false;
+		}
+		else
+		  {
+		    if (immediate_dep && MEM_P (SET_SRC (set)))
+		      return false;
+		  }
+
+		if (insn_map.get (select_insn2))
+		  return true;
+		else
+		  insn_map.put (select_insn2, true);
+
+		if (!feasible_store_p (select_insn2, immediate_dep))
+		  return false;
+	    }
+	  def_link = def_link->next;
+	}
+     }
+  return true;
+}
+
+// Check for feasibility of store to be fuseable or not. Return true if
+// feasible otherwise false.
+static bool
+feasible_store_p (insn_info *insn)
+{
+  for (auto use : insn->uses ())
+    {
+      auto def = use->def ();
+
+      if (def->insn ()->is_artificial ())
+	return false;
+
+      if (def->insn () && def->insn ()->rtl ()
+	  && def->insn()->is_real ())
+	{
+	  rtx_insn *rtl_insn = def->insn ()->rtl ();
+	  rtx set = single_set (rtl_insn);
+
+	  if (set == NULL_RTX)
+	    return false;
+
+	  // Return false if dependent def is load.
+	  if (rtl_insn && MEM_P (SET_SRC (set)))
+	    return false;
+
+	  // Return false if dependent def is store.
+	  if (rtl_insn && MEM_P (SET_DEST (set)))
+	    return false;
+
+	  // Return false if dependent def is parallel.
+	  if (GET_CODE (PATTERN (rtl_insn)) == PARALLEL)
+	    return false;
+
+	  rtx src = SET_SRC (set);
+	  rtx_code code = GET_CODE (src);
+
+	  // Return false if dependent def is CONST_VECTOR or UNSPEC.
+	  if (code == CONST_VECTOR || code == UNSPEC)
+	    return false;
+
+	  // Recursively check for dependent instruction is Load.
+	  if (!feasible_store_p (rtl_insn, true))
+	    return false;
+
+	  if (GET_RTX_CLASS (code) == RTX_TERNARY)
+	    return false;
+	}
+    }
+  return true;
+}
+
+// Check if store can be fuseable or not.  Return true if fuseable otherwise
+// false.
+bool
+rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2)
+{
+  rtx_insn *insn1 = i1->rtl ();
+  rtx_insn *insn2 = i2->rtl ();
+
+  rtx body = PATTERN (insn1);
+  rtx src_exp = SET_SRC (body);
+  rtx insn2_body = PATTERN (insn2);
+  rtx insn2_src_exp = SET_SRC (insn2_body);
+
+  // Return false if def and use counts differ or the reg has multiple uses.
+  if (REG_P (src_exp) &&
+      (DF_REG_DEF_COUNT (REGNO (src_exp)) != DF_REG_USE_COUNT (REGNO (src_exp))
+       || DF_REG_USE_COUNT (REGNO (src_exp)) > 1))
+    return false;
+
+  // Return false if src of insn1 and src of insn2 are the same rtx.
+  if (src_exp == insn2_src_exp)
+    return false;
+
+  // Return false if src of insn1 is subreg.
+  if (GET_CODE (src_exp) == SUBREG)
+    return false;
+
+  // Return false if src of insn1 is TImode or TFmode.
+  if (GET_MODE (src_exp) == TImode || GET_MODE (src_exp) == TFmode)
+    return false;
+
+  if (!feasible_store_p (i1))
+    return false;
+
+  if (!feasible_store_p (i2))
+    return false;
+
+  return true;
+}
+
+// Set subreg for def of store INSN given rtx SRC instruction.
+static void
+set_store_subreg (rtx_insn *insn, rtx src)
+{
+  rtx set = single_set (insn);
+  rtx src_exp = SET_SRC (set);
+  df_ref use;
+
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+
+      if (!def_link || !def_link->ref
+	  || DF_REF_IS_ARTIFICIAL (def_link->ref))
+	continue;
+
+      while (def_link && def_link->ref)
+	{
+	  rtx *loc = DF_REF_LOC (def_link->ref);
+
+	  if (GET_MODE (*loc) == GET_MODE (src_exp))
+	      *loc = copy_rtx (src);
+
+	  def_link = def_link->next;
+	}
+    }
+}
+
+// Generate store pair stxvp given rtx I1.
+static rtx
+rs6000_gen_store_pair (rtx i1)
+{
+  rtx src_exp = SET_SRC (i1);
+  rtx dest_exp = SET_DEST (i1);
+  rtx stxv;
+  PUT_MODE_RAW (src_exp, OOmode);
+  PUT_MODE_RAW (dest_exp, OOmode);
+  stxv = gen_rtx_SET (dest_exp, src_exp);
+  if (dump_file)
+    {
+      fprintf (dump_file, "Replacing stxv with stxvp  \n");
+      print_rtl_single (dump_file, stxv);
+    }
+  return stxv;
+}
+
+// Check whether a load can be fused or not.
+// Return true if every dependent use is an UNSPEC, otherwise false.
+// Adjacent load pair fusion with 256-bit OOmode is seen
+// and valid when the load is used in an UNSPEC.  That's why this check is added.
+bool
+rs6000_pair_fusion::fuseable_load_p (insn_info *info)
+{
+  for (auto def : info->defs ())
+    {
+      // dyn_cast yields null for a def that is not a set_info; skip it.
+      auto def_set = dyn_cast<set_info *> (def);
+      if (!def_set)
+	continue;
+
+      for (auto use : def_set->nondebug_insn_uses ())
+	{
+	  // Check for null before dereferencing the using insn.
+	  insn_info *use_insn = use->insn ();
+	  if (!use_insn || use_insn->is_artificial ())
+	    return false;
+
+	  if (!use_insn->is_real ())
+	    continue;
+
+	  // Each real user must be a single SET whose source is an UNSPEC.
+	  rtx use_set = single_set (use_insn->rtl ());
+	  if (use_set == NULL_RTX || GET_CODE (SET_SRC (use_set)) != UNSPEC)
+	    return false;
+	}
+    }
+
+  return true;
+}
+
+// Set subreg with use of INSN given SRC rtx instruction.
+static void
+set_load_subreg (rtx_insn *insn, rtx src)
+{
+  df_ref use;
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  FOR_EACH_INSN_INFO_DEF (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+
+      if (!def_link || !def_link->ref
+	  || DF_REF_IS_ARTIFICIAL (def_link->ref))
+	continue;
+
+      while (def_link && def_link->ref)
+	{
+	  rtx *loc = DF_REF_LOC (def_link->ref);
+	  *loc =  copy_rtx (src);
+	  def_link = def_link->next;
+	}
+     }
+}
+
+// Set subreg for OO mode store pair to generate registers in pairs
+// given insn_info I1 and I2.
+static void
+set_multiword_subreg_store (insn_info *i1, insn_info *i2)
+{
+  rtx_insn *insn1 = i1->rtl ();
+  rtx_insn *insn2 = i2->rtl ();
+  rtx body = PATTERN (insn1);
+  rtx src_exp = SET_SRC (body);
+  rtx insn2_body = PATTERN (insn2);
+  rtx insn2_dest_exp = SET_DEST (insn2_body);
+  machine_mode mode = GET_MODE (src_exp);
+  int regoff;
+  rtx src;
+  rtx addr = XEXP (insn2_dest_exp, 0);
+
+  PUT_MODE_RAW (src_exp, OOmode);
+  if (GET_CODE (addr) == PLUS
+      && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
+    regoff = 16;
+  else
+    regoff = 0;
+
+  src = simplify_gen_subreg (mode,
+			     src_exp, GET_MODE (src_exp),
+			     regoff);
+
+  set_store_subreg (insn1, src);
+
+  int regoff1 = 0;
+  rtx src1;
+
+  src1 = simplify_gen_subreg (mode,
+			      src_exp, GET_MODE (src_exp),
+			      regoff1);
+
+  set_store_subreg (insn2, src1);
+  set_rescan_store (i1);
+  set_rescan_store (i2);
+  df_insn_rescan (insn1);
+}
+
+// Set subreg for OO mode pair load to generate registers in pairs given
+// insn_info I1 and I2.
+static void
+set_multiword_subreg_load (insn_info *i1, insn_info *i2)
+{
+  rtx_insn *insn1 = i1->rtl();
+  rtx_insn *insn2 = i2->rtl();
+  rtx body = PATTERN (insn1);
+  rtx dest_exp = SET_DEST (body);
+  rtx insn2_body = PATTERN (insn2);
+  machine_mode mode = GET_MODE (dest_exp);
+  PUT_MODE_RAW (dest_exp, OOmode);
+
+  rtx insn2_src_exp = SET_SRC (insn2_body);
+  int regoff = 0;
+  rtx src;
+
+  src = simplify_gen_subreg (mode,
+			     dest_exp, GET_MODE (dest_exp),
+			     regoff);
+
+  set_load_subreg (insn2, src);
+
+  int regoff1;
+  rtx src1;
+  rtx addr = XEXP (insn2_src_exp, 0);
+
+  if (GET_CODE (addr) == PLUS
+      && XEXP (addr, 1)
+      && CONST_INT_P (XEXP(addr, 1)))
+    regoff1 = 16;
+  else
+    regoff1 = 0;
+
+  src1 = simplify_gen_subreg (mode,
+			      dest_exp, GET_MODE (dest_exp),
+			      regoff1);
+
+  set_load_subreg (insn1, src1);
+  set_rescan_load (i1);
+  set_rescan_load (i2);
+  df_insn_rescan (insn1);
+}
+
+// Set subreg for OO mode pair to generate sequential registers given
+// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false
+// if store insn.
+void
+rs6000_pair_fusion::set_multiword_subreg(insn_info *i1, insn_info *i2,
+					      bool load_p)
+{
+  if (load_p)
+    set_multiword_subreg_load (i1, i2);
+  else
+    set_multiword_subreg_store (i1, i2);
+}
+
+// Return load pair given rtx I1.
+static rtx
+rs6000_gen_load_pair (rtx i1)
+{
+  rtx src_exp = SET_SRC (i1);
+  rtx dest_exp = SET_DEST (i1);
+
+  // Retag both sides of the SET in place as OOmode, then rebuild the SET.
+  PUT_MODE_RAW (src_exp, OOmode);
+  PUT_MODE_RAW (dest_exp, OOmode);
+  rtx lxvp = gen_rtx_SET (dest_exp, src_exp);
+  if (dump_file)
+    {
+      fprintf (dump_file, "Replacing lxv with lxvp\n");
+      print_rtl_single (dump_file, lxvp);
+    }
+
+  return lxvp;
+}
+
+rtx
+rs6000_pair_fusion::gen_pair (rtx *pats, rtx writeback, bool load_p)
+{
+  if (load_p || writeback)
+    return rs6000_gen_load_pair (pats[0]);
+  else
+    return rs6000_gen_store_pair (pats[0]);
+}
+
+const pass_data pass_data_mem_fusion =
+{
+  RTL_PASS, /* type */
+  "mem_fusion", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_mem_fusion : public rtl_opt_pass
+{
+public:
+  pass_mem_fusion (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_mem_fusion, ctxt)
+  {}
+
+  opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);}
+
+  /* Run only when optimizing with both TARGET_VSX and TARGET_POWER10.  */
+  bool gate (function *) final override
+    {
+      return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
+    }
+
+  unsigned int execute (function *) final override
+    {
+      /* We use DF data flow because we change location rtx
+	 which is easier to find and modify.
+	 We use mix of rtl-ssa def-use and DF data flow
+	 where it is easier.  */
+      df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+      df_analyze ();
+      df_set_flags (DF_DEFER_INSN_RESCAN);
+
+      rs6000_pair_fusion pass;
+      pass.run ();
+      return 0;
+    }
+}; // class pass_mem_fusion
+
+rtl_opt_pass *
+make_pass_mem_fusion (gcc::context *ctxt)
+{
+  return new pass_mem_fusion (ctxt);
+}
diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def
index 46a0d0b8c56..0b48f57014d 100644
--- a/gcc/config/rs6000/rs6000-passes.def
+++ b/gcc/config/rs6000/rs6000-passes.def
@@ -28,7 +28,9 @@  along with GCC; see the file COPYING3.  If not see
      The power8 does not have instructions that automaticaly do the byte swaps
      for loads and stores.  */
   INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
-
+  /* Pass to replace adjacent memory addresses lxv/stxv instruction with
+     lxvp/stxvp instruction.  */
+  INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion);
   /* Pass to do the PCREL_OPT optimization that combines the load of an
      external symbol's address along with a single load or store using that
      address as a base register.  */
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 09a57a806fa..1412b31c2eb 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -343,6 +343,7 @@  namespace gcc { class context; }
 class rtl_opt_pass;
 
 extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
+extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *);
 extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
 extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
 extern bool rs6000_quadword_masked_address_p (const_rtx exp);
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index b3ce09d523b..df9b3a35b66 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -35,6 +35,11 @@  rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
 	$(COMPILE) $<
 	$(POSTCOMPILE)
 
+rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
+
 rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
 	$(COMPILE) $<
 	$(POSTCOMPILE)
diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
index 9f897ac04e2..c048ebc0623 100644
--- a/gcc/pair-fusion.cc
+++ b/gcc/pair-fusion.cc
@@ -312,9 +312,9 @@  static int
 encode_lfs (lfs_fields fields)
 {
   int size_log2 = exact_log2 (fields.size);
-  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
-  return ((int)fields.load_p << 3)
-    | ((int)fields.fpsimd_p << 2)
+  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9);
+  return ((int)fields.load_p << 4)
+    | ((int)fields.fpsimd_p << 3)
     | (size_log2 - 2);
 }
 
@@ -322,8 +322,8 @@  encode_lfs (lfs_fields fields)
 static lfs_fields
 decode_lfs (int lfs)
 {
-  bool load_p = (lfs & (1 << 3));
-  bool fpsimd_p = (lfs & (1 << 2));
-  unsigned size = 1U << ((lfs & 3) + 2);
+  bool load_p = (lfs & (1 << 4));
+  bool fpsimd_p = (lfs & (1 << 3));
+  unsigned size = 1U << ((lfs & 7) + 2);
   return { load_p, fpsimd_p, size };
 }
@@ -425,6 +425,9 @@  pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
   if (MEM_VOLATILE_P (mem))
     return;
 
+  if (load_p && !m_pass->fuseable_load_p (insn))
+    return;
+
   // Ignore writeback accesses if the hook says to do so.
   if (!m_pass->should_handle_writeback (writeback::EXISTING)
       && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
@@ -1821,6 +1824,7 @@  pair_fusion_bb_info::fuse_pair (bool load_p,
 
   rtx reg_notes = combine_reg_notes (first, second, load_p);
 
+  m_pass->set_multiword_subreg (first, second, load_p);
   rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p);
   insn_change *pair_change = nullptr;
   auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
@@ -2383,6 +2387,9 @@  bool
 pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
 				    insn_info *i1, insn_info *i2)
 {
+  if (!m_pass->should_handle_unordered_insns (i1, i2))
+    return false;
+
   if (dump_file)
     fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
 	     load_p, i1->uid (), i2->uid ());
@@ -2411,6 +2418,15 @@  pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
       reg_ops[i] = XEXP (pats[i], !load_p);
     }
 
+  if (!load_p && !m_pass->fuseable_store_p (i1, i2))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
   if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
     {
       if (dump_file)
diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
index 2a38dc8f743..5371e48d3e2 100644
--- a/gcc/pair-fusion.h
+++ b/gcc/pair-fusion.h
@@ -171,6 +171,26 @@  struct pair_fusion {
   virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
 					  rtx regs[2], bool load_p) = 0;
 
+  // Given insn_info pair I1 and I2, sets subreg with multiword registers
+  // to assign register pairs by allocators.
+  // LOAD_P is true iff the pair is a load.
+  virtual void set_multiword_subreg (rtl_ssa::insn_info *i1, rtl_ssa::insn_info *i2,
+				     bool load_p) = 0;
+
+  // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
+  // store mem pairs.
+  // Return true if feasible to perform store mem pairs otherwise false.
+  virtual bool fuseable_store_p (rtl_ssa::insn_info *i1, rtl_ssa::insn_info *i2) = 0;
+
+  // Given insn_info INFO for a load, checks if it is feasible to use it
+  // in a load mem pair.
+  // Return true if feasible to perform load mem pairs otherwise false.
+  virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0;
+
+  // Given insn_info pair I1 and I2, return true if offsets are in order.
+  virtual bool should_handle_unordered_insns (rtl_ssa::insn_info *i1,
+					      rtl_ssa::insn_info *i2) = 0;
+
   void process_block (rtl_ssa::bb_info *bb);
   rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2],
 					 const rtl_ssa::insn_range_info
diff --git a/gcc/rtl-ssa/accesses.h b/gcc/rtl-ssa/accesses.h
index 7d2916d00c2..3e800932d86 100644
--- a/gcc/rtl-ssa/accesses.h
+++ b/gcc/rtl-ssa/accesses.h
@@ -379,6 +379,7 @@  public:
   //
   // This routine is only meaningful when def () is nonnull.
   bool is_last_use () const;
+  void set_is_live_out_use (bool value) { m_is_live_out_use = value; }
 
   // Print a description of def () to PP.
   void print_def (pretty_printer *pp) const;
@@ -430,7 +431,6 @@  private:
   void record_reference (rtx_obj_reference, bool);
   void set_insn (insn_info *);
   void set_def (set_info *set) { m_def = set; }
-  void set_is_live_out_use (bool value) { m_is_live_out_use = value; }
   void copy_prev_from (use_info *);
   void copy_next_from (use_info *);
   void set_last_use (use_info *);
diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
new file mode 100644
index 00000000000..d10ff0cdf36
--- /dev/null
+++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <altivec.h>
+	
+void
+foo2 ()
+{
+  __vector_quad *dst1;
+  __vector_quad *dst2;
+  vector unsigned char src;
+  __vector_quad acc;
+  vector unsigned char *ptr;
+  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
+  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+  *dst1 = acc;
+  __builtin_mma_xvf32ger(&acc, src, ptr[2]);
+  __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
+  *dst2 = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
new file mode 100644
index 00000000000..c523572cf3c
--- /dev/null
+++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */ 
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ 
+
+#include <altivec.h>
+
+void
+foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src)
+{
+  __vector_quad acc;
+  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
+  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+  *dst = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
index 69ee826e1be..ae29127f954 100644
--- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
@@ -258,8 +258,8 @@  foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
   dst[13] = acc;
 }
 
-/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
-/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
 /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
 /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
 /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */