diff mbox series

[committed] RISC-V: Infrastructure for instruction fusion

Message ID 289c3d37-398f-4761-b8be-4213d65d6df7@ventanamicro.com
State New
Headers show
Series [committed] RISC-V: Infrastructure for instruction fusion | expand

Commit Message

Jeff Law Nov. 19, 2023, 9:19 p.m. UTC
I've been meaning to extract this and upstream it for a long time.  The 
work is primarily Philipp from VRULL with one case added by Raphael and 
light bugfixing on my part.

Essentially there are 10 distinct fusions supported and they can be
selected individually by building a suitable mask in the uarch tuning
structure.  Additional cases can be added -- the bulk of the effort is
in recognizing the two fusible instructions.

The cases supported in this patch are all from the Veyron V1 processor, 
though the hope is they will be useful elsewhere.  I would encourage 
those familiar with other uarch implementations to enable fusion cases 
for those uarchs and extend the set of supported cases if any are missing.

Pushing to the trunk...

jeff
commit c177f28d601408180fdb2db0d5ba89d53b370b5e
Author: Philipp Tomsich <philipp.tomsich@vrull.eu>
Date:   Sun Nov 19 14:11:45 2023 -0700

    [committed] RISC-V: Infrastructure for instruction fusion
    
    I've been meaning to extract this and upstream it for a long time.  The work is
    primarily Philipp from VRULL with one case added by Raphael and light bugfixing
    on my part.
    
    Essentially there are 10 distinct fusions supported and they can be selected
    individually by building a suitable mask in the uarch tuning structure.
    Additional cases can be added -- the bulk of the effort is in recognizing the
    two fusible instructions.
    
    The cases supported in this patch are all from the Veyron V1 processor, though
    the hope is they will be useful elsewhere.  I would encourage those familiar
    with other uarch implementations to enable fusion cases for those uarchs and
    extend the set of supported cases if any are missing.
    
    gcc/
            * config/riscv/riscv-protos.h (extract_base_offset_in_addr): Prototype.
            * config/riscv/riscv.cc (riscv_fusion_pairs): New enum.
            (riscv_tune_param): Add fusible_ops field.
            (rocket_tune_info): Initialize new field.
            (sifive_7_tune_info): Likewise.
            (thead_c906_tune_info): Likewise.
            (generic_ooo_tune_info): Likewise.
            (optimize_size_tune_info): Likewise.
            (riscv_macro_fusion_p): New function.
            (riscv_fusion_enabled_p): Likewise.
            (riscv_macro_fusion_pair_p): Likewise.
            (TARGET_SCHED_MACRO_FUSION_P): Define.
            (TARGET_SCHED_MACRO_FUSION_PAIR_P): Likewise.
            (extract_base_offset_in_addr): Moved into riscv.cc from...
            * config/riscv/thead.cc: Here.
    
            Co-authored-by: Raphael Zinsly <rzinsly@ventanamicro.com>
            Co-authored-by: Jeff Law <jlaw@ventanamicro.com>
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 196b53f10f3..ae528db1898 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -626,6 +626,7 @@  extern bool riscv_expand_strcmp (rtx, rtx, rtx, rtx, rtx);
 extern bool riscv_expand_strlen (rtx, rtx, rtx, rtx);
 
 /* Routines implemented in thead.cc.  */
+extern bool extract_base_offset_in_addr (rtx, rtx *, rtx *);
 extern bool th_mempair_operands_p (rtx[4], bool, machine_mode);
 extern void th_mempair_order_operands (rtx[4], bool, machine_mode);
 extern void th_mempair_prepare_save_restore_operands (rtx[4], bool,
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index c2bd1c2ed29..3701f41b1b3 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -249,6 +249,21 @@  struct riscv_integer_op {
    The worst case is LUI, ADDI, SLLI, ADDI, SLLI, ADDI, SLLI, ADDI.  */
 #define RISCV_MAX_INTEGER_OPS 8
 
+enum riscv_fusion_pairs
+{
+  RISCV_FUSE_NOTHING = 0,               /* Empty mask: no fusion enabled.  */
+  RISCV_FUSE_ZEXTW = (1 << 0),          /* slli by 32, then srli by 32.  */
+  RISCV_FUSE_ZEXTH = (1 << 1),          /* slli by 48, then srli by 48.  */
+  RISCV_FUSE_ZEXTWS = (1 << 2),         /* slli by 32, then srli by < 32.  */
+  RISCV_FUSE_LDINDEXED = (1 << 3),      /* add reg,reg feeding a load.  */
+  RISCV_FUSE_LUI_ADDI = (1 << 4),       /* lui followed by addi.  */
+  RISCV_FUSE_AUIPC_ADDI = (1 << 5),     /* auipc followed by addi.  */
+  RISCV_FUSE_LUI_LD = (1 << 6),         /* lui followed by a load.  */
+  RISCV_FUSE_AUIPC_LD = (1 << 7),       /* auipc followed by a load.  */
+  RISCV_FUSE_LDPREINCREMENT = (1 << 8), /* add reg,imm feeding a load.  */
+  RISCV_FUSE_ALIGNED_STD = (1 << 9),    /* Two DImode sp/fp stores 8 apart.  */
+};
+
 /* Costs of various operations on the different architectures.  */
 
 struct riscv_tune_param
@@ -264,6 +279,7 @@  struct riscv_tune_param
   unsigned short fmv_cost;
   bool slow_unaligned_access;
   bool use_divmod_expansion;
+  unsigned int fusible_ops;
 };
 
 
@@ -344,6 +360,7 @@  static const struct riscv_tune_param rocket_tune_info = {
   8,						/* fmv_cost */
   true,						/* slow_unaligned_access */
   false,					/* use_divmod_expansion */
+  RISCV_FUSE_NOTHING,                           /* fusible_ops */
 };
 
 /* Costs to use when optimizing for Sifive 7 Series.  */
@@ -359,6 +376,7 @@  static const struct riscv_tune_param sifive_7_tune_info = {
   8,						/* fmv_cost */
   true,						/* slow_unaligned_access */
   false,					/* use_divmod_expansion */
+  RISCV_FUSE_NOTHING,                           /* fusible_ops */
 };
 
 /* Costs to use when optimizing for T-HEAD c906.  */
@@ -373,7 +391,8 @@  static const struct riscv_tune_param thead_c906_tune_info = {
   5,            /* memory_cost */
   8,		/* fmv_cost */
   false,            /* slow_unaligned_access */
-  false		/* use_divmod_expansion */
+  false,	/* use_divmod_expansion */
+  RISCV_FUSE_NOTHING,                           /* fusible_ops */
 };
 
 /* Costs to use when optimizing for a generic ooo profile.  */
@@ -389,6 +408,7 @@  static const struct riscv_tune_param generic_ooo_tune_info = {
   4,						/* fmv_cost */
   false,					/* slow_unaligned_access */
   false,					/* use_divmod_expansion */
+  RISCV_FUSE_NOTHING,                           /* fusible_ops */
 };
 
 /* Costs to use when optimizing for size.  */
@@ -404,6 +424,7 @@  static const struct riscv_tune_param optimize_size_tune_info = {
   8,						/* fmv_cost */
   false,					/* slow_unaligned_access */
   false,					/* use_divmod_expansion */
+  RISCV_FUSE_NOTHING,                           /* fusible_ops */
 };
 
 static bool riscv_avoid_shrink_wrapping_separate ();
@@ -7794,6 +7815,259 @@  riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
   return more - 1;
 }
 
+/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if tune_param enables
+   at least one fusion pair, i.e. fusible_ops is not RISCV_FUSE_NOTHING.  */
+
+static bool
+riscv_macro_fusion_p (void)
+{
+  return tune_param->fusible_ops != RISCV_FUSE_NOTHING;
+}
+
+/* Return true iff the fusion described by OP is set in fusible_ops.  */
+
+static bool
+riscv_fusion_enabled_p (enum riscv_fusion_pairs op)
+{
+  return (tune_param->fusible_ops & op) != 0;
+}
+
+/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
+   match an enabled fusion pair and so should be scheduled back to back.  */
+
+static bool
+riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
+{
+  rtx prev_set = single_set (prev);
+  rtx curr_set = single_set (curr);
+  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
+  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
+
+  if (!riscv_macro_fusion_p ())
+    return false;
+
+  if (simple_sets_p && (riscv_fusion_enabled_p (RISCV_FUSE_ZEXTW)
+			|| riscv_fusion_enabled_p (RISCV_FUSE_ZEXTWS)))
+    {
+      /* We are trying to match the following:
+	   prev (slli) == (set (reg:DI rD)
+			       (ashift:DI (reg:DI rS) (const_int 32)))
+	   curr (srli) == (set (reg:DI rD)
+			       (lshiftrt:DI (reg:DI rD) (const_int <shift>)))
+	 with <shift> being either 32 for FUSE_ZEXTW, or
+			 less than 32 for FUSE_ZEXTWS.  */
+
+      if (GET_CODE (SET_SRC (prev_set)) == ASHIFT
+	  && GET_CODE (SET_SRC (curr_set)) == LSHIFTRT
+	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set))
+	  && REGNO (SET_DEST (prev_set)) == REGNO (SET_DEST (curr_set))
+	  && REG_P (XEXP (SET_SRC (curr_set), 0))
+	  && REGNO (XEXP (SET_SRC (curr_set), 0)) == REGNO (SET_DEST (curr_set))
+	  && CONST_INT_P (XEXP (SET_SRC (prev_set), 1))
+	  && CONST_INT_P (XEXP (SET_SRC (curr_set), 1))
+	  && INTVAL (XEXP (SET_SRC (prev_set), 1)) == 32
+	  && ((INTVAL (XEXP (SET_SRC (curr_set), 1)) == 32
+	       && riscv_fusion_enabled_p (RISCV_FUSE_ZEXTW))
+	      || (INTVAL (XEXP (SET_SRC (curr_set), 1)) < 32
+		  && riscv_fusion_enabled_p (RISCV_FUSE_ZEXTWS))))
+	return true;
+    }
+
+  if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_ZEXTH))
+    {
+      /* We are trying to match the following:
+	   prev (slli) == (set (reg:DI rD)
+			       (ashift:DI (reg:DI rS) (const_int 48)))
+	   curr (srli) == (set (reg:DI rD)
+			       (lshiftrt:DI (reg:DI rD) (const_int 48))) */
+
+      if (GET_CODE (SET_SRC (prev_set)) == ASHIFT
+	  && GET_CODE (SET_SRC (curr_set)) == LSHIFTRT
+	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set))
+	  && REGNO (SET_DEST (prev_set)) == REGNO (SET_DEST (curr_set))
+	  && REG_P (XEXP (SET_SRC (curr_set), 0))
+	  && REGNO (XEXP (SET_SRC (curr_set), 0)) == REGNO (SET_DEST (curr_set))
+	  && CONST_INT_P (XEXP (SET_SRC (prev_set), 1))
+	  && CONST_INT_P (XEXP (SET_SRC (curr_set), 1))
+	  && INTVAL (XEXP (SET_SRC (prev_set), 1)) == 48
+	  && INTVAL (XEXP (SET_SRC (curr_set), 1)) == 48)
+	return true;
+    }
+
+  if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_LDINDEXED))
+    {
+      /* We are trying to match the following:
+	   prev (add) == (set (reg:DI rD)
+			      (plus:DI (reg:DI rS1) (reg:DI rS2)))
+	   curr (ld)  == (set (reg:DI rD)
+			      (mem:DI (reg:DI rD))) */
+
+      if (REG_P (SET_DEST (prev_set)) && MEM_P (SET_SRC (curr_set))
+	  && REG_P (XEXP (SET_SRC (curr_set), 0))
+	  && REGNO (XEXP (SET_SRC (curr_set), 0)) == REGNO (SET_DEST (prev_set))
+	  && GET_CODE (SET_SRC (prev_set)) == PLUS
+	  && REG_P (XEXP (SET_SRC (prev_set), 0))
+	  && REG_P (XEXP (SET_SRC (prev_set), 1)))
+	return true;
+
+      /* We are trying to match the following:
+	   prev (add) == (set (reg:DI rD)
+			      (plus:DI (reg:DI rS1) (reg:DI rS2)))
+	   curr (lw)  == (set (any_extend:DI (mem:SUBX (reg:DI rD)))) */
+
+      if ((GET_CODE (SET_SRC (curr_set)) == SIGN_EXTEND
+	   || GET_CODE (SET_SRC (curr_set)) == ZERO_EXTEND)
+	  && MEM_P (XEXP (SET_SRC (curr_set), 0)) && REG_P (SET_DEST (prev_set))
+	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
+	  && REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)) == REGNO (SET_DEST (prev_set))
+	  && GET_CODE (SET_SRC (prev_set)) == PLUS
+	  && REG_P (XEXP (SET_SRC (prev_set), 0))
+	  && REG_P (XEXP (SET_SRC (prev_set), 1)))
+	return true;
+    }
+
+  if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_LDPREINCREMENT))
+    {
+      /* We are trying to match the following:
+	   prev (add) == (set (reg:DI rS)
+			      (plus:DI (reg:DI rS) (const_int)))
+	   curr (ld)  == (set (reg:DI rD)
+			      (mem:DI (reg:DI rS))) */
+
+      if (REG_P (SET_DEST (prev_set)) && MEM_P (SET_SRC (curr_set))
+	  && REG_P (XEXP (SET_SRC (curr_set), 0))
+	  && REGNO (XEXP (SET_SRC (curr_set), 0)) == REGNO (SET_DEST (prev_set))
+	  && GET_CODE (SET_SRC (prev_set)) == PLUS
+	  && REG_P (XEXP (SET_SRC (prev_set), 0))
+	  && CONST_INT_P (XEXP (SET_SRC (prev_set), 1)))
+	return true;
+    }
+
+  if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_LUI_ADDI))
+    {
+      /* We are trying to match the following:
+	   prev (lui)  == (set (reg:DI rD) (const_int UPPER_IMM_20))
+	   curr (addi) == (set (reg:DI rD)
+			       (plus:DI (reg:DI rD) (const_int IMM12))) */
+
+      if ((GET_CODE (SET_SRC (curr_set)) == LO_SUM
+	   || (GET_CODE (SET_SRC (curr_set)) == PLUS
+	       && CONST_INT_P (XEXP (SET_SRC (curr_set), 1))
+	       && SMALL_OPERAND (INTVAL (XEXP (SET_SRC (curr_set), 1)))))
+	  && (GET_CODE (SET_SRC (prev_set)) == HIGH
+	      || (CONST_INT_P (SET_SRC (prev_set))
+		  && LUI_OPERAND (INTVAL (SET_SRC (prev_set))))))
+	return true;
+    }
+
+  if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_AUIPC_ADDI))
+    {
+      /* We are trying to match the following:
+	   prev (auipc) == (set (reg:DI rD) (unspec:DI [...] UNSPEC_AUIPC))
+	   curr (addi)  == (set (reg:DI rD)
+				(plus:DI (reg:DI rD) (const_int IMM12)))
+	 and
+	   prev (auipc) == (set (reg:DI rD) (unspec:DI [...] UNSPEC_AUIPC))
+	   curr (addi)  == (set (reg:DI rD)
+				(lo_sum:DI (reg:DI rD) (const_int IMM12))) */
+
+      if (GET_CODE (SET_SRC (prev_set)) == UNSPEC
+	  && XINT (SET_SRC (prev_set), 1) == UNSPEC_AUIPC
+	  && (GET_CODE (SET_SRC (curr_set)) == LO_SUM
+	      || (GET_CODE (SET_SRC (curr_set)) == PLUS
+		  && CONST_INT_P (XEXP (SET_SRC (curr_set), 1))
+		  && SMALL_OPERAND (INTVAL (XEXP (SET_SRC (curr_set), 1))))))
+	return true;
+    }
+
+  if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_LUI_LD))
+    {
+      /* We are trying to match the following:
+	   prev (lui)  == (set (reg:DI rD) (const_int UPPER_IMM_20))
+	   curr (ld)  == (set (reg:DI rD)
+			      (mem:DI (plus:DI (reg:DI rD) (const_int IMM12)))) */
+
+      if (CONST_INT_P (SET_SRC (prev_set))
+	  && LUI_OPERAND (INTVAL (SET_SRC (prev_set)))
+	  && MEM_P (SET_SRC (curr_set))
+	  && GET_CODE (XEXP (SET_SRC (curr_set), 0)) == PLUS)
+	return true;
+
+      if (GET_CODE (SET_SRC (prev_set)) == HIGH && REG_P (SET_DEST (prev_set))
+	  && MEM_P (SET_SRC (curr_set))
+	  && GET_CODE (XEXP (SET_SRC (curr_set), 0)) == LO_SUM
+	  && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
+	return true;
+
+      if (GET_CODE (SET_SRC (prev_set)) == HIGH && REG_P (SET_DEST (prev_set))
+	  && (GET_CODE (SET_SRC (curr_set)) == SIGN_EXTEND
+	      || GET_CODE (SET_SRC (curr_set)) == ZERO_EXTEND)
+	  && MEM_P (XEXP (SET_SRC (curr_set), 0))
+	  && (GET_CODE (XEXP (XEXP (SET_SRC (curr_set), 0), 0)) == LO_SUM
+	      && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (XEXP (XEXP (SET_SRC (curr_set), 0), 0), 0))))
+	return true;
+    }
+
+  if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_AUIPC_LD))
+    {
+      /* We are trying to match the following:
+	   prev (auipc) == (set (reg:DI rD) (unspec:DI [...] UNSPEC_AUIPC))
+	   curr (ld)  == (set (reg:DI rD)
+			      (mem:DI (plus:DI (reg:DI rD) (const_int IMM12)))) */
+
+      if (GET_CODE (SET_SRC (prev_set)) == UNSPEC
+	  && XINT (SET_SRC (prev_set), 1) == UNSPEC_AUIPC
+	  && MEM_P (SET_SRC (curr_set))
+	  && GET_CODE (XEXP (SET_SRC (curr_set), 0)) == PLUS)
+	return true;
+    }
+
+  if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_ALIGNED_STD))
+    {
+      /* We are trying to match the following:
+	   prev (sd) == (set (mem (plus (reg sp|fp) (const_int)))
+			      (reg rS1))
+	   curr (sd) == (set (mem (plus (reg sp|fp) (const_int)))
+			      (reg rS2)) */
+
+      if (MEM_P (SET_DEST (prev_set))
+	  && MEM_P (SET_DEST (curr_set))
+	  /* We can probably relax this condition.  The documentation is a bit
+	     unclear about sub-word cases.  So we just model DImode for now.  */
+	  && GET_MODE (SET_DEST (curr_set)) == DImode
+	  && GET_MODE (SET_DEST (prev_set)) == DImode)
+	{
+	  rtx base_prev, base_curr, offset_prev, offset_curr;
+
+	  extract_base_offset_in_addr (SET_DEST (prev_set), &base_prev, &offset_prev);
+	  extract_base_offset_in_addr (SET_DEST (curr_set), &base_curr, &offset_curr);
+
+	  /* The two stores must be contained within opposite halves of the same
+	     16 byte aligned block of memory.  We know that the stack pointer and
+	     the frame pointer have suitable alignment, so checking the offsets
+	     of the two stores is enough.  (MEM_ALIGN was tried first, but it
+	     reported incorrect alignments, even for SP/FP accesses.)
+
+	     NOTE(review): the test below requires the higher offset to be a
+	     multiple of 16, i.e. the pair straddles a boundary; confirm.  */
+	  if (base_prev != NULL_RTX
+	      && base_curr != NULL_RTX
+	      && REG_P (base_prev)
+	      && REG_P (base_curr)
+	      && REGNO (base_prev) == REGNO (base_curr)
+	      && (REGNO (base_prev) == STACK_POINTER_REGNUM
+		  || REGNO (base_prev) == HARD_FRAME_POINTER_REGNUM)
+	      && ((INTVAL (offset_prev) == INTVAL (offset_curr) + 8
+		   && (INTVAL (offset_prev) % 16) == 0)
+		  || ((INTVAL (offset_curr) == INTVAL (offset_prev) + 8)
+		      && (INTVAL (offset_curr) % 16) == 0)))
+	    return true;
+	}
+    }
+
+  return false;
+}
+
 /* Adjust the cost/latency of instructions for scheduling.
    For now this is just used to change the latency of vector instructions
    according to their LMUL.  We assume that an insn with LMUL == 8 requires
@@ -9780,6 +10054,40 @@  riscv_preferred_else_value (unsigned ifn, tree vectype, unsigned int nops,
   return default_preferred_else_value (ifn, vectype, nops, ops);
 }
 
+/* Split the address of MEM into a register base and a constant offset.
+   Return true and set *BASE and *OFFSET if the address is a bare register
+   or register plus CONST_INT; otherwise clear both and return false.  */
+
+bool
+extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
+{
+  gcc_assert (MEM_P (mem));
+
+  rtx addr = XEXP (mem, 0);
+  bool matched_p = true;
+
+  if (REG_P (addr))
+    {
+      /* Bare register: the offset is implicitly zero.  */
+      *base = addr;
+      *offset = const0_rtx;
+    }
+  else if (GET_CODE (addr) == PLUS
+	   && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
+    {
+      /* Register plus constant displacement.  */
+      *base = XEXP (addr, 0);
+      *offset = XEXP (addr, 1);
+    }
+  else
+    {
+      *base = NULL_RTX;
+      *offset = NULL_RTX;
+      matched_p = false;
+    }
+  return matched_p;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -9802,6 +10110,10 @@  riscv_preferred_else_value (unsigned ifn, tree vectype, unsigned int nops,
 
 #undef TARGET_SCHED_ISSUE_RATE
 #define TARGET_SCHED_ISSUE_RATE riscv_issue_rate
+#undef TARGET_SCHED_MACRO_FUSION_P
+#define TARGET_SCHED_MACRO_FUSION_P riscv_macro_fusion_p
+#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
+#define TARGET_SCHED_MACRO_FUSION_PAIR_P riscv_macro_fusion_pair_p
 
 #undef  TARGET_SCHED_VARIABLE_ISSUE
 #define TARGET_SCHED_VARIABLE_ISSUE riscv_sched_variable_issue
diff --git a/gcc/config/riscv/thead.cc b/gcc/config/riscv/thead.cc
index a485fb1fba6..bd9af7ecd60 100644
--- a/gcc/config/riscv/thead.cc
+++ b/gcc/config/riscv/thead.cc
@@ -36,40 +36,6 @@ 
 #include "regs.h"
 #include "riscv-protos.h"
 
-/* If MEM is in the form of "base+offset", extract the two parts
-   of address and set to BASE and OFFSET, otherwise return false
-   after clearing BASE and OFFSET.  */
-
-static bool
-extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
-{
-  rtx addr;
-
-  gcc_assert (MEM_P (mem));
-
-  addr = XEXP (mem, 0);
-
-  if (REG_P (addr))
-    {
-      *base = addr;
-      *offset = const0_rtx;
-      return true;
-    }
-
-  if (GET_CODE (addr) == PLUS
-      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
-    {
-      *base = XEXP (addr, 0);
-      *offset = XEXP (addr, 1);
-      return true;
-    }
-
-  *base = NULL_RTX;
-  *offset = NULL_RTX;
-
-  return false;
-}
-
 /* If X is a PLUS of a CONST_INT, return the two terms in *BASE_PTR
    and *OFFSET_PTR.  Return X in *BASE_PTR and 0 in *OFFSET_PTR otherwise.  */