diff mbox

[ARM] Implement TARGET_SCHED_MACRO_FUSION_PAIR_P

Message ID 5461F922.1020106@arm.com
State New
Headers show

Commit Message

Kyrylo Tkachov Nov. 11, 2014, 11:55 a.m. UTC
Hi all,

This is the arm implementation of the macro fusion hook.
It tries to fuse movw+movt operations together. It also tries to take 
lo_sum RTXs into account since those generate movt instructions as well.

Bootstrapped and tested on arm-none-linux-gnueabihf.

Ok for trunk?

Thanks,
Kyrill

2014-11-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * config/arm/arm-protos.h (tune_params): Add fuseable_ops field.
     * config/arm/arm.c (arm_macro_fusion_p): New function.
     (aarch_macro_fusion_pair_p): Likewise.
     (TARGET_SCHED_MACRO_FUSION_P): Define.
     (TARGET_SCHED_MACRO_FUSION_PAIR_P): Likewise.
     (ARM_FUSE_NOTHING): Likewise.
     (ARM_FUSE_MOVW_MOVT): Likewise.
     (arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,
     arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,
     arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,
     arm_cortex_a53_tune, arm_cortex_a57_tune, arm_cortex_a9_tune,
     arm_cortex_a12_tune, arm_v7m_tune, arm_v6m_tune, arm_fa726te_tune,
     arm_cortex_a5_tune): Specify fuseable_ops value.

Comments

Kyrylo Tkachov Nov. 27, 2014, 2:57 p.m. UTC | #1
Ping.

Thanks,
Kyrill

On 11/11/14 11:55, Kyrill Tkachov wrote:
> Hi all,
>
> This is the arm implementation of the macro fusion hook.
> It tries to fuse movw+movt operations together. It also tries to take
> lo_sum RTXs into account since those generate movt instructions as well.
>
> Bootstrapped and tested on arm-none-linux-gnueabihf.
>
> Ok for trunk?
>
> Thanks,
> Kyrill
>
> 2014-11-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>       * config/arm/arm-protos.h (tune_params): Add fuseable_ops field.
>       * config/arm/arm.c (arm_macro_fusion_p): New function.
>       (arm_macro_fusion_pair_p): Likewise.
>       (TARGET_SCHED_MACRO_FUSION_P): Define.
>       (TARGET_SCHED_MACRO_FUSION_PAIR_P): Likewise.
>       (ARM_FUSE_NOTHING): Likewise.
>       (ARM_FUSE_MOVW_MOVT): Likewise.
>       (arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,
>       arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,
>       arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,
>       arm_cortex_a53_tune, arm_cortex_a57_tune, arm_cortex_a9_tune,
>       arm_cortex_a12_tune, arm_v7m_tune, arm_v6m_tune, arm_fa726te_tune
>       arm_cortex_a5_tune): Specify fuseable_ops value.
Ramana Radhakrishnan Dec. 2, 2014, 10:58 p.m. UTC | #2
On Tue, Nov 11, 2014 at 11:55 AM, Kyrill Tkachov <kyrylo.tkachov@arm.com> wrote:
> Hi all,
>
> This is the arm implementation of the macro fusion hook.
> It tries to fuse movw+movt operations together. It also tries to take lo_sum
> RTXs into account since those generate movt instructions as well.
>
> Bootstrapped and tested on arm-none-linux-gnueabihf.
>
> Ok for trunk?



>  if (current_tune->fuseable_ops & ARM_FUSE_MOVW_MOVT)
>+    {
>+      /* We are trying to fuse
>+         movw imm / movt imm
>+         instructions as a group that gets scheduled together.  */
>+

A comment here about the insn structure would be useful.

>+      set_dest = SET_DEST (curr_set);
>+      if (GET_CODE (set_dest) == ZERO_EXTRACT)
>+        {
>+          if (CONST_INT_P (SET_SRC (curr_set))
>+          && CONST_INT_P (SET_SRC (prev_set))
>+          && REG_P (XEXP (set_dest, 0))
>+          && REG_P (SET_DEST (prev_set))
>+          && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
>+        return true;
>+        }
>+      else if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
>+               && REG_P (SET_DEST (curr_set))
>+               && REG_P (SET_DEST (prev_set))
>+               && GET_CODE (SET_SRC (prev_set)) == HIGH
>+               && REGNO (SET_DEST (curr_set)) == REGNO (SET_DEST (prev_set)))
>+        {
>+          return true;
>+        }

Can we add a fast-path exit here, along the lines of:

if (GET_MODE (set_dest) != SImode)
  return false;

I did wonder whether we wanted to use reg_overlap_mentioned_p, as that
may simplify the logic a bit, but that's overkill here as we still
want to restrict it to the cases above.

Otherwise OK.

Ramana




>+    }
>+  return false;

>
> Thanks,
> Kyrill
>
> 2014-11-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>     * config/arm/arm-protos.h (tune_params): Add fuseable_ops field.
>     * config/arm/arm.c (arm_macro_fusion_p): New function.
>     (arm_macro_fusion_pair_p): Likewise.
>     (TARGET_SCHED_MACRO_FUSION_P): Define.
>     (TARGET_SCHED_MACRO_FUSION_PAIR_P): Likewise.
>     (ARM_FUSE_NOTHING): Likewise.
>     (ARM_FUSE_MOVW_MOVT): Likewise.
>     (arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,
>     arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,
>     arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,
>     arm_cortex_a53_tune, arm_cortex_a57_tune, arm_cortex_a9_tune,
>     arm_cortex_a12_tune, arm_v7m_tune, arm_v6m_tune, arm_fa726te_tune
>     arm_cortex_a5_tune): Specify fuseable_ops value.
diff mbox

Patch

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index a37aa80..98e3cf0 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -281,6 +281,8 @@  struct tune_params
   bool string_ops_prefer_neon;
   /* Maximum number of instructions to inline calls to memset.  */
   int max_insns_inline_memset;
+  /* Bitfield encoding the fuseable pairs of instructions.  */
+  unsigned int fuseable_ops;
 };
 
 extern const struct tune_params *current_tune;
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 3f2ddd4..40df4c0 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -258,6 +258,7 @@  static tree arm_build_builtin_va_list (void);
 static void arm_expand_builtin_va_start (tree, rtx);
 static tree arm_gimplify_va_arg_expr (tree, tree, gimple_seq *, gimple_seq *);
 static void arm_option_override (void);
+static bool arm_macro_fusion_p (void);
 static unsigned HOST_WIDE_INT arm_shift_truncation_mask (machine_mode);
 static bool arm_cannot_copy_insn_p (rtx_insn *);
 static int arm_issue_rate (void);
@@ -296,6 +297,7 @@  static int arm_default_branch_cost (bool, bool);
 static int arm_cortex_a5_branch_cost (bool, bool);
 static int arm_cortex_m_branch_cost (bool, bool);
 
+static bool aarch_macro_fusion_pair_p (rtx_insn*, rtx_insn*);
 static bool arm_vectorize_vec_perm_const_ok (machine_mode vmode,
 					     const unsigned char *sel);
 
@@ -404,6 +406,12 @@  static const struct attribute_spec arm_attribute_table[] =
 #undef  TARGET_COMP_TYPE_ATTRIBUTES
 #define TARGET_COMP_TYPE_ATTRIBUTES arm_comp_type_attributes
 
+#undef TARGET_SCHED_MACRO_FUSION_P
+#define TARGET_SCHED_MACRO_FUSION_P arm_macro_fusion_p
+
+#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
+#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
+
 #undef  TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
 #define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES arm_set_default_type_attributes
 
@@ -1710,6 +1718,9 @@  const struct cpu_cost_table v7m_extra_costs =
   }
 };
 
+#define ARM_FUSE_NOTHING	(0)
+#define ARM_FUSE_MOVW_MOVT	(1 << 0)
+
 const struct tune_params arm_slowmul_tune =
 {
   arm_slowmul_rtx_costs,
@@ -1726,7 +1737,8 @@  const struct tune_params arm_slowmul_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_fastmul_tune =
@@ -1745,7 +1757,8 @@  const struct tune_params arm_fastmul_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 /* StrongARM has early execution of branches, so a sequence that is worth
@@ -1767,7 +1780,8 @@  const struct tune_params arm_strongarm_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_xscale_tune =
@@ -1786,7 +1800,8 @@  const struct tune_params arm_xscale_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_9e_tune =
@@ -1805,7 +1820,8 @@  const struct tune_params arm_9e_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_v6t2_tune =
@@ -1824,7 +1840,8 @@  const struct tune_params arm_v6t2_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 /* Generic Cortex tuning.  Use more specific tunings if appropriate.  */
@@ -1844,7 +1861,8 @@  const struct tune_params arm_cortex_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_cortex_a8_tune =
@@ -1863,7 +1881,8 @@  const struct tune_params arm_cortex_a8_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   true,						/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_cortex_a7_tune =
@@ -1882,7 +1901,8 @@  const struct tune_params arm_cortex_a7_tune =
   false,					/* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   true,						/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_cortex_a15_tune =
@@ -1901,7 +1921,8 @@  const struct tune_params arm_cortex_a15_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   true, true,                                   /* Prefer 32-bit encodings.  */
   true,						/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_cortex_a53_tune =
@@ -1920,7 +1941,8 @@  const struct tune_params arm_cortex_a53_tune =
   false,					/* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_MOVW_MOVT				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_cortex_a57_tune =
@@ -1939,7 +1961,8 @@  const struct tune_params arm_cortex_a57_tune =
   false,                                       /* Prefer Neon for 64-bits bitops.  */
   true, true,                                  /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_MOVW_MOVT				/* Fuseable pairs of instructions.  */
 };
 
 /* Branches can be dual-issued on Cortex-A5, so conditional execution is
@@ -1961,7 +1984,8 @@  const struct tune_params arm_cortex_a5_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   true,						/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_cortex_a9_tune =
@@ -1980,7 +2004,8 @@  const struct tune_params arm_cortex_a9_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_cortex_a12_tune =
@@ -1999,7 +2024,8 @@  const struct tune_params arm_cortex_a12_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   true,						/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_MOVW_MOVT				/* Fuseable pairs of instructions.  */
 };
 
 /* armv7m tuning.  On Cortex-M4 cores for example, MOVW/MOVT take a single
@@ -2046,7 +2072,8 @@  const struct tune_params arm_cortex_m7_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
@@ -2067,7 +2094,8 @@  const struct tune_params arm_v6m_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 const struct tune_params arm_fa726te_tune =
@@ -2086,7 +2114,8 @@  const struct tune_params arm_fa726te_tune =
   false,                                        /* Prefer Neon for 64-bits bitops.  */
   false, false,                                 /* Prefer 32-bit encodings.  */
   false,					/* Prefer Neon for stringops.  */
-  8						/* Maximum insns to inline memset.  */
+  8,						/* Maximum insns to inline memset.  */
+  ARM_FUSE_NOTHING				/* Fuseable pairs of instructions.  */
 };
 
 
@@ -32222,6 +32251,59 @@  arm_gen_setmem (rtx *operands)
   return arm_block_set_aligned_non_vect (dstbase, length, value, align);
 }
 
+/* TARGET_SCHED_MACRO_FUSION_P hook: does the current tuning fuse anything?  */
+static bool
+arm_macro_fusion_p (void)
+{
+  return current_tune->fuseable_ops != ARM_FUSE_NOTHING;
+}
+
+/* TARGET_SCHED_MACRO_FUSION_PAIR_P hook: true if PREV and CURR should fuse.  */
+static bool
+aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
+{
+  rtx set_dest;
+  rtx prev_set = single_set (prev);
+  rtx curr_set = single_set (curr);
+  /* Bail out unless single_set found a SET for both insns.  */
+  if (!prev_set
+      || !curr_set)
+    return false;
+  /* Reject the pair when any_condjump_p holds for CURR.  */
+  if (any_condjump_p (curr))
+    return false;
+  /* Nothing to match when the current tuning enables no fusion at all.  */
+  if (!arm_macro_fusion_p ())
+    return false;
+  /* The movw/movt checks only run when the tuning sets ARM_FUSE_MOVW_MOVT.  */
+  if (current_tune->fuseable_ops & ARM_FUSE_MOVW_MOVT)
+    {
+      /* We are trying to fuse
+         movw imm / movt imm
+         instructions as a group that gets scheduled together.  */
+      /* CURR is matched as a ZERO_EXTRACT destination or a LO_SUM source.  */
+      set_dest = SET_DEST (curr_set);
+      if (GET_CODE (set_dest) == ZERO_EXTRACT)
+        {
+          if (CONST_INT_P (SET_SRC (curr_set))
+	      && CONST_INT_P (SET_SRC (prev_set))
+	      && REG_P (XEXP (set_dest, 0))
+	      && REG_P (SET_DEST (prev_set))
+	      && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
+	    return true;
+        }
+      else if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
+               && REG_P (SET_DEST (curr_set))
+               && REG_P (SET_DEST (prev_set))
+               && GET_CODE (SET_SRC (prev_set)) == HIGH
+               && REGNO (SET_DEST (curr_set)) == REGNO (SET_DEST (prev_set)))
+        {
+          return true;
+        }
+    }
+  return false;
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT