diff mbox series

[avr,applied] Create more opportunities for the -mfuse-add optimization pass

Message ID e75bbaab-16e7-4ff4-b8f4-afae843c92a9@gjlay.de
State New
Headers show
Series [avr,applied] Create more opportunities for the -mfuse-add optimization pass | expand

Commit Message

Georg-Johann Lay July 6, 2024, 10:15 a.m. UTC
Up to now, the post-reload split for fake PLUS addresses
was only run on AVR_TINY.  However, non-AVR_TINY cores also
have address registers that don't support PLUS addressing:
the X register, and the Z register when used with [E]LPM.

This patch also splits these patterns.  The fuse-add pass can
already handle all the generated RTXes.

Johann

--

AVR: Create more opportunities for -mfuse-add optimization.

avr_split_tiny_move() was only run for AVR_TINY because it has no PLUS
addressing modes.  The same applies to the X register on ordinary cores,
and also to the Z register when used with [E]LPM.  For example, without
this patch

long long addLL (long long *a, long long *b)
{
   return *a + *b;
}

compiles with "-mmcu=atmega128 -Os -dp" to:

     ...
     movw r26,r24     ;  80  [c=4 l=1]  *movhi/0
     movw r30,r22     ;  81  [c=4 l=1]  *movhi/0
     ld r18,X         ;  82  [c=4 l=1]  movqi_insn/3
     adiw r26,1   ;  83  [c=4 l=3]  movqi_insn/3
     ld r19,X
     sbiw r26,1
     adiw r26,2   ;  84  [c=4 l=3]  movqi_insn/3
     ld r20,X
     sbiw r26,2
     adiw r26,3   ;  85  [c=4 l=3]  movqi_insn/3
     ld r21,X
     sbiw r26,3
     adiw r26,4   ;  86  [c=4 l=3]  movqi_insn/3
     ld r22,X
     sbiw r26,4
     adiw r26,5   ;  87  [c=4 l=3]  movqi_insn/3
     ld r23,X
     sbiw r26,5
     adiw r26,6   ;  88  [c=4 l=3]  movqi_insn/3
     ld r24,X
     sbiw r26,6
     adiw r26,7   ;  89  [c=4 l=2]  movqi_insn/3
     ld r25,X
     ld r10,Z         ;  90  [c=4 l=1]  movqi_insn/3
     ...

whereas with this patch it becomes:

     ...
     movw r26,r24     ;  80  [c=4 l=1]  *movhi/0
     movw r30,r22     ;  81  [c=4 l=1]  *movhi/0
     ld r18,X+        ;  140 [c=4 l=1]  movqi_insn/3
     ld r19,X+        ;  142 [c=4 l=1]  movqi_insn/3
     ld r20,X+        ;  144 [c=4 l=1]  movqi_insn/3
     ld r21,X+        ;  146 [c=4 l=1]  movqi_insn/3
     ld r22,X+        ;  148 [c=4 l=1]  movqi_insn/3
     ld r23,X+        ;  150 [c=4 l=1]  movqi_insn/3
     ld r24,X+        ;  152 [c=4 l=1]  movqi_insn/3
     ld r25,X         ;  109 [c=4 l=1]  movqi_insn/3
     ld r10,Z         ;  111 [c=4 l=1]  movqi_insn/3
     ...

gcc/
	* config/avr/avr.md: Also split with avr_split_tiny_move()
	for non-AVR_TINY.
	* config/avr/avr.cc (avr_split_tiny_move): Don't change memory
	references with base regs that can do PLUS addressing.
	(avr_out_lpm_no_lpmx) [POST_INC]: Don't output final ADIW when the
	address register is unused after.
gcc/testsuite/
	* gcc.target/avr/torture/fuse-add.c: New test.
diff mbox series

Patch

diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index f048bf5fd41..d299fceb782 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -4471,28 +4471,21 @@  avr_out_lpm_no_lpmx (rtx_insn *insn, rtx *xop, int *plen)
       gcc_assert (REG_Z == REGNO (XEXP (addr, 0))
 		  && n_bytes <= 4);
 
-      if (regno_dest == LPM_REGNO)
-	avr_asm_len ("%4lpm"      CR_TAB
-		     "adiw %2,1", xop, plen, 2);
-      else
-	avr_asm_len ("%4lpm"      CR_TAB
-		     "mov %A0,%3" CR_TAB
-		     "adiw %2,1", xop, plen, 3);
+      for (int i = 0; i < n_bytes; ++i)
+	{
+	  rtx reg = simplify_gen_subreg (QImode, dest, GET_MODE (dest), i);
 
-      if (n_bytes >= 2)
-	avr_asm_len ("%4lpm"      CR_TAB
-		     "mov %B0,%3" CR_TAB
-		     "adiw %2,1", xop, plen, 3);
+	  if (i > 0)
+	    avr_asm_len ("adiw %2,1", xop, plen, 1);
 
-      if (n_bytes >= 3)
-	avr_asm_len ("%4lpm"      CR_TAB
-		     "mov %C0,%3" CR_TAB
-		     "adiw %2,1", xop, plen, 3);
+	  avr_asm_len ("%4lpm", xop, plen, 1);
 
-      if (n_bytes >= 4)
-	avr_asm_len ("%4lpm"      CR_TAB
-		     "mov %D0,%3" CR_TAB
-		     "adiw %2,1", xop, plen, 3);
+	  if (REGNO (reg) != LPM_REGNO)
+	    avr_asm_len ("mov %0,r0", &reg, plen, 1);
+	}
+
+      if (! _reg_unused_after (insn, xop[2], false))
+	avr_asm_len ("adiw %2,1", xop, plen, 1);
 
       break; /* POST_INC */
 
@@ -6685,6 +6678,14 @@  avr_split_tiny_move (rtx_insn * /*insn*/, rtx *xop)
   if (REGNO (base) > REG_Z)
     return false;
 
+  if (! AVR_TINY
+      // Only keep base registers that can't do PLUS addressing.
+      && ((REGNO (base) != REG_X
+	   && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (mem)))
+	  || avr_load_libgcc_p (mem)
+	  || avr_mem_memx_p (mem)))
+    return false;
+
   bool volatile_p = MEM_VOLATILE_P (mem);
   bool mem_volatile_p = false;
   if (frame_pointer_needed
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index dabf4c0fc5a..2783b8c986f 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -1035,8 +1035,7 @@  (define_split
   [(parallel [(set (match_operand:MOVMODE 0 "nonimmediate_operand")
                    (match_operand:MOVMODE 1 "general_operand"))
               (clobber (reg:CC REG_CC))])]
-  "AVR_TINY
-   && reload_completed
+  "reload_completed
    && avr_fuse_add > 0
    // Only split this for .split2 when we are before
    // pass .avr-fuse-add (which runs after proep).
diff --git a/gcc/testsuite/gcc.target/avr/torture/fuse-add.c b/gcc/testsuite/gcc.target/avr/torture/fuse-add.c
new file mode 100644
index 00000000000..b78b1aa9fc9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/torture/fuse-add.c
@@ -0,0 +1,59 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-std=gnu99" } */
+
+typedef __UINT64_TYPE__ uint64_t;
+
+extern const uint64_t aa __asm ("real_aa");
+extern const uint64_t bb __asm ("real_bb");
+
+__attribute__((used)) const uint64_t real_aa = 0x1122334455667788;
+__attribute__((used)) const uint64_t real_bb = 0x0908070605040302;
+
+__attribute__((noinline,noclone))
+uint64_t add1 (const uint64_t *aa, const uint64_t *bb)
+{
+  return *aa + *bb;
+}
+
+#ifdef __FLASH
+extern const __flash uint64_t fa __asm ("real_fa");
+extern const __flash uint64_t fb __asm ("real_fb");
+
+__attribute__((used)) const __flash uint64_t real_fa = 0x1122334455667788;
+__attribute__((used)) const __flash uint64_t real_fb = 0x0908070605040302;
+
+__attribute__((noinline,noclone))
+uint64_t add2 (const __flash uint64_t *aa, const uint64_t *bb)
+{
+  return *aa + *bb;
+}
+
+uint64_t add3 (const uint64_t *aa, const __flash uint64_t *bb)
+{
+  return *aa + *bb;
+}
+
+uint64_t add4 (const __flash uint64_t *aa, const __flash uint64_t *bb)
+{
+  return *aa + *bb;
+}
+#endif /* have __flash */
+
+int main (void)
+{
+  if (add1 (&aa, &bb) != real_aa + real_bb)
+    __builtin_exit (__LINE__);
+
+#ifdef __FLASH
+  if (add2 (&fa, &bb) != real_fa + real_bb)
+    __builtin_exit (__LINE__);
+
+  if (add3 (&aa, &fb) != real_aa + real_fb)
+    __builtin_exit (__LINE__);
+
+  if (add4 (&fa, &fb) != real_fa + real_fb)
+    __builtin_exit (__LINE__);
+#endif
+
+  return 0;
+}