diff mbox series

[RFC] ARM: thumb1: Use LDMIA/STMIA for DI/DF loads/stores

Message ID 20240616085145.751639-1-lis8215@gmail.com
State New
Headers show
Series [RFC] ARM: thumb1: Use LDMIA/STMIA for DI/DF loads/stores | expand

Commit Message

Siarhei Volkau June 16, 2024, 8:51 a.m. UTC
If the address register is dead after a load/store operation, it looks
beneficial to use LDMIA/STMIA instead of a pair of LDR/STR instructions,
at least when optimizing for size.

E.g.
 ldr r0, [r3, #0]
 ldr r1, [r3, #4]  @ r3 is dead after
will be replaced by
 ldmia r3!, {r0, r1}

Also, when the address register is reused as the last destination register, it is legal to transform:
 ldr r2, [r3, #0]
 ldr r3, [r3, #4] @ r3 reused
will be replaced by
 ldmia r3, {r2, r3}

However, I know little about Thumb CPUs other than Cortex-M0/M0+.
1. Are there any drawbacks when optimizing for speed?
2. Might it be profitable for Thumb-2?

Regarding code size, the patch gives the following results for v6-m/nofp:
       libgcc:  -52 bytes / -0.10%
Newlib's libc:  -68 bytes / -0.03%
         libm:  -96 bytes / -0.10%
    libstdc++: -140 bytes / -0.02%

Also, I have questions regarding testing the patch.
It's unclear how to do it properly: for now I compile
for the arm-none-eabi target, and make check seems to fail
on every compilable test due to missing symbols from libnosys.
I guess that arm-gnu-elf is the correct triple, but I would still
appreciate advice on the proper commands to build and run the testsuite.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
---
 gcc/config/arm/arm-protos.h |  2 +-
 gcc/config/arm/arm.cc       |  7 ++++++-
 gcc/config/arm/thumb1.md    | 10 ++++++++--
 3 files changed, 15 insertions(+), 4 deletions(-)

Comments

Richard Earnshaw (lists) June 17, 2024, 12:43 p.m. UTC | #1
Hi Siarhei,

On 16/06/2024 09:51, Siarhei Volkau wrote:
> If the address register is dead after load/store operation it looks
> beneficial to use LDMIA/STMIA instead of pair of LDR/STR instructions,
> at least if optimizing for size.
> 
> E.g.
>  ldr r0, [r3, #0]
>  ldr r1, [r3, #4]  @ r3 is dead after
> will be replaced by
>  ldmia r3!, {r0, r1}
> 
> also for reused reg is legal to:
>  ldr r2, [r3, #0]
>  ldr r3, [r3, #4] @ r3 reused
> will be replaced by
>  ldmia r3, {r2, r3}
> 
> However, I know little about other thumb CPUs except Cortex M0/M0+.
> 1. Is there any drawbacks if optimizing speed?
> 2. Might it be profitable for thumb2?

I like the idea behind this patch, but I think I'd first try doing this as a peephole2 rule to rewrite the address in this case.  That has the additional advantage that we then estimate the size of the instruction more accurately.

I think it would then be easy to extend this to thumb2 as well if it looks like a win (perhaps only for -Os in the thumb2 case).


> 
> Regarding code size with the patch gives for v6-m/nofp:
>        libgcc:  -52 bytes / -0.10%
> Newlib's libc:  -68 bytes / -0.03%
>          libm:  -96 bytes / -0.10%
>     libstdc++: -140 bytes / -0.02%
> 
> Also I have questions regarding testing the patch.
> It's obscure how to do it properly, for now I compile
> for arm-none-eabi target and make check seems failing
> on any compilable test due to missing symbols from libnosys.
> I guess that arm-gnu-elf is the correct triple but it still
> advisable for proper commands to make & run the testsuite.

For testing, I'd start with something like gcc/testsuite/gcc.target/arm/thumb-andsi.c as a template and adapt that for your specific case.  Matching something like "ldmia\tr[0-7]!," should be enough.

R.

> 
> Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
> ---
>  gcc/config/arm/arm-protos.h |  2 +-
>  gcc/config/arm/arm.cc       |  7 ++++++-
>  gcc/config/arm/thumb1.md    | 10 ++++++++--
>  3 files changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
> index 2cd560c9925..548bfbaccdc 100644
> --- a/gcc/config/arm/arm-protos.h
> +++ b/gcc/config/arm/arm-protos.h
> @@ -254,7 +254,7 @@ extern int thumb_shiftable_const (unsigned HOST_WIDE_INT);
>  extern enum arm_cond_code maybe_get_arm_condition_code (rtx);
>  extern void thumb1_final_prescan_insn (rtx_insn *);
>  extern void thumb2_final_prescan_insn (rtx_insn *);
> -extern const char *thumb_load_double_from_address (rtx *);
> +extern const char *thumb_load_double_from_address (rtx *, rtx_insn *);
>  extern const char *thumb_output_move_mem_multiple (int, rtx *);
>  extern const char *thumb_call_via_reg (rtx);
>  extern void thumb_expand_cpymemqi (rtx *);
> diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
> index b8c32db0a1d..73c2478ed77 100644
> --- a/gcc/config/arm/arm.cc
> +++ b/gcc/config/arm/arm.cc
> @@ -28350,7 +28350,7 @@ thumb1_output_interwork (void)
>     a computed memory address.  The computed address may involve a
>     register which is overwritten by the load.  */
>  const char *
> -thumb_load_double_from_address (rtx *operands)
> +thumb_load_double_from_address (rtx *operands, rtx_insn *insn)
>  {
>    rtx addr;
>    rtx base;
> @@ -28368,6 +28368,11 @@ thumb_load_double_from_address (rtx *operands)
>    switch (GET_CODE (addr))
>      {
>      case REG:
> +      if (find_reg_note (insn, REG_DEAD, addr))
> +        return "ldmia\t%m1!, {%0, %H0}";
> +      else if (REGNO (addr) == REGNO (operands[0]) + 1)
> +        return "ldmia\t%m1, {%0, %H0}";
> +
>        operands[2] = adjust_address (operands[1], SImode, 4);
>  
>        if (REGNO (operands[0]) == REGNO (addr))
> diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md
> index d7074b43f60..8da6887b560 100644
> --- a/gcc/config/arm/thumb1.md
> +++ b/gcc/config/arm/thumb1.md
> @@ -637,8 +637,11 @@
>      case 5:
>        return \"stmia\\t%0, {%1, %H1}\";
>      case 6:
> -      return thumb_load_double_from_address (operands);
> +      return thumb_load_double_from_address (operands, insn);
>      case 7:
> +      if (MEM_P (operands[0]) && REG_P (XEXP (operands[0], 0))
> +          && find_reg_note (insn, REG_DEAD, XEXP (operands[0], 0)))
> +        return \"stmia\\t%m0!, {%1, %H1}\";
>        operands[2] = gen_rtx_MEM (SImode,
>  			     plus_constant (Pmode, XEXP (operands[0], 0), 4));
>        output_asm_insn (\"str\\t%1, %0\;str\\t%H1, %2\", operands);
> @@ -970,8 +973,11 @@
>      case 2:
>        return \"stmia\\t%0, {%1, %H1}\";
>      case 3:
> -      return thumb_load_double_from_address (operands);
> +      return thumb_load_double_from_address (operands, insn);
>      case 4:
> +      if (MEM_P (operands[0]) && REG_P (XEXP (operands[0], 0))
> +          && find_reg_note (insn, REG_DEAD, XEXP (operands[0], 0)))
> +        return \"stmia\\t%m0!, {%1, %H1}\";
>        operands[2] = gen_rtx_MEM (SImode,
>  				 plus_constant (Pmode,
>  						XEXP (operands[0], 0), 4));
Siarhei Volkau June 18, 2024, 12:24 p.m. UTC | #2
On Mon, 17 Jun 2024 at 15:43, Richard Earnshaw (lists)
<Richard.Earnshaw@arm.com> wrote:

> I like the idea behind this patch, but I think I'd try first doing this as a peephole2 rule to rewrite the address in this case.  That has the additional advantage that we then estimate the size of the instruction more accurately.

Indeed, I tried it and it seems to work, although sometimes it does
odd things that I can't explain, e.g.:

define_insn patch                    define_peephole2 patch
...                                  ...
ldmia   r0!, {r4, r5}                movs    r3, r0
ldmia   r1!, {r2, r3}                ldmia   r3!, {r4, r5}
movs    r0, r7                       movs    r0, r7
...                                  ldr     r2, [r1, #0]
                                     ldr     r3, [r1, #4]
                                     # r1 unused later on
                                     ...

But in general it finds a few more cases where ldmia/stmia can
be applied.

> > 2. Might it be profitable for thumb2?

> I think it would then be easy to extend this to thumb2 as well if it looks like a win (perhaps only for -Os in the thumb2 case).

Sounds good, I'll look at it later.

> For testing, I'd start with something like gcc/testsuite/gcc.target/arm/thumb-andsi.c as a template and adapt that for your specific case.  Matching something like "ldmia\tr[0-7]!," should be enough.

I'll send the v2 patch with test case(s) soon.

BR, Siarhei
diff mbox series

Patch

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2cd560c9925..548bfbaccdc 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -254,7 +254,7 @@  extern int thumb_shiftable_const (unsigned HOST_WIDE_INT);
 extern enum arm_cond_code maybe_get_arm_condition_code (rtx);
 extern void thumb1_final_prescan_insn (rtx_insn *);
 extern void thumb2_final_prescan_insn (rtx_insn *);
-extern const char *thumb_load_double_from_address (rtx *);
+extern const char *thumb_load_double_from_address (rtx *, rtx_insn *);
 extern const char *thumb_output_move_mem_multiple (int, rtx *);
 extern const char *thumb_call_via_reg (rtx);
 extern void thumb_expand_cpymemqi (rtx *);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index b8c32db0a1d..73c2478ed77 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -28350,7 +28350,7 @@  thumb1_output_interwork (void)
    a computed memory address.  The computed address may involve a
    register which is overwritten by the load.  */
 const char *
-thumb_load_double_from_address (rtx *operands)
+thumb_load_double_from_address (rtx *operands, rtx_insn *insn)
 {
   rtx addr;
   rtx base;
@@ -28368,6 +28368,11 @@  thumb_load_double_from_address (rtx *operands)
   switch (GET_CODE (addr))
     {
     case REG:
+      if (find_reg_note (insn, REG_DEAD, addr))
+        return "ldmia\t%m1!, {%0, %H0}";
+      else if (REGNO (addr) == REGNO (operands[0]) + 1)
+        return "ldmia\t%m1, {%0, %H0}";
+
       operands[2] = adjust_address (operands[1], SImode, 4);
 
       if (REGNO (operands[0]) == REGNO (addr))
diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md
index d7074b43f60..8da6887b560 100644
--- a/gcc/config/arm/thumb1.md
+++ b/gcc/config/arm/thumb1.md
@@ -637,8 +637,11 @@ 
     case 5:
       return \"stmia\\t%0, {%1, %H1}\";
     case 6:
-      return thumb_load_double_from_address (operands);
+      return thumb_load_double_from_address (operands, insn);
     case 7:
+      if (MEM_P (operands[0]) && REG_P (XEXP (operands[0], 0))
+          && find_reg_note (insn, REG_DEAD, XEXP (operands[0], 0)))
+        return \"stmia\\t%m0!, {%1, %H1}\";
       operands[2] = gen_rtx_MEM (SImode,
 			     plus_constant (Pmode, XEXP (operands[0], 0), 4));
       output_asm_insn (\"str\\t%1, %0\;str\\t%H1, %2\", operands);
@@ -970,8 +973,11 @@ 
     case 2:
       return \"stmia\\t%0, {%1, %H1}\";
     case 3:
-      return thumb_load_double_from_address (operands);
+      return thumb_load_double_from_address (operands, insn);
     case 4:
+      if (MEM_P (operands[0]) && REG_P (XEXP (operands[0], 0))
+          && find_reg_note (insn, REG_DEAD, XEXP (operands[0], 0)))
+        return \"stmia\\t%m0!, {%1, %H1}\";
       operands[2] = gen_rtx_MEM (SImode,
 				 plus_constant (Pmode,
 						XEXP (operands[0], 0), 4));