diff mbox series

[avr] Run pass avr-fuse-add a second time

Message ID ebce7b58-329c-4a43-b949-309bac619280@gjlay.de
State New
Headers show
Series [avr] Run pass avr-fuse-add a second time | expand

Commit Message

Georg-Johann Lay Aug. 30, 2024, 12:09 p.m. UTC
There are cases, where opportunities to use POST_INC addressing
only occur very late in the compilation process.  Take for example
the following function from AVR-LibC's qsort:

void swapfunc (char *a, char *b, int n)
{
     do
     {
         char t = *a;
         *a++ = *b;
         *b++ = t;
     } while (--n > 0);
}


which  -mmcu=avrtiny -S -Os -dp  compiles to:

swapfunc:
	push r28		 ;  72	[c=4 l=1]  pushqi1/0
	push r29		 ;  73	[c=4 l=1]  pushqi1/0
/* prologue: function */
/* frame size = 0 */
/* stack size = 2 */
	mov r26,r24	 ;  66	[c=4 l=1]  movqi_insn/0
	mov r27,r25	 ;  67	[c=4 l=1]  movqi_insn/0
	mov r30,r22	 ;  68	[c=4 l=1]  movqi_insn/0
	mov r31,r23	 ;  69	[c=4 l=1]  movqi_insn/0
	mov r22,r20	 ;  70	[c=4 l=1]  movqi_insn/0
	mov r23,r21	 ;  71	[c=4 l=1]  movqi_insn/0
.L2:
	ld r20,X		 ;  55	[c=4 l=1]  movqi_insn/3
	ld r21,Z		 ;  57	[c=4 l=1]  movqi_insn/3
	st X,r21		 ;  58	[c=4 l=1]  movqi_insn/2
	subi r26,-1	 ;  59	[c=4 l=2]  *addhi3_clobber/0
	sbci r27,-1
	st Z,r20		 ;  61	[c=4 l=1]  movqi_insn/2
	subi r30,-1	 ;  62	[c=4 l=2]  *addhi3_clobber/0
	sbci r31,-1
	subi r22, 1	 ;  81	[c=4 l=2]  *add.for.cczn.hi/0
	sbci r23, 0
	breq .+2	 ;  82	[c=8 l=2]  branch_ZN
	brpl .L2
/* epilogue start */
	pop r29		 ;  76	[c=4 l=1]  popqi
	pop r28		 ;  77	[c=4 l=1]  popqi
	ret		 ;  78	[c=0 l=1]  return_from_epilogue

Insn 56+57 and insns 61+62 are post-inc stores.  They are not recognized
because the code prior to cprop_hardreg is a bit of a mess that moves
addresses back and forth (including Y).  Only after cprop_hardreg the
code is simple enough so post-inc can be detected by avr-fuse-add.

Hence this patch runs a 2nd instance of that pass late after
cprop_hardreg (the 1st instance runs prior to RTL peephole).

It renames avr_split_tiny_move to avr_split_fake_addressing_move
because that function also splits some insns on non-avrtiny.

The patch removes a define_split that's no more needed because
such splits are performed by avr_split_fake_addressing_move.

Passed without new regressions.  Ok for trunk?

Johann

p.s. post-inc etc. optimizations is basically non-existent in GCC.
One reasons seems to be SSA because each SSA variable gets its own
register, which results in a jungle of required registers which
even pass auto-inc-dec cannot penetrate...

Take for example the following code, that would require 12 instructions
as indicated by the comments:

void add4 (uint8_t *aa, const __flash uint8_t *bb, uint8_t nn)
{
     // Set Z (R30) to bb (1 MOVW or 2 MOVs)
     // Set X (R26) to aa (1 MOVW or 2 MOVs)
     do
     {
         uint8_t sum = 0;
         sum += *bb++; // 1 instruction: POST_INC load
         sum += *bb++; // 2 instructions: POST_INC load + add
         sum += *bb++; // 2 instructions: POST_INC load + add
         sum += *bb++; // 2 instructions: POST_INC load + add
         *aa++ = sum;  // 1 instruction: POST_INC store
     } while (--nn);   // 2 instructions: dec + branch
}

But  -mmcu=avr4 -Os -S -dp  compiles this to madness that
requires more than a dozen registers and computes each
intermediate in its own 16-bit register, coming out with
code that requires 34 instructions instead of 12.

--

AVR: Run pass avr-fuse-add a second time after pass_cprop_hardreg.

gcc/
	* config/avr/avr-protos.h (avr_split_tiny_move): Rename to
	avr_split_fake_addressing_move.
	* config/avr/avr-passes.cc: Same.
	(avr_pass_data_fuse_add) <tv_id>: Set to TV_MACH_DEP.
	(avr_pass_fuse_add) <clone>: Override.
	* config/avr/avr-passes.def (avr_pass_fuse_add): Run again
	after pass_cprop_hardreg.
	* config/avr/avr.md (split-lpmx): Remove a define_split.  Such
	splits are performed by avr_split_fake_addressing_move.

Comments

Richard Biener Aug. 30, 2024, 12:46 p.m. UTC | #1
On Fri, Aug 30, 2024 at 2:10 PM Georg-Johann Lay <avr@gjlay.de> wrote:
>
> There are cases, where opportunities to use POST_INC addressing
> only occur very late in the compilation process.  Take for example
> the following function from AVR-LibC's qsort:
>
> void swapfunc (char *a, char *b, int n)
> {
>      do
>      {
>          char t = *a;
>          *a++ = *b;
>          *b++ = t;
>      } while (--n > 0);
> }
>
>
> which  -mmcu=avrtiny -S -Os -dp  compiles to:
>
> swapfunc:
>         push r28                 ;  72  [c=4 l=1]  pushqi1/0
>         push r29                 ;  73  [c=4 l=1]  pushqi1/0
> /* prologue: function */
> /* frame size = 0 */
> /* stack size = 2 */
>         mov r26,r24      ;  66  [c=4 l=1]  movqi_insn/0
>         mov r27,r25      ;  67  [c=4 l=1]  movqi_insn/0
>         mov r30,r22      ;  68  [c=4 l=1]  movqi_insn/0
>         mov r31,r23      ;  69  [c=4 l=1]  movqi_insn/0
>         mov r22,r20      ;  70  [c=4 l=1]  movqi_insn/0
>         mov r23,r21      ;  71  [c=4 l=1]  movqi_insn/0
> .L2:
>         ld r20,X                 ;  55  [c=4 l=1]  movqi_insn/3
>         ld r21,Z                 ;  57  [c=4 l=1]  movqi_insn/3
>         st X,r21                 ;  58  [c=4 l=1]  movqi_insn/2
>         subi r26,-1      ;  59  [c=4 l=2]  *addhi3_clobber/0
>         sbci r27,-1
>         st Z,r20                 ;  61  [c=4 l=1]  movqi_insn/2
>         subi r30,-1      ;  62  [c=4 l=2]  *addhi3_clobber/0
>         sbci r31,-1
>         subi r22, 1      ;  81  [c=4 l=2]  *add.for.cczn.hi/0
>         sbci r23, 0
>         breq .+2         ;  82  [c=8 l=2]  branch_ZN
>         brpl .L2
> /* epilogue start */
>         pop r29          ;  76  [c=4 l=1]  popqi
>         pop r28          ;  77  [c=4 l=1]  popqi
>         ret              ;  78  [c=0 l=1]  return_from_epilogue
>
> Insn 56+57 and insns 61+62 are post-inc stores.  They are not recognized
> because the code prior to cprop_hardreg is a bit of a mess that moves
> addresses back and forth (including Y).  Only after cprop_hardreg the
> code is simple enough so post-inc can be detected by avr-fuse-add.
>
> Hence this patch runs a 2nd instance of that pass late after
> cprop_hardreg (the 1st instance runs prior to RTL peephole).
>
> It renames avr_split_tiny_move to avr_split_fake_addressing_move
> because that function also splits some insns on non-avrtiny.
>
> The patch removes a define_split that's no more needed because
> such splits are performed by avr_split_fake_addressing_move.
>
> Passed without new regressions.  Ok for trunk?
>
> Johann
>
> p.s. post-inc etc. optimizations is basically non-existent in GCC.
> One reasons seems to be SSA because each SSA variable gets its own
> register, which results in a jungle of required registers which
> even pass auto-inc-dec cannot penetrate...
>
> Take for example the following code, that would require 12 instructions
> as indicated by the comments:
>
> void add4 (uint8_t *aa, const __flash uint8_t *bb, uint8_t nn)
> {
>      // Set Z (R30) to bb (1 MOVW or 2 MOVs)
>      // Set X (R26) to aa (1 MOVW or 2 MOVs)
>      do
>      {
>          uint8_t sum = 0;
>          sum += *bb++; // 1 instruction: POST_INC load
>          sum += *bb++; // 2 instructions: POST_INC load + add
>          sum += *bb++; // 2 instructions: POST_INC load + add
>          sum += *bb++; // 2 instructions: POST_INC load + add
>          *aa++ = sum;  // 1 instruction: POST_INC store
>      } while (--nn);   // 2 instructions: dec + branch
> }

I think the reason here is weird behavior with __flash and how IVOPTS
tries to enable auto-incdec:

  _30 = bb_2 + 1;
  _10 = MEM[(const <address-space-1> uint8_t *)_30];
  _29 = bb_2 + 2;
  _12 = MEM[(const <address-space-1> uint8_t *)_29];
  _19 = _10 + _12;
  sum_13 = _19 + _9;
  bb_14 = bb_2 + 4;
  _28 = bb_14 + 65535;
  _15 = MEM[(const <address-space-1> uint8_t *)_28];

without __flash you get

  _10 = MEM[(const uint8_t *)_27 + 1B];
  sum_11 = _9 + _10;
  _12 = MEM[(const uint8_t *)_27 + 2B];
  sum_13 = sum_11 + _12;
  _15 = MEM[(const uint8_t *)_27 + 3B];

which of ocurse is a problem for pre/post-inc addressing as well
but one that I think is solved (well, you can hope...).

Can you open a bugreport with this testcase?

> But  -mmcu=avr4 -Os -S -dp  compiles this to madness that
> requires more than a dozen registers and computes each
> intermediate in its own 16-bit register, coming out with
> code that requires 34 instructions instead of 12.
>
> --
>
> AVR: Run pass avr-fuse-add a second time after pass_cprop_hardreg.
>
> gcc/
>         * config/avr/avr-protos.h (avr_split_tiny_move): Rename to
>         avr_split_fake_addressing_move.
>         * config/avr/avr-passes.cc: Same.
>         (avr_pass_data_fuse_add) <tv_id>: Set to TV_MACH_DEP.
>         (avr_pass_fuse_add) <clone>: Override.
>         * config/avr/avr-passes.def (avr_pass_fuse_add): Run again
>         after pass_cprop_hardreg.
>         * config/avr/avr.md (split-lpmx): Remove a define_split.  Such
>         splits are performed by avr_split_fake_addressing_move.
Georg-Johann Lay Aug. 30, 2024, 1:28 p.m. UTC | #2
Am 30.08.24 um 14:46 schrieb Richard Biener:
> On Fri, Aug 30, 2024 at 2:10 PM Georg-Johann Lay <avr@gjlay.de> wrote:
>>
>> There are cases, where opportunities to use POST_INC addressing
>> only occur very late in the compilation process.  Take for example
>> the following function from AVR-LibC's qsort:
>>
>> void swapfunc (char *a, char *b, int n)
>> {
>>       do
>>       {
>>           char t = *a;
>>           *a++ = *b;
>>           *b++ = t;
>>       } while (--n > 0);
>> }
>>
>>
>> which  -mmcu=avrtiny -S -Os -dp  compiles to:
>>
>> swapfunc:
>>          push r28                 ;  72  [c=4 l=1]  pushqi1/0
>>          push r29                 ;  73  [c=4 l=1]  pushqi1/0
>> /* prologue: function */
>> /* frame size = 0 */
>> /* stack size = 2 */
>>          mov r26,r24      ;  66  [c=4 l=1]  movqi_insn/0
>>          mov r27,r25      ;  67  [c=4 l=1]  movqi_insn/0
>>          mov r30,r22      ;  68  [c=4 l=1]  movqi_insn/0
>>          mov r31,r23      ;  69  [c=4 l=1]  movqi_insn/0
>>          mov r22,r20      ;  70  [c=4 l=1]  movqi_insn/0
>>          mov r23,r21      ;  71  [c=4 l=1]  movqi_insn/0
>> .L2:
>>          ld r20,X                 ;  55  [c=4 l=1]  movqi_insn/3
>>          ld r21,Z                 ;  57  [c=4 l=1]  movqi_insn/3
>>          st X,r21                 ;  58  [c=4 l=1]  movqi_insn/2
>>          subi r26,-1      ;  59  [c=4 l=2]  *addhi3_clobber/0
>>          sbci r27,-1
>>          st Z,r20                 ;  61  [c=4 l=1]  movqi_insn/2
>>          subi r30,-1      ;  62  [c=4 l=2]  *addhi3_clobber/0
>>          sbci r31,-1
>>          subi r22, 1      ;  81  [c=4 l=2]  *add.for.cczn.hi/0
>>          sbci r23, 0
>>          breq .+2         ;  82  [c=8 l=2]  branch_ZN
>>          brpl .L2
>> /* epilogue start */
>>          pop r29          ;  76  [c=4 l=1]  popqi
>>          pop r28          ;  77  [c=4 l=1]  popqi
>>          ret              ;  78  [c=0 l=1]  return_from_epilogue
>>
>> Insn 56+57 and insns 61+62 are post-inc stores.  They are not recognized
>> because the code prior to cprop_hardreg is a bit of a mess that moves
>> addresses back and forth (including Y).  Only after cprop_hardreg the
>> code is simple enough so post-inc can be detected by avr-fuse-add.
>>
>> Hence this patch runs a 2nd instance of that pass late after
>> cprop_hardreg (the 1st instance runs prior to RTL peephole).
>>
>> It renames avr_split_tiny_move to avr_split_fake_addressing_move
>> because that function also splits some insns on non-avrtiny.
>>
>> The patch removes a define_split that's no more needed because
>> such splits are performed by avr_split_fake_addressing_move.
>>
>> Passed without new regressions.  Ok for trunk?
>>
>> Johann
>>
>> p.s. post-inc etc. optimizations is basically non-existent in GCC.
>> One reasons seems to be SSA because each SSA variable gets its own
>> register, which results in a jungle of required registers which
>> even pass auto-inc-dec cannot penetrate...
>>
>> Take for example the following code, that would require 12 instructions
>> as indicated by the comments:
>>
>> void add4 (uint8_t *aa, const __flash uint8_t *bb, uint8_t nn)
>> {
>>       // Set Z (R30) to bb (1 MOVW or 2 MOVs)
>>       // Set X (R26) to aa (1 MOVW or 2 MOVs)
>>       do
>>       {
>>           uint8_t sum = 0;
>>           sum += *bb++; // 1 instruction: POST_INC load
>>           sum += *bb++; // 2 instructions: POST_INC load + add
>>           sum += *bb++; // 2 instructions: POST_INC load + add
>>           sum += *bb++; // 2 instructions: POST_INC load + add
>>           *aa++ = sum;  // 1 instruction: POST_INC store
>>       } while (--nn);   // 2 instructions: dec + branch
>> }
> 
> I think the reason here is weird behavior with __flash and how IVOPTS
> tries to enable auto-incdec:
> 
>    _30 = bb_2 + 1;
>    _10 = MEM[(const <address-space-1> uint8_t *)_30];
>    _29 = bb_2 + 2;
>    _12 = MEM[(const <address-space-1> uint8_t *)_29];
>    _19 = _10 + _12;
>    sum_13 = _19 + _9;
>    bb_14 = bb_2 + 4;
>    _28 = bb_14 + 65535;
>    _15 = MEM[(const <address-space-1> uint8_t *)_28];
> 
> without __flash you get
> 
>    _10 = MEM[(const uint8_t *)_27 + 1B];
>    sum_11 = _9 + _10;
>    _12 = MEM[(const uint8_t *)_27 + 2B];
>    sum_13 = sum_11 + _12;
>    _15 = MEM[(const uint8_t *)_27 + 3B];
> 
> which of ocurse is a problem for pre/post-inc addressing as well
> but one that I think is solved (well, you can hope...).
> 
> Can you open a bugreport with this testcase?

https://gcc.gnu.org/PR116542

Without __flash, the code uses Z, Z+1, Z+2 and Z+3 as addresses,
but still uses one register for access and one register to increment
the address.

Then there are these crazy computation like:

start_address - current_address + nn == 0

as condition for loop termination instead of just dec + branch
of nn.

Johann


>> But  -mmcu=avr4 -Os -S -dp  compiles this to madness that
>> requires more than a dozen registers and computes each
>> intermediate in its own 16-bit register, coming out with
>> code that requires 34 instructions instead of 12.
>>
>> --
>>
>> AVR: Run pass avr-fuse-add a second time after pass_cprop_hardreg.
>>
>> gcc/
>>          * config/avr/avr-protos.h (avr_split_tiny_move): Rename to
>>          avr_split_fake_addressing_move.
>>          * config/avr/avr-passes.cc: Same.
>>          (avr_pass_data_fuse_add) <tv_id>: Set to TV_MACH_DEP.
>>          (avr_pass_fuse_add) <clone>: Override.
>>          * config/avr/avr-passes.def (avr_pass_fuse_add): Run again
>>          after pass_cprop_hardreg.
>>          * config/avr/avr.md (split-lpmx): Remove a define_split.  Such
>>          splits are performed by avr_split_fake_addressing_move.
Richard Biener Aug. 30, 2024, 1:31 p.m. UTC | #3
On Fri, Aug 30, 2024 at 3:28 PM Georg-Johann Lay <avr@gjlay.de> wrote:
>
> Am 30.08.24 um 14:46 schrieb Richard Biener:
> > On Fri, Aug 30, 2024 at 2:10 PM Georg-Johann Lay <avr@gjlay.de> wrote:
> >>
> >> There are cases, where opportunities to use POST_INC addressing
> >> only occur very late in the compilation process.  Take for example
> >> the following function from AVR-LibC's qsort:
> >>
> >> void swapfunc (char *a, char *b, int n)
> >> {
> >>       do
> >>       {
> >>           char t = *a;
> >>           *a++ = *b;
> >>           *b++ = t;
> >>       } while (--n > 0);
> >> }
> >>
> >>
> >> which  -mmcu=avrtiny -S -Os -dp  compiles to:
> >>
> >> swapfunc:
> >>          push r28                 ;  72  [c=4 l=1]  pushqi1/0
> >>          push r29                 ;  73  [c=4 l=1]  pushqi1/0
> >> /* prologue: function */
> >> /* frame size = 0 */
> >> /* stack size = 2 */
> >>          mov r26,r24      ;  66  [c=4 l=1]  movqi_insn/0
> >>          mov r27,r25      ;  67  [c=4 l=1]  movqi_insn/0
> >>          mov r30,r22      ;  68  [c=4 l=1]  movqi_insn/0
> >>          mov r31,r23      ;  69  [c=4 l=1]  movqi_insn/0
> >>          mov r22,r20      ;  70  [c=4 l=1]  movqi_insn/0
> >>          mov r23,r21      ;  71  [c=4 l=1]  movqi_insn/0
> >> .L2:
> >>          ld r20,X                 ;  55  [c=4 l=1]  movqi_insn/3
> >>          ld r21,Z                 ;  57  [c=4 l=1]  movqi_insn/3
> >>          st X,r21                 ;  58  [c=4 l=1]  movqi_insn/2
> >>          subi r26,-1      ;  59  [c=4 l=2]  *addhi3_clobber/0
> >>          sbci r27,-1
> >>          st Z,r20                 ;  61  [c=4 l=1]  movqi_insn/2
> >>          subi r30,-1      ;  62  [c=4 l=2]  *addhi3_clobber/0
> >>          sbci r31,-1
> >>          subi r22, 1      ;  81  [c=4 l=2]  *add.for.cczn.hi/0
> >>          sbci r23, 0
> >>          breq .+2         ;  82  [c=8 l=2]  branch_ZN
> >>          brpl .L2
> >> /* epilogue start */
> >>          pop r29          ;  76  [c=4 l=1]  popqi
> >>          pop r28          ;  77  [c=4 l=1]  popqi
> >>          ret              ;  78  [c=0 l=1]  return_from_epilogue
> >>
> >> Insn 56+57 and insns 61+62 are post-inc stores.  They are not recognized
> >> because the code prior to cprop_hardreg is a bit of a mess that moves
> >> addresses back and forth (including Y).  Only after cprop_hardreg the
> >> code is simple enough so post-inc can be detected by avr-fuse-add.
> >>
> >> Hence this patch runs a 2nd instance of that pass late after
> >> cprop_hardreg (the 1st instance runs prior to RTL peephole).
> >>
> >> It renames avr_split_tiny_move to avr_split_fake_addressing_move
> >> because that function also splits some insns on non-avrtiny.
> >>
> >> The patch removes a define_split that's no more needed because
> >> such splits are performed by avr_split_fake_addressing_move.
> >>
> >> Passed without new regressions.  Ok for trunk?
> >>
> >> Johann
> >>
> >> p.s. post-inc etc. optimizations is basically non-existent in GCC.
> >> One reasons seems to be SSA because each SSA variable gets its own
> >> register, which results in a jungle of required registers which
> >> even pass auto-inc-dec cannot penetrate...
> >>
> >> Take for example the following code, that would require 12 instructions
> >> as indicated by the comments:
> >>
> >> void add4 (uint8_t *aa, const __flash uint8_t *bb, uint8_t nn)
> >> {
> >>       // Set Z (R30) to bb (1 MOVW or 2 MOVs)
> >>       // Set X (R26) to aa (1 MOVW or 2 MOVs)
> >>       do
> >>       {
> >>           uint8_t sum = 0;
> >>           sum += *bb++; // 1 instruction: POST_INC load
> >>           sum += *bb++; // 2 instructions: POST_INC load + add
> >>           sum += *bb++; // 2 instructions: POST_INC load + add
> >>           sum += *bb++; // 2 instructions: POST_INC load + add
> >>           *aa++ = sum;  // 1 instruction: POST_INC store
> >>       } while (--nn);   // 2 instructions: dec + branch
> >> }
> >
> > I think the reason here is weird behavior with __flash and how IVOPTS
> > tries to enable auto-incdec:
> >
> >    _30 = bb_2 + 1;
> >    _10 = MEM[(const <address-space-1> uint8_t *)_30];
> >    _29 = bb_2 + 2;
> >    _12 = MEM[(const <address-space-1> uint8_t *)_29];
> >    _19 = _10 + _12;
> >    sum_13 = _19 + _9;
> >    bb_14 = bb_2 + 4;
> >    _28 = bb_14 + 65535;
> >    _15 = MEM[(const <address-space-1> uint8_t *)_28];
> >
> > without __flash you get
> >
> >    _10 = MEM[(const uint8_t *)_27 + 1B];
> >    sum_11 = _9 + _10;
> >    _12 = MEM[(const uint8_t *)_27 + 2B];
> >    sum_13 = sum_11 + _12;
> >    _15 = MEM[(const uint8_t *)_27 + 3B];
> >
> > which of ocurse is a problem for pre/post-inc addressing as well
> > but one that I think is solved (well, you can hope...).
> >
> > Can you open a bugreport with this testcase?
>
> https://gcc.gnu.org/PR116542

Thanks.

> Without __flash, the code uses Z, Z+1, Z+2 and Z+3 as addresses,
> but still uses one register for access and one register to increment
> the address.
>
> Then there are these crazy computation like:
>
> start_address - current_address + nn == 0
>
> as condition for loop termination instead of just dec + branch
> of nn.

That's likely due to costs as estimated by IVOPTs and the backend.

Richard.

> Johann
>
>
> >> But  -mmcu=avr4 -Os -S -dp  compiles this to madness that
> >> requires more than a dozen registers and computes each
> >> intermediate in its own 16-bit register, coming out with
> >> code that requires 34 instructions instead of 12.
> >>
> >> --
> >>
> >> AVR: Run pass avr-fuse-add a second time after pass_cprop_hardreg.
> >>
> >> gcc/
> >>          * config/avr/avr-protos.h (avr_split_tiny_move): Rename to
> >>          avr_split_fake_addressing_move.
> >>          * config/avr/avr-passes.cc: Same.
> >>          (avr_pass_data_fuse_add) <tv_id>: Set to TV_MACH_DEP.
> >>          (avr_pass_fuse_add) <clone>: Override.
> >>          * config/avr/avr-passes.def (avr_pass_fuse_add): Run again
> >>          after pass_cprop_hardreg.
> >>          * config/avr/avr.md (split-lpmx): Remove a define_split.  Such
> >>          splits are performed by avr_split_fake_addressing_move.
Georg-Johann Lay Aug. 30, 2024, 1:40 p.m. UTC | #4
Am 30.08.24 um 15:31 schrieb Richard Biener:
> On Fri, Aug 30, 2024 at 3:28 PM Georg-Johann Lay <avr@gjlay.de> wrote:
>>
>> Am 30.08.24 um 14:46 schrieb Richard Biener:
>>> On Fri, Aug 30, 2024 at 2:10 PM Georg-Johann Lay <avr@gjlay.de> wrote:
>>>>
>>>> There are cases, where opportunities to use POST_INC addressing
>>>> only occur very late in the compilation process.  Take for example
>>>> the following function from AVR-LibC's qsort:
>>>>
>>>> void swapfunc (char *a, char *b, int n)
>>>> {
>>>>        do
>>>>        {
>>>>            char t = *a;
>>>>            *a++ = *b;
>>>>            *b++ = t;
>>>>        } while (--n > 0);
>>>> }
>>>>
>>>>
>>>> which  -mmcu=avrtiny -S -Os -dp  compiles to:
>>>>
>>>> swapfunc:
>>>>           push r28                 ;  72  [c=4 l=1]  pushqi1/0
>>>>           push r29                 ;  73  [c=4 l=1]  pushqi1/0
>>>> /* prologue: function */
>>>> /* frame size = 0 */
>>>> /* stack size = 2 */
>>>>           mov r26,r24      ;  66  [c=4 l=1]  movqi_insn/0
>>>>           mov r27,r25      ;  67  [c=4 l=1]  movqi_insn/0
>>>>           mov r30,r22      ;  68  [c=4 l=1]  movqi_insn/0
>>>>           mov r31,r23      ;  69  [c=4 l=1]  movqi_insn/0
>>>>           mov r22,r20      ;  70  [c=4 l=1]  movqi_insn/0
>>>>           mov r23,r21      ;  71  [c=4 l=1]  movqi_insn/0
>>>> .L2:
>>>>           ld r20,X                 ;  55  [c=4 l=1]  movqi_insn/3
>>>>           ld r21,Z                 ;  57  [c=4 l=1]  movqi_insn/3
>>>>           st X,r21                 ;  58  [c=4 l=1]  movqi_insn/2
>>>>           subi r26,-1      ;  59  [c=4 l=2]  *addhi3_clobber/0
>>>>           sbci r27,-1
>>>>           st Z,r20                 ;  61  [c=4 l=1]  movqi_insn/2
>>>>           subi r30,-1      ;  62  [c=4 l=2]  *addhi3_clobber/0
>>>>           sbci r31,-1
>>>>           subi r22, 1      ;  81  [c=4 l=2]  *add.for.cczn.hi/0
>>>>           sbci r23, 0
>>>>           breq .+2         ;  82  [c=8 l=2]  branch_ZN
>>>>           brpl .L2
>>>> /* epilogue start */
>>>>           pop r29          ;  76  [c=4 l=1]  popqi
>>>>           pop r28          ;  77  [c=4 l=1]  popqi
>>>>           ret              ;  78  [c=0 l=1]  return_from_epilogue
>>>>
>>>> Insn 56+57 and insns 61+62 are post-inc stores.  They are not recognized
>>>> because the code prior to cprop_hardreg is a bit of a mess that moves
>>>> addresses back and forth (including Y).  Only after cprop_hardreg the
>>>> code is simple enough so post-inc can be detected by avr-fuse-add.
>>>>
>>>> Hence this patch runs a 2nd instance of that pass late after
>>>> cprop_hardreg (the 1st instance runs prior to RTL peephole).
>>>>
>>>> It renames avr_split_tiny_move to avr_split_fake_addressing_move
>>>> because that function also splits some insns on non-avrtiny.
>>>>
>>>> The patch removes a define_split that's no more needed because
>>>> such splits are performed by avr_split_fake_addressing_move.
>>>>
>>>> Passed without new regressions.  Ok for trunk?
>>>>
>>>> Johann
>>>>
>>>> p.s. post-inc etc. optimizations is basically non-existent in GCC.
>>>> One reasons seems to be SSA because each SSA variable gets its own
>>>> register, which results in a jungle of required registers which
>>>> even pass auto-inc-dec cannot penetrate...
>>>>
>>>> Take for example the following code, that would require 12 instructions
>>>> as indicated by the comments:
>>>>
>>>> void add4 (uint8_t *aa, const __flash uint8_t *bb, uint8_t nn)
>>>> {
>>>>        // Set Z (R30) to bb (1 MOVW or 2 MOVs)
>>>>        // Set X (R26) to aa (1 MOVW or 2 MOVs)
>>>>        do
>>>>        {
>>>>            uint8_t sum = 0;
>>>>            sum += *bb++; // 1 instruction: POST_INC load
>>>>            sum += *bb++; // 2 instructions: POST_INC load + add
>>>>            sum += *bb++; // 2 instructions: POST_INC load + add
>>>>            sum += *bb++; // 2 instructions: POST_INC load + add
>>>>            *aa++ = sum;  // 1 instruction: POST_INC store
>>>>        } while (--nn);   // 2 instructions: dec + branch
>>>> }
>>>
>>> I think the reason here is weird behavior with __flash and how IVOPTS
>>> tries to enable auto-incdec:
>>>
>>>     _30 = bb_2 + 1;
>>>     _10 = MEM[(const <address-space-1> uint8_t *)_30];
>>>     _29 = bb_2 + 2;
>>>     _12 = MEM[(const <address-space-1> uint8_t *)_29];
>>>     _19 = _10 + _12;
>>>     sum_13 = _19 + _9;
>>>     bb_14 = bb_2 + 4;
>>>     _28 = bb_14 + 65535;
>>>     _15 = MEM[(const <address-space-1> uint8_t *)_28];
>>>
>>> without __flash you get
>>>
>>>     _10 = MEM[(const uint8_t *)_27 + 1B];
>>>     sum_11 = _9 + _10;
>>>     _12 = MEM[(const uint8_t *)_27 + 2B];
>>>     sum_13 = sum_11 + _12;
>>>     _15 = MEM[(const uint8_t *)_27 + 3B];
>>>
>>> which of ocurse is a problem for pre/post-inc addressing as well
>>> but one that I think is solved (well, you can hope...).
>>>
>>> Can you open a bugreport with this testcase?
>>
>> https://gcc.gnu.org/PR116542
> 
> Thanks.
> 
>> Without __flash, the code uses Z, Z+1, Z+2 and Z+3 as addresses,
>> but still uses one register for access and one register to increment
>> the address.
>>
>> Then there are these crazy computation like:
>>
>> start_address - current_address + nn == 0
>>
>> as condition for loop termination instead of just dec + branch
>> of nn.
> 
> That's likely due to costs as estimated by IVOPTs and the backend.

Simple code would use DEC which costs 1 arithmetic + 1 reg for
nn (in addition to the current address).

The current code uses start address, start nn + ADD + SUB (in
addition to the current address).  So doubles the register
pressure and doubles number or arithmetic compared to
simple code.

There are many cases where -fno-ivopts or -fno-tree-loop-optimize
give better code, but I doubt that switching off these passes
altogether is a good idea...

Johann

> Richard.
> 
>> Johann
>>
>>
>>>> But  -mmcu=avr4 -Os -S -dp  compiles this to madness that
>>>> requires more than a dozen registers and computes each
>>>> intermediate in its own 16-bit register, coming out with
>>>> code that requires 34 instructions instead of 12.
>>>>
>>>> --
>>>>
>>>> AVR: Run pass avr-fuse-add a second time after pass_cprop_hardreg.
>>>>
>>>> gcc/
>>>>           * config/avr/avr-protos.h (avr_split_tiny_move): Rename to
>>>>           avr_split_fake_addressing_move.
>>>>           * config/avr/avr-passes.cc: Same.
>>>>           (avr_pass_data_fuse_add) <tv_id>: Set to TV_MACH_DEP.
>>>>           (avr_pass_fuse_add) <clone>: Override.
>>>>           * config/avr/avr-passes.def (avr_pass_fuse_add): Run again
>>>>           after pass_cprop_hardreg.
>>>>           * config/avr/avr.md (split-lpmx): Remove a define_split.  Such
>>>>           splits are performed by avr_split_fake_addressing_move.
Denis Chertykov Aug. 31, 2024, 5:43 p.m. UTC | #5
пт, 30 авг. 2024 г. в 16:09, Georg-Johann Lay <avr@gjlay.de>:
>
> There are cases, where opportunities to use POST_INC addressing
> only occur very late in the compilation process.  Take for example
> the following function from AVR-LibC's qsort:
>
> void swapfunc (char *a, char *b, int n)
> {
>      do
>      {
>          char t = *a;
>          *a++ = *b;
>          *b++ = t;
>      } while (--n > 0);
> }
>
>
> which  -mmcu=avrtiny -S -Os -dp  compiles to:
>
> swapfunc:
>         push r28                 ;  72  [c=4 l=1]  pushqi1/0
>         push r29                 ;  73  [c=4 l=1]  pushqi1/0
> /* prologue: function */
> /* frame size = 0 */
> /* stack size = 2 */
>         mov r26,r24      ;  66  [c=4 l=1]  movqi_insn/0
>         mov r27,r25      ;  67  [c=4 l=1]  movqi_insn/0
>         mov r30,r22      ;  68  [c=4 l=1]  movqi_insn/0
>         mov r31,r23      ;  69  [c=4 l=1]  movqi_insn/0
>         mov r22,r20      ;  70  [c=4 l=1]  movqi_insn/0
>         mov r23,r21      ;  71  [c=4 l=1]  movqi_insn/0
> .L2:
>         ld r20,X                 ;  55  [c=4 l=1]  movqi_insn/3
>         ld r21,Z                 ;  57  [c=4 l=1]  movqi_insn/3
>         st X,r21                 ;  58  [c=4 l=1]  movqi_insn/2
>         subi r26,-1      ;  59  [c=4 l=2]  *addhi3_clobber/0
>         sbci r27,-1
>         st Z,r20                 ;  61  [c=4 l=1]  movqi_insn/2
>         subi r30,-1      ;  62  [c=4 l=2]  *addhi3_clobber/0
>         sbci r31,-1
>         subi r22, 1      ;  81  [c=4 l=2]  *add.for.cczn.hi/0
>         sbci r23, 0
>         breq .+2         ;  82  [c=8 l=2]  branch_ZN
>         brpl .L2
> /* epilogue start */
>         pop r29          ;  76  [c=4 l=1]  popqi
>         pop r28          ;  77  [c=4 l=1]  popqi
>         ret              ;  78  [c=0 l=1]  return_from_epilogue
>
> Insn 56+57 and insns 61+62 are post-inc stores.  They are not recognized
> because the code prior to cprop_hardreg is a bit of a mess that moves
> addresses back and forth (including Y).  Only after cprop_hardreg the
> code is simple enough so post-inc can be detected by avr-fuse-add.
>
> Hence this patch runs a 2nd instance of that pass late after
> cprop_hardreg (the 1st instance runs prior to RTL peephole).
>
> It renames avr_split_tiny_move to avr_split_fake_addressing_move
> because that function also splits some insns on non-avrtiny.
>
> The patch removes a define_split that's no more needed because
> such splits are performed by avr_split_fake_addressing_move.
>
> Passed without new regressions.  Ok for trunk?

Ok.
Please apply.

Denis

>
> Johann
>
> p.s. post-inc etc. optimizations is basically non-existent in GCC.
> One reasons seems to be SSA because each SSA variable gets its own
> register, which results in a jungle of required registers which
> even pass auto-inc-dec cannot penetrate...
>
> Take for example the following code, that would require 12 instructions
> as indicated by the comments:
>
> void add4 (uint8_t *aa, const __flash uint8_t *bb, uint8_t nn)
> {
>      // Set Z (R30) to bb (1 MOVW or 2 MOVs)
>      // Set X (R26) to aa (1 MOVW or 2 MOVs)
>      do
>      {
>          uint8_t sum = 0;
>          sum += *bb++; // 1 instruction: POST_INC load
>          sum += *bb++; // 2 instructions: POST_INC load + add
>          sum += *bb++; // 2 instructions: POST_INC load + add
>          sum += *bb++; // 2 instructions: POST_INC load + add
>          *aa++ = sum;  // 1 instruction: POST_INC store
>      } while (--nn);   // 2 instructions: dec + branch
> }
>
> But  -mmcu=avr4 -Os -S -dp  compiles this to madness that
> requires more than a dozen registers and computes each
> intermediate in its own 16-bit register, coming out with
> code that requires 34 instructions instead of 12.
>
> --
>
> AVR: Run pass avr-fuse-add a second time after pass_cprop_hardreg.
>
> gcc/
>         * config/avr/avr-protos.h (avr_split_tiny_move): Rename to
>         avr_split_fake_addressing_move.
>         * config/avr/avr-passes.cc: Same.
>         (avr_pass_data_fuse_add) <tv_id>: Set to TV_MACH_DEP.
>         (avr_pass_fuse_add) <clone>: Override.
>         * config/avr/avr-passes.def (avr_pass_fuse_add): Run again
>         after pass_cprop_hardreg.
>         * config/avr/avr.md (split-lpmx): Remove a define_split.  Such
>         splits are performed by avr_split_fake_addressing_move.
diff mbox series

Patch

diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc
index 8b018ff6a05..8a71b57ada1 100644
--- a/gcc/config/avr/avr-passes.cc
+++ b/gcc/config/avr/avr-passes.cc
@@ -1009,7 +1009,7 @@  static const pass_data avr_pass_data_fuse_add =
   RTL_PASS,	    // type
   "",		    // name (will be patched)
   OPTGROUP_NONE,    // optinfo_flags
-  TV_DF_SCAN,	    // tv_id
+  TV_MACH_DEP,	    // tv_id
   0,		    // properties_required
   0,		    // properties_provided
   0,		    // properties_destroyed
@@ -1026,6 +1026,13 @@  public:
     this->name = name;
   }
 
+  // Cloning is required because we are running one instance of the pass
+  // before peephole2. and a second one after cprop_hardreg.
+  opt_pass * clone () final override
+  {
+    return make_avr_pass_fuse_add (m_ctxt);
+  }
+
   bool gate (function *) final override
   {
     return optimize && avr_fuse_add > 0;
@@ -1503,8 +1510,8 @@  avr_pass_fuse_add::fuse_mem_add (Mem_Insn &mem, Add_Insn &add)
    - PLUS insn of that kind.
    - Indirect loads and stores.
    In almost all cases, combine opportunities arise from the preparation
-   done by `avr_split_tiny_move', but in some rare cases combinations are
-   found for the ordinary cores, too.
+   done by `avr_split_fake_addressing_move', but in some rare cases combinations
+   are found for the ordinary cores, too.
       As we consider at most one Mem insn per try, there may still be missed
    optimizations like  POST_INC + PLUS + POST_INC  might be performed
    as  PRE_DEC + PRE_DEC  for two adjacent locations.  */
@@ -1714,7 +1721,7 @@  public:
    core's capabilities.  This sets the stage for pass .avr-fuse-add.  */
 
 bool
-avr_split_tiny_move (rtx_insn * /*insn*/, rtx *xop)
+avr_split_fake_addressing_move (rtx_insn * /*insn*/, rtx *xop)
 {
   bool store_p = false;
   rtx mem, reg_or_0;
diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def
index 748260edaef..cd89d673727 100644
--- a/gcc/config/avr/avr-passes.def
+++ b/gcc/config/avr/avr-passes.def
@@ -20,12 +20,33 @@ 
 /* A post reload optimization pass that fuses PLUS insns with CONST_INT
    addend with a load or store insn to get POST_INC or PRE_DEC addressing.
    It can also fuse two PLUSes to a single one, which may occur due to
-   splits from `avr_split_tiny_move'.  We do this in an own pass because
-   it can find more cases than peephole2, for example when there are
-   unrelated insns between the interesting ones.  */
+   splits from `avr_split_fake_addressing_move'.  We do this in an own
+   pass because it can find more cases than peephole2, for example when
+   there are unrelated insns between the interesting ones.  */
 
 INSERT_PASS_BEFORE (pass_peephole2, 1, avr_pass_fuse_add);
 
+/* There are cases where avr-fuse-add doesn't find POST_INC cases because
+   the RTL code at that time is too long-winded, and moves registers back and
+   forth (which seems to be the same reason for why pass auto_inc_dec cannot
+   find POST_INC, either).  Some of that long-windedness is cleaned up very
+   late in pass cprop_hardreg, which opens up new opportunities to find post
+   increments.  An example is the following function from AVR-LibC's qsort:
+
+   void swapfunc (char *a, char *b, int n)
+   {
+       do
+       {
+           char tmp = *a;
+           *a++ = *b;
+           *b++ = tmp;
+       } while (--n > 0);
+   }
+
+   Hence, run avr-fuse-add twice; the second time after cprop_hardreg.  */
+
+INSERT_PASS_AFTER (pass_cprop_hardreg, 1, avr_pass_fuse_add);
+
 /* An analysis pass that runs prior to prologue / epilogue generation.
    Computes cfun->machine->gasisr.maybe which is used in prologue and
    epilogue generation provided -mgas-isr-prologues is on.  */
@@ -47,9 +68,9 @@  INSERT_PASS_BEFORE (pass_free_cfg, 1, avr_pass_recompute_notes);
    tries to fix such situations by operating on the original mode.  This
    reduces code size and register pressure.
 
-   The assertion is that the code generated by casesi is unaltered and a
+   The assertion is that the code generated by casesi is unaltered and
    a sign-extend or zero-extend from QImode or HImode precedes the casesi
-   insns withaout any insns in between.  */
+   insns without any insns in between.  */
 
 INSERT_PASS_AFTER (pass_expand, 1, avr_pass_casesi);
 
diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index 289c80cab55..d6f4abbac66 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -179,7 +179,7 @@  extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *);
 extern rtl_opt_pass *make_avr_pass_ifelse (gcc::context *);
 #ifdef RTX_CODE
 extern bool avr_casei_sequence_check_operands (rtx *xop);
-extern bool avr_split_tiny_move (rtx_insn *insn, rtx *operands);
+extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands);
 #endif /* RTX_CODE */
 
 /* From avr-log.cc */
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index f477ac170ea..520f1fe41a2 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -993,41 +993,10 @@  (define_peephole2
               (clobber (reg:CC REG_CC))])])
 
 
-;; For LPM loads from AS1 we split
-;;    R = *Z
-;; to
-;;    R = *Z++
-;;    Z = Z - sizeof (R)
-;;
-;; so that the second instruction can be optimized out.
-
-(define_split ; "split-lpmx"
-  [(set (match_operand:HISI 0 "register_operand" "")
-        (match_operand:HISI 1 "memory_operand" ""))]
-  "reload_completed
-   && AVR_HAVE_LPMX
-   && avr_mem_flash_p (operands[1])
-   && REG_P (XEXP (operands[1], 0))
-   && !reg_overlap_mentioned_p (XEXP (operands[1], 0), operands[0])"
-  [(set (match_dup 0)
-        (match_dup 2))
-   (set (match_dup 3)
-        (plus:HI (match_dup 3)
-                 (match_dup 4)))]
-  {
-     rtx addr = XEXP (operands[1], 0);
-
-    operands[2] = replace_equiv_address (operands[1],
-                                         gen_rtx_POST_INC (Pmode, addr));
-    operands[3] = addr;
-    operands[4] = gen_int_mode (-<SIZE>, HImode);
-  })
-
-
 ;; Legitimate address and stuff allows way more addressing modes than
 ;; Reduced Tiny actually supports.  Split them now so that we get
 ;; closer to real instructions which may result in some optimization
-;; opportunities.
+;; opportunities.  This applies also to fake X + offset addressing.
 (define_split
   [(parallel [(set (match_operand:MOVMODE 0 "nonimmediate_operand")
                    (match_operand:MOVMODE 1 "general_operand"))
@@ -1040,7 +1009,7 @@  (define_split
    && (MEM_P (operands[0]) || MEM_P (operands[1]))"
   [(scratch)]
   {
-    if (avr_split_tiny_move (curr_insn, operands))
+    if (avr_split_fake_addressing_move (curr_insn, operands))
       DONE;
     FAIL;
   })