diff mbox series

[3/4] sched1: model: only promote true dependecies in predecessor promotion

Message ID 20241020194018.3051160-4-vineetg@rivosinc.com
State New
Headers show
Series sched1 improvements | expand

Commit Message

Vineet Gupta Oct. 20, 2024, 7:40 p.m. UTC
Background
----------
sched1 runs a preliminary "model schedular" ahead of the main list schedular.
Its sole purpose is to keep register pressure to mimimum [1] and it uses DFA
register depenendency tracking to arrange insns.

   [1] https://gcc.gnu.org/legacy-ml/gcc-patches/2011-12/msg01684.html

                           `The idea was to construct a preliminary
   "model" schedule in which the only objective is to keep register
   pressure to a minimum.  This schedule ignores pipeline characteristics,
   latencies, and the number of available registers.  The maximum pressure
   seen in this initial model schedule (MP) is then the benchmark for ECC(X).`

It starts off with an intial "worklist" of insns w/o any prior dependencies,
scheduling them, adding successors of scheduled insn to the worklist and so
on until all insns in the basic block are done.
It can run into situations where an otherwise to-be-scheduled candidate can't
because it's predecessors haven't been scheduled yet, requiring
"predecessor promotion" implemented in model_promote_predecessors ().
Promotion is essentially bumping INSN model_priority so that it makes it
towards the head of elligible-to-schedule list.

An INSN can have multiple dependencies/predecessor nodes, some of them
being true dependency REG_DEP_TRUE meaning the predecessor register output
is a must have for the INSN to be scheduled.
e.g.
In the sched1 dump below, insn 70 has multiple deps, but 68 and 69 are
true reg deps:

;;	| insn | prio |
;;	|   68 |    3 | r217=r144+r146
;;	|   69 |    5 | r218=flt(r143)
;;	|   70 |    2 | [r217]=r218

;;      insn  code    bb   dep  prio  cost   reservation
;;      ----  ----    --   ---  ----  ----   -----------
;;       70   286     6     6     2     1   alu	: FW: 97tnm 91m 83tn 78m 76tn 72tn
;;						: BK: 69t 68t 57n 60n 61n 64n
                                                      ^^^ ^^^

Issue
-----
Currently predecessor promotion bumps the priority of all predecessors
to same value, treating the true deps and the rest alike. This simple
strategy can sometimes cause a subtle inadvertent effect: given the right
"other" conditions (depth, height etc) a non true dependency can get
scheduled ahead of the true dep, increasing the live range between the
true dep and the dependent. This increases the peak register pressure
for the BB. Subsequently this inflated peak register pressure steers
the main list schdular, giving it the lower bound to work with.
Main schedular can make pressure worse (due to instruction latencies
and pipeline models etc) but otherwise it can work with the given
peak pressure. Thus a higher model pressure will ultimately lead to a
less than ideal final schedule.

This subtle effect get crazy exacerbated on RISC-V SPEC2017 Cactu
benchmark.

For the spill2.cpp testcase (reduced from Cactu itself) on RISC-V,
here's what was seen:

  - After idx #6, insn 70 predecessors are promoted (see list above).
    Of the true deps, insn 68 is already schdeuled, insn 69 needs to be.
    insn 69 does get promoted (higher priority 4) but so does insn 60
    (prio 4) with its predecessor insn 58 getting even higher
    promotion (prio 5).

  - The insn 58 and its deps chain end up being scheduled ahead of insn 70
    such that now there are 3 insns seperating it from its direct dep
    insn 69. This blows reg pressure past the spill threshhold of
    sched_class_regs_num[GR_REGS] as evident from the pressure
    summary at the end.

;;	Model schedule:
;;
;;	| idx insn | mpri hght dpth prio |
;;	|   0   56 |    0    8    0   15 | r210=flt(r185#0)     GR_REGS:[25,+0] FP_REGS:[0,+1]
;;	|   1   57 |    0    8    1   12 | [r242+low(`e')]=r210 GR_REGS:[25,+0] FP_REGS:[1,-1]
;;	|   2   63 |    2    7    0   12 | r215=r141+r183       GR_REGS:[25,+1] FP_REGS:[0,+0]
;;	|   3   64 |    1    8    2   11 | r216=[r215]          GR_REGS:[26,-1] FP_REGS:[0,+1]
;;	|   4   65 |    0    8    3    8 | r143=fix(r216)       GR_REGS:[25,+1] FP_REGS:[1,-1]
;;	|   5   67 |    0    8    4    4 | r146=r143<<0x3       GR_REGS:[26,+1] FP_REGS:[0,+0]
;;	|   6   68 |    0    8    5    3 | r217=r144+r146       GR_REGS:[27,+1] FP_REGS:[0,+0]

;;	+--- priority of 70 = 3, priority of 69 60 61 = 4

;;	|   7   69 |    4    7    4    5 | r218=flt(r143)        GR_REGS:[28,+0] FP_REGS:[0,+1]
;;	|   8   58 |    6    4    0    5 | r211=r138+r183        GR_REGS:[28,+1] FP_REGS:[1,+0]
                                                                         ^^^^^^^
;;	|   9   60 |    5    5    2    4 | r213=[r211]           GR_REGS:[29,-1] FP_REGS:[1,+1]
;;	|  10   61 |    4    3    0    4 | r214=[r243+low(`o')]  GR_REGS:[28,+0] FP_REGS:[2,+1]
;;	|  11   70 |    3    8    6    2 | [r217]=r218           GR_REGS:[28,-1] FP_REGS:[3,-1]
...
...
;; Pressure summary: GR_REGS:29 FP_REGS:3
                            ^^^

Solution
--------
When promoting predecessors, only assign the true deps higher priority.
The rest of predecessors get the same priority as depedant insn.

Implementation
--------------

Predecessor promotion logic can be described in pseudo "C" as follows:

  insn->model_priority = model_next_priority++;
  for (;;)                                                   #0
    {
     FOR_EACH_DEP (insn->insn, SD_LIST_HARD_BACK, sd_it, dep)
       {
	if (pro->insn                                        #1
	      && pro->model_priority != model_next_priority
	      && QUEUE_INDEX (pro->insn) != QUEUE_SCHEDULED)
	 {

            pro->model_priority = model_next_priority;       #2

	    if (QUEUE_INDEX (pro->insn) == QUEUE_READY)
	      {
...
              }
            else // recurse to dependent                     #3
              {
                  pro->next = first;
                  first = pro;
              }
	 }                                  // if not schduled
       }                                    // FOR_EACH_DEP

       if (!first)
        break;
       insn = first;
       first = insn->next;
    }                                       // for
   model_next_priority++ ;

 - The core change of this patch is essentially bifurcating #2 to bump the
   priortiy for true dependency more than the rest of deps.

 - An additional gaurd added in #3, to only recurse for true deps;
   otherwise it can end up clobbering the recurse list with same entry
   showing up multiple times, (e.g. an insn could be predecessor of multiple
   dependant insns: as true dep for first and normal dep for the others).

 - The condition #1 also needs to be tightened for the two levels of
   predecessor priorities.

 - The good (and bad) thing is there is a overarching infinite loop #0
   and any coding snafus tend to hit it pretty fast, just by trying to
   bootstrap the toolchain (specially libgcc) or building glibc off of it.

 - There is also a need to track true dependencies in addition to the
   existing all deps of an insn. This is needed at the call-site of
   promotion logic to only invoke promotion when there's unscheduled
   true dependecies.

 - These changes are NOT gated behing the new target hook as it seems
   like the right thing to do anywhere/everywhere.

Improvement measurement
------------------------
Results are convincing On RISC-V (BPI-F3) run of Cactu.
(Build: -Ofast -march=rv64gcv_zba_zbb_zbs)

 Before:
 ------
 7,631,707,552,979      cycles:u                         #    1.600 GHz
 2,630,225,489,010      instructions:u                   #    0.34  insn per cycle

 After (just this fix)
 -----
 7,100,864,968,062      cycles:u          ( 7% faster)   #    1.600 GHz
 2,180,957,013,712      instructions:u    (17% fewer)    #    0.31  insn per cycle

 Aggregate (with ECC fix)
 ----
 6,736,337,207,427      cycles:u          (12% faster)   #    1.600 GHz
 2,078,712,047,604      instructions:u    (21% fewer)    #    0.31  insn per cycle

 ECC fix alone (prev patch)
 ----
 7,153,778,156,281      cycles:u         ( 6% faster)    #    1.600 GHz
 2,143,115,846,207      instructions:u   (18.5% faster)  #    0.30  insn per cycle

Significant gains are also seen on aarch64 (QEMU dynamic icounts only)
(Build: -Ofast -march=armv9-a+sve2)

 Before                   : 1,382,403,783,566
 After (just this fix)    : 1,237,532,639,657 (10.4% fewer)
 Aggregate (with ECC fix) : 1,113,896,471,282 (19.4% fewer)
 Just ECC fix (prev patch): 1,264,869,192,921 (8.5% fewer)

TBD
---

On RISC-V the individual gains from model pressure fix (7, 17) and
the ECC fix (6, 18.5) are are not adding up with both fixes (12, 21)
indicating that main list schedular could be undoing some of the
model schedule arrangements (which it does anyways for right
reasons). On aarch64 they seem to be accumulating on top nicely,
atleast in the QEMU icounts reduction.

gcc/ChangeLog:

	PR target/114729
	* haifa-sched.cc (struct model_insn_info): Add field
	unscheduled_true_preds.
	(model_analyze_insns): Initialize unscheduled_true_preds.
	(model_add_successors_to_worklist): Decrement
	unscheduled_true_preds.
	(model_promote_predecessors): Handle true dependencies
	differently than the rest.
	(model_choose_insn): Promote only on pending true dependencies.
	(model_dump_pressure_summary): Print BB index.
	(model_start_schedule): Call dump summary with BB reference.
	* sched-rgn.cc (debug_dependencies): Print predecessors for
	debugging aid.

gcc/testsuite/ChangeLog:

	PR target/114729
	* gcc.target/riscv/sched1-spills/hang1.c: New test.
	* gcc.target/riscv/sched1-spills/hang5.c: New test.
	* gcc.target/riscv/sched1-spills/spill2.cpp: New test.

Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
---
 gcc/haifa-sched.cc                            | 66 ++++++++++++++-----
 gcc/sched-rgn.cc                              | 14 +++-
 .../gcc.target/riscv/sched1-spills/hang1.c    | 32 +++++++++
 .../gcc.target/riscv/sched1-spills/hang5.c    | 60 +++++++++++++++++
 .../gcc.target/riscv/sched1-spills/spill2.cpp | 37 +++++++++++
 5 files changed, 192 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/sched1-spills/hang1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/sched1-spills/hang5.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/sched1-spills/spill2.cpp

Comments

Jeff Law Oct. 31, 2024, 1:11 a.m. UTC | #1
On 10/20/24 1:40 PM, Vineet Gupta wrote:
> Background
> ----------
> sched1 runs a preliminary "model schedular" ahead of the main list schedular.
> Its sole purpose is to keep register pressure to mimimum [1] and it uses DFA
> register depenendency tracking to arrange insns.
> 
>     [1] https://gcc.gnu.org/legacy-ml/gcc-patches/2011-12/msg01684.html
> 
>                             `The idea was to construct a preliminary
>     "model" schedule in which the only objective is to keep register
>     pressure to a minimum.  This schedule ignores pipeline characteristics,
>     latencies, and the number of available registers.  The maximum pressure
>     seen in this initial model schedule (MP) is then the benchmark for ECC(X).`
> 
> It starts off with an intial "worklist" of insns w/o any prior dependencies,
> scheduling them, adding successors of scheduled insn to the worklist and so
> on until all insns in the basic block are done.
> It can run into situations where an otherwise to-be-scheduled candidate can't
> because it's predecessors haven't been scheduled yet, requiring
> "predecessor promotion" implemented in model_promote_predecessors ().
> Promotion is essentially bumping INSN model_priority so that it makes it
> towards the head of elligible-to-schedule list.
> 
> An INSN can have multiple dependencies/predecessor nodes, some of them
> being true dependency REG_DEP_TRUE meaning the predecessor register output
> is a must have for the INSN to be scheduled.
> e.g.
> In the sched1 dump below, insn 70 has multiple deps, but 68 and 69 are
> true reg deps:
> 
> ;;	| insn | prio |
> ;;	|   68 |    3 | r217=r144+r146
> ;;	|   69 |    5 | r218=flt(r143)
> ;;	|   70 |    2 | [r217]=r218
> 
> ;;      insn  code    bb   dep  prio  cost   reservation
> ;;      ----  ----    --   ---  ----  ----   -----------
> ;;       70   286     6     6     2     1   alu	: FW: 97tnm 91m 83tn 78m 76tn 72tn
> ;;						: BK: 69t 68t 57n 60n 61n 64n
>                                                        ^^^ ^^^
> 
> Issue
> -----
> Currently predecessor promotion bumps the priority of all predecessors
> to same value, treating the true deps and the rest alike. This simple
> strategy can sometimes cause a subtle inadvertent effect: given the right
> "other" conditions (depth, height etc) a non true dependency can get
> scheduled ahead of the true dep, increasing the live range between the
> true dep and the dependent. This increases the peak register pressure
> for the BB. Subsequently this inflated peak register pressure steers
> the main list schdular, giving it the lower bound to work with.
> Main schedular can make pressure worse (due to instruction latencies
> and pipeline models etc) but otherwise it can work with the given
> peak pressure. Thus a higher model pressure will ultimately lead to a
> less than ideal final schedule.
> 
> This subtle effect get crazy exacerbated on RISC-V SPEC2017 Cactu
> benchmark.
> 
> For the spill2.cpp testcase (reduced from Cactu itself) on RISC-V,
> here's what was seen:
> 
>    - After idx #6, insn 70 predecessors are promoted (see list above).
>      Of the true deps, insn 68 is already schdeuled, insn 69 needs to be.
>      insn 69 does get promoted (higher priority 4) but so does insn 60
>      (prio 4) with its predecessor insn 58 getting even higher
>      promotion (prio 5).
> 
>    - The insn 58 and its deps chain end up being scheduled ahead of insn 70
>      such that now there are 3 insns seperating it from its direct dep
>      insn 69. This blows reg pressure past the spill threshhold of
>      sched_class_regs_num[GR_REGS] as evident from the pressure
>      summary at the end.
> 
> ;;	Model schedule:
> ;;
> ;;	| idx insn | mpri hght dpth prio |
> ;;	|   0   56 |    0    8    0   15 | r210=flt(r185#0)     GR_REGS:[25,+0] FP_REGS:[0,+1]
> ;;	|   1   57 |    0    8    1   12 | [r242+low(`e')]=r210 GR_REGS:[25,+0] FP_REGS:[1,-1]
> ;;	|   2   63 |    2    7    0   12 | r215=r141+r183       GR_REGS:[25,+1] FP_REGS:[0,+0]
> ;;	|   3   64 |    1    8    2   11 | r216=[r215]          GR_REGS:[26,-1] FP_REGS:[0,+1]
> ;;	|   4   65 |    0    8    3    8 | r143=fix(r216)       GR_REGS:[25,+1] FP_REGS:[1,-1]
> ;;	|   5   67 |    0    8    4    4 | r146=r143<<0x3       GR_REGS:[26,+1] FP_REGS:[0,+0]
> ;;	|   6   68 |    0    8    5    3 | r217=r144+r146       GR_REGS:[27,+1] FP_REGS:[0,+0]
> 
> ;;	+--- priority of 70 = 3, priority of 69 60 61 = 4
> 
> ;;	|   7   69 |    4    7    4    5 | r218=flt(r143)        GR_REGS:[28,+0] FP_REGS:[0,+1]
> ;;	|   8   58 |    6    4    0    5 | r211=r138+r183        GR_REGS:[28,+1] FP_REGS:[1,+0]
>                                                                           ^^^^^^^
> ;;	|   9   60 |    5    5    2    4 | r213=[r211]           GR_REGS:[29,-1] FP_REGS:[1,+1]
> ;;	|  10   61 |    4    3    0    4 | r214=[r243+low(`o')]  GR_REGS:[28,+0] FP_REGS:[2,+1]
> ;;	|  11   70 |    3    8    6    2 | [r217]=r218           GR_REGS:[28,-1] FP_REGS:[3,-1]
> ...
> ...
> ;; Pressure summary: GR_REGS:29 FP_REGS:3
So at a 30k foot level, does the model schedule need to actually be a 
correct schedule?  If so, then we probably can't do what you're trying 
to do.  We have a memory dependency between insns 60 and 70.  If r211 
and r217 are potentially the same, then swapping those two instructions 
is unsafe.

And if it doesn't strictly need to be a valid schedule are we giving an 
overly-optimistic view of the best that can be done from a pressure 
standpoint with this change?  And if so, is that wise?

I really don't have answers to those questions.  But I think they're 
pretty fundamental to get a handle on before we can discuss 
implementation details.

jeff
Vineet Gupta Oct. 31, 2024, 7:35 p.m. UTC | #2
On 10/30/24 18:11, Jeff Law wrote:
> On 10/20/24 1:40 PM, Vineet Gupta wrote:
>> Background
>> ----------
>> sched1 runs a preliminary "model schedular" ahead of the main list schedular.
>> Its sole purpose is to keep register pressure to mimimum [1] and it uses DFA
>> register depenendency tracking to arrange insns.
>>
>>     [1] https://gcc.gnu.org/legacy-ml/gcc-patches/2011-12/msg01684.html
>>
>>                             `The idea was to construct a preliminary
>>     "model" schedule in which the only objective is to keep register
>>     pressure to a minimum.  This schedule ignores pipeline characteristics,
>>     latencies, and the number of available registers.  The maximum pressure
>>     seen in this initial model schedule (MP) is then the benchmark for ECC(X).`
>>
>> It starts off with an intial "worklist" of insns w/o any prior dependencies,
>> scheduling them, adding successors of scheduled insn to the worklist and so
>> on until all insns in the basic block are done.
>> It can run into situations where an otherwise to-be-scheduled candidate can't
>> because it's predecessors haven't been scheduled yet, requiring
>> "predecessor promotion" implemented in model_promote_predecessors ().
>> Promotion is essentially bumping INSN model_priority so that it makes it
>> towards the head of elligible-to-schedule list.
>>
>> An INSN can have multiple dependencies/predecessor nodes, some of them
>> being true dependency REG_DEP_TRUE meaning the predecessor register output
>> is a must have for the INSN to be scheduled.
>> e.g.
>> In the sched1 dump below, insn 70 has multiple deps, but 68 and 69 are
>> true reg deps:
>>
>> ;;	| insn | prio |
>> ;;	|   68 |    3 | r217=r144+r146
>> ;;	|   69 |    5 | r218=flt(r143)
>> ;;	|   70 |    2 | [r217]=r218
>>
>> ;;      insn  code    bb   dep  prio  cost   reservation
>> ;;      ----  ----    --   ---  ----  ----   -----------
>> ;;       70   286     6     6     2     1   alu	: FW: 97tnm 91m 83tn 78m 76tn 72tn
>> ;;						: BK: 69t 68t 57n 60n 61n 64n
>>                                                        ^^^ ^^^

Indeed insn 60 is in backwards dependency of insn 70, although its not a true/data dependency.
'n' in dump above implies DEP_NONREG.

>>
>> Issue
>> -----
>> Currently predecessor promotion bumps the priority of all predecessors
>> to same value, treating the true deps and the rest alike. This simple
>> strategy can sometimes cause a subtle inadvertent effect: given the right
>> "other" conditions (depth, height etc) a non true dependency can get
>> scheduled ahead of the true dep, increasing the live range between the
>> true dep and the dependent. This increases the peak register pressure
>> for the BB. Subsequently this inflated peak register pressure steers
>> the main list schdular, giving it the lower bound to work with.
>> Main schedular can make pressure worse (due to instruction latencies
>> and pipeline models etc) but otherwise it can work with the given
>> peak pressure. Thus a higher model pressure will ultimately lead to a
>> less than ideal final schedule.
>>
>> This subtle effect get crazy exacerbated on RISC-V SPEC2017 Cactu
>> benchmark.
>>
>> For the spill2.cpp testcase (reduced from Cactu itself) on RISC-V,
>> here's what was seen:
>>
>>    - After idx #6, insn 70 predecessors are promoted (see list above).
>>      Of the true deps, insn 68 is already schdeuled, insn 69 needs to be.
>>      insn 69 does get promoted (higher priority 4) but so does insn 60
>>      (prio 4) with its predecessor insn 58 getting even higher
>>      promotion (prio 5).
>>
>>    - The insn 58 and its deps chain end up being scheduled ahead of insn 70
>>      such that now there are 3 insns seperating it from its direct dep
>>      insn 69. This blows reg pressure past the spill threshhold of
>>      sched_class_regs_num[GR_REGS] as evident from the pressure
>>      summary at the end.
>>
>> ;;	Model schedule:
>> ;;
>> ;;	| idx insn | mpri hght dpth prio |
>> ;;	|   0   56 |    0    8    0   15 | r210=flt(r185#0)     GR_REGS:[25,+0] FP_REGS:[0,+1]
>> ;;	|   1   57 |    0    8    1   12 | [r242+low(`e')]=r210 GR_REGS:[25,+0] FP_REGS:[1,-1]
>> ;;	|   2   63 |    2    7    0   12 | r215=r141+r183       GR_REGS:[25,+1] FP_REGS:[0,+0]
>> ;;	|   3   64 |    1    8    2   11 | r216=[r215]          GR_REGS:[26,-1] FP_REGS:[0,+1]
>> ;;	|   4   65 |    0    8    3    8 | r143=fix(r216)       GR_REGS:[25,+1] FP_REGS:[1,-1]
>> ;;	|   5   67 |    0    8    4    4 | r146=r143<<0x3       GR_REGS:[26,+1] FP_REGS:[0,+0]
>> ;;	|   6   68 |    0    8    5    3 | r217=r144+r146       GR_REGS:[27,+1] FP_REGS:[0,+0]
>>
>> ;;	+--- priority of 70 = 3, priority of 69 60 61 = 4
>>
>> ;;	|   7   69 |    4    7    4    5 | r218=flt(r143)        GR_REGS:[28,+0] FP_REGS:[0,+1]
>> ;;	|   8   58 |    6    4    0    5 | r211=r138+r183        GR_REGS:[28,+1] FP_REGS:[1,+0]
>>                                                                           ^^^^^^^
>> ;;	|   9   60 |    5    5    2    4 | r213=[r211]           GR_REGS:[29,-1] FP_REGS:[1,+1]
>> ;;	|  10   61 |    4    3    0    4 | r214=[r243+low(`o')]  GR_REGS:[28,+0] FP_REGS:[2,+1]
>> ;;	|  11   70 |    3    8    6    2 | [r217]=r218           GR_REGS:[28,-1] FP_REGS:[3,-1]
>> ...
>> ...
>> ;; Pressure summary: GR_REGS:29 FP_REGS:3
> So at a 30k foot level, does the model schedule need to actually be a 
> correct schedule?

From my reading of the code and the background posts pointed to by Richard [1],
it doesn't have to be - its sole purpose is to compute the *minimum* register pressure.

   [1] https://gcc.gnu.org/legacy-ml/gcc-patches/2011-12/msg01684.html

|                                                      `The idea was to construct a preliminary
| "model" schedule in which the only objective is to keep register
| pressure to a minimum. This schedule ignores pipeline characteristics,
| latencies, and the number of available registers. The maximum pressure
| seen in this initial model schedule (MP) is then the benchmark for ECC(X).`

It creates the initial arrangement of insns but stuff does get moved around by
 main scheduler (seems to happen for this very case too, see below)

> If so, then we probably can't do what you're trying 
> to do.  We have a memory dependency between insns 60 and 70.  If r211 
> and r217 are potentially the same, then swapping those two instructions 
> is unsafe.

Whao ! steely eyes.

So model schedule is  indeed ordering them as 70 -> 60

;;    |   0   56 |    0    8    0   15 | r210=flt(r185#0)                GR_REGS:[25,+0] FP_REGS:[0,+1]
;;    |   1   57 |    0    8    1   12 | [r242+low(`e')]=r210            GR_REGS:[25,+0] FP_REGS:[1,-1]
;;    |   2   63 |    2    7    0   12 | r215=r141+r183                  GR_REGS:[25,+1] FP_REGS:[0,+0]
;;    |   3   64 |    1    8    2   11 | r216=[r215]                     GR_REGS:[26,-1] FP_REGS:[0,+1]
;;    |   4   65 |    0    8    3    8 | r143=fix(r216)                  GR_REGS:[25,+1] FP_REGS:[1,-1]
;;    |   5   67 |    0    8    4    4 | r146=r143<<0x3                  GR_REGS:[26,+1] FP_REGS:[0,+0]
;;    |   6   68 |    0    8    5    3 | r217=r144+r146                  GR_REGS:[27,+1] FP_REGS:[0,+0]
;;    |   7   69 |    4    7    4    5 | r218=flt(r143)                  GR_REGS:[28,+0] FP_REGS:[0,+1]
;;    |   8   70 |    3    8    6    2 | [r217]=r218                     GR_REGS:[28,-1] FP_REGS:[1,-1]
;;    |   9   58 |    6    4    0    5 | r211=r138+r183                  GR_REGS:[27,+1] FP_REGS:[0,+0]
;;    |  10   60 |    5    5    2    4 | r213=[r211]                     GR_REGS:[28,-1] FP_REGS:[0,+1]


And Main scheduler reorders them back in orig order 60 -> 70

;;      0--> b  0: i  56 r210=flt(r185#0)                        :alu:GR_REGS+0(0)FP_REGS+1(1):model 0
;;      1--> b  0: i  63 r215=r141+r183                          :alu:GR_REGS+1(1)FP_REGS+0(0)
;;      2--> b  0: i  61 r214=[r243+low(`o')]                    :alu:@GR_REGS+0(0)@FP_REGS+1(1)
;;      3--> b  0: i  57 [r242+low(`e')]=r210                    :alu:@GR_REGS+0(0)@FP_REGS+0(-1):model 1
;;      4--> b  0: i  64 r216=[r215]                             :alu:GR_REGS+0(-1)FP_REGS+1(1):model 3
;;      5--> b  4: i 102 r179=sxn(r179#0+0x1)                    :alu:GR_REGS+1(0)FP_REGS+0(0)
;;      6--> b  4: i 105 r185=sxn(r185#0+r205#0)                 :alu:GR_REGS+1(0)FP_REGS+0(0)
;;      7--> b  0: i  65 r143=fix(r216)                          :alu:@GR_REGS+1(1)@FP_REGS+0(-1):model 4
;;      8--> b  0: i  58 r211=r138+r183                          :alu:@GR_REGS+1(1)@FP_REGS+0(0)
;;      9--> b  0: i  60 r213=[r211]                             :alu:GR_REGS+0(-1)FP_REGS+1(1)
;;     10--> b  0: i  69 r218=flt(r143)                          :alu:GR_REGS+0(0)FP_REGS+1(1)
;;     11--> b  0: i  67 r146=r143<<0x3                          :alu:@GR_REGS+1(1)@FP_REGS+0(0):model 5
;;     12--> b  0: i  68 r217=r144+r146                          :alu:GR_REGS+1(1)FP_REGS+0(0):model 6
;;     13--> b  0: i  70 [r217]=r218                             :alu:GR_REGS+0(-1)FP_REGS+0(-1):model 8



> And if it doesn't strictly need to be a valid schedule are we giving an 
> overly-optimistic view of the best that can be done from a pressure 
> standpoint with this change?  And if so, is that wise?

As I mentioned above, the design goal of model schedule is to keep pressure to min.
So indeed we might be a bit more optimistic than reality here. But main list scheduler fixes
that if that leads to undesired outcomes. What we are trying to do here is not pessimize
 in certain cases, especially when that's not by design but just an outcome of the
 implementation subtlety.

As I look at it, for this example, the incoming insn stream (at the entry of sched1) doesn't
 lead to spilling. However the "initial" arrangement of insn by model schedule ends up
requiring a spill and the clearly is an algorithmic/implementation oversight if not an
outright bug.


> I really don't have answers to those questions.  But I think they're 
> pretty fundamental to get a handle on before we can discuss 
> implementation details.

Of course these questions and challenges need to be put up if we are trying to making
fundamental behavioral changes to things established over a decade ago.
Every thing must be questioned and reasoned about. My understanding of this is very
recent, it would also be good if Richard or other folks experienced in art chime in as well.

Thx,
-Vineet
Jeff Law Nov. 5, 2024, 12:45 a.m. UTC | #3
On 10/31/24 1:35 PM, Vineet Gupta wrote:

>>>
>>> An INSN can have multiple dependencies/predecessor nodes, some of them
>>> being true dependency REG_DEP_TRUE meaning the predecessor register output
>>> is a must have for the INSN to be scheduled.
>>> e.g.
>>> In the sched1 dump below, insn 70 has multiple deps, but 68 and 69 are
>>> true reg deps:
>>>
>>> ;;	| insn | prio |
>>> ;;	|   68 |    3 | r217=r144+r146
>>> ;;	|   69 |    5 | r218=flt(r143)
>>> ;;	|   70 |    2 | [r217]=r218
>>>
>>> ;;      insn  code    bb   dep  prio  cost   reservation
>>> ;;      ----  ----    --   ---  ----  ----   -----------
>>> ;;       70   286     6     6     2     1   alu	: FW: 97tnm 91m 83tn 78m 76tn 72tn
>>> ;;						: BK: 69t 68t 57n 60n 61n 64n
>>>                                                         ^^^ ^^^
> 
> Indeed insn 60 is in backwards dependency of insn 70, although its not a true/data dependency.
> 'n' in dump above implies DEP_NONREG.
Right.  But it might still be a true/data dependency, just one that 
isn't though a REG object, but a MEM object.  Also note that in some 
cases we factor dependencies to prevent the various algorithms in the 
scheduler from blowing up.   See flush_pending_lists for example.


>>> ;; Pressure summary: GR_REGS:29 FP_REGS:3
>> So at a 30k foot level, does the model schedule need to actually be a
>> correct schedule?
> 
>  From my reading of the code and the background posts pointed to by Richard [1],
> it doesn't have to be - its sole purpose is to compute the *minimum* register pressure.

My default position would be that the schedule would still need to 
produce valid code.  Otherwise the "minimum pressure" concept just 
doesn't make much sense.  I could reorder the insns in all kinds of 
interesting ways that would produce a smaller pressure artifically if I 
wasn't constrained by the need to produce a schedule that honored the 
semantics of the original code.

I didn't see anything in Richard's 2011 post that would indicate that 
the model schedule should have enough freedom to produce an invalid 
schedule.




> 
>     [1] https://gcc.gnu.org/legacy-ml/gcc-patches/2011-12/msg01684.html
> 
> |                                                      `The idea was to construct a preliminary
> | "model" schedule in which the only objective is to keep register
> | pressure to a minimum. This schedule ignores pipeline characteristics,
> | latencies, and the number of available registers. The maximum pressure
> | seen in this initial model schedule (MP) is then the benchmark for ECC(X).`
> 
> It creates the initial arrangement of insns but stuff does get moved around by
>   main scheduler (seems to happen for this very case too, see below)
> 
>> If so, then we probably can't do what you're trying
>> to do.  We have a memory dependency between insns 60 and 70.  If r211
>> and r217 are potentially the same, then swapping those two instructions
>> is unsafe.
> 
> Whao ! steely eyes.
> 
> So model schedule is  indeed ordering them as 70 -> 60
> 
> ;;    |   0   56 |    0    8    0   15 | r210=flt(r185#0)                GR_REGS:[25,+0] FP_REGS:[0,+1]
> ;;    |   1   57 |    0    8    1   12 | [r242+low(`e')]=r210            GR_REGS:[25,+0] FP_REGS:[1,-1]
> ;;    |   2   63 |    2    7    0   12 | r215=r141+r183                  GR_REGS:[25,+1] FP_REGS:[0,+0]
> ;;    |   3   64 |    1    8    2   11 | r216=[r215]                     GR_REGS:[26,-1] FP_REGS:[0,+1]
> ;;    |   4   65 |    0    8    3    8 | r143=fix(r216)                  GR_REGS:[25,+1] FP_REGS:[1,-1]
> ;;    |   5   67 |    0    8    4    4 | r146=r143<<0x3                  GR_REGS:[26,+1] FP_REGS:[0,+0]
> ;;    |   6   68 |    0    8    5    3 | r217=r144+r146                  GR_REGS:[27,+1] FP_REGS:[0,+0]
> ;;    |   7   69 |    4    7    4    5 | r218=flt(r143)                  GR_REGS:[28,+0] FP_REGS:[0,+1]
> ;;    |   8   70 |    3    8    6    2 | [r217]=r218                     GR_REGS:[28,-1] FP_REGS:[1,-1]
> ;;    |   9   58 |    6    4    0    5 | r211=r138+r183                  GR_REGS:[27,+1] FP_REGS:[0,+0]
> ;;    |  10   60 |    5    5    2    4 | r213=[r211]                     GR_REGS:[28,-1] FP_REGS:[0,+1]
Interesting.  Was that without your change, or only with your change?

> 
> 
> And Main scheduler reorders them back in orig order 60 -> 70
As it probably must to produce valid code barring something which tells 
us the two memory objects don't alias.



> 
> 
> 
>> And if it doesn't strictly need to be a valid schedule are we giving an
>> overly-optimistic view of the best that can be done from a pressure
>> standpoint with this change?  And if so, is that wise?
> 
> As I mentioned above, the design goal of model schedule is to keep pressure to min.
> So indeed we might be a bit more optimistic than reality here. But main list scheduler fixes
> that if that leads to undesired outcomes. What we are trying to do here is not pessimize
>   in certain cases, especially when that's not by design but just an outcome of the
>   implementation subtlety.
And my point is that if I'm allowed to generate a minimal register 
pressure schedule without needing it to generate the same semantics as 
the original code, then I could drastically reduce the register pressure 
:-) But I'd also claim that doing so isn't actually useful.

The mental model I'd work from is we want to know the minimal pressure 
while still preserving the original code semantics -- unless someone who 
knows this better (ie Richard S) were to say otherwise.



jeff
Vineet Gupta Nov. 5, 2024, 7:33 p.m. UTC | #4
On 11/4/24 16:45, Jeff Law wrote:
> On 10/31/24 1:35 PM, Vineet Gupta wrote:
>>>> An INSN can have multiple dependencies/predecessor nodes, some of them
>>>> being true dependency REG_DEP_TRUE meaning the predecessor register output
>>>> is a must have for the INSN to be scheduled.
>>>> e.g.
>>>> In the sched1 dump below, insn 70 has multiple deps, but 68 and 69 are
>>>> true reg deps:
>>>>
>>>> ;;	| insn | prio |
>>>> ;;	|   68 |    3 | r217=r144+r146
>>>> ;;	|   69 |    5 | r218=flt(r143)
>>>> ;;	|   70 |    2 | [r217]=r218
>>>>
>>>> ;;      insn  code    bb   dep  prio  cost   reservation
>>>> ;;      ----  ----    --   ---  ----  ----   -----------
>>>> ;;       70   286     6     6     2     1   alu	: FW: 97tnm 91m 83tn 78m 76tn 72tn
>>>> ;;						: BK: 69t 68t 57n 60n 61n 64n
>>>>                                                         ^^^ ^^^
>> Indeed insn 60 is in backwards dependency of insn 70, although its not a true/data dependency.
>> 'n' in dump above implies DEP_NONREG.
> Right.  But it might still be a true/data dependency, just one that 
> isn't though a REG object, but a MEM object.  Also note that in some 
> cases we factor dependencies to prevent the various algorithms in the 
> scheduler from blowing up.   See flush_pending_lists for example.
>
>
>>>> ;; Pressure summary: GR_REGS:29 FP_REGS:3
>>> So at a 30k foot level, does the model schedule need to actually be a
>>> correct schedule?
>>  From my reading of the code and the background posts pointed to by Richard [1],
>> it doesn't have to be - its sole purpose is to compute the *minimum* register pressure.
> My default position would be that the schedule would still need to 
> produce valid code.  Otherwise the "minimum pressure" concept just 
> doesn't make much sense.  I could reorder the insns in all kinds of 
> interesting ways that would produce a smaller pressure artifically if I 
> wasn't constrained by the need to produce a schedule that honored the 
> semantics of the original code.

Fair enough ! But does that mean any load will be potentially scheduled ahead of
any store, especially if the loads address is not computed in the BB and lacking
specific non-aliasing annotations or toggles.

> I didn't see anything in Richard's 2011 post that would indicate that 
> the model schedule should have enough freedom to produce an invalid 
> schedule.

True.

>> So model schedule is  indeed ordering them as 70 -> 60
>> ;;    |   0   56 |    0    8    0   15 | r210=flt(r185#0)                GR_REGS:[25,+0] FP_REGS:[0,+1]
>> ;;    |   1   57 |    0    8    1   12 | [r242+low(`e')]=r210            GR_REGS:[25,+0] FP_REGS:[1,-1]
>> ;;    |   2   63 |    2    7    0   12 | r215=r141+r183                  GR_REGS:[25,+1] FP_REGS:[0,+0]
>> ;;    |   3   64 |    1    8    2   11 | r216=[r215]                     GR_REGS:[26,-1] FP_REGS:[0,+1]
>> ;;    |   4   65 |    0    8    3    8 | r143=fix(r216)                  GR_REGS:[25,+1] FP_REGS:[1,-1]
>> ;;    |   5   67 |    0    8    4    4 | r146=r143<<0x3                  GR_REGS:[26,+1] FP_REGS:[0,+0]
>> ;;    |   6   68 |    0    8    5    3 | r217=r144+r146                  GR_REGS:[27,+1] FP_REGS:[0,+0]
>> ;;    |   7   69 |    4    7    4    5 | r218=flt(r143)                  GR_REGS:[28,+0] FP_REGS:[0,+1]
>> ;;    |   8   70 |    3    8    6    2 | [r217]=r218                     GR_REGS:[28,-1] FP_REGS:[1,-1]
>> ;;    |   9   58 |    6    4    0    5 | r211=r138+r183                  GR_REGS:[27,+1] FP_REGS:[0,+0]
>> ;;    |  10   60 |    5    5    2    4 | r213=[r211]                     GR_REGS:[28,-1] FP_REGS:[0,+1]
> Interesting.  Was that without your change, or only with your change?

With my change.

>> And Main scheduler reorders them back in orig order 60 -> 70
> As it probably must to produce valid code barring something which tells 
> us the two memory objects don't alias.

Yeah need to look if that is really causal or it just happens to be...

>>> And if it doesn't strictly need to be a valid schedule are we giving an
>>> overly-optimistic view of the best that can be done from a pressure
>>> standpoint with this change?  And if so, is that wise?
>> As I mentioned above, the design goal of model schedule is to keep pressure to min.
>> So indeed we might be a bit more optimistic than reality here. But main list scheduler fixes
>> that if that leads to undesired outcomes. What we are trying to do here is not pessimize
>>   in certain cases, especially when that's not by design but just an outcome of the
>>   implementation subtlety.
> And my point is that if I'm allowed to generate a minimal register 
> pressure schedule without needing it to generate the same semantics as 
> the original code, then I could drastically reduce the register pressure 
> :-) But I'd also claim that doing so isn't actually useful.
>
> The mental model I'd work from is we want to know the minimal pressure 
> while still preserving the original code semantics -- unless someone who 
> knows this better (ie Richard S) were to say otherwise.

OK I'll see how we can tighten the constraints a bit more - I still feel there
is value in this approach, of course valid codegen trumps everything.
Given where we are in the release cycle, I'll post  v2 with just patch 1/4 (and
2/4 squashed) with --param approach.

Thx,
-Vineet
Richard Sandiford Nov. 5, 2024, 7:54 p.m. UTC | #5
Sorry, still haven't found time to look at the patch properly
(hopefully after stage 1 closes, if not before), but:

Jeff Law <jeffreyalaw@gmail.com> writes:
> [...]
> On 10/31/24 1:35 PM, Vineet Gupta wrote:
>>> And if it doesn't strictly need to be a valid schedule are we giving an
>>> overly-optimistic view of the best that can be done from a pressure
>>> standpoint with this change?  And if so, is that wise?
>> 
>> As I mentioned above, the design goal of model schedule is to keep pressure to min.
>> So indeed we might be a bit more optimistic than reality here. But main list scheduler fixes
>> that if that leads to undesired outcomes. What we are trying to do here is not pessimize
>>   in certain cases, especially when that's not by design but just an outcome of the
>>   implementation subtlety.
> And my point is that if I'm allowed to generate a minimal register 
> pressure schedule without needing it to generate the same semantics as 
> the original code, then I could drastically reduce the register pressure 
> :-) But I'd also claim that doing so isn't actually useful.
>
> The mental model I'd work from is we want to know the minimal pressure 
> while still preserving the original code semantics -- unless someone who 
> knows this better (ie Richard S) were to say otherwise.

Yeah, that's right.  The "model" schedule was supposed to be a correct
schedule that completely ignored the target pipeline and instead tried
to minimise register pressure.  This was in contrast to the list
scheduler without any pressure heuristics, which tried to fill pipeline
bubbles while completely ignoring register pressure.

The idea was then to strike a balance between the list scheduler's
natural tendency to do things as soon as the pipeline model allowed
vs the model scheduler's tendency to delay things that would increase
pressure.  But all three schedules (the two extremes, and the compromise)
are intended to be correct in isolation.

Richard
diff mbox series

Patch

diff --git a/gcc/haifa-sched.cc b/gcc/haifa-sched.cc
index f8c42d30d5a4..67f99ce00339 100644
--- a/gcc/haifa-sched.cc
+++ b/gcc/haifa-sched.cc
@@ -1862,6 +1862,9 @@  struct model_insn_info {
 
   /* The number of predecessor nodes that must still be scheduled.  */
   int unscheduled_preds;
+
+  /* The subset of above which have true register dependency.  */
+  int unscheduled_true_preds;
 };
 
 /* Information about the pressure limit for a particular register class.
@@ -3529,7 +3532,20 @@  model_analyze_insns (void)
 	insn->old_queue = QUEUE_INDEX (iter);
 	QUEUE_INDEX (iter) = QUEUE_NOWHERE;
 
-	insn->unscheduled_preds = dep_list_size (iter, SD_LIST_HARD_BACK);
+	insn->unscheduled_preds = 0;
+	insn->unscheduled_true_preds = 0;
+	/* opencoded dep_list_size () to get true deps as well.  */
+	FOR_EACH_DEP (iter, SD_LIST_HARD_BACK, sd_it, dep)
+	  {
+	    if (DEBUG_INSN_P (DEP_PRO (dep)))
+	      continue;
+	    insn->unscheduled_preds++;
+	    if (DEP_TYPE (dep) == REG_DEP_TRUE)
+	      insn->unscheduled_true_preds++;
+	  }
+	gcc_assert (insn->unscheduled_preds
+		    == dep_list_size (iter, SD_LIST_HARD_BACK));
+
 	if (insn->unscheduled_preds == 0)
 	  model_add_to_worklist (insn, NULL, model_worklist);
 
@@ -3661,6 +3677,9 @@  model_add_successors_to_worklist (struct model_insn_info *insn)
 	{
 	  con->unscheduled_preds--;
 
+	  if (DEP_TYPE (dep) == REG_DEP_TRUE)
+	    con->unscheduled_true_preds--;
+
 	  /* Update the depth field of each true-dependent successor.
 	     Increasing the depth gives them a higher priority than
 	     before.  */
@@ -3692,7 +3711,7 @@  model_add_successors_to_worklist (struct model_insn_info *insn)
 static void
 model_promote_predecessors (struct model_insn_info *insn)
 {
-  struct model_insn_info *pro, *first;
+  struct model_insn_info *pro, *first, *leaf_true_dep = NULL;
   sd_iterator_def sd_it;
   dep_t dep;
 
@@ -3708,16 +3727,25 @@  model_promote_predecessors (struct model_insn_info *insn)
     {
       FOR_EACH_DEP (insn->insn, SD_LIST_HARD_BACK, sd_it, dep)
 	{
+	  bool true_dep = (DEP_TYPE (dep) == REG_DEP_TRUE);
+
 	  pro = MODEL_INSN_INFO (DEP_PRO (dep));
 	  /* The first test is to ignore debug instructions, and instructions
 	     from other blocks.  */
 	  if (pro->insn
-	      && pro->model_priority != model_next_priority
-	      && QUEUE_INDEX (pro->insn) != QUEUE_SCHEDULED)
+	      && QUEUE_INDEX (pro->insn) != QUEUE_SCHEDULED
+	      && ((true_dep && (pro->model_priority < model_next_priority))
+		  || (!true_dep
+		      && (pro->model_priority < (model_next_priority - 1)))))
 	    {
-	      pro->model_priority = model_next_priority;
+	      if (true_dep)
+		pro->model_priority = model_next_priority;
+	      else
+		pro->model_priority = model_next_priority - 1;
+
 	      if (sched_verbose >= 7)
-		fprintf (sched_dump, " %d", INSN_UID (pro->insn));
+		fprintf (sched_dump, " %d=%d", INSN_UID (pro->insn),
+			 pro->model_priority);
 	      if (QUEUE_INDEX (pro->insn) == QUEUE_READY)
 		{
 		  /* PRO is already in the worklist, but it now has
@@ -3726,12 +3754,14 @@  model_promote_predecessors (struct model_insn_info *insn)
 		  model_remove_from_worklist (pro);
 		  model_add_to_worklist (pro, NULL, model_worklist);
 		}
-	      else
+	      else if (true_dep)
 		{
 		  /* PRO isn't in the worklist.  Recursively process
 		     its predecessors until we find one that is.  */
 		  pro->next = first;
 		  first = pro;
+		  if (pro->unscheduled_true_preds == 0)
+		    leaf_true_dep = pro;
 		}
 	    }
 	}
@@ -3740,9 +3770,15 @@  model_promote_predecessors (struct model_insn_info *insn)
       insn = first;
       first = insn->next;
     }
-  if (sched_verbose >= 7)
-    fprintf (sched_dump, " = %d\n", model_next_priority);
+  if (leaf_true_dep)
+    {
+      gcc_assert (QUEUE_INDEX (leaf_true_dep->insn) == QUEUE_NOWHERE);
+      gcc_assert (leaf_true_dep->model_priority == model_next_priority);
+      model_add_to_worklist (leaf_true_dep, NULL, model_worklist);
+    }
   model_next_priority++;
+  if (sched_verbose >= 7)
+    fprintf (sched_dump, "  NEXT prio = %d\n", model_next_priority);
 }
 
 /* Pick one instruction from model_worklist and process it.  */
@@ -3760,10 +3796,10 @@  model_choose_insn (void)
       count = param_max_sched_ready_insns;
       while (count > 0 && insn)
 	{
-	  fprintf (sched_dump, ";;\t+---   %d [%d, %d, %d, %d]\n",
+	  fprintf (sched_dump, ";;\t+---   %d [%d, %d, %d, %d][%d]\n",
 		   INSN_UID (insn->insn), insn->model_priority,
 		   insn->depth + insn->alap, insn->depth,
-		   INSN_PRIORITY (insn->insn));
+		   INSN_PRIORITY (insn->insn), insn->unscheduled_preds);
 	  count--;
 	  insn = insn->next;
 	}
@@ -3824,7 +3860,7 @@  model_choose_insn (void)
 	fprintf (sched_dump, ";;\t+--- promoting insn %d, which is ready\n",
 		 INSN_UID (insn->insn));
     }
-  if (insn->unscheduled_preds)
+  if (insn->unscheduled_true_preds)
     /* INSN isn't yet ready to issue.  Give all its predecessors the
        highest priority.  */
     model_promote_predecessors (insn);
@@ -3857,11 +3893,11 @@  model_reset_queue_indices (void)
    to sched_dump.  */
 
 static void
-model_dump_pressure_summary (void)
+model_dump_pressure_summary (basic_block bb)
 {
   int pci, cl;
 
-  fprintf (sched_dump, ";; Pressure summary:");
+  fprintf (sched_dump, ";; Pressure summary (bb %d):", bb->index);
   for (pci = 0; pci < ira_pressure_classes_num; pci++)
     {
       cl = ira_pressure_classes[pci];
@@ -3900,7 +3936,7 @@  model_start_schedule (basic_block bb)
   model_curr_point = 0;
   initiate_reg_pressure_info (df_get_live_in (bb));
   if (sched_verbose >= 1)
-    model_dump_pressure_summary ();
+    model_dump_pressure_summary (bb);
 }
 
 /* Free the information associated with GROUP.  */
diff --git a/gcc/sched-rgn.cc b/gcc/sched-rgn.cc
index 3d8cff76aaf9..bedccc9f2b83 100644
--- a/gcc/sched-rgn.cc
+++ b/gcc/sched-rgn.cc
@@ -2855,15 +2855,25 @@  void debug_dependencies (rtx_insn *head, rtx_insn *tail)
       else
 	print_reservation (sched_dump, insn);
 
-      fprintf (sched_dump, "\t: ");
+      fprintf (sched_dump, "\t: FW:");
       {
 	sd_iterator_def sd_it;
 	dep_t dep;
 
 	FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
-	  fprintf (sched_dump, "%d%s%s ", INSN_UID (DEP_CON (dep)),
+	  fprintf (sched_dump, " %d%s%s%s", INSN_UID (DEP_CON (dep)),
+		   DEP_TYPE (dep) == REG_DEP_TRUE ? "t" : "",
 		   DEP_NONREG (dep) ? "n" : "",
 		   DEP_MULTIPLE (dep) ? "m" : "");
+	if (sched_verbose >= 5)
+	  {
+	    fprintf (sched_dump, "\n;;\t\t\t\t\t\t: BK:");
+	    FOR_EACH_DEP (insn, SD_LIST_HARD_BACK, sd_it, dep)
+	      fprintf (sched_dump, " %d%s%s%s", INSN_UID (DEP_PRO (dep)),
+		       DEP_TYPE (dep) == REG_DEP_TRUE ? "t" : "",
+		       DEP_NONREG (dep) ? "n" : "",
+		       DEP_MULTIPLE (dep) ? "m" : "");
+	  }
       }
       fprintf (sched_dump, "\n");
     }
diff --git a/gcc/testsuite/gcc.target/riscv/sched1-spills/hang1.c b/gcc/testsuite/gcc.target/riscv/sched1-spills/hang1.c
new file mode 100644
index 000000000000..312bea1fede0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sched1-spills/hang1.c
@@ -0,0 +1,32 @@ 
+/* Reduced from libgcc/_divtc3.c
+   - Interim version of model schedule changes would cause a infinite loop
+     during predecessor promotion.
+
+     insn 11 predecessor promotion called with insn 9 being predecessor
+     of two insn 10 and insn 11.
+
+;;   --- Region Dependences --- b 2 bb 0
+    ...
+;;       10    76     2     1     4     3   alu	: FW: 11tm
+;;						: BK: 9t
+;;       11   357     2     6     1     1   alu	: FW:
+;;						: BK: 5 6 8 9 10tm 7tm
+    ...
+;;	Model schedule:
+;;
+;;	| idx insn | mpri hght dpth prio |
+;;	|   0    5 |    0    3    0    8 | r138=`a'
+;;	|   1    6 |    0    3    1    7 | r140=[r138]
+;;	|   2    7 |    0    3    2    4 | r139=abs(r140)
+    ...
+;;	+--- priority of 11 = 2, priority of 8=2 9=2 10=3 9=3 8=3  */
+
+
+/* { dg-options "-O2 -fPIC -march=rv64gc_zba_zbb_zbs_zfa -mabi=lp64d" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "O1" "-Og" "-Os" "-Oz" } } */
+
+float a, b;
+void c() {
+  if (__builtin_fabsl(a) < __builtin_fabsl(b))
+    a = 2;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/sched1-spills/hang5.c b/gcc/testsuite/gcc.target/riscv/sched1-spills/hang5.c
new file mode 100644
index 000000000000..0edb04223234
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sched1-spills/hang5.c
@@ -0,0 +1,60 @@ 
+/* Reduced from glibc:iconv/gconv_simple.c
+   - Interim version of model schedule changes would cause a infinite loop
+     during predecessor promotion because true dependency was not getting
+     added to worklist.
+
+     insn 19 is indirect predecessor of multiple insns, but not direct of
+     any in the BB.
+
+;;   --- Region Dependences --- b 4 bb 0
+    ...
+;;       19   275     4     0     2     1   alu	: FW: 26 20
+;;						: BK:
+;;       20     5     4     2     2     1   alu	: FW: 26tm
+;;						: BK: 15 19
+;;       22   125     4     1     5     3   alu	: FW: 26 25t
+;;						: BK: 18tn
+;;       25   104     4     1     2     1   alu	: FW: 26tm
+;;						: BK: 22t
+;;       26   353     4     7     1     1   alu	: FW:
+;;						: BK: 15 16 19 22 25tm 20tm 18m
+    ...
+;;	Model schedule:
+;;
+;;	| idx insn | mpri hght dpth prio |
+;;	|   0   15 |    0    5    0   10 | r161=r134+r140
+;;	|   1   16 |    0    5    1    9 | r139=zxn([r161+0x4])
+;;	|   2   18 |    0    5    2    6 | [r178+low(`g')]=r139#0
+;;	|   3   22 |    0    5    3    5 | r164=sxn([r134])
+;;	|   4   25 |    0    5    4    2 | r166=r164&0x7
+    ...
+;;	+--- priority of 26 = 1, priority of 19=1 20=2 19=1  NEXT prio = 3
+;;	+--- priority of 26 = 3, priority of 19=3 20=4 19=3  NEXT prio = 5  */
+
+
+/* { dg-options "-O2 -march=rv64gc_zba_zbb_zbs_zfa -mabi=lp64d" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "O1" "-Og" "-Os" "-Oz" } } */
+
+typedef struct {
+  int a;
+  struct {
+    char b[];
+  } c;
+} d;
+struct e {
+  d *f;
+} i;
+char g[4];
+char h;
+void j(struct e *m) {
+  d *k;
+  long a = 0, l;
+  for (; a < (m->f->a & 7); ++a)
+    g[0] = m->f->c.b[a];
+  for (; a < l; a++)
+    k->c.b[a] = h;
+}
+void n() {
+  if (i.f->a & 7)
+    j(&i);
+}
diff --git a/gcc/testsuite/gcc.target/riscv/sched1-spills/spill2.cpp b/gcc/testsuite/gcc.target/riscv/sched1-spills/spill2.cpp
new file mode 100644
index 000000000000..cb739ac6a7d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sched1-spills/spill2.cpp
@@ -0,0 +1,37 @@ 
+/* Reduced from SPEC2017 Cactu ML_BSSN_RHS.cpp
+   Showed spill despite the prior ECC fix (vs. -fno-schedule-insns) due to
+   promotion of non true predecessors (essentally led to this patch).  */
+
+/* { dg-options "-O2 -march=rv64gc -mabi=lp64d -save-temps -fverbose-asm" } */
+/* { dg-final { scan-assembler-times "%sfp" 0 } } */
+
+void a();
+double *b, *c, *d, *f, *g, *h;
+double e, o, q;
+int k, l, n;
+int *m;
+long p;
+void r() {
+  long ai = p;
+  for (;;)
+    for (int j; n; ++j)
+      for (int i(m[0]); i; i++) {
+        long aj = i * j;
+        e = aj;
+        double am = b[aj], ba = am, bf = ba * o;
+        c[aj] = aj = f[aj];
+        double aq = g[aj];
+        double at = ((double *)a)[aj];
+        switch (l)
+        case 2:
+          (&aj)[ai] = (&d[aj])[ai];
+        double ax(aq);
+        double ay(k == 1 ? at : 0);
+        double az = ay;
+        double be = ax;
+        double bg = az * q * be * bf;
+        double bh = bg;
+        h[aj] = bh;
+      }
+}
+