diff mbox series

RFC model schedule tweak (was Re: sched1 pathology on RISC-V : PR/114729)

Message ID d26aae29-ba2f-4772-bfb9-ecf5a0218818@rivosinc.com
State New
Headers show
Series RFC model schedule tweak (was Re: sched1 pathology on RISC-V : PR/114729) | expand

Commit Message

Vineet Gupta Sept. 10, 2024, 5:39 a.m. UTC
On 8/27/24 18:10, Vineet Gupta wrote:
> On 8/7/24 10:47, Richard Sandiford wrote:
>> is probably not appropriate.  We should probably just use the baseECC,
>> as suggested by the first sentence in the comment.  It looks like the hack:
>>
>> diff --git a/gcc/haifa-sched.cc b/gcc/haifa-sched.cc
>> index 1bc610f9a5f..9601e929a88 100644
>> --- a/gcc/haifa-sched.cc
>> +++ b/gcc/haifa-sched.cc
>> @@ -2512,7 +2512,7 @@ model_set_excess_costs (rtx_insn **insns, int count)
>>  	    print_p = true;
>>  	  }
>>  	cost = model_excess_cost (insns[i], print_p);
>> -	if (cost <= 0)
>> +	if (cost <= 0 && 0)
>>  	  {
>>  	    priority = INSN_PRIORITY (insns[i]) - insn_delay (insns[i]) - cost;
>>  	    priority_base = MAX (priority_base, priority);
>> @@ -2525,6 +2525,7 @@ model_set_excess_costs (rtx_insn **insns, int count)
>>  
>>    /* Use MAX (baseECC, 0) and baseP to calculcate ECC for each
>>       instruction.  */
>> +  if (0)
>>    for (i = 0; i < count; i++)
>>      {
>>        cost = INSN_REG_PRESSURE_EXCESS_COST_CHANGE (insns[i]);
>>
>> fixes things for me.  Perhaps we should replace these && 0s
>> with a query for an out-of-order core?
>>
>> I haven't benchmarked this. :)  And I'm looking at the code for the
>> first time in many years, so I'm certainly forgetting details.
> Back to the original issue, I reran reduce with the hack and see the issue still.
> I've updated the PR but here's the gist essentially:
>
>         -fno-schedule-insns             |      -fschedule-insns
>                                         |
> 	li	t3,2			| 	li	t5,2
> 	li	t1,1			| 	li	t4,1
>               ---xxx---			| 	sd	s10,8(sp)	# %sfp
> .L2:					| .L2:
>               ---xxx---			| 	ld	a5,8(sp)	# %sfp
> 	beq	s10,zero,.L9		| 	beq	a5,zero,.L9		
> 	mv	a1,s6			| 	mv	a1,a6			
> 	beq	s6,zero,.L7		| 	beq	a6,zero,.L7		
> 	mulw	a2,s5,s6		| 	mulw	a2,a0,a6		
> 	mv	a0,s5			| 	mv	s10,a0			
> 	slli	a6,s5,3			| 	slli	s11,a0,3		
> 	slli	a3,a2,3			| 	slli	a3,a2,3			
> .L6:					| .L6:
> 	fcvt.d.w fa5,a2		# 56	| 	fcvt.d.w fa5,a2         # 56
> 	add	a5,s4,a3        # 63	| 	add	a4,s5,a3        # 63
> 	fsd	fa5,%lo(e)(t6)	# 57	| 	add	a5,s6,a3        # 58
> 	fld	fa4,0(a5)	# 64	| 	fsd	fa5,%lo(e)(t2)  # 57
> 	fld	fa5,%lo(o)(t5)		| 	fld	fa3,0(a4)       # 64
> 	add	a5,s3,a3	# 58	| 	fld	fa4,0(a5)       # 60
> 	fmul.d	fa4,fa4,fa5		| 	fld	fa5,%lo(o)(t0)	
> 	fld	fa5,0(a5)	# 60	| 	fcvt.l.d a4,fa3,rtz
>
> schedule_insn () is called as follows:
>
> ;; 0--> b  0: i  56 r210=flt(r185#0) :alu: GR_REGS+0(0) FP_REGS+1(1):model 0
> ;; 1--> b  0: i  63 r215=r141+r183   :alu: GR_REGS+1(1) FP_REGS+0(0)  
> ;; 2--> b  0: i  58 r211=r138+r183   :alu:@GR_REGS+1(1)@FP_REGS+0(0)
>
>
> The prints between insn 63 and insn 58 show the issue
>
> ;;  point uid  prio delay  class : del   ECC-gpr          del  ECC-fpr    ECC-tot
> ;;|   1   57 |   12  +2 | GR_REGS:[0 base cost 0] FP_REGS:[-1 base cost 0]ECC 0
> ;;|  10   61 |    4  +1 | GR_REGS:[0 base cost 0] FP_REGS:[1 base cost 0] ECC 0
> ;;|   8   58 |    5  +1 | GR_REGS:[1 base cost 0] FP_REGS:[0 base cost 0] ECC 0
>
> ;; --1-- RFS_PRESSURE_DELAY winner 58 looser 57
> ;; --2-- RFS_PRESSURE_DELAY winner 58 looser 57
>
> (1) insn 57 delta is -1 (reduces pressure), while insn 58 delta is 1 (increases pressure) yet ECC is 0 for both, meaning for pressure considerations
> they are treated the same. This ECC "dilution" happens in model_spill_cost ().
>
> (2). insn 57 is a store (so prio 2) while insn 58 is an add (so prio 1).
>
> rank_for_schedule (): RFS_PRESSURE_DELAY chooses lower of (ECC + insn_delay) and thus picks 58 over 57 causing the issue.

Turns out that the issue was happening much earlier, at the time of building model schedule itself.
 A sink insn gets delayed more than it should causing the pressure to exceed sched_class_regs_num [].

The insn stream coming into sched1 is following:

;;   ======================================================
;;   -- basic block 6 from 56 to 78 -- before reload
;;   ======================================================
;;    | insn | prio |
;;    |   56 |   15 | r210=flt(r185#0)        
;;    |   57 |   12 | [r242+low(`e')]=r210    
;;    |   58 |    5 | r211=r138+r183
;;    |   60 |    4 | r213=[r211]   
;;    |   61 |    4 | r214=[r243+low(`o')]
;;    |   62 |    1 | r175=r213*r214          
;;    |   63 |   12 | r215=r141+r183
;;    |   64 |   11 | r216=[r215]
;;    |   65 |    8 | r143=fix(r216)          
;;    |   67 |    4 | r146=r143<<0x3          
;;    |   68 |    3 | r217=r144+r146          
;;    |   69 |    5 | r218=flt(r143)          
;;    |   70 |    2 | [r217]=r218             
;;    |   71 |    2 | r219=r149+r146          
;;    |   72 |    1 | r177=[r219]             
;;    |   75 |    2 | r220=r246+r146          
;;    |   76 |    1 | r171=[r220]             
;;    |   78 |    1 | pc={(r152!=r249)?L88:pc}

;;   --- Region Dependences --- b 6 bb 0 
;;      insn  code    bb   dep  prio  cost   reservation
;;      ----  ----    --   ---  ----  ----   -----------
;;       56   164     6     0    15     3   alu    : 105m 78 57 
;;       57   286     6     1    12     1   alu    : 97nm 91m 83n 78m 72n 70n 64n 60n 
;;       58     5     6     0     5     1   alu    : 108m 78 60 
;;       60   286     6     2     4     3   alu    : 97nm 78 70n 62 
;;       61   286     6     0     4     3   alu    : 97nm 78 70n 62 
;;       62    22     6     2     1     7   alu    : 100m 78 
;;       63     5     6     0    12     1   alu    : 108m 78 64 
;;       64   286     6     2    11     3   alu    : 97nm 78 70n 65 
;;       65   158     6     1     8     3   alu    : 80 78 69 67 
;;       67   297     6     1     4     1   alu    : 95m 87 78 75 71 68 
;;       68     5     6     1     3     1   alu    : 87 78 70 
;;       69   165     6     1     5     3   alu    : 78 70 
;;       70   286     6     6     2     1   alu    : 97nm 91m 83n 78m 76n 72n 
;;       71     5     6     1     2     1   alu    : 87 78 72 
;;       72   286     6     3     1     3   alu    : 99m 97nm 78 
;;       75     5     6     1     2     1   alu    : 87 78 76 
;;       76   286     6     2     1     3   alu    : 98m 97nm 158m 78 
;;       78   353     6    17     1     1   alu    : 97nm 91nm 86 85n 83 


The Model schedule calculated is not ideal,and the pressure 29 *will* lead to spill.

;;    Model schedule:
;;
;;    | idx insn | mpri hght dpth prio |
;;    |   0   56 |    0    8    0   15 | r210=flt(r185#0)                GR_REGS:[25,+0] FP_REGS:[0,+1]
;;    |   1   57 |    0    8    1   12 | [r242+low(`e')]=r210            GR_REGS:[25,+0] FP_REGS:[1,-1]
;;    |   2   63 |    2    7    0   12 | r215=r141+r183                  GR_REGS:[25,+1] FP_REGS:[0,+0]
;;    |   3   64 |    1    8    2   11 | r216=[r215]                     GR_REGS:[26,-1] FP_REGS:[0,+1]
;;    |   4   65 |    0    8    3    8 | r143=fix(r216)                  GR_REGS:[25,+1] FP_REGS:[1,-1]
;;    |   5   67 |    0    8    4    4 | r146=r143<<0x3                  GR_REGS:[26,+1] FP_REGS:[0,+0]
;;    |   6   68 |    0    8    5    3 | r217=r144+r146                  GR_REGS:[27,+1] FP_REGS:[0,+0]
;;    |   7   69 |    4    7    4    5 | r218=flt(r143)                  GR_REGS:[28,+0] FP_REGS:[0,+1]

;;    |   8   58 |    6    4    0    5 | r211=r138+r183                  GR_REGS:[28,+1] FP_REGS:[1,+0]  <-- pressure blows up here
;;    |   9   60 |    5    5    2    4 | r213=[r211]                     GR_REGS:[29,-1] FP_REGS:[1,+1] 
;;    |  10   61 |    4    3    0    4 | r214=[r243+low(`o')]            GR_REGS:[28,+0] FP_REGS:[2,+1]

;;    |  11   70 |    3    8    6    2 | [r217]=r218                     GR_REGS:[28,-1] FP_REGS:[3,-1]
;;    |  12   62 |    0    4    3    1 | r175=r213*r214                  GR_REGS:[27,+0] FP_REGS:[2,-1]
;;    |  13   71 |    8    7    5    2 | r219=r149+r146                  GR_REGS:[27,+1] FP_REGS:[1,+0]
;;    |  14   72 |    7    8    7    1 | r177=[r219]                     GR_REGS:[28,-1] FP_REGS:[1,+1]
;;    |  15   75 |   10    7    5    2 | r220=r246+r146                  GR_REGS:[27,+1] FP_REGS:[2,+0]
;;    |  16   76 |    9    8    7    1 | r171=[r220]                     GR_REGS:[28,-1] FP_REGS:[2,+1]
;;    |  17   78 |    0    0    0    1 | pc={(r152!=r249)?L88:pc}        GR_REGS:[27,+0] FP_REGS:[3,+0]
;; Pressure summary: GR_REGS:29 FP_REGS:3    # dumps model_before_pressure.limits[pci].pressure


**The issue is insn 70 not following immediately after insn 69. increasing the live range - and the spike in curr_reg_pressure [ ]**

If we zoom in, the issue happens when insn 70 predecessors are promoted via model priority. The True deps insn 69 as well not true deps insn 60, 61
are promoted - latter is what delays insn 70.

;;    |   6   68 |    0    8    5    3 | r217=r144+r146                  GR_REGS:[27,+1] FP_REGS:[0,+0]

    <-- model_choose_insn() called with model_worklist pointing to insn 70

;;    +--- worklist:
                 mpri hght dpth prio
;;    +---   70 [0, 8, 6, 2]    
;;    +---   71 [0, 7, 5, 2]
;;    +---   75 [0, 7, 5, 2]
;;    +---   69 [0, 7, 4, 5]
;;    +---   60 [0, 5, 2, 4]
;;    +---   58 [0, 4, 0, 5]
;;    +---   72 [0, 3, 2, 1]
;;    +---   61 [0, 3, 0, 4]

    <-- since 70 has unscheduled_preds, model_choose_insn() -> model_promote_predecessors (insn 70)

;;    +--- priority of 70 = 3, priority of 69 60 61 = 4    <-- 70 depends on 69 whose model priority gets bumped
                                                                priority of other deps 60 and 61 also bumped
;;    +--- worklist:
;;    +---   69 [4, 7, 4, 5]   <- ok
;;    +---   60 [4, 5, 2, 4]   <- nok
;;    +---   61 [4, 3, 0, 4]
;;    +---   70 [3, 8, 6, 2]   <- nok
;;    +---   71 [0, 7, 5, 2]
;;    +---   75 [0, 7, 5, 2]
;;    +---   58 [0, 4, 0, 5]
;;    +---   72 [0, 3, 2, 1]

    <-- 69 rightly picked as it is prereq for 70

;;    |   7   69 |    4    7    4    5 | r218=flt(r143)                  GR_REGS:[28,+0] FP_REGS:[0,+1]

    <-- BUT bumped model priority of 60 causes it to be picked, NOT 70 (this is THE issue)

;;    +--- worklist:
                 mpri hght dpth prio
;;    +---   60 [4, 5, 2, 4]
;;    +---   61 [4, 3, 0, 4]
;;    +---   70 [3, 8, 6, 2]
;;    +---   71 [0, 7, 5, 2]
;;    +---   75 [0, 7, 5, 2]
;;    +---   58 [0, 4, 0, 5]
;;    +---   72 [0, 3, 2, 1]


Attached patch which distinguishes between true predecessors vs. others fixes the spill in this test case.
The patch is not production yet as it causes infinite loop when building libgcc, however this is an RFC for the direction of fix.

Test case (-O2 -march=rv64gc_zba -mabi=lp64d), full sched1 log w/ and w/o my patch

Thx,
-Vineet
diff mbox series

Patch

From 41465628fe4fb3dea9d45f93b263b4aac2bf1694 Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vineetg@rivosinc.com>
Date: Mon, 9 Sep 2024 13:29:49 -0700
Subject: [RFC] sched1: model pressure deps promotion to be conservative
 [PR/114729]

On RISC-V, sched1 triggers a spill pathology for SPEC2017 Cactu adding
an additional 1.2 trillion dynamic icounts (user mode qemu) or almost
half of the total 2.4 trillion.

This is a multitude of various smaller issues.

One of the issues is the "model schedule" building in
model_start_schedule() -> model_choose_insn ().
An insn requires its predecessors to be added to the model first which
is done in model_promote_predecessors ().

However adding all predecessors including the non true deps ones causes
the successor insn to be delayed more than it should, increasing the
pressure.

Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
---
 gcc/haifa-sched.cc | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/gcc/haifa-sched.cc b/gcc/haifa-sched.cc
index 47e3c1788f6e..4db310207b5b 100644
--- a/gcc/haifa-sched.cc
+++ b/gcc/haifa-sched.cc
@@ -1862,6 +1862,8 @@  struct model_insn_info {
 
   /* The number of predecessor nodes that must still be scheduled.  */
   int unscheduled_preds;
+
+  int unscheduled_true_preds;
 };
 
 /* Information about the pressure limit for a particular register class.
@@ -3484,7 +3486,18 @@  model_analyze_insns (void)
 	insn->old_queue = QUEUE_INDEX (iter);
 	QUEUE_INDEX (iter) = QUEUE_NOWHERE;
 
-	insn->unscheduled_preds = dep_list_size (iter, SD_LIST_HARD_BACK);
+        insn->unscheduled_preds = 0;
+        insn->unscheduled_true_preds = 0;
+	FOR_EACH_DEP (iter, SD_LIST_HARD_BACK, sd_it, dep)
+	  {
+	    if (!DEBUG_INSN_P (DEP_PRO (dep)))
+	      {
+		insn->unscheduled_preds++;
+		if (DEP_TYPE (dep) == REG_DEP_TRUE)
+		  insn->unscheduled_true_preds++;
+	      }
+	  }
+	gcc_assert(insn->unscheduled_preds == dep_list_size (iter, SD_LIST_HARD_BACK));
 	if (insn->unscheduled_preds == 0)
 	  model_add_to_worklist (insn, NULL, model_worklist);
 
@@ -3616,6 +3629,9 @@  model_add_successors_to_worklist (struct model_insn_info *insn)
 	{
 	  con->unscheduled_preds--;
 
+	  if (DEP_TYPE (dep) == REG_DEP_TRUE)
+	      con->unscheduled_true_preds--;
+
 	  /* Update the depth field of each true-dependent successor.
 	     Increasing the depth gives them a higher priority than
 	     before.  */
@@ -3647,6 +3663,7 @@  model_add_successors_to_worklist (struct model_insn_info *insn)
 static void
 model_promote_predecessors (struct model_insn_info *insn)
 {
+  unsigned int model_new_priority;
   struct model_insn_info *pro, *first;
   sd_iterator_def sd_it;
   dep_t dep;
@@ -3654,7 +3671,9 @@  model_promote_predecessors (struct model_insn_info *insn)
   if (sched_verbose >= 7)
     fprintf (sched_dump, ";;\t+--- priority of %d = %d, priority of",
 	     INSN_UID (insn->insn), model_next_priority);
-  insn->model_priority = model_next_priority++;
+  insn->model_priority = model_next_priority;
+  model_new_priority = model_next_priority + 1;
+
   model_remove_from_worklist (insn);
   model_add_to_worklist_at (insn, NULL);
 
@@ -3670,7 +3689,6 @@  model_promote_predecessors (struct model_insn_info *insn)
 	      && pro->model_priority != model_next_priority
 	      && QUEUE_INDEX (pro->insn) != QUEUE_SCHEDULED)
 	    {
-	      pro->model_priority = model_next_priority;
 	      if (sched_verbose >= 7)
 		fprintf (sched_dump, " %d", INSN_UID (pro->insn));
 	      if (QUEUE_INDEX (pro->insn) == QUEUE_READY)
@@ -3678,6 +3696,10 @@  model_promote_predecessors (struct model_insn_info *insn)
 		  /* PRO is already in the worklist, but it now has
 		     a higher priority than before.  Move it at the
 		     appropriate place.  */
+		  if (DEP_TYPE (dep) == REG_DEP_TRUE)
+		    {
+		      pro->model_priority = model_new_priority;
+		    }
 		  model_remove_from_worklist (pro);
 		  model_add_to_worklist (pro, NULL, model_worklist);
 		}
@@ -3696,8 +3718,8 @@  model_promote_predecessors (struct model_insn_info *insn)
       first = insn->next;
     }
   if (sched_verbose >= 7)
-    fprintf (sched_dump, " = %d\n", model_next_priority);
-  model_next_priority++;
+    fprintf (sched_dump, " = %d\n", model_new_priority);
+  model_next_priority = model_new_priority + 1;
 }
 
 /* Pick one instruction from model_worklist and process it.  */
@@ -3783,7 +3805,8 @@  model_choose_insn (void)
     /* INSN isn't yet ready to issue.  Give all its predecessors the
        highest priority.  */
     model_promote_predecessors (insn);
-  else
+
+  if (!insn->unscheduled_true_preds)
     {
       /* INSN is ready.  Add it to the end of model_schedule and
 	 process its successors.  */
-- 
2.43.0