diff mbox series

[u-boot-marvell] PLEASE TEST ddr: marvell: a38x: fix SPLIT_OUT_MIX state decision

Message ID 20220104202827.9120-1-kabel@kernel.org
State Superseded
Delegated to: Stefan Roese
Headers show
Series [u-boot-marvell] PLEASE TEST ddr: marvell: a38x: fix SPLIT_OUT_MIX state decision | expand

Commit Message

Marek Behún Jan. 4, 2022, 8:28 p.m. UTC
From: Marek Behún <marek.behun@nic.cz>

Hello,

continuing my  last discussion with Chris [1] about this, could you
please test this change? (For Chris, mainly on your x530, since last
time you said it hanged your board in SPL.)

It should fix DDR3 training issues.

[1] https://lore.kernel.org/u-boot/20210208191225.14645-1-marek.behun@nic.cz/

Signed-off-by: Marek Behún <marek.behun@nic.cz>
---
 .../a38x/ddr3_training_centralization.c       | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

Comments

Chris Packham Jan. 4, 2022, 10:10 p.m. UTC | #1
Hi Marek,

On Wed, Jan 5, 2022 at 9:28 AM Marek Behún <kabel@kernel.org> wrote:
>
> From: Marek Behún <marek.behun@nic.cz>
>
> Hello,
>
> continuing my  last discussion with Chris [1] about this, could you
> please test this change? (For Chris, mainly on your x530, since last
> time you said it hanged your board in SPL.)

I still get a hang after "Returning to BootROM (return address
0xffff05c4)... " (after the DDR training).

>
> It should fix DDR3 training issues.
>
> [1] https://lore.kernel.org/u-boot/20210208191225.14645-1-marek.behun@nic.cz/
>
> Signed-off-by: Marek Behún <marek.behun@nic.cz>
> ---
>  .../a38x/ddr3_training_centralization.c       | 27 +++++++++++++++++++
>  1 file changed, 27 insertions(+)
>
> diff --git a/drivers/ddr/marvell/a38x/ddr3_training_centralization.c b/drivers/ddr/marvell/a38x/ddr3_training_centralization.c
> index 648b37ef6f..ed799757b9 100644
> --- a/drivers/ddr/marvell/a38x/ddr3_training_centralization.c
> +++ b/drivers/ddr/marvell/a38x/ddr3_training_centralization.c
> @@ -55,6 +55,7 @@ static int ddr3_tip_centralization(u32 dev_num, u32 mode)
>         enum hws_training_ip_stat training_result[MAX_INTERFACE_NUM];
>         u32 if_id, pattern_id, bit_id;
>         u8 bus_id;
> +       u8 current_byte_status;
>         u8 cur_start_win[BUS_WIDTH_IN_BITS];
>         u8 centralization_result[MAX_INTERFACE_NUM][BUS_WIDTH_IN_BITS];
>         u8 cur_end_win[BUS_WIDTH_IN_BITS];
> @@ -166,6 +167,10 @@ static int ddr3_tip_centralization(u32 dev_num, u32 mode)
>                                                   result[search_dir_id][7]));
>                                 }
>
> +                               current_byte_status =
> +                                       mv_ddr_tip_sub_phy_byte_status_get(if_id,
> +                                                                          bus_id);
> +
>                                 for (bit_id = 0; bit_id < BUS_WIDTH_IN_BITS;
>                                      bit_id++) {
>                                         /* check if this code is valid for 2 edge, probably not :( */
> @@ -174,11 +179,33 @@ static int ddr3_tip_centralization(u32 dev_num, u32 mode)
>                                                                [HWS_LOW2HIGH]
>                                                                [bit_id],
>                                                                EDGE_1);
> +                                       if (current_byte_status &
> +                                           (BYTE_SPLIT_OUT_MIX |
> +                                            BYTE_HOMOGENEOUS_SPLIT_OUT)) {
> +                                               if (cur_start_win[bit_id] >= 64)
> +                                                       cur_start_win[bit_id] -= 64;
> +                                               else
> +                                                       cur_start_win[bit_id] = 0;
> +                                               DEBUG_CENTRALIZATION_ENGINE
> +                                                       (DEBUG_LEVEL_INFO,
> +                                                        ("pattern %d IF %d pup %d bit %d subtract 64 adll from start\n",
> +                                                         pattern_id, if_id, bus_id, bit_id));
> +                                       }
>                                         cur_end_win[bit_id] =
>                                                 GET_TAP_RESULT(result
>                                                                [HWS_HIGH2LOW]
>                                                                [bit_id],
>                                                                EDGE_1);
> +                                       if (cur_end_win[bit_id] >= 64 &&
> +                                           (current_byte_status &
> +                                            BYTE_SPLIT_OUT_MIX)) {
> +                                               cur_end_win[bit_id] -= 64;
> +                                               DEBUG_CENTRALIZATION_ENGINE
> +                                                       (DEBUG_LEVEL_INFO,
> +                                                        ("pattern %d IF %d pup %d bit %d subtract 64 adll from end\n",
> +                                                         pattern_id, if_id, bus_id, bit_id));
> +                                       }
> +
>                                         /* window length */
>                                         current_window[bit_id] =
>                                                 cur_end_win[bit_id] -
> --
> 2.34.1
>
Marek Behún Jan. 5, 2022, 9:10 a.m. UTC | #2
On Wed, 5 Jan 2022 11:10:51 +1300
Chris Packham <judge.packham@gmail.com> wrote:

> Hi Marek,
> 
> On Wed, Jan 5, 2022 at 9:28 AM Marek Behún <kabel@kernel.org> wrote:
> >
> > From: Marek Behún <marek.behun@nic.cz>
> >
> > Hello,
> >
> > continuing my  last discussion with Chris [1] about this, could you
> > please test this change? (For Chris, mainly on your x530, since last
> > time you said it hanged your board in SPL.)  
> 
> I still get a hang after "Returning to BootROM (return address
> 0xffff05c4)... " (after the DDR training).

Hi Chris,

would it be possible for me to connect to a computer to which the x530
is connected via UART so that I can debug it? Could you prepare such an
environment? It would also need a mechanism to reset the board remotely
somehow. I prepared a similar remote debugging environment for Marvell
when they were solving some issues for us.

If not, then I guess we will just have to send debugging code and
results back and forth.

Marek
Chris Packham Jan. 5, 2022, 10:51 p.m. UTC | #3
On Wed, Jan 5, 2022 at 10:10 PM Marek Behún <kabel@kernel.org> wrote:
>
> On Wed, 5 Jan 2022 11:10:51 +1300
> Chris Packham <judge.packham@gmail.com> wrote:
>
> > Hi Marek,
> >
> > On Wed, Jan 5, 2022 at 9:28 AM Marek Behún <kabel@kernel.org> wrote:
> > >
> > > From: Marek Behún <marek.behun@nic.cz>
> > >
> > > Hello,
> > >
> > > continuing my  last discussion with Chris [1] about this, could you
> > > please test this change? (For Chris, mainly on your x530, since last
> > > time you said it hanged your board in SPL.)
> >
> > I still get a hang after "Returning to BootROM (return address
> > 0xffff05c4)... " (after the DDR training).
>
> Hi Chris,
>
> would it be possible for me to connect to a computer to which the x530
> is connected via UART so that I can debug it? Could you prepare such an
> environment? It would also need a mechanism to reset the board remotely
> somehow. I prepared a similar remote debugging environment for Marvell
> when they were solving some issues for us.
>
> If not, then I guess we will just have to send debugging code and
> results back and forth.
>
> Marek

It might be possible. We can probably set something up to be internet
accessible and have remotely controllable PDUs. Unfortunately most of
the people who can set this up are still on holiday and by the time we
get that sorted I'll be on leave again.

In terms of the actual problem the plot thickens. The board I had been
testing with was a production unit with MV88F6820-B0 at 1600 MHz (DDR
800MHz). To debug the hang I wanted to attach a JTAG debugger so I
found an old prototype that had the JTAG header fitted, but the
problem didn't occur. The older prototype has MV88F6820-A0 at 1866 MHz
(DDR 933MHz) (as well as the usual collection of re-work wires that
prototypes often accumulate). I don't know if the A0 vs B0 differences
are significant or if the clock speed has some impact.

I'll get the JTAG header fitted to my production unit and see where I get to.
Chris Packham Jan. 6, 2022, 3:04 a.m. UTC | #4
On Thu, Jan 6, 2022 at 11:51 AM Chris Packham <judge.packham@gmail.com> wrote:
>
> On Wed, Jan 5, 2022 at 10:10 PM Marek Behún <kabel@kernel.org> wrote:
> >
> > On Wed, 5 Jan 2022 11:10:51 +1300
> > Chris Packham <judge.packham@gmail.com> wrote:
> >
> > > Hi Marek,
> > >
> > > On Wed, Jan 5, 2022 at 9:28 AM Marek Behún <kabel@kernel.org> wrote:
> > > >
> > > > From: Marek Behún <marek.behun@nic.cz>
> > > >
> > > > Hello,
> > > >
> > > > continuing my  last discussion with Chris [1] about this, could you
> > > > please test this change? (For Chris, mainly on your x530, since last
> > > > time you said it hanged your board in SPL.)
> > >
> > > I still get a hang after "Returning to BootROM (return address
> > > 0xffff05c4)... " (after the DDR training).
> >
> > Hi Chris,
> >
> > would it be possible for me to connect to a computer to which the x530
> > is connected via UART so that I can debug it? Could you prepare such an
> > environment? It would also need a mechanism to reset the board remotely
> > somehow. I prepared a similar remote debugging environment for Marvell
> > when they were solving some issues for us.
> >
> > If not, then I guess we will just have to send debugging code and
> > results back and forth.
> >
> > Marek
>
> It might be possible. We can probably set something up to be internet
> accessible and have remotely controllable PDUs. Unfortunately most of
> the people who can set this up are still on holiday and by the time we
> get that sorted I'll be on leave again.
>
> In terms of the actual problem the plot thickens. The board I had been
> testing with was a production unit with MV88F6820-B0 at 1600 MHz (DDR
> 800MHz). To debug the hang I wanted to attach a JTAG debugger so I
> found an old prototype that had the JTAG header fitted, but the
> problem didn't occur. The older prototype has MV88F6820-A0 at 1866 MHz
> (DDR 933MHz) (as well as the usual collection of re-work wires that
> prototypes often accumulate). I don't know if the A0 vs B0 differences
> are significant or if the clock speed has some impact.
>
> I'll get the JTAG header fitted to my production unit and see where I get to.

When I got the JTAG debugger attached to my production board I could
see that with the "split change" after training was complete (just
before the jump to u-boot proper) some of the DDR was inaccessible
(random 4byte words throughout the memory). The lockup was a data
abort trying to run the code out of RAM. Without the change the memory
was "good".

I think we might be running over some old issues that we've
experienced with our internal u-boot fork. We first started to see
these as we ramped production and started seeing DDR training failures
on some boards (but not others using the same PCB and DDR). With the
increased scrutiny on the DDR we did end up making some layout changes
on the newer boards and have tried to work around the issues in SW for
the existing boards.

I actually thought the recent changes to the mvebu ddr code may have
addressed some of these issues as the board I had on my desk had been
working happily with upstream u-boot despite having a suspect board
layout. The workaround for the older boards that we've been using in
our u-boot fork is to disable ECC. This appears to do the trick with
Marek's patch so I'm tempted to say let's just go with that (I'll post
a patch for that shortly).
diff mbox series

Patch

diff --git a/drivers/ddr/marvell/a38x/ddr3_training_centralization.c b/drivers/ddr/marvell/a38x/ddr3_training_centralization.c
index 648b37ef6f..ed799757b9 100644
--- a/drivers/ddr/marvell/a38x/ddr3_training_centralization.c
+++ b/drivers/ddr/marvell/a38x/ddr3_training_centralization.c
@@ -55,6 +55,7 @@  static int ddr3_tip_centralization(u32 dev_num, u32 mode)
 	enum hws_training_ip_stat training_result[MAX_INTERFACE_NUM];
 	u32 if_id, pattern_id, bit_id;
 	u8 bus_id;
+	u8 current_byte_status;
 	u8 cur_start_win[BUS_WIDTH_IN_BITS];
 	u8 centralization_result[MAX_INTERFACE_NUM][BUS_WIDTH_IN_BITS];
 	u8 cur_end_win[BUS_WIDTH_IN_BITS];
@@ -166,6 +167,10 @@  static int ddr3_tip_centralization(u32 dev_num, u32 mode)
 						  result[search_dir_id][7]));
 				}
 
+				current_byte_status =
+					mv_ddr_tip_sub_phy_byte_status_get(if_id,
+									   bus_id);
+
 				for (bit_id = 0; bit_id < BUS_WIDTH_IN_BITS;
 				     bit_id++) {
 					/* check if this code is valid for 2 edge, probably not :( */
@@ -174,11 +179,33 @@  static int ddr3_tip_centralization(u32 dev_num, u32 mode)
 							       [HWS_LOW2HIGH]
 							       [bit_id],
 							       EDGE_1);
+					if (current_byte_status &
+					    (BYTE_SPLIT_OUT_MIX |
+					     BYTE_HOMOGENEOUS_SPLIT_OUT)) {
+						if (cur_start_win[bit_id] >= 64)
+							cur_start_win[bit_id] -= 64;
+						else
+							cur_start_win[bit_id] = 0;
+						DEBUG_CENTRALIZATION_ENGINE
+							(DEBUG_LEVEL_INFO,
+							 ("pattern %d IF %d pup %d bit %d subtract 64 adll from start\n",
+							  pattern_id, if_id, bus_id, bit_id));
+					}
 					cur_end_win[bit_id] =
 						GET_TAP_RESULT(result
 							       [HWS_HIGH2LOW]
 							       [bit_id],
 							       EDGE_1);
+					if (cur_end_win[bit_id] >= 64 &&
+					    (current_byte_status &
+					     BYTE_SPLIT_OUT_MIX)) {
+						cur_end_win[bit_id] -= 64;
+						DEBUG_CENTRALIZATION_ENGINE
+							(DEBUG_LEVEL_INFO,
+							 ("pattern %d IF %d pup %d bit %d subtract 64 adll from end\n",
+							  pattern_id, if_id, bus_id, bit_id));
+					}
+
 					/* window length */
 					current_window[bit_id] =
 						cur_end_win[bit_id] -