diff mbox

[v4] board-qemu: add private hcall to inform host on "phandle" update

Message ID 150089967343.21184.17474774545195474225.stgit@bahia.lan
State Accepted
Headers show

Commit Message

Greg Kurz July 24, 2017, 12:34 p.m. UTC
The "interrupt-map" property in each PHB node references the "phandle"
property of the "interrupt-controller" node. This is used by the guest
OS to setup IRQs for any PCI device plugged into the PHB. QEMU sets this
property to an arbitrary value in the flattened DT passed to SLOF.

Since commit 82954d4c1088, SLOF has some generic code to convert all
references to any "phandle" property to a SLOF specific value.

This is is perfectly okay for coldplug devices, since the guest OS only
sees the converted value in "interrupt-map". It is a problem though for
hotplug devices. Since they don't go through SLOF, the guest OS receives
the arbitrary value set by QEMU and fails to setup IRQs.

In order to support PHB hotplug, this patch introduces a new private
hcall, which allows SLOF to tell QEMU that a "phandle" was converted
from an old value to a new value.

Suggested-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Greg Kurz <groug@kaod.org>
---
v4: - fix bad stack usage in fdt-hv-update-phandle error path

v3: - drop "<> 0" before IF

v2: - use ?dup
    - switch the order of the parameters of hv-update-phandle
    - added stack comment to hv-update-phandle
---
 board-qemu/slof/fdt.fs    |   14 ++++++++++++--
 lib/libhvcall/hvcall.code |    7 +++++++
 lib/libhvcall/hvcall.in   |    1 +
 lib/libhvcall/libhvcall.h |    1 +
 4 files changed, 21 insertions(+), 2 deletions(-)

Comments

Alexey Kardashevskiy July 25, 2017, 7 a.m. UTC | #1
On 24/07/17 22:34, Greg Kurz wrote:
> The "interrupt-map" property in each PHB node references the "phandle"
> property of the "interrupt-controller" node. This is used by the guest
> OS to setup IRQs for any PCI device plugged into the PHB. QEMU sets this
> property to an arbitrary value in the flattened DT passed to SLOF.
> 
> Since commit 82954d4c1088, SLOF has some generic code to convert all
> references to any "phandle" property to a SLOF specific value.
> 
> This is is perfectly okay for coldplug devices, since the guest OS only
> sees the converted value in "interrupt-map". It is a problem though for
> hotplug devices. Since they don't go through SLOF, the guest OS receives
> the arbitrary value set by QEMU and fails to setup IRQs.
> 
> In order to support PHB hotplug, this patch introduces a new private
> hcall, which allows SLOF to tell QEMU that a "phandle" was converted
> from an old value to a new value.

Thanks, applied.

> 
> Suggested-by: Thomas Huth <thuth@redhat.com>
> Signed-off-by: Greg Kurz <groug@kaod.org>
> ---
> v4: - fix bad stack usage in fdt-hv-update-phandle error path
> 
> v3: - drop "<> 0" before IF
> 
> v2: - use ?dup
>     - switch the order of the parameters of hv-update-phandle
>     - added stack comment to hv-update-phandle
> ---
>  board-qemu/slof/fdt.fs    |   14 ++++++++++++--
>  lib/libhvcall/hvcall.code |    7 +++++++
>  lib/libhvcall/hvcall.in   |    1 +
>  lib/libhvcall/libhvcall.h |    1 +
>  4 files changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/board-qemu/slof/fdt.fs b/board-qemu/slof/fdt.fs
> index 851645eae299..a24e3449e7c1 100644
> --- a/board-qemu/slof/fdt.fs
> +++ b/board-qemu/slof/fdt.fs
> @@ -308,18 +308,28 @@ fdt-claim-reserve
>     3drop
>  ;
>  
> +\ Tell QEMU about the updated phandle:
> +: fdt-hv-update-phandle ( old new -- )
> +   hv-update-phandle ?dup IF
> +      \ Ignore hcall not implemented error, print error otherwise
> +      dup -2 <> IF ." HV-UPDATE-PHANDLE error: " . cr ELSE drop THEN
> +   THEN
> +;
> +
>  \ Replace one FDT phandle "val" with a OF1275 phandle "node" in the
>  \ whole tree:
>  : fdt-update-phandle ( val node -- )
>     >r
>     FALSE TO (fdt-phandle-replaced)
> -   r@ s" /" find-node               ( val node root )
> -   fdt-replace-all-phandles
> +   r@ 2dup s" /" find-node          ( val node val node root )
> +   fdt-replace-all-phandles         ( val node )
>     (fdt-phandle-replaced) IF
> +      fdt-hv-update-phandle
>        r@ set-node
>        s" phandle" delete-property
>        s" linux,phandle" delete-property
>     ELSE
> +      2drop
>        diagnostic-mode? IF
>           cr ." Warning: Did not replace phandle in " r@ node>path type cr
>        THEN
> diff --git a/lib/libhvcall/hvcall.code b/lib/libhvcall/hvcall.code
> index 0ff50f27e8a9..834974862ef5 100644
> --- a/lib/libhvcall/hvcall.code
> +++ b/lib/libhvcall/hvcall.code
> @@ -129,3 +129,10 @@ PRIM(check_X2d_and_X2d_patch_X2d_sc1)
>  
>  	patch_broken_sc1((void*)start, (void*)end, (void*)patch_ins);
>  MIRP
> +
> +// : hv-update-phandle ( old_phandle new_phandle -- res )
> +PRIM(hv_X2d_update_X2d_phandle)
> +	uint32_t new_phandle = TOS.u; POP;
> +	uint32_t old_phandle = TOS.u;
> +	TOS.u = hv_generic(KVMPPC_H_UPDATE_PHANDLE, old_phandle, new_phandle);
> +MIRP
> diff --git a/lib/libhvcall/hvcall.in b/lib/libhvcall/hvcall.in
> index 4437b77f001d..ab7513af8977 100644
> --- a/lib/libhvcall/hvcall.in
> +++ b/lib/libhvcall/hvcall.in
> @@ -31,4 +31,5 @@ cod(RX!)
>  cod(hv-logical-memop)
>  cod(hv-cas)
>  cod(hv-rtas-update)
> +cod(hv-update-phandle)
>  cod(get-print-version)
> diff --git a/lib/libhvcall/libhvcall.h b/lib/libhvcall/libhvcall.h
> index b2ea3f6bf944..5776a2b772f7 100644
> --- a/lib/libhvcall/libhvcall.h
> +++ b/lib/libhvcall/libhvcall.h
> @@ -25,6 +25,7 @@
>  /* Client Architecture support */
>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>  #define KVMPPC_H_RTAS_UPDATE    (KVMPPC_HCALL_BASE + 0x3)
> +#define KVMPPC_H_UPDATE_PHANDLE (KVMPPC_HCALL_BASE + 0x4)
>  
>  #ifndef __ASSEMBLY__
>  
> 
> _______________________________________________
> SLOF mailing list
> SLOF@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/slof
>
Greg Kurz Sept. 6, 2017, 12:33 p.m. UTC | #2
On Tue, 25 Jul 2017 17:00:54 +1000
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> On 24/07/17 22:34, Greg Kurz wrote:
> > The "interrupt-map" property in each PHB node references the "phandle"
> > property of the "interrupt-controller" node. This is used by the guest
> > OS to setup IRQs for any PCI device plugged into the PHB. QEMU sets this
> > property to an arbitrary value in the flattened DT passed to SLOF.
> > 
> > Since commit 82954d4c1088, SLOF has some generic code to convert all
> > references to any "phandle" property to a SLOF specific value.
> > 
> > This is is perfectly okay for coldplug devices, since the guest OS only
> > sees the converted value in "interrupt-map". It is a problem though for
> > hotplug devices. Since they don't go through SLOF, the guest OS receives
> > the arbitrary value set by QEMU and fails to setup IRQs.
> > 
> > In order to support PHB hotplug, this patch introduces a new private
> > hcall, which allows SLOF to tell QEMU that a "phandle" was converted
> > from an old value to a new value.  
> 
> Thanks, applied.
> 

But David Gibson seems to be "really, really" opposed to this solution as
expressed in this thread:

https://lists.nongnu.org/archive/html/qemu-ppc/2017-07/msg00556.html

He's asking if we can change SLOF to not use phandles as raw pointers
instead... I must admit that this goes way beyond my very limited
knowledge on SLOF and forth... :-\

I hope that someone here can provide some hints on the feasibility
and effort needed to satisfy David's request.

Cheers,

--
Greg

> > 
> > Suggested-by: Thomas Huth <thuth@redhat.com>
> > Signed-off-by: Greg Kurz <groug@kaod.org>
> > ---
> > v4: - fix bad stack usage in fdt-hv-update-phandle error path
> > 
> > v3: - drop "<> 0" before IF
> > 
> > v2: - use ?dup
> >     - switch the order of the parameters of hv-update-phandle
> >     - added stack comment to hv-update-phandle
> > ---
> >  board-qemu/slof/fdt.fs    |   14 ++++++++++++--
> >  lib/libhvcall/hvcall.code |    7 +++++++
> >  lib/libhvcall/hvcall.in   |    1 +
> >  lib/libhvcall/libhvcall.h |    1 +
> >  4 files changed, 21 insertions(+), 2 deletions(-)
> > 
> > diff --git a/board-qemu/slof/fdt.fs b/board-qemu/slof/fdt.fs
> > index 851645eae299..a24e3449e7c1 100644
> > --- a/board-qemu/slof/fdt.fs
> > +++ b/board-qemu/slof/fdt.fs
> > @@ -308,18 +308,28 @@ fdt-claim-reserve
> >     3drop
> >  ;
> >  
> > +\ Tell QEMU about the updated phandle:
> > +: fdt-hv-update-phandle ( old new -- )
> > +   hv-update-phandle ?dup IF
> > +      \ Ignore hcall not implemented error, print error otherwise
> > +      dup -2 <> IF ." HV-UPDATE-PHANDLE error: " . cr ELSE drop THEN
> > +   THEN
> > +;
> > +
> >  \ Replace one FDT phandle "val" with a OF1275 phandle "node" in the
> >  \ whole tree:
> >  : fdt-update-phandle ( val node -- )  
> >     >r  
> >     FALSE TO (fdt-phandle-replaced)
> > -   r@ s" /" find-node               ( val node root )
> > -   fdt-replace-all-phandles
> > +   r@ 2dup s" /" find-node          ( val node val node root )
> > +   fdt-replace-all-phandles         ( val node )
> >     (fdt-phandle-replaced) IF
> > +      fdt-hv-update-phandle
> >        r@ set-node
> >        s" phandle" delete-property
> >        s" linux,phandle" delete-property
> >     ELSE
> > +      2drop
> >        diagnostic-mode? IF
> >           cr ." Warning: Did not replace phandle in " r@ node>path type cr
> >        THEN
> > diff --git a/lib/libhvcall/hvcall.code b/lib/libhvcall/hvcall.code
> > index 0ff50f27e8a9..834974862ef5 100644
> > --- a/lib/libhvcall/hvcall.code
> > +++ b/lib/libhvcall/hvcall.code
> > @@ -129,3 +129,10 @@ PRIM(check_X2d_and_X2d_patch_X2d_sc1)
> >  
> >  	patch_broken_sc1((void*)start, (void*)end, (void*)patch_ins);
> >  MIRP
> > +
> > +// : hv-update-phandle ( old_phandle new_phandle -- res )
> > +PRIM(hv_X2d_update_X2d_phandle)
> > +	uint32_t new_phandle = TOS.u; POP;
> > +	uint32_t old_phandle = TOS.u;
> > +	TOS.u = hv_generic(KVMPPC_H_UPDATE_PHANDLE, old_phandle, new_phandle);
> > +MIRP
> > diff --git a/lib/libhvcall/hvcall.in b/lib/libhvcall/hvcall.in
> > index 4437b77f001d..ab7513af8977 100644
> > --- a/lib/libhvcall/hvcall.in
> > +++ b/lib/libhvcall/hvcall.in
> > @@ -31,4 +31,5 @@ cod(RX!)
> >  cod(hv-logical-memop)
> >  cod(hv-cas)
> >  cod(hv-rtas-update)
> > +cod(hv-update-phandle)
> >  cod(get-print-version)
> > diff --git a/lib/libhvcall/libhvcall.h b/lib/libhvcall/libhvcall.h
> > index b2ea3f6bf944..5776a2b772f7 100644
> > --- a/lib/libhvcall/libhvcall.h
> > +++ b/lib/libhvcall/libhvcall.h
> > @@ -25,6 +25,7 @@
> >  /* Client Architecture support */
> >  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> >  #define KVMPPC_H_RTAS_UPDATE    (KVMPPC_HCALL_BASE + 0x3)
> > +#define KVMPPC_H_UPDATE_PHANDLE (KVMPPC_HCALL_BASE + 0x4)
> >  
> >  #ifndef __ASSEMBLY__
> >  
> > 
> > _______________________________________________
> > SLOF mailing list
> > SLOF@lists.ozlabs.org
> > https://lists.ozlabs.org/listinfo/slof
> >   
> 
>
Segher Boessenkool Sept. 6, 2017, 1:41 p.m. UTC | #3
On Wed, Sep 06, 2017 at 02:33:45PM +0200, Greg Kurz wrote:
> But David Gibson seems to be "really, really" opposed to this solution as
> expressed in this thread:
> 
> https://lists.nongnu.org/archive/html/qemu-ppc/2017-07/msg00556.html
> 
> He's asking if we can change SLOF to not use phandles as raw pointers
> instead... I must admit that this goes way beyond my very limited
> knowledge on SLOF and forth... :-\

A phandle is an opaque cookie to everything but the firmware itself.
Using pointers to some internal structure works just fine; most Open
Firmware implementations do this.

Anything other than OF itself should *not* make up phandles.  There
is no way to guarantee these will be unique, so it is a non-starter.

If QEMU wants to create a device node, it should ask OF to do it,
say, via new-device.


Segher
Greg Kurz Sept. 6, 2017, 2:40 p.m. UTC | #4
On Wed, 6 Sep 2017 08:41:21 -0500
Segher Boessenkool <segher@kernel.crashing.org> wrote:

> On Wed, Sep 06, 2017 at 02:33:45PM +0200, Greg Kurz wrote:
> > But David Gibson seems to be "really, really" opposed to this solution as
> > expressed in this thread:
> > 
> > https://lists.nongnu.org/archive/html/qemu-ppc/2017-07/msg00556.html
> > 
> > He's asking if we can change SLOF to not use phandles as raw pointers
> > instead... I must admit that this goes way beyond my very limited
> > knowledge on SLOF and forth... :-\  
> 
> A phandle is an opaque cookie to everything but the firmware itself.
> Using pointers to some internal structure works just fine; most Open
> Firmware implementations do this.
> 
> Anything other than OF itself should *not* make up phandles.  There
> is no way to guarantee these will be unique, so it is a non-starter.
> 
> If QEMU wants to create a device node, it should ask OF to do it,
> say, via new-device.
> 
> 
> Segher

Thanks for the quick answer!

Hmm... QEMU creates an FDT and writes it into the guest memory to pass
it to SLOF. QEMU uses arbitrary and unique numbers when a phandle is
needed (we just need one actually for the interrupt controller which
is referenced by each PHB). SLOF then fixes all the QEMU-generated
phandles to "opaque cookies" (see commit 82954d4c1088)... the problem
is that if we want to hotplug another PHB later on, SLOF isn't involved
anymore and QEMU doesn't know about the "opaque cookie" to be put in
the node for this PHB... I hope I'm clear enough.

This being said, you partly answer the question when you say:

"A phandle is an opaque cookie to everything but the firmware itself."

and

"Anything other than OF itself should *not* make up phandles."

Even if QEMU takes care of only forging unique phandles, it already
does some arm-twisting with the OF specification... but would it be
acceptable for SLOF to have a translation table when using phandles
generated by QEMU ?

Cheers,

--
Greg
Segher Boessenkool Sept. 6, 2017, 3:05 p.m. UTC | #5
On Wed, Sep 06, 2017 at 04:40:58PM +0200, Greg Kurz wrote:
> > A phandle is an opaque cookie to everything but the firmware itself.
> > Using pointers to some internal structure works just fine; most Open
> > Firmware implementations do this.
> > 
> > Anything other than OF itself should *not* make up phandles.  There
> > is no way to guarantee these will be unique, so it is a non-starter.
> > 
> > If QEMU wants to create a device node, it should ask OF to do it,
> > say, via new-device.

> Hmm... QEMU creates an FDT and writes it into the guest memory to pass
> it to SLOF.

It probably should skip that FDT intermediate step, just create all
nodes it wants directly?  Sounds a lot simpler, and all current phandle
problems will disappear too.

> QEMU uses arbitrary and unique numbers when a phandle is
> needed (we just need one actually for the interrupt controller which
> is referenced by each PHB). SLOF then fixes all the QEMU-generated
> phandles to "opaque cookies" (see commit 82954d4c1088)... the problem
> is that if we want to hotplug another PHB later on, SLOF isn't involved
> anymore and QEMU doesn't know about the "opaque cookie" to be put in
> the node for this PHB... I hope I'm clear enough.
> 
> This being said, you partly answer the question when you say:
> 
> "A phandle is an opaque cookie to everything but the firmware itself."
> 
> and
> 
> "Anything other than OF itself should *not* make up phandles."
> 
> Even if QEMU takes care of only forging unique phandles,

It cannot.  It cannot know about all phandles the firmware uses.  It
can guess, sure, but that is very fragile.

> it already
> does some arm-twisting with the OF specification... but would it be
> acceptable for SLOF to have a translation table when using phandles
> generated by QEMU ?

Then you need to translate from "qhandle" to phandle every time anything
changes, keep both up to date.  Why not just have actual phandles?
Those stay up to date always (namely, they never change).

Why does QEMU think it is a good idea to make up phandles out of thin
air?  Or, why does it do it if it is not a good idea?


Segher
Mark Cave-Ayland Sept. 6, 2017, 6:42 p.m. UTC | #6
On 06/09/17 14:41, Segher Boessenkool wrote:

> On Wed, Sep 06, 2017 at 02:33:45PM +0200, Greg Kurz wrote:
>> But David Gibson seems to be "really, really" opposed to this solution as
>> expressed in this thread:
>>
>> https://lists.nongnu.org/archive/html/qemu-ppc/2017-07/msg00556.html
>>
>> He's asking if we can change SLOF to not use phandles as raw pointers
>> instead... I must admit that this goes way beyond my very limited
>> knowledge on SLOF and forth... :-\
> 
> A phandle is an opaque cookie to everything but the firmware itself.
> Using pointers to some internal structure works just fine; most Open
> Firmware implementations do this.
> 
> Anything other than OF itself should *not* make up phandles.  There
> is no way to guarantee these will be unique, so it is a non-starter.
> 
> If QEMU wants to create a device node, it should ask OF to do it,
> say, via new-device.

Definitely phandles should be considered an opaque for use by the
firmware only, so I agree the approach in this patch sounds wrong.

I think I missed the first part of this thread, however can someone
explain why hotplug devices need to update the SLOF DT? All OSs I've
seen iterate the entire DT and store a local copy during early boot,
primarily because once you've taken over memory management it's almost
impossible to call OF without crashing because the OF doesn't have
complete knowledge of the kernel mappings.

Therefore if the guest OS doesn't read the DT after early boot, and it
can handle the hotplug itself then why does OF need to know? Surely it
would just pick up the new device in the updated DT generated by QEMU at
the next reset as normal?


ATB,

Mark.
Alexey Kardashevskiy Sept. 7, 2017, 2:21 a.m. UTC | #7
On 07/09/17 04:42, Mark Cave-Ayland wrote:
> On 06/09/17 14:41, Segher Boessenkool wrote:
> 
>> On Wed, Sep 06, 2017 at 02:33:45PM +0200, Greg Kurz wrote:
>>> But David Gibson seems to be "really, really" opposed to this solution as
>>> expressed in this thread:
>>>
>>> https://lists.nongnu.org/archive/html/qemu-ppc/2017-07/msg00556.html
>>>
>>> He's asking if we can change SLOF to not use phandles as raw pointers
>>> instead... I must admit that this goes way beyond my very limited
>>> knowledge on SLOF and forth... :-\
>>
>> A phandle is an opaque cookie to everything but the firmware itself.
>> Using pointers to some internal structure works just fine; most Open
>> Firmware implementations do this.
>>
>> Anything other than OF itself should *not* make up phandles.  There
>> is no way to guarantee these will be unique, so it is a non-starter.
>>
>> If QEMU wants to create a device node, it should ask OF to do it,
>> say, via new-device.
> 
> Definitely phandles should be considered an opaque for use by the
> firmware only, so I agree the approach in this patch sounds wrong.
> 
> I think I missed the first part of this thread, however can someone
> explain why hotplug devices need to update the SLOF DT? 

They do not.

When hotplug happens, QEMU provides a DT blob to the guest for a new PHB
which includes "interrupt-map" which tells how PHB interrupts are mapped to
a global interrupt controller (XICS) which is identified by that phandle.
So this blob has to have a valid XICS phandle - QEMU could get it somehow
from SLOF but this hypercall is new. QEMU makes it up instead as we needed
to construct the interrupt-map of the default PHB and it requires the XICS
phandle which is not known at the time as SLOF has not started yet and has
not generated phandles.

We could move that interrupt-map creating code ([1]) to SLOF, get rid of
the phandle fixing hack ([2]) but this won't help with hotplugged PHBs - we
will still have to have the interrupt-map code in QEMU's PHB hotplug path +
hcall from SLOF to update the XICS phandle in QEMU, propably this is the
way to go.

[1]
https://git.qemu.org/?p=qemu.git;a=blob;f=hw/ppc/spapr_pci.c;h=d84abf1070a0ae58ef13162d00c0d80e147f1b19;hb=HEAD#l2166
[2]
https://git.qemu.org/?p=SLOF.git;a=blob;f=board-qemu/slof/fdt.fs;h=a24e3449e7c1fc61d1894d600afc95cfc6fe6518;hb=HEAD#l353


> All OSs I've
> seen iterate the entire DT and store a local copy during early boot,
> primarily because once you've taken over memory management it's almost
> impossible to call OF without crashing because the OF doesn't have
> complete knowledge of the kernel mappings.
> 
> Therefore if the guest OS doesn't read the DT after early boot, and it
> can handle the hotplug itself then why does OF need to know? Surely it
> would just pick up the new device in the updated DT generated by QEMU at
> the next reset as normal?
David Gibson Sept. 7, 2017, 6:21 a.m. UTC | #8
On Wed, Sep 06, 2017 at 02:33:45PM +0200, Greg Kurz wrote:
> On Tue, 25 Jul 2017 17:00:54 +1000
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
> > On 24/07/17 22:34, Greg Kurz wrote:
> > > The "interrupt-map" property in each PHB node references the "phandle"
> > > property of the "interrupt-controller" node. This is used by the guest
> > > OS to setup IRQs for any PCI device plugged into the PHB. QEMU sets this
> > > property to an arbitrary value in the flattened DT passed to SLOF.
> > > 
> > > Since commit 82954d4c1088, SLOF has some generic code to convert all
> > > references to any "phandle" property to a SLOF specific value.
> > > 
> > > This is is perfectly okay for coldplug devices, since the guest OS only
> > > sees the converted value in "interrupt-map". It is a problem though for
> > > hotplug devices. Since they don't go through SLOF, the guest OS receives
> > > the arbitrary value set by QEMU and fails to setup IRQs.
> > > 
> > > In order to support PHB hotplug, this patch introduces a new private
> > > hcall, which allows SLOF to tell QEMU that a "phandle" was converted
> > > from an old value to a new value.  
> > 
> > Thanks, applied.
> > 
> 
> But David Gibson seems to be "really, really" opposed to this solution as
> expressed in this thread:
> 
> https://lists.nongnu.org/archive/html/qemu-ppc/2017-07/msg00556.html

I certainly don't like it.

> He's asking if we can change SLOF to not use phandles as raw pointers
> instead... I must admit that this goes way beyond my very limited
> knowledge on SLOF and forth... :-\

From what Thomas has told me, it's pretty hard :(.

> I hope that someone here can provide some hints on the feasibility
> and effort needed to satisfy David's request.

I really dislike this approach, but I've yet to find a better one.
David Gibson Sept. 7, 2017, 6:38 a.m. UTC | #9
On Wed, Sep 06, 2017 at 10:05:41AM -0500, Segher Boessenkool wrote:
> On Wed, Sep 06, 2017 at 04:40:58PM +0200, Greg Kurz wrote:
> > > A phandle is an opaque cookie to everything but the firmware itself.
> > > Using pointers to some internal structure works just fine; most Open
> > > Firmware implementations do this.
> > > 
> > > Anything other than OF itself should *not* make up phandles.  There
> > > is no way to guarantee these will be unique, so it is a non-starter.
> > > 
> > > If QEMU wants to create a device node, it should ask OF to do it,
> > > say, via new-device.
> 
> > Hmm... QEMU creates an FDT and writes it into the guest memory to pass
> > it to SLOF.
> 
> It probably should skip that FDT intermediate step, just create all
> nodes it wants directly?  Sounds a lot simpler, and all current phandle
> problems will disappear too.

No, it would be a complete PITA.  We'd have to have a way of running
the guest (running SLOF) concurrent with the qemu machine reset code,
communicating OF commands in, and responses back.  As it is now, we
just drop the FDT into guest memory before SLOF executes its first
instruction.

This is actually perfectly sound, even with an archaically OF centric
view of the world.  OF is allowed to build its DT from other
information available to it.  In this case that other data just
happens to be a FDT which will, in practice, look pretty similar to
OF's final DT.  The information flow is strictly in one direction:
qemu to OF to guest.

In other words OF *does* allocate all the phandles itself - in the
tree it presents to the guest.  The phandles allocated by qemu are
overwritten and obsolete by the time the guest gets the DT from OF.

The problem comes when we need to do hotplug, and qemu needs to inject
DT fragments long after SLOF is dead.  So far it works, because the
only things about the existing output-from-SLOF tree we rely on would
be completely perverse for SLOF to gratuitously change from the
input-to-SLOF DT.  To allow PHB hotplug, though, we need the master
XICS phandle, and SLOF *does* change all the phandles, so we don't
know it.  Hence this proposal.

I don't like this proposal precisely because it does make the DT
communication two way between qemu and SLOF, which has the prospect to
make things far, far uglier.

Unfortunately, I'm yet to see a better way to do this.

> > QEMU uses arbitrary and unique numbers when a phandle is
> > needed (we just need one actually for the interrupt controller which
> > is referenced by each PHB). SLOF then fixes all the QEMU-generated
> > phandles to "opaque cookies" (see commit 82954d4c1088)... the problem
> > is that if we want to hotplug another PHB later on, SLOF isn't involved
> > anymore and QEMU doesn't know about the "opaque cookie" to be put in
> > the node for this PHB... I hope I'm clear enough.
> > 
> > This being said, you partly answer the question when you say:
> > 
> > "A phandle is an opaque cookie to everything but the firmware itself."
> > 
> > and
> > 
> > "Anything other than OF itself should *not* make up phandles."
> > 
> > Even if QEMU takes care of only forging unique phandles,
> 
> It cannot.  It cannot know about all phandles the firmware uses.  It
> can guess, sure, but that is very fragile.
> 
> > it already
> > does some arm-twisting with the OF specification... but would it be
> > acceptable for SLOF to have a translation table when using phandles
> > generated by QEMU ?

I'm not sure exactly what you mean by this.  Converting SLOF to use
qemu supplied phandles everywhere would be possible, I think, but very
awkward and ugly.

Applying some sort of translation just to the hotplugged fragments
isn't possible because SLOF is no longer around by the time of the
hotplug.  qemu talks directly to the guest.

> Then you need to translate from "qhandle" to phandle every time anything
> changes, keep both up to date.  Why not just have actual phandles?
> Those stay up to date always (namely, they never change).
> 
> Why does QEMU think it is a good idea to make up phandles out of thin
> air?  Or, why does it do it if it is not a good idea?

It doesn't.  The problem is that it needs to *know* a phandle that
SLOF made up in order to make a compatible hotplug fragment.  At
present it has no way to do that - qemu never sees the
output-from-SLOF DT.
Greg Kurz Sept. 7, 2017, 7:56 a.m. UTC | #10
On Thu, 7 Sep 2017 16:38:00 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Sep 06, 2017 at 10:05:41AM -0500, Segher Boessenkool wrote:
> > On Wed, Sep 06, 2017 at 04:40:58PM +0200, Greg Kurz wrote:  
> > > > A phandle is an opaque cookie to everything but the firmware itself.
> > > > Using pointers to some internal structure works just fine; most Open
> > > > Firmware implementations do this.
> > > > 
> > > > Anything other than OF itself should *not* make up phandles.  There
> > > > is no way to guarantee these will be unique, so it is a non-starter.
> > > > 
> > > > If QEMU wants to create a device node, it should ask OF to do it,
> > > > say, via new-device.  
> >   
> > > Hmm... QEMU creates an FDT and writes it into the guest memory to pass
> > > it to SLOF.  
> > 
> > It probably should skip that FDT intermediate step, just create all
> > nodes it wants directly?  Sounds a lot simpler, and all current phandle
> > problems will disappear too.  
> 
> No, it would be a complete PITA.  We'd have to have a way of running
> the guest (running SLOF) concurrent with the qemu machine reset code,
> communicating OF commands in, and responses back.  As it is now, we
> just drop the FDT into guest memory before SLOF executes its first
> instruction.
> 
> This is actually perfectly sound, even with an archaically OF centric
> view of the world.  OF is allowed to build its DT from other
> information available to it.  In this case that other data just
> happens to be a FDT which will, in practice, look pretty similar to
> OF's final DT.  The information flow is strictly in one direction:
> qemu to OF to guest.
> 
> In other words OF *does* allocate all the phandles itself - in the
> tree it presents to the guest.  The phandles allocated by qemu are
> overwritten and obsolete by the time the guest gets the DT from OF.
> 
> The problem comes when we need to do hotplug, and qemu needs to inject
> DT fragments long after SLOF is dead.  So far it works, because the
> only things about the existing output-from-SLOF tree we rely on would
> be completely perverse for SLOF to gratuitously change from the
> input-to-SLOF DT.  To allow PHB hotplug, though, we need the master
> XICS phandle, and SLOF *does* change all the phandles, so we don't
> know it.  Hence this proposal.
> 
> I don't like this proposal precisely because it does make the DT
> communication two way between qemu and SLOF, which has the prospect to
> make things far, far uglier.
> 
> Unfortunately, I'm yet to see a better way to do this.
> 
> > > QEMU uses arbitrary and unique numbers when a phandle is
> > > needed (we just need one actually for the interrupt controller which
> > > is referenced by each PHB). SLOF then fixes all the QEMU-generated
> > > phandles to "opaque cookies" (see commit 82954d4c1088)... the problem
> > > is that if we want to hotplug another PHB later on, SLOF isn't involved
> > > anymore and QEMU doesn't know about the "opaque cookie" to be put in
> > > the node for this PHB... I hope I'm clear enough.
> > > 
> > > This being said, you partly answer the question when you say:
> > > 
> > > "A phandle is an opaque cookie to everything but the firmware itself."
> > > 
> > > and
> > > 
> > > "Anything other than OF itself should *not* make up phandles."
> > > 
> > > Even if QEMU takes care of only forging unique phandles,  
> > 
> > It cannot.  It cannot know about all phandles the firmware uses.  It
> > can guess, sure, but that is very fragile.
> >   
> > > it already
> > > does some arm-twisting with the OF specification... but would it be
> > > acceptable for SLOF to have a translation table when using phandles
> > > generated by QEMU ?  
> 
> I'm not sure exactly what you mean by this.  Converting SLOF to use
> qemu supplied phandles everywhere would be possible, I think, but very
> awkward and ugly.
> 

I had this last mail from you in mind, when I posted the PHB hotplug series:

https://lists.nongnu.org/archive/html/qemu-ppc/2017-08/msg00005.html

"I'd be thinking of replacing the direct pointer dereferences for
phandles with a simple lookup table of phandle to pointer, populated
as we unflatten the tree."

> Applying some sort of translation just to the hotplugged fragments
> isn't possible because SLOF is no longer around by the time of the
> hotplug.  qemu talks directly to the guest.
> 

Yes of course... hence the initial proposal of having SLOF to send any
phandle value it *fixes* back to QEMU (only one with the current code).

> > Then you need to translate from "qhandle" to phandle every time anything
> > changes, keep both up to date.  Why not just have actual phandles?
> > Those stay up to date always (namely, they never change).
> > 
> > Why does QEMU think it is a good idea to make up phandles out of thin
> > air?  Or, why does it do it if it is not a good idea?  
> 
> It doesn't.  The problem is that it needs to *know* a phandle that
> SLOF made up in order to make a compatible hotplug fragment.  At
> present it has no way to do that - qemu never sees the
> output-from-SLOF DT.
> 

<thinking aloud>
Would it be more acceptable to send back the updated value of the
XICS phandle through CAS ?
</thinking aloud>
Segher Boessenkool Sept. 7, 2017, 8:13 a.m. UTC | #11
Hi!

So I have some questions...  I don't understand why all this is so hard:

On Thu, Sep 07, 2017 at 04:38:00PM +1000, David Gibson wrote:
> No, it would be a complete PITA.  We'd have to have a way of running
> the guest (running SLOF) concurrent with the qemu machine reset code,
> communicating OF commands in, and responses back.  As it is now, we
> just drop the FDT into guest memory before SLOF executes its first
> instruction.

Why do you pass around FDT pieces?  Why not simply make (say) a PCI BAR
visible?

> The problem comes when we need to do hotplug, and qemu needs to inject
> DT fragments long after SLOF is dead.  So far it works, because the
> only things about the existing output-from-SLOF tree we rely on would
> be completely perverse for SLOF to gratuitously change from the
> input-to-SLOF DT.  To allow PHB hotplug, though, we need the master
> XICS phandle, and SLOF *does* change all the phandles, so we don't
> know it.  Hence this proposal.

Why is there an interrupt map in the hotplugged piece?  Why not in its
parent instead?

> I'm not sure exactly what you mean by this.  Converting SLOF to use
> qemu supplied phandles everywhere would be possible, I think, but very
> awkward and ugly.

It isn't completely possible either.  The root node is special, you
will have problems with some support packages, ihandles and interposing
will be a royal pain if you can make it work at all.

> It doesn't.  The problem is that it needs to *know* a phandle that
> SLOF made up in order to make a compatible hotplug fragment.  At
> present it has no way to do that - qemu never sees the
> output-from-SLOF DT.

So, see the above two questions.


Segher
David Gibson Sept. 7, 2017, 9:54 a.m. UTC | #12
On Thu, Sep 07, 2017 at 09:56:12AM +0200, Greg Kurz wrote:
> On Thu, 7 Sep 2017 16:38:00 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Wed, Sep 06, 2017 at 10:05:41AM -0500, Segher Boessenkool wrote:
> > > On Wed, Sep 06, 2017 at 04:40:58PM +0200, Greg Kurz wrote:  
> > > > > A phandle is an opaque cookie to everything but the firmware itself.
> > > > > Using pointers to some internal structure works just fine; most Open
> > > > > Firmware implementations do this.
> > > > > 
> > > > > Anything other than OF itself should *not* make up phandles.  There
> > > > > is no way to guarantee these will be unique, so it is a non-starter.
> > > > > 
> > > > > If QEMU wants to create a device node, it should ask OF to do it,
> > > > > say, via new-device.  
> > >   
> > > > Hmm... QEMU creates an FDT and writes it into the guest memory to pass
> > > > it to SLOF.  
> > > 
> > > It probably should skip that FDT intermediate step, just create all
> > > nodes it wants directly?  Sounds a lot simpler, and all current phandle
> > > problems will disappear too.  
> > 
> > No, it would be a complete PITA.  We'd have to have a way of running
> > the guest (running SLOF) concurrent with the qemu machine reset code,
> > communicating OF commands in, and responses back.  As it is now, we
> > just drop the FDT into guest memory before SLOF executes its first
> > instruction.
> > 
> > This is actually perfectly sound, even with an archaically OF centric
> > view of the world.  OF is allowed to build its DT from other
> > information available to it.  In this case that other data just
> > happens to be a FDT which will, in practice, look pretty similar to
> > OF's final DT.  The information flow is strictly in one direction:
> > qemu to OF to guest.
> > 
> > In other words OF *does* allocate all the phandles itself - in the
> > tree it presents to the guest.  The phandles allocated by qemu are
> > overwritten and obsolete by the time the guest gets the DT from OF.
> > 
> > The problem comes when we need to do hotplug, and qemu needs to inject
> > DT fragments long after SLOF is dead.  So far it works, because the
> > only things about the existing output-from-SLOF tree we rely on would
> > be completely perverse for SLOF to gratuitously change from the
> > input-to-SLOF DT.  To allow PHB hotplug, though, we need the master
> > XICS phandle, and SLOF *does* change all the phandles, so we don't
> > know it.  Hence this proposal.
> > 
> > I don't like this proposal precisely because it does make the DT
> > communication two way between qemu and SLOF, which has the prospect to
> > make things far, far uglier.
> > 
> > Unfortunately, I'm yet to see a better way to do this.
> > 
> > > > QEMU uses arbitrary and unique numbers when a phandle is
> > > > needed (we just need one actually for the interrupt controller which
> > > > is referenced by each PHB). SLOF then fixes all the QEMU-generated
> > > > phandles to "opaque cookies" (see commit 82954d4c1088)... the problem
> > > > is that if we want to hotplug another PHB later on, SLOF isn't involved
> > > > anymore and QEMU doesn't know about the "opaque cookie" to be put in
> > > > the node for this PHB... I hope I'm clear enough.
> > > > 
> > > > This being said, you partly answer the question when you say:
> > > > 
> > > > "A phandle is an opaque cookie to everything but the firmware itself."
> > > > 
> > > > and
> > > > 
> > > > "Anything other than OF itself should *not* make up phandles."
> > > > 
> > > > Even if QEMU takes care of only forging unique phandles,  
> > > 
> > > It cannot.  It cannot know about all phandles the firmware uses.  It
> > > can guess, sure, but that is very fragile.
> > >   
> > > > it already
> > > > does some arm-twisting with the OF specification... but would it be
> > > > acceptable for SLOF to have a translation table when using phandles
> > > > generated by QEMU ?  
> > 
> > I'm not sure exactly what you mean by this.  Converting SLOF to use
> > qemu supplied phandles everywhere would be possible, I think, but very
> > awkward and ugly.
> > 
> 
> I had this last mail from you in mind, when I posted the PHB hotplug series:
> 
> https://lists.nongnu.org/archive/html/qemu-ppc/2017-08/msg00005.html
> 
> "I'd be thinking of replacing the direct pointer dereferences for
> phandles with a simple lookup table of phandle to pointer, populated
> as we unflatten the tree."

Ah, right.  Since then I talked to Thomas and discovered this was
harder than I thought.

> > Applying some sort of translation just to the hotplugged fragments
> > isn't possible because SLOF is no longer around by the time of the
> > hotplug.  qemu talks directly to the guest.
> 
> Yes of course... hence the initial proposal of having SLOF to send any
> phandle value it *fixes* back to QEMU (only one with the current code).
> 
> > > Then you need to translate from "qhandle" to phandle every time anything
> > > changes, keep both up to date.  Why not just have actual phandles?
> > > Those stay up to date always (namely, they never change).
> > > 
> > > Why does QEMU think it is a good idea to make up phandles out of thin
> > > air?  Or, why does it do it if it is not a good idea?  
> > 
> > It doesn't.  The problem is that it needs to *know* a phandle that
> > SLOF made up in order to make a compatible hotplug fragment.  At
> > present it has no way to do that - qemu never sees the
> > output-from-SLOF DT.
> > 
> 
> <thinking aloud>
> Would it be more acceptable to send back the updated value of the
> XICS phandle through CAS ?
> </thinking aloud>

I don't think that makes sense.  I mean CAS is a negotiation between
guest OS and firmware.  We use a custom hypercall to punt most of it
from firmware to hypervisor.  We could piggyback on that to add
firmware -> hypervisor information, but I think it would be
functionally equivalent to an update phandle hypercall, just more
awkward.
David Gibson Sept. 7, 2017, 10:03 a.m. UTC | #13
On Thu, Sep 07, 2017 at 03:13:16AM -0500, Segher Boessenkool wrote:
> Hi!
> 
> So I have some questions...  I don't understand why all this is so hard:
> 
> On Thu, Sep 07, 2017 at 04:38:00PM +1000, David Gibson wrote:
> > No, it would be a complete PITA.  We'd have to have a way of running
> > the guest (running SLOF) concurrent with the qemu machine reset code,
> > communicating OF commands in, and responses back.  As it is now, we
> > just drop the FDT into guest memory before SLOF executes its first
> > instruction.
> 
> Why do you pass around FDT pieces?  Why not simply make (say) a PCI BAR
> visible?

Because the firmware needs to know about non-PCI devices too.  That
includes both things like PAPR virtual IO devices, and also the PCI
host bridges themselves.  Firmware can't probe for BARs until it can
access config space, and it can't do that until it has a handle on the
host bridges.

Remember that there is no real hardware we're emulating here that OF
can use the same probes for.  This is a purely paravirtual platform
which assumes the firmware "just knows" what's going on.  "Seeding"
the OF tree with an FDT was our chosen way to get that information
into guest firmware; it's pretty much the simplest option.

> > The problem comes when we need to do hotplug, and qemu needs to inject
> > DT fragments long after SLOF is dead.  So far it works, because the
> > only things about the existing output-from-SLOF tree we rely on would
> > be completely perverse for SLOF to gratuitously change from the
> > input-to-SLOF DT.  To allow PHB hotplug, though, we need the master
> > XICS phandle, and SLOF *does* change all the phandles, so we don't
> > know it.  Hence this proposal.
> 
> Why is there an interrupt map in the hotplugged piece?  Why not in its
> parent instead?

In most cases it is; that's why we haven't needed this yet.  The
problem case is hotplugging of an entire PCI host bridge.  That needs
an interrupt map to remap the PCI interrupt descriptors to XICs
interrupts.

> > I'm not sure exactly what you mean by this.  Converting SLOF to use
> > qemu supplied phandles everywhere would be possible, I think, but very
> > awkward and ugly.
> 
> It isn't completely possible either.  The root node is special, you
> will have problems with some support packages, ihandles and interposing
> will be a royal pain if you can make it work at all.

Hmm.  I don't really see how it can be impossible.  There has to be
some sort of "lookup phandle" operation - they don't *have* to be
pointers (and couldn't be for, e.g. a trye 64-bit OF implementation).
And if there's that we can have a lookup table to find the internal
pointers.

> > It doesn't.  The problem is that it needs to *know* a phandle that
> > SLOF made up in order to make a compatible hotplug fragment.  At
> > present it has no way to do that - qemu never sees the
> > output-from-SLOF DT.
> 
> So, see the above two questions.
> 
> 
> Segher
>
Mark Cave-Ayland Sept. 8, 2017, 11:39 a.m. UTC | #14
On 07/09/17 03:21, Alexey Kardashevskiy wrote:

>> I think I missed the first part of this thread, however can someone
>> explain why hotplug devices need to update the SLOF DT? 
> 
> They do not.
> 
> When hotplug happens, QEMU provides a DT blob to the guest for a new PHB
> which includes "interrupt-map" which tells how PHB interrupts are mapped to
> a global interrupt controller (XICS) which is identified by that phandle.
> So this blob has to have a valid XICS phandle - QEMU could get it somehow
> from SLOF but this hypercall is new. QEMU makes it up instead as we needed
> to construct the interrupt-map of the default PHB and it requires the XICS
> phandle which is not known at the time as SLOF has not started yet and has
> not generated phandles.
> 
> We could move that interrupt-map creating code ([1]) to SLOF, get rid of
> the phandle fixing hack ([2]) but this won't help with hotplugged PHBs - we
> will still have to have the interrupt-map code in QEMU's PHB hotplug path +
> hcall from SLOF to update the XICS phandle in QEMU, propably this is the
> way to go.
> 
> [1]
> https://git.qemu.org/?p=qemu.git;a=blob;f=hw/ppc/spapr_pci.c;h=d84abf1070a0ae58ef13162d00c0d80e147f1b19;hb=HEAD#l2166
> [2]
> https://git.qemu.org/?p=SLOF.git;a=blob;f=board-qemu/slof/fdt.fs;h=a24e3449e7c1fc61d1894d600afc95cfc6fe6518;hb=HEAD#l353

Thanks for the explanation. For comparison how does this work in real
hardware - presumably there must some existing mechanism for notifying
XICS that an extra PHB has been inserted?

If you're looking for a way to reference a node outside of OF then the
only way to consistently do this is via an OF path. What if when the DT
blob for PHB was created in QEMU you create a fake interrupt-parent-path
string property containing the OF path to the interrupt controller, and
move the generation of interrupt-map to SLOF?

In SLOF you could then do something like below to get the phandle from
the OF path:
"interrupt-parent-path" get-package-property dev ihandle>phandle
and from there, substituting the phandle into interrupt-map is trivial.

Similarly for the guest, it should be easy to iterate over the kernel DT
to locate the interrupt controller device based upon OF path, and then
use the interrupt-map information to update its routing information for
the hotplugged PHB accordingly.


ATB,

Mark.
David Gibson Sept. 8, 2017, 11:59 a.m. UTC | #15
On Fri, Sep 08, 2017 at 12:39:51PM +0100, Mark Cave-Ayland wrote:
> On 07/09/17 03:21, Alexey Kardashevskiy wrote:
> 
> >> I think I missed the first part of this thread, however can someone
> >> explain why hotplug devices need to update the SLOF DT? 
> > 
> > They do not.
> > 
> > When hotplug happens, QEMU provides a DT blob to the guest for a new PHB
> > which includes "interrupt-map" which tells how PHB interrupts are mapped to
> > a global interrupt controller (XICS) which is identified by that phandle.
> > So this blob has to have a valid XICS phandle - QEMU could get it somehow
> > from SLOF but this hypercall is new. QEMU makes it up instead as we needed
> > to construct the interrupt-map of the default PHB and it requires the XICS
> > phandle which is not known at the time as SLOF has not started yet and has
> > not generated phandles.
> > 
> > We could move that interrupt-map creating code ([1]) to SLOF, get rid of
> > the phandle fixing hack ([2]) but this won't help with hotplugged PHBs - we
> > will still have to have the interrupt-map code in QEMU's PHB hotplug path +
> > hcall from SLOF to update the XICS phandle in QEMU, propably this is the
> > way to go.
> > 
> > [1]
> > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/ppc/spapr_pci.c;h=d84abf1070a0ae58ef13162d00c0d80e147f1b19;hb=HEAD#l2166
> > [2]
> > https://git.qemu.org/?p=SLOF.git;a=blob;f=board-qemu/slof/fdt.fs;h=a24e3449e7c1fc61d1894d600afc95cfc6fe6518;hb=HEAD#l353
> 
> Thanks for the explanation. For comparison how does this work in real
> hardware - presumably there must some existing mechanism for notifying
> XICS that an extra PHB has been inserted?

Not really.  The bare metal platform is a different beast from the
PAPR platform.  I think hardware level PHB hotplug isn't even
possible, it only works here because they're vPHBs.

> If you're looking for a way to reference a node outside of OF then the
> only way to consistently do this is via an OF path. What if when the DT
> blob for PHB was created in QEMU you create a fake interrupt-parent-path
> string property containing the OF path to the interrupt controller, and
> move the generation of interrupt-map to SLOF?

> In SLOF you could then do something like below to get the phandle from
> the OF path:
> "interrupt-parent-path" get-package-property dev ihandle>phandle
> and from there, substituting the phandle into interrupt-map is trivial.

Nope.  At the time of hotplug, SLOF no longer exists - it's handed
over to the guest.

> Similarly for the guest, it should be easy to iterate over the kernel DT
> to locate the interrupt controller device based upon OF path, and then
> use the interrupt-map information to update its routing information for
> the hotplugged PHB accordingly.

That requires a non-PAPR-compliant guest change.  Existing guests
already support this when running under PowerVM.
Alexey Kardashevskiy Sept. 8, 2017, 12:30 p.m. UTC | #16
On 08/09/17 21:39, Mark Cave-Ayland wrote:
> On 07/09/17 03:21, Alexey Kardashevskiy wrote:
> 
>>> I think I missed the first part of this thread, however can someone
>>> explain why hotplug devices need to update the SLOF DT? 
>>
>> They do not.
>>
>> When hotplug happens, QEMU provides a DT blob to the guest for a new PHB
>> which includes "interrupt-map" which tells how PHB interrupts are mapped to
>> a global interrupt controller (XICS) which is identified by that phandle.
>> So this blob has to have a valid XICS phandle - QEMU could get it somehow
>> from SLOF but this hypercall is new. QEMU makes it up instead as we needed
>> to construct the interrupt-map of the default PHB and it requires the XICS
>> phandle which is not known at the time as SLOF has not started yet and has
>> not generated phandles.
>>
>> We could move that interrupt-map creating code ([1]) to SLOF, get rid of
>> the phandle fixing hack ([2]) but this won't help with hotplugged PHBs - we
>> will still have to have the interrupt-map code in QEMU's PHB hotplug path +
>> hcall from SLOF to update the XICS phandle in QEMU, propably this is the
>> way to go.
>>
>> [1]
>> https://git.qemu.org/?p=qemu.git;a=blob;f=hw/ppc/spapr_pci.c;h=d84abf1070a0ae58ef13162d00c0d80e147f1b19;hb=HEAD#l2166
>> [2]
>> https://git.qemu.org/?p=SLOF.git;a=blob;f=board-qemu/slof/fdt.fs;h=a24e3449e7c1fc61d1894d600afc95cfc6fe6518;hb=HEAD#l353
> 
> Thanks for the explanation. For comparison how does this work in real
> hardware - presumably there must some existing mechanism for notifying
> XICS that an extra PHB has been inserted?

I am not aware of any hardware capable of hotplugging PHBs :)

> If you're looking for a way to reference a node outside of OF then the
> only way to consistently do this is via an OF path. What if when the DT
> blob for PHB was created in QEMU you create a fake interrupt-parent-path
> string property containing the OF path to the interrupt controller, and
> move the generation of interrupt-map to SLOF?
>> In SLOF you could then do something like below to get the phandle from
> the OF path:
> "interrupt-parent-path" get-package-property dev ihandle>phandle
> and from there, substituting the phandle into interrupt-map is trivial.
> 
> Similarly for the guest, it should be easy to iterate over the kernel DT
> to locate the interrupt controller device based upon OF path, and then
> use the interrupt-map information to update its routing information for
> the hotplugged PHB accordingly.

There is no path in the interrups-map, it uses phandles, this is defined in
the interrupt binding spec which the existing guests use, it is kinda late
to change that. We can change how things work between QEMU and SLOF but
once the guest kernel is up, there is no more SLOF and QEMU has to do all
these tricks to the DT blobs itself.
Mark Cave-Ayland Sept. 8, 2017, 12:51 p.m. UTC | #17
On 08/09/17 12:59, David Gibson wrote:

>> If you're looking for a way to reference a node outside of OF then the
>> only way to consistently do this is via an OF path. What if when the DT
>> blob for PHB was created in QEMU you create a fake interrupt-parent-path
>> string property containing the OF path to the interrupt controller, and
>> move the generation of interrupt-map to SLOF?
> 
>> In SLOF you could then do something like below to get the phandle from
>> the OF path:
>> "interrupt-parent-path" get-package-property dev ihandle>phandle
>> and from there, substituting the phandle into interrupt-map is trivial.
> 
> Nope.  At the time of hotplug, SLOF no longer exists - it's handed
> over to the guest.

Yes, I understand that. This would be the process for getting the
initial DT information to SLOF to generate interrupt-map upon boot.

>> Similarly for the guest, it should be easy to iterate over the kernel DT
>> to locate the interrupt controller device based upon OF path, and then
>> use the interrupt-map information to update its routing information for
>> the hotplugged PHB accordingly.
> 
> That requires a non-PAPR-compliant guest change.  Existing guests
> already support this when running under PowerVM.

My understanding from the thread was that hotplugging PHBs is a new
feature? In that case the transition is simple: if the
interrupt-parent-path property is present, use it to locate the
interrupt-parent. If it's not present then use the existing behaviour
which works but won't support hotplugging PHBs.

However you also mention this is supported under PowerVM. Does that mean
these patches are already in use somewhere?

Both these approaches require changes, however the benefit of this
approach would be that it should easily extend this moving forwards for
hotplugging other devices requiring interrupt-maps in future.


ATB,

Mark.
Mark Cave-Ayland Sept. 8, 2017, 12:54 p.m. UTC | #18
On 08/09/17 13:30, Alexey Kardashevskiy wrote:

>> Thanks for the explanation. For comparison how does this work in real
>> hardware - presumably there must some existing mechanism for notifying
>> XICS that an extra PHB has been inserted?
> 
> I am not aware of any hardware capable of hotplugging PHBs :)

Ha okay :)

>> If you're looking for a way to reference a node outside of OF then the
>> only way to consistently do this is via an OF path. What if when the DT
>> blob for PHB was created in QEMU you create a fake interrupt-parent-path
>> string property containing the OF path to the interrupt controller, and
>> move the generation of interrupt-map to SLOF?
>>> In SLOF you could then do something like below to get the phandle from
>> the OF path:
>> "interrupt-parent-path" get-package-property dev ihandle>phandle
>> and from there, substituting the phandle into interrupt-map is trivial.
>>
>> Similarly for the guest, it should be easy to iterate over the kernel DT
>> to locate the interrupt controller device based upon OF path, and then
>> use the interrupt-map information to update its routing information for
>> the hotplugged PHB accordingly.
> 
> There is no path in the interrups-map, it uses phandles, this is defined in
> the interrupt binding spec which the existing guests use, it is kinda late
> to change that. We can change how things work between QEMU and SLOF but
> once the guest kernel is up, there is no more SLOF and QEMU has to do all
> these tricks to the DT blobs itself.

My understanding from this thread though was that the FDT can't be
inserted directly into the kernel DT as-is because it still needs to
translate between qhandles and phandles. So there must already be a hook
somewhere on hotplug to allow the FDT blob to be altered before it is
handed over to the kernel, so the interrupt-map property can be fixed up
based upon interrupt-parent-path at the same time.


ATB,

Mark.
Greg Kurz Sept. 8, 2017, 1:20 p.m. UTC | #19
On Fri, 8 Sep 2017 13:51:24 +0100
Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> wrote:

> On 08/09/17 12:59, David Gibson wrote:
> 
> >> If you're looking for a way to reference a node outside of OF then the
> >> only way to consistently do this is via an OF path. What if when the DT
> >> blob for PHB was created in QEMU you create a fake interrupt-parent-path
> >> string property containing the OF path to the interrupt controller, and
> >> move the generation of interrupt-map to SLOF?  
> >   
> >> In SLOF you could then do something like below to get the phandle from
> >> the OF path:
> >> "interrupt-parent-path" get-package-property dev ihandle>phandle
> >> and from there, substituting the phandle into interrupt-map is trivial.  
> > 
> > Nope.  At the time of hotplug, SLOF no longer exists - it's handed
> > over to the guest.  
> 
> Yes, I understand that. This would be the process for getting the
> initial DT information to SLOF to generate interrupt-map upon boot.
> 
> >> Similarly for the guest, it should be easy to iterate over the kernel DT
> >> to locate the interrupt controller device based upon OF path, and then
> >> use the interrupt-map information to update its routing information for
> >> the hotplugged PHB accordingly.  
> > 
> > That requires a non-PAPR-compliant guest change.  Existing guests
> > already support this when running under PowerVM.  
> 
> My understanding from the thread was that hotplugging PHBs is a new
> feature? In that case the transition is simple: if the

The feature is mentioned in the PAPR spec but not yet implemented in QEMU.

> interrupt-parent-path property is present, use it to locate the
> interrupt-parent. If it's not present then use the existing behaviour
> which works but won't support hotplugging PHBs.
> 
> However you also mention this is supported under PowerVM. Does that mean
> these patches are already in use somewhere?
> 

PowerVM is IBM's proprietary (ie, closed-source) environment to run
para-virtualized machines.

> Both these approaches require changes, however the benefit of this
> approach would be that it should easily extend this moving forwards for
> hotplugging other devices requiring interrupt-maps in future.
> 
> 
> ATB,
> 
> Mark.
Mark Cave-Ayland Sept. 8, 2017, 2 p.m. UTC | #20
On 08/09/17 14:20, Greg Kurz wrote:
> On Fri, 8 Sep 2017 13:51:24 +0100
> Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> wrote:
> 
>> On 08/09/17 12:59, David Gibson wrote:
>>
>>>> If you're looking for a way to reference a node outside of OF then the
>>>> only way to consistently do this is via an OF path. What if when the DT
>>>> blob for PHB was created in QEMU you create a fake interrupt-parent-path
>>>> string property containing the OF path to the interrupt controller, and
>>>> move the generation of interrupt-map to SLOF?  
>>>   
>>>> In SLOF you could then do something like below to get the phandle from
>>>> the OF path:
>>>> "interrupt-parent-path" get-package-property dev ihandle>phandle
>>>> and from there, substituting the phandle into interrupt-map is trivial.  
>>>
>>> Nope.  At the time of hotplug, SLOF no longer exists - it's handed
>>> over to the guest.  
>>
>> Yes, I understand that. This would be the process for getting the
>> initial DT information to SLOF to generate interrupt-map upon boot.
>>
>>>> Similarly for the guest, it should be easy to iterate over the kernel DT
>>>> to locate the interrupt controller device based upon OF path, and then
>>>> use the interrupt-map information to update its routing information for
>>>> the hotplugged PHB accordingly.  
>>>
>>> That requires a non-PAPR-compliant guest change.  Existing guests
>>> already support this when running under PowerVM.  
>>
>> My understanding from the thread was that hotplugging PHBs is a new
>> feature? In that case the transition is simple: if the
> 
> The feature is mentioned in the PAPR spec but not yet implemented in QEMU.

Meh. So in that case if this hacking of phandles is already part of the
PAPR specification, I guess we are too late :(


ATB,

Mark.
Greg Kurz Sept. 8, 2017, 2:54 p.m. UTC | #21
On Fri, 8 Sep 2017 15:00:36 +0100
Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> wrote:

> On 08/09/17 14:20, Greg Kurz wrote:
> > On Fri, 8 Sep 2017 13:51:24 +0100
> > Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> wrote:
> >   
> >> On 08/09/17 12:59, David Gibson wrote:
> >>  
> >>>> If you're looking for a way to reference a node outside of OF then the
> >>>> only way to consistently do this is via an OF path. What if when the DT
> >>>> blob for PHB was created in QEMU you create a fake interrupt-parent-path
> >>>> string property containing the OF path to the interrupt controller, and
> >>>> move the generation of interrupt-map to SLOF?    
> >>>     
> >>>> In SLOF you could then do something like below to get the phandle from
> >>>> the OF path:
> >>>> "interrupt-parent-path" get-package-property dev ihandle>phandle
> >>>> and from there, substituting the phandle into interrupt-map is trivial.    
> >>>
> >>> Nope.  At the time of hotplug, SLOF no longer exists - it's handed
> >>> over to the guest.    
> >>
> >> Yes, I understand that. This would be the process for getting the
> >> initial DT information to SLOF to generate interrupt-map upon boot.
> >>  
> >>>> Similarly for the guest, it should be easy to iterate over the kernel DT
> >>>> to locate the interrupt controller device based upon OF path, and then
> >>>> use the interrupt-map information to update its routing information for
> >>>> the hotplugged PHB accordingly.    
> >>>
> >>> That requires a non-PAPR-compliant guest change.  Existing guests
> >>> already support this when running under PowerVM.    
> >>
> >> My understanding from the thread was that hotplugging PHBs is a new
> >> feature? In that case the transition is simple: if the  
> > 
> > The feature is mentioned in the PAPR spec but not yet implemented in QEMU.  
> 
> Meh. So in that case if this hacking of phandles is already part of the
> PAPR specification, I guess we are too late :(
> 

Just to clarify: the PAPR specification explicitly mentions that PHBs
are hot-pluggable, but the fact that each PHB node has an "interrupt-map"
property which contains a phandle referring to the platform interrupt
controller comes from the "Open Firmware Recommended Practice: Interrupt
Mapping" spec.

Cheers,

--
Greg

> 
> ATB,
> 
> Mark.
Mark Cave-Ayland Sept. 8, 2017, 4:48 p.m. UTC | #22
On 08/09/17 15:54, Greg Kurz wrote:

>>> The feature is mentioned in the PAPR spec but not yet implemented in QEMU.  
>>
>> Meh. So in that case if this hacking of phandles is already part of the
>> PAPR specification, I guess we are too late :(
>> 
> Just to clarify: the PAPR specification explicitly mentions that PHBs
> are hot-pluggable, but the fact that each PHB node has an "interrupt-map"
> property which contains a phandle referring to the platform interrupt
> controller comes from the "Open Firmware Recommended Practice: Interrupt
> Mapping" spec.

Right, except that it sounds like PowerVM already uses this so it may
already be set in stone.

The technique described whereby a temporary property is used to pass
information into a device node is something I've used a few time in
OpenBIOS to pass information from C into the Forth bindings.

I can only re-emphasise what Segher says in that phandles/ihandles
should be consider opaques owned by OF. For maximum flexibility you
could extend this scheme for any node/property that requires a phandle
reference to another node in a similar way:

For any property "foo" in the current package, if "foo-devices" also
exists then "foo-devices" is an array of strings property containing the
full OF device paths for which the phandle is to be substituted into the
"foo" property for each placeholder value (currently 0x00001111). In a
way it's similar to a sprintf() placeholders.

A quick example shows this could be generated by QEMU:

   device_type: phb
   interrupt-map-devices: "/pci/path/to/xics"
   interrupt-map: 0x1 0x2 0x1111 0x0

On boot SLOF can generate the initial device tree by noticing
interrupt-map and interrupt-map-devices, locating each device in
interrupt-map-devices and obtaining its phandle. Scanning interrupt-map
SLOF then detects 0x1111 and substitutes the phandle into the property,
and then removes the temporary interrupt-map-devices when done:

   device_type: phb
   interrupt-map: 0x1 0x2 ph 0x0

Guests can then do the same for hotplug events, scanning the hotplug DT
created by QEMU looking for "foo" and "foo-devices" and then either
generating the appropriate interrupt-map property in the same way or
locating the interrupt-parent device directly via its OF path in order
to set the interrupt routing.

Anyhow I hope I've explained how I believe this should work in a bit
more detail - I'll leave it now for others to decide whether this is a
suitable approach or not.


ATB,

Mark.
David Gibson Sept. 9, 2017, 3:38 a.m. UTC | #23
On Fri, Sep 08, 2017 at 01:54:45PM +0100, Mark Cave-Ayland wrote:
> On 08/09/17 13:30, Alexey Kardashevskiy wrote:
> 
> >> Thanks for the explanation. For comparison how does this work in real
> >> hardware - presumably there must some existing mechanism for notifying
> >> XICS that an extra PHB has been inserted?
> > 
> > I am not aware of any hardware capable of hotplugging PHBs :)
> 
> Ha okay :)

There might be some, actually.  I suspect some of those Xilinx chips
with a core and big FPGA could do it by programming in a host bridge.

> >> If you're looking for a way to reference a node outside of OF then the
> >> only way to consistently do this is via an OF path. What if when the DT
> >> blob for PHB was created in QEMU you create a fake interrupt-parent-path
> >> string property containing the OF path to the interrupt controller, and
> >> move the generation of interrupt-map to SLOF?
> >>> In SLOF you could then do something like below to get the phandle from
> >> the OF path:
> >> "interrupt-parent-path" get-package-property dev ihandle>phandle
> >> and from there, substituting the phandle into interrupt-map is trivial.
> >>
> >> Similarly for the guest, it should be easy to iterate over the kernel DT
> >> to locate the interrupt controller device based upon OF path, and then
> >> use the interrupt-map information to update its routing information for
> >> the hotplugged PHB accordingly.
> > 
> > There is no path in the interrups-map, it uses phandles, this is defined in
> > the interrupt binding spec which the existing guests use, it is kinda late
> > to change that. We can change how things work between QEMU and SLOF but
> > once the guest kernel is up, there is no more SLOF and QEMU has to do all
> > these tricks to the DT blobs itself.
> 
> My understanding from this thread though was that the FDT can't be
> inserted directly into the kernel DT as-is because it still needs to
> translate between qhandles and phandles. So there must already be a hook
> somewhere on hotplug to allow the FDT blob to be altered before it is
> handed over to the kernel, so the interrupt-map property can be fixed up
> based upon interrupt-parent-path at the same time.

Nope.  There is no hook, the DT goes straight from qemu to the guest
OS.  Everything we've hotplugged so far hasn't need to reference any
phandles, so it hasn't mattered.

[Aside: strictly, it's not an FDT piece that goes to the guest OS;
the guest performs a sequence of calls to traverse the DT fragment.
qemu uses an fdt fragment as an internal detail, but that's not part
of the protocol]
David Gibson Sept. 9, 2017, 4:39 a.m. UTC | #24
On Fri, Sep 08, 2017 at 05:48:51PM +0100, Mark Cave-Ayland wrote:
> On 08/09/17 15:54, Greg Kurz wrote:
> 
> >>> The feature is mentioned in the PAPR spec but not yet implemented in QEMU.  
> >>
> >> Meh. So in that case if this hacking of phandles is already part of the
> >> PAPR specification, I guess we are too late :(
> >> 
> > Just to clarify: the PAPR specification explicitly mentions that PHBs
> > are hot-pluggable, but the fact that each PHB node has an "interrupt-map"
> > property which contains a phandle referring to the platform interrupt
> > controller comes from the "Open Firmware Recommended Practice: Interrupt
> > Mapping" spec.
> 
> Right, except that it sounds like PowerVM already uses this so it may
> already be set in stone.

More or less, yes.

> The technique described whereby a temporary property is used to pass
> information into a device node is something I've used a few time in
> OpenBIOS to pass information from C into the Forth bindings.

Sure, but not relevant.  OF is no longer around at hotplug time.

> I can only re-emphasise what Segher says in that phandles/ihandles
> should be consider opaques owned by OF. For maximum flexibility you
> could extend this scheme for any node/property that requires a phandle
> reference to another node in a similar way:

Note that we don't actually violate this tenet.  The qemu generate
phandles are strictly temporary, being replaced by OF owned ones.  The
problem is that qemu then doesn't *know* the OF generated ones.

> For any property "foo" in the current package, if "foo-devices" also
> exists then "foo-devices" is an array of strings property containing the
> full OF device paths for which the phandle is to be substituted into the
> "foo" property for each placeholder value (currently 0x00001111). In a
> way it's similar to a sprintf() placeholders.
> 
> A quick example shows this could be generated by QEMU:
> 
>    device_type: phb
>    interrupt-map-devices: "/pci/path/to/xics"
>    interrupt-map: 0x1 0x2 0x1111 0x0
> 
> On boot SLOF can generate the initial device tree by noticing
> interrupt-map and interrupt-map-devices, locating each device in
> interrupt-map-devices and obtaining its phandle. Scanning interrupt-map
> SLOF then detects 0x1111 and substitutes the phandle into the property,
> and then removes the temporary interrupt-map-devices when done:

Sure, but why.

>    device_type: phb
>    interrupt-map: 0x1 0x2 ph 0x0
> 
> Guests can then do the same for hotplug events, scanning the hotplug DT
> created by QEMU looking for "foo" and "foo-devices" and then either
> generating the appropriate interrupt-map property in the same way or
> locating the interrupt-parent device directly via its OF path in order
> to set the interrupt routing.

Nope.  Because that's *not* what the guest does.  The guest side is
already implemented and works with PowerVM.  If we require the guest
to do extra work we're not doing PHB hotplug the way PAPR specifies
any more.

> Anyhow I hope I've explained how I believe this should work in a bit
> more detail - I'll leave it now for others to decide whether this is a
> suitable approach or not.
> 
> 
> ATB,
> 
> Mark.
>
David Gibson Sept. 9, 2017, 6:48 a.m. UTC | #25
On Fri, Sep 08, 2017 at 03:00:36PM +0100, Mark Cave-Ayland wrote:
> On 08/09/17 14:20, Greg Kurz wrote:
> > On Fri, 8 Sep 2017 13:51:24 +0100
> > Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> wrote:
> > 
> >> On 08/09/17 12:59, David Gibson wrote:
> >>
> >>>> If you're looking for a way to reference a node outside of OF then the
> >>>> only way to consistently do this is via an OF path. What if when the DT
> >>>> blob for PHB was created in QEMU you create a fake interrupt-parent-path
> >>>> string property containing the OF path to the interrupt controller, and
> >>>> move the generation of interrupt-map to SLOF?  
> >>>   
> >>>> In SLOF you could then do something like below to get the phandle from
> >>>> the OF path:
> >>>> "interrupt-parent-path" get-package-property dev ihandle>phandle
> >>>> and from there, substituting the phandle into interrupt-map is trivial.  
> >>>
> >>> Nope.  At the time of hotplug, SLOF no longer exists - it's handed
> >>> over to the guest.  
> >>
> >> Yes, I understand that. This would be the process for getting the
> >> initial DT information to SLOF to generate interrupt-map upon boot.
> >>
> >>>> Similarly for the guest, it should be easy to iterate over the kernel DT
> >>>> to locate the interrupt controller device based upon OF path, and then
> >>>> use the interrupt-map information to update its routing information for
> >>>> the hotplugged PHB accordingly.  
> >>>
> >>> That requires a non-PAPR-compliant guest change.  Existing guests
> >>> already support this when running under PowerVM.  
> >>
> >> My understanding from the thread was that hotplugging PHBs is a new
> >> feature? In that case the transition is simple: if the
> > 
> > The feature is mentioned in the PAPR spec but not yet implemented in QEMU.
> 
> Meh. So in that case if this hacking of phandles is already part of the
> PAPR specification, I guess we are too late :(

Well, yes and no.  In PAPR the hotplug handling is framed in terms of
RTAS requests - the runtime portion of the guest firmware.

PowerVM has its own (proprietary) guest OF implementation.  Its
version of RTAS is a reasonably substantial piece of software that has
access to the device tree built by the boot-time portion of OF.  That
way its able to generate suitable DT fragments for plugged PHBs,
including phandle referencees.

Now hotplug clearly requires communication with the hypervisor, not
just guest firmware; and in fact that's true of nearly everything RTAS
does.  How the RTAS <-> hypervisor communication happens is not
specified by PAPR, and I don't know how the PowerVM implementation
does so.

For qemu/KVM, we decided - and I'm confident we were right to do so -
that having separate hypervisor <-> RTAS and RTAS <-> guest OS
protocols was silly.  So, our RTAS is a miniscule (literally 20 bytes
long) shim which simply forwards all RTAS requests to the hypervisor
(i.e. qemu).

This makes life much easier: it means we don't need to invent an
RTAS<->hypervisor protocol (for this and many other situations.  It
means we don't need to worry about updating such a protocol in sync
between the components.  It means we don't need a complicated piece of
RTAS code to be compiled with a guest-targetting toolchain.  It means
we don't need to jump through toolchain hoops to make code that's
relocatable and callable using the somewhat weird conventions that
RTAS uses.

But, it means the RTAS calls implemented in qemu don't have access to
the ouput-from-SLOF version of the device tree.

So, how do we address that?

One option is the one proposed earlier in the thread: a special
hypercall lets OF update qemu with the phandles of nodes as it
allocates them.  For now - and very likely, forever - the changed
phandles between the qemu generated "seed" tree and the OF-output tree
are the only changes that matter to us.

Another approach would be to snapshot the OF tree at the point we
instantiate RTAS.  We could either do that by having another special
hcall which lets OF report the whole revised tree to qemu.  Or we
could just have it dump it as FDT at a known location within the RTAS
blob (expanding it as necessary, obviously).
Greg Kurz Sept. 9, 2017, 6:57 p.m. UTC | #26
On Sat, 9 Sep 2017 16:48:33 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Fri, Sep 08, 2017 at 03:00:36PM +0100, Mark Cave-Ayland wrote:
> > On 08/09/17 14:20, Greg Kurz wrote:  
> > > On Fri, 8 Sep 2017 13:51:24 +0100
> > > Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> wrote:
> > >   
> > >> On 08/09/17 12:59, David Gibson wrote:
> > >>  
> > >>>> If you're looking for a way to reference a node outside of OF then the
> > >>>> only way to consistently do this is via an OF path. What if when the DT
> > >>>> blob for PHB was created in QEMU you create a fake interrupt-parent-path
> > >>>> string property containing the OF path to the interrupt controller, and
> > >>>> move the generation of interrupt-map to SLOF?    
> > >>>     
> > >>>> In SLOF you could then do something like below to get the phandle from
> > >>>> the OF path:
> > >>>> "interrupt-parent-path" get-package-property dev ihandle>phandle
> > >>>> and from there, substituting the phandle into interrupt-map is trivial.    
> > >>>
> > >>> Nope.  At the time of hotplug, SLOF no longer exists - it's handed
> > >>> over to the guest.    
> > >>
> > >> Yes, I understand that. This would be the process for getting the
> > >> initial DT information to SLOF to generate interrupt-map upon boot.
> > >>  
> > >>>> Similarly for the guest, it should be easy to iterate over the kernel DT
> > >>>> to locate the interrupt controller device based upon OF path, and then
> > >>>> use the interrupt-map information to update its routing information for
> > >>>> the hotplugged PHB accordingly.    
> > >>>
> > >>> That requires a non-PAPR-compliant guest change.  Existing guests
> > >>> already support this when running under PowerVM.    
> > >>
> > >> My understanding from the thread was that hotplugging PHBs is a new
> > >> feature? In that case the transition is simple: if the  
> > > 
> > > The feature is mentioned in the PAPR spec but not yet implemented in QEMU.  
> > 
> > Meh. So in that case if this hacking of phandles is already part of the
> > PAPR specification, I guess we are too late :(  
> 
> Well, yes and no.  In PAPR the hotplug handling is framed in terms of
> RTAS requests - the runtime portion of the guest firmware.
> 
> PowerVM has its own (proprietary) guest OF implementation.  Its
> version of RTAS is a reasonably substantial piece of software that has
> access to the device tree built by the boot-time portion of OF.  That
> way its able to generate suitable DT fragments for plugged PHBs,
> including phandle referencees.
> 
> Now hotplug clearly requires communication with the hypervisor, not
> just guest firmware; and in fact that's true of nearly everything RTAS
> does.  How the RTAS <-> hypervisor communication happens is not
> specified by PAPR, and I don't know how the PowerVM implementation
> does so.
> 
> For qemu/KVM, we decided - and I'm confident we were right to do so -
> that having separate hypervisor <-> RTAS and RTAS <-> guest OS
> protocols was silly.  So, our RTAS is a miniscule (literally 20 bytes
> long) shim which simply forwards all RTAS requests to the hypervisor
> (i.e. qemu).
> 
> This makes life much easier: it means we don't need to invent an
> RTAS<->hypervisor protocol (for this and many other situations.  It
> means we don't need to worry about updating such a protocol in sync
> between the components.  It means we don't need a complicated piece of
> RTAS code to be compiled with a guest-targetting toolchain.  It means
> we don't need to jump through toolchain hoops to make code that's
> relocatable and callable using the somewhat weird conventions that
> RTAS uses.
> 
> But, it means the RTAS calls implemented in qemu don't have access to
> the ouput-from-SLOF version of the device tree.
> 
> So, how do we address that?
> 
> One option is the one proposed earlier in the thread: a special
> hypercall lets OF update qemu with the phandles of nodes as it
> allocates them.  For now - and very likely, forever - the changed
> phandles between the qemu generated "seed" tree and the OF-output tree
> are the only changes that matter to us.
> 
> Another approach would be to snapshot the OF tree at the point we
> instantiate RTAS.  We could either do that by having another special
> hcall which lets OF report the whole revised tree to qemu.  Or we
> could just have it dump it as FDT at a known location within the RTAS
> blob (expanding it as necessary, obviously).
> 

So in all cases, this would be a SLOF->QEMU communication... My 2 cent
is that we currently have only one qhandle to fix and the initial proposal
is simple (and already merged in SLOF FWIW).

Cheers,

--
Greg
Mark Cave-Ayland Sept. 10, 2017, 6:14 p.m. UTC | #27
On 09/09/17 07:48, David Gibson wrote:

> On Fri, Sep 08, 2017 at 03:00:36PM +0100, Mark Cave-Ayland wrote:
>> On 08/09/17 14:20, Greg Kurz wrote:
>>> On Fri, 8 Sep 2017 13:51:24 +0100
>>> Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> wrote:
>>>
>>>> On 08/09/17 12:59, David Gibson wrote:
>>>>
>>>>>> If you're looking for a way to reference a node outside of OF then the
>>>>>> only way to consistently do this is via an OF path. What if when the DT
>>>>>> blob for PHB was created in QEMU you create a fake interrupt-parent-path
>>>>>> string property containing the OF path to the interrupt controller, and
>>>>>> move the generation of interrupt-map to SLOF?  
>>>>>   
>>>>>> In SLOF you could then do something like below to get the phandle from
>>>>>> the OF path:
>>>>>> "interrupt-parent-path" get-package-property dev ihandle>phandle
>>>>>> and from there, substituting the phandle into interrupt-map is trivial.  
>>>>>
>>>>> Nope.  At the time of hotplug, SLOF no longer exists - it's handed
>>>>> over to the guest.  
>>>>
>>>> Yes, I understand that. This would be the process for getting the
>>>> initial DT information to SLOF to generate interrupt-map upon boot.
>>>>
>>>>>> Similarly for the guest, it should be easy to iterate over the kernel DT
>>>>>> to locate the interrupt controller device based upon OF path, and then
>>>>>> use the interrupt-map information to update its routing information for
>>>>>> the hotplugged PHB accordingly.  
>>>>>
>>>>> That requires a non-PAPR-compliant guest change.  Existing guests
>>>>> already support this when running under PowerVM.  
>>>>
>>>> My understanding from the thread was that hotplugging PHBs is a new
>>>> feature? In that case the transition is simple: if the
>>>
>>> The feature is mentioned in the PAPR spec but not yet implemented in QEMU.
>>
>> Meh. So in that case if this hacking of phandles is already part of the
>> PAPR specification, I guess we are too late :(
> 
> Well, yes and no.  In PAPR the hotplug handling is framed in terms of
> RTAS requests - the runtime portion of the guest firmware.
> 
> PowerVM has its own (proprietary) guest OF implementation.  Its
> version of RTAS is a reasonably substantial piece of software that has
> access to the device tree built by the boot-time portion of OF.  That
> way its able to generate suitable DT fragments for plugged PHBs,
> including phandle referencees.
> 
> Now hotplug clearly requires communication with the hypervisor, not
> just guest firmware; and in fact that's true of nearly everything RTAS
> does.  How the RTAS <-> hypervisor communication happens is not
> specified by PAPR, and I don't know how the PowerVM implementation
> does so.
> 
> For qemu/KVM, we decided - and I'm confident we were right to do so -
> that having separate hypervisor <-> RTAS and RTAS <-> guest OS
> protocols was silly.  So, our RTAS is a miniscule (literally 20 bytes
> long) shim which simply forwards all RTAS requests to the hypervisor
> (i.e. qemu).
> 
> This makes life much easier: it means we don't need to invent an
> RTAS<->hypervisor protocol (for this and many other situations.  It
> means we don't need to worry about updating such a protocol in sync
> between the components.  It means we don't need a complicated piece of
> RTAS code to be compiled with a guest-targetting toolchain.  It means
> we don't need to jump through toolchain hoops to make code that's
> relocatable and callable using the somewhat weird conventions that
> RTAS uses.
> 
> But, it means the RTAS calls implemented in qemu don't have access to
> the ouput-from-SLOF version of the device tree.
> 
> So, how do we address that?
> 
> One option is the one proposed earlier in the thread: a special
> hypercall lets OF update qemu with the phandles of nodes as it
> allocates them.  For now - and very likely, forever - the changed
> phandles between the qemu generated "seed" tree and the OF-output tree
> are the only changes that matter to us.
> 
> Another approach would be to snapshot the OF tree at the point we
> instantiate RTAS.  We could either do that by having another special
> hcall which lets OF report the whole revised tree to qemu.  Or we
> could just have it dump it as FDT at a known location within the RTAS
> blob (expanding it as necessary, obviously).

Thanks for detailed explanation. I think having access to the DT would
be the safest option because some device properties may be generated
outside of QEMU within OF, e.g. by FCode ROMs which can also make
changes to the DT. And presumably this would then work regardless of the
device being hotplugged.


ATB,

Mark.
Segher Boessenkool Sept. 13, 2017, 11:21 p.m. UTC | #28
On Sat, Sep 09, 2017 at 01:38:29PM +1000, David Gibson wrote:
> [Aside: strictly, it's not an FDT piece that goes to the guest OS;
> the guest performs a sequence of calls to traverse the DT fragment.
> qemu uses an fdt fragment as an internal detail, but that's not part
> of the protocol]

Ah that is good to hear :-)

By the time qemu is queried for this information it should already know
the real phandle of the interrupt controller; can't it just patch it in?


Segher
David Gibson Sept. 14, 2017, 1:48 a.m. UTC | #29
On Wed, Sep 13, 2017 at 06:21:52PM -0500, Segher Boessenkool wrote:
> On Sat, Sep 09, 2017 at 01:38:29PM +1000, David Gibson wrote:
> > [Aside: strictly, it's not an FDT piece that goes to the guest OS;
> > the guest performs a sequence of calls to traverse the DT fragment.
> > qemu uses an fdt fragment as an internal detail, but that's not part
> > of the protocol]
> 
> Ah that is good to hear :-)

It's really not.  Spitting in a chunk of FDT would be a simpler and
more efficient method of accomplishing the same thing as the horrid
set of PAPR mandated RTAS calls (these are *not* the same as the
regular OF calls used at boot time - the boot time OF client interface
is long gone).  But then, I guess FDT wasn't a thing when that bit of
the PAPR spec was made.

> By the time qemu is queried for this information it should already know
> the real phandle of the interrupt controller;

Nope.  How would it?  After it fires it off, qemu is no longer in
communication with SLOF.

> can't it just patch it in?
> 
> 
> Segher
>
Segher Boessenkool Sept. 14, 2017, 3:56 p.m. UTC | #30
On Thu, Sep 14, 2017 at 11:48:28AM +1000, David Gibson wrote:
> On Wed, Sep 13, 2017 at 06:21:52PM -0500, Segher Boessenkool wrote:
> > On Sat, Sep 09, 2017 at 01:38:29PM +1000, David Gibson wrote:
> > > [Aside: strictly, it's not an FDT piece that goes to the guest OS;
> > > the guest performs a sequence of calls to traverse the DT fragment.
> > > qemu uses an fdt fragment as an internal detail, but that's not part
> > > of the protocol]
> > 
> > Ah that is good to hear :-)
> 
> It's really not.  Spitting in a chunk of FDT would be a simpler and
> more efficient method of accomplishing the same thing as the horrid
> set of PAPR mandated RTAS calls (these are *not* the same as the
> regular OF calls used at boot time - the boot time OF client interface
> is long gone).  But then, I guess FDT wasn't a thing when that bit of
> the PAPR spec was made.

Ah, I thought you meant client interface.  Yeah, bah :-(

> > By the time qemu is queried for this information it should already know
> > the real phandle of the interrupt controller;
> 
> Nope.  How would it?  After it fires it off, qemu is no longer in
> communication with SLOF.

Qemu emulates the hardware SLOF runs on, there is a gazillion ways it
can communicate.  You just have to figure out something reasonable.


Segher
David Gibson Sept. 15, 2017, 6:50 a.m. UTC | #31
On Thu, Sep 14, 2017 at 10:56:49AM -0500, Segher Boessenkool wrote:
> On Thu, Sep 14, 2017 at 11:48:28AM +1000, David Gibson wrote:
> > On Wed, Sep 13, 2017 at 06:21:52PM -0500, Segher Boessenkool wrote:
> > > On Sat, Sep 09, 2017 at 01:38:29PM +1000, David Gibson wrote:
> > > > [Aside: strictly, it's not an FDT piece that goes to the guest OS;
> > > > the guest performs a sequence of calls to traverse the DT fragment.
> > > > qemu uses an fdt fragment as an internal detail, but that's not part
> > > > of the protocol]
> > > 
> > > Ah that is good to hear :-)
> > 
> > It's really not.  Spitting in a chunk of FDT would be a simpler and
> > more efficient method of accomplishing the same thing as the horrid
> > set of PAPR mandated RTAS calls (these are *not* the same as the
> > regular OF calls used at boot time - the boot time OF client interface
> > is long gone).  But then, I guess FDT wasn't a thing when that bit of
> > the PAPR spec was made.
> 
> Ah, I thought you meant client interface.  Yeah, bah :-(
> 
> > > By the time qemu is queried for this information it should already know
> > > the real phandle of the interrupt controller;
> > 
> > Nope.  How would it?  After it fires it off, qemu is no longer in
> > communication with SLOF.
> 
> Qemu emulates the hardware SLOF runs on, there is a gazillion ways it
> can communicate.  You just have to figure out something reasonable.

Well, sure, and the patch which started this thread was a proposal for
one such.
David Gibson Sept. 27, 2017, 6:15 a.m. UTC | #32
On Sun, Sep 10, 2017 at 07:14:14PM +0100, Mark Cave-Ayland wrote:
> On 09/09/17 07:48, David Gibson wrote:
> 
> > On Fri, Sep 08, 2017 at 03:00:36PM +0100, Mark Cave-Ayland wrote:
> >> On 08/09/17 14:20, Greg Kurz wrote:
> >>> On Fri, 8 Sep 2017 13:51:24 +0100
> >>> Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> wrote:
> >>>
> >>>> On 08/09/17 12:59, David Gibson wrote:
> >>>>
> >>>>>> If you're looking for a way to reference a node outside of OF then the
> >>>>>> only way to consistently do this is via an OF path. What if when the DT
> >>>>>> blob for PHB was created in QEMU you create a fake interrupt-parent-path
> >>>>>> string property containing the OF path to the interrupt controller, and
> >>>>>> move the generation of interrupt-map to SLOF?  
> >>>>>   
> >>>>>> In SLOF you could then do something like below to get the phandle from
> >>>>>> the OF path:
> >>>>>> "interrupt-parent-path" get-package-property dev ihandle>phandle
> >>>>>> and from there, substituting the phandle into interrupt-map is trivial.  
> >>>>>
> >>>>> Nope.  At the time of hotplug, SLOF no longer exists - it's handed
> >>>>> over to the guest.  
> >>>>
> >>>> Yes, I understand that. This would be the process for getting the
> >>>> initial DT information to SLOF to generate interrupt-map upon boot.
> >>>>
> >>>>>> Similarly for the guest, it should be easy to iterate over the kernel DT
> >>>>>> to locate the interrupt controller device based upon OF path, and then
> >>>>>> use the interrupt-map information to update its routing information for
> >>>>>> the hotplugged PHB accordingly.  
> >>>>>
> >>>>> That requires a non-PAPR-compliant guest change.  Existing guests
> >>>>> already support this when running under PowerVM.  
> >>>>
> >>>> My understanding from the thread was that hotplugging PHBs is a new
> >>>> feature? In that case the transition is simple: if the
> >>>
> >>> The feature is mentioned in the PAPR spec but not yet implemented in QEMU.
> >>
> >> Meh. So in that case if this hacking of phandles is already part of the
> >> PAPR specification, I guess we are too late :(
> > 
> > Well, yes and no.  In PAPR the hotplug handling is framed in terms of
> > RTAS requests - the runtime portion of the guest firmware.
> > 
> > PowerVM has its own (proprietary) guest OF implementation.  Its
> > version of RTAS is a reasonably substantial piece of software that has
> > access to the device tree built by the boot-time portion of OF.  That
> > way its able to generate suitable DT fragments for plugged PHBs,
> > including phandle referencees.
> > 
> > Now hotplug clearly requires communication with the hypervisor, not
> > just guest firmware; and in fact that's true of nearly everything RTAS
> > does.  How the RTAS <-> hypervisor communication happens is not
> > specified by PAPR, and I don't know how the PowerVM implementation
> > does so.
> > 
> > For qemu/KVM, we decided - and I'm confident we were right to do so -
> > that having separate hypervisor <-> RTAS and RTAS <-> guest OS
> > protocols was silly.  So, our RTAS is a miniscule (literally 20 bytes
> > long) shim which simply forwards all RTAS requests to the hypervisor
> > (i.e. qemu).
> > 
> > This makes life much easier: it means we don't need to invent an
> > RTAS<->hypervisor protocol (for this and many other situations.  It
> > means we don't need to worry about updating such a protocol in sync
> > between the components.  It means we don't need a complicated piece of
> > RTAS code to be compiled with a guest-targetting toolchain.  It means
> > we don't need to jump through toolchain hoops to make code that's
> > relocatable and callable using the somewhat weird conventions that
> > RTAS uses.
> > 
> > But, it means the RTAS calls implemented in qemu don't have access to
> > the ouput-from-SLOF version of the device tree.
> > 
> > So, how do we address that?
> > 
> > One option is the one proposed earlier in the thread: a special
> > hypercall lets OF update qemu with the phandles of nodes as it
> > allocates them.  For now - and very likely, forever - the changed
> > phandles between the qemu generated "seed" tree and the OF-output tree
> > are the only changes that matter to us.
> > 
> > Another approach would be to snapshot the OF tree at the point we
> > instantiate RTAS.  We could either do that by having another special
> > hcall which lets OF report the whole revised tree to qemu.  Or we
> > could just have it dump it as FDT at a known location within the RTAS
> > blob (expanding it as necessary, obviously).
> 
> Thanks for detailed explanation. I think having access to the DT would
> be the safest option because some device properties may be generated
> outside of QEMU within OF, e.g. by FCode ROMs which can also make
> changes to the DT. And presumably this would then work regardless of the
> device being hotplugged.

Heh, so Greg's voted for the existing phandle-update proposal based on
simplicity, you've voted for the give DT to qemu proposal based on
completeness.

I've also discussed this with Alexey and Michael Ellerman who had some
thoughts.  Here's what I propose as our way forwards.

On the qemu side
----------------

1. Implement a new vendor specific hcall KVMPPC_H_UPDATE_DT which
   takes a single parameter, pointing at an FDT blob.

2. On H_UPDATE_DT, qemu will sanity check the provided fdt blob and
   store it away in the MachineState (this will need to be migrated).

3. When qemu needs the xics phandle for PHB hotplug it will look it up
   in the DT supplied to it by H_UPDATE_DT, if available.  If it's not
   available it will fall back to PHANDLE_XICP and hope for the best
   (this is for corner/test cases where the user bypasses SLOF and
    boots directly into a kernel - kvm-unit-tests may want to use this
    mode).

4. If we ever need more information from the DT which SLOF might have
   altered in qemu, we can get it from the same place

On the guest side
-----------------

1. Typically the last thing the guest does with OF before killing it
   off is to call the (quiesce) word.

2. At quiesce time, SLOF will linearize it's current version of the DT
   and submit it to H_UPDATE_DT
Segher Boessenkool Sept. 27, 2017, 9:30 a.m. UTC | #33
On Wed, Sep 27, 2017 at 04:15:43PM +1000, David Gibson wrote:
> 2. At quiesce time, SLOF will linearize it's current version of the DT
>    and submit it to H_UPDATE_DT

That will work.  It's a big hammer, but that also means it will solve
many related future problems.  I like it :-)


Segher
David Gibson Sept. 27, 2017, 10:23 a.m. UTC | #34
On Wed, Sep 27, 2017 at 04:30:55AM -0500, Segher Boessenkool wrote:
> On Wed, Sep 27, 2017 at 04:15:43PM +1000, David Gibson wrote:
> > 2. At quiesce time, SLOF will linearize it's current version of the DT
> >    and submit it to H_UPDATE_DT
> 
> That will work.  It's a big hammer, but that also means it will solve
> many related future problems.  I like it :-)

That's the theory.
Thomas Huth Sept. 27, 2017, 10:26 a.m. UTC | #35
On 27.09.2017 11:30, Segher Boessenkool wrote:
> On Wed, Sep 27, 2017 at 04:15:43PM +1000, David Gibson wrote:
>> 2. At quiesce time, SLOF will linearize it's current version of the DT
>>    and submit it to H_UPDATE_DT
> 
> That will work.  It's a big hammer, but that also means it will solve
> many related future problems.  I like it :-)

... it's likely just yet another slow-down to the boot process.
Linearizing the whole DT in Forth might take a while, I guess.

 Thomas
David Gibson Sept. 28, 2017, 4:24 a.m. UTC | #36
On Wed, Sep 27, 2017 at 12:26:09PM +0200, Thomas Huth wrote:
> On 27.09.2017 11:30, Segher Boessenkool wrote:
> > On Wed, Sep 27, 2017 at 04:15:43PM +1000, David Gibson wrote:
> >> 2. At quiesce time, SLOF will linearize it's current version of the DT
> >>    and submit it to H_UPDATE_DT
> > 
> > That will work.  It's a big hammer, but that also means it will solve
> > many related future problems.  I like it :-)
> 
> ... it's likely just yet another slow-down to the boot process.
> Linearizing the whole DT in Forth might take a while, I guess.

Do we have to do it in forth?  We could make a C extension to do this,
no?
Thomas Huth Sept. 28, 2017, 5:56 a.m. UTC | #37
On 28.09.2017 06:24, David Gibson wrote:
> On Wed, Sep 27, 2017 at 12:26:09PM +0200, Thomas Huth wrote:
>> On 27.09.2017 11:30, Segher Boessenkool wrote:
>>> On Wed, Sep 27, 2017 at 04:15:43PM +1000, David Gibson wrote:
>>>> 2. At quiesce time, SLOF will linearize it's current version of the DT
>>>>    and submit it to H_UPDATE_DT
>>>
>>> That will work.  It's a big hammer, but that also means it will solve
>>> many related future problems.  I like it :-)
>>
>> ... it's likely just yet another slow-down to the boot process.
>> Linearizing the whole DT in Forth might take a while, I guess.
> 
> Do we have to do it in forth?  We could make a C extension to do this,
> no?

The device tree lives in Forth, so even if you write the code that
creates the FDT in C, you have to call back into Forth for each and
every node and property that you find along the tree. I'm pretty sure it
won't perform extremely fast...

Anyway, to give you a number: The FDT->OF-DT conversion (fdt.fs)
currently takes 350 ms when running SLOF on my Laptop with TCG. I guess
the other way round will be a bit slower (since the device tree lookup
has to search through the Forth dictionaries), so I guess we will have
something around 0.5 seconds in the end. If that slowdown during quiesce
is acceptable, then go for it! If not, it might be better to search a
different solution instead.

 Thomas
David Gibson Sept. 28, 2017, 6:26 a.m. UTC | #38
On Thu, Sep 28, 2017 at 07:56:01AM +0200, Thomas Huth wrote:
> On 28.09.2017 06:24, David Gibson wrote:
> > On Wed, Sep 27, 2017 at 12:26:09PM +0200, Thomas Huth wrote:
> >> On 27.09.2017 11:30, Segher Boessenkool wrote:
> >>> On Wed, Sep 27, 2017 at 04:15:43PM +1000, David Gibson wrote:
> >>>> 2. At quiesce time, SLOF will linearize it's current version of the DT
> >>>>    and submit it to H_UPDATE_DT
> >>>
> >>> That will work.  It's a big hammer, but that also means it will solve
> >>> many related future problems.  I like it :-)
> >>
> >> ... it's likely just yet another slow-down to the boot process.
> >> Linearizing the whole DT in Forth might take a while, I guess.
> > 
> > Do we have to do it in forth?  We could make a C extension to do this,
> > no?
> 
> The device tree lives in Forth, so even if you write the code that
> creates the FDT in C, you have to call back into Forth for each and
> every node and property that you find along the tree. I'm pretty sure it
> won't perform extremely fast...

Ah, right.

> Anyway, to give you a number: The FDT->OF-DT conversion (fdt.fs)
> currently takes 350 ms when running SLOF on my Laptop with TCG. I guess
> the other way round will be a bit slower (since the device tree lookup
> has to search through the Forth dictionaries), so I guess we will have
> something around 0.5 seconds in the end. If that slowdown during quiesce
> is acceptable, then go for it! If not, it might be better to search a
> different solution instead.

Hrm.  Not a lot is occurring to me.  I'm really not too fond of the
approach of the UPDATE_PHANDLE hcall; it's so incomplete.  Not to
mention, I'm not sure what the cost will be of adding an hcall on
every phandle assignment.
Mark Cave-Ayland Sept. 28, 2017, 6:43 a.m. UTC | #39
On 27/09/17 10:30, Segher Boessenkool wrote:

> On Wed, Sep 27, 2017 at 04:15:43PM +1000, David Gibson wrote:
>> 2. At quiesce time, SLOF will linearize it's current version of the DT
>>    and submit it to H_UPDATE_DT
> 
> That will work.  It's a big hammer, but that also means it will solve
> many related future problems.  I like it :-)

Agreed, this should just about cover all circumstances I can think of.
I'm sure I've seen some more obscure OSs that don't call quiesce when
testing OpenBIOS, but given that this is new functionality and you can
update both SLOF and the client accordingly I don't think this will be
an issue.


ATB,

Mark.
Mark Cave-Ayland Sept. 28, 2017, 6:49 a.m. UTC | #40
On 28/09/17 06:56, Thomas Huth wrote:

> On 28.09.2017 06:24, David Gibson wrote:
>> On Wed, Sep 27, 2017 at 12:26:09PM +0200, Thomas Huth wrote:
>>> On 27.09.2017 11:30, Segher Boessenkool wrote:
>>>> On Wed, Sep 27, 2017 at 04:15:43PM +1000, David Gibson wrote:
>>>>> 2. At quiesce time, SLOF will linearize it's current version of the DT
>>>>>    and submit it to H_UPDATE_DT
>>>>
>>>> That will work.  It's a big hammer, but that also means it will solve
>>>> many related future problems.  I like it :-)
>>>
>>> ... it's likely just yet another slow-down to the boot process.
>>> Linearizing the whole DT in Forth might take a while, I guess.
>>
>> Do we have to do it in forth?  We could make a C extension to do this,
>> no?
> 
> The device tree lives in Forth, so even if you write the code that
> creates the FDT in C, you have to call back into Forth for each and
> every node and property that you find along the tree. I'm pretty sure it
> won't perform extremely fast...
> 
> Anyway, to give you a number: The FDT->OF-DT conversion (fdt.fs)
> currently takes 350 ms when running SLOF on my Laptop with TCG. I guess
> the other way round will be a bit slower (since the device tree lookup
> has to search through the Forth dictionaries), so I guess we will have
> something around 0.5 seconds in the end. If that slowdown during quiesce
> is acceptable, then go for it! If not, it might be better to search a
> different solution instead.

Yeah I don't think this will be an issue. In OpenBIOS each device node
is modelled by a Forth struct with the children, peers and properties
modelled as linked lists (the phandle is actually the pointer to this
struct).

I'm not that familiar with SLOF at source level, but if it does the same
then it's not too difficult to model the same structure but in C. Add in
some generic linked list traversal routines to iterate over the DT and
its properties and you should be good.


ATB,

Mark.
Segher Boessenkool Sept. 28, 2017, 8:40 a.m. UTC | #41
On Thu, Sep 28, 2017 at 07:56:01AM +0200, Thomas Huth wrote:
> On 28.09.2017 06:24, David Gibson wrote:
> > On Wed, Sep 27, 2017 at 12:26:09PM +0200, Thomas Huth wrote:
> >> ... it's likely just yet another slow-down to the boot process.
> >> Linearizing the whole DT in Forth might take a while, I guess.
> > 
> > Do we have to do it in forth?  We could make a C extension to do this,
> > no?
> 
> The device tree lives in Forth, so even if you write the code that
> creates the FDT in C, you have to call back into Forth for each and
> every node and property that you find along the tree. I'm pretty sure it
> won't perform extremely fast...
> 
> Anyway, to give you a number: The FDT->OF-DT conversion (fdt.fs)
> currently takes 350 ms when running SLOF on my Laptop with TCG. I guess
> the other way round will be a bit slower (since the device tree lookup
> has to search through the Forth dictionaries), so I guess we will have
> something around 0.5 seconds in the end. If that slowdown during quiesce
> is acceptable, then go for it! If not, it might be better to search a
> different solution instead.

Converting the device tree to an FDT should take about the same time
as "dev / ls".  How long does that take?

There are no dictionary lookups needed.


Segher
Segher Boessenkool Sept. 28, 2017, 8:49 a.m. UTC | #42
On Thu, Sep 28, 2017 at 07:49:36AM +0100, Mark Cave-Ayland wrote:
> Yeah I don't think this will be an issue. In OpenBIOS each device node
> is modelled by a Forth struct with the children, peers and properties
> modelled as linked lists (the phandle is actually the pointer to this
> struct).
> 
> I'm not that familiar with SLOF at source level, but if it does the same
> then it's not too difficult to model the same structure but in C. Add in
> some generic linked list traversal routines to iterate over the DT and
> its properties and you should be good.

It's about the same in SLOF.  Walking the device tree and converting
it to an FDT should be pretty trivial (you need node>qname and perhaps
cull some properties, for FDT peculiarities, but that's about it).


Segher
David Gibson Sept. 28, 2017, 9:30 a.m. UTC | #43
On Thu, Sep 28, 2017 at 03:40:09AM -0500, Segher Boessenkool wrote:
> On Thu, Sep 28, 2017 at 07:56:01AM +0200, Thomas Huth wrote:
> > On 28.09.2017 06:24, David Gibson wrote:
> > > On Wed, Sep 27, 2017 at 12:26:09PM +0200, Thomas Huth wrote:
> > >> ... it's likely just yet another slow-down to the boot process.
> > >> Linearizing the whole DT in Forth might take a while, I guess.
> > > 
> > > Do we have to do it in forth?  We could make a C extension to do this,
> > > no?
> > 
> > The device tree lives in Forth, so even if you write the code that
> > creates the FDT in C, you have to call back into Forth for each and
> > every node and property that you find along the tree. I'm pretty sure it
> > won't perform extremely fast...
> > 
> > Anyway, to give you a number: The FDT->OF-DT conversion (fdt.fs)
> > currently takes 350 ms when running SLOF on my Laptop with TCG. I guess
> > the other way round will be a bit slower (since the device tree lookup
> > has to search through the Forth dictionaries), so I guess we will have
> > something around 0.5 seconds in the end. If that slowdown during quiesce
> > is acceptable, then go for it! If not, it might be better to search a
> > different solution instead.
> 
> Converting the device tree to an FDT should take about the same time
> as "dev / ls".  How long does that take?

Actually, I doubt that, since I suspect dev / ls will be limited by
the actual output - either slow serial or quasi-serial or slow(ish)
rendering of characters on a framebuffer.

> 
> There are no dictionary lookups needed.
> 
> 
> Segher
>
Segher Boessenkool Sept. 28, 2017, 10:30 a.m. UTC | #44
On Thu, Sep 28, 2017 at 07:30:28PM +1000, David Gibson wrote:
> On Thu, Sep 28, 2017 at 03:40:09AM -0500, Segher Boessenkool wrote:
> > Converting the device tree to an FDT should take about the same time
> > as "dev / ls".  How long does that take?
> 
> Actually, I doubt that, since I suspect dev / ls will be limited by
> the actual output - either slow serial or quasi-serial or slow(ish)
> rendering of characters on a framebuffer.

So use a not-slow console?

Here, have one:

' emit behavior  ' drop to emit
dev / ls
to emit

Also interesting to test might be

' emit behavior  ' drop to emit
lsprop /
to emit


Segher
David Gibson Sept. 29, 2017, 6:19 a.m. UTC | #45
On Thu, Sep 28, 2017 at 03:49:14AM -0500, Segher Boessenkool wrote:
> On Thu, Sep 28, 2017 at 07:49:36AM +0100, Mark Cave-Ayland wrote:
> > Yeah I don't think this will be an issue. In OpenBIOS each device node
> > is modelled by a Forth struct with the children, peers and properties
> > modelled as linked lists (the phandle is actually the pointer to this
> > struct).
> > 
> > I'm not that familiar with SLOF at source level, but if it does the same
> > then it's not too difficult to model the same structure but in C. Add in
> > some generic linked list traversal routines to iterate over the DT and
> > its properties and you should be good.
> 
> It's about the same in SLOF.  Walking the device tree and converting
> it to an FDT should be pretty trivial (you need node>qname and perhaps
> cull some properties, for FDT peculiarities, but that's about it).

'name' should be the only property to cull.  You'll also need to
generate synthetic 'phandle' properties.  Plus some fluff to construct
the fdt header and memory reservation block, then split up the
structure and strings blocks.  The latter probably means you'll need
two working buffers, rather than emitting it all in one pass.
Segher Boessenkool Sept. 29, 2017, 8:18 a.m. UTC | #46
On Fri, Sep 29, 2017 at 04:19:59PM +1000, David Gibson wrote:
> On Thu, Sep 28, 2017 at 03:49:14AM -0500, Segher Boessenkool wrote:
> > On Thu, Sep 28, 2017 at 07:49:36AM +0100, Mark Cave-Ayland wrote:
> > > Yeah I don't think this will be an issue. In OpenBIOS each device node
> > > is modelled by a Forth struct with the children, peers and properties
> > > modelled as linked lists (the phandle is actually the pointer to this
> > > struct).
> > > 
> > > I'm not that familiar with SLOF at source level, but if it does the same
> > > then it's not too difficult to model the same structure but in C. Add in
> > > some generic linked list traversal routines to iterate over the DT and
> > > its properties and you should be good.
> > 
> > It's about the same in SLOF.  Walking the device tree and converting
> > it to an FDT should be pretty trivial (you need node>qname and perhaps
> > cull some properties, for FDT peculiarities, but that's about it).
> 
> 'name' should be the only property to cull.  You'll also need to
> generate synthetic 'phandle' properties.  Plus some fluff to construct
> the fdt header and memory reservation block, then split up the
> structure and strings blocks.  The latter probably means you'll need
> two working buffers, rather than emitting it all in one pass.

I'm a bit worried about the phandles FDT requires, which are a different
size than what OF uses (OF uses cell size, which is 64 bit on 64-bit OF;
the device tree specification uses 32 bit always).

Now usually everything lives low in memory, so the top 32 bits will all
be zeroes, but do we guarantee that anywhere (and do we want to restrict
to that!)

We cannot translate between the two types of phandles, because we do not
(and CANNOT) know what parts of properties are phandles (for example
when using FCode on a plugin board).

Or maybe phandles on flat device trees just are cell size and the spec
(I looked at devicetree.org) is horribly wrong?  That would be good
(here and everywhere else).


Segher
Segher Boessenkool Sept. 29, 2017, 8:42 a.m. UTC | #47
On Fri, Sep 29, 2017 at 03:18:41AM -0500, Segher Boessenkool wrote:
> I'm a bit worried about the phandles FDT requires, which are a different
> size than what OF uses (OF uses cell size, which is 64 bit on 64-bit OF;
> the device tree specification uses 32 bit always).
> 
> Now usually everything lives low in memory, so the top 32 bits will all
> be zeroes, but do we guarantee that anywhere (and do we want to restrict
> to that!)

OTOH SLOF currently has similar bugs itself.  Oh well.

> We cannot translate between the two types of phandles, because we do not
> (and CANNOT) know what parts of properties are phandles (for example
> when using FCode on a plugin board).
> 
> Or maybe phandles on flat device trees just are cell size and the spec
> (I looked at devicetree.org) is horribly wrong?  That would be good
> (here and everywhere else).


Segher
Thomas Huth Sept. 29, 2017, 6:08 p.m. UTC | #48
On 29.09.2017 10:18, Segher Boessenkool wrote:
> On Fri, Sep 29, 2017 at 04:19:59PM +1000, David Gibson wrote:
>> On Thu, Sep 28, 2017 at 03:49:14AM -0500, Segher Boessenkool wrote:
>>> On Thu, Sep 28, 2017 at 07:49:36AM +0100, Mark Cave-Ayland wrote:
>>>> Yeah I don't think this will be an issue. In OpenBIOS each device node
>>>> is modelled by a Forth struct with the children, peers and properties
>>>> modelled as linked lists (the phandle is actually the pointer to this
>>>> struct).
>>>>
>>>> I'm not that familiar with SLOF at source level, but if it does the same
>>>> then it's not too difficult to model the same structure but in C. Add in
>>>> some generic linked list traversal routines to iterate over the DT and
>>>> its properties and you should be good.
>>>
>>> It's about the same in SLOF.  Walking the device tree and converting
>>> it to an FDT should be pretty trivial (you need node>qname and perhaps
>>> cull some properties, for FDT peculiarities, but that's about it).
>>
>> 'name' should be the only property to cull.  You'll also need to
>> generate synthetic 'phandle' properties.  Plus some fluff to construct
>> the fdt header and memory reservation block, then split up the
>> structure and strings blocks.  The latter probably means you'll need
>> two working buffers, rather than emitting it all in one pass.
> 
> I'm a bit worried about the phandles FDT requires, which are a different
> size than what OF uses (OF uses cell size, which is 64 bit on 64-bit OF;
> the device tree specification uses 32 bit always).
> 
> Now usually everything lives low in memory, so the top 32 bits will all
> be zeroes, but do we guarantee that anywhere (and do we want to restrict
> to that!)

Currently QEMU puts the RTAS binary and the FDT just at the end of the
RMA or below 2G, whatever is lower (look for RTAS_MAX_ADDR in
hw/ppc/spapr.c). SLOF then puts itself right in front of the FDT from
QEMU (see romfs_base in board-qemu/llfw/stage2.c in the SLOF sources),
so everything related to SLOF always lives in the first 2G of RAM. Thus
we should be fine here.

 Thomas
Segher Boessenkool Sept. 29, 2017, 11:04 p.m. UTC | #49
On Fri, Sep 29, 2017 at 08:08:56PM +0200, Thomas Huth wrote:
> On 29.09.2017 10:18, Segher Boessenkool wrote:
> > I'm a bit worried about the phandles FDT requires, which are a different
> > size than what OF uses (OF uses cell size, which is 64 bit on 64-bit OF;
> > the device tree specification uses 32 bit always).
> > 
> > Now usually everything lives low in memory, so the top 32 bits will all
> > be zeroes, but do we guarantee that anywhere (and do we want to restrict
> > to that!)
> 
> Currently QEMU puts the RTAS binary and the FDT just at the end of the
> RMA or below 2G, whatever is lower (look for RTAS_MAX_ADDR in
> hw/ppc/spapr.c). SLOF then puts itself right in front of the FDT from
> QEMU (see romfs_base in board-qemu/llfw/stage2.c in the SLOF sources),
> so everything related to SLOF always lives in the first 2G of RAM. Thus
> we should be fine here.

Yep, and SLOF encodes (at least some) ihandles as just 32 bit (and actually
reads them from the device tree again constantly, bad plan -- see other
thread).

I guess we can live with the restriction, certainly if we only do this
create-FDT-from-the-real-device-tree thing when running under QEMU.


Segher
David Gibson Sept. 30, 2017, 3:17 a.m. UTC | #50
On Fri, Sep 29, 2017 at 03:18:41AM -0500, Segher Boessenkool wrote:
> On Fri, Sep 29, 2017 at 04:19:59PM +1000, David Gibson wrote:
> > On Thu, Sep 28, 2017 at 03:49:14AM -0500, Segher Boessenkool wrote:
> > > On Thu, Sep 28, 2017 at 07:49:36AM +0100, Mark Cave-Ayland wrote:
> > > > Yeah I don't think this will be an issue. In OpenBIOS each device node
> > > > is modelled by a Forth struct with the children, peers and properties
> > > > modelled as linked lists (the phandle is actually the pointer to this
> > > > struct).
> > > > 
> > > > I'm not that familiar with SLOF at source level, but if it does the same
> > > > then it's not too difficult to model the same structure but in C. Add in
> > > > some generic linked list traversal routines to iterate over the DT and
> > > > its properties and you should be good.
> > > 
> > > It's about the same in SLOF.  Walking the device tree and converting
> > > it to an FDT should be pretty trivial (you need node>qname and perhaps
> > > cull some properties, for FDT peculiarities, but that's about it).
> > 
> > 'name' should be the only property to cull.  You'll also need to
> > generate synthetic 'phandle' properties.  Plus some fluff to construct
> > the fdt header and memory reservation block, then split up the
> > structure and strings blocks.  The latter probably means you'll need
> > two working buffers, rather than emitting it all in one pass.
> 
> I'm a bit worried about the phandles FDT requires, which are a different
> size than what OF uses (OF uses cell size, which is 64 bit on 64-bit OF;
> the device tree specification uses 32 bit always).

Um.  Wat.

Maybe this was theoretically possible at some point.  Maybe there's
just never been a 64-bit OF in this sense.  But in practice phandles
(and all cells) have been always 32-bit for a very, very long time.

Linux has certainly never had any facility to handle 64-bit phandles.
You can tell because there aren't (is OF 64-bit)? conditionals
practically everywhere in the device tree code.

> Now usually everything lives low in memory, so the top 32 bits will all
> be zeroes, but do we guarantee that anywhere (and do we want to restrict
> to that!)
> 
> We cannot translate between the two types of phandles, because we do not
> (and CANNOT) know what parts of properties are phandles (for example
> when using FCode on a plugin board).
> 
> Or maybe phandles on flat device trees just are cell size and the spec
> (I looked at devicetree.org) is horribly wrong?  That would be good
> (here and everywhere else).
> 
> 
> Segher
>
David Gibson Sept. 30, 2017, 3:18 a.m. UTC | #51
On Fri, Sep 29, 2017 at 06:04:56PM -0500, Segher Boessenkool wrote:
> On Fri, Sep 29, 2017 at 08:08:56PM +0200, Thomas Huth wrote:
> > On 29.09.2017 10:18, Segher Boessenkool wrote:
> > > I'm a bit worried about the phandles FDT requires, which are a different
> > > size than what OF uses (OF uses cell size, which is 64 bit on 64-bit OF;
> > > the device tree specification uses 32 bit always).
> > > 
> > > Now usually everything lives low in memory, so the top 32 bits will all
> > > be zeroes, but do we guarantee that anywhere (and do we want to restrict
> > > to that!)
> > 
> > Currently QEMU puts the RTAS binary and the FDT just at the end of the
> > RMA or below 2G, whatever is lower (look for RTAS_MAX_ADDR in
> > hw/ppc/spapr.c). SLOF then puts itself right in front of the FDT from
> > QEMU (see romfs_base in board-qemu/llfw/stage2.c in the SLOF sources),
> > so everything related to SLOF always lives in the first 2G of RAM. Thus
> > we should be fine here.
> 
> Yep, and SLOF encodes (at least some) ihandles as just 32 bit (and actually
> reads them from the device tree again constantly, bad plan -- see other
> thread).
> 
> I guess we can live with the restriction, certainly if we only do this
> create-FDT-from-the-real-device-tree thing when running under QEMU.

64-bit phandles would break much, much more than just this proposal.
Segher Boessenkool Oct. 1, 2017, 8:37 p.m. UTC | #52
On Sat, Sep 30, 2017 at 01:17:55PM +1000, David Gibson wrote:
> On Fri, Sep 29, 2017 at 03:18:41AM -0500, Segher Boessenkool wrote:
> > I'm a bit worried about the phandles FDT requires, which are a different
> > size than what OF uses (OF uses cell size, which is 64 bit on 64-bit OF;
> > the device tree specification uses 32 bit always).
> 
> Um.  Wat.
> 
> Maybe this was theoretically possible at some point.  Maybe there's
> just never been a 64-bit OF in this sense.  But in practice phandles
> (and all cells) have been always 32-bit for a very, very long time.

All cells (and all phandles) are 64 bits on a 64-bit OF.  This is what
a 64-bit OF _is_, fundamentally: it uses 64-bit cells.  Cells are the
fundamental data type: for example stack entries are cells.

The things in the device tree are 32 bits.  A few places in the Open
Firmware specification unfortunately call those "cells" as well.

With the status quo (32-bit phandles in the device tree, so the client
uses 32-bit phandles as well) we can still use phandles with a non-zero
upper 32 bits in OF, using one of various translation schemes.  I was
worried that the translation via FDT would prevent that, force OF to
always use only memory in the low 4GB.  I now think that not much changes,
and in practice we will always use low 4GB addresses for OF's memory
anyway.  The recommendation in 1275.6 (the 64-bit extension) is similar.


Segher
David Gibson Oct. 3, 2017, 12:21 a.m. UTC | #53
On Sun, Oct 01, 2017 at 03:37:07PM -0500, Segher Boessenkool wrote:
> On Sat, Sep 30, 2017 at 01:17:55PM +1000, David Gibson wrote:
> > On Fri, Sep 29, 2017 at 03:18:41AM -0500, Segher Boessenkool wrote:
> > > I'm a bit worried about the phandles FDT requires, which are a different
> > > size than what OF uses (OF uses cell size, which is 64 bit on 64-bit OF;
> > > the device tree specification uses 32 bit always).
> > 
> > Um.  Wat.
> > 
> > Maybe this was theoretically possible at some point.  Maybe there's
> > just never been a 64-bit OF in this sense.  But in practice phandles
> > (and all cells) have been always 32-bit for a very, very long time.
> 
> All cells (and all phandles) are 64 bits on a 64-bit OF.  This is what
> a 64-bit OF _is_, fundamentally: it uses 64-bit cells.  Cells are the
> fundamental data type: for example stack entries are cells.
> 
> The things in the device tree are 32 bits.  A few places in the Open
> Firmware specification unfortunately call those "cells" as well.

Right.  Unfortunate as it may be, that's the main sense in which
"cell" is now being used.

> With the status quo (32-bit phandles in the device tree, so the client
> uses 32-bit phandles as well) we can still use phandles with a non-zero
> upper 32 bits in OF, using one of various translation schemes.  I was
> worried that the translation via FDT would prevent that, force OF to
> always use only memory in the low 4GB.  I now think that not much changes,
> and in practice we will always use low 4GB addresses for OF's memory
> anyway.  The recommendation in 1275.6 (the 64-bit extension) is similar.

So, I'm still a bit unclear on this.  Ok, so in a 64-bit OF phandles
are 64-bit internally.  What happens when they get encoded out into
the device tree though (e.g. in 'interrupt-parent' or whatever)?

1) Do they get encoded as 64-bit integers?

If this is the case then Linux does not support 64-bit OF, and never
has.

2) Do they get (somehow) translated down into 32-bit quantities?

If this is the case, then this problem has always existed and fdt adds
nothing new.  As long as the phandles are encoded/translated the same
way in the special 'phandle' properties, and when they're referenced
in 'interrupt-parent' and similar situations, fdt is fine.
Segher Boessenkool Oct. 4, 2017, 1:17 p.m. UTC | #54
On Tue, Oct 03, 2017 at 11:21:34AM +1100, David Gibson wrote:
> > All cells (and all phandles) are 64 bits on a 64-bit OF.  This is what
> > a 64-bit OF _is_, fundamentally: it uses 64-bit cells.  Cells are the
> > fundamental data type: for example stack entries are cells.
> > 
> > The things in the device tree are 32 bits.  A few places in the Open
> > Firmware specification unfortunately call those "cells" as well.
> 
> Right.  Unfortunate as it may be, that's the main sense in which
> "cell" is now being used.

Not in the OF world (or Forth world, even).  Culture clash :-)

> > With the status quo (32-bit phandles in the device tree, so the client
> > uses 32-bit phandles as well) we can still use phandles with a non-zero
> > upper 32 bits in OF, using one of various translation schemes.  I was
> > worried that the translation via FDT would prevent that, force OF to
> > always use only memory in the low 4GB.  I now think that not much changes,
> > and in practice we will always use low 4GB addresses for OF's memory
> > anyway.  The recommendation in 1275.6 (the 64-bit extension) is similar.
> 
> So, I'm still a bit unclear on this.  Ok, so in a 64-bit OF phandles
> are 64-bit internally.  What happens when they get encoded out into
> the device tree though (e.g. in 'interrupt-parent' or whatever)?

> 2) Do they get (somehow) translated down into 32-bit quantities?

This.  And translating 32-bit ihandles and phandles back into "real"
ones can get complex.  But it looks like we won't have to go there :-)


Segher
David Gibson Oct. 4, 2017, 11:39 p.m. UTC | #55
On Wed, Oct 04, 2017 at 08:17:12AM -0500, Segher Boessenkool wrote:
> On Tue, Oct 03, 2017 at 11:21:34AM +1100, David Gibson wrote:
> > > All cells (and all phandles) are 64 bits on a 64-bit OF.  This is what
> > > a 64-bit OF _is_, fundamentally: it uses 64-bit cells.  Cells are the
> > > fundamental data type: for example stack entries are cells.
> > > 
> > > The things in the device tree are 32 bits.  A few places in the Open
> > > Firmware specification unfortunately call those "cells" as well.
> > 
> > Right.  Unfortunate as it may be, that's the main sense in which
> > "cell" is now being used.
> 
> Not in the OF world (or Forth world, even).  Culture clash :-)

Sorry.  My world's bigger than your world :-p.

> > > With the status quo (32-bit phandles in the device tree, so the client
> > > uses 32-bit phandles as well) we can still use phandles with a non-zero
> > > upper 32 bits in OF, using one of various translation schemes.  I was
> > > worried that the translation via FDT would prevent that, force OF to
> > > always use only memory in the low 4GB.  I now think that not much changes,
> > > and in practice we will always use low 4GB addresses for OF's memory
> > > anyway.  The recommendation in 1275.6 (the 64-bit extension) is similar.
> > 
> > So, I'm still a bit unclear on this.  Ok, so in a 64-bit OF phandles
> > are 64-bit internally.  What happens when they get encoded out into
> > the device tree though (e.g. in 'interrupt-parent' or whatever)?
> 
> > 2) Do they get (somehow) translated down into 32-bit quantities?
> 
> This.

Ok, so this is always an issue with a 64-bit OF, and has nothing to do
with FDT.

> And translating 32-bit ihandles and phandles back into "real"
> ones can get complex.  But it looks like we won't have to go there :-)

Right.  It won't be a problem if you constrain OF's allocations to be
within a 4G region.  If not, you're probably going to have to treat
them as indirect labels, and have some sort of lookup table to
translate them to actual pointers.  Which makes them pretty much
equivalent to the way FDT treats phandles - an arbitrary, unique,
32-bit label on a node.

Though, TBH, if you really need > 4G for your boot time firmware, what
the _hell_ are you doing.
diff mbox

Patch

diff --git a/board-qemu/slof/fdt.fs b/board-qemu/slof/fdt.fs
index 851645eae299..a24e3449e7c1 100644
--- a/board-qemu/slof/fdt.fs
+++ b/board-qemu/slof/fdt.fs
@@ -308,18 +308,28 @@  fdt-claim-reserve
    3drop
 ;
 
+\ Tell QEMU about the updated phandle:
+: fdt-hv-update-phandle ( old new -- )
+   hv-update-phandle ?dup IF
+      \ Ignore hcall not implemented error, print error otherwise
+      dup -2 <> IF ." HV-UPDATE-PHANDLE error: " . cr ELSE drop THEN
+   THEN
+;
+
 \ Replace one FDT phandle "val" with a OF1275 phandle "node" in the
 \ whole tree:
 : fdt-update-phandle ( val node -- )
    >r
    FALSE TO (fdt-phandle-replaced)
-   r@ s" /" find-node               ( val node root )
-   fdt-replace-all-phandles
+   r@ 2dup s" /" find-node          ( val node val node root )
+   fdt-replace-all-phandles         ( val node )
    (fdt-phandle-replaced) IF
+      fdt-hv-update-phandle
       r@ set-node
       s" phandle" delete-property
       s" linux,phandle" delete-property
    ELSE
+      2drop
       diagnostic-mode? IF
          cr ." Warning: Did not replace phandle in " r@ node>path type cr
       THEN
diff --git a/lib/libhvcall/hvcall.code b/lib/libhvcall/hvcall.code
index 0ff50f27e8a9..834974862ef5 100644
--- a/lib/libhvcall/hvcall.code
+++ b/lib/libhvcall/hvcall.code
@@ -129,3 +129,10 @@  PRIM(check_X2d_and_X2d_patch_X2d_sc1)
 
 	patch_broken_sc1((void*)start, (void*)end, (void*)patch_ins);
 MIRP
+
+// : hv-update-phandle ( old_phandle new_phandle -- res )
+PRIM(hv_X2d_update_X2d_phandle)
+	uint32_t new_phandle = TOS.u; POP;
+	uint32_t old_phandle = TOS.u;
+	TOS.u = hv_generic(KVMPPC_H_UPDATE_PHANDLE, old_phandle, new_phandle);
+MIRP
diff --git a/lib/libhvcall/hvcall.in b/lib/libhvcall/hvcall.in
index 4437b77f001d..ab7513af8977 100644
--- a/lib/libhvcall/hvcall.in
+++ b/lib/libhvcall/hvcall.in
@@ -31,4 +31,5 @@  cod(RX!)
 cod(hv-logical-memop)
 cod(hv-cas)
 cod(hv-rtas-update)
+cod(hv-update-phandle)
 cod(get-print-version)
diff --git a/lib/libhvcall/libhvcall.h b/lib/libhvcall/libhvcall.h
index b2ea3f6bf944..5776a2b772f7 100644
--- a/lib/libhvcall/libhvcall.h
+++ b/lib/libhvcall/libhvcall.h
@@ -25,6 +25,7 @@ 
 /* Client Architecture support */
 #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
 #define KVMPPC_H_RTAS_UPDATE    (KVMPPC_HCALL_BASE + 0x3)
+#define KVMPPC_H_UPDATE_PHANDLE (KVMPPC_HCALL_BASE + 0x4)
 
 #ifndef __ASSEMBLY__