diff mbox series

[qemu,v21] spapr: Implement Open Firmware client interface

Message ID 20210615070612.22679-1-aik@ozlabs.ru
State New
Headers show
Series [qemu,v21] spapr: Implement Open Firmware client interface | expand

Commit Message

Alexey Kardashevskiy June 15, 2021, 7:06 a.m. UTC
The PAPR platform describes an OS environment that's presented by
a combination of a hypervisor and firmware. The features it specifies
require collaboration between the firmware and the hypervisor.

Since the beginning, the runtime component of the firmware (RTAS) has
been implemented as a 20 byte shim which simply forwards it to
a hypercall implemented in qemu. The boot time firmware component is
SLOF - but a build that's specific to qemu, and has always needed to be
updated in sync with it. Even though we've managed to limit the amount
of runtime communication we need between qemu and SLOF, there's some,
and it has become increasingly awkward to handle as we've implemented
new features.

This implements a boot time OF client interface (CI) which is
enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
Firmware"). When enabled, QEMU implements the custom H_OF_CLIENT hcall
which implements Open Firmware Client Interface (OF CI). This allows
using a smaller stateless firmware which does not have to manage
the device tree.

The new "vof.bin" firmware image is included with source code under
pc-bios/. It also includes RTAS blob.

This implements a handful of CI methods just to get -kernel/-initrd
working. In particular, this implements the device tree fetching and
simple memory allocator - "claim" (an OF CI memory allocator) and updates
"/memory@0/available" to report available memory to the client.

This implements changing some device tree properties which we know how
to deal with, the rest is ignored. To allow changes, this skips
fdt_pack() when x-vof=on as not packing the blob leaves some room for
appending.

In absence of SLOF, this assigns phandles to device tree nodes to make
device tree traversing work.

When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.

This adds basic instances support which are managed by a hash map
ihandle -> [phandle].

Before the guest started, the used memory is:
0..e60 - the initial firmware
8000..10000 - stack
400000.. - kernel
3ea0000.. - initramdisk

This OF CI does not implement "interpret".

Unlike SLOF, this does not format uninitialized nvram. Instead, this
includes a disk image with pre-formatted nvram.

With this basic support, this can only boot into kernel directly.
However this is just enough for the petitboot kernel and initramdisk to
boot from any possible source. Note this requires reasonably recent guest
kernel with:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735

The immediate benefit is much faster booting time which is especially
crucial with fully emulated early CPU bring up environments. Also this
may come in handy when/if GRUB-in-the-userspace sees the light of day.

This separates VOF and sPAPR in the hope that VOF bits may be reused by
other POWERPC boards which do not support pSeries.

This makes VOF optional; it is disabled by default, add --enable-vof
to ./configure to enable it.

This assumes potential support for booting from QEMU backends
such as blockdev or netdev without devices/drivers used.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

The example command line is:

/home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
-nodefaults \
-chardev stdio,id=STDIO0,signal=off,mux=on \
-device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
-mon id=MON0,chardev=STDIO0,mode=readline \
-nographic \
-vga none \
-enable-kvm \
-m 8G \
-machine pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off \
-kernel pbuild/kernel-le-guest/vmlinux \
-initrd pb/rootfs.cpio.xz \
-drive id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw \
-global spapr-nvram.drive=DRIVE0 \
-snapshot \
-smp 8,threads=8 \
-L /home/aik/t/qemu-ppc64-bios/ \
-trace events=qemu_trace_events \
-d guest_errors \
-chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
-mon chardev=SOCKET0,mode=control

---
Changes:
v21:
* s/ld/ldz/ in entry.S
* moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to Kconfig
* made CONFIG_VOF optional
* s/l.lds/vof.lds/
* force 32 BE in spapr_machine_reset() instead of the firmware
* added checks for non-null methods of VofMachineIfClass
* moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better comment
* added  path_offset wrapper for handling mixed case for addresses
after "@" in node names
* changed getprop() to check for actual "name" property in the fdt
* moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike similar
rtas_ld/ldl_be_*) they return error codes
* VOF_MEM_READ uses now address_space_read (it was address_space_read_full
before, not sure why)

v20:
* compile vof.bin with -mcpu=power4 for better compatibility
* s/std/stw/ in entry.S to make it work on ppc32
* fixed dt_available property to support both 32 and 64bit
* shuffled prom_args handling code
* do not enforce 32bit in MSR (again, to support 32bit platforms)

v19:
* put bootargs in the FDT
* moved setting properties to a VOF machine hook
* moved fw_size and claim for it to vof_init()
* added CROSS to the VOF's makefile
* simplified phandles assigning
* pass MachineState to all machine hooks instead of calling
qdev_get_machine (following QOM)
* bunch of smaller changes and added comments
* added simple test to attempt to start with x-vof=on

v18:
* fixed top addr (max address for "claim") on radix - it equals to ram_size
and vof->top_addr was uint32_t
* fixed "available" property which got broken in v14 but it is only visible
to clients which care (== grub)
* reshuffled vof_dt_memory_available() calls, added vof_init() to allow
vof_claim() before rendering the FDT

v17:
* mv hw/ppc/vof.h include/hw/ppc/vof.h
* VofMachineIfClass -> VofMachineClass; it is not VofMachineInterface as
nobody used this scheme, usually "Interface" is dropped, a couple of times
it is "xxxInterfaceClass" or "xxxIfClass"; I used the latter as it is
used by include/hw/vmstate-if.h
* added SPDX
* other fixes from v16 review

v16:
* rebased on dwg/ppc-for-6.1
* s/SpaprVofInterface/VofMachineInterface/

v15:
* bugfix: claimed memory for the VOF itself
* ditched OF_STACK_ADDR and allocate one instead, now it starts from 0x8000
because it is aligned to its size (no particular reason though)
* coding style
* moved nvram.bin up one level
* ditched bool in the firmware
* made debugging code conditional using trace_event_get_state() + qemu_loglevel_mask()
* renamed the CAS interface to SpaprVofInterface
* added "write" which for now dumps the message and ihandle via
trace point for early debug assistance
* commented on when we allocate of_instances in vof_build_dt()
* store fw_size in SpaprMachine to let spapr_vof_reset() claim it
* many small fixes from v14's review

v14:
* check for truncates in readstr()
* ditched a separate vof_reset()
* spapr->vof is a pointer now, dropped the "on" field
* removed rtas_base from vof and updated comment why we allow setting it
* added myself to maintainers
* updated commit log about blockdev and other possible platforms
* added a note why new hcall is 0x5
* no in place endianness conversion in spapr_h_vof_client
* converted all cpu_physical_memory_read/write to address_space_rw
* git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c

v13:
* rebase on latest ppc-for-6.0
* shuffled code around to touch spapr.c less

v12:
* split VOF and SPAPR

v11:
* added g_autofree
* fixed gcc warnings
* fixed few leaks
* added nvram image to make "nvram --print-config" not crash;
Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum size
is 16K, or it just does not work (empty output from "nvram")

v10:
* now rebased to compile with meson

v9:
* remove special handling of /rtas/rtas-size as now we always add it in QEMU
* removed leftovers from scsi/grub/stdout/stdin/...

v8:
* no read/write/seek
* no @dev in instances
* the machine flag is "x-vof" for now

v7:
* now we have a small firmware which loads at 0 as SLOF and starts from
0x100 as SLOF
* no MBR/ELF/GRUB business in QEMU anymore
* blockdev is a separate patch
* networking is a separate patch

v6:
* borrowed a big chunk of commit log introduction from David
* fixed initial stack pointer (points to the highest address of stack)
* traces for "interpret" and others
* disabled  translate_kernel_address() hack so grub can load (work in
progress)
* added "milliseconds" for grub
* fixed "claim" allocator again
* moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too for CAS
* moved the most code possible from spapr.c to spapr_of_client.c, such as
RTAS, prom entry and FDT build/finalize
* separated blobs
* GRUB now proceeds to its console prompt (there are still other issues)
* parse MBR/GPT to find PReP and load GRUB

v5:
* made instances keep device and chardev pointers
* removed VIO dependencies
* print error if RTAS memory is not claimed as it should have been
* pack FDT as "quiesce"

v4:
* fixed open
* validate ihandles in "call-method"

v3:
* fixed phandles allocation
* s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
* fixed size of /chosen/stdout
* bunch of renames
* do not create rtas properties at all, let the client deal with it;
instead setprop allows changing these in the FDT
* no more packing FDT when bios=off - nobody needs it and getprop does not
work otherwise
* allow updating initramdisk device tree properties (for zImage)
* added instances
* fixed stdout on OF's "write"
* removed special handling for stdout in OF client, spapr-vty handles it
instead

v2:
* fixed claim()
* added "setprop"
* cleaner client interface and RTAS blobs management
* boots to petitboot and further to the target system
* more trace points

v20

v20!
---
 configure               |    9 +
 pc-bios/vof/Makefile    |   23 +
 include/hw/ppc/spapr.h  |   25 +-
 include/hw/ppc/vof.h    |   55 ++
 pc-bios/vof/vof.h       |   43 ++
 hw/ppc/spapr.c          |   87 +++-
 hw/ppc/spapr_hcall.c    |   29 +-
 hw/ppc/spapr_vof.c      |  153 ++++++
 hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
 pc-bios/vof/bootmem.c   |   14 +
 pc-bios/vof/ci.c        |   91 ++++
 pc-bios/vof/libc.c      |   92 ++++
 pc-bios/vof/main.c      |   21 +
 tests/qtest/rtas-test.c |   17 +-
 MAINTAINERS             |   12 +
 hw/ppc/Kconfig          |    3 +
 hw/ppc/meson.build      |    3 +
 hw/ppc/trace-events     |   24 +
 meson.build             |    1 +
 pc-bios/README          |    2 +
 pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
 pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
 pc-bios/vof/entry.S     |   49 ++
 pc-bios/vof/vof.lds     |   48 ++
 24 files changed, 1840 insertions(+), 13 deletions(-)
 create mode 100644 pc-bios/vof/Makefile
 create mode 100644 include/hw/ppc/vof.h
 create mode 100644 pc-bios/vof/vof.h
 create mode 100644 hw/ppc/spapr_vof.c
 create mode 100644 hw/ppc/vof.c
 create mode 100644 pc-bios/vof/bootmem.c
 create mode 100644 pc-bios/vof/ci.c
 create mode 100644 pc-bios/vof/libc.c
 create mode 100644 pc-bios/vof/main.c
 create mode 100644 pc-bios/vof-nvram.bin
 create mode 100755 pc-bios/vof.bin
 create mode 100644 pc-bios/vof/entry.S
 create mode 100644 pc-bios/vof/vof.lds

Comments

BALATON Zoltan June 15, 2021, 10:29 a.m. UTC | #1
On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
> The PAPR platform describes an OS environment that's presented by
> a combination of a hypervisor and firmware. The features it specifies
> require collaboration between the firmware and the hypervisor.
>
> Since the beginning, the runtime component of the firmware (RTAS) has
> been implemented as a 20 byte shim which simply forwards it to
> a hypercall implemented in qemu. The boot time firmware component is
> SLOF - but a build that's specific to qemu, and has always needed to be
> updated in sync with it. Even though we've managed to limit the amount
> of runtime communication we need between qemu and SLOF, there's some,
> and it has become increasingly awkward to handle as we've implemented
> new features.
>
> This implements a boot time OF client interface (CI) which is
> enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
> Firmware). When enabled, QEMU implements the custom H_OF_CLIENT hcall
> which implements Open Firmware Client Interface (OF CI). This allows
> using a smaller stateless firmware which does not have to manage
> the device tree.
>
> The new "vof.bin" firmware image is included with source code under
> pc-bios/. It also includes RTAS blob.
>
> This implements a handful of CI methods just to get -kernel/-initrd
> working. In particular, this implements the device tree fetching and
> simple memory allocator - "claim" (an OF CI memory allocator) and updates
> "/memory@0/available" to report the client about available memory.
>
> This implements changing some device tree properties which we know how
> to deal with, the rest is ignored. To allow changes, this skips
> fdt_pack() when x-vof=on as not packing the blob leaves some room for
> appending.
>
> In absence of SLOF, this assigns phandles to device tree nodes to make
> device tree traversing work.
>
> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>
> This adds basic instances support which are managed by a hash map
> ihandle -> [phandle].
>
> Before the guest started, the used memory is:
> 0..e60 - the initial firmware
> 8000..10000 - stack
> 400000.. - kernel
> 3ea0000.. - initramdisk
>
> This OF CI does not implement "interpret".
>
> Unlike SLOF, this does not format uninitialized nvram. Instead, this
> includes a disk image with pre-formatted nvram.
>
> With this basic support, this can only boot into kernel directly.
> However this is just enough for the petitboot kernel and initradmdisk to
> boot from any possible source. Note this requires reasonably recent guest
> kernel with:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735
>
> The immediate benefit is much faster booting time which especially
> crucial with fully emulated early CPU bring up environments. Also this
> may come handy when/if GRUB-in-the-userspace sees light of the day.
>
> This separates VOF and sPAPR in a hope that VOF bits may be reused by
> other POWERPC boards which do not support pSeries.
>
> This make VOF optional, it is disabled by default, add --enable-vof
> to ./configure to enable it.
>
> This assumes potential support for booting from QEMU backends
> such as blockdev or netdev without devices/drivers used.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>
> The example command line is:
>
> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
> -nodefaults \
> -chardev stdio,id=STDIO0,signal=off,mux=on \
> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
> -mon id=MON0,chardev=STDIO0,mode=readline \
> -nographic \
> -vga none \
> -enable-kvm \
> -m 8G \
> -machine pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off \
> -kernel pbuild/kernel-le-guest/vmlinux \
> -initrd pb/rootfs.cpio.xz \
> -drive id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw \
> -global spapr-nvram.drive=DRIVE0 \
> -snapshot \
> -smp 8,threads=8 \
> -L /home/aik/t/qemu-ppc64-bios/ \
> -trace events=qemu_trace_events \
> -d guest_errors \
> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
> -mon chardev=SOCKET0,mode=control

I haven't looked at it in detail yet, just some quick comments I have on 
first skim through.

> ---
> Changes:
> v21:
> * s/ld/ldz/ in entry.S

Typo? Has this become lwz?

> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to Kconfig
> * made CONFIG_VOF optional

This won't work for pegasos2, see below.

> * s/l.lds/vof.lds/
> * force 32 BE in spapr_machine_reset() instead of the firmware
> * added checks for non-null methods of VofMachineIfClass
> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better comment
> * added  path_offset wrapper for handling mixed case for addresses
> after "@" in node names
> * changed getprop() to check for actual "name" property in the fdt
> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike similar
> rtas_ld/ldl_be_*) they return error codes
> * VOF_MEM_READ uses now address_space_read (it was address_space_read_full
> before, not sure why)
[...]
> ---
> configure               |    9 +
> pc-bios/vof/Makefile    |   23 +
> include/hw/ppc/spapr.h  |   25 +-
> include/hw/ppc/vof.h    |   55 ++
> pc-bios/vof/vof.h       |   43 ++
> hw/ppc/spapr.c          |   87 +++-
> hw/ppc/spapr_hcall.c    |   29 +-
> hw/ppc/spapr_vof.c      |  153 ++++++
> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
> pc-bios/vof/bootmem.c   |   14 +
> pc-bios/vof/ci.c        |   91 ++++
> pc-bios/vof/libc.c      |   92 ++++
> pc-bios/vof/main.c      |   21 +
> tests/qtest/rtas-test.c |   17 +-
> MAINTAINERS             |   12 +
> hw/ppc/Kconfig          |    3 +
> hw/ppc/meson.build      |    3 +
> hw/ppc/trace-events     |   24 +
> meson.build             |    1 +
> pc-bios/README          |    2 +
> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
> pc-bios/vof/entry.S     |   49 ++
> pc-bios/vof/vof.lds     |   48 ++
> 24 files changed, 1840 insertions(+), 13 deletions(-)
> create mode 100644 pc-bios/vof/Makefile
> create mode 100644 include/hw/ppc/vof.h
> create mode 100644 pc-bios/vof/vof.h
> create mode 100644 hw/ppc/spapr_vof.c
> create mode 100644 hw/ppc/vof.c
> create mode 100644 pc-bios/vof/bootmem.c
> create mode 100644 pc-bios/vof/ci.c
> create mode 100644 pc-bios/vof/libc.c
> create mode 100644 pc-bios/vof/main.c
> create mode 100644 pc-bios/vof-nvram.bin
> create mode 100755 pc-bios/vof.bin
> create mode 100644 pc-bios/vof/entry.S
> create mode 100644 pc-bios/vof/vof.lds
>
> diff --git a/configure b/configure
> index 8dcb9965b24e..00dc29c027fa 100755
> --- a/configure
> +++ b/configure
> @@ -445,6 +445,7 @@ fuse="auto"
> fuse_lseek="auto"
> multiprocess="auto"
> slirp_smbd="$default_feature"
> +vof="no"

Why is this disabled by default? I pretty much need VOF in pegasos2 as 
there would be no other firmware otherwise. So it means I have to select 
VOF in pegasos2 config and then VOF itself cannot be optional any more. If 
you want it to be optional for spapr then you can't use CONFIG_VOF for 
that but need to add a separate CONFIG_VOF_SPAPR or CONFIG_SPAPR_VOF 
option that you can set to no by default even when CONFIG_VOF is yes and 
make VOF usage conditional on that variable within spapr files.

Hope this makes sense. But I don't really see why you need to do that when 
you already have this turned off by default for spapr unless the x-vof 
option is used. Isn't that enough to make this optional? If not then you 
need another spapr specific CONFIG_* variable because CONFIG_PEGASOS2 has 
to select CONFIG_VOF as it will be its default firmware. For the same 
reason you should not put it behind a config option especially one that 
needs to be explicitly enabled.

> malloc_trim="auto"
> gio="$default_feature"
> @@ -1561,6 +1562,10 @@ for opt do
>   ;;
>   --disable-slirp-smbd) slirp_smbd=no
>   ;;
> +  --enable-vof) vof=yes
> +  ;;
> +  --disable-vof) vof=no
> +  ;;
>   *)
>       echo "ERROR: unknown option $opt"
>       echo "Try '$0 --help' for more information"
> @@ -1940,6 +1945,7 @@ disabled with --disable-FEATURE, default is enabled if available
>   multiprocess    Out of process device emulation support
>   gio             libgio support
>   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
> +  vof             Virtual Open Firmware support (powerpc/pseries, experimental)
>
> NOTE: The object files are built at the place where configure is launched
> EOF
> @@ -5555,6 +5561,9 @@ if test "$slirp_smbd" = "yes" ; then
>   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
>   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
> fi
> +if test "$vof" = "yes" ; then
> +  echo "CONFIG_VOF=y" >> $config_host_mak
> +fi
> if test "$vde" = "yes" ; then
>   echo "CONFIG_VDE=y" >> $config_host_mak
>   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
> new file mode 100644
> index 000000000000..aa1678c4d889
> --- /dev/null
> +++ b/pc-bios/vof/Makefile
> @@ -0,0 +1,23 @@
> +all: build-all
> +
> +build-all: vof.bin
> +
> +CROSS ?=
> +CC = $(CROSS)gcc
> +LD = $(CROSS)ld
> +OBJCOPY = $(CROSS)objcopy
> +
> +%.o: %.S
> +	$(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
> +
> +%.o: %.c
> +	$(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o $@ $<
> +
> +vof.elf: entry.o main.o ci.o bootmem.o libc.o
> +	$(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
> +
> +%.bin: %.elf
> +	$(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
> +
> +clean:
> +	rm -f *.o vof.bin vof.elf *~
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index f05219f75ef6..39b5581ae650 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -12,6 +12,9 @@
> #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
> #include "hw/ppc/xics.h"        /* For ICSState */
> #include "hw/ppc/spapr_tpm_proxy.h"
> +#ifdef CONFIG_VOF
> +#include "hw/ppc/vof.h"
> +#endif
>
> struct SpaprVioBus;
> struct SpaprPhbState;
> @@ -180,6 +183,9 @@ struct SpaprMachineState {
>     uint64_t kernel_addr;
>     uint32_t initrd_base;
>     long initrd_size;
> +#ifdef CONFIG_VOF
> +    Vof *vof;
> +#endif
>     uint64_t rtc_offset; /* Now used only during incoming migration */
>     struct PPCTimebase tb;
>     bool has_graphics;
> @@ -558,7 +564,9 @@ struct SpaprMachineState {
> /* Client Architecture support */
> #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>
> /*
>  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
> @@ -956,4 +964,19 @@ bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize,
> void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
> hwaddr spapr_get_rtas_addr(void);
> bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
> +
> +#ifdef CONFIG_VOF
> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
> +                     target_ulong *stack_ptr, Error **errp);
> +void spapr_vof_quiesce(MachineState *ms);
> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char *propname,
> +                       void *val, int vallen);
> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                target_ulong opcode, target_ulong *args);
> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
> +                                                   CPUState *cs,
> +                                                   target_ulong ovec_addr);
> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
> +#endif
> +
> #endif /* HW_SPAPR_H */
> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
> new file mode 100644
> index 000000000000..65ca2fed0d41
> --- /dev/null
> +++ b/include/hw/ppc/vof.h
> @@ -0,0 +1,55 @@
> +/*
> + * Virtual Open Firmware
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +#ifndef HW_VOF_H
> +#define HW_VOF_H
> +
> +typedef struct Vof {
> +    uint64_t top_addr; /* copied from rma_size */
> +    GArray *claimed; /* array of SpaprOfClaimed */
> +    uint64_t claimed_base;
> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
> +    uint32_t of_instance_last;
> +    char *bootargs;
> +    long fw_size;
> +} Vof;
> +
> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
> +                    target_ulong args_real);
> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t align);
> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
> +void vof_cleanup(Vof *vof);
> +void vof_build_dt(void *fdt, Vof *vof);
> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
> +                               const char *prop, const char *path);
> +
> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
> +
> +typedef struct VofMachineIfClass VofMachineIfClass;
> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, TYPE_VOF_MACHINE_IF)
> +
> +struct VofMachineIfClass {
> +    InterfaceClass parent;
> +    target_ulong (*client_architecture_support)(MachineState *ms, CPUState *cs,
> +                                                target_ulong vec);
> +    void (*quiesce)(MachineState *ms);
> +    bool (*setprop)(MachineState *ms, const char *path, const char *propname,
> +                    void *val, int vallen);
> +};
> +
> +/*
> + * Initial stack size is from
> + * https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html
> + */
> +#define VOF_STACK_SIZE       0x8000

Maybe also add a define for RTAS_SIZE here? We'll need to put that in the 
device tree but it depends on the rtas shim size that's part of VOF so it 
should be defined here instead of hardcoding it in boards that use VOF so 
it can be updated later at one place if needed.

> +
> +#define VOF_MEM_READ(pa, buf, size) \
> +    address_space_read(&address_space_memory, \
> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
> +#define VOF_MEM_WRITE(pa, buf, size) \
> +    address_space_write(&address_space_memory, \
> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))

These aren't very useful without the struct definition that you typically 
want to read data into using these.

> +
> +#endif /* HW_VOF_H */
[...]
> diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
> new file mode 100644
> index 000000000000..1068a1e58388
> --- /dev/null
> +++ b/hw/ppc/vof.c
> +
> +/* Defined as Big Endian */
> +struct prom_args {
> +    uint32_t service;
> +    uint32_t nargs;
> +    uint32_t nret;
> +    uint32_t args[10];
> +} QEMU_PACKED;

I mean this one, this could be in vof.h too. But this may better be in a 
generic rtas.h with the rtas_* macros so maybe done at a later point. So 
maybe just forget it for now.

> +
> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
> +                    target_ulong args_real)
> +{
> +    struct prom_args args_be;
> +    uint32_t args[ARRAY_SIZE(args_be.args)];
> +    uint32_t rets[ARRAY_SIZE(args_be.args)] = { 0 }, ret;
> +    char service[64];
> +    unsigned nargs, nret, i;
> +
> +    if (address_space_rw(&address_space_memory, args_real,
> +                         MEMTXATTRS_UNSPECIFIED, &args_be, sizeof(args_be),
> +                         false) != MEMTX_OK) {
> +        return -EINVAL;
> +    }
> +    nargs = be32_to_cpu(args_be.nargs);
> +    if (nargs >= ARRAY_SIZE(args_be.args)) {
> +        return -EINVAL;
> +    }
> +
> +    if (address_space_rw(&address_space_memory, be32_to_cpu(args_be.service),
> +                         MEMTXATTRS_UNSPECIFIED, service, sizeof(service),
> +                         false) != MEMTX_OK) {
> +        return -EINVAL;
> +    }
> +    if (strnlen(service, sizeof(service)) == sizeof(service)) {
> +        /* Too long service name */
> +        return -EINVAL;
> +    }
> +
> +    for (i = 0; i < nargs; ++i) {
> +        args[i] = be32_to_cpu(args_be.args[i]);
> +    }
> +
> +    nret = be32_to_cpu(args_be.nret);
> +    ret = vof_client_handle(ms, fdt, vof, service, args, nargs, rets, nret);
> +    if (!nret) {
> +        return 0;
> +    }
> +
> +    args_be.args[nargs] = cpu_to_be32(ret);
> +    for (i = 1; i < nret; ++i) {
> +        args_be.args[nargs + i] = cpu_to_be32(rets[i - 1]);
> +    }
> +
> +    if (address_space_rw(&address_space_memory,
> +                         args_real + offsetof(struct prom_args, args[nargs]),
> +                         MEMTXATTRS_UNSPECIFIED, args_be.args + nargs,
> +                         sizeof(args_be.args[0]) * nret, true) != MEMTX_OK) {
> +        return -EINVAL;

Also you're still not using your macros here. Why?

Regards,
BALATON Zoltan
BALATON Zoltan June 15, 2021, 9:09 p.m. UTC | #2
On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
> The PAPR platform describes an OS environment that's presented by
> a combination of a hypervisor and firmware. The features it specifies
> require collaboration between the firmware and the hypervisor.
>
> Since the beginning, the runtime component of the firmware (RTAS) has
> been implemented as a 20 byte shim which simply forwards it to
> a hypercall implemented in qemu. The boot time firmware component is
> SLOF - but a build that's specific to qemu, and has always needed to be
> updated in sync with it. Even though we've managed to limit the amount
> of runtime communication we need between qemu and SLOF, there's some,
> and it has become increasingly awkward to handle as we've implemented
> new features.
>
> This implements a boot time OF client interface (CI) which is
> enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
> Firmware). When enabled, QEMU implements the custom H_OF_CLIENT hcall
> which implements Open Firmware Client Interface (OF CI). This allows
> using a smaller stateless firmware which does not have to manage
> the device tree.
>
> The new "vof.bin" firmware image is included with source code under
> pc-bios/. It also includes RTAS blob.
>
> This implements a handful of CI methods just to get -kernel/-initrd
> working. In particular, this implements the device tree fetching and
> simple memory allocator - "claim" (an OF CI memory allocator) and updates
> "/memory@0/available" to report the client about available memory.
>
> This implements changing some device tree properties which we know how
> to deal with, the rest is ignored. To allow changes, this skips
> fdt_pack() when x-vof=on as not packing the blob leaves some room for
> appending.
>
> In absence of SLOF, this assigns phandles to device tree nodes to make
> device tree traversing work.
>
> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>
> This adds basic instances support which are managed by a hash map
> ihandle -> [phandle].
>
> Before the guest started, the used memory is:
> 0..e60 - the initial firmware
> 8000..10000 - stack
> 400000.. - kernel
> 3ea0000.. - initramdisk
>
> This OF CI does not implement "interpret".
>
> Unlike SLOF, this does not format uninitialized nvram. Instead, this
> includes a disk image with pre-formatted nvram.
>
> With this basic support, this can only boot into kernel directly.
> However this is just enough for the petitboot kernel and initradmdisk to
> boot from any possible source. Note this requires reasonably recent guest
> kernel with:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735
>
> The immediate benefit is much faster booting time which especially
> crucial with fully emulated early CPU bring up environments. Also this
> may come handy when/if GRUB-in-the-userspace sees light of the day.
>
> This separates VOF and sPAPR in a hope that VOF bits may be reused by
> other POWERPC boards which do not support pSeries.
>
> This make VOF optional, it is disabled by default, add --enable-vof
> to ./configure to enable it.
>
> This assumes potential support for booting from QEMU backends
> such as blockdev or netdev without devices/drivers used.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>
> The example command line is:
>
> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
> -nodefaults \
> -chardev stdio,id=STDIO0,signal=off,mux=on \
> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
> -mon id=MON0,chardev=STDIO0,mode=readline \
> -nographic \
> -vga none \
> -enable-kvm \
> -m 8G \
> -machine pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off \
> -kernel pbuild/kernel-le-guest/vmlinux \
> -initrd pb/rootfs.cpio.xz \
> -drive id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw \
> -global spapr-nvram.drive=DRIVE0 \
> -snapshot \
> -smp 8,threads=8 \
> -L /home/aik/t/qemu-ppc64-bios/ \
> -trace events=qemu_trace_events \
> -d guest_errors \
> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
> -mon chardev=SOCKET0,mode=control
>
> ---
> Changes:
> v21:
> * s/ld/ldz/ in entry.S
> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to Kconfig
> * made CONFIG_VOF optional
> * s/l.lds/vof.lds/
> * force 32 BE in spapr_machine_reset() instead of the firmware
> * added checks for non-null methods of VofMachineIfClass
> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better comment
> * added  path_offset wrapper for handling mixed case for addresses
> after "@" in node names
> * changed getprop() to check for actual "name" property in the fdt
> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike similar
> rtas_ld/ldl_be_*) they return error codes
> * VOF_MEM_READ uses now address_space_read (it was address_space_read_full
> before, not sure why)
>
> v20:
> * compile vof.bin with -mcpu=power4 for better compatibility
> * s/std/stw/ in entry.S to make it work on ppc32
> * fixed dt_available property to support both 32 and 64bit
> * shuffled prom_args handling code
> * do not enforce 32bit in MSR (again, to support 32bit platforms)
>
> v19:
> * put bootargs in the FDT
> * moved setting properties to a VOF machine hook
> * moved fw_size and claim for it to vof_init()
> * added CROSS to the VOF's makefile
> * simplified phandles assigning
> * pass MachineState to all machine hooks instead of calling
> qdev_get_machine (following QOM)
> * bunch of smaller changes and added comments
> * added simple test to attempt to start with x-vof=on
>
> v18:
> * fixed top addr (max address for "claim") on radix - it equals to ram_size
> and vof->top_addr was uint32_t
> * fixed "available" property which got broken in v14 but it is only visible
> to clients which care (== grub)
> * reshuffled vof_dt_memory_available() calls, added vof_init() to allow
> vof_claim() before rendering the FDT
>
> v17:
> * mv hw/ppc/vof.h include/hw/ppc/vof.h
> * VofMachineIfClass -> VofMachineClass; it is not VofMachineInterface as
> nobody used this scheme, usually "Interface" is dropped, a couple of times
> it is "xxxInterfaceClass" or "xxxIfClass", I used the latter as it is
> used by include/hw/vmstate-if.h
> * added SPDX
> * other fixes from v16 review
>
> v16:
> * rebased on dwg/ppc-for-6.1
> * s/SpaprVofInterface/VofMachineInterface/
>
> v15:
> * bugfix: claimed memory for the VOF itself
> * ditched OF_STACK_ADDR and allocate one instead, now it starts from 0x8000
> because it is aligned to its size (no particular reason though)
> * coding style
> * moved nvram.bin up one level
> * ditched bool in the firmware
> * made debugging code conditional using trace_event_get_state() + qemu_loglevel_mask()
> * renamed the CAS interface to SpaprVofInterface
> * added "write" which for now dumps the message and ihandle via
> trace point for early debug assistance
> * commented on when we allocate of_instances in vof_build_dt()
> * store fw_size in SpaprMachine to let spapr_vof_reset() claim it
> * many small fixes from v14's review
>
> v14:
> * check for truncates in readstr()
> * ditched a separate vof_reset()
> * spapr->vof is a pointer now, dropped the "on" field
> * removed rtas_base from vof and updated comment why we allow setting it
> * added myself to maintainers
> * updated commit log about blockdev and other possible platforms
> * added a note why new hcall is 0x5
> * no in place endianness conversion in spapr_h_vof_client
> * converted all cpu_physical_memory_read/write to address_space_rw
> * git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c
>
> v13:
> * rebase on latest ppc-for-6.0
> * shuffled code around to touch spapr.c less
>
> v12:
> * split VOF and SPAPR
>
> v11:
> * added g_autofree
> * fixed gcc warnings
> * fixed few leaks
> * added nvram image to make "nvram --print-config" not crash;
> Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum size
> is 16K, or it just does not work (empty output from "nvram")
>
> v10:
> * now rebased to compile with meson
>
> v9:
> * remove special handling of /rtas/rtas-size as now we always add it in QEMU
> * removed leftovers from scsi/grub/stdout/stdin/...
>
> v8:
> * no read/write/seek
> * no @dev in instances
> * the machine flag is "x-vof" for now
>
> v7:
> * now we have a small firmware which loads at 0 as SLOF and starts from
> 0x100 as SLOF
> * no MBR/ELF/GRUB business in QEMU anymore
> * blockdev is a separate patch
> * networking is a separate patch
>
> v6:
> * borrowed a big chunk of commit log introduction from David
> * fixed initial stack pointer (points to the highest address of stack)
> * traces for "interpret" and others
> * disabled  translate_kernel_address() hack so grub can load (work in
> progress)
> * added "milliseconds" for grub
> * fixed "claim" allocator again
> * moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too for CAS
> * moved the most code possible from spapr.c to spapr_of_client.c, such as
> RTAS, prom entry and FDT build/finalize
> * separated blobs
> * GRUB now proceeds to its console prompt (there are still other issues)
> * parse MBR/GPT to find PReP and load GRUB
>
> v5:
> * made instances keep device and chardev pointers
> * removed VIO dependencies
> * print error if RTAS memory is not claimed as it should have been
> * pack FDT as "quiesce"
>
> v4:
> * fixed open
> * validate ihandles in "call-method"
>
> v3:
> * fixed phandles allocation
> * s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
> * fixed size of /chosen/stdout
> * bunch of renames
> * do not create rtas properties at all, let the client deal with it;
> instead setprop allows changing these in the FDT
> * no more packing FDT when bios=off - nobody needs it and getprop does not
> work otherwise
> * allow updating initramdisk device tree properties (for zImage)
> * added instances
> * fixed stdout on OF's "write"
> * removed special handling for stdout in OF client, spapr-vty handles it
> instead
>
> v2:
> * fixed claim()
> * added "setprop"
> * cleaner client interface and RTAS blobs management
> * boots to petitboot and further to the target system
> * more trace points
>
> v20
>
> v20!
> ---
> configure               |    9 +
> pc-bios/vof/Makefile    |   23 +
> include/hw/ppc/spapr.h  |   25 +-
> include/hw/ppc/vof.h    |   55 ++
> pc-bios/vof/vof.h       |   43 ++
> hw/ppc/spapr.c          |   87 +++-
> hw/ppc/spapr_hcall.c    |   29 +-
> hw/ppc/spapr_vof.c      |  153 ++++++
> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
> pc-bios/vof/bootmem.c   |   14 +
> pc-bios/vof/ci.c        |   91 ++++
> pc-bios/vof/libc.c      |   92 ++++
> pc-bios/vof/main.c      |   21 +
> tests/qtest/rtas-test.c |   17 +-
> MAINTAINERS             |   12 +
> hw/ppc/Kconfig          |    3 +
> hw/ppc/meson.build      |    3 +
> hw/ppc/trace-events     |   24 +
> meson.build             |    1 +
> pc-bios/README          |    2 +
> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
> pc-bios/vof/entry.S     |   49 ++
> pc-bios/vof/vof.lds     |   48 ++
> 24 files changed, 1840 insertions(+), 13 deletions(-)
> create mode 100644 pc-bios/vof/Makefile
> create mode 100644 include/hw/ppc/vof.h
> create mode 100644 pc-bios/vof/vof.h
> create mode 100644 hw/ppc/spapr_vof.c
> create mode 100644 hw/ppc/vof.c
> create mode 100644 pc-bios/vof/bootmem.c
> create mode 100644 pc-bios/vof/ci.c
> create mode 100644 pc-bios/vof/libc.c
> create mode 100644 pc-bios/vof/main.c
> create mode 100644 pc-bios/vof-nvram.bin
> create mode 100755 pc-bios/vof.bin
> create mode 100644 pc-bios/vof/entry.S
> create mode 100644 pc-bios/vof/vof.lds
>
> diff --git a/configure b/configure
> index 8dcb9965b24e..00dc29c027fa 100755
> --- a/configure
> +++ b/configure
> @@ -445,6 +445,7 @@ fuse="auto"
> fuse_lseek="auto"
> multiprocess="auto"
> slirp_smbd="$default_feature"
> +vof="no"
>
> malloc_trim="auto"
> gio="$default_feature"
> @@ -1561,6 +1562,10 @@ for opt do
>   ;;
>   --disable-slirp-smbd) slirp_smbd=no
>   ;;
> +  --enable-vof) vof=yes
> +  ;;
> +  --disable-vof) vof=no
> +  ;;
>   *)
>       echo "ERROR: unknown option $opt"
>       echo "Try '$0 --help' for more information"
> @@ -1940,6 +1945,7 @@ disabled with --disable-FEATURE, default is enabled if available
>   multiprocess    Out of process device emulation support
>   gio             libgio support
>   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
> +  vof             Virtual Open Firmware support (powerpc/pseries, experimental)
>
> NOTE: The object files are built at the place where configure is launched
> EOF
> @@ -5555,6 +5561,9 @@ if test "$slirp_smbd" = "yes" ; then
>   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
>   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
> fi
> +if test "$vof" = "yes" ; then
> +  echo "CONFIG_VOF=y" >> $config_host_mak
> +fi
> if test "$vde" = "yes" ; then
>   echo "CONFIG_VDE=y" >> $config_host_mak
>   echo "VDE_LIBS=$vde_libs" >> $config_host_mak

In case I could not explain it clearly in my previous message: I think the
solution we want here is to drop these configure changes and let Kconfig
configure this. The CONFIG_VOF option decides if vof itself is built (adds
vof.c), and pegasos2 will select it, so it will usually be yes by default.
Your problem is that you're trying to use this variable in spapr to make
it off by default, but that does not work. You need to add another option
for that (e.g. CONFIG_VOF_SPAPR or CONFIG_SPAPR_VOF, whichever makes more
sense); then you can set that to no even though CONFIG_VOF is yes, and use
that variable in the spapr files and to add spapr_vof.c. Then no configure
option is needed, which does not even work for me: I get compile errors
saying 'poisoning existing macro "CONFIG_VOF"' if I try with --enable-vof,
or spapr fails to build if I try without --enable-vof but select CONFIG_VOF
from pegasos2. I hope this makes sense now.

> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
> new file mode 100644
> index 000000000000..aa1678c4d889
> --- /dev/null
> +++ b/pc-bios/vof/Makefile
> @@ -0,0 +1,23 @@
> +all: build-all
> +
> +build-all: vof.bin
> +
> +CROSS ?=
> +CC = $(CROSS)gcc
> +LD = $(CROSS)ld
> +OBJCOPY = $(CROSS)objcopy
> +
> +%.o: %.S
> +	$(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
> +
> +%.o: %.c
> +	$(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o $@ $<
> +
> +vof.elf: entry.o main.o ci.o bootmem.o libc.o
> +	$(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
> +
> +%.bin: %.elf
> +	$(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
> +
> +clean:
> +	rm -f *.o vof.bin vof.elf *~
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index f05219f75ef6..39b5581ae650 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -12,6 +12,9 @@
> #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
> #include "hw/ppc/xics.h"        /* For ICSState */
> #include "hw/ppc/spapr_tpm_proxy.h"
> +#ifdef CONFIG_VOF
> +#include "hw/ppc/vof.h"
> +#endif
>
> struct SpaprVioBus;
> struct SpaprPhbState;
> @@ -180,6 +183,9 @@ struct SpaprMachineState {
>     uint64_t kernel_addr;
>     uint32_t initrd_base;
>     long initrd_size;
> +#ifdef CONFIG_VOF

So this can't be CONFIG_VOF here if you want to be able to set it to no
despite pegasos2 pulling in VOF. You need another SPAPR specific option
for that in the spapr specific parts, with CONFIG_VOF selecting whether vof
itself is built if any board uses it. So CONFIG_PEGASOS2 has to select
CONFIG_VOF, and your SPAPR_VOF option should too if it's enabled; that way
vof.c will be added if either board is built, but for SPAPR only if its VOF
option is on.

> +    Vof *vof;
> +#endif
>     uint64_t rtc_offset; /* Now used only during incoming migration */
>     struct PPCTimebase tb;
>     bool has_graphics;
> @@ -558,7 +564,9 @@ struct SpaprMachineState {
> /* Client Architecture support */
> #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>
> /*
>  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
> @@ -956,4 +964,19 @@ bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize,
> void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
> hwaddr spapr_get_rtas_addr(void);
> bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
> +
> +#ifdef CONFIG_VOF
> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
> +                     target_ulong *stack_ptr, Error **errp);
> +void spapr_vof_quiesce(MachineState *ms);
> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char *propname,
> +                       void *val, int vallen);
> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                target_ulong opcode, target_ulong *args);
> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
> +                                                   CPUState *cs,
> +                                                   target_ulong ovec_addr);
> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
> +#endif
> +
> #endif /* HW_SPAPR_H */
> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
> new file mode 100644
> index 000000000000..65ca2fed0d41
> --- /dev/null
> +++ b/include/hw/ppc/vof.h
> @@ -0,0 +1,55 @@
> +/*
> + * Virtual Open Firmware
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +#ifndef HW_VOF_H
> +#define HW_VOF_H
> +
> +typedef struct Vof {
> +    uint64_t top_addr; /* copied from rma_size */
> +    GArray *claimed; /* array of SpaprOfClaimed */
> +    uint64_t claimed_base;
> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
> +    uint32_t of_instance_last;
> +    char *bootargs;
> +    long fw_size;
> +} Vof;
> +
> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
> +                    target_ulong args_real);
> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t align);
> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
> +void vof_cleanup(Vof *vof);
> +void vof_build_dt(void *fdt, Vof *vof);
> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
> +                               const char *prop, const char *path);
> +
> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
> +
> +typedef struct VofMachineIfClass VofMachineIfClass;
> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, TYPE_VOF_MACHINE_IF)
> +
> +struct VofMachineIfClass {
> +    InterfaceClass parent;
> +    target_ulong (*client_architecture_support)(MachineState *ms, CPUState *cs,
> +                                                target_ulong vec);
> +    void (*quiesce)(MachineState *ms);
> +    bool (*setprop)(MachineState *ms, const char *path, const char *propname,
> +                    void *val, int vallen);
> +};
> +
> +/*
> + * Initial stack size is from
> + * https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html

I wonder if it's better to quote the section number and the title of the 
doc in case the URL here goes away in the future.

> + */
> +#define VOF_STACK_SIZE       0x8000
> +
> +#define VOF_MEM_READ(pa, buf, size) \
> +    address_space_read(&address_space_memory, \
> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
> +#define VOF_MEM_WRITE(pa, buf, size) \
> +    address_space_write(&address_space_memory, \
> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
> +
> +#endif /* HW_VOF_H */
> diff --git a/pc-bios/vof/vof.h b/pc-bios/vof/vof.h
> new file mode 100644
> index 000000000000..2d8958076907
> --- /dev/null
> +++ b/pc-bios/vof/vof.h
> @@ -0,0 +1,43 @@
> +/*
> + * Virtual Open Firmware
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +#include <stdarg.h>
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned short uint16_t;
> +typedef unsigned long uint32_t;
> +typedef unsigned long long uint64_t;
> +#define NULL (0)
> +#define PROM_ERROR (-1u)
> +typedef unsigned long ihandle;
> +typedef unsigned long phandle;
> +typedef int size_t;
> +typedef void client(void);
> +
> +/* globals */
> +extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) */
> +
> +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
> +
> +/* libc */
> +int strlen(const char *s);
> +int strcmp(const char *s1, const char *s2);
> +void *memcpy(void *dest, const void *src, size_t n);
> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
> +void *memmove(void *dest, const void *src, size_t n);
> +void *memset(void *dest, int c, size_t size);
> +
> +/* CI wrappers */
> +void ci_panic(const char *str);
> +phandle ci_finddevice(const char *path);
> +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len);
> +
> +/* booting from -kernel */
> +void boot_from_memory(uint64_t initrd, uint64_t initrdsize);
> +
> +/* Entry points for CI and RTAS */
> +extern uint32_t ci_entry(uint32_t params);
> +extern unsigned long hv_rtas(unsigned long params);
> +extern unsigned int hv_rtas_size;
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 4dd90b75cc52..6d747d72c614 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -101,6 +101,7 @@
> #define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
> #define FW_MAX_SIZE             0x400000
> #define FW_FILE_NAME            "slof.bin"
> +#define FW_FILE_NAME_VOF        "vof.bin"
> #define FW_OVERHEAD             0x2800000
> #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
>
> @@ -1639,22 +1640,40 @@ static void spapr_machine_reset(MachineState *machine)
>     fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;
>
>     fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
> +#ifdef CONFIG_VOF
> +    if (spapr->vof) {
> +        target_ulong stack_ptr = 0;
>
> -    rc = fdt_pack(fdt);
> +        spapr_vof_reset(spapr, fdt, &stack_ptr, &error_fatal);
>
> -    /* Should only fail if we've built a corrupted tree */
> -    assert(rc == 0);
> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
> +                                  stack_ptr, spapr->initrd_base,
> +                                  spapr->initrd_size);
> +        /* VOF is 32bit BE so enforce MSR here */
> +        first_ppc_cpu->env.msr &= ~((1ULL << MSR_SF) | (1ULL << MSR_LE));
> +        /*
> +         * Do not pack the FDT as the client may change properties.
> +         * VOF client does not expect the FDT so we do not load it to the VM.
> +         */
> +    } else
> +#endif
> +    {
> +        rc = fdt_pack(fdt);
> +        /* Should only fail if we've built a corrupted tree */
> +        assert(rc == 0);
>
> -    /* Load the fdt */
> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
> +                                  0, fdt_addr, 0);
> +        cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> +    }
>     qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> -    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> +
>     g_free(spapr->fdt_blob);
>     spapr->fdt_size = fdt_totalsize(fdt);
>     spapr->fdt_initial_size = spapr->fdt_size;
>     spapr->fdt_blob = fdt;
>
>     /* Set up the entry state */
> -    spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, fdt_addr, 0);
>     first_ppc_cpu->env.gpr[5] = 0;
>
>     spapr->fwnmi_system_reset_addr = -1;
> @@ -2657,7 +2676,12 @@ static void spapr_machine_init(MachineState *machine)
>     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
>     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
>     MachineClass *mc = MACHINE_GET_CLASS(machine);
> -    const char *bios_name = machine->firmware ?: FW_FILE_NAME;
> +    const char *bios_default =
> +#ifdef CONFIG_VOF
> +        !!spapr->vof ? FW_FILE_NAME_VOF :

Does !! make sense here? I think testing for non-0 does not need it so you 
could just write spapr->vof without !!.

> +#endif
> +        FW_FILE_NAME;
> +    const char *bios_name = machine->firmware ?: bios_default;
>     const char *kernel_filename = machine->kernel_filename;
>     const char *initrd_filename = machine->initrd_filename;
>     PCIHostState *phb;
> @@ -3014,6 +3038,12 @@ static void spapr_machine_init(MachineState *machine)
>     }
>
>     qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
> +#ifdef CONFIG_VOF
> +    if (spapr->vof) {
> +        spapr->vof->fw_size = fw_size; /* for claim() on itself */
> +        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
> +    }
> +#endif
> }
>
> #define DEFAULT_KVM_TYPE "auto"
> @@ -3204,6 +3234,30 @@ static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
>     }
> }
>
> +#ifdef CONFIG_VOF
> +static bool spapr_get_vof(Object *obj, Error **errp)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    return spapr->vof != NULL;
> +}
> +
> +static void spapr_set_vof(Object *obj, bool value, Error **errp)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    if (spapr->vof) {
> +        vof_cleanup(spapr->vof);
> +        g_free(spapr->vof);
> +        spapr->vof = NULL;
> +    }
> +    if (!value) {
> +        return;
> +    }
> +    spapr->vof = g_malloc0(sizeof(*spapr->vof));
> +}
> +#endif
> +
> static char *spapr_get_ic_mode(Object *obj, Error **errp)
> {
>     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> @@ -3329,6 +3383,12 @@ static void spapr_instance_init(Object *obj)
>                                     stringify(KERNEL_LOAD_ADDR)
>                                     " for -kernel is the default");
>     spapr->kernel_addr = KERNEL_LOAD_ADDR;
> +#ifdef CONFIG_VOF
> +    object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
> +    object_property_set_description(obj, "x-vof",
> +                                    "Enable Virtual Open Firmware (experimental)");
> +#endif
> +
>     /* The machine class defines the default interrupt controller mode */
>     spapr->irq = smc->irq;
>     object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
> @@ -4580,6 +4640,16 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>     smc->smp_threads_vsmt = true;
>     smc->nr_xirqs = SPAPR_NR_XIRQS;
>     xfc->match_nvt = spapr_match_nvt;
> +
> +#ifdef CONFIG_VOF
> +    {
> +        VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
> +        vmc->client_architecture_support =
> +            spapr_vof_client_architecture_support;
> +        vmc->quiesce = spapr_vof_quiesce;
> +        vmc->setprop = spapr_vof_setprop;
> +    }
> +#endif
> }
>
> static const TypeInfo spapr_machine_info = {
> @@ -4599,6 +4669,9 @@ static const TypeInfo spapr_machine_info = {
>         { TYPE_XICS_FABRIC },
>         { TYPE_INTERRUPT_STATS_PROVIDER },
>         { TYPE_XIVE_FABRIC },
> +#ifdef CONFIG_VOF
> +        { TYPE_VOF_MACHINE_IF },
> +#endif
>         { }
>     },
> };
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index f25014afda40..986a4de34128 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -1080,7 +1080,7 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu,
>     SpaprOptionVector *ov1_guest, *ov5_guest;
>     bool guest_radix;
>     bool raw_mode_supported = false;
> -    bool guest_xive;
> +    bool guest_xive, reset_fdt = false;
>     CPUState *cs;
>     void *fdt;
>     uint32_t max_compat = spapr->max_compat_pvr;
> @@ -1233,8 +1233,10 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu,
>         spapr_setup_hpt(spapr);
>     }
>
> -    fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
> -
> +#ifdef CONFIG_VOF
> +    reset_fdt = spapr->vof != NULL;

(Here when storing to a bool !! could make sense but what you have is
better as it's clearer so I'm not suggesting to use !! here either. It's
rarely useful, maybe only if you need a bool but do not have space to
write the condition or it would be more confusing that way.)

> +#endif
> +    fdt = spapr_build_fdt(spapr, reset_fdt, fdt_bufsize);
>     g_free(spapr->fdt_blob);
>     spapr->fdt_size = fdt_totalsize(fdt);
>     spapr->fdt_initial_size = spapr->fdt_size;
> @@ -1277,6 +1279,27 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
>     return ret;
> }
>
> +#ifdef CONFIG_VOF
> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
> +                                                   CPUState *cs,
> +                                                   target_ulong ovec_addr)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
> +
> +    target_ulong ret = do_client_architecture_support(POWERPC_CPU(cs), spapr,
> +                                                      ovec_addr, FDT_MAX_SIZE);
> +
> +    /*
> +     * This adds stdout and generates phandles for boottime and CAS FDTs.
> +     * It is alright to update the FDT here as do_client_architecture_support()
> +     * does not pack it.
> +     */
> +    spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob);
> +
> +    return ret;
> +}
> +#endif
> +
> static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
>                                               SpaprMachineState *spapr,
>                                               target_ulong opcode,
> diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c
> new file mode 100644
> index 000000000000..653d376f38aa
> --- /dev/null
> +++ b/hw/ppc/spapr_vof.c
> @@ -0,0 +1,153 @@
> +/*
> + * SPAPR machine hooks to Virtual Open Firmware,
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include <sys/ioctl.h>
> +#include "qapi/error.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_vio.h"
> +#include "hw/ppc/fdt.h"
> +#include "sysemu/sysemu.h"
> +#include "qom/qom-qobject.h"
> +#include "trace.h"
> +
> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                target_ulong opcode, target_ulong *_args)
> +{
> +    int ret = vof_client_call(MACHINE(spapr), spapr->vof, spapr->fdt_blob,
> +                              ppc64_phys_to_real(_args[0]));
> +
> +    if (ret) {
> +        return H_PARAMETER;
> +    }
> +    return H_SUCCESS;
> +}
> +
> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt)
> +{
> +    char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
> +    int chosen;
> +
> +    vof_build_dt(fdt, spapr->vof);
> +
> +    _FDT(chosen = fdt_path_offset(fdt, "/chosen"));
> +    _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
> +                            spapr->vof->bootargs ? : ""));
> +
> +    /*
> +     * SLOF-less setup requires an open instance of stdout for early
> +     * kernel printk. By now all phandles are settled so we can open
> +     * the default serial console.
> +     */
> +    if (stdout_path) {
> +        _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", "stdout",
> +                                   stdout_path));
> +    }
> +}
> +
> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
> +                     target_ulong *stack_ptr, Error **errp)
> +{
> +    Vof *vof = spapr->vof;
> +
> +    vof_init(vof, spapr->rma_size, errp);
> +
> +    *stack_ptr = vof_claim(vof, 0, VOF_STACK_SIZE, VOF_STACK_SIZE);
> +    if (*stack_ptr == -1) {
> +        error_setg(errp, "Memory allocation for stack failed");
> +        return;
> +    }
> +    /* Stack grows downwards plus reserve space for the minimum stack frame */
> +    *stack_ptr += VOF_STACK_SIZE - 0x20;
> +
> +    if (spapr->kernel_size &&
> +        vof_claim(vof, spapr->kernel_addr, spapr->kernel_size, 0) == -1) {
> +        error_setg(errp, "Memory for kernel is in use");
> +        return;
> +    }
> +
> +    if (spapr->initrd_size &&
> +        vof_claim(vof, spapr->initrd_base, spapr->initrd_size, 0) == -1) {
> +        error_setg(errp, "Memory for initramdisk is in use");
> +        return;
> +    }
> +
> +    spapr_vof_client_dt_finalize(spapr, fdt);
> +
> +    /*
> +     * At this point the expected allocation map is:
> +     *
> +     * 0..c38 - the initial firmware
> +     * 8000..10000 - stack
> +     * 400000.. - kernel
> +     * 3ea0000.. - initramdisk
> +     *
> +     * We skip writing FDT as nothing expects it; OF client interface is
> +     * going to be used for reading the device tree.
> +     */
> +}
> +
> +void spapr_vof_quiesce(MachineState *ms)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
> +
> +    spapr->fdt_size = fdt_totalsize(spapr->fdt_blob);
> +    spapr->fdt_initial_size = spapr->fdt_size;
> +}
> +
> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char *propname,
> +                       void *val, int vallen)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
> +
> +    /*
> +     * We only allow changing properties which we know how to update in QEMU
> +     * OR
> +     * the ones which we know that they need to survive during "quiesce".
> +     */
> +
> +    if (strcmp(path, "/rtas") == 0) {
> +        if (strcmp(propname, "linux,rtas-base") == 0 ||
> +            strcmp(propname, "linux,rtas-entry") == 0) {
> +            /* These need to survive quiesce so let them store in the FDT */
> +            return true;
> +        }
> +    }
> +
> +    if (strcmp(path, "/chosen") == 0) {
> +        if (strcmp(propname, "bootargs") == 0) {
> +            Vof *vof = spapr->vof;
> +
> +            g_free(vof->bootargs);
> +            vof->bootargs = g_strndup(val, vallen);
> +            return true;
> +        }
> +        if (strcmp(propname, "linux,initrd-start") == 0) {
> +            if (vallen == sizeof(uint32_t)) {
> +                spapr->initrd_base = ldl_be_p(val);
> +                return true;
> +            }
> +            if (vallen == sizeof(uint64_t)) {
> +                spapr->initrd_base = ldq_be_p(val);
> +                return true;
> +            }
> +            return false;
> +        }
> +        if (strcmp(propname, "linux,initrd-end") == 0) {
> +            if (vallen == sizeof(uint32_t)) {
> +                spapr->initrd_size = ldl_be_p(val) - spapr->initrd_base;
> +                return true;
> +            }
> +            if (vallen == sizeof(uint64_t)) {
> +                spapr->initrd_size = ldq_be_p(val) - spapr->initrd_base;
> +                return true;
> +            }
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}
> diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
> new file mode 100644
> index 000000000000..1068a1e58388
> --- /dev/null
> +++ b/hw/ppc/vof.c
> @@ -0,0 +1,1052 @@
> +/*
> + * QEMU PowerPC Virtual Open Firmware.
> + *
> + * This implements client interface from OpenFirmware IEEE1275 on the QEMU
> + * side to leave only a very basic firmware in the VM.
> + *
> + * Copyright (c) 2021 IBM Corporation.
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include "qemu/timer.h"
> +#include "qemu/range.h"
> +#include "qemu/units.h"
> +#include "qapi/error.h"
> +#include <sys/ioctl.h>
> +#include "exec/ram_addr.h"
> +#include "exec/address-spaces.h"
> +#include "hw/ppc/vof.h"
> +#include "hw/ppc/fdt.h"
> +#include "sysemu/runstate.h"
> +#include "qom/qom-qobject.h"
> +#include "trace.h"
> +
> +#include <libfdt.h>
> +
> +/*
> + * OF 1275 "nextprop" description suggests is it 32 bytes max but
> + * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 chars long.
> + */
> +#define OF_PROPNAME_LEN_MAX 64
> +
> +#define VOF_MAX_PATH        256
> +#define VOF_MAX_SETPROPLEN  2048
> +#define VOF_MAX_METHODLEN   256
> +#define VOF_MAX_FORTHCODE   256
> +#define VOF_VTY_BUF_SIZE    256
> +
> +typedef struct {
> +    uint64_t start;
> +    uint64_t size;
> +} OfClaimed;
> +
> +typedef struct {
> +    char *path; /* the path used to open the instance */
> +    uint32_t phandle;
> +} OfInstance;
> +
> +static int readstr(hwaddr pa, char *buf, int size)
> +{
> +    if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) {
> +        return -1;
> +    }
> +    if (strnlen(buf, size) == size) {
> +        buf[size - 1] = '\0';
> +        trace_vof_error_str_truncated(buf, size);
> +        return -1;
> +    }
> +    return 0;
> +}
> +
> +static bool cmpservice(const char *s, unsigned nargs, unsigned nret,
> +                       const char *s1, unsigned nargscheck, unsigned nretcheck)
> +{
> +    if (strcmp(s, s1)) {
> +        return false;
> +    }
> +    if ((nargscheck && (nargs != nargscheck)) ||
> +        (nretcheck && (nret != nretcheck))) {
> +        trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret);
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +static void prop_format(char *tval, int tlen, const void *prop, int len)
> +{
> +    int i;
> +    const unsigned char *c;
> +    char *t;
> +    const char bin[] = "...";
> +
> +    for (i = 0, c = prop; i < len; ++i, ++c) {
> +        if (*c == '\0' && i == len - 1) {
> +            strncpy(tval, prop, tlen - 1);
> +            return;
> +        }
> +        if (*c < 0x20 || *c >= 0x80) {
> +            break;
> +        }
> +    }
> +
> +    for (i = 0, c = prop, t = tval; i < len; ++i, ++c) {
> +        if (t >= tval + tlen - sizeof(bin) - 1 - 2 - 1) {
> +            strcpy(t, bin);
> +            return;
> +        }
> +        if (i && i % 4 == 0 && i != len - 1) {
> +            strcat(t, " ");
> +            ++t;
> +        }
> +        t += sprintf(t, "%02X", *c & 0xFF);
> +    }
> +}
> +
> +static int get_path(const void *fdt, int offset, char *buf, int len)
> +{
> +    int ret;
> +
> +    ret = fdt_get_path(fdt, offset, buf, len - 1);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    buf[len - 1] = '\0';
> +
> +    return strlen(buf) + 1;
> +}
> +
> +static int phandle_to_path(const void *fdt, uint32_t ph, char *buf, int len)
> +{
> +    int ret;
> +
> +    ret = fdt_node_offset_by_phandle(fdt, ph);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return get_path(fdt, ret, buf, len);
> +}
> +
> +static int path_offset(const void *fdt, const char *path)
> +{
> +    g_autofree char *p = NULL;
> +    char *at;
> +
> +    /*
> +     * The addresses in node names are expected to in the lower case as per

There's a grammar problem with this sentence. I think it should be "are
expected to be in lower case", but ask a native speaker.

> +     * https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html
> +     */
> +    at = strchr(path, '@');
> +    if (!at) {
> +        return fdt_path_offset(fdt, path);
> +    }
> +
> +    p = g_strdup(path);
> +    for (at = at - path + p + 1; *at; ++at) {
> +        *at = tolower(*at);
> +    }
> +    return fdt_path_offset(fdt, p);
> +}
> +
> +static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr)
> +{
> +    char fullnode[VOF_MAX_PATH];
> +    uint32_t ret = -1;
> +    int offset;
> +
> +    if (readstr(nodeaddr, fullnode, sizeof(fullnode))) {
> +        return (uint32_t) ret;
> +    }
> +
> +    offset = path_offset(fdt, fullnode);
> +    if (offset >= 0) {
> +        ret = fdt_get_phandle(fdt, offset);
> +    }
> +    trace_vof_finddevice(fullnode, ret);
> +    return (uint32_t) ret;
> +}
> +
> +static const void *getprop(const void *fdt, int nodeoff, const char *propname,
> +                           int *proplen, bool *write0)
> +{
> +    const char *unit, *prop;
> +    const void *ret = fdt_getprop(fdt, nodeoff, propname, proplen);
> +
> +    if (ret) {
> +        if (write0) {
> +            *write0 = false;
> +        }
> +        return ret;
> +    }
> +
> +    /*
> +     * The "name" property is not actually expected as a property in the FDT
> +     * (although some platform may create those in "/" so we try getprop first),

Not only in "/" but anywhere. MorphOS walks the tree with nextprop and
expects to get a name property for most nodes without ever explicitly
querying "name". I've tested this with both the board firmware and VOF;
with the board firmware a name property appears in most nodes but not
all, so I think at least SmartFirmware does the same: it explicitly sets
name on some nodes and otherwise returns the name from the path if such a
property does not exist but is queried. With this in VOF I can do the same
and get the same results, so the change should be OK, but the comment may
be misleading now. Better to just say we return a value for "name" from
the path if it is queried but the property does not exist, which seems to
be what OF does too.

> +     * we emulate it by returning a pointer to the node's name and adjust
> +     * proplen to include only the name but not the unit.
> +     */
> +    if (strcmp(propname, "name")) {
> +        return NULL;
> +    }
> +    prop = fdt_get_name(fdt, nodeoff, proplen);
> +    if (!prop) {
> +        *proplen = 0;
> +        return NULL;
> +    }
> +
> +    unit = memchr(prop, '@', *proplen);
> +    if (unit) {
> +        *proplen = unit - prop;
> +    }
> +    *proplen += 1;
> +
> +    /*
> +     * Since it might be cut at "@" and there will be no trailing zero
> +     * in the prop buffer, tell the caller to write zero at the end.
> +     */
> +    if (write0) {
> +        *write0 = true;
> +    }
> +    return prop;
> +}
> +
> +static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, uint32_t pname,
> +                            uint32_t valaddr, uint32_t vallen)
> +{
> +    char propname[OF_PROPNAME_LEN_MAX + 1];
> +    uint32_t ret = 0;
> +    int proplen = 0;
> +    const void *prop;
> +    char trval[64] = "";
> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
> +    bool write0;
> +
> +    if (nodeoff < 0) {
> +        return -1;
> +    }
> +    if (readstr(pname, propname, sizeof(propname))) {
> +        return -1;
> +    }
> +    prop = getprop(fdt, nodeoff, propname, &proplen, &write0);
> +    if (prop) {
> +        const char zero = 0;
> +        int cb = MIN(proplen, vallen);
> +
> +        if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK ||
> +            /* if that was "name" with a unit address, overwrite '@' with '0' */
> +            (write0 &&
> +             cb == proplen &&
> +             VOF_MEM_WRITE(valaddr + cb - 1, &zero, 1) != MEMTX_OK)) {
> +            ret = -1;
> +        } else {
> +            /*
> +             * OF1275 says:
> +             * "Size is either the actual size of the property, or -1 if name
> +             * does not exist", hence returning proplen instead of cb.
> +             */
> +            ret = proplen;
> +            /* Do not format a value if tracepoint is silent, for performance */
> +            if (trace_event_get_state(TRACE_VOF_GETPROP) &&
> +                qemu_loglevel_mask(LOG_TRACE)) {
> +                prop_format(trval, sizeof(trval), prop, ret);
> +            }
> +        }
> +    } else {
> +        ret = -1;
> +    }
> +    trace_vof_getprop(nodeph, propname, ret, trval);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, uint32_t pname)
> +{
> +    char propname[OF_PROPNAME_LEN_MAX + 1];
> +    uint32_t ret = 0;
> +    int proplen = 0;
> +    const void *prop;
> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
> +
> +    if (nodeoff < 0) {
> +        return -1;
> +    }
> +    if (readstr(pname, propname, sizeof(propname))) {
> +        return -1;
> +    }
> +    prop = getprop(fdt, nodeoff, propname, &proplen, NULL);
> +    if (prop) {
> +        ret = proplen;
> +    } else {
> +        ret = -1;
> +    }
> +    trace_vof_getproplen(nodeph, propname, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_setprop(MachineState *ms, void *fdt, Vof *vof,
> +                            uint32_t nodeph, uint32_t pname,
> +                            uint32_t valaddr, uint32_t vallen)
> +{
> +    char propname[OF_PROPNAME_LEN_MAX + 1];
> +    uint32_t ret = -1;
> +    int offset;
> +    char trval[64] = "";
> +    char nodepath[VOF_MAX_PATH] = "";
> +    Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF);
> +    g_autofree char *val = NULL;
> +
> +    if (vallen > VOF_MAX_SETPROPLEN) {
> +        goto trace_exit;
> +    }
> +    if (readstr(pname, propname, sizeof(propname))) {
> +        goto trace_exit;
> +    }
> +    offset = fdt_node_offset_by_phandle(fdt, nodeph);
> +    if (offset < 0) {
> +        goto trace_exit;
> +    }
> +    ret = get_path(fdt, offset, nodepath, sizeof(nodepath));
> +    if (ret <= 0) {
> +        goto trace_exit;
> +    }
> +
> +    val = g_malloc0(vallen);
> +    if (VOF_MEM_READ(valaddr, val, vallen) != MEMTX_OK) {
> +        goto trace_exit;
> +    }
> +
> +    if (vmo) {
> +        VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo);
> +
> +        if (vmc->setprop &&
> +            !vmc->setprop(ms, nodepath, propname, val, vallen)) {
> +            goto trace_exit;
> +        }
> +    }
> +
> +    ret = fdt_setprop(fdt, offset, propname, val, vallen);
> +    if (ret) {
> +        goto trace_exit;
> +    }
> +
> +    if (trace_event_get_state(TRACE_VOF_SETPROP) &&
> +        qemu_loglevel_mask(LOG_TRACE)) {
> +        prop_format(trval, sizeof(trval), val, vallen);
> +    }
> +    ret = vallen;
> +
> +trace_exit:
> +    trace_vof_setprop(nodeph, propname, trval, vallen, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_nextprop(const void *fdt, uint32_t phandle,
> +                             uint32_t prevaddr, uint32_t nameaddr)
> +{
> +    int offset, nodeoff = fdt_node_offset_by_phandle(fdt, phandle);
> +    char prev[OF_PROPNAME_LEN_MAX + 1];
> +    const char *tmp;
> +
> +    if (readstr(prevaddr, prev, sizeof(prev))) {
> +        return -1;
> +    }
> +
> +    fdt_for_each_property_offset(offset, fdt, nodeoff) {
> +        if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
> +            return 0;
> +        }
> +        if (prev[0] == '\0' || strcmp(prev, tmp) == 0) {
> +            if (prev[0] != '\0') {
> +                offset = fdt_next_property_offset(fdt, offset);
> +                if (offset < 0) {
> +                    return 0;
> +                }
> +            }
> +            if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
> +                return 0;
> +            }
> +
> +            if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != MEMTX_OK) {
> +                return -1;
> +            }
> +            return 1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static uint32_t vof_peer(const void *fdt, uint32_t phandle)
> +{
> +    int ret;
> +
> +    if (phandle == 0) {
> +        ret = fdt_path_offset(fdt, "/");
> +    } else {
> +        ret = fdt_next_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle));
> +    }
> +
> +    if (ret < 0) {
> +        ret = 0;
> +    } else {
> +        ret = fdt_get_phandle(fdt, ret);
> +    }
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_child(const void *fdt, uint32_t phandle)
> +{
> +    int ret = fdt_first_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle));
> +
> +    if (ret < 0) {
> +        ret = 0;
> +    } else {
> +        ret = fdt_get_phandle(fdt, ret);
> +    }
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_parent(const void *fdt, uint32_t phandle)
> +{
> +    int ret = fdt_parent_offset(fdt, fdt_node_offset_by_phandle(fdt, phandle));
> +
> +    if (ret < 0) {
> +        ret = 0;
> +    } else {
> +        ret = fdt_get_phandle(fdt, ret);
> +    }
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_do_open(void *fdt, Vof *vof, int offset, const char *path)
> +{
> +    uint32_t ret = -1;
> +    OfInstance *inst = NULL;
> +
> +    if (vof->of_instance_last == 0xFFFFFFFF) {
> +        /* We do not recycle ihandles yet */
> +        goto trace_exit;
> +    }
> +
> +    inst = g_new0(OfInstance, 1);
> +    inst->phandle = fdt_get_phandle(fdt, offset);
> +    g_assert(inst->phandle);
> +    ++vof->of_instance_last;
> +
> +    inst->path = g_strdup(path);
> +    g_hash_table_insert(vof->of_instances,
> +                        GINT_TO_POINTER(vof->of_instance_last),
> +                        inst);
> +    ret = vof->of_instance_last;
> +
> +trace_exit:
> +    trace_vof_open(path, inst ? inst->phandle : 0, ret);
> +
> +    return ret;
> +}
> +
> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
> +                               const char *prop, const char *path)
> +{
> +    int node = fdt_path_offset(fdt, nodename);
> +    int inst, offset;
> +
> +    offset = fdt_path_offset(fdt, path);
> +    if (offset < 0) {
> +        trace_vof_error_unknown_path(path);
> +        return offset;
> +    }
> +
> +    inst = vof_do_open(fdt, vof, offset, path);
> +
> +    return fdt_setprop_cell(fdt, node, prop, inst);
> +}
> +
> +static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr)
> +{
> +    char path[VOF_MAX_PATH];
> +    int offset;
> +
> +    if (readstr(pathaddr, path, sizeof(path))) {
> +        return -1;
> +    }
> +
> +    offset = path_offset(fdt, path);
> +    if (offset < 0) {
> +        trace_vof_error_unknown_path(path);
> +        return offset;
> +    }
> +
> +    return vof_do_open(fdt, vof, offset, path);
> +}
> +
> +static void vof_close(Vof *vof, uint32_t ihandle)
> +{
> +    if (!g_hash_table_remove(vof->of_instances, GINT_TO_POINTER(ihandle))) {
> +        trace_vof_error_unknown_ihandle_close(ihandle);
> +    }
> +}
> +
> +static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle)
> +{
> +    gpointer instp = g_hash_table_lookup(vof->of_instances,
> +                                         GINT_TO_POINTER(ihandle));
> +    uint32_t ret = -1;
> +
> +    if (instp) {
> +        ret = ((OfInstance *)instp)->phandle;
> +    }
> +    trace_vof_instance_to_package(ihandle, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle,
> +                                    uint32_t buf, uint32_t len)
> +{
> +    uint32_t ret = -1;
> +    char tmp[VOF_MAX_PATH] = "";
> +
> +    ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
> +    if (ret > 0) {
> +        if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
> +            ret = -1;
> +        }
> +    }
> +
> +    trace_vof_package_to_path(phandle, tmp, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t ihandle,
> +                                     uint32_t buf, uint32_t len)
> +{
> +    uint32_t ret = -1;
> +    uint32_t phandle = vof_instance_to_package(vof, ihandle);
> +    char tmp[VOF_MAX_PATH] = "";
> +
> +    if (phandle != -1) {
> +        ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
> +        if (ret > 0) {
> +            if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
> +                ret = -1;
> +            }
> +        }
> +    }
> +    trace_vof_instance_to_path(ihandle, phandle, tmp, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_write(Vof *vof, uint32_t ihandle, uint32_t buf,
> +                          uint32_t len)
> +{
> +    char tmp[VOF_VTY_BUF_SIZE];
> +    unsigned cb;
> +    OfInstance *inst = (OfInstance *)
> +        g_hash_table_lookup(vof->of_instances, GINT_TO_POINTER(ihandle));
> +
> +    if (!inst) {
> +        trace_vof_error_write(ihandle);
> +        return -1;
> +    }
> +
> +    for ( ; len > 0; len -= cb) {
> +        cb = MIN(len, sizeof(tmp) - 1);
> +        if (VOF_MEM_READ(buf, tmp, cb) != MEMTX_OK) {
> +            return -1;
> +        }
> +
> +        /* FIXME: there is no backend(s) yet so just call a trace */
> +        if (trace_event_get_state(TRACE_VOF_WRITE) &&
> +            qemu_loglevel_mask(LOG_TRACE)) {
> +            tmp[cb] = '\0';
> +            trace_vof_write(ihandle, cb, tmp);
> +        }
> +    }
> +
> +    return len;
> +}
> +
> +static void vof_claimed_dump(GArray *claimed)
> +{
> +    int i;
> +    OfClaimed c;
> +
> +    if (trace_event_get_state(TRACE_VOF_CLAIMED) &&
> +        qemu_loglevel_mask(LOG_TRACE)) {
> +
> +        for (i = 0; i < claimed->len; ++i) {
> +            c = g_array_index(claimed, OfClaimed, i);
> +            trace_vof_claimed(c.start, c.start + c.size, c.size);
> +        }
> +    }
> +}
> +
> +static bool vof_claim_avail(GArray *claimed, uint64_t virt, uint64_t size)
> +{
> +    int i;
> +    OfClaimed c;
> +
> +    for (i = 0; i < claimed->len; ++i) {
> +        c = g_array_index(claimed, OfClaimed, i);
> +        if (ranges_overlap(c.start, c.size, virt, size)) {
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}
> +
> +static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t size)
> +{
> +    OfClaimed newclaim;
> +
> +    newclaim.start = virt;
> +    newclaim.size = size;
> +    g_array_append_val(claimed, newclaim);
> +}
> +
> +static gint of_claimed_compare_func(gconstpointer a, gconstpointer b)
> +{
> +    return ((OfClaimed *)a)->start - ((OfClaimed *)b)->start;
> +}
> +
> +static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base)
> +{
> +    int i, n, offset, proplen = 0, sc, ac;
> +    target_ulong mem0_end;
> +    const uint8_t *mem0_reg;
> +    g_autofree uint8_t *avail = NULL;
> +    uint8_t *availcur;
> +
> +    if (!fdt || !claimed) {
> +        return;
> +    }
> +
> +    offset = fdt_path_offset(fdt, "/");
> +    _FDT(offset);
> +    ac = fdt_address_cells(fdt, offset);
> +    g_assert(ac == 1 || ac == 2);
> +    sc = fdt_size_cells(fdt, offset);
> +    g_assert(sc == 1 || sc == 2);
> +
> +    offset = fdt_path_offset(fdt, "/memory@0");
> +    _FDT(offset);
> +
> +    mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen);
> +    g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc));
> +    if (sc == 2) {
> +        mem0_end = be64_to_cpu(*(uint64_t *)(mem0_reg + sizeof(uint32_t) * ac));
> +    } else {
> +        mem0_end = be32_to_cpu(*(uint32_t *)(mem0_reg + sizeof(uint32_t) * ac));
> +    }
> +
> +    g_array_sort(claimed, of_claimed_compare_func);
> +    vof_claimed_dump(claimed);
> +
> +    /*
> +     * VOF resides in the first page so we do not need to check if there is
> +     * available memory before the first claimed block
> +     */
> +    g_assert(claimed->len && (g_array_index(claimed, OfClaimed, 0).start == 0));
> +
> +    avail = g_malloc0(sizeof(uint32_t) * (ac + sc) * claimed->len);
> +    for (i = 0, n = 0, availcur = avail; i < claimed->len; ++i) {
> +        OfClaimed c = g_array_index(claimed, OfClaimed, i);
> +        uint64_t start, size;
> +
> +        start = c.start + c.size;
> +        if (i < claimed->len - 1) {
> +            OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1);
> +
> +            size = cn.start - start;
> +        } else {
> +            size = mem0_end - start;
> +        }
> +
> +        if (ac == 2) {
> +            *(uint64_t *) availcur = cpu_to_be64(start);
> +        } else {
> +            *(uint32_t *) availcur = cpu_to_be32(start);
> +        }
> +        availcur += sizeof(uint32_t) * ac;
> +        if (sc == 2) {
> +            *(uint64_t *) availcur = cpu_to_be64(size);
> +        } else {
> +            *(uint32_t *) availcur = cpu_to_be32(size);
> +        }
> +        availcur += sizeof(uint32_t) * sc;
> +
> +        if (size) {
> +            trace_vof_avail(c.start + c.size, c.start + c.size + size, size);
> +            ++n;
> +        }
> +    }
> +    _FDT((fdt_setprop(fdt, offset, "available", avail, availcur - avail)));
> +}
> +
> +/*
> + * OF1275:
> + * "Allocates size bytes of memory. If align is zero, the allocated range
> + * begins at the virtual address virt. Otherwise, an aligned address is
> + * automatically chosen and the input argument virt is ignored".
> + *
> + * In other words, exactly one of @virt and @align is non-zero.
> + */
> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size,
> +                   uint64_t align)
> +{
> +    uint64_t ret;
> +
> +    if (size == 0) {
> +        ret = -1;
> +    } else if (align == 0) {
> +        if (!vof_claim_avail(vof->claimed, virt, size)) {
> +            ret = -1;
> +        } else {
> +            ret = virt;
> +        }
> +    } else {
> +        vof->claimed_base = QEMU_ALIGN_UP(vof->claimed_base, align);
> +        while (1) {
> +            if (vof->claimed_base >= vof->top_addr) {
> +                error_report("Out of RMA memory for the OF client");
> +                return -1;
> +            }
> +            if (vof_claim_avail(vof->claimed, vof->claimed_base, size)) {
> +                break;
> +            }
> +            vof->claimed_base += size;
> +        }
> +        ret = vof->claimed_base;
> +    }
> +
> +    if (ret != -1) {
> +        vof->claimed_base = MAX(vof->claimed_base, ret + size);
> +        vof_claim_add(vof->claimed, ret, size);
> +    }
> +    trace_vof_claim(virt, size, align, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_release(Vof *vof, uint64_t virt, uint64_t size)
> +{
> +    uint32_t ret = -1;
> +    int i;
> +    GArray *claimed = vof->claimed;
> +    OfClaimed c;
> +
> +    for (i = 0; i < claimed->len; ++i) {
> +        c = g_array_index(claimed, OfClaimed, i);
> +        if (c.start == virt && c.size == size) {
> +            g_array_remove_index(claimed, i);
> +            ret = 0;
> +            break;
> +        }
> +    }
> +
> +    trace_vof_release(virt, size, ret);
> +
> +    return ret;
> +}
> +
> +static void vof_instantiate_rtas(Error **errp)
> +{
> +    error_setg(errp, "The firmware should have instantiated RTAS");
> +}
> +
> +static uint32_t vof_call_method(MachineState *ms, Vof *vof, uint32_t methodaddr,
> +                                uint32_t ihandle, uint32_t param1,
> +                                uint32_t param2, uint32_t param3,
> +                                uint32_t param4, uint32_t *ret2)
> +{
> +    uint32_t ret = -1;
> +    char method[VOF_MAX_METHODLEN] = "";
> +    OfInstance *inst;
> +
> +    if (!ihandle) {
> +        goto trace_exit;
> +    }
> +
> +    inst = (OfInstance *) g_hash_table_lookup(vof->of_instances,
> +                                              GINT_TO_POINTER(ihandle));

I think you should not have a space in type casts, but checkpatch does not
seem to mind. You have this in several places.

> +    if (!inst) {
> +        goto trace_exit;
> +    }
> +
> +    if (readstr(methodaddr, method, sizeof(method))) {
> +        goto trace_exit;
> +    }
> +
> +    if (strcmp(inst->path, "/") == 0) {
> +        if (strcmp(method, "ibm,client-architecture-support") == 0) {
> +            Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF);
> +
> +            if (vmo) {
> +                VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo);
> +
> +                g_assert(vmc->client_architecture_support);
> +                ret = vmc->client_architecture_support(ms, first_cpu, param1);
> +            }
> +
> +            *ret2 = 0;
> +        }
> +    } else if (strcmp(inst->path, "/rtas") == 0) {
> +        if (strcmp(method, "instantiate-rtas") == 0) {
> +            vof_instantiate_rtas(&error_fatal);
> +            ret = 0;
> +            *ret2 = param1; /* rtas-base */
> +        }
> +    } else {
> +        trace_vof_error_unknown_method(method);
> +    }
> +
> +trace_exit:
> +    trace_vof_method(ihandle, method, param1, ret, *ret2);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_call_interpret(uint32_t cmdaddr, uint32_t param1,
> +                                   uint32_t param2, uint32_t *ret2)
> +{
> +    uint32_t ret = -1;
> +    char cmd[VOF_MAX_FORTHCODE] = "";
> +
> +    /* No interpret implemented so just call a trace */
> +    readstr(cmdaddr, cmd, sizeof(cmd));
> +    trace_vof_interpret(cmd, param1, param2, ret, *ret2);
> +
> +    return ret;
> +}
> +
> +static void vof_quiesce(MachineState *ms, void *fdt, Vof *vof)
> +{
> +    Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF);
> +    /* After "quiesce", no change is expected to the FDT, pack FDT to ensure */
> +    int rc = fdt_pack(fdt);
> +
> +    assert(rc == 0);
> +
> +    if (vmo) {
> +        VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo);
> +
> +        if (vmc->quiesce) {
> +            vmc->quiesce(ms);
> +        }
> +    }
> +
> +    vof_claimed_dump(vof->claimed);
> +}
> +
> +static uint32_t vof_client_handle(MachineState *ms, void *fdt, Vof *vof,
> +                                  const char *service,
> +                                  uint32_t *args, unsigned nargs,
> +                                  uint32_t *rets, unsigned nrets)
> +{
> +    uint32_t ret = 0;
> +
> +    /* @nrets includes the value which this function returns */
> +#define cmpserv(s, a, r) \
> +    cmpservice(service, nargs, nrets, (s), (a), (r))
> +
> +    if (cmpserv("finddevice", 1, 1)) {
> +        ret = vof_finddevice(fdt, args[0]);
> +    } else if (cmpserv("getprop", 4, 1)) {
> +        ret = vof_getprop(fdt, args[0], args[1], args[2], args[3]);
> +    } else if (cmpserv("getproplen", 2, 1)) {
> +        ret = vof_getproplen(fdt, args[0], args[1]);
> +    } else if (cmpserv("setprop", 4, 1)) {
> +        ret = vof_setprop(ms, fdt, vof, args[0], args[1], args[2], args[3]);
> +    } else if (cmpserv("nextprop", 3, 1)) {
> +        ret = vof_nextprop(fdt, args[0], args[1], args[2]);
> +    } else if (cmpserv("peer", 1, 1)) {
> +        ret = vof_peer(fdt, args[0]);
> +    } else if (cmpserv("child", 1, 1)) {
> +        ret = vof_child(fdt, args[0]);
> +    } else if (cmpserv("parent", 1, 1)) {
> +        ret = vof_parent(fdt, args[0]);
> +    } else if (cmpserv("open", 1, 1)) {
> +        ret = vof_open(fdt, vof, args[0]);
> +    } else if (cmpserv("close", 1, 0)) {
> +        vof_close(vof, args[0]);
> +    } else if (cmpserv("instance-to-package", 1, 1)) {
> +        ret = vof_instance_to_package(vof, args[0]);
> +    } else if (cmpserv("package-to-path", 3, 1)) {
> +        ret = vof_package_to_path(fdt, args[0], args[1], args[2]);
> +    } else if (cmpserv("instance-to-path", 3, 1)) {
> +        ret = vof_instance_to_path(fdt, vof, args[0], args[1], args[2]);
> +    } else if (cmpserv("write", 3, 1)) {
> +        ret = vof_write(vof, args[0], args[1], args[2]);
> +    } else if (cmpserv("claim", 3, 1)) {
> +        ret = vof_claim(vof, args[0], args[1], args[2]);
> +        if (ret != -1) {
> +            vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
> +        }
> +    } else if (cmpserv("release", 2, 0)) {
> +        ret = vof_release(vof, args[0], args[1]);
> +        if (ret != -1) {
> +            vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
> +        }
> +    } else if (cmpserv("call-method", 0, 0)) {
> +        ret = vof_call_method(ms, vof, args[0], args[1], args[2], args[3],
> +                              args[4], args[5], rets);
> +    } else if (cmpserv("interpret", 0, 0)) {
> +        ret = vof_call_interpret(args[0], args[1], args[2], rets);
> +    } else if (cmpserv("milliseconds", 0, 1)) {
> +        ret = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> +    } else if (cmpserv("quiesce", 0, 0)) {
> +        vof_quiesce(ms, fdt, vof);
> +    } else if (cmpserv("exit", 0, 0)) {
> +        error_report("Stopped as the VM requested \"exit\"");
> +        vm_stop(RUN_STATE_PAUSED);
> +    } else {
> +        trace_vof_error_unknown_service(service, nargs, nrets);
> +        ret = -1;
> +    }
> +
> +    return ret;
> +}
> +
> +/* Defined as Big Endian */
> +struct prom_args {
> +    uint32_t service;
> +    uint32_t nargs;
> +    uint32_t nret;
> +    uint32_t args[10];
> +} QEMU_PACKED;
> +
> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
> +                    target_ulong args_real)
> +{
> +    struct prom_args args_be;
> +    uint32_t args[ARRAY_SIZE(args_be.args)];
> +    uint32_t rets[ARRAY_SIZE(args_be.args)] = { 0 }, ret;
> +    char service[64];
> +    unsigned nargs, nret, i;
> +
> +    if (address_space_rw(&address_space_memory, args_real,
> +                         MEMTXATTRS_UNSPECIFIED, &args_be, sizeof(args_be),
> +                         false) != MEMTX_OK) {
> +        return -EINVAL;
> +    }
> +    nargs = be32_to_cpu(args_be.nargs);
> +    if (nargs >= ARRAY_SIZE(args_be.args)) {
> +        return -EINVAL;
> +    }
> +
> +    if (address_space_rw(&address_space_memory, be32_to_cpu(args_be.service),
> +                         MEMTXATTRS_UNSPECIFIED, service, sizeof(service),
> +                         false) != MEMTX_OK) {
> +        return -EINVAL;
> +    }
> +    if (strnlen(service, sizeof(service)) == sizeof(service)) {
> +        /* Too long service name */
> +        return -EINVAL;
> +    }
> +
> +    for (i = 0; i < nargs; ++i) {
> +        args[i] = be32_to_cpu(args_be.args[i]);
> +    }
> +
> +    nret = be32_to_cpu(args_be.nret);
> +    ret = vof_client_handle(ms, fdt, vof, service, args, nargs, rets, nret);
> +    if (!nret) {
> +        return 0;
> +    }
> +
> +    args_be.args[nargs] = cpu_to_be32(ret);
> +    for (i = 1; i < nret; ++i) {
> +        args_be.args[nargs + i] = cpu_to_be32(rets[i - 1]);
> +    }
> +
> +    if (address_space_rw(&address_space_memory,
> +                         args_real + offsetof(struct prom_args, args[nargs]),
> +                         MEMTXATTRS_UNSPECIFIED, args_be.args + nargs,
> +                         sizeof(args_be.args[0]) * nret, true) != MEMTX_OK) {
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static void vof_instance_free(gpointer data)
> +{
> +    OfInstance *inst = (OfInstance *) data;

Remove space in cast?

> +
> +    g_free(inst->path);
> +    g_free(inst);
> +}
> +
> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp)
> +{
> +    vof_cleanup(vof);
> +
> +    vof->of_instances = g_hash_table_new_full(g_direct_hash, g_direct_equal,
> +                                              NULL, vof_instance_free);
> +    vof->claimed = g_array_new(false, false, sizeof(OfClaimed));
> +
> +    /* Keep allocations in 32bit as CLI ABI can only return cells==32bit */
> +    vof->top_addr = MIN(top_addr, 4 * GiB);
> +    if (vof_claim(vof, 0, vof->fw_size, 0) == -1) {
> +        error_setg(errp, "Memory for firmware is in use");
> +    }
> +}
> +
> +void vof_cleanup(Vof *vof)
> +{
> +    if (vof->claimed) {
> +        g_array_unref(vof->claimed);
> +    }
> +    if (vof->of_instances) {
> +        g_hash_table_unref(vof->of_instances);
> +    }
> +    vof->claimed = NULL;
> +    vof->of_instances = NULL;
> +}
> +
> +void vof_build_dt(void *fdt, Vof *vof)
> +{
> +    uint32_t phandle = fdt_get_max_phandle(fdt);
> +    int offset, proplen = 0;
> +    const void *prop;
> +
> +    /* Assign phandles to nodes without predefined phandles (like XICS/XIVE) */
> +    for (offset = fdt_next_node(fdt, -1, NULL);
> +         offset >= 0;
> +         offset = fdt_next_node(fdt, offset, NULL)) {
> +        prop = fdt_getprop(fdt, offset, "phandle", &proplen);
> +        if (prop) {
> +            continue;
> +        }
> +        ++phandle;
> +        _FDT(fdt_setprop_cell(fdt, offset, "phandle", phandle));
> +    }
> +
> +    vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
> +}
> +
> +static const TypeInfo vof_machine_if_info = {
> +    .name = TYPE_VOF_MACHINE_IF,
> +    .parent = TYPE_INTERFACE,
> +    .class_size = sizeof(VofMachineIfClass),
> +};
> +
> +static void vof_machine_if_register_types(void)
> +{
> +    type_register_static(&vof_machine_if_info);
> +}
> +type_init(vof_machine_if_register_types)
> diff --git a/pc-bios/vof/bootmem.c b/pc-bios/vof/bootmem.c
> new file mode 100644
> index 000000000000..771b9e95f95d
> --- /dev/null
> +++ b/pc-bios/vof/bootmem.c
> @@ -0,0 +1,14 @@
> +#include "vof.h"
> +
> +void boot_from_memory(uint64_t initrd, uint64_t initrdsize)
> +{
> +    uint64_t kern[2];
> +    phandle chosen = ci_finddevice("/chosen");
> +
> +    if (ci_getprop(chosen, "qemu,boot-kernel", kern, sizeof(kern)) !=
> +        sizeof(kern)) {
> +        return;
> +    }
> +
> +    do_boot(kern[0], initrd, initrdsize);
> +}
> diff --git a/pc-bios/vof/ci.c b/pc-bios/vof/ci.c
> new file mode 100644
> index 000000000000..a80806580dd0
> --- /dev/null
> +++ b/pc-bios/vof/ci.c
> @@ -0,0 +1,91 @@
> +#include "vof.h"
> +
> +struct prom_args {
> +    uint32_t service;
> +    uint32_t nargs;
> +    uint32_t nret;
> +    uint32_t args[10];
> +};
> +
> +typedef unsigned long prom_arg_t;
> +
> +#define ADDR(x) (uint32_t)(x)
> +
> +static int prom_handle(struct prom_args *pargs)
> +{
> +    void *rtasbase;
> +    uint32_t rtassize = 0;
> +    phandle rtas;
> +
> +    if (strcmp("call-method", (void *)(unsigned long) pargs->service)) {

Remove space from cast?

> +        return -1;
> +    }
> +
> +    if (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0])) {

Ditto.

> +        return -1;
> +    }
> +
> +    rtas = ci_finddevice("/rtas");
> +    /* rtas-size is set by QEMU depending of FWNMI support */
> +    ci_getprop(rtas, "rtas-size", &rtassize, sizeof(rtassize));
> +    if (rtassize < hv_rtas_size) {
> +        return -1;
> +    }
> +
> +    rtasbase = (void *)(unsigned long) pargs->args[2];
> +
> +    memcpy(rtasbase, hv_rtas, hv_rtas_size);
> +    pargs->args[pargs->nargs] = 0;
> +    pargs->args[pargs->nargs + 1] = pargs->args[2];
> +
> +    return 0;
> +}
> +
> +void prom_entry(uint32_t args)
> +{
> +    if (prom_handle((void *)(unsigned long) args)) {
> +        ci_entry(args);
> +    }
> +}
> +
> +static int call_ci(const char *service, int nargs, int nret, ...)
> +{
> +    int i;
> +    struct prom_args args;
> +    va_list list;
> +
> +    args.service = ADDR(service);
> +    args.nargs = nargs;
> +    args.nret = nret;
> +
> +    va_start(list, nret);
> +    for (i = 0; i < nargs; i++) {
> +        args.args[i] = va_arg(list, prom_arg_t);
> +    }
> +    va_end(list);
> +
> +    for (i = 0; i < nret; i++) {
> +        args.args[nargs + i] = 0;
> +    }
> +
> +    if (ci_entry((uint32_t)(&args)) < 0) {
> +        return PROM_ERROR;
> +    }
> +
> +    return (nret > 0) ? args.args[nargs] : 0;
> +}
> +
> +void ci_panic(const char *str)
> +{
> +    call_ci("exit", 0, 0);
> +}
> +
> +phandle ci_finddevice(const char *path)
> +{
> +    return call_ci("finddevice", 1, 1, path);
> +}
> +
> +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len)
> +{
> +    return call_ci("getprop", 4, 1, ph, propname, prop, len);
> +}
> diff --git a/pc-bios/vof/libc.c b/pc-bios/vof/libc.c
> new file mode 100644
> index 000000000000..00c10e6e7da1
> --- /dev/null
> +++ b/pc-bios/vof/libc.c
> @@ -0,0 +1,92 @@
> +#include "vof.h"
> +
> +int strlen(const char *s)
> +{
> +    int len = 0;
> +
> +    while (*s != 0) {
> +        len += 1;
> +        s += 1;
> +    }
> +
> +    return len;
> +}
> +
> +int strcmp(const char *s1, const char *s2)
> +{
> +    while (*s1 != 0 && *s2 != 0) {
> +        if (*s1 != *s2) {
> +            break;
> +        }
> +        s1 += 1;
> +        s2 += 1;
> +    }
> +
> +    return *s1 - *s2;
> +}
> +
> +void *memcpy(void *dest, const void *src, size_t n)
> +{
> +    char *cdest;
> +    const char *csrc = src;
> +
> +    cdest = dest;
> +    while (n-- > 0) {
> +        *cdest++ = *csrc++;
> +    }
> +
> +    return dest;
> +}
> +
> +int memcmp(const void *ptr1, const void *ptr2, size_t n)
> +{
> +    const unsigned char *p1 = ptr1;
> +    const unsigned char *p2 = ptr2;
> +
> +    while (n-- > 0) {
> +        if (*p1 != *p2) {
> +            return *p1 - *p2;
> +        }
> +        p1 += 1;
> +        p2 += 1;
> +    }
> +
> +    return 0;
> +}
> +
> +void *memmove(void *dest, const void *src, size_t n)
> +{
> +    char *cdest;
> +    const char *csrc;
> +    int i;
> +
> +    /* Do the buffers overlap in a bad way? */
> +    if (src < dest && src + n >= dest) {
> +        /* Copy from end to start */
> +        cdest = dest + n - 1;
> +        csrc = src + n - 1;
> +        for (i = 0; i < n; i++) {
> +            *cdest-- = *csrc--;
> +        }
> +    } else {
> +        /* Normal copy is possible */
> +        cdest = dest;
> +        csrc = src;
> +        for (i = 0; i < n; i++) {
> +            *cdest++ = *csrc++;
> +        }
> +    }
> +
> +    return dest;
> +}
> +
> +void *memset(void *dest, int c, size_t size)
> +{
> +    unsigned char *d = (unsigned char *)dest;
> +
> +    while (size-- > 0) {
> +        *d++ = (unsigned char)c;
> +    }
> +
> +    return dest;
> +}
> diff --git a/pc-bios/vof/main.c b/pc-bios/vof/main.c
> new file mode 100644
> index 000000000000..9fc30d2d0957
> --- /dev/null
> +++ b/pc-bios/vof/main.c
> @@ -0,0 +1,21 @@
> +#include "vof.h"
> +
> +void do_boot(unsigned long addr, unsigned long _r3, unsigned long _r4)
> +{
> +    register unsigned long r3 __asm__("r3") = _r3;
> +    register unsigned long r4 __asm__("r4") = _r4;
> +    register unsigned long r5 __asm__("r5") = (unsigned long) _prom_entry;
> +
> +    ((client *)(uint32_t)addr)();
> +}
> +
> +void entry_c(void)
> +{
> +    register unsigned long r3 __asm__("r3");
> +    register unsigned long r4 __asm__("r4");
> +    register unsigned long r5 __asm__("r5");
> +    uint64_t initrd = r3, initrdsize = r4;
> +
> +    boot_from_memory(initrd, initrdsize);
> +    ci_panic("*** No boot target ***\n");
> +}
> diff --git a/tests/qtest/rtas-test.c b/tests/qtest/rtas-test.c
> index 16751dbd2f55..5b8e3d240921 100644
> --- a/tests/qtest/rtas-test.c
> +++ b/tests/qtest/rtas-test.c
> @@ -5,7 +5,7 @@
> #include "libqos/libqos-spapr.h"
> #include "libqos/rtas.h"
>
> -static void test_rtas_get_time_of_day(void)
> +static void run_test_rtas_get_time_of_day(const char *machine)
> {
>     QOSState *qs;
>     struct tm tm;
> @@ -13,7 +13,7 @@ static void test_rtas_get_time_of_day(void)
>     uint64_t ret;
>     time_t t1, t2;
>
> -    qs = qtest_spapr_boot("-machine pseries");
> +    qs = qtest_spapr_boot(machine);
>
>     t1 = time(NULL);
>     ret = qrtas_get_time_of_day(qs->qts, &qs->alloc, &tm, &ns);
> @@ -24,6 +24,18 @@ static void test_rtas_get_time_of_day(void)
>     qtest_shutdown(qs);
> }
>
> +static void test_rtas_get_time_of_day(void)
> +{
> +    run_test_rtas_get_time_of_day("-machine pseries");
> +}
> +
> +static void test_rtas_get_time_of_day_vof(void)
> +{
> +#ifdef CONFIG_VOF
> +    run_test_rtas_get_time_of_day("-machine pseries,x-vof=on");
> +#endif
> +}
> +
> int main(int argc, char *argv[])
> {
>     const char *arch = qtest_get_arch();
> @@ -35,6 +47,7 @@ int main(int argc, char *argv[])
>         exit(EXIT_FAILURE);
>     }
>     qtest_add_func("rtas/get-time-of-day", test_rtas_get_time_of_day);
> +    qtest_add_func("rtas/get-time-of-day-vof", test_rtas_get_time_of_day_vof);
>
>     return g_test_run();
> }
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 7d9cd2904264..6fb202f99e90 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1356,6 +1356,18 @@ F: hw/pci-host/mv64361.c
> F: hw/pci-host/mv643xx.h
> F: include/hw/pci-host/mv64361.h
>
> +Virtual Open Firmware (VOF)
> +M: Alexey Kardashevskiy <aik@ozlabs.ru>
> +M: David Gibson <david@gibson.dropbear.id.au>
> +M: Greg Kurz <groug@kaod.org>
> +L: qemu-ppc@nongnu.org
> +S: Maintained
> +F: hw/ppc/spapr_vof*
> +F: hw/ppc/vof*
> +F: include/hw/ppc/vof*
> +F: pc-bios/vof/*
> +F: pc-bios/vof*
> +
> RISC-V Machines
> ---------------
> OpenTitan
> diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig
> index 66e0b15d9efd..b895720b28b2 100644
> --- a/hw/ppc/Kconfig
> +++ b/hw/ppc/Kconfig
> @@ -144,3 +144,6 @@ config FW_CFG_PPC
>
> config FDT_PPC
>     bool
> +
> +config VOF
> +    bool
> diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build
> index 597d974dd4ff..aa4c8e6a2eac 100644
> --- a/hw/ppc/meson.build
> +++ b/hw/ppc/meson.build
> @@ -84,4 +84,7 @@ ppc_ss.add(when: 'CONFIG_VIRTEX', if_true: files('virtex_ml507.c'))
> # Pegasos2
> ppc_ss.add(when: 'CONFIG_PEGASOS2', if_true: files('pegasos2.c'))
>
> +ppc_ss.add(when: 'CONFIG_VOF', if_true: files('vof.c'))
> +ppc_ss.add(when: ['CONFIG_VOF', 'CONFIG_PSERIES'], if_true: files('spapr_vof.c'))
> +
> hw_arch += {'ppc': ppc_ss}
> diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
> index 0ba3e403533f..6e90a0107247 100644
> --- a/hw/ppc/trace-events
> +++ b/hw/ppc/trace-events
> @@ -71,6 +71,30 @@ spapr_rtas_ibm_configure_connector_invalid(uint32_t index) "DRC index: 0x%"PRIx3
> spapr_vio_h_reg_crq(uint64_t reg, uint64_t queue_addr, uint64_t queue_len) "CRQ for dev 0x%" PRIx64 " registered at 0x%" PRIx64 "/0x%" PRIx64
> spapr_vio_free_crq(uint32_t reg) "CRQ for dev 0x%" PRIx32 " freed"
>
> +# vof.c
> +vof_error_str_truncated(const char *s, int len) "%s truncated to %d"
> +vof_error_param(const char *method, int nargscheck, int nretcheck, int nargs, int nret) "%s takes/returns %d/%d, not %d/%d"
> +vof_error_unknown_service(const char *service, int nargs, int nret) "\"%s\" args=%d rets=%d"
> +vof_error_unknown_method(const char *method) "\"%s\""
> +vof_error_unknown_ihandle_close(uint32_t ih) "ih=0x%x"
> +vof_error_unknown_path(const char *path) "\"%s\""
> +vof_error_write(uint32_t ih) "ih=0x%x"
> +vof_finddevice(const char *path, uint32_t ph) "\"%s\" => ph=0x%x"
> +vof_claim(uint32_t virt, uint32_t size, uint32_t align, uint32_t ret) "virt=0x%x size=0x%x align=0x%x => 0x%x"
> +vof_release(uint32_t virt, uint32_t size, uint32_t ret) "virt=0x%x size=0x%x => 0x%x"
> +vof_method(uint32_t ihandle, const char *method, uint32_t param, uint32_t ret, uint32_t ret2) "ih=0x%x \"%s\"(0x%x) => 0x%x 0x%x"
> +vof_getprop(uint32_t ph, const char *prop, uint32_t ret, const char *val) "ph=0x%x \"%s\" => len=%d [%s]"
> +vof_getproplen(uint32_t ph, const char *prop, uint32_t ret) "ph=0x%x \"%s\" => len=%d"
> +vof_setprop(uint32_t ph, const char *prop, const char *val, uint32_t vallen, uint32_t ret) "ph=0x%x \"%s\" [%s] len=%d => ret=%d"
> +vof_open(const char *path, uint32_t ph, uint32_t ih) "%s ph=0x%x => ih=0x%x"
> +vof_interpret(const char *cmd, uint32_t param1, uint32_t param2, uint32_t ret, uint32_t ret2) "[%s] 0x%x 0x%x => 0x%x 0x%x"
> +vof_package_to_path(uint32_t ph, const char *tmp, uint32_t ret) "ph=0x%x => %s len=%d"
> +vof_instance_to_path(uint32_t ih, uint32_t ph, const char *tmp, uint32_t ret) "ih=0x%x ph=0x%x => %s len=%d"
> +vof_instance_to_package(uint32_t ih, uint32_t ph) "ih=0x%x => ph=0x%x"
> +vof_write(uint32_t ih, unsigned cb, const char *msg) "ih=0x%x [%u] \"%s\""
> +vof_avail(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx64" size=0x%"PRIx64
> +vof_claimed(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx64" size=0x%"PRIx64
> +
> # ppc.c
> ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
>
> diff --git a/meson.build b/meson.build
> index 626cf932c1e5..ed011adca89a 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2700,6 +2700,7 @@ summary_info += {'pixman':            pixman.found()}
> summary_info += {'VTE support':       config_host.has_key('CONFIG_VTE')}
> # TODO: add back version
> summary_info += {'slirp support':     slirp_opt == 'disabled' ? false : slirp_opt}
> +summary_info += {'VOF support':        config_host.has_key('CONFIG_VOF')}
> summary_info += {'libtasn1':          config_host.has_key('CONFIG_TASN1')}
> summary_info += {'PAM':               config_host.has_key('CONFIG_AUTH_PAM')}
> summary_info += {'iconv support':     iconv.found()}
> diff --git a/pc-bios/README b/pc-bios/README
> index c101c9a04f8f..6e6556e91c92 100644
> --- a/pc-bios/README
> +++ b/pc-bios/README
> @@ -16,6 +16,8 @@
>   https://github.com/aik/SLOF, and the image currently in qemu is
>   built from git tag qemu-slof-20210217.
>
> +- vof is a minimalistic firmware to work with -machine pseries,x-vof=on.

Maybe this should say what VOF stands for and that it implements a minimal
Open Firmware client interface; otherwise this one sentence does not help
much for those who have no idea what VOF is. As this is a README, it can be
assumed that readers will not know.

That's all I could find. I could not test it, as it does not compile for me
with --enable-vof, so I'm waiting for the next version, which drops this
configure option and splits CONFIG_VOF off into an SPAPR-specific option as I
suggested above; I think that should work and resolve this problem.

Regards,
BALATON Zoltan

> +
> - sgabios (the Serial Graphics Adapter option ROM) provides a means for
>   legacy x86 software to communicate with an attached serial console as
>   if a video card were attached.  The master sources reside in a subversion
> diff --git a/pc-bios/vof-nvram.bin b/pc-bios/vof-nvram.bin
> new file mode 100644
> index 0000000000000000000000000000000000000000..d183901cf980a91d81c4348bb20487c7bb62a2ec
> GIT binary patch
> literal 16384
> zcmeI%Jx;?g6bEpZJ8*)oSZeqZi&Z2pKnD)sI4{AHlNb4;RW}a70XPHaW57uo=-#R7
> zKSLBhJJ0sdixY3IuY@hzo0r$OmE%T;XE9uh@s1k=AOHafKmY;|fB*y_009U<00Izz
> z00bZa0SG_<0uX=z1Rwwb2tWV=XCbip6d#B4{{rX#XR%}$Bm^J;0SG|gWP$!?Aq=-I
> zcT+0Ix{{?1q>9J8r+eW^JK1tYYZZMWQCUwW%0S*~w^p@wfkX-<yRFx)H*+YEt0RRd
> zmn}6xtwbP`yp4O=>kxMAEA<~5@*g)@mb%KD5!;O~8c)>8rRQBx55=trhk#+1+T3J_
> zaf*G4vZAduqy$qda{``6Gnc2DQg<Es<GLxL#9<Oj*zP!8ZSnwf@-j7l47!nFXQO$a
> z^Hes6YU^_M<KsM*k~zwOSa+2g3Sx{*Eyu^XrB0FM5IJ-*?8`VvpBc4}vS(+_UKJ;=
> xITAns0uX=z1Rwwb2tWV=5P-nt34DD||Nni|VfbXeJORuY0uX=z1R!vE0>7B^s4f5i
>
> literal 0
> HcmV?d00001
>
> diff --git a/pc-bios/vof.bin b/pc-bios/vof.bin
> new file mode 100755
> index 0000000000000000000000000000000000000000..1ec670be82134adcb5ae128732aff6e371281360
> GIT binary patch
> literal 3784
> zcmeHIL1-Lh6n>lC=%gLW9QLr#l}v1e-6f$B_K?xg-PA2?k`e+EC^WLWX18WB$*##L
> zqwKgdDcGd6vY{31LDNGBdJsJHpr=+43D`ppJ*-fhdMkMGXwf=;Z|0vS*)$~>4}u-|
> znVJ88^S<}K_ue-||L<!cO_V?RF=dHv`_EUqoW_-3XsZ68Z7oY}m-C>g{vfz^9((;=
> zR2{WQtrd?N131V}{??u$da-4X{5^achQIqYsco$hpzbT*Q3SrDwbWtb<O;Rd*W1=o
> z_bBk{z02W7outMJ$&H(&Sap)E6;kXI7MU-N7<P&zGag(UwL)DSk5(tuI!vLOgt&fm
> zen0=K^QT!jYj7O<!@IOvD<3tMQzUCD5$;lL=@`F<#k*p09GetIED~aovlM!0eQ1TU
> z6#Q}@F>HvOlgK%SoO^cXd?|*UBR(E-jM#A^vwOq<4r<c=3CGyeK{U9)aXJp_bXVBP
> z!VdAY&pg5I27E|}^$q4#OOeuli$ct$^|>W;PMn9Gb2(NRw{|F-PFs(0bb%9Qj!*~J
> zMtsklfRFZNVH1L%$aq|N#Y_x`wK#mF!RZi@e~!HX+t|YtwmXGw%(G=X=D<8QVxX>l
> z2dnGFJDe|YCH6Ps{Pq5JZV<OL;ygOFkNWkA`rSeO)=|IPyX$vx3Rq{*vlHmqd}EWQ
> z8xB>MJ4pv;#&V8o>5r(pnIfy1qs3;O8qF2@w0V=RH#g~vrbA!<)JfkU)-x{0DVo4t
> z!FwKe?!(}<6s4IP+y_y^CiXM9^LbYE4eCh6CD_F}==njLy1{dn-?a`3j7!g*xYkK&
> zzc#RC{~2wFLWnP8do_~yT<Zj9UQHBHJBjNCED1iD(+Ttl4O-6CZP-biP2f!8+Pi*W
> zEBZj$)(5S=^1|2m$Vc{S^~YD7JNJ9qCYAA8{c)cn2Z`}Tzib?=Ulv!nU;3{T^;<-k
> znaJrMUS2@o1oHH6=*(h<{X<6>BLIv5FrpsDqtCcEMIHy#vkP|_$JbBvVi^4KtSOcQ
> zE#1No0bdpPFYFWlmY9VMW}zRmFtB?TE?B^i0lx>c&<Fe^a0op9%yFT{tUrp-kB~Qq
> zPMq=4w@P|}deBFTU8k@)b(Zy>-;?I4*A3P=Gx!>2`Qf)*dr_D4Q|i9q>n*T-edRfc
> z!tYierf}x<cLe*9CG?itdkYS0tg9caN3H)B_;&waKHu=0Wcwv=mSpzhYUu*ncqg7L
> z@MPCs;6d9tF84cuf5Xqyw=)y&KhU)?<6~of&$Pt0BX86qidyg-MKQyS8R1!mZ`KG|
> zOSrS`#==?w-IA7up*3C}sy5^}%Q;fr|Bvl#@b?0@m6N@2T|6HjxMO(U#S`-97$xSd
> zHBX=0$2x>OwsF`R<LWkD*k6n}m-~qL_caiQ3T~pTU%0#b&{Xb6Rf{_g@9~+diCcTF
> zJ##E{Q9;~&9>?I&<6hD_@Vt{_MdYHumUUSbnz)YdSyOlLx7+tq>$j(G_WMujh?u<j
> z83Jz3{U=1nhw*=Ta=$e9!hx(lyPh%dpJX7BNT{O~_038}SM|c&TP0n^)*+fKOi%aD
> zl=P`ek;=34dSO;C7xYr^oL-pcM2?FGzV*(RKPX|ZWR!J2J6E197EAAyC$R~K)wxQQ
> zj!#Zi=1a5mPHE=d9!BeZyEHdjnr3^VfJ}5m!HQueu`qf^#;~8qI*sQEtRhww%fPb!
> E2HT5NwEzGB
>
> literal 0
> HcmV?d00001
>
> diff --git a/pc-bios/vof/entry.S b/pc-bios/vof/entry.S
> new file mode 100644
> index 000000000000..10a101fb6d71
> --- /dev/null
> +++ b/pc-bios/vof/entry.S
> @@ -0,0 +1,49 @@
> +#define LOAD32(rn, name)    \
> +	lis     rn,name##@h;    \
> +	ori     rn,rn,name##@l
> +
> +#define ENTRY(func_name)    \
> +	.text;                  \
> +	.align  2;              \
> +	.globl  .func_name;     \
> +	.func_name:             \
> +	.globl  func_name;      \
> +	func_name:
> +
> +#define KVMPPC_HCALL_BASE       0xf000
> +#define KVMPPC_H_RTAS           (KVMPPC_HCALL_BASE + 0x0)
> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
> +
> +	. = 0x100 /* Do exactly as SLOF does */
> +
> +ENTRY(_start)
> +	LOAD32(2, __toc_start)
> +	b entry_c
> +
> +ENTRY(_prom_entry)
> +	LOAD32(2, __toc_start)
> +	stwu    %r1,-112(%r1)
> +	stw     %r31,104(%r1)
> +	mflr    %r31
> +	bl prom_entry
> +	nop
> +	mtlr    %r31
> +	lwz     %r31,104(%r1)
> +	addi    %r1,%r1,112
> +	blr
> +
> +ENTRY(ci_entry)
> +	mr	4,3
> +	LOAD32(3,KVMPPC_H_VOF_CLIENT)
> +	sc	1
> +	blr
> +
> +/* This is the actual RTAS blob copied to the OS at instantiate-rtas */
> +ENTRY(hv_rtas)
> +	mr      %r4,%r3
> +	LOAD32(3,KVMPPC_H_RTAS)
> +	sc	1
> +	blr
> +	.globl hv_rtas_size
> +hv_rtas_size:
> +	.long . - hv_rtas;
> diff --git a/pc-bios/vof/vof.lds b/pc-bios/vof/vof.lds
> new file mode 100644
> index 000000000000..1506ab4b0185
> --- /dev/null
> +++ b/pc-bios/vof/vof.lds
> @@ -0,0 +1,48 @@
> +OUTPUT_FORMAT("elf32-powerpc")
> +OUTPUT_ARCH(powerpc:common)
> +
> +/* set the entry point */
> +ENTRY ( __start )
> +
> +SECTIONS {
> +	__executable_start = .;
> +
> +	.text : {
> +		*(.text)
> +	}
> +
> +	__etext = .;
> +
> +	. = ALIGN(8);
> +
> +	.data : {
> +		*(.data)
> +		*(.rodata .rodata.*)
> +		*(.got1)
> +		*(.sdata)
> +		*(.opd)
> +	}
> +
> +	/* FIXME bss at end ??? */
> +
> +	. = ALIGN(8);
> +	__bss_start = .;
> +	.bss : {
> +		*(.sbss) *(.scommon)
> +		*(.dynbss)
> +		*(.bss)
> +	}
> +
> +	. = ALIGN(8);
> +	__bss_end = .;
> +	__bss_size = (__bss_end - __bss_start);
> +
> +	. = ALIGN(256);
> +	__toc_start = DEFINED (.TOC.) ? .TOC. : ADDR (.got) + 0x8000;
> +	.got :
> +	{
> +		 *(.toc .got)
> +	}
> +	. = ALIGN(8);
> +	__toc_end = .;
> +}
>
Alexey Kardashevskiy June 16, 2021, 6:49 a.m. UTC | #3
On 6/16/21 07:09, BALATON Zoltan wrote:
> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>> The PAPR platform describes an OS environment that's presented by
>> a combination of a hypervisor and firmware. The features it specifies
>> require collaboration between the firmware and the hypervisor.
>>
>> Since the beginning, the runtime component of the firmware (RTAS) has
>> been implemented as a 20 byte shim which simply forwards it to
>> a hypercall implemented in qemu. The boot time firmware component is
>> SLOF - but a build that's specific to qemu, and has always needed to be
>> updated in sync with it. Even though we've managed to limit the amount
>> of runtime communication we need between qemu and SLOF, there's some,
>> and it has become increasingly awkward to handle as we've implemented
>> new features.
>>
>> This implements a boot time OF client interface (CI) which is
>> enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
>> Firmware"). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>> which implements Open Firmware Client Interface (OF CI). This allows
>> using a smaller stateless firmware which does not have to manage
>> the device tree.
>>
>> The new "vof.bin" firmware image is included with source code under
>> pc-bios/. It also includes RTAS blob.
>>
>> This implements a handful of CI methods just to get -kernel/-initrd
>> working. In particular, this implements the device tree fetching and
>> simple memory allocator - "claim" (an OF CI memory allocator) and updates
>> "/memory@0/available" to report the client about available memory.
>>
>> This implements changing some device tree properties which we know how
>> to deal with, the rest is ignored. To allow changes, this skips
>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>> appending.
>>
>> In absence of SLOF, this assigns phandles to device tree nodes to make
>> device tree traversing work.
>>
>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>
>> This adds basic instances support which are managed by a hash map
>> ihandle -> [phandle].
>>
>> Before the guest started, the used memory is:
>> 0..e60 - the initial firmware
>> 8000..10000 - stack
>> 400000.. - kernel
>> 3ea0000.. - initramdisk
>>
>> This OF CI does not implement "interpret".
>>
>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>> includes a disk image with pre-formatted nvram.
>>
>> With this basic support, this can only boot into kernel directly.
>> However this is just enough for the petitboot kernel and initramdisk to
>> boot from any possible source. Note this requires reasonably recent guest
>> kernel with:
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>
>>
>> The immediate benefit is a much faster boot time, which is especially
>> crucial in fully emulated early CPU bring-up environments. This may also
>> come in handy when/if GRUB-in-the-userspace sees the light of day.
>>
>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>> other POWERPC boards which do not support pSeries.
>>
>> This makes VOF optional; it is disabled by default, add --enable-vof
>> to ./configure to enable it.
>>
>> This assumes potential support for booting from QEMU backends
>> such as blockdev or netdev without devices/drivers used.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>
>> The example command line is:
>>
>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>> -nodefaults \
>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>> -mon id=MON0,chardev=STDIO0,mode=readline \
>> -nographic \
>> -vga none \
>> -enable-kvm \
>> -m 8G \
>> -machine 
>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>> \
>> -kernel pbuild/kernel-le-guest/vmlinux \
>> -initrd pb/rootfs.cpio.xz \
>> -drive 
>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>> \
>> -global spapr-nvram.drive=DRIVE0 \
>> -snapshot \
>> -smp 8,threads=8 \
>> -L /home/aik/t/qemu-ppc64-bios/ \
>> -trace events=qemu_trace_events \
>> -d guest_errors \
>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>> -mon chardev=SOCKET0,mode=control
>>
>> ---
>> Changes:
>> v21:
>> * s/ld/ldz/ in entry.S
>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>> Kconfig
>> * made CONFIG_VOF optional
>> * s/l.lds/vof.lds/
>> * force 32 BE in spapr_machine_reset() instead of the firmware
>> * added checks for non-null methods of VofMachineIfClass
>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better 
>> comment
>> * added  path_offset wrapper for handling mixed case for addresses
>> after "@" in node names
>> * changed getprop() to check for actual "name" property in the fdt
>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike 
>> similar
>> rtas_ld/ldl_be_*) they return error codes
>> * VOF_MEM_READ uses now address_space_read (it was 
>> address_space_read_full
>> before, not sure why)
>>
>> v20:
>> * compile vof.bin with -mcpu=power4 for better compatibility
>> * s/std/stw/ in entry.S to make it work on ppc32
>> * fixed dt_available property to support both 32 and 64bit
>> * shuffled prom_args handling code
>> * do not enforce 32bit in MSR (again, to support 32bit platforms)
>>
>> v19:
>> * put bootargs in the FDT
>> * moved setting properties to a VOF machine hook
>> * moved fw_size and claim for it to vof_init()
>> * added CROSS to the VOF's makefile
>> * simplified phandles assigning
>> * pass MachineState to all machine hooks instead of calling
>> qdev_get_machine (following QOM)
>> * bunch of smaller changes and added comments
>> * added simple test to attempt to start with x-vof=on
>>
>> v18:
>> * fixed top addr (max address for "claim") on radix - it equals to 
>> ram_size
>> and vof->top_addr was uint32_t
>> * fixed "available" property which got broken in v14 but it is only 
>> visible
>> to clients which care (== grub)
>> * reshuffled vof_dt_memory_available() calls, added vof_init() to allow
>> vof_claim() before rendering the FDT
>>
>> v17:
>> * mv hw/ppc/vof.h include/hw/ppc/vof.h
>> * VofMachineIfClass -> VofMachineClass; it is not VofMachineInterface as
>> nobody used this scheme, usually "Interface" is dropped, a couple of 
>> times
>> it is "xxxInterfaceClass" or "xxxIfClass"; I used the latter as it is
>> used by include/hw/vmstate-if.h
>> * added SPDX
>> * other fixes from v16 review
>>
>> v16:
>> * rebased on dwg/ppc-for-6.1
>> * s/SpaprVofInterface/VofMachineInterface/
>>
>> v15:
>> * bugfix: claimed memory for the VOF itself
>> * ditched OF_STACK_ADDR and allocate one instead, now it starts from 
>> 0x8000
>> because it is aligned to its size (no particular reason though)
>> * coding style
>> * moved nvram.bin up one level
>> * ditched bool in the firmware
>> * made debugging code conditional using trace_event_get_state() + 
>> qemu_loglevel_mask()
>> * renamed the CAS interface to SpaprVofInterface
>> * added "write" which for now dumps the message and ihandle via
>> trace point for early debug assistance
>> * commented on when we allocate of_instances in vof_build_dt()
>> * store fw_size in SpaprMachine to let spapr_vof_reset() claim it
>> * many small fixes from v14's review
>>
>> v14:
>> * check for truncates in readstr()
>> * ditched a separate vof_reset()
>> * spapr->vof is a pointer now, dropped the "on" field
>> * removed rtas_base from vof and updated comment why we allow setting it
>> * added myself to maintainers
>> * updated commit log about blockdev and other possible platforms
>> * added a note why new hcall is 0x5
>> * no in-place endianness conversion in spapr_h_vof_client
>> * converted all cpu_physical_memory_read/write to address_space_rw
>> * git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c
>>
>> v13:
>> * rebase on latest ppc-for-6.0
>> * shuffled code around to touch spapr.c less
>>
>> v12:
>> * split VOF and SPAPR
>>
>> v11:
>> * added g_autofree
>> * fixed gcc warnings
>> * fixed few leaks
>> * added nvram image to make "nvram --print-config" not crash;
>> Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum size
>> is 16K, or it just does not work (empty output from "nvram")
>>
>> v10:
>> * now rebased to compile with meson
>>
>> v9:
>> * remove special handling of /rtas/rtas-size as now we always add it 
>> in QEMU
>> * removed leftovers from scsi/grub/stdout/stdin/...
>>
>> v8:
>> * no read/write/seek
>> * no @dev in instances
>> * the machine flag is "x-vof" for now
>>
>> v7:
>> * now we have a small firmware which loads at 0 as SLOF and starts from
>> 0x100 as SLOF
>> * no MBR/ELF/GRUB business in QEMU anymore
>> * blockdev is a separate patch
>> * networking is a separate patch
>>
>> v6:
>> * borrowed a big chunk of commit log introduction from David
>> * fixed initial stack pointer (points to the highest address of stack)
>> * traces for "interpret" and others
>> * disabled  translate_kernel_address() hack so grub can load (work in
>> progress)
>> * added "milliseconds" for grub
>> * fixed "claim" allocator again
>> * moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too for CAS
>> * moved the most code possible from spapr.c to spapr_of_client.c, such as
>> RTAS, prom entry and FDT build/finalize
>> * separated blobs
>> * GRUB now proceeds to its console prompt (there are still other issues)
>> * parse MBR/GPT to find PReP and load GRUB
>>
>> v5:
>> * made instances keep device and chardev pointers
>> * removed VIO dependencies
>> * print error if RTAS memory is not claimed as it should have been
>> * pack FDT as "quiesce"
>>
>> v4:
>> * fixed open
>> * validate ihandles in "call-method"
>>
>> v3:
>> * fixed phandles allocation
>> * s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
>> * fixed size of /chosen/stdout
>> * bunch of renames
>> * do not create rtas properties at all, let the client deal with it;
>> instead setprop allows changing these in the FDT
>> * no more packing FDT when bios=off - nobody needs it and getprop does 
>> not
>> work otherwise
>> * allow updating initramdisk device tree properties (for zImage)
>> * added instances
>> * fixed stdout on OF's "write"
>> * removed special handling for stdout in OF client, spapr-vty handles it
>> instead
>>
>> v2:
>> * fixed claim()
>> * added "setprop"
>> * cleaner client interface and RTAS blobs management
>> * boots to petitboot and further to the target system
>> * more trace points
>>
>> v20
>>
>> v20!
>> ---
>> configure               |    9 +
>> pc-bios/vof/Makefile    |   23 +
>> include/hw/ppc/spapr.h  |   25 +-
>> include/hw/ppc/vof.h    |   55 ++
>> pc-bios/vof/vof.h       |   43 ++
>> hw/ppc/spapr.c          |   87 +++-
>> hw/ppc/spapr_hcall.c    |   29 +-
>> hw/ppc/spapr_vof.c      |  153 ++++++
>> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
>> pc-bios/vof/bootmem.c   |   14 +
>> pc-bios/vof/ci.c        |   91 ++++
>> pc-bios/vof/libc.c      |   92 ++++
>> pc-bios/vof/main.c      |   21 +
>> tests/qtest/rtas-test.c |   17 +-
>> MAINTAINERS             |   12 +
>> hw/ppc/Kconfig          |    3 +
>> hw/ppc/meson.build      |    3 +
>> hw/ppc/trace-events     |   24 +
>> meson.build             |    1 +
>> pc-bios/README          |    2 +
>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>> pc-bios/vof/entry.S     |   49 ++
>> pc-bios/vof/vof.lds     |   48 ++
>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>> create mode 100644 pc-bios/vof/Makefile
>> create mode 100644 include/hw/ppc/vof.h
>> create mode 100644 pc-bios/vof/vof.h
>> create mode 100644 hw/ppc/spapr_vof.c
>> create mode 100644 hw/ppc/vof.c
>> create mode 100644 pc-bios/vof/bootmem.c
>> create mode 100644 pc-bios/vof/ci.c
>> create mode 100644 pc-bios/vof/libc.c
>> create mode 100644 pc-bios/vof/main.c
>> create mode 100644 pc-bios/vof-nvram.bin
>> create mode 100755 pc-bios/vof.bin
>> create mode 100644 pc-bios/vof/entry.S
>> create mode 100644 pc-bios/vof/vof.lds
>>
>> diff --git a/configure b/configure
>> index 8dcb9965b24e..00dc29c027fa 100755
>> --- a/configure
>> +++ b/configure
>> @@ -445,6 +445,7 @@ fuse="auto"
>> fuse_lseek="auto"
>> multiprocess="auto"
>> slirp_smbd="$default_feature"
>> +vof="no"
>>
>> malloc_trim="auto"
>> gio="$default_feature"
>> @@ -1561,6 +1562,10 @@ for opt do
>>   ;;
>>   --disable-slirp-smbd) slirp_smbd=no
>>   ;;
>> +  --enable-vof) vof=yes
>> +  ;;
>> +  --disable-vof) vof=no
>> +  ;;
>>   *)
>>       echo "ERROR: unknown option $opt"
>>       echo "Try '$0 --help' for more information"
>> @@ -1940,6 +1945,7 @@ disabled with --disable-FEATURE, default is 
>> enabled if available
>>   multiprocess    Out of process device emulation support
>>   gio             libgio support
>>   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
>> +  vof             Virtual Open Firmware support (powerpc/pseries, 
>> experimental)
>>
>> NOTE: The object files are built at the place where configure is launched
>> EOF
>> @@ -5555,6 +5561,9 @@ if test "$slirp_smbd" = "yes" ; then
>>   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
>>   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
>> fi
>> +if test "$vof" = "yes" ; then
>> +  echo "CONFIG_VOF=y" >> $config_host_mak
>> +fi
>> if test "$vde" = "yes" ; then
>>   echo "CONFIG_VDE=y" >> $config_host_mak
>>   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
> 
> In case I could not explain it clearly in my previous message I think 
> the solution we want here is to drop these configure changes and let 
> Kconfig configure this. The CONFIG_VOF option decides if vof itself is 
> built (adds vof.c) and pegasos2 will select this so it will usually be 
> yes by default. Your problem is that you're trying to use this variable 
> in spapr to make it off by default but that does not work. You need to 
> add another option for that (e.g. CONFIG_VOF_SPAPR or CONFIG_SPAPR_VOF 
> whichever makes more sense) then you can set that to no despite 
> CONFIG_VOF is yes and use that variable in spapr files and to add 
> spapr_vof.c. Then no configure option is needed which does not even work 
> for me: I get compile errors saying 'poisoning existing macro 
> "CONFIG_VOF"' if I try with --enable-vof or spapr fails to build if I 
> try without --enable-vof but select CONFIG_VOF from pegasos2. I hope 
> this makes sense now.


My problem is that I do not understand when we want VOF to be compiled 
in by default and when we do not. For a moment I thought we do not want 
it in by default but now it sounds like we do. If that is so, then 
CONFIG_VOF + selecting it from PSERIES and PEGASOS should do. Or am I 
missing the point again?



>> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
>> new file mode 100644
>> index 000000000000..aa1678c4d889
>> --- /dev/null
>> +++ b/pc-bios/vof/Makefile
>> @@ -0,0 +1,23 @@
>> +all: build-all
>> +
>> +build-all: vof.bin
>> +
>> +CROSS ?=
>> +CC = $(CROSS)gcc
>> +LD = $(CROSS)ld
>> +OBJCOPY = $(CROSS)objcopy
>> +
>> +%.o: %.S
>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
>> +
>> +%.o: %.c
>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o 
>> $@ $<
>> +
>> +vof.elf: entry.o main.o ci.o bootmem.o libc.o
>> +    $(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
>> +
>> +%.bin: %.elf
>> +    $(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
>> +
>> +clean:
>> +    rm -f *.o vof.bin vof.elf *~
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index f05219f75ef6..39b5581ae650 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -12,6 +12,9 @@
>> #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>> #include "hw/ppc/xics.h"        /* For ICSState */
>> #include "hw/ppc/spapr_tpm_proxy.h"
>> +#ifdef CONFIG_VOF
>> +#include "hw/ppc/vof.h"
>> +#endif
>>
>> struct SpaprVioBus;
>> struct SpaprPhbState;
>> @@ -180,6 +183,9 @@ struct SpaprMachineState {
>>     uint64_t kernel_addr;
>>     uint32_t initrd_base;
>>     long initrd_size;
>> +#ifdef CONFIG_VOF
> 
> So this can't be CONFIG_VOF here if you want to be able to set it to no 
> despite pegasos2 pulling in VOF so you need another SPAPR specific 

If VOF is compiled in, why would I want it to still be disabled for 
PSERIES? The code is in, let it work.


> option for that in spapr specific parts, with CONFIG_VOF selecting if vof 
> itself is built if any board uses it. So CONFIG_PEGASOS2 has to select 
> CONFIG_VOF and your SPAPR_VOF option should too if it's enabled; that way 
> vof.c will be added if either board is built but for SPAPR only if its 
> VOF option is on.
> 
>> +    Vof *vof;
>> +#endif
>>     uint64_t rtc_offset; /* Now used only during incoming migration */
>>     struct PPCTimebase tb;
>>     bool has_graphics;
>> @@ -558,7 +564,9 @@ struct SpaprMachineState {
>> /* Client Architecture support */
>> #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>> #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
>> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>>
>> /*
>>  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
>> @@ -956,4 +964,19 @@ bool spapr_check_pagesize(SpaprMachineState 
>> *spapr, hwaddr pagesize,
>> void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
>> hwaddr spapr_get_rtas_addr(void);
>> bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
>> +
>> +#ifdef CONFIG_VOF
>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>> +                     target_ulong *stack_ptr, Error **errp);
>> +void spapr_vof_quiesce(MachineState *ms);
>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char 
>> *propname,
>> +                       void *val, int vallen);
>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>> *spapr,
>> +                                target_ulong opcode, target_ulong 
>> *args);
>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>> +                                                   CPUState *cs,
>> +                                                   target_ulong 
>> ovec_addr);
>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
>> +#endif
>> +
>> #endif /* HW_SPAPR_H */
>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>> new file mode 100644
>> index 000000000000..65ca2fed0d41
>> --- /dev/null
>> +++ b/include/hw/ppc/vof.h
>> @@ -0,0 +1,55 @@
>> +/*
>> + * Virtual Open Firmware
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +#ifndef HW_VOF_H
>> +#define HW_VOF_H
>> +
>> +typedef struct Vof {
>> +    uint64_t top_addr; /* copied from rma_size */
>> +    GArray *claimed; /* array of SpaprOfClaimed */
>> +    uint64_t claimed_base;
>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>> +    uint32_t of_instance_last;
>> +    char *bootargs;
>> +    long fw_size;
>> +} Vof;
>> +
>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>> +                    target_ulong args_real);
>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>> align);
>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>> +void vof_cleanup(Vof *vof);
>> +void vof_build_dt(void *fdt, Vof *vof);
>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>> *nodename,
>> +                               const char *prop, const char *path);
>> +
>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>> +
>> +typedef struct VofMachineIfClass VofMachineIfClass;
>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>> TYPE_VOF_MACHINE_IF)
>> +
>> +struct VofMachineIfClass {
>> +    InterfaceClass parent;
>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>> CPUState *cs,
>> +                                                target_ulong vec);
>> +    void (*quiesce)(MachineState *ms);
>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>> *propname,
>> +                    void *val, int vallen);
>> +};
>> +
>> +/*
>> + * Initial stack size is from
>> + * 
>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>
> 
> I wonder if it's better to quote the section number and the title of the 
> doc in case the URL here goes away in the future.


The binding (the URL clearly suggests it is a "binding") says 32K is the 
minimum, what else is there to quote? The doc does not explain why anyway.


>> + */
>> +#define VOF_STACK_SIZE       0x8000
>> +
>> +#define VOF_MEM_READ(pa, buf, size) \
>> +    address_space_read(&address_space_memory, \
>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>> +#define VOF_MEM_WRITE(pa, buf, size) \
>> +    address_space_write(&address_space_memory, \
>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>> +
>> +#endif /* HW_VOF_H */
>> diff --git a/pc-bios/vof/vof.h b/pc-bios/vof/vof.h
>> new file mode 100644
>> index 000000000000..2d8958076907
>> --- /dev/null
>> +++ b/pc-bios/vof/vof.h
>> @@ -0,0 +1,43 @@
>> +/*
>> + * Virtual Open Firmware
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +#include <stdarg.h>
>> +
>> +typedef unsigned char uint8_t;
>> +typedef unsigned short uint16_t;
>> +typedef unsigned long uint32_t;
>> +typedef unsigned long long uint64_t;
>> +#define NULL (0)
>> +#define PROM_ERROR (-1u)
>> +typedef unsigned long ihandle;
>> +typedef unsigned long phandle;
>> +typedef int size_t;
>> +typedef void client(void);
>> +
>> +/* globals */
>> +extern void _prom_entry(void); /* OF CI entry point (i.e. this 
>> firmware) */
>> +
>> +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
>> +
>> +/* libc */
>> +int strlen(const char *s);
>> +int strcmp(const char *s1, const char *s2);
>> +void *memcpy(void *dest, const void *src, size_t n);
>> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
>> +void *memmove(void *dest, const void *src, size_t n);
>> +void *memset(void *dest, int c, size_t size);
>> +
>> +/* CI wrappers */
>> +void ci_panic(const char *str);
>> +phandle ci_finddevice(const char *path);
>> +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int 
>> len);
>> +
>> +/* booting from -kernel */
>> +void boot_from_memory(uint64_t initrd, uint64_t initrdsize);
>> +
>> +/* Entry points for CI and RTAS */
>> +extern uint32_t ci_entry(uint32_t params);
>> +extern unsigned long hv_rtas(unsigned long params);
>> +extern unsigned int hv_rtas_size;
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 4dd90b75cc52..6d747d72c614 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -101,6 +101,7 @@
>> #define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
>> #define FW_MAX_SIZE             0x400000
>> #define FW_FILE_NAME            "slof.bin"
>> +#define FW_FILE_NAME_VOF        "vof.bin"
>> #define FW_OVERHEAD             0x2800000
>> #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
>>
>> @@ -1639,22 +1640,40 @@ static void spapr_machine_reset(MachineState 
>> *machine)
>>     fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;
>>
>>     fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
>> +#ifdef CONFIG_VOF
>> +    if (spapr->vof) {
>> +        target_ulong stack_ptr = 0;
>>
>> -    rc = fdt_pack(fdt);
>> +        spapr_vof_reset(spapr, fdt, &stack_ptr, &error_fatal);
>>
>> -    /* Should only fail if we've built a corrupted tree */
>> -    assert(rc == 0);
>> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
>> +                                  stack_ptr, spapr->initrd_base,
>> +                                  spapr->initrd_size);
>> +        /* VOF is 32bit BE so enforce MSR here */
>> +        first_ppc_cpu->env.msr &= ~((1ULL << MSR_SF) | (1ULL << 
>> MSR_LE));
>> +        /*
>> +         * Do not pack the FDT as the client may change properties.
>> +         * VOF client does not expect the FDT so we do not load it to 
>> the VM.
>> +         */
>> +    } else
>> +#endif
>> +    {
>> +        rc = fdt_pack(fdt);
>> +        /* Should only fail if we've built a corrupted tree */
>> +        assert(rc == 0);
>>
>> -    /* Load the fdt */
>> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
>> +                                  0, fdt_addr, 0);
>> +        cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>> +    }
>>     qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
>> -    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>> +
>>     g_free(spapr->fdt_blob);
>>     spapr->fdt_size = fdt_totalsize(fdt);
>>     spapr->fdt_initial_size = spapr->fdt_size;
>>     spapr->fdt_blob = fdt;
>>
>>     /* Set up the entry state */
>> -    spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, 
>> fdt_addr, 0);
>>     first_ppc_cpu->env.gpr[5] = 0;
>>
>>     spapr->fwnmi_system_reset_addr = -1;
>> @@ -2657,7 +2676,12 @@ static void spapr_machine_init(MachineState 
>> *machine)
>>     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
>>     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
>>     MachineClass *mc = MACHINE_GET_CLASS(machine);
>> -    const char *bios_name = machine->firmware ?: FW_FILE_NAME;
>> +    const char *bios_default =
>> +#ifdef CONFIG_VOF
>> +        !!spapr->vof ? FW_FILE_NAME_VOF :
> 
> Does !! make sense here? I think testing for non-0 does not need it so 
> you could just write spapr->vof without !!.


I find C operator precedence confusing at times. Unary operators like 
"!" are easy to read though.


> 
>> +#endif
>> +        FW_FILE_NAME;
>> +    const char *bios_name = machine->firmware ?: bios_default;
>>     const char *kernel_filename = machine->kernel_filename;
>>     const char *initrd_filename = machine->initrd_filename;
>>     PCIHostState *phb;
>> @@ -3014,6 +3038,12 @@ static void spapr_machine_init(MachineState 
>> *machine)
>>     }
>>
>>     qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
>> +#ifdef CONFIG_VOF
>> +    if (spapr->vof) {
>> +        spapr->vof->fw_size = fw_size; /* for claim() on itself */
>> +        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, 
>> spapr_h_vof_client);
>> +    }
>> +#endif
>> }
>>
>> #define DEFAULT_KVM_TYPE "auto"
>> @@ -3204,6 +3234,30 @@ static void spapr_set_resize_hpt(Object *obj, 
>> const char *value, Error **errp)
>>     }
>> }
>>
>> +#ifdef CONFIG_VOF
>> +static bool spapr_get_vof(Object *obj, Error **errp)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>> +
>> +    return spapr->vof != NULL;
>> +}
>> +
>> +static void spapr_set_vof(Object *obj, bool value, Error **errp)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>> +
>> +    if (spapr->vof) {
>> +        vof_cleanup(spapr->vof);
>> +        g_free(spapr->vof);
>> +        spapr->vof = NULL;
>> +    }
>> +    if (!value) {
>> +        return;
>> +    }
>> +    spapr->vof = g_malloc0(sizeof(*spapr->vof));
>> +}
>> +#endif
>> +
>> static char *spapr_get_ic_mode(Object *obj, Error **errp)
>> {
>>     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>> @@ -3329,6 +3383,12 @@ static void spapr_instance_init(Object *obj)
>>                                     stringify(KERNEL_LOAD_ADDR)
>>                                     " for -kernel is the default");
>>     spapr->kernel_addr = KERNEL_LOAD_ADDR;
>> +#ifdef CONFIG_VOF
>> +    object_property_add_bool(obj, "x-vof", spapr_get_vof, 
>> spapr_set_vof);
>> +    object_property_set_description(obj, "x-vof",
>> +                                    "Enable Virtual Open Firmware 
>> (experimental)");
>> +#endif
>> +
>>     /* The machine class defines the default interrupt controller mode */
>>     spapr->irq = smc->irq;
>>     object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
>> @@ -4580,6 +4640,16 @@ static void 
>> spapr_machine_class_init(ObjectClass *oc, void *data)
>>     smc->smp_threads_vsmt = true;
>>     smc->nr_xirqs = SPAPR_NR_XIRQS;
>>     xfc->match_nvt = spapr_match_nvt;
>> +
>> +#ifdef CONFIG_VOF
>> +    {
>> +        VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
>> +        vmc->client_architecture_support =
>> +            spapr_vof_client_architecture_support;
>> +        vmc->quiesce = spapr_vof_quiesce;
>> +        vmc->setprop = spapr_vof_setprop;
>> +    }
>> +#endif
>> }
>>
>> static const TypeInfo spapr_machine_info = {
>> @@ -4599,6 +4669,9 @@ static const TypeInfo spapr_machine_info = {
>>         { TYPE_XICS_FABRIC },
>>         { TYPE_INTERRUPT_STATS_PROVIDER },
>>         { TYPE_XIVE_FABRIC },
>> +#ifdef CONFIG_VOF
>> +        { TYPE_VOF_MACHINE_IF },
>> +#endif
>>         { }
>>     },
>> };
>> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
>> index f25014afda40..986a4de34128 100644
>> --- a/hw/ppc/spapr_hcall.c
>> +++ b/hw/ppc/spapr_hcall.c
>> @@ -1080,7 +1080,7 @@ target_ulong 
>> do_client_architecture_support(PowerPCCPU *cpu,
>>     SpaprOptionVector *ov1_guest, *ov5_guest;
>>     bool guest_radix;
>>     bool raw_mode_supported = false;
>> -    bool guest_xive;
>> +    bool guest_xive, reset_fdt = false;
>>     CPUState *cs;
>>     void *fdt;
>>     uint32_t max_compat = spapr->max_compat_pvr;
>> @@ -1233,8 +1233,10 @@ target_ulong 
>> do_client_architecture_support(PowerPCCPU *cpu,
>>         spapr_setup_hpt(spapr);
>>     }
>>
>> -    fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
>> -
>> +#ifdef CONFIG_VOF
>> +    reset_fdt = spapr->vof != NULL;
> 
> (Here when storing to a bool !! could make sense but what you have is 
> better as it's clearer so I'm not suggesting to use !! here either.

I prefer this way and I would rather do this than "!!" but again 
precedence confuses me sometimes so up there I'd need braces for the 
condition and then folks start asking "why you need braces" :)
I do not need braces here as "=" has lower precedence than "!=" (although 
the fact that it returns a value is just bizarre).

> It's 
> rarely useful, maybe only if you need a bool but do not have space to 
> write the condition or it would be more confusing that way.)



> 
>> +#endif
>> +    fdt = spapr_build_fdt(spapr, reset_fdt, fdt_bufsize);
>>     g_free(spapr->fdt_blob);
>>     spapr->fdt_size = fdt_totalsize(fdt);
>>     spapr->fdt_initial_size = spapr->fdt_size;
>> @@ -1277,6 +1279,27 @@ static target_ulong 
>> h_client_architecture_support(PowerPCCPU *cpu,
>>     return ret;
>> }
>>
>> +#ifdef CONFIG_VOF
>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>> +                                                   CPUState *cs,
>> +                                                   target_ulong 
>> ovec_addr)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
>> +
>> +    target_ulong ret = 
>> do_client_architecture_support(POWERPC_CPU(cs), spapr,
>> +                                                      ovec_addr, 
>> FDT_MAX_SIZE);
>> +
>> +    /*
>> +     * This adds stdout and generates phandles for boottime and CAS 
>> FDTs.
>> +     * It is alright to update the FDT here as 
>> do_client_architecture_support()
>> +     * does not pack it.
>> +     */
>> +    spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob);
>> +
>> +    return ret;
>> +}
>> +#endif
>> +
>> static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
>>                                               SpaprMachineState *spapr,
>>                                               target_ulong opcode,
>> diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c
>> new file mode 100644
>> index 000000000000..653d376f38aa
>> --- /dev/null
>> +++ b/hw/ppc/spapr_vof.c
>> @@ -0,0 +1,153 @@
>> +/*
>> + * SPAPR machine hooks to Virtual Open Firmware,
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +#include <sys/ioctl.h>
>> +#include "qapi/error.h"
>> +#include "hw/ppc/spapr.h"
>> +#include "hw/ppc/spapr_vio.h"
>> +#include "hw/ppc/fdt.h"
>> +#include "sysemu/sysemu.h"
>> +#include "qom/qom-qobject.h"
>> +#include "trace.h"
>> +
>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>> *spapr,
>> +                                target_ulong opcode, target_ulong 
>> *_args)
>> +{
>> +    int ret = vof_client_call(MACHINE(spapr), spapr->vof, 
>> spapr->fdt_blob,
>> +                              ppc64_phys_to_real(_args[0]));
>> +
>> +    if (ret) {
>> +        return H_PARAMETER;
>> +    }
>> +    return H_SUCCESS;
>> +}
>> +
>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt)
>> +{
>> +    char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
>> +    int chosen;
>> +
>> +    vof_build_dt(fdt, spapr->vof);
>> +
>> +    _FDT(chosen = fdt_path_offset(fdt, "/chosen"));
>> +    _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
>> +                            spapr->vof->bootargs ? : ""));
>> +
>> +    /*
>> +     * SLOF-less setup requires an open instance of stdout for early
>> +     * kernel printk. By now all phandles are settled so we can open
>> +     * the default serial console.
>> +     */
>> +    if (stdout_path) {
>> +        _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", "stdout",
>> +                                   stdout_path));
>> +    }
>> +}
>> +
>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>> +                     target_ulong *stack_ptr, Error **errp)
>> +{
>> +    Vof *vof = spapr->vof;
>> +
>> +    vof_init(vof, spapr->rma_size, errp);
>> +
>> +    *stack_ptr = vof_claim(vof, 0, VOF_STACK_SIZE, VOF_STACK_SIZE);
>> +    if (*stack_ptr == -1) {
>> +        error_setg(errp, "Memory allocation for stack failed");
>> +        return;
>> +    }
>> +    /* Stack grows downwards plus reserve space for the minimum stack 
>> frame */
>> +    *stack_ptr += VOF_STACK_SIZE - 0x20;
>> +
>> +    if (spapr->kernel_size &&
>> +        vof_claim(vof, spapr->kernel_addr, spapr->kernel_size, 0) == 
>> -1) {
>> +        error_setg(errp, "Memory for kernel is in use");
>> +        return;
>> +    }
>> +
>> +    if (spapr->initrd_size &&
>> +        vof_claim(vof, spapr->initrd_base, spapr->initrd_size, 0) == 
>> -1) {
>> +        error_setg(errp, "Memory for initramdisk is in use");
>> +        return;
>> +    }
>> +
>> +    spapr_vof_client_dt_finalize(spapr, fdt);
>> +
>> +    /*
>> +     * At this point the expected allocation map is:
>> +     *
>> +     * 0..c38 - the initial firmware
>> +     * 8000..10000 - stack
>> +     * 400000.. - kernel
>> +     * 3ea0000.. - initramdisk
>> +     *
>> +     * We skip writing FDT as nothing expects it; OF client interface is
>> +     * going to be used for reading the device tree.
>> +     */
>> +}
>> +
>> +void spapr_vof_quiesce(MachineState *ms)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
>> +
>> +    spapr->fdt_size = fdt_totalsize(spapr->fdt_blob);
>> +    spapr->fdt_initial_size = spapr->fdt_size;
>> +}
>> +
>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char 
>> *propname,
>> +                       void *val, int vallen)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
>> +
>> +    /*
>> +     * We only allow changing properties which we know how to update 
>> in QEMU
>> +     * OR
>> +     * the ones which we know that they need to survive during 
>> "quiesce".
>> +     */
>> +
>> +    if (strcmp(path, "/rtas") == 0) {
>> +        if (strcmp(propname, "linux,rtas-base") == 0 ||
>> +            strcmp(propname, "linux,rtas-entry") == 0) {
>> +            /* These need to survive quiesce so let them store in the 
>> FDT */
>> +            return true;
>> +        }
>> +    }
>> +
>> +    if (strcmp(path, "/chosen") == 0) {
>> +        if (strcmp(propname, "bootargs") == 0) {
>> +            Vof *vof = spapr->vof;
>> +
>> +            g_free(vof->bootargs);
>> +            vof->bootargs = g_strndup(val, vallen);
>> +            return true;
>> +        }
>> +        if (strcmp(propname, "linux,initrd-start") == 0) {
>> +            if (vallen == sizeof(uint32_t)) {
>> +                spapr->initrd_base = ldl_be_p(val);
>> +                return true;
>> +            }
>> +            if (vallen == sizeof(uint64_t)) {
>> +                spapr->initrd_base = ldq_be_p(val);
>> +                return true;
>> +            }
>> +            return false;
>> +        }
>> +        if (strcmp(propname, "linux,initrd-end") == 0) {
>> +            if (vallen == sizeof(uint32_t)) {
>> +                spapr->initrd_size = ldl_be_p(val) - spapr->initrd_base;
>> +                return true;
>> +            }
>> +            if (vallen == sizeof(uint64_t)) {
>> +                spapr->initrd_size = ldq_be_p(val) - spapr->initrd_base;
>> +                return true;
>> +            }
>> +            return false;
>> +        }
>> +    }
>> +
>> +    return true;
>> +}
>> diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
>> new file mode 100644
>> index 000000000000..1068a1e58388
>> --- /dev/null
>> +++ b/hw/ppc/vof.c
>> @@ -0,0 +1,1052 @@
>> +/*
>> + * QEMU PowerPC Virtual Open Firmware.
>> + *
>> + * This implements client interface from OpenFirmware IEEE1275 on the 
>> QEMU
>> + * side to leave only a very basic firmware in the VM.
>> + *
>> + * Copyright (c) 2021 IBM Corporation.
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +#include "qemu/timer.h"
>> +#include "qemu/range.h"
>> +#include "qemu/units.h"
>> +#include "qapi/error.h"
>> +#include <sys/ioctl.h>
>> +#include "exec/ram_addr.h"
>> +#include "exec/address-spaces.h"
>> +#include "hw/ppc/vof.h"
>> +#include "hw/ppc/fdt.h"
>> +#include "sysemu/runstate.h"
>> +#include "qom/qom-qobject.h"
>> +#include "trace.h"
>> +
>> +#include <libfdt.h>
>> +
>> +/*
>> + * OF 1275 "nextprop" description suggests it is 32 bytes max but
>> + * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 
>> chars long.
>> + */
>> +#define OF_PROPNAME_LEN_MAX 64
>> +
>> +#define VOF_MAX_PATH        256
>> +#define VOF_MAX_SETPROPLEN  2048
>> +#define VOF_MAX_METHODLEN   256
>> +#define VOF_MAX_FORTHCODE   256
>> +#define VOF_VTY_BUF_SIZE    256
>> +
>> +typedef struct {
>> +    uint64_t start;
>> +    uint64_t size;
>> +} OfClaimed;
>> +
>> +typedef struct {
>> +    char *path; /* the path used to open the instance */
>> +    uint32_t phandle;
>> +} OfInstance;
>> +
>> +static int readstr(hwaddr pa, char *buf, int size)
>> +{
>> +    if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) {
>> +        return -1;
>> +    }
>> +    if (strnlen(buf, size) == size) {
>> +        buf[size - 1] = '\0';
>> +        trace_vof_error_str_truncated(buf, size);
>> +        return -1;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static bool cmpservice(const char *s, unsigned nargs, unsigned nret,
>> +                       const char *s1, unsigned nargscheck, unsigned 
>> nretcheck)
>> +{
>> +    if (strcmp(s, s1)) {
>> +        return false;
>> +    }
>> +    if ((nargscheck && (nargs != nargscheck)) ||
>> +        (nretcheck && (nret != nretcheck))) {
>> +        trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret);
>> +        return false;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static void prop_format(char *tval, int tlen, const void *prop, int len)
>> +{
>> +    int i;
>> +    const unsigned char *c;
>> +    char *t;
>> +    const char bin[] = "...";
>> +
>> +    for (i = 0, c = prop; i < len; ++i, ++c) {
>> +        if (*c == '\0' && i == len - 1) {
>> +            strncpy(tval, prop, tlen - 1);
>> +            return;
>> +        }
>> +        if (*c < 0x20 || *c >= 0x80) {
>> +            break;
>> +        }
>> +    }
>> +
>> +    for (i = 0, c = prop, t = tval; i < len; ++i, ++c) {
>> +        if (t >= tval + tlen - sizeof(bin) - 1 - 2 - 1) {
>> +            strcpy(t, bin);
>> +            return;
>> +        }
>> +        if (i && i % 4 == 0 && i != len - 1) {
>> +            strcat(t, " ");
>> +            ++t;
>> +        }
>> +        t += sprintf(t, "%02X", *c & 0xFF);
>> +    }
>> +}
>> +
>> +static int get_path(const void *fdt, int offset, char *buf, int len)
>> +{
>> +    int ret;
>> +
>> +    ret = fdt_get_path(fdt, offset, buf, len - 1);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    buf[len - 1] = '\0';
>> +
>> +    return strlen(buf) + 1;
>> +}
>> +
>> +static int phandle_to_path(const void *fdt, uint32_t ph, char *buf, 
>> int len)
>> +{
>> +    int ret;
>> +
>> +    ret = fdt_node_offset_by_phandle(fdt, ph);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    return get_path(fdt, ret, buf, len);
>> +}
>> +
>> +static int path_offset(const void *fdt, const char *path)
>> +{
>> +    g_autofree char *p = NULL;
>> +    char *at;
>> +
>> +    /*
>> +     * The addresses in node names are expected to in the lower case 
>> as per
> 
> There's some grammar problem with this sentence. I think it should be 
> "are expected to be in lower case" but ask a native speaker.

Definitely missed "be". Thanks for spotting.


> 
>> +     * 
>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>
>> +     */
>> +    at = strchr(path, '@');
>> +    if (!at) {
>> +        return fdt_path_offset(fdt, path);
>> +    }
>> +
>> +    p = g_strdup(path);
>> +    for (at = at - path + p + 1; *at; ++at) {
>> +        *at = tolower(*at);
>> +    }
>> +    return fdt_path_offset(fdt, p);
>> +}
>> +
>> +static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr)
>> +{
>> +    char fullnode[VOF_MAX_PATH];
>> +    uint32_t ret = -1;
>> +    int offset;
>> +
>> +    if (readstr(nodeaddr, fullnode, sizeof(fullnode))) {
>> +        return (uint32_t) ret;
>> +    }
>> +
>> +    offset = path_offset(fdt, fullnode);
>> +    if (offset >= 0) {
>> +        ret = fdt_get_phandle(fdt, offset);
>> +    }
>> +    trace_vof_finddevice(fullnode, ret);
>> +    return (uint32_t) ret;
>> +}
>> +
>> +static const void *getprop(const void *fdt, int nodeoff, const char 
>> *propname,
>> +                           int *proplen, bool *write0)
>> +{
>> +    const char *unit, *prop;
>> +    const void *ret = fdt_getprop(fdt, nodeoff, propname, proplen);
>> +
>> +    if (ret) {
>> +        if (write0) {
>> +            *write0 = false;
>> +        }
>> +        return ret;
>> +    }
>> +
>> +    /*
>> +     * The "name" property is not actually expected as a property in 
>> the FDT
>> +     * (although some platform may create those in "/" so we try 
>> getprop first),
> 
> Not only in "/" but anywhere. MorphOS walks the tree with nextprop and 
> expects to get a name property for most nodes without ever explicitly 
> querying "name". I've tested this with both the board firmware and VOF 
> and with the board firmware a name property appears in most nodes but 
> not all so I think at least SmartFirmware does the same and explicitly 
> sets name on some nodes and otherwise returns the name from path if such 
> property does not exist but is queried. With this in VOF I can do the same 
> and get same results so the change should be OK but the comment may be 
> misleading now. Better to just say we return a value for "name" from 
> path if queried but property does not exist which seems to be what OF 
> does too.


Fair point, after checking with OF1275 and 
devicetree-specification-v0.2.pdf, I'll do what you said.



>> +     * we emulate it by returning a pointer to the node's name and 
>> adjust
>> +     * proplen to include only the name but not the unit.
>> +     */
>> +    if (strcmp(propname, "name")) {
>> +        return NULL;
>> +    }
>> +    prop = fdt_get_name(fdt, nodeoff, proplen);
>> +    if (!prop) {
>> +        *proplen = 0;
>> +        return NULL;
>> +    }
>> +
>> +    unit = memchr(prop, '@', *proplen);
>> +    if (unit) {
>> +        *proplen = unit - prop;
>> +    }
>> +    *proplen += 1;
>> +
>> +    /*
>> +     * Since it might be cut at "@" and there will be no trailing zero
>> +     * in the prop buffer, tell the caller to write zero at the end.
>> +     */
>> +    if (write0) {
>> +        *write0 = true;
>> +    }
>> +    return prop;
>> +}
>> +
>> +static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, 
>> uint32_t pname,
>> +                            uint32_t valaddr, uint32_t vallen)
>> +{
>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>> +    uint32_t ret = 0;
>> +    int proplen = 0;
>> +    const void *prop;
>> +    char trval[64] = "";
>> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
>> +    bool write0;
>> +
>> +    if (nodeoff < 0) {
>> +        return -1;
>> +    }
>> +    if (readstr(pname, propname, sizeof(propname))) {
>> +        return -1;
>> +    }
>> +    prop = getprop(fdt, nodeoff, propname, &proplen, &write0);
>> +    if (prop) {
>> +        const char zero = 0;
>> +        int cb = MIN(proplen, vallen);
>> +
>> +        if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK ||
>> +            /* if that was "name" with a unit address, overwrite '@' 
>> with '0' */
>> +            (write0 &&
>> +             cb == proplen &&
>> +             VOF_MEM_WRITE(valaddr + cb - 1, &zero, 1) != MEMTX_OK)) {
>> +            ret = -1;
>> +        } else {
>> +            /*
>> +             * OF1275 says:
>> +             * "Size is either the actual size of the property, or -1 
>> if name
>> +             * does not exist", hence returning proplen instead of cb.
>> +             */
>> +            ret = proplen;
>> +            /* Do not format a value if tracepoint is silent, for 
>> performance */
>> +            if (trace_event_get_state(TRACE_VOF_GETPROP) &&
>> +                qemu_loglevel_mask(LOG_TRACE)) {
>> +                prop_format(trval, sizeof(trval), prop, ret);
>> +            }
>> +        }
>> +    } else {
>> +        ret = -1;
>> +    }
>> +    trace_vof_getprop(nodeph, propname, ret, trval);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, 
>> uint32_t pname)
>> +{
>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>> +    uint32_t ret = 0;
>> +    int proplen = 0;
>> +    const void *prop;
>> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
>> +
>> +    if (nodeoff < 0) {
>> +        return -1;
>> +    }
>> +    if (readstr(pname, propname, sizeof(propname))) {
>> +        return -1;
>> +    }
>> +    prop = getprop(fdt, nodeoff, propname, &proplen, NULL);
>> +    if (prop) {
>> +        ret = proplen;
>> +    } else {
>> +        ret = -1;
>> +    }
>> +    trace_vof_getproplen(nodeph, propname, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_setprop(MachineState *ms, void *fdt, Vof *vof,
>> +                            uint32_t nodeph, uint32_t pname,
>> +                            uint32_t valaddr, uint32_t vallen)
>> +{
>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>> +    uint32_t ret = -1;
>> +    int offset;
>> +    char trval[64] = "";
>> +    char nodepath[VOF_MAX_PATH] = "";
>> +    Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF);
>> +    g_autofree char *val = NULL;
>> +
>> +    if (vallen > VOF_MAX_SETPROPLEN) {
>> +        goto trace_exit;
>> +    }
>> +    if (readstr(pname, propname, sizeof(propname))) {
>> +        goto trace_exit;
>> +    }
>> +    offset = fdt_node_offset_by_phandle(fdt, nodeph);
>> +    if (offset < 0) {
>> +        goto trace_exit;
>> +    }
>> +    ret = get_path(fdt, offset, nodepath, sizeof(nodepath));
>> +    if (ret <= 0) {
>> +        goto trace_exit;
>> +    }
>> +
>> +    val = g_malloc0(vallen);
>> +    if (VOF_MEM_READ(valaddr, val, vallen) != MEMTX_OK) {
>> +        goto trace_exit;
>> +    }
>> +
>> +    if (vmo) {
>> +        VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo);
>> +
>> +        if (vmc->setprop &&
>> +            !vmc->setprop(ms, nodepath, propname, val, vallen)) {
>> +            goto trace_exit;
>> +        }
>> +    }
>> +
>> +    ret = fdt_setprop(fdt, offset, propname, val, vallen);
>> +    if (ret) {
>> +        goto trace_exit;
>> +    }
>> +
>> +    if (trace_event_get_state(TRACE_VOF_SETPROP) &&
>> +        qemu_loglevel_mask(LOG_TRACE)) {
>> +        prop_format(trval, sizeof(trval), val, vallen);
>> +    }
>> +    ret = vallen;
>> +
>> +trace_exit:
>> +    trace_vof_setprop(nodeph, propname, trval, vallen, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_nextprop(const void *fdt, uint32_t phandle,
>> +                             uint32_t prevaddr, uint32_t nameaddr)
>> +{
>> +    int offset, nodeoff = fdt_node_offset_by_phandle(fdt, phandle);
>> +    char prev[OF_PROPNAME_LEN_MAX + 1];
>> +    const char *tmp;
>> +
>> +    if (readstr(prevaddr, prev, sizeof(prev))) {
>> +        return -1;
>> +    }
>> +
>> +    fdt_for_each_property_offset(offset, fdt, nodeoff) {
>> +        if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
>> +            return 0;
>> +        }
>> +        if (prev[0] == '\0' || strcmp(prev, tmp) == 0) {
>> +            if (prev[0] != '\0') {
>> +                offset = fdt_next_property_offset(fdt, offset);
>> +                if (offset < 0) {
>> +                    return 0;
>> +                }
>> +            }
>> +            if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
>> +                return 0;
>> +            }
>> +
>> +            if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != 
>> MEMTX_OK) {
>> +                return -1;
>> +            }
>> +            return 1;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static uint32_t vof_peer(const void *fdt, uint32_t phandle)
>> +{
>> +    int ret;
>> +
>> +    if (phandle == 0) {
>> +        ret = fdt_path_offset(fdt, "/");
>> +    } else {
>> +        ret = fdt_next_subnode(fdt, fdt_node_offset_by_phandle(fdt, 
>> phandle));
>> +    }
>> +
>> +    if (ret < 0) {
>> +        ret = 0;
>> +    } else {
>> +        ret = fdt_get_phandle(fdt, ret);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_child(const void *fdt, uint32_t phandle)
>> +{
>> +    int ret = fdt_first_subnode(fdt, fdt_node_offset_by_phandle(fdt, 
>> phandle));
>> +
>> +    if (ret < 0) {
>> +        ret = 0;
>> +    } else {
>> +        ret = fdt_get_phandle(fdt, ret);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_parent(const void *fdt, uint32_t phandle)
>> +{
>> +    int ret = fdt_parent_offset(fdt, fdt_node_offset_by_phandle(fdt, 
>> phandle));
>> +
>> +    if (ret < 0) {
>> +        ret = 0;
>> +    } else {
>> +        ret = fdt_get_phandle(fdt, ret);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_do_open(void *fdt, Vof *vof, int offset, const 
>> char *path)
>> +{
>> +    uint32_t ret = -1;
>> +    OfInstance *inst = NULL;
>> +
>> +    if (vof->of_instance_last == 0xFFFFFFFF) {
>> +        /* We do not recycle ihandles yet */
>> +        goto trace_exit;
>> +    }
>> +
>> +    inst = g_new0(OfInstance, 1);
>> +    inst->phandle = fdt_get_phandle(fdt, offset);
>> +    g_assert(inst->phandle);
>> +    ++vof->of_instance_last;
>> +
>> +    inst->path = g_strdup(path);
>> +    g_hash_table_insert(vof->of_instances,
>> +                        GINT_TO_POINTER(vof->of_instance_last),
>> +                        inst);
>> +    ret = vof->of_instance_last;
>> +
>> +trace_exit:
>> +    trace_vof_open(path, inst ? inst->phandle : 0, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>> *nodename,
>> +                               const char *prop, const char *path)
>> +{
>> +    int node = fdt_path_offset(fdt, nodename);
>> +    int inst, offset;
>> +
>> +    offset = fdt_path_offset(fdt, path);
>> +    if (offset < 0) {
>> +        trace_vof_error_unknown_path(path);
>> +        return offset;
>> +    }
>> +
>> +    inst = vof_do_open(fdt, vof, offset, path);
>> +
>> +    return fdt_setprop_cell(fdt, node, prop, inst);
>> +}
>> +
>> +static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr)
>> +{
>> +    char path[VOF_MAX_PATH];
>> +    int offset;
>> +
>> +    if (readstr(pathaddr, path, sizeof(path))) {
>> +        return -1;
>> +    }
>> +
>> +    offset = path_offset(fdt, path);
>> +    if (offset < 0) {
>> +        trace_vof_error_unknown_path(path);
>> +        return offset;
>> +    }
>> +
>> +    return vof_do_open(fdt, vof, offset, path);
>> +}
>> +
>> +static void vof_close(Vof *vof, uint32_t ihandle)
>> +{
>> +    if (!g_hash_table_remove(vof->of_instances, 
>> GINT_TO_POINTER(ihandle))) {
>> +        trace_vof_error_unknown_ihandle_close(ihandle);
>> +    }
>> +}
>> +
>> +static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle)
>> +{
>> +    gpointer instp = g_hash_table_lookup(vof->of_instances,
>> +                                         GINT_TO_POINTER(ihandle));
>> +    uint32_t ret = -1;
>> +
>> +    if (instp) {
>> +        ret = ((OfInstance *)instp)->phandle;
>> +    }
>> +    trace_vof_instance_to_package(ihandle, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle,
>> +                                    uint32_t buf, uint32_t len)
>> +{
>> +    uint32_t ret = -1;
>> +    char tmp[VOF_MAX_PATH] = "";
>> +
>> +    ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
>> +    if (ret > 0) {
>> +        if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
>> +            ret = -1;
>> +        }
>> +    }
>> +
>> +    trace_vof_package_to_path(phandle, tmp, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t 
>> ihandle,
>> +                                     uint32_t buf, uint32_t len)
>> +{
>> +    uint32_t ret = -1;
>> +    uint32_t phandle = vof_instance_to_package(vof, ihandle);
>> +    char tmp[VOF_MAX_PATH] = "";
>> +
>> +    if (phandle != -1) {
>> +        ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
>> +        if (ret > 0) {
>> +            if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
>> +                ret = -1;
>> +            }
>> +        }
>> +    }
>> +    trace_vof_instance_to_path(ihandle, phandle, tmp, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_write(Vof *vof, uint32_t ihandle, uint32_t buf,
>> +                          uint32_t len)
>> +{
>> +    char tmp[VOF_VTY_BUF_SIZE];
>> +    unsigned cb;
>> +    OfInstance *inst = (OfInstance *)
>> +        g_hash_table_lookup(vof->of_instances, 
>> GINT_TO_POINTER(ihandle));
>> +
>> +    if (!inst) {
>> +        trace_vof_error_write(ihandle);
>> +        return -1;
>> +    }
>> +
>> +    for ( ; len > 0; len -= cb) {
>> +        cb = MIN(len, sizeof(tmp) - 1);
>> +        if (VOF_MEM_READ(buf, tmp, cb) != MEMTX_OK) {
>> +            return -1;
>> +        }
>> +
>> +        /* FIXME: there is no backend(s) yet so just call a trace */
>> +        if (trace_event_get_state(TRACE_VOF_WRITE) &&
>> +            qemu_loglevel_mask(LOG_TRACE)) {
>> +            tmp[cb] = '\0';
>> +            trace_vof_write(ihandle, cb, tmp);
>> +        }
>> +    }
>> +
>> +    return len;
>> +}
>> +
>> +static void vof_claimed_dump(GArray *claimed)
>> +{
>> +    int i;
>> +    OfClaimed c;
>> +
>> +    if (trace_event_get_state(TRACE_VOF_CLAIMED) &&
>> +        qemu_loglevel_mask(LOG_TRACE)) {
>> +
>> +        for (i = 0; i < claimed->len; ++i) {
>> +            c = g_array_index(claimed, OfClaimed, i);
>> +            trace_vof_claimed(c.start, c.start + c.size, c.size);
>> +        }
>> +    }
>> +}
>> +
>> +static bool vof_claim_avail(GArray *claimed, uint64_t virt, uint64_t 
>> size)
>> +{
>> +    int i;
>> +    OfClaimed c;
>> +
>> +    for (i = 0; i < claimed->len; ++i) {
>> +        c = g_array_index(claimed, OfClaimed, i);
>> +        if (ranges_overlap(c.start, c.size, virt, size)) {
>> +            return false;
>> +        }
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t size)
>> +{
>> +    OfClaimed newclaim;
>> +
>> +    newclaim.start = virt;
>> +    newclaim.size = size;
>> +    g_array_append_val(claimed, newclaim);
>> +}
>> +
>> +static gint of_claimed_compare_func(gconstpointer a, gconstpointer b)
>> +{
>> +    return ((OfClaimed *)a)->start - ((OfClaimed *)b)->start;
>> +}
>> +
>> +static void vof_dt_memory_available(void *fdt, GArray *claimed, 
>> uint64_t base)
>> +{
>> +    int i, n, offset, proplen = 0, sc, ac;
>> +    target_ulong mem0_end;
>> +    const uint8_t *mem0_reg;
>> +    g_autofree uint8_t *avail = NULL;
>> +    uint8_t *availcur;
>> +
>> +    if (!fdt || !claimed) {
>> +        return;
>> +    }
>> +
>> +    offset = fdt_path_offset(fdt, "/");
>> +    _FDT(offset);
>> +    ac = fdt_address_cells(fdt, offset);
>> +    g_assert(ac == 1 || ac == 2);
>> +    sc = fdt_size_cells(fdt, offset);
>> +    g_assert(sc == 1 || sc == 2);
>> +
>> +    offset = fdt_path_offset(fdt, "/memory@0");
>> +    _FDT(offset);
>> +
>> +    mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen);
>> +    g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc));
>> +    if (sc == 2) {
>> +        mem0_end = be64_to_cpu(*(uint64_t *)(mem0_reg + 
>> sizeof(uint32_t) * ac));
>> +    } else {
>> +        mem0_end = be32_to_cpu(*(uint32_t *)(mem0_reg + 
>> sizeof(uint32_t) * ac));
>> +    }
>> +
>> +    g_array_sort(claimed, of_claimed_compare_func);
>> +    vof_claimed_dump(claimed);
>> +
>> +    /*
>> +     * VOF resides in the first page so we do not need to check if 
>> there is
>> +     * available memory before the first claimed block
>> +     */
>> +    g_assert(claimed->len && (g_array_index(claimed, OfClaimed, 
>> 0).start == 0));
>> +
>> +    avail = g_malloc0(sizeof(uint32_t) * (ac + sc) * claimed->len);
>> +    for (i = 0, n = 0, availcur = avail; i < claimed->len; ++i) {
>> +        OfClaimed c = g_array_index(claimed, OfClaimed, i);
>> +        uint64_t start, size;
>> +
>> +        start = c.start + c.size;
>> +        if (i < claimed->len - 1) {
>> +            OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1);
>> +
>> +            size = cn.start - start;
>> +        } else {
>> +            size = mem0_end - start;
>> +        }
>> +
>> +        if (ac == 2) {
>> +            *(uint64_t *) availcur = cpu_to_be64(start);
>> +        } else {
>> +            *(uint32_t *) availcur = cpu_to_be32(start);
>> +        }
>> +        availcur += sizeof(uint32_t) * ac;
>> +        if (sc == 2) {
>> +            *(uint64_t *) availcur = cpu_to_be64(size);
>> +        } else {
>> +            *(uint32_t *) availcur = cpu_to_be32(size);
>> +        }
>> +        availcur += sizeof(uint32_t) * sc;
>> +
>> +        if (size) {
>> +            trace_vof_avail(c.start + c.size, c.start + c.size + 
>> size, size);
>> +            ++n;
>> +        }
>> +    }
>> +    _FDT((fdt_setprop(fdt, offset, "available", avail, availcur - 
>> avail)));
>> +}
>> +
>> +/*
>> + * OF1275:
>> + * "Allocates size bytes of memory. If align is zero, the allocated 
>> range
>> + * begins at the virtual address virt. Otherwise, an aligned address is
>> + * automatically chosen and the input argument virt is ignored".
>> + *
>> + * In other words, exactly one of @virt and @align is non-zero.
>> + */
>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size,
>> +                   uint64_t align)
>> +{
>> +    uint64_t ret;
>> +
>> +    if (size == 0) {
>> +        ret = -1;
>> +    } else if (align == 0) {
>> +        if (!vof_claim_avail(vof->claimed, virt, size)) {
>> +            ret = -1;
>> +        } else {
>> +            ret = virt;
>> +        }
>> +    } else {
>> +        vof->claimed_base = QEMU_ALIGN_UP(vof->claimed_base, align);
>> +        while (1) {
>> +            if (vof->claimed_base >= vof->top_addr) {
>> +                error_report("Out of RMA memory for the OF client");
>> +                return -1;
>> +            }
>> +            if (vof_claim_avail(vof->claimed, vof->claimed_base, 
>> size)) {
>> +                break;
>> +            }
>> +            vof->claimed_base += size;
>> +        }
>> +        ret = vof->claimed_base;
>> +    }
>> +
>> +    if (ret != -1) {
>> +        vof->claimed_base = MAX(vof->claimed_base, ret + size);
>> +        vof_claim_add(vof->claimed, ret, size);
>> +    }
>> +    trace_vof_claim(virt, size, align, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_release(Vof *vof, uint64_t virt, uint64_t size)
>> +{
>> +    uint32_t ret = -1;
>> +    int i;
>> +    GArray *claimed = vof->claimed;
>> +    OfClaimed c;
>> +
>> +    for (i = 0; i < claimed->len; ++i) {
>> +        c = g_array_index(claimed, OfClaimed, i);
>> +        if (c.start == virt && c.size == size) {
>> +            g_array_remove_index(claimed, i);
>> +            ret = 0;
>> +            break;
>> +        }
>> +    }
>> +
>> +    trace_vof_release(virt, size, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static void vof_instantiate_rtas(Error **errp)
>> +{
>> +    error_setg(errp, "The firmware should have instantiated RTAS");
>> +}
>> +
>> +static uint32_t vof_call_method(MachineState *ms, Vof *vof, uint32_t 
>> methodaddr,
>> +                                uint32_t ihandle, uint32_t param1,
>> +                                uint32_t param2, uint32_t param3,
>> +                                uint32_t param4, uint32_t *ret2)
>> +{
>> +    uint32_t ret = -1;
>> +    char method[VOF_MAX_METHODLEN] = "";
>> +    OfInstance *inst;
>> +
>> +    if (!ihandle) {
>> +        goto trace_exit;
>> +    }
>> +
>> +    inst = (OfInstance *) g_hash_table_lookup(vof->of_instances,
>> +                                              GINT_TO_POINTER(ihandle));
> 
> I think you should not have a space in type casts but checkpatch does not 
> seem to mind. You have this in several places.


checkpatch does mind because it is truly ugly. I tried:


ERROR: "(foo*)" should be "(foo *)"
#2029: FILE: pc-bios/vof/ci.c:46:
+    if (prom_handle((void*)(unsigned long) args)) {

total: 1 errors, 0 warnings, 2030 lines checked
Alexey Kardashevskiy June 16, 2021, 6:49 a.m. UTC | #4
On 6/15/21 20:29, BALATON Zoltan wrote:
> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>> The PAPR platform describes an OS environment that's presented by
>> a combination of a hypervisor and firmware. The features it specifies
>> require collaboration between the firmware and the hypervisor.
>>
>> Since the beginning, the runtime component of the firmware (RTAS) has
>> been implemented as a 20 byte shim which simply forwards it to
>> a hypercall implemented in qemu. The boot time firmware component is
>> SLOF - but a build that's specific to qemu, and has always needed to be
>> updated in sync with it. Even though we've managed to limit the amount
>> of runtime communication we need between qemu and SLOF, there's some,
>> and it has become increasingly awkward to handle as we've implemented
>> new features.
>>
>> This implements a boot time OF client interface (CI) which is
>> enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
>> Firmware"). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>> which implements Open Firmware Client Interface (OF CI). This allows
>> using a smaller stateless firmware which does not have to manage
>> the device tree.
>>
>> The new "vof.bin" firmware image is included with source code under
>> pc-bios/. It also includes RTAS blob.
>>
>> This implements a handful of CI methods just to get -kernel/-initrd
>> working. In particular, this implements the device tree fetching and
>> simple memory allocator - "claim" (an OF CI memory allocator) and updates
>> "/memory@0/available" to report the client about available memory.
>>
>> This implements changing some device tree properties which we know how
>> to deal with, the rest is ignored. To allow changes, this skips
>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>> appending.
>>
>> In absence of SLOF, this assigns phandles to device tree nodes to make
>> device tree traversing work.
>>
>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>
>> This adds basic instances support which are managed by a hash map
>> ihandle -> [phandle].
>>
>> Before the guest starts, the memory in use is:
>> 0..e60 - the initial firmware
>> 8000..10000 - stack
>> 400000.. - kernel
>> 3ea0000.. - initramdisk
>>
>> This OF CI does not implement "interpret".
>>
>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>> includes a disk image with pre-formatted nvram.
>>
>> With this basic support, this can only boot into kernel directly.
>> However this is just enough for the petitboot kernel and initramdisk to
>> boot from any possible source. Note this requires reasonably recent guest
>> kernel with:
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>
>>
>> The immediate benefit is much faster booting time which is especially
>> crucial with fully emulated early CPU bring up environments. Also this
>> may come in handy when/if GRUB-in-the-userspace sees the light of day.
>>
>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>> other POWERPC boards which do not support pSeries.
>>
>> This makes VOF optional: it is disabled by default, add --enable-vof
>> to ./configure to enable it.
>>
>> This assumes potential support for booting from QEMU backends
>> such as blockdev or netdev without devices/drivers used.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>
>> The example command line is:
>>
>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>> -nodefaults \
>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>> -mon id=MON0,chardev=STDIO0,mode=readline \
>> -nographic \
>> -vga none \
>> -enable-kvm \
>> -m 8G \
>> -machine 
>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>> \
>> -kernel pbuild/kernel-le-guest/vmlinux \
>> -initrd pb/rootfs.cpio.xz \
>> -drive 
>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>> \
>> -global spapr-nvram.drive=DRIVE0 \
>> -snapshot \
>> -smp 8,threads=8 \
>> -L /home/aik/t/qemu-ppc64-bios/ \
>> -trace events=qemu_trace_events \
>> -d guest_errors \
>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>> -mon chardev=SOCKET0,mode=control
> 
> I haven't looked at it in detail yet, just some quick comments I have on 
> first skim through.
> 
>> ---
>> Changes:
>> v21:
>> * s/ld/ldz/ in entry.S
> 
> Typo? Has this become lwz?

Yup, lwz.

> 
>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>> Kconfig
>> * made CONFIG_VOF optional
> 
> This won't work for pegasos2, see below.
> 
>> * s/l.lds/vof.lds/
>> * force 32 BE in spapr_machine_reset() instead of the firmware
>> * added checks for non-null methods of VofMachineIfClass
>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better 
>> comment
>> * added  path_offset wrapper for handling mixed case for addresses
>> after "@" in node names
>> * changed getprop() to check for actual "name" property in the fdt
>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike 
>> similar
>> rtas_ld/ldl_be_*) they return error codes
>> * VOF_MEM_READ uses now address_space_read (it was 
>> address_space_read_full
>> before, not sure why)
> [...]
>> ---
>> configure               |    9 +
>> pc-bios/vof/Makefile    |   23 +
>> include/hw/ppc/spapr.h  |   25 +-
>> include/hw/ppc/vof.h    |   55 ++
>> pc-bios/vof/vof.h       |   43 ++
>> hw/ppc/spapr.c          |   87 +++-
>> hw/ppc/spapr_hcall.c    |   29 +-
>> hw/ppc/spapr_vof.c      |  153 ++++++
>> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
>> pc-bios/vof/bootmem.c   |   14 +
>> pc-bios/vof/ci.c        |   91 ++++
>> pc-bios/vof/libc.c      |   92 ++++
>> pc-bios/vof/main.c      |   21 +
>> tests/qtest/rtas-test.c |   17 +-
>> MAINTAINERS             |   12 +
>> hw/ppc/Kconfig          |    3 +
>> hw/ppc/meson.build      |    3 +
>> hw/ppc/trace-events     |   24 +
>> meson.build             |    1 +
>> pc-bios/README          |    2 +
>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>> pc-bios/vof/entry.S     |   49 ++
>> pc-bios/vof/vof.lds     |   48 ++
>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>> create mode 100644 pc-bios/vof/Makefile
>> create mode 100644 include/hw/ppc/vof.h
>> create mode 100644 pc-bios/vof/vof.h
>> create mode 100644 hw/ppc/spapr_vof.c
>> create mode 100644 hw/ppc/vof.c
>> create mode 100644 pc-bios/vof/bootmem.c
>> create mode 100644 pc-bios/vof/ci.c
>> create mode 100644 pc-bios/vof/libc.c
>> create mode 100644 pc-bios/vof/main.c
>> create mode 100644 pc-bios/vof-nvram.bin
>> create mode 100755 pc-bios/vof.bin
>> create mode 100644 pc-bios/vof/entry.S
>> create mode 100644 pc-bios/vof/vof.lds
>>
>> diff --git a/configure b/configure
>> index 8dcb9965b24e..00dc29c027fa 100755
>> --- a/configure
>> +++ b/configure
>> @@ -445,6 +445,7 @@ fuse="auto"
>> fuse_lseek="auto"
>> multiprocess="auto"
>> slirp_smbd="$default_feature"
>> +vof="no"
> 
> Why is this disabled by default? I pretty much need VOF in pegasos2 as 
> there would be no other firmware otherwise. So it means I have to select 
> VOF in pegasos2 config and then VOF itself cannot be optional any more. 
> If you want it to be optional for spapr then you can't use CONFIG_VOF 
> for that but need to add a separate CONFIG_VOF_SPAPR or CONFIG_SPAPR_VOF 
> option that you can set to no by default even when CONFIG_VOF is yes and 
> make VOF usage conditional on that variable within spapr files.
> 
> Hope this makes sense. But I don't really see why you need to do that 
> when you already have this turned off by default for spapr unless the 
> x-vof option is used. Isn't that enough to make this optional? If not 
> then you need another spapr specific CONFIG_* variable because 
> CONFIG_PEGASOS2 has to select CONFIG_VOF as it will be its default 
> firmware. For the same reason you should not put it behind a config 
> option especially one that needs to be explicitly enabled.


Answered in the other mail.



>> malloc_trim="auto"
>> gio="$default_feature"
>> @@ -1561,6 +1562,10 @@ for opt do
>>   ;;
>>   --disable-slirp-smbd) slirp_smbd=no
>>   ;;
>> +  --enable-vof) vof=yes
>> +  ;;
>> +  --disable-vof) vof=no
>> +  ;;
>>   *)
>>       echo "ERROR: unknown option $opt"
>>       echo "Try '$0 --help' for more information"
>> @@ -1940,6 +1945,7 @@ disabled with --disable-FEATURE, default is 
>> enabled if available
>>   multiprocess    Out of process device emulation support
>>   gio             libgio support
>>   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
>> +  vof             Virtual Open Firmware support (powerpc/pseries, 
>> experimental)
>>
>> NOTE: The object files are built at the place where configure is launched
>> EOF
>> @@ -5555,6 +5561,9 @@ if test "$slirp_smbd" = "yes" ; then
>>   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
>>   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
>> fi
>> +if test "$vof" = "yes" ; then
>> +  echo "CONFIG_VOF=y" >> $config_host_mak
>> +fi
>> if test "$vde" = "yes" ; then
>>   echo "CONFIG_VDE=y" >> $config_host_mak
>>   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
>> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
>> new file mode 100644
>> index 000000000000..aa1678c4d889
>> --- /dev/null
>> +++ b/pc-bios/vof/Makefile
>> @@ -0,0 +1,23 @@
>> +all: build-all
>> +
>> +build-all: vof.bin
>> +
>> +CROSS ?=
>> +CC = $(CROSS)gcc
>> +LD = $(CROSS)ld
>> +OBJCOPY = $(CROSS)objcopy
>> +
>> +%.o: %.S
>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
>> +
>> +%.o: %.c
>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o 
>> $@ $<
>> +
>> +vof.elf: entry.o main.o ci.o bootmem.o libc.o
>> +    $(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
>> +
>> +%.bin: %.elf
>> +    $(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
>> +
>> +clean:
>> +    rm -f *.o vof.bin vof.elf *~
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index f05219f75ef6..39b5581ae650 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -12,6 +12,9 @@
>> #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>> #include "hw/ppc/xics.h"        /* For ICSState */
>> #include "hw/ppc/spapr_tpm_proxy.h"
>> +#ifdef CONFIG_VOF
>> +#include "hw/ppc/vof.h"
>> +#endif
>>
>> struct SpaprVioBus;
>> struct SpaprPhbState;
>> @@ -180,6 +183,9 @@ struct SpaprMachineState {
>>     uint64_t kernel_addr;
>>     uint32_t initrd_base;
>>     long initrd_size;
>> +#ifdef CONFIG_VOF
>> +    Vof *vof;
>> +#endif
>>     uint64_t rtc_offset; /* Now used only during incoming migration */
>>     struct PPCTimebase tb;
>>     bool has_graphics;
>> @@ -558,7 +564,9 @@ struct SpaprMachineState {
>> /* Client Architecture support */
>> #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>> #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
>> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>>
>> /*
>>  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
>> @@ -956,4 +964,19 @@ bool spapr_check_pagesize(SpaprMachineState 
>> *spapr, hwaddr pagesize,
>> void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
>> hwaddr spapr_get_rtas_addr(void);
>> bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
>> +
>> +#ifdef CONFIG_VOF
>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>> +                     target_ulong *stack_ptr, Error **errp);
>> +void spapr_vof_quiesce(MachineState *ms);
>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char 
>> *propname,
>> +                       void *val, int vallen);
>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>> *spapr,
>> +                                target_ulong opcode, target_ulong 
>> *args);
>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>> +                                                   CPUState *cs,
>> +                                                   target_ulong 
>> ovec_addr);
>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
>> +#endif
>> +
>> #endif /* HW_SPAPR_H */
>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>> new file mode 100644
>> index 000000000000..65ca2fed0d41
>> --- /dev/null
>> +++ b/include/hw/ppc/vof.h
>> @@ -0,0 +1,55 @@
>> +/*
>> + * Virtual Open Firmware
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +#ifndef HW_VOF_H
>> +#define HW_VOF_H
>> +
>> +typedef struct Vof {
>> +    uint64_t top_addr; /* copied from rma_size */
>> +    GArray *claimed; /* array of SpaprOfClaimed */
>> +    uint64_t claimed_base;
>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>> +    uint32_t of_instance_last;
>> +    char *bootargs;
>> +    long fw_size;
>> +} Vof;
>> +
>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>> +                    target_ulong args_real);
>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>> align);
>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>> +void vof_cleanup(Vof *vof);
>> +void vof_build_dt(void *fdt, Vof *vof);
>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>> *nodename,
>> +                               const char *prop, const char *path);
>> +
>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>> +
>> +typedef struct VofMachineIfClass VofMachineIfClass;
>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>> TYPE_VOF_MACHINE_IF)
>> +
>> +struct VofMachineIfClass {
>> +    InterfaceClass parent;
>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>> CPUState *cs,
>> +                                                target_ulong vec);
>> +    void (*quiesce)(MachineState *ms);
>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>> *propname,
>> +                    void *val, int vallen);
>> +};
>> +
>> +/*
>> + * Initial stack size is from
>> + * 
>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>
>> + */
>> +#define VOF_STACK_SIZE       0x8000
> 
> Maybe also add a define for RTAS_SIZE here? We'll need to put that in 
> the device tree but it depends on the rtas shim size that's part of VOF 
> so it should be defined here instead of hardcoding it in boards that use 
> VOF so it can be updated later at one place if needed.

This is rtas-size for pseries:

_FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
           ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t)));

=> depends on cpus => depends on the command line.


RTAS_SIZE is not used by anything in pseries anymore, I'll send a patch 
to ditch it.


> 
>> +
>> +#define VOF_MEM_READ(pa, buf, size) \
>> +    address_space_read(&address_space_memory, \
>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>> +#define VOF_MEM_WRITE(pa, buf, size) \
>> +    address_space_write(&address_space_memory, \
>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
> 
> These aren't very useful without the struct definition that you 
> typically want to read data into using these.

These do not read into structs; the destination is either a string buffer, 
cell(s) or a binary blob. And they can return errors if the address is bad, 
which is the reason for them to exist.


>> +
>> +#endif /* HW_VOF_H */
> [...]
>> diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
>> new file mode 100644
>> index 000000000000..1068a1e58388
>> --- /dev/null
>> +++ b/hw/ppc/vof.c
>> +
>> +/* Defined as Big Endian */
>> +struct prom_args {
>> +    uint32_t service;
>> +    uint32_t nargs;
>> +    uint32_t nret;
>> +    uint32_t args[10];
>> +} QEMU_PACKED;
> 
> I mean this one, this could be in vof.h too.


What is going to need it in the header?



> But this may better be in a 
> generic rtas.h with the rtas_* macros so maybe done at a later point. So 
> maybe just forget it for now.
>> +
>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>> +                    target_ulong args_real)
>> +{
>> +    struct prom_args args_be;
>> +    uint32_t args[ARRAY_SIZE(args_be.args)];
>> +    uint32_t rets[ARRAY_SIZE(args_be.args)] = { 0 }, ret;
>> +    char service[64];
>> +    unsigned nargs, nret, i;
>> +
>> +    if (address_space_rw(&address_space_memory, args_real,
>> +                         MEMTXATTRS_UNSPECIFIED, &args_be, 
>> sizeof(args_be),
>> +                         false) != MEMTX_OK) {
>> +        return -EINVAL;
>> +    }
>> +    nargs = be32_to_cpu(args_be.nargs);
>> +    if (nargs >= ARRAY_SIZE(args_be.args)) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (address_space_rw(&address_space_memory, 
>> be32_to_cpu(args_be.service),
>> +                         MEMTXATTRS_UNSPECIFIED, service, 
>> sizeof(service),
>> +                         false) != MEMTX_OK) {
>> +        return -EINVAL;
>> +    }
>> +    if (strnlen(service, sizeof(service)) == sizeof(service)) {
>> +        /* Too long service name */
>> +        return -EINVAL;
>> +    }
>> +
>> +    for (i = 0; i < nargs; ++i) {
>> +        args[i] = be32_to_cpu(args_be.args[i]);
>> +    }
>> +
>> +    nret = be32_to_cpu(args_be.nret);
>> +    ret = vof_client_handle(ms, fdt, vof, service, args, nargs, rets, 
>> nret);
>> +    if (!nret) {
>> +        return 0;
>> +    }
>> +
>> +    args_be.args[nargs] = cpu_to_be32(ret);
>> +    for (i = 1; i < nret; ++i) {
>> +        args_be.args[nargs + i] = cpu_to_be32(rets[i - 1]);
>> +    }
>> +
>> +    if (address_space_rw(&address_space_memory,
>> +                         args_real + offsetof(struct prom_args, 
>> args[nargs]),
>> +                         MEMTXATTRS_UNSPECIFIED, args_be.args + nargs,
>> +                         sizeof(args_be.args[0]) * nret, true) != 
>> MEMTX_OK) {
>> +        return -EINVAL;
> 
> Also you're still not using your macros here. Why?

Because it was quick cut-n-paste with some prototypes fixing :) My bad, 
I'll fix it.


> 
> Regards,
> BALATON Zoltan
BALATON Zoltan June 16, 2021, 10:26 a.m. UTC | #5
On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
> On 6/16/21 07:09, BALATON Zoltan wrote:
>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>> The PAPR platform describes an OS environment that's presented by
>>> a combination of a hypervisor and firmware. The features it specifies
>>> require collaboration between the firmware and the hypervisor.
>>> 
>>> Since the beginning, the runtime component of the firmware (RTAS) has
>>> been implemented as a 20 byte shim which simply forwards it to
>>> a hypercall implemented in qemu. The boot time firmware component is
>>> SLOF - but a build that's specific to qemu, and has always needed to be
>>> updated in sync with it. Even though we've managed to limit the amount
>>> of runtime communication we need between qemu and SLOF, there's some,
>>> and it has become increasingly awkward to handle as we've implemented
>>> new features.
>>> 
>>> This implements a boot time OF client interface (CI) which is
>>> enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
>>> Firmware"). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>>> which implements Open Firmware Client Interface (OF CI). This allows
>>> using a smaller stateless firmware which does not have to manage
>>> the device tree.
>>> 
>>> The new "vof.bin" firmware image is included with source code under
>>> pc-bios/. It also includes RTAS blob.
>>> 
>>> This implements a handful of CI methods just to get -kernel/-initrd
>>> working. In particular, this implements the device tree fetching and
>>> simple memory allocator - "claim" (an OF CI memory allocator) and updates
>>> "/memory@0/available" to tell the client about available memory.
>>> 
>>> This implements changing some device tree properties which we know how
>>> to deal with, the rest is ignored. To allow changes, this skips
>>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>>> appending.
>>> 
>>> In absence of SLOF, this assigns phandles to device tree nodes to make
>>> device tree traversing work.
>>> 
>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>> 
>>> This adds basic instances support which are managed by a hash map
>>> ihandle -> [phandle].
>>> 
>>> Before the guest started, the used memory is:
>>> 0..e60 - the initial firmware
>>> 8000..10000 - stack
>>> 400000.. - kernel
>>> 3ea0000.. - initramdisk
>>> 
>>> This OF CI does not implement "interpret".
>>> 
>>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>>> includes a disk image with pre-formatted nvram.
>>> 
>>> With this basic support, this can only boot into kernel directly.
>>> However this is just enough for the petitboot kernel and initramdisk to
>>> boot from any possible source. Note this requires reasonably recent guest
>>> kernel with:
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>> 
>>> The immediate benefit is much faster booting time, which is especially
>>> crucial with fully emulated early CPU bring-up environments. Also this
>>> may come in handy when/if GRUB-in-the-userspace sees the light of day.
>>> 
>>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>>> other POWERPC boards which do not support pSeries.
>>> 
>>> This makes VOF optional; it is disabled by default, add --enable-vof
>>> to ./configure to enable it.
>>> 
>>> This assumes potential support for booting from QEMU backends
>>> such as blockdev or netdev without devices/drivers used.
>>> 
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> ---
>>> 
>>> The example command line is:
>>> 
>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>> -nodefaults \
>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>> -nographic \
>>> -vga none \
>>> -enable-kvm \
>>> -m 8G \
>>> -machine 
>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>> \
>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>> -initrd pb/rootfs.cpio.xz \
>>> -drive 
>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>> \
>>> -global spapr-nvram.drive=DRIVE0 \
>>> -snapshot \
>>> -smp 8,threads=8 \
>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>> -trace events=qemu_trace_events \
>>> -d guest_errors \
>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>> -mon chardev=SOCKET0,mode=control
>>> 
>>> ---
>>> Changes:
>>> v21:
>>> * s/ld/ldz/ in entry.S
>>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>>> Kconfig
>>> * made CONFIG_VOF optional
>>> * s/l.lds/vof.lds/
>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>> * added checks for non-null methods of VofMachineIfClass
>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better comment
>>> * added  path_offset wrapper for handling mixed case for addresses
>>> after "@" in node names
>>> * changed getprop() to check for actual "name" property in the fdt
>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike similar
>>> rtas_ld/ldl_be_*) they return error codes
>>> * VOF_MEM_READ uses now address_space_read (it was address_space_read_full
>>> before, not sure why)
>>> 
>>> v20:
>>> * compile vof.bin with -mcpu=power4 for better compatibility
>>> * s/std/stw/ in entry.S to make it work on ppc32
>>> * fixed dt_available property to support both 32 and 64bit
>>> * shuffled prom_args handling code
>>> * do not enforce 32bit in MSR (again, to support 32bit platforms)
>>> 
>>> v19:
>>> * put bootargs in the FDT
>>> * moved setting properties to a VOF machine hook
>>> * moved fw_size and claim for it to vof_init()
>>> * added CROSS to the VOF's makefile
>>> * simplified phandles assigning
>>> * pass MachineState to all machine hooks instead of calling
>>> qdev_get_machine (following QOM)
>>> * bunch of smaller changes and added comments
>>> * added simple test to attempt to start with x-vof=on
>>> 
>>> v18:
>>> * fixed top addr (max address for "claim") on radix - it equals to 
>>> ram_size
>>> and vof->top_addr was uint32_t
>>> * fixed "available" property which got broken in v14 but it is only 
>>> visible
>>> to clients which care (== grub)
>>> * reshuffled vof_dt_memory_available() calls, added vof_init() to allow
>>> vof_claim() before rendering the FDT
>>> 
>>> v17:
>>> * mv hw/ppc/vof.h include/hw/ppc/vof.h
>>> * VofMachineIfClass -> VofMachineClass; it is not VofMachineInterface as
>>> nobody used this scheme, usually "Interface" is dropped, a couple of times
>>> it is "xxxInterfaceClass" or "xxxIfClass"; I used the latter as it is
>>> used by include/hw/vmstate-if.h
>>> * added SPDX
>>> * other fixes from v16 review
>>> 
>>> v16:
>>> * rebased on dwg/ppc-for-6.1
>>> * s/SpaprVofInterface/VofMachineInterface/
>>> 
>>> v15:
>>> * bugfix: claimed memory for the VOF itself
>>> * ditched OF_STACK_ADDR and allocate one instead, now it starts from 
>>> 0x8000
>>> because it is aligned to its size (no particular reason though)
>>> * coding style
>>> * moved nvram.bin up one level
>>> * ditched bool in the firmware
>>> * made debugging code conditional using trace_event_get_state() + 
>>> qemu_loglevel_mask()
>>> * renamed the CAS interface to SpaprVofInterface
>>> * added "write" which for now dumps the message and ihandle via
>>> trace point for early debug assistance
>>> * commented on when we allocate of_instances in vof_build_dt()
>>> * store fw_size in SpaprMachine to let spapr_vof_reset() claim it
>>> * many small fixes from v14's review
>>> 
>>> v14:
>>> * check for truncates in readstr()
>>> * ditched a separate vof_reset()
>>> * spapr->vof is a pointer now, dropped the "on" field
>>> * removed rtas_base from vof and updated comment why we allow setting it
>>> * added myself to maintainers
>>> * updated commit log about blockdev and other possible platforms
>>> * added a note why new hcall is 0x5
>>> * no in-place endianness conversion in spapr_h_vof_client
>>> * converted all cpu_physical_memory_read/write to address_space_rw
>>> * git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c
>>> 
>>> v13:
>>> * rebase on latest ppc-for-6.0
>>> * shuffled code around to touch spapr.c less
>>> 
>>> v12:
>>> * split VOF and SPAPR
>>> 
>>> v11:
>>> * added g_autofree
>>> * fixed gcc warnings
>>> * fixed few leaks
>>> * added nvram image to make "nvram --print-config" not crash;
>>> Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum size
>>> is 16K, or it just does not work (empty output from "nvram")
>>> 
>>> v10:
>>> * now rebased to compile with meson
>>> 
>>> v9:
>>> * remove special handling of /rtas/rtas-size as now we always add it in 
>>> QEMU
>>> * removed leftovers from scsi/grub/stdout/stdin/...
>>> 
>>> v8:
>>> * no read/write/seek
>>> * no @dev in instances
>>> * the machine flag is "x-vof" for now
>>> 
>>> v7:
>>> * now we have a small firmware which loads at 0 as SLOF and starts from
>>> 0x100 as SLOF
>>> * no MBR/ELF/GRUB business in QEMU anymore
>>> * blockdev is a separate patch
>>> * networking is a separate patch
>>> 
>>> v6:
>>> * borrowed a big chunk of commit log introduction from David
>>> * fixed initial stack pointer (points to the highest address of stack)
>>> * traces for "interpret" and others
>>> * disabled  translate_kernel_address() hack so grub can load (work in
>>> progress)
>>> * added "milliseconds" for grub
>>> * fixed "claim" allocator again
>>> * moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too for CAS
>>> * moved the most code possible from spapr.c to spapr_of_client.c, such as
>>> RTAS, prom entry and FDT build/finalize
>>> * separated blobs
>>> * GRUB now proceeds to its console prompt (there are still other issues)
>>> * parse MBR/GPT to find PReP and load GRUB
>>> 
>>> v5:
>>> * made instances keep device and chardev pointers
>>> * removed VIO dependencies
>>> * print error if RTAS memory is not claimed as it should have been
>>> * pack FDT as "quiesce"
>>> 
>>> v4:
>>> * fixed open
>>> * validate ihandles in "call-method"
>>> 
>>> v3:
>>> * fixed phandles allocation
>>> * s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
>>> * fixed size of /chosen/stdout
>>> * bunch of renames
>>> * do not create rtas properties at all, let the client deal with it;
>>> instead setprop allows changing these in the FDT
>>> * no more packing FDT when bios=off - nobody needs it and getprop does not
>>> work otherwise
>>> * allow updating initramdisk device tree properties (for zImage)
>>> * added instances
>>> * fixed stdout on OF's "write"
>>> * removed special handling for stdout in OF client, spapr-vty handles it
>>> instead
>>> 
>>> v2:
>>> * fixed claim()
>>> * added "setprop"
>>> * cleaner client interface and RTAS blobs management
>>> * boots to petitboot and further to the target system
>>> * more trace points
>>> 
>>> v20
>>> 
>>> v20!
>>> ---
>>> configure               |    9 +
>>> pc-bios/vof/Makefile    |   23 +
>>> include/hw/ppc/spapr.h  |   25 +-
>>> include/hw/ppc/vof.h    |   55 ++
>>> pc-bios/vof/vof.h       |   43 ++
>>> hw/ppc/spapr.c          |   87 +++-
>>> hw/ppc/spapr_hcall.c    |   29 +-
>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
>>> pc-bios/vof/bootmem.c   |   14 +
>>> pc-bios/vof/ci.c        |   91 ++++
>>> pc-bios/vof/libc.c      |   92 ++++
>>> pc-bios/vof/main.c      |   21 +
>>> tests/qtest/rtas-test.c |   17 +-
>>> MAINTAINERS             |   12 +
>>> hw/ppc/Kconfig          |    3 +
>>> hw/ppc/meson.build      |    3 +
>>> hw/ppc/trace-events     |   24 +
>>> meson.build             |    1 +
>>> pc-bios/README          |    2 +
>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>> pc-bios/vof/entry.S     |   49 ++
>>> pc-bios/vof/vof.lds     |   48 ++
>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>> create mode 100644 pc-bios/vof/Makefile
>>> create mode 100644 include/hw/ppc/vof.h
>>> create mode 100644 pc-bios/vof/vof.h
>>> create mode 100644 hw/ppc/spapr_vof.c
>>> create mode 100644 hw/ppc/vof.c
>>> create mode 100644 pc-bios/vof/bootmem.c
>>> create mode 100644 pc-bios/vof/ci.c
>>> create mode 100644 pc-bios/vof/libc.c
>>> create mode 100644 pc-bios/vof/main.c
>>> create mode 100644 pc-bios/vof-nvram.bin
>>> create mode 100755 pc-bios/vof.bin
>>> create mode 100644 pc-bios/vof/entry.S
>>> create mode 100644 pc-bios/vof/vof.lds
>>> 
>>> diff --git a/configure b/configure
>>> index 8dcb9965b24e..00dc29c027fa 100755
>>> --- a/configure
>>> +++ b/configure
>>> @@ -445,6 +445,7 @@ fuse="auto"
>>> fuse_lseek="auto"
>>> multiprocess="auto"
>>> slirp_smbd="$default_feature"
>>> +vof="no"
>>> 
>>> malloc_trim="auto"
>>> gio="$default_feature"
>>> @@ -1561,6 +1562,10 @@ for opt do
>>>   ;;
>>>   --disable-slirp-smbd) slirp_smbd=no
>>>   ;;
>>> +  --enable-vof) vof=yes
>>> +  ;;
>>> +  --disable-vof) vof=no
>>> +  ;;
>>>   *)
>>>       echo "ERROR: unknown option $opt"
>>>       echo "Try '$0 --help' for more information"
>>> @@ -1940,6 +1945,7 @@ disabled with --disable-FEATURE, default is enabled 
>>> if available
>>>   multiprocess    Out of process device emulation support
>>>   gio             libgio support
>>>   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
>>> +  vof             Virtual Open Firmware support (powerpc/pseries, 
>>> experimental)
>>> 
>>> NOTE: The object files are built at the place where configure is launched
>>> EOF
>>> @@ -5555,6 +5561,9 @@ if test "$slirp_smbd" = "yes" ; then
>>>   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
>>>   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
>>> fi
>>> +if test "$vof" = "yes" ; then
>>> +  echo "CONFIG_VOF=y" >> $config_host_mak
>>> +fi
>>> if test "$vde" = "yes" ; then
>>>   echo "CONFIG_VDE=y" >> $config_host_mak
>>>   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
>> 
>> In case I could not explain it clearly in my previous message I think the 
>> solution we want here is to drop these configure changes and let Kconfig 
>> configure this. The CONFIG_VOF option decides if vof itself is built (adds 
>> vof.c) and pegasos2 will select this so it will usually be yes by default. 
>> Your problem is that you're trying to use this variable in spapr to make it 
>> off by default but that does not work. You need to add another option for 
>> that (e.g. CONFIG_VOF_SPAPR or CONFIG_SPAPR_VOF whichever makes more sense) 
>> then you can set that to no despite CONFIG_VOF is yes and use that variable 
>> in spapr files and to add spapr_vof.c. Then no configure option is needed 
>> which does not even work for me: I get compile errors saying 'poisoning 
>> existing macro "CONFIG_VOF"' if I try with --enable-vof or spapr fails to 
>> build if I try without --enable-vof but select CONFIG_VOF from pegasos2. I 
>> hope this makes sense now.
>
>
> My problem is that I do not understand when we want VOF to be compiled in by 
> default and when we do not. For a moment I thought we do not want it in by 
> default but now it sounds like we do. If that is so, then CONFIG_VOF + 
> selecting it from PSERIES and PEGASOS should do. Or I am missing the point 
> again?

I don't know what spapr wants, for pegasos2 VOF will be the default 
firmware and I want pegasos2 to be enabled by default (like other boards) 
so this means CONFIG_VOF will also be enabled by default via select VOF in 
CONFIG_PEGASOS2. So if the x-vof option in spapr is not enough and you 
want to be able to configure it off for spapr then you need another 
CONFIG_something option for that and cannot rely on CONFIG_VOF for it 
because CONFIG_VOF is on if any board that uses VOF is compiled. If you're 
OK with compiling it in but disabled by x-vof by default then no need to 
have another option. In both cases you'll have to select VOF somewhere for 
your board, either in CONFIG_PSERIES or in the new option that decides if 
VOF is built for spapr. At least that's how I understand Kconfig.

>>> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
>>> new file mode 100644
>>> index 000000000000..aa1678c4d889
>>> --- /dev/null
>>> +++ b/pc-bios/vof/Makefile
>>> @@ -0,0 +1,23 @@
>>> +all: build-all
>>> +
>>> +build-all: vof.bin
>>> +
>>> +CROSS ?=
>>> +CC = $(CROSS)gcc
>>> +LD = $(CROSS)ld
>>> +OBJCOPY = $(CROSS)objcopy
>>> +
>>> +%.o: %.S
>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
>>> +
>>> +%.o: %.c
>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o $@ $<
>>> +
>>> +vof.elf: entry.o main.o ci.o bootmem.o libc.o
>>> +    $(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
>>> +
>>> +%.bin: %.elf
>>> +    $(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
>>> +
>>> +clean:
>>> +    rm -f *.o vof.bin vof.elf *~
>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>> index f05219f75ef6..39b5581ae650 100644
>>> --- a/include/hw/ppc/spapr.h
>>> +++ b/include/hw/ppc/spapr.h
>>> @@ -12,6 +12,9 @@
>>> #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>>> #include "hw/ppc/xics.h"        /* For ICSState */
>>> #include "hw/ppc/spapr_tpm_proxy.h"
>>> +#ifdef CONFIG_VOF
>>> +#include "hw/ppc/vof.h"
>>> +#endif
>>> 
>>> struct SpaprVioBus;
>>> struct SpaprPhbState;
>>> @@ -180,6 +183,9 @@ struct SpaprMachineState {
>>>     uint64_t kernel_addr;
>>>     uint32_t initrd_base;
>>>     long initrd_size;
>>> +#ifdef CONFIG_VOF
>> 
>> So this can't be CONFIG_VOF here if you want to be able to set it to no 
>> despite pegasos2 pulling in VOF so you need another SPAPR specific 
>
> If VOF is compiled in, why would I want it to be still disabled for PSERIES? 
> The code is in, let it work.

That's something to decide for spapr maintainers, I just want to be able 
to use CONFIG_VOF from CONFIG_PEGASOS2 and be it on by default.

>> option for that in spapr specific parts with CONFIG_VOF selecting if vof 
>> itself is built if any board uses it. So CONFIG_PEGASOS2 has to select 
>> CONFIG_VOF and your SPAPR_VOF option should too if it's enabled that way 
>> vof.c will be added if either board is built but for SPAPR only if its VOF 
>> option is on.
>> 
>>> +    Vof *vof;
>>> +#endif
>>>     uint64_t rtc_offset; /* Now used only during incoming migration */
>>>     struct PPCTimebase tb;
>>>     bool has_graphics;
>>> @@ -558,7 +564,9 @@ struct SpaprMachineState {
>>> /* Client Architecture support */
>>> #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>>> #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>>> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
>>> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>>> 
>>> /*
>>>  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
>>> @@ -956,4 +964,19 @@ bool spapr_check_pagesize(SpaprMachineState *spapr, 
>>> hwaddr pagesize,
>>> void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
>>> hwaddr spapr_get_rtas_addr(void);
>>> bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
>>> +
>>> +#ifdef CONFIG_VOF
>>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>>> +                     target_ulong *stack_ptr, Error **errp);
>>> +void spapr_vof_quiesce(MachineState *ms);
>>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char 
>>> *propname,
>>> +                       void *val, int vallen);
>>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>>> *spapr,
>>> +                                target_ulong opcode, target_ulong *args);
>>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>>> +                                                   CPUState *cs,
>>> +                                                   target_ulong 
>>> ovec_addr);
>>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
>>> +#endif
>>> +
>>> #endif /* HW_SPAPR_H */
>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>> new file mode 100644
>>> index 000000000000..65ca2fed0d41
>>> --- /dev/null
>>> +++ b/include/hw/ppc/vof.h
>>> @@ -0,0 +1,55 @@
>>> +/*
>>> + * Virtual Open Firmware
>>> + *
>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>> + */
>>> +#ifndef HW_VOF_H
>>> +#define HW_VOF_H
>>> +
>>> +typedef struct Vof {
>>> +    uint64_t top_addr; /* copied from rma_size */
>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>> +    uint64_t claimed_base;
>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>> +    uint32_t of_instance_last;
>>> +    char *bootargs;
>>> +    long fw_size;
>>> +} Vof;
>>> +
>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>> +                    target_ulong args_real);
>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>>> align);
>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>> +void vof_cleanup(Vof *vof);
>>> +void vof_build_dt(void *fdt, Vof *vof);
>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
>>> +                               const char *prop, const char *path);
>>> +
>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>> +
>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>> TYPE_VOF_MACHINE_IF)
>>> +
>>> +struct VofMachineIfClass {
>>> +    InterfaceClass parent;
>>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>>> CPUState *cs,
>>> +                                                target_ulong vec);
>>> +    void (*quiesce)(MachineState *ms);
>>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>>> *propname,
>>> +                    void *val, int vallen);
>>> +};
>>> +
>>> +/*
>>> + * Initial stack size is from
>>> + * 
>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>> 
>> I wonder if it's better to quote the section number and the title of the 
>> doc in case the URL here goes away in the future.
>
>
> The binding (the URL clearly suggests it is a "binding") says 32K is the 
> minimum, what else is here to quote? The doc does not explain why anyway.

I thought maybe saying "section x.x of OpenFirmware PPC binding says 
minimum stack size is 32K" which can also be understood if the link points 
to nowhere in the future. You can still add a link if you want but just 
have the relevant info in the comment so one does not need to read the 
whole doc to find it.

>
>>> + */
>>> +#define VOF_STACK_SIZE       0x8000
>>> +
>>> +#define VOF_MEM_READ(pa, buf, size) \
>>> +    address_space_read(&address_space_memory, \
>>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>>> +#define VOF_MEM_WRITE(pa, buf, size) \
>>> +    address_space_write(&address_space_memory, \
>>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>>> +
>>> +#endif /* HW_VOF_H */
>>> diff --git a/pc-bios/vof/vof.h b/pc-bios/vof/vof.h
>>> new file mode 100644
>>> index 000000000000..2d8958076907
>>> --- /dev/null
>>> +++ b/pc-bios/vof/vof.h
>>> @@ -0,0 +1,43 @@
>>> +/*
>>> + * Virtual Open Firmware
>>> + *
>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>> + */
>>> +#include <stdarg.h>
>>> +
>>> +typedef unsigned char uint8_t;
>>> +typedef unsigned short uint16_t;
>>> +typedef unsigned long uint32_t;
>>> +typedef unsigned long long uint64_t;
>>> +#define NULL (0)
>>> +#define PROM_ERROR (-1u)
>>> +typedef unsigned long ihandle;
>>> +typedef unsigned long phandle;
>>> +typedef int size_t;
>>> +typedef void client(void);
>>> +
>>> +/* globals */
>>> +extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) 
>>> */
>>> +
>>> +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
>>> +
>>> +/* libc */
>>> +int strlen(const char *s);
>>> +int strcmp(const char *s1, const char *s2);
>>> +void *memcpy(void *dest, const void *src, size_t n);
>>> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
>>> +void *memmove(void *dest, const void *src, size_t n);
>>> +void *memset(void *dest, int c, size_t size);
>>> +
>>> +/* CI wrappers */
>>> +void ci_panic(const char *str);
>>> +phandle ci_finddevice(const char *path);
>>> +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int 
>>> len);
>>> +
>>> +/* booting from -kernel */
>>> +void boot_from_memory(uint64_t initrd, uint64_t initrdsize);
>>> +
>>> +/* Entry points for CI and RTAS */
>>> +extern uint32_t ci_entry(uint32_t params);
>>> +extern unsigned long hv_rtas(unsigned long params);
>>> +extern unsigned int hv_rtas_size;
>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>> index 4dd90b75cc52..6d747d72c614 100644
>>> --- a/hw/ppc/spapr.c
>>> +++ b/hw/ppc/spapr.c
>>> @@ -101,6 +101,7 @@
>>> #define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
>>> #define FW_MAX_SIZE             0x400000
>>> #define FW_FILE_NAME            "slof.bin"
>>> +#define FW_FILE_NAME_VOF        "vof.bin"
>>> #define FW_OVERHEAD             0x2800000
>>> #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
>>> 
>>> @@ -1639,22 +1640,40 @@ static void spapr_machine_reset(MachineState 
>>> *machine)
>>>     fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;
>>> 
>>>     fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
>>> +#ifdef CONFIG_VOF
>>> +    if (spapr->vof) {
>>> +        target_ulong stack_ptr = 0;
>>> 
>>> -    rc = fdt_pack(fdt);
>>> +        spapr_vof_reset(spapr, fdt, &stack_ptr, &error_fatal);
>>> 
>>> -    /* Should only fail if we've built a corrupted tree */
>>> -    assert(rc == 0);
>>> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
>>> +                                  stack_ptr, spapr->initrd_base,
>>> +                                  spapr->initrd_size);
>>> +        /* VOF is 32bit BE so enforce MSR here */
>>> +        first_ppc_cpu->env.msr &= ~((1ULL << MSR_SF) | (1ULL << MSR_LE));
>>> +        /*
>>> +         * Do not pack the FDT as the client may change properties.
>>> +         * VOF client does not expect the FDT so we do not load it to the 
>>> VM.
>>> +         */
>>> +    } else
>>> +#endif
>>> +    {
>>> +        rc = fdt_pack(fdt);
>>> +        /* Should only fail if we've built a corrupted tree */
>>> +        assert(rc == 0);
>>> 
>>> -    /* Load the fdt */
>>> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
>>> +                                  0, fdt_addr, 0);
>>> +        cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>>> +    }
>>>     qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
>>> -    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>>> +
>>>     g_free(spapr->fdt_blob);
>>>     spapr->fdt_size = fdt_totalsize(fdt);
>>>     spapr->fdt_initial_size = spapr->fdt_size;
>>>     spapr->fdt_blob = fdt;
>>> 
>>>     /* Set up the entry state */
>>> -    spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, 
>>> fdt_addr, 0);
>>>     first_ppc_cpu->env.gpr[5] = 0;
>>> 
>>>     spapr->fwnmi_system_reset_addr = -1;
>>> @@ -2657,7 +2676,12 @@ static void spapr_machine_init(MachineState 
>>> *machine)
>>>     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
>>>     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
>>>     MachineClass *mc = MACHINE_GET_CLASS(machine);
>>> -    const char *bios_name = machine->firmware ?: FW_FILE_NAME;
>>> +    const char *bios_default =
>>> +#ifdef CONFIG_VOF
>>> +        !!spapr->vof ? FW_FILE_NAME_VOF :
>> 
>> Does !! make sense here? I think testing for non-0 does not need it so you 
>> could just write spapr->vof without !!.
>
>
> I find c operator precedence confusing at times. Unary operators like "!" are 
> easy to read though.

OK but it's not needed here at all. With or without !! you should get the 
same result, !! is only needed if you need to make sure value is bool and 
not some number which is not needed here, you just test if spapr->vof is 0 
or not. So writing just that is simpler and people not familiar with !! 
won't be confused. (I had somebody ask about !! before in one of my 
patches so I think this should only be used where necessary.) Also fewer 
operators in an expression means less precedence to care about ;-)

>
>> 
>>> +#endif
>>> +        FW_FILE_NAME;
>>> +    const char *bios_name = machine->firmware ?: bios_default;
>>>     const char *kernel_filename = machine->kernel_filename;
>>>     const char *initrd_filename = machine->initrd_filename;
>>>     PCIHostState *phb;
>>> @@ -3014,6 +3038,12 @@ static void spapr_machine_init(MachineState 
>>> *machine)
>>>     }
>>> 
>>>     qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
>>> +#ifdef CONFIG_VOF
>>> +    if (spapr->vof) {
>>> +        spapr->vof->fw_size = fw_size; /* for claim() on itself */
>>> +        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, 
>>> spapr_h_vof_client);
>>> +    }
>>> +#endif
>>> }
>>> 
>>> #define DEFAULT_KVM_TYPE "auto"
>>> @@ -3204,6 +3234,30 @@ static void spapr_set_resize_hpt(Object *obj, const 
>>> char *value, Error **errp)
>>>     }
>>> }
>>> 
>>> +#ifdef CONFIG_VOF
>>> +static bool spapr_get_vof(Object *obj, Error **errp)
>>> +{
>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>>> +
>>> +    return spapr->vof != NULL;
>>> +}
>>> +
>>> +static void spapr_set_vof(Object *obj, bool value, Error **errp)
>>> +{
>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>>> +
>>> +    if (spapr->vof) {
>>> +        vof_cleanup(spapr->vof);
>>> +        g_free(spapr->vof);
>>> +        spapr->vof = NULL;
>>> +    }
>>> +    if (!value) {
>>> +        return;
>>> +    }
>>> +    spapr->vof = g_malloc0(sizeof(*spapr->vof));
>>> +}
>>> +#endif
>>> +
>>> static char *spapr_get_ic_mode(Object *obj, Error **errp)
>>> {
>>>     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>>> @@ -3329,6 +3383,12 @@ static void spapr_instance_init(Object *obj)
>>>                                     stringify(KERNEL_LOAD_ADDR)
>>>                                     " for -kernel is the default");
>>>     spapr->kernel_addr = KERNEL_LOAD_ADDR;
>>> +#ifdef CONFIG_VOF
>>> +    object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
>>> +    object_property_set_description(obj, "x-vof",
>>> +                                    "Enable Virtual Open Firmware 
>>> (experimental)");
>>> +#endif
>>> +
>>>     /* The machine class defines the default interrupt controller mode */
>>>     spapr->irq = smc->irq;
>>>     object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
>>> @@ -4580,6 +4640,16 @@ static void spapr_machine_class_init(ObjectClass 
>>> *oc, void *data)
>>>     smc->smp_threads_vsmt = true;
>>>     smc->nr_xirqs = SPAPR_NR_XIRQS;
>>>     xfc->match_nvt = spapr_match_nvt;
>>> +
>>> +#ifdef CONFIG_VOF
>>> +    {
>>> +        VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
>>> +        vmc->client_architecture_support =
>>> +            spapr_vof_client_architecture_support;
>>> +        vmc->quiesce = spapr_vof_quiesce;
>>> +        vmc->setprop = spapr_vof_setprop;
>>> +    }
>>> +#endif
>>> }
>>> 
>>> static const TypeInfo spapr_machine_info = {
>>> @@ -4599,6 +4669,9 @@ static const TypeInfo spapr_machine_info = {
>>>         { TYPE_XICS_FABRIC },
>>>         { TYPE_INTERRUPT_STATS_PROVIDER },
>>>         { TYPE_XIVE_FABRIC },
>>> +#ifdef CONFIG_VOF
>>> +        { TYPE_VOF_MACHINE_IF },
>>> +#endif
>>>         { }
>>>     },
>>> };
>>> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
>>> index f25014afda40..986a4de34128 100644
>>> --- a/hw/ppc/spapr_hcall.c
>>> +++ b/hw/ppc/spapr_hcall.c
>>> @@ -1080,7 +1080,7 @@ target_ulong 
>>> do_client_architecture_support(PowerPCCPU *cpu,
>>>     SpaprOptionVector *ov1_guest, *ov5_guest;
>>>     bool guest_radix;
>>>     bool raw_mode_supported = false;
>>> -    bool guest_xive;
>>> +    bool guest_xive, reset_fdt = false;
>>>     CPUState *cs;
>>>     void *fdt;
>>>     uint32_t max_compat = spapr->max_compat_pvr;
>>> @@ -1233,8 +1233,10 @@ target_ulong 
>>> do_client_architecture_support(PowerPCCPU *cpu,
>>>         spapr_setup_hpt(spapr);
>>>     }
>>> 
>>> -    fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
>>> -
>>> +#ifdef CONFIG_VOF
>>> +    reset_fdt = spapr->vof != NULL;
>> 
>> (Here when storing to a bool !! could make sense but what you have is 
>> better as it's clearer so I'm not suggesting to use !! here either.
>
> I prefer this way

I agree that writing it this way with an explicit != NULL is clear, so I 
prefer that here too. I just mentioned that !! might make more sense here 
than above.

> and I would rather do this than "!!" but again precedence 
> confuses me sometimes so up there I'd need braces for the condition and then 
> folks start asking "why do you need braces" :)

For precedence "man operator" has a quick table to help, it's hard to 
remember. In my opinion !! is only useful if you need to convert something 
like a flag to a bool like bool = !!(reg & BIT(x)) otherwise it's probably 
clearer to do without it as it may confuse those not familiar with it.

> I do not need braces here as "=" has the lowest priority (although the fact 
> that it returns a value is just bizarre).

Everything in C has practical reasons. I think assignment returning a 
value is so you could write a = b = 0; although this is discouraged 
usually. Also you can do while ((c = getc())) so this is sometimes useful 
to have = return a value.

>> It's rarely useful, maybe only if you need a bool but do not have space 
>> to write the condition or it would be more confusing that way.)
>
>
>
>> 
>>> +#endif
>>> +    fdt = spapr_build_fdt(spapr, reset_fdt, fdt_bufsize);
>>>     g_free(spapr->fdt_blob);
>>>     spapr->fdt_size = fdt_totalsize(fdt);
>>>     spapr->fdt_initial_size = spapr->fdt_size;
>>> @@ -1277,6 +1279,27 @@ static target_ulong 
>>> h_client_architecture_support(PowerPCCPU *cpu,
>>>     return ret;
>>> }
>>> 
>>> +#ifdef CONFIG_VOF
>>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>>> +                                                   CPUState *cs,
>>> +                                                   target_ulong 
>>> ovec_addr)
>>> +{
>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
>>> +
>>> +    target_ulong ret = do_client_architecture_support(POWERPC_CPU(cs), 
>>> spapr,
>>> +                                                      ovec_addr, 
>>> FDT_MAX_SIZE);
>>> +
>>> +    /*
>>> +     * This adds stdout and generates phandles for boottime and CAS FDTs.
>>> +     * It is alright to update the FDT here as 
>>> do_client_architecture_support()
>>> +     * does not pack it.
>>> +     */
>>> +    spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob);
>>> +
>>> +    return ret;
>>> +}
>>> +#endif
>>> +
>>> static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
>>>                                               SpaprMachineState *spapr,
>>>                                               target_ulong opcode,
>>> diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c
>>> new file mode 100644
>>> index 000000000000..653d376f38aa
>>> --- /dev/null
>>> +++ b/hw/ppc/spapr_vof.c
>>> @@ -0,0 +1,153 @@
>>> +/*
>>> + * SPAPR machine hooks to Virtual Open Firmware,
>>> + *
>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>> + */
>>> +#include "qemu/osdep.h"
>>> +#include "qemu-common.h"
>>> +#include <sys/ioctl.h>
>>> +#include "qapi/error.h"
>>> +#include "hw/ppc/spapr.h"
>>> +#include "hw/ppc/spapr_vio.h"
>>> +#include "hw/ppc/fdt.h"
>>> +#include "sysemu/sysemu.h"
>>> +#include "qom/qom-qobject.h"
>>> +#include "trace.h"
>>> +
>>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>>> *spapr,
>>> +                                target_ulong opcode, target_ulong *_args)
>>> +{
>>> +    int ret = vof_client_call(MACHINE(spapr), spapr->vof, 
>>> spapr->fdt_blob,
>>> +                              ppc64_phys_to_real(_args[0]));
>>> +
>>> +    if (ret) {
>>> +        return H_PARAMETER;
>>> +    }
>>> +    return H_SUCCESS;
>>> +}
>>> +
>>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt)
>>> +{
>>> +    char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
>>> +    int chosen;
>>> +
>>> +    vof_build_dt(fdt, spapr->vof);
>>> +
>>> +    _FDT(chosen = fdt_path_offset(fdt, "/chosen"));
>>> +    _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
>>> +                            spapr->vof->bootargs ? : ""));
>>> +
>>> +    /*
>>> +     * SLOF-less setup requires an open instance of stdout for early
>>> +     * kernel printk. By now all phandles are settled so we can open
>>> +     * the default serial console.
>>> +     */
>>> +    if (stdout_path) {
>>> +        _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", "stdout",
>>> +                                   stdout_path));
>>> +    }
>>> +}
>>> +
>>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>>> +                     target_ulong *stack_ptr, Error **errp)
>>> +{
>>> +    Vof *vof = spapr->vof;
>>> +
>>> +    vof_init(vof, spapr->rma_size, errp);
>>> +
>>> +    *stack_ptr = vof_claim(vof, 0, VOF_STACK_SIZE, VOF_STACK_SIZE);
>>> +    if (*stack_ptr == -1) {
>>> +        error_setg(errp, "Memory allocation for stack failed");
>>> +        return;
>>> +    }
>>> +    /* Stack grows downwards plus reserve space for the minimum stack 
>>> frame */
>>> +    *stack_ptr += VOF_STACK_SIZE - 0x20;
>>> +
>>> +    if (spapr->kernel_size &&
>>> +        vof_claim(vof, spapr->kernel_addr, spapr->kernel_size, 0) == -1) 
>>> {
>>> +        error_setg(errp, "Memory for kernel is in use");
>>> +        return;
>>> +    }
>>> +
>>> +    if (spapr->initrd_size &&
>>> +        vof_claim(vof, spapr->initrd_base, spapr->initrd_size, 0) == -1) 
>>> {
>>> +        error_setg(errp, "Memory for initramdisk is in use");
>>> +        return;
>>> +    }
>>> +
>>> +    spapr_vof_client_dt_finalize(spapr, fdt);
>>> +
>>> +    /*
>>> +     * At this point the expected allocation map is:
>>> +     *
>>> +     * 0..c38 - the initial firmware
>>> +     * 8000..10000 - stack
>>> +     * 400000.. - kernel
>>> +     * 3ea0000.. - initramdisk
>>> +     *
>>> +     * We skip writing FDT as nothing expects it; OF client interface is
>>> +     * going to be used for reading the device tree.
>>> +     */
>>> +}
>>> +
>>> +void spapr_vof_quiesce(MachineState *ms)
>>> +{
>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
>>> +
>>> +    spapr->fdt_size = fdt_totalsize(spapr->fdt_blob);
>>> +    spapr->fdt_initial_size = spapr->fdt_size;
>>> +}
>>> +
>>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char 
>>> *propname,
>>> +                       void *val, int vallen)
>>> +{
>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
>>> +
>>> +    /*
>>> +     * We only allow changing properties which we know how to update in 
>>> QEMU
>>> +     * OR
>>> +     * the ones which we know that they need to survive during "quiesce".
>>> +     */
>>> +
>>> +    if (strcmp(path, "/rtas") == 0) {
>>> +        if (strcmp(propname, "linux,rtas-base") == 0 ||
>>> +            strcmp(propname, "linux,rtas-entry") == 0) {
>>> +            /* These need to survive quiesce so let them store in the FDT 
>>> */
>>> +            return true;
>>> +        }
>>> +    }
>>> +
>>> +    if (strcmp(path, "/chosen") == 0) {
>>> +        if (strcmp(propname, "bootargs") == 0) {
>>> +            Vof *vof = spapr->vof;
>>> +
>>> +            g_free(vof->bootargs);
>>> +            vof->bootargs = g_strndup(val, vallen);
>>> +            return true;
>>> +        }
>>> +        if (strcmp(propname, "linux,initrd-start") == 0) {
>>> +            if (vallen == sizeof(uint32_t)) {
>>> +                spapr->initrd_base = ldl_be_p(val);
>>> +                return true;
>>> +            }
>>> +            if (vallen == sizeof(uint64_t)) {
>>> +                spapr->initrd_base = ldq_be_p(val);
>>> +                return true;
>>> +            }
>>> +            return false;
>>> +        }
>>> +        if (strcmp(propname, "linux,initrd-end") == 0) {
>>> +            if (vallen == sizeof(uint32_t)) {
>>> +                spapr->initrd_size = ldl_be_p(val) - spapr->initrd_base;
>>> +                return true;
>>> +            }
>>> +            if (vallen == sizeof(uint64_t)) {
>>> +                spapr->initrd_size = ldq_be_p(val) - spapr->initrd_base;
>>> +                return true;
>>> +            }
>>> +            return false;
>>> +        }
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>> diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
>>> new file mode 100644
>>> index 000000000000..1068a1e58388
>>> --- /dev/null
>>> +++ b/hw/ppc/vof.c
>>> @@ -0,0 +1,1052 @@
>>> +/*
>>> + * QEMU PowerPC Virtual Open Firmware.
>>> + *
>>> + * This implements client interface from OpenFirmware IEEE1275 on the 
>>> QEMU
>>> + * side to leave only a very basic firmware in the VM.
>>> + *
>>> + * Copyright (c) 2021 IBM Corporation.
>>> + *
>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "qemu-common.h"
>>> +#include "qemu/timer.h"
>>> +#include "qemu/range.h"
>>> +#include "qemu/units.h"
>>> +#include "qapi/error.h"
>>> +#include <sys/ioctl.h>
>>> +#include "exec/ram_addr.h"
>>> +#include "exec/address-spaces.h"
>>> +#include "hw/ppc/vof.h"
>>> +#include "hw/ppc/fdt.h"
>>> +#include "sysemu/runstate.h"
>>> +#include "qom/qom-qobject.h"
>>> +#include "trace.h"
>>> +
>>> +#include <libfdt.h>
>>> +
>>> +/*
>>> + * OF 1275 "nextprop" description suggests is it 32 bytes max but
>>> + * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 chars 
>>> long.
>>> + */
>>> +#define OF_PROPNAME_LEN_MAX 64
>>> +
>>> +#define VOF_MAX_PATH        256
>>> +#define VOF_MAX_SETPROPLEN  2048
>>> +#define VOF_MAX_METHODLEN   256
>>> +#define VOF_MAX_FORTHCODE   256
>>> +#define VOF_VTY_BUF_SIZE    256
>>> +
>>> +typedef struct {
>>> +    uint64_t start;
>>> +    uint64_t size;
>>> +} OfClaimed;
>>> +
>>> +typedef struct {
>>> +    char *path; /* the path used to open the instance */
>>> +    uint32_t phandle;
>>> +} OfInstance;
>>> +
>>> +static int readstr(hwaddr pa, char *buf, int size)
>>> +{
>>> +    if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) {
>>> +        return -1;
>>> +    }
>>> +    if (strnlen(buf, size) == size) {
>>> +        buf[size - 1] = '\0';
>>> +        trace_vof_error_str_truncated(buf, size);
>>> +        return -1;
>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>> +static bool cmpservice(const char *s, unsigned nargs, unsigned nret,
>>> +                       const char *s1, unsigned nargscheck, unsigned 
>>> nretcheck)
>>> +{
>>> +    if (strcmp(s, s1)) {
>>> +        return false;
>>> +    }
>>> +    if ((nargscheck && (nargs != nargscheck)) ||
>>> +        (nretcheck && (nret != nretcheck))) {
>>> +        trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret);
>>> +        return false;
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +static void prop_format(char *tval, int tlen, const void *prop, int len)
>>> +{
>>> +    int i;
>>> +    const unsigned char *c;
>>> +    char *t;
>>> +    const char bin[] = "...";
>>> +
>>> +    for (i = 0, c = prop; i < len; ++i, ++c) {
>>> +        if (*c == '\0' && i == len - 1) {
>>> +            strncpy(tval, prop, tlen - 1);
>>> +            return;
>>> +        }
>>> +        if (*c < 0x20 || *c >= 0x80) {
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    for (i = 0, c = prop, t = tval; i < len; ++i, ++c) {
>>> +        if (t >= tval + tlen - sizeof(bin) - 1 - 2 - 1) {
>>> +            strcpy(t, bin);
>>> +            return;
>>> +        }
>>> +        if (i && i % 4 == 0 && i != len - 1) {
>>> +            strcat(t, " ");
>>> +            ++t;
>>> +        }
>>> +        t += sprintf(t, "%02X", *c & 0xFF);
>>> +    }
>>> +}
>>> +
>>> +static int get_path(const void *fdt, int offset, char *buf, int len)
>>> +{
>>> +    int ret;
>>> +
>>> +    ret = fdt_get_path(fdt, offset, buf, len - 1);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    buf[len - 1] = '\0';
>>> +
>>> +    return strlen(buf) + 1;
>>> +}
>>> +
>>> +static int phandle_to_path(const void *fdt, uint32_t ph, char *buf, int 
>>> len)
>>> +{
>>> +    int ret;
>>> +
>>> +    ret = fdt_node_offset_by_phandle(fdt, ph);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    return get_path(fdt, ret, buf, len);
>>> +}
>>> +
>>> +static int path_offset(const void *fdt, const char *path)
>>> +{
>>> +    g_autofree char *p = NULL;
>>> +    char *at;
>>> +
>>> +    /*
>>> +     * The addresses in node names are expected to in the lower case as 
>>> per
>> 
>> There's some grammar problem with this sentence. I think it should be "are 
>> expected to be in lower case" but ask a native speaker.
>
> Definitely missed "be". Thanks for spotting.
>
>
>> 
>>> +     * 
>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>> +     */
>>> +    at = strchr(path, '@');
>>> +    if (!at) {
>>> +        return fdt_path_offset(fdt, path);
>>> +    }
>>> +
>>> +    p = g_strdup(path);
>>> +    for (at = at - path + p + 1; *at; ++at) {
>>> +        *at = tolower(*at);
>>> +    }
>>> +    return fdt_path_offset(fdt, p);
>>> +}
>>> +
>>> +static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr)
>>> +{
>>> +    char fullnode[VOF_MAX_PATH];
>>> +    uint32_t ret = -1;
>>> +    int offset;
>>> +
>>> +    if (readstr(nodeaddr, fullnode, sizeof(fullnode))) {
>>> +        return (uint32_t) ret;
>>> +    }
>>> +
>>> +    offset = path_offset(fdt, fullnode);
>>> +    if (offset >= 0) {
>>> +        ret = fdt_get_phandle(fdt, offset);
>>> +    }
>>> +    trace_vof_finddevice(fullnode, ret);
>>> +    return (uint32_t) ret;
>>> +}
>>> +
>>> +static const void *getprop(const void *fdt, int nodeoff, const char 
>>> *propname,
>>> +                           int *proplen, bool *write0)
>>> +{
>>> +    const char *unit, *prop;
>>> +    const void *ret = fdt_getprop(fdt, nodeoff, propname, proplen);
>>> +
>>> +    if (ret) {
>>> +        if (write0) {
>>> +            *write0 = false;
>>> +        }
>>> +        return ret;
>>> +    }
>>> +
>>> +    /*
>>> +     * The "name" property is not actually expected as a property in the 
>>> FDT
>>> +     * (although some platform may create those in "/" so we try getprop 
>>> first),
>> 
>> Not only in "/" but anywhere. MorphOS walks the tree with nextprop and 
>> expects to get a name property for most nodes without ever explicitly 
>> querying "name". I've tested this with both the board firmware and VOF and 
>> with the board firmware a name property appears in most nodes but not all 
>> so I think at least SmartFirmware does the same and explicitly sets name 
>> on some nodes and otherwise returns the name from path if such property 
>> does not exist but is queried. With this in VOF I can do the same and get same 
>> results so the change should be OK but the comment may be misleading now. 
>> Better to just say we return a value for "name" from path if queried but 
>> property does not exist which seems to be what OF does too.
>
>
> Fair point, after checking with o1275 and devicetree-specification-v0.2.pdf, 
> I'll do what you said.

Just to avoid misunderstandings: only change the comment, not the code. It 
works this way; just the comment could be adjusted to describe it better.

>
>>> +     * we emulate it by returning a pointer to the node's name and adjust
>>> +     * proplen to include only the name but not the unit.
>>> +     */
>>> +    if (strcmp(propname, "name")) {
>>> +        return NULL;
>>> +    }
>>> +    prop = fdt_get_name(fdt, nodeoff, proplen);
>>> +    if (!prop) {
>>> +        *proplen = 0;
>>> +        return NULL;
>>> +    }
>>> +
>>> +    unit = memchr(prop, '@', *proplen);
>>> +    if (unit) {
>>> +        *proplen = unit - prop;
>>> +    }
>>> +    *proplen += 1;
>>> +
>>> +    /*
>>> +     * Since it might be cut at "@" and there will be no trailing zero
>>> +     * in the prop buffer, tell the caller to write zero at the end.
>>> +     */
>>> +    if (write0) {
>>> +        *write0 = true;
>>> +    }
>>> +    return prop;
>>> +}
>>> +
>>> +static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, uint32_t 
>>> pname,
>>> +                            uint32_t valaddr, uint32_t vallen)
>>> +{
>>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>>> +    uint32_t ret = 0;
>>> +    int proplen = 0;
>>> +    const void *prop;
>>> +    char trval[64] = "";
>>> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
>>> +    bool write0;
>>> +
>>> +    if (nodeoff < 0) {
>>> +        return -1;
>>> +    }
>>> +    if (readstr(pname, propname, sizeof(propname))) {
>>> +        return -1;
>>> +    }
>>> +    prop = getprop(fdt, nodeoff, propname, &proplen, &write0);
>>> +    if (prop) {
>>> +        const char zero = 0;
>>> +        int cb = MIN(proplen, vallen);
>>> +
>>> +        if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK ||
>>> +            /* if that was "name" with a unit address, overwrite '@' with 
>>> '0' */
>>> +            (write0 &&
>>> +             cb == proplen &&
>>> +             VOF_MEM_WRITE(valaddr + cb - 1, &zero, 1) != MEMTX_OK)) {
>>> +            ret = -1;
>>> +        } else {
>>> +            /*
>>> +             * OF1275 says:
>>> +             * "Size is either the actual size of the property, or -1 if 
>>> name
>>> +             * does not exist", hence returning proplen instead of cb.
>>> +             */
>>> +            ret = proplen;
>>> +            /* Do not format a value if tracepoint is silent, for 
>>> performance */
>>> +            if (trace_event_get_state(TRACE_VOF_GETPROP) &&
>>> +                qemu_loglevel_mask(LOG_TRACE)) {
>>> +                prop_format(trval, sizeof(trval), prop, ret);
>>> +            }
>>> +        }
>>> +    } else {
>>> +        ret = -1;
>>> +    }
>>> +    trace_vof_getprop(nodeph, propname, ret, trval);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, uint32_t 
>>> pname)
>>> +{
>>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>>> +    uint32_t ret = 0;
>>> +    int proplen = 0;
>>> +    const void *prop;
>>> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
>>> +
>>> +    if (nodeoff < 0) {
>>> +        return -1;
>>> +    }
>>> +    if (readstr(pname, propname, sizeof(propname))) {
>>> +        return -1;
>>> +    }
>>> +    prop = getprop(fdt, nodeoff, propname, &proplen, NULL);
>>> +    if (prop) {
>>> +        ret = proplen;
>>> +    } else {
>>> +        ret = -1;
>>> +    }
>>> +    trace_vof_getproplen(nodeph, propname, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_setprop(MachineState *ms, void *fdt, Vof *vof,
>>> +                            uint32_t nodeph, uint32_t pname,
>>> +                            uint32_t valaddr, uint32_t vallen)
>>> +{
>>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>>> +    uint32_t ret = -1;
>>> +    int offset;
>>> +    char trval[64] = "";
>>> +    char nodepath[VOF_MAX_PATH] = "";
>>> +    Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF);
>>> +    g_autofree char *val = NULL;
>>> +
>>> +    if (vallen > VOF_MAX_SETPROPLEN) {
>>> +        goto trace_exit;
>>> +    }
>>> +    if (readstr(pname, propname, sizeof(propname))) {
>>> +        goto trace_exit;
>>> +    }
>>> +    offset = fdt_node_offset_by_phandle(fdt, nodeph);
>>> +    if (offset < 0) {
>>> +        goto trace_exit;
>>> +    }
>>> +    ret = get_path(fdt, offset, nodepath, sizeof(nodepath));
>>> +    if (ret <= 0) {
>>> +        goto trace_exit;
>>> +    }
>>> +
>>> +    val = g_malloc0(vallen);
>>> +    if (VOF_MEM_READ(valaddr, val, vallen) != MEMTX_OK) {
>>> +        goto trace_exit;
>>> +    }
>>> +
>>> +    if (vmo) {
>>> +        VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo);
>>> +
>>> +        if (vmc->setprop &&
>>> +            !vmc->setprop(ms, nodepath, propname, val, vallen)) {
>>> +            goto trace_exit;
>>> +        }
>>> +    }
>>> +
>>> +    ret = fdt_setprop(fdt, offset, propname, val, vallen);
>>> +    if (ret) {
>>> +        goto trace_exit;
>>> +    }
>>> +
>>> +    if (trace_event_get_state(TRACE_VOF_SETPROP) &&
>>> +        qemu_loglevel_mask(LOG_TRACE)) {
>>> +        prop_format(trval, sizeof(trval), val, vallen);
>>> +    }
>>> +    ret = vallen;
>>> +
>>> +trace_exit:
>>> +    trace_vof_setprop(nodeph, propname, trval, vallen, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_nextprop(const void *fdt, uint32_t phandle,
>>> +                             uint32_t prevaddr, uint32_t nameaddr)
>>> +{
>>> +    int offset, nodeoff = fdt_node_offset_by_phandle(fdt, phandle);
>>> +    char prev[OF_PROPNAME_LEN_MAX + 1];
>>> +    const char *tmp;
>>> +
>>> +    if (readstr(prevaddr, prev, sizeof(prev))) {
>>> +        return -1;
>>> +    }
>>> +
>>> +    fdt_for_each_property_offset(offset, fdt, nodeoff) {
>>> +        if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
>>> +            return 0;
>>> +        }
>>> +        if (prev[0] == '\0' || strcmp(prev, tmp) == 0) {
>>> +            if (prev[0] != '\0') {
>>> +                offset = fdt_next_property_offset(fdt, offset);
>>> +                if (offset < 0) {
>>> +                    return 0;
>>> +                }
>>> +            }
>>> +            if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
>>> +                return 0;
>>> +            }
>>> +
>>> +            if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != 
>>> MEMTX_OK) {
>>> +                return -1;
>>> +            }
>>> +            return 1;
>>> +        }
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static uint32_t vof_peer(const void *fdt, uint32_t phandle)
>>> +{
>>> +    int ret;
>>> +
>>> +    if (phandle == 0) {
>>> +        ret = fdt_path_offset(fdt, "/");
>>> +    } else {
>>> +        ret = fdt_next_subnode(fdt, fdt_node_offset_by_phandle(fdt, 
>>> phandle));
>>> +    }
>>> +
>>> +    if (ret < 0) {
>>> +        ret = 0;
>>> +    } else {
>>> +        ret = fdt_get_phandle(fdt, ret);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_child(const void *fdt, uint32_t phandle)
>>> +{
>>> +    int ret = fdt_first_subnode(fdt, fdt_node_offset_by_phandle(fdt, 
>>> phandle));
>>> +
>>> +    if (ret < 0) {
>>> +        ret = 0;
>>> +    } else {
>>> +        ret = fdt_get_phandle(fdt, ret);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_parent(const void *fdt, uint32_t phandle)
>>> +{
>>> +    int ret = fdt_parent_offset(fdt, fdt_node_offset_by_phandle(fdt, 
>>> phandle));
>>> +
>>> +    if (ret < 0) {
>>> +        ret = 0;
>>> +    } else {
>>> +        ret = fdt_get_phandle(fdt, ret);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_do_open(void *fdt, Vof *vof, int offset, const char 
>>> *path)
>>> +{
>>> +    uint32_t ret = -1;
>>> +    OfInstance *inst = NULL;
>>> +
>>> +    if (vof->of_instance_last == 0xFFFFFFFF) {
>>> +        /* We do not recycle ihandles yet */
>>> +        goto trace_exit;
>>> +    }
>>> +
>>> +    inst = g_new0(OfInstance, 1);
>>> +    inst->phandle = fdt_get_phandle(fdt, offset);
>>> +    g_assert(inst->phandle);
>>> +    ++vof->of_instance_last;
>>> +
>>> +    inst->path = g_strdup(path);
>>> +    g_hash_table_insert(vof->of_instances,
>>> +                        GINT_TO_POINTER(vof->of_instance_last),
>>> +                        inst);
>>> +    ret = vof->of_instance_last;
>>> +
>>> +trace_exit:
>>> +    trace_vof_open(path, inst ? inst->phandle : 0, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
>>> +                               const char *prop, const char *path)
>>> +{
>>> +    int node = fdt_path_offset(fdt, nodename);
>>> +    int inst, offset;
>>> +
>>> +    offset = fdt_path_offset(fdt, path);
>>> +    if (offset < 0) {
>>> +        trace_vof_error_unknown_path(path);
>>> +        return offset;
>>> +    }
>>> +
>>> +    inst = vof_do_open(fdt, vof, offset, path);
>>> +
>>> +    return fdt_setprop_cell(fdt, node, prop, inst);
>>> +}
>>> +
>>> +static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr)
>>> +{
>>> +    char path[VOF_MAX_PATH];
>>> +    int offset;
>>> +
>>> +    if (readstr(pathaddr, path, sizeof(path))) {
>>> +        return -1;
>>> +    }
>>> +
>>> +    offset = path_offset(fdt, path);
>>> +    if (offset < 0) {
>>> +        trace_vof_error_unknown_path(path);
>>> +        return offset;
>>> +    }
>>> +
>>> +    return vof_do_open(fdt, vof, offset, path);
>>> +}
>>> +
>>> +static void vof_close(Vof *vof, uint32_t ihandle)
>>> +{
>>> +    if (!g_hash_table_remove(vof->of_instances, 
>>> GINT_TO_POINTER(ihandle))) {
>>> +        trace_vof_error_unknown_ihandle_close(ihandle);
>>> +    }
>>> +}
>>> +
>>> +static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle)
>>> +{
>>> +    gpointer instp = g_hash_table_lookup(vof->of_instances,
>>> +                                         GINT_TO_POINTER(ihandle));
>>> +    uint32_t ret = -1;
>>> +
>>> +    if (instp) {
>>> +        ret = ((OfInstance *)instp)->phandle;
>>> +    }
>>> +    trace_vof_instance_to_package(ihandle, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle,
>>> +                                    uint32_t buf, uint32_t len)
>>> +{
>>> +    uint32_t ret = -1;
>>> +    char tmp[VOF_MAX_PATH] = "";
>>> +
>>> +    ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
>>> +    if (ret > 0) {
>>> +        if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
>>> +            ret = -1;
>>> +        }
>>> +    }
>>> +
>>> +    trace_vof_package_to_path(phandle, tmp, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t 
>>> ihandle,
>>> +                                     uint32_t buf, uint32_t len)
>>> +{
>>> +    uint32_t ret = -1;
>>> +    uint32_t phandle = vof_instance_to_package(vof, ihandle);
>>> +    char tmp[VOF_MAX_PATH] = "";
>>> +
>>> +    if (phandle != -1) {
>>> +        ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
>>> +        if (ret > 0) {
>>> +            if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
>>> +                ret = -1;
>>> +            }
>>> +        }
>>> +    }
>>> +    trace_vof_instance_to_path(ihandle, phandle, tmp, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_write(Vof *vof, uint32_t ihandle, uint32_t buf,
>>> +                          uint32_t len)
>>> +{
>>> +    char tmp[VOF_VTY_BUF_SIZE];
>>> +    unsigned cb;
>>> +    OfInstance *inst = (OfInstance *)
>>> +        g_hash_table_lookup(vof->of_instances, GINT_TO_POINTER(ihandle));
>>> +
>>> +    if (!inst) {
>>> +        trace_vof_error_write(ihandle);
>>> +        return -1;
>>> +    }
>>> +
>>> +    for ( ; len > 0; len -= cb) {
>>> +        cb = MIN(len, sizeof(tmp) - 1);
>>> +        if (VOF_MEM_READ(buf, tmp, cb) != MEMTX_OK) {
>>> +            return -1;
>>> +        }
>>> +
>>> +        /* FIXME: there is no backend(s) yet so just call a trace */
>>> +        if (trace_event_get_state(TRACE_VOF_WRITE) &&
>>> +            qemu_loglevel_mask(LOG_TRACE)) {
>>> +            tmp[cb] = '\0';
>>> +            trace_vof_write(ihandle, cb, tmp);
>>> +        }
>>> +    }
>>> +
>>> +    return len;
>>> +}
>>> +
>>> +static void vof_claimed_dump(GArray *claimed)
>>> +{
>>> +    int i;
>>> +    OfClaimed c;
>>> +
>>> +    if (trace_event_get_state(TRACE_VOF_CLAIMED) &&
>>> +        qemu_loglevel_mask(LOG_TRACE)) {
>>> +
>>> +        for (i = 0; i < claimed->len; ++i) {
>>> +            c = g_array_index(claimed, OfClaimed, i);
>>> +            trace_vof_claimed(c.start, c.start + c.size, c.size);
>>> +        }
>>> +    }
>>> +}
>>> +
>>> +static bool vof_claim_avail(GArray *claimed, uint64_t virt, uint64_t 
>>> size)
>>> +{
>>> +    int i;
>>> +    OfClaimed c;
>>> +
>>> +    for (i = 0; i < claimed->len; ++i) {
>>> +        c = g_array_index(claimed, OfClaimed, i);
>>> +        if (ranges_overlap(c.start, c.size, virt, size)) {
>>> +            return false;
>>> +        }
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t size)
>>> +{
>>> +    OfClaimed newclaim;
>>> +
>>> +    newclaim.start = virt;
>>> +    newclaim.size = size;
>>> +    g_array_append_val(claimed, newclaim);
>>> +}
>>> +
>>> +static gint of_claimed_compare_func(gconstpointer a, gconstpointer b)
>>> +{
>>> +    return ((OfClaimed *)a)->start - ((OfClaimed *)b)->start;
>>> +}
>>> +
>>> +static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t 
>>> base)
>>> +{
>>> +    int i, n, offset, proplen = 0, sc, ac;
>>> +    target_ulong mem0_end;
>>> +    const uint8_t *mem0_reg;
>>> +    g_autofree uint8_t *avail = NULL;
>>> +    uint8_t *availcur;
>>> +
>>> +    if (!fdt || !claimed) {
>>> +        return;
>>> +    }
>>> +
>>> +    offset = fdt_path_offset(fdt, "/");
>>> +    _FDT(offset);
>>> +    ac = fdt_address_cells(fdt, offset);
>>> +    g_assert(ac == 1 || ac == 2);
>>> +    sc = fdt_size_cells(fdt, offset);
>>> +    g_assert(sc == 1 || sc == 2);
>>> +
>>> +    offset = fdt_path_offset(fdt, "/memory@0");
>>> +    _FDT(offset);
>>> +
>>> +    mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen);
>>> +    g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc));
>>> +    if (sc == 2) {
>>> +        mem0_end = be64_to_cpu(*(uint64_t *)(mem0_reg + sizeof(uint32_t) 
>>> * ac));
>>> +    } else {
>>> +        mem0_end = be32_to_cpu(*(uint32_t *)(mem0_reg + sizeof(uint32_t) 
>>> * ac));
>>> +    }
>>> +
>>> +    g_array_sort(claimed, of_claimed_compare_func);
>>> +    vof_claimed_dump(claimed);
>>> +
>>> +    /*
>>> +     * VOF resides in the first page so we do not need to check if there 
>>> is
>>> +     * available memory before the first claimed block
>>> +     */
>>> +    g_assert(claimed->len && (g_array_index(claimed, OfClaimed, 0).start 
>>> == 0));
>>> +
>>> +    avail = g_malloc0(sizeof(uint32_t) * (ac + sc) * claimed->len);
>>> +    for (i = 0, n = 0, availcur = avail; i < claimed->len; ++i) {
>>> +        OfClaimed c = g_array_index(claimed, OfClaimed, i);
>>> +        uint64_t start, size;
>>> +
>>> +        start = c.start + c.size;
>>> +        if (i < claimed->len - 1) {
>>> +            OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1);
>>> +
>>> +            size = cn.start - start;
>>> +        } else {
>>> +            size = mem0_end - start;
>>> +        }
>>> +
>>> +        if (ac == 2) {
>>> +            *(uint64_t *) availcur = cpu_to_be64(start);
>>> +        } else {
>>> +            *(uint32_t *) availcur = cpu_to_be32(start);
>>> +        }
>>> +        availcur += sizeof(uint32_t) * ac;
>>> +        if (sc == 2) {
>>> +            *(uint64_t *) availcur = cpu_to_be64(size);
>>> +        } else {
>>> +            *(uint32_t *) availcur = cpu_to_be32(size);
>>> +        }
>>> +        availcur += sizeof(uint32_t) * sc;
>>> +
>>> +        if (size) {
>>> +            trace_vof_avail(c.start + c.size, c.start + c.size + size, 
>>> size);
>>> +            ++n;
>>> +        }
>>> +    }
>>> +    _FDT((fdt_setprop(fdt, offset, "available", avail, availcur - 
>>> avail)));
>>> +}
>>> +
>>> +/*
>>> + * OF1275:
>>> + * "Allocates size bytes of memory. If align is zero, the allocated range
>>> + * begins at the virtual address virt. Otherwise, an aligned address is
>>> + * automatically chosen and the input argument virt is ignored".
>>> + *
>>> + * In other words, exactly one of @virt and @align is non-zero.
>>> + */
>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size,
>>> +                   uint64_t align)
>>> +{
>>> +    uint64_t ret;
>>> +
>>> +    if (size == 0) {
>>> +        ret = -1;
>>> +    } else if (align == 0) {
>>> +        if (!vof_claim_avail(vof->claimed, virt, size)) {
>>> +            ret = -1;
>>> +        } else {
>>> +            ret = virt;
>>> +        }
>>> +    } else {
>>> +        vof->claimed_base = QEMU_ALIGN_UP(vof->claimed_base, align);
>>> +        while (1) {
>>> +            if (vof->claimed_base >= vof->top_addr) {
>>> +                error_report("Out of RMA memory for the OF client");
>>> +                return -1;
>>> +            }
>>> +            if (vof_claim_avail(vof->claimed, vof->claimed_base, size)) {
>>> +                break;
>>> +            }
>>> +            vof->claimed_base += size;
>>> +        }
>>> +        ret = vof->claimed_base;
>>> +    }
>>> +
>>> +    if (ret != -1) {
>>> +        vof->claimed_base = MAX(vof->claimed_base, ret + size);
>>> +        vof_claim_add(vof->claimed, ret, size);
>>> +    }
>>> +    trace_vof_claim(virt, size, align, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static uint32_t vof_release(Vof *vof, uint64_t virt, uint64_t size)
>>> +{
>>> +    uint32_t ret = -1;
>>> +    int i;
>>> +    GArray *claimed = vof->claimed;
>>> +    OfClaimed c;
>>> +
>>> +    for (i = 0; i < claimed->len; ++i) {
>>> +        c = g_array_index(claimed, OfClaimed, i);
>>> +        if (c.start == virt && c.size == size) {
>>> +            g_array_remove_index(claimed, i);
>>> +            ret = 0;
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    trace_vof_release(virt, size, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static void vof_instantiate_rtas(Error **errp)
>>> +{
>>> +    error_setg(errp, "The firmware should have instantiated RTAS");
>>> +}
>>> +
>>> +static uint32_t vof_call_method(MachineState *ms, Vof *vof, uint32_t 
>>> methodaddr,
>>> +                                uint32_t ihandle, uint32_t param1,
>>> +                                uint32_t param2, uint32_t param3,
>>> +                                uint32_t param4, uint32_t *ret2)
>>> +{
>>> +    uint32_t ret = -1;
>>> +    char method[VOF_MAX_METHODLEN] = "";
>>> +    OfInstance *inst;
>>> +
>>> +    if (!ihandle) {
>>> +        goto trace_exit;
>>> +    }
>>> +
>>> +    inst = (OfInstance *) g_hash_table_lookup(vof->of_instances,
>>> +                                              GINT_TO_POINTER(ihandle));
>> 
>> I think you should not have space in type casts but checkpatch does not 
>> seem to mind. You have this at several places.
>
>
> checkpatch does mind because it is truly ugly. I tried:
>
>
> ERROR: "(foo*)" should be "(foo *)"
> #2029: FILE: pc-bios/vof/ci.c:46:
> +    if (prom_handle((void*)(unsigned long) args)) {
>
> total: 1 errors, 0 warnings, 2030 lines checked

That one yes, but what I meant was writing

(OfInstance *)g_hash_table_lookup

instead of

(OfInstance *) g_hash_table_lookup

which checkpatch doesn't seem to mind, and I did not find anything about
this in the coding style doc, so maybe both are OK? I prefer casts without a
space so it's clear what is being cast, but if there's no style rule for that
and checkpatch accepts it, then use whichever you like.

Regards,
BALATON Zoltan
BALATON Zoltan June 16, 2021, 10:34 a.m. UTC | #6
On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
> On 6/15/21 20:29, BALATON Zoltan wrote:
>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>> The PAPR platform describes an OS environment that's presented by
>>> a combination of a hypervisor and firmware. The features it specifies
>>> require collaboration between the firmware and the hypervisor.
>>> 
>>> Since the beginning, the runtime component of the firmware (RTAS) has
>>> been implemented as a 20 byte shim which simply forwards it to
>>> a hypercall implemented in qemu. The boot time firmware component is
>>> SLOF - but a build that's specific to qemu, and has always needed to be
>>> updated in sync with it. Even though we've managed to limit the amount
>>> of runtime communication we need between qemu and SLOF, there's some,
>>> and it has become increasingly awkward to handle as we've implemented
>>> new features.
>>> 
>>> This implements a boot time OF client interface (CI) which is
>>> enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
>>> Firmware). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>>> which implements Open Firmware Client Interface (OF CI). This allows
>>> using a smaller stateless firmware which does not have to manage
>>> the device tree.
>>> 
>>> The new "vof.bin" firmware image is included with source code under
>>> pc-bios/. It also includes RTAS blob.
>>> 
>>> This implements a handful of CI methods just to get -kernel/-initrd
>>> working. In particular, this implements the device tree fetching and
>>> simple memory allocator - "claim" (an OF CI memory allocator) and updates
>>> "/memory@0/available" to report the client about available memory.
>>> 
>>> This implements changing some device tree properties which we know how
>>> to deal with, the rest is ignored. To allow changes, this skips
>>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>>> appending.
>>> 
>>> In absence of SLOF, this assigns phandles to device tree nodes to make
>>> device tree traversing work.
>>> 
>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>> 
>>> This adds basic instances support which are managed by a hash map
>>> ihandle -> [phandle].
>>> 
>>> Before the guest started, the used memory is:
>>> 0..e60 - the initial firmware
>>> 8000..10000 - stack
>>> 400000.. - kernel
>>> 3ea0000.. - initramdisk
>>> 
>>> This OF CI does not implement "interpret".
>>> 
>>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>>> includes a disk image with pre-formatted nvram.
>>> 
>>> With this basic support, this can only boot into kernel directly.
>>> However this is just enough for the petitboot kernel and initradmdisk to
>>> boot from any possible source. Note this requires reasonably recent guest
>>> kernel with:
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>> 
>>> The immediate benefit is much faster booting time which especially
>>> crucial with fully emulated early CPU bring up environments. Also this
>>> may come handy when/if GRUB-in-the-userspace sees light of the day.
>>> 
>>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>>> other POWERPC boards which do not support pSeries.
>>> 
>>> This make VOF optional, it is disabled by default, add --enable-vof
>>> to ./configure to enable it.
>>> 
>>> This assumes potential support for booting from QEMU backends
>>> such as blockdev or netdev without devices/drivers used.
>>> 
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> ---
>>> 
>>> The example command line is:
>>> 
>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>> -nodefaults \
>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>> -nographic \
>>> -vga none \
>>> -enable-kvm \
>>> -m 8G \
>>> -machine 
>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>> \
>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>> -initrd pb/rootfs.cpio.xz \
>>> -drive 
>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>> \
>>> -global spapr-nvram.drive=DRIVE0 \
>>> -snapshot \
>>> -smp 8,threads=8 \
>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>> -trace events=qemu_trace_events \
>>> -d guest_errors \
>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>> -mon chardev=SOCKET0,mode=control
>> 
>> I haven't looked at it in detail yet, just some quick comments I have on 
>> first skim through.
>> 
>>> ---
>>> Changes:
>>> v21:
>>> * s/ld/ldz/ in entry.S
>> 
>> Typo? Has this become lwz?
>
> Yup, lwz.
>
>> 
>>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>>> Kconfig
>>> * made CONFIG_VOF optional
>> 
>> This won't work for pegasos2, see below.
>> 
>>> * s/l.lds/vof.lds/
>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>> * added checks for non-null methods of VofMachineIfClass
>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better comment
>>> * added  path_offset wrapper for handling mixed case for addresses
>>> after "@" in node names
>>> * changed getprop() to check for actual "name" property in the fdt
>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike similar
>>> rtas_ld/ldl_be_*) they return error codes
>>> * VOF_MEM_READ uses now address_space_read (it was address_space_read_full
>>> before, not sure why)
>> [...]
>>> ---
>>> configure               |    9 +
>>> pc-bios/vof/Makefile    |   23 +
>>> include/hw/ppc/spapr.h  |   25 +-
>>> include/hw/ppc/vof.h    |   55 ++
>>> pc-bios/vof/vof.h       |   43 ++
>>> hw/ppc/spapr.c          |   87 +++-
>>> hw/ppc/spapr_hcall.c    |   29 +-
>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
>>> pc-bios/vof/bootmem.c   |   14 +
>>> pc-bios/vof/ci.c        |   91 ++++
>>> pc-bios/vof/libc.c      |   92 ++++
>>> pc-bios/vof/main.c      |   21 +
>>> tests/qtest/rtas-test.c |   17 +-
>>> MAINTAINERS             |   12 +
>>> hw/ppc/Kconfig          |    3 +
>>> hw/ppc/meson.build      |    3 +
>>> hw/ppc/trace-events     |   24 +
>>> meson.build             |    1 +
>>> pc-bios/README          |    2 +
>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>> pc-bios/vof/entry.S     |   49 ++
>>> pc-bios/vof/vof.lds     |   48 ++
>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>> create mode 100644 pc-bios/vof/Makefile
>>> create mode 100644 include/hw/ppc/vof.h
>>> create mode 100644 pc-bios/vof/vof.h
>>> create mode 100644 hw/ppc/spapr_vof.c
>>> create mode 100644 hw/ppc/vof.c
>>> create mode 100644 pc-bios/vof/bootmem.c
>>> create mode 100644 pc-bios/vof/ci.c
>>> create mode 100644 pc-bios/vof/libc.c
>>> create mode 100644 pc-bios/vof/main.c
>>> create mode 100644 pc-bios/vof-nvram.bin
>>> create mode 100755 pc-bios/vof.bin
>>> create mode 100644 pc-bios/vof/entry.S
>>> create mode 100644 pc-bios/vof/vof.lds
>>> 
>>> diff --git a/configure b/configure
>>> index 8dcb9965b24e..00dc29c027fa 100755
>>> --- a/configure
>>> +++ b/configure
>>> @@ -445,6 +445,7 @@ fuse="auto"
>>> fuse_lseek="auto"
>>> multiprocess="auto"
>>> slirp_smbd="$default_feature"
>>> +vof="no"
>> 
>> Why is this disabled by default? I pretty much need VOF in pegasos2 as 
>> there would be no other firmware otherwise. So it means I have to select 
>> VOF in pegasos2 config and then VOF itself cannot be optional any more. If 
>> you want it to be optional for spapr then you can't use CONFIG_VOF for that 
>> but need to add a separate CONFIG_VOF_SPAPR or CONFIG_SPAPR_VOF option that 
>> you can set to no by default even when CONFIG_VOF is yes and make VOF usage 
>> conditional on that variable within spapr files.
>> 
>> Hope this make sense. But I don't really see why you need to do that when 
>> you already have this turned off by default for spapr unless the x-vof 
>> option is used. Isn't that enough to make this optional? If not then you 
>> need another spapr specific CONFIG_* variable because CONFIG_PEGASOS2 has 
>> to select CONFIG_VOF as it will be its default firmware. For the same 
>> reason you should not put it behind a config option especially one that 
>> needs to be explicitely enabled.
>
>
> Answered in the other mail.
>
>
>
>>> malloc_trim="auto"
>>> gio="$default_feature"
>>> @@ -1561,6 +1562,10 @@ for opt do
>>>   ;;
>>>   --disable-slirp-smbd) slirp_smbd=no
>>>   ;;
>>> +  --enable-vof) vof=yes
>>> +  ;;
>>> +  --disable-vof) vof=no
>>> +  ;;
>>>   *)
>>>       echo "ERROR: unknown option $opt"
>>>       echo "Try '$0 --help' for more information"
>>> @@ -1940,6 +1945,7 @@ disabled with --disable-FEATURE, default is enabled 
>>> if available
>>>   multiprocess    Out of process device emulation support
>>>   gio             libgio support
>>>   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
>>> +  vof             Virtual Open Firmware support (powerpc/pseries, 
>>> experimental)
>>> 
>>> NOTE: The object files are built at the place where configure is launched
>>> EOF
>>> @@ -5555,6 +5561,9 @@ if test "$slirp_smbd" = "yes" ; then
>>>   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
>>>   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
>>> fi
>>> +if test "$vof" = "yes" ; then
>>> +  echo "CONFIG_VOF=y" >> $config_host_mak
>>> +fi
>>> if test "$vde" = "yes" ; then
>>>   echo "CONFIG_VDE=y" >> $config_host_mak
>>>   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
>>> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
>>> new file mode 100644
>>> index 000000000000..aa1678c4d889
>>> --- /dev/null
>>> +++ b/pc-bios/vof/Makefile
>>> @@ -0,0 +1,23 @@
>>> +all: build-all
>>> +
>>> +build-all: vof.bin
>>> +
>>> +CROSS ?=
>>> +CC = $(CROSS)gcc
>>> +LD = $(CROSS)ld
>>> +OBJCOPY = $(CROSS)objcopy
>>> +
>>> +%.o: %.S
>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
>>> +
>>> +%.o: %.c
>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o $@ $<
>>> +
>>> +vof.elf: entry.o main.o ci.o bootmem.o libc.o
>>> +    $(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
>>> +
>>> +%.bin: %.elf
>>> +    $(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
>>> +
>>> +clean:
>>> +    rm -f *.o vof.bin vof.elf *~
>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>> index f05219f75ef6..39b5581ae650 100644
>>> --- a/include/hw/ppc/spapr.h
>>> +++ b/include/hw/ppc/spapr.h
>>> @@ -12,6 +12,9 @@
>>> #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>>> #include "hw/ppc/xics.h"        /* For ICSState */
>>> #include "hw/ppc/spapr_tpm_proxy.h"
>>> +#ifdef CONFIG_VOF
>>> +#include "hw/ppc/vof.h"
>>> +#endif
>>> 
>>> struct SpaprVioBus;
>>> struct SpaprPhbState;
>>> @@ -180,6 +183,9 @@ struct SpaprMachineState {
>>>     uint64_t kernel_addr;
>>>     uint32_t initrd_base;
>>>     long initrd_size;
>>> +#ifdef CONFIG_VOF
>>> +    Vof *vof;
>>> +#endif
>>>     uint64_t rtc_offset; /* Now used only during incoming migration */
>>>     struct PPCTimebase tb;
>>>     bool has_graphics;
>>> @@ -558,7 +564,9 @@ struct SpaprMachineState {
>>> /* Client Architecture support */
>>> #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>>> #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>>> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
>>> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>>> 
>>> /*
>>>  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
>>> @@ -956,4 +964,19 @@ bool spapr_check_pagesize(SpaprMachineState *spapr, 
>>> hwaddr pagesize,
>>> void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
>>> hwaddr spapr_get_rtas_addr(void);
>>> bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
>>> +
>>> +#ifdef CONFIG_VOF
>>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>>> +                     target_ulong *stack_ptr, Error **errp);
>>> +void spapr_vof_quiesce(MachineState *ms);
>>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char 
>>> *propname,
>>> +                       void *val, int vallen);
>>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>>> *spapr,
>>> +                                target_ulong opcode, target_ulong *args);
>>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>>> +                                                   CPUState *cs,
>>> +                                                   target_ulong 
>>> ovec_addr);
>>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
>>> +#endif
>>> +
>>> #endif /* HW_SPAPR_H */
>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>> new file mode 100644
>>> index 000000000000..65ca2fed0d41
>>> --- /dev/null
>>> +++ b/include/hw/ppc/vof.h
>>> @@ -0,0 +1,55 @@
>>> +/*
>>> + * Virtual Open Firmware
>>> + *
>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>> + */
>>> +#ifndef HW_VOF_H
>>> +#define HW_VOF_H
>>> +
>>> +typedef struct Vof {
>>> +    uint64_t top_addr; /* copied from rma_size */
>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>> +    uint64_t claimed_base;
>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>> +    uint32_t of_instance_last;
>>> +    char *bootargs;
>>> +    long fw_size;
>>> +} Vof;
>>> +
>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>> +                    target_ulong args_real);
>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>>> align);
>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>> +void vof_cleanup(Vof *vof);
>>> +void vof_build_dt(void *fdt, Vof *vof);
>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
>>> +                               const char *prop, const char *path);
>>> +
>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>> +
>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>> TYPE_VOF_MACHINE_IF)
>>> +
>>> +struct VofMachineIfClass {
>>> +    InterfaceClass parent;
>>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>>> CPUState *cs,
>>> +                                                target_ulong vec);
>>> +    void (*quiesce)(MachineState *ms);
>>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>>> *propname,
>>> +                    void *val, int vallen);
>>> +};
>>> +
>>> +/*
>>> + * Initial stack size is from
>>> + * 
>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>> + */
>>> +#define VOF_STACK_SIZE       0x8000
>> 
>> Maybe also add a define for RTAS_SIZE here? We'll need to put that in the 
>> device tree but it depends on the rtas shim size that's part of VOF so it 
>> should be defined here instead of hardcoding it in boards that use VOF so 
>> it can be updated later at one place if needed.
>
> This is rtas-size for pseries:
>
> _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
>          ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t)));
>
> => depends on cpus => depends on the command line.
>
>
> RTAS_SIZE is not used by anything in pseries anymore, I'll send a patch to 
> ditch it.

I mean you need to have at least the size of the hv_rtas code in
pc-bios/vof/entry.S. hv_rtas_size is also defined there, but that value is
not available in QEMU, where one needs to add it to the device tree. So a
define for that should be here in vof.h. Currently I've counted the
instructions by hand and have

     qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20);

in pegasos2.c, but that 20 should instead be some VOF_RTAS_SIZE that you
define to correspond to hv_rtas_size. You'll probably need the same even
after changing the rtas-size calculation above in spapr, because the client
has to allocate memory for instantiate-rtas.

>
>> 
>>> +
>>> +#define VOF_MEM_READ(pa, buf, size) \
>>> +    address_space_read(&address_space_memory, \
>>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>>> +#define VOF_MEM_WRITE(pa, buf, size) \
>>> +    address_space_write(&address_space_memory, \
>>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>> 
>> These aren't much useful without the struct definition that you typically 
>> want to read data into using these.
>
> These are not reading to structs, it is either string buffer, cell(s) or 
> binary blob. And they can return errors if the address is bad, this is the 
> reason for them to exist.
>
>>> +
>>> +#endif /* HW_VOF_H */
>> [...]
>>> diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
>>> new file mode 100644
>>> index 000000000000..1068a1e58388
>>> --- /dev/null
>>> +++ b/hw/ppc/vof.c
>>> +
>>> +/* Defined as Big Endian */
>>> +struct prom_args {
>>> +    uint32_t service;
>>> +    uint32_t nargs;
>>> +    uint32_t nret;
>>> +    uint32_t args[10];
>>> +} QEMU_PACKED;
>> 
>> I mean this one, this could be in vof.h too.
>
>
> What is going to need it in the header?

Nothing in the header itself, but code that includes the header would use
it, e.g. when implementing a vof client call or rtas. But maybe this should
be a separate rtas.h that could also be reused by vof, so this can be a
cleanup later; just forget it for now.

Regards,
BALATON Zoltan
Alexey Kardashevskiy June 17, 2021, 2:23 a.m. UTC | #7
On 16/06/2021 20:34, BALATON Zoltan wrote:
> On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
>> On 6/15/21 20:29, BALATON Zoltan wrote:
>>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>>> The PAPR platform describes an OS environment that's presented by
>>>> a combination of a hypervisor and firmware. The features it specifies
>>>> require collaboration between the firmware and the hypervisor.
>>>>
>>>> Since the beginning, the runtime component of the firmware (RTAS) has
>>>> been implemented as a 20 byte shim which simply forwards it to
>>>> a hypercall implemented in qemu. The boot time firmware component is
>>>> SLOF - but a build that's specific to qemu, and has always needed to be
>>>> updated in sync with it. Even though we've managed to limit the amount
>>>> of runtime communication we need between qemu and SLOF, there's some,
>>>> and it has become increasingly awkward to handle as we've implemented
>>>> new features.
>>>>
>>>> This implements a boot time OF client interface (CI) which is
>>>> enabled by a new "x-vof" pseries machine option (stands for "Virtual 
>>>> Open
>>>> Firmware). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>>>> which implements Open Firmware Client Interface (OF CI). This allows
>>>> using a smaller stateless firmware which does not have to manage
>>>> the device tree.
>>>>
>>>> The new "vof.bin" firmware image is included with source code under
>>>> pc-bios/. It also includes RTAS blob.
>>>>
>>>> This implements a handful of CI methods just to get -kernel/-initrd
>>>> working. In particular, this implements the device tree fetching and
>>>> simple memory allocator - "claim" (an OF CI memory allocator) and 
>>>> updates
>>>> "/memory@0/available" to report the client about available memory.
>>>>
>>>> This implements changing some device tree properties which we know how
>>>> to deal with, the rest is ignored. To allow changes, this skips
>>>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>>>> appending.
>>>>
>>>> In absence of SLOF, this assigns phandles to device tree nodes to make
>>>> device tree traversing work.
>>>>
>>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>>>
>>>> This adds basic instances support which are managed by a hash map
>>>> ihandle -> [phandle].
>>>>
>>>> Before the guest started, the used memory is:
>>>> 0..e60 - the initial firmware
>>>> 8000..10000 - stack
>>>> 400000.. - kernel
>>>> 3ea0000.. - initramdisk
>>>>
>>>> This OF CI does not implement "interpret".
>>>>
>>>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>>>> includes a disk image with pre-formatted nvram.
>>>>
>>>> With this basic support, this can only boot into kernel directly.
>>>> However this is just enough for the petitboot kernel and 
>>>> initradmdisk to
>>>> boot from any possible source. Note this requires reasonably recent 
>>>> guest
>>>> kernel with:
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>>>
>>>> The immediate benefit is much faster booting time which especially
>>>> crucial with fully emulated early CPU bring up environments. Also this
>>>> may come handy when/if GRUB-in-the-userspace sees light of the day.
>>>>
>>>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>>>> other POWERPC boards which do not support pSeries.
>>>>
>>>> This make VOF optional, it is disabled by default, add --enable-vof
>>>> to ./configure to enable it.
>>>>
>>>> This assumes potential support for booting from QEMU backends
>>>> such as blockdev or netdev without devices/drivers used.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>
>>>> The example command line is:
>>>>
>>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>>> -nodefaults \
>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>>> -nographic \
>>>> -vga none \
>>>> -enable-kvm \
>>>> -m 8G \
>>>> -machine 
>>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>>> \
>>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>>> -initrd pb/rootfs.cpio.xz \
>>>> -drive 
>>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>>> \
>>>> -global spapr-nvram.drive=DRIVE0 \
>>>> -snapshot \
>>>> -smp 8,threads=8 \
>>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>>> -trace events=qemu_trace_events \
>>>> -d guest_errors \
>>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>>> -mon chardev=SOCKET0,mode=control
>>>
>>> I haven't looked at it in detail yet, just some quick comments I have 
>>> on first skim through.
>>>
>>>> ---
>>>> Changes:
>>>> v21:
>>>> * s/ld/ldz/ in entry.S
>>>
>>> Typo? Has this become lwz?
>>
>> Yup, lwz.
>>
>>>
>>>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>>>> Kconfig
>>>> * made CONFIG_VOF optional
>>>
>>> This won't work for pegasos2, see below.
>>>
>>>> * s/l.lds/vof.lds/
>>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>>> * added checks for non-null methods of VofMachineIfClass
>>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better 
>>>> comment
>>>> * added  path_offset wrapper for handling mixed case for addresses
>>>> after "@" in node names
>>>> * changed getprop() to check for actual "name" property in the fdt
>>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike 
>>>> similar
>>>> rtas_ld/ldl_be_*) they return error codes
>>>> * VOF_MEM_READ uses now address_space_read (it was 
>>>> address_space_read_full
>>>> before, not sure why)
>>> [...]
>>>> ---
>>>> configure               |    9 +
>>>> pc-bios/vof/Makefile    |   23 +
>>>> include/hw/ppc/spapr.h  |   25 +-
>>>> include/hw/ppc/vof.h    |   55 ++
>>>> pc-bios/vof/vof.h       |   43 ++
>>>> hw/ppc/spapr.c          |   87 +++-
>>>> hw/ppc/spapr_hcall.c    |   29 +-
>>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>>> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
>>>> pc-bios/vof/bootmem.c   |   14 +
>>>> pc-bios/vof/ci.c        |   91 ++++
>>>> pc-bios/vof/libc.c      |   92 ++++
>>>> pc-bios/vof/main.c      |   21 +
>>>> tests/qtest/rtas-test.c |   17 +-
>>>> MAINTAINERS             |   12 +
>>>> hw/ppc/Kconfig          |    3 +
>>>> hw/ppc/meson.build      |    3 +
>>>> hw/ppc/trace-events     |   24 +
>>>> meson.build             |    1 +
>>>> pc-bios/README          |    2 +
>>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>>> pc-bios/vof/entry.S     |   49 ++
>>>> pc-bios/vof/vof.lds     |   48 ++
>>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>>> create mode 100644 pc-bios/vof/Makefile
>>>> create mode 100644 include/hw/ppc/vof.h
>>>> create mode 100644 pc-bios/vof/vof.h
>>>> create mode 100644 hw/ppc/spapr_vof.c
>>>> create mode 100644 hw/ppc/vof.c
>>>> create mode 100644 pc-bios/vof/bootmem.c
>>>> create mode 100644 pc-bios/vof/ci.c
>>>> create mode 100644 pc-bios/vof/libc.c
>>>> create mode 100644 pc-bios/vof/main.c
>>>> create mode 100644 pc-bios/vof-nvram.bin
>>>> create mode 100755 pc-bios/vof.bin
>>>> create mode 100644 pc-bios/vof/entry.S
>>>> create mode 100644 pc-bios/vof/vof.lds
>>>>
>>>> diff --git a/configure b/configure
>>>> index 8dcb9965b24e..00dc29c027fa 100755
>>>> --- a/configure
>>>> +++ b/configure
>>>> @@ -445,6 +445,7 @@ fuse="auto"
>>>> fuse_lseek="auto"
>>>> multiprocess="auto"
>>>> slirp_smbd="$default_feature"
>>>> +vof="no"
>>>
>>> Why is this disabled by default? I pretty much need VOF in pegasos2 
>>> as there would be no other firmware otherwise. So it means I have to 
>>> select VOF in pegasos2 config and then VOF itself cannot be optional 
>>> any more. If you want it to be optional for spapr then you can't use 
>>> CONFIG_VOF for that but need to add a separate CONFIG_VOF_SPAPR or 
>>> CONFIG_SPAPR_VOF option that you can set to no by default even when 
>>> CONFIG_VOF is yes and make VOF usage conditional on that variable 
>>> within spapr files.
>>>
>>> Hope this make sense. But I don't really see why you need to do that 
>>> when you already have this turned off by default for spapr unless the 
>>> x-vof option is used. Isn't that enough to make this optional? If not 
>>> then you need another spapr specific CONFIG_* variable because 
>>> CONFIG_PEGASOS2 has to select CONFIG_VOF as it will be its default 
>>> firmware. For the same reason you should not put it behind a config 
>>> option especially one that needs to be explicitly enabled.
>>
>>
>> Answered in the other mail.
>>
>>
>>
>>>> malloc_trim="auto"
>>>> gio="$default_feature"
>>>> @@ -1561,6 +1562,10 @@ for opt do
>>>>   ;;
>>>>   --disable-slirp-smbd) slirp_smbd=no
>>>>   ;;
>>>> +  --enable-vof) vof=yes
>>>> +  ;;
>>>> +  --disable-vof) vof=no
>>>> +  ;;
>>>>   *)
>>>>       echo "ERROR: unknown option $opt"
>>>>       echo "Try '$0 --help' for more information"
>>>> @@ -1940,6 +1945,7 @@ disabled with --disable-FEATURE, default is 
>>>> enabled if available
>>>>   multiprocess    Out of process device emulation support
>>>>   gio             libgio support
>>>>   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
>>>> +  vof             Virtual Open Firmware support (powerpc/pseries, 
>>>> experimental)
>>>>
>>>> NOTE: The object files are built at the place where configure is 
>>>> launched
>>>> EOF
>>>> @@ -5555,6 +5561,9 @@ if test "$slirp_smbd" = "yes" ; then
>>>>   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
>>>>   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
>>>> fi
>>>> +if test "$vof" = "yes" ; then
>>>> +  echo "CONFIG_VOF=y" >> $config_host_mak
>>>> +fi
>>>> if test "$vde" = "yes" ; then
>>>>   echo "CONFIG_VDE=y" >> $config_host_mak
>>>>   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
>>>> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
>>>> new file mode 100644
>>>> index 000000000000..aa1678c4d889
>>>> --- /dev/null
>>>> +++ b/pc-bios/vof/Makefile
>>>> @@ -0,0 +1,23 @@
>>>> +all: build-all
>>>> +
>>>> +build-all: vof.bin
>>>> +
>>>> +CROSS ?=
>>>> +CC = $(CROSS)gcc
>>>> +LD = $(CROSS)ld
>>>> +OBJCOPY = $(CROSS)objcopy
>>>> +
>>>> +%.o: %.S
>>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
>>>> +
>>>> +%.o: %.c
>>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o 
>>>> $@ $<
>>>> +
>>>> +vof.elf: entry.o main.o ci.o bootmem.o libc.o
>>>> +    $(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
>>>> +
>>>> +%.bin: %.elf
>>>> +    $(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
>>>> +
>>>> +clean:
>>>> +    rm -f *.o vof.bin vof.elf *~
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index f05219f75ef6..39b5581ae650 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -12,6 +12,9 @@
>>>> #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>>>> #include "hw/ppc/xics.h"        /* For ICSState */
>>>> #include "hw/ppc/spapr_tpm_proxy.h"
>>>> +#ifdef CONFIG_VOF
>>>> +#include "hw/ppc/vof.h"
>>>> +#endif
>>>>
>>>> struct SpaprVioBus;
>>>> struct SpaprPhbState;
>>>> @@ -180,6 +183,9 @@ struct SpaprMachineState {
>>>>     uint64_t kernel_addr;
>>>>     uint32_t initrd_base;
>>>>     long initrd_size;
>>>> +#ifdef CONFIG_VOF
>>>> +    Vof *vof;
>>>> +#endif
>>>>     uint64_t rtc_offset; /* Now used only during incoming migration */
>>>>     struct PPCTimebase tb;
>>>>     bool has_graphics;
>>>> @@ -558,7 +564,9 @@ struct SpaprMachineState {
>>>> /* Client Architecture support */
>>>> #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>>>> #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>>>> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
>>>> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
>>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>>>>
>>>> /*
>>>>  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
>>>> @@ -956,4 +964,19 @@ bool spapr_check_pagesize(SpaprMachineState 
>>>> *spapr, hwaddr pagesize,
>>>> void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
>>>> hwaddr spapr_get_rtas_addr(void);
>>>> bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
>>>> +
>>>> +#ifdef CONFIG_VOF
>>>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>>>> +                     target_ulong *stack_ptr, Error **errp);
>>>> +void spapr_vof_quiesce(MachineState *ms);
>>>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const 
>>>> char *propname,
>>>> +                       void *val, int vallen);
>>>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>>>> *spapr,
>>>> +                                target_ulong opcode, target_ulong 
>>>> *args);
>>>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>>>> +                                                   CPUState *cs,
>>>> +                                                   target_ulong 
>>>> ovec_addr);
>>>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void 
>>>> *fdt);
>>>> +#endif
>>>> +
>>>> #endif /* HW_SPAPR_H */
>>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>>> new file mode 100644
>>>> index 000000000000..65ca2fed0d41
>>>> --- /dev/null
>>>> +++ b/include/hw/ppc/vof.h
>>>> @@ -0,0 +1,55 @@
>>>> +/*
>>>> + * Virtual Open Firmware
>>>> + *
>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>> + */
>>>> +#ifndef HW_VOF_H
>>>> +#define HW_VOF_H
>>>> +
>>>> +typedef struct Vof {
>>>> +    uint64_t top_addr; /* copied from rma_size */
>>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>>> +    uint64_t claimed_base;
>>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>>> +    uint32_t of_instance_last;
>>>> +    char *bootargs;
>>>> +    long fw_size;
>>>> +} Vof;
>>>> +
>>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>>> +                    target_ulong args_real);
>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>>>> align);
>>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>>> +void vof_cleanup(Vof *vof);
>>>> +void vof_build_dt(void *fdt, Vof *vof);
>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>>>> *nodename,
>>>> +                               const char *prop, const char *path);
>>>> +
>>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>>> +
>>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>>> TYPE_VOF_MACHINE_IF)
>>>> +
>>>> +struct VofMachineIfClass {
>>>> +    InterfaceClass parent;
>>>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>>>> CPUState *cs,
>>>> +                                                target_ulong vec);
>>>> +    void (*quiesce)(MachineState *ms);
>>>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>>>> *propname,
>>>> +                    void *val, int vallen);
>>>> +};
>>>> +
>>>> +/*
>>>> + * Initial stack size is from
>>>> + * 
>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>> + */
>>>> +#define VOF_STACK_SIZE       0x8000
>>>
>>> Maybe also add a define for RTAS_SIZE here? We'll need to put that in 
>>> the device tree but it depends on the rtas shim size that's part of 
>>> VOF so it should be defined here instead of hardcoding it in boards 
>>> that use VOF so it can be updated later at one place if needed.
>>
>> This is rtas-size for pseries:
>>
>> _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
>>          ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t)));
>>
>> => depends on cpus => depends on the command line.
>>
>>
>> RTAS_SIZE is not used by anything in pseries anymore, I'll send a 
>> patch to ditch it.
> 
> I mean you need to have at least the size of code in pc-bios/vof/entry.S 
> hv_rtas where also hv_rtas_size is defined but that value is not 
> available in QEMU where one needs to add it to the device tree. So a 
> define for that should be here in vof.h. Currently I've counted 
> instructions and have
> 
>      qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20);
> 
> in pegasos2.c but that 20 should be some VOF_RTAS_SIZE instead that you 
> define corresponding to hv_rtas_size. You'll probably need the same even 
> after changing above rtas size calculation in spapr because client has 
> to allocate memory for instantiate-rtas.


Ah fair point. I do not like "20" here and I think the right thing will 
be adding whatever number of bytes to rtas-size in the firmware itself 
and update it in QEMU via "setprop" as we do for "linux,rtas-base". And 
then do the same in SLOF.
Alexey Kardashevskiy June 17, 2021, 2:40 a.m. UTC | #8
On 16/06/2021 20:26, BALATON Zoltan wrote:
> On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
>> On 6/16/21 07:09, BALATON Zoltan wrote:
>>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>>> The PAPR platform describes an OS environment that's presented by
>>>> a combination of a hypervisor and firmware. The features it specifies
>>>> require collaboration between the firmware and the hypervisor.
>>>>
>>>> Since the beginning, the runtime component of the firmware (RTAS) has
>>>> been implemented as a 20 byte shim which simply forwards it to
>>>> a hypercall implemented in qemu. The boot time firmware component is
>>>> SLOF - but a build that's specific to qemu, and has always needed to be
>>>> updated in sync with it. Even though we've managed to limit the amount
>>>> of runtime communication we need between qemu and SLOF, there's some,
>>>> and it has become increasingly awkward to handle as we've implemented
>>>> new features.
>>>>
>>>> This implements a boot time OF client interface (CI) which is
>>>> enabled by a new "x-vof" pseries machine option (stands for "Virtual 
>>>> Open
>>>> Firmware"). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>>>> which implements Open Firmware Client Interface (OF CI). This allows
>>>> using a smaller stateless firmware which does not have to manage
>>>> the device tree.
>>>>
>>>> The new "vof.bin" firmware image is included with source code under
>>>> pc-bios/. It also includes RTAS blob.
>>>>
>>>> This implements a handful of CI methods just to get -kernel/-initrd
>>>> working. In particular, this implements the device tree fetching and
>>>> simple memory allocator - "claim" (an OF CI memory allocator) and 
>>>> updates
>>>> "/memory@0/available" to report the client about available memory.
>>>>
>>>> This implements changing some device tree properties which we know how
>>>> to deal with, the rest is ignored. To allow changes, this skips
>>>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>>>> appending.
>>>>
>>>> In absence of SLOF, this assigns phandles to device tree nodes to make
>>>> device tree traversing work.
>>>>
>>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>>>
>>>> This adds basic instances support which are managed by a hash map
>>>> ihandle -> [phandle].
>>>>
>>>> Before the guest started, the used memory is:
>>>> 0..e60 - the initial firmware
>>>> 8000..10000 - stack
>>>> 400000.. - kernel
>>>> 3ea0000.. - initramdisk
>>>>
>>>> This OF CI does not implement "interpret".
>>>>
>>>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>>>> includes a disk image with pre-formatted nvram.
>>>>
>>>> With this basic support, this can only boot into kernel directly.
>>>> However this is just enough for the petitboot kernel and 
>>>> initramdisk to
>>>> boot from any possible source. Note this requires reasonably recent 
>>>> guest
>>>> kernel with:
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>>>
>>>> The immediate benefit is much faster booting time which is especially
>>>> crucial with fully emulated early CPU bring up environments. Also this
>>>> may come in handy when/if GRUB-in-the-userspace sees the light of day.
>>>>
>>>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>>>> other POWERPC boards which do not support pSeries.
>>>>
>>>> This makes VOF optional; it is disabled by default, add --enable-vof
>>>> to ./configure to enable it.
>>>>
>>>> This assumes potential support for booting from QEMU backends
>>>> such as blockdev or netdev without devices/drivers used.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>
>>>> The example command line is:
>>>>
>>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>>> -nodefaults \
>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>>> -nographic \
>>>> -vga none \
>>>> -enable-kvm \
>>>> -m 8G \
>>>> -machine 
>>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>>> \
>>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>>> -initrd pb/rootfs.cpio.xz \
>>>> -drive 
>>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>>> \
>>>> -global spapr-nvram.drive=DRIVE0 \
>>>> -snapshot \
>>>> -smp 8,threads=8 \
>>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>>> -trace events=qemu_trace_events \
>>>> -d guest_errors \
>>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>>> -mon chardev=SOCKET0,mode=control
>>>>
>>>> ---
>>>> Changes:
>>>> v21:
>>>> * s/ld/ldz/ in entry.S
>>>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>>>> Kconfig
>>>> * made CONFIG_VOF optional
>>>> * s/l.lds/vof.lds/
>>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>>> * added checks for non-null methods of VofMachineIfClass
>>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better 
>>>> comment
>>>> * added  path_offset wrapper for handling mixed case for addresses
>>>> after "@" in node names
>>>> * changed getprop() to check for actual "name" property in the fdt
>>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike 
>>>> similar
>>>> rtas_ld/ldl_be_*) they return error codes
>>>> * VOF_MEM_READ uses now address_space_read (it was 
>>>> address_space_read_full
>>>> before, not sure why)
>>>>
>>>> v20:
>>>> * compile vof.bin with -mcpu=power4 for better compatibility
>>>> * s/std/stw/ in entry.S to make it work on ppc32
>>>> * fixed dt_available property to support both 32 and 64bit
>>>> * shuffled prom_args handling code
>>>> * do not enforce 32bit in MSR (again, to support 32bit platforms)
>>>>
>>>> v19:
>>>> * put bootargs in the FDT
>>>> * moved setting properties to a VOF machine hook
>>>> * moved fw_size and claim for it to vof_init()
>>>> * added CROSS to the VOF's makefile
>>>> * simplified phandles assigning
>>>> * pass MachineState to all machine hooks instead of calling
>>>> qdev_get_machine (following QOM)
>>>> * bunch of smaller changes and added comments
>>>> * added simple test to attempt to start with x-vof=on
>>>>
>>>> v18:
>>>> * fixed top addr (max address for "claim") on radix - it equals to 
>>>> ram_size
>>>> and vof->top_addr was uint32_t
>>>> * fixed "available" property which got broken in v14 but it is only 
>>>> visible
>>>> to clients which care (== grub)
>>>> * reshuffled vof_dt_memory_available() calls, added vof_init() to allow
>>>> vof_claim() before rendering the FDT
>>>>
>>>> v17:
>>>> * mv hw/ppc/vof.h include/hw/ppc/vof.h
>>>> * VofMachineIfClass -> VofMachineClass; it is not 
>>>> VofMachineInterface as
>>>> nobody used this scheme, usually "Interface" is dropped, a couple of 
>>>> times
>>>> it is "xxxInterfaceClass" or "xxxIfClass", as used the latter as it is
>>>> used by include/hw/vmstate-if.h
>>>> * added SPDX
>>>> * other fixes from v16 review
>>>>
>>>> v16:
>>>> * rebased on dwg/ppc-for-6.1
>>>> * s/SpaprVofInterface/VofMachineInterface/
>>>>
>>>> v15:
>>>> * bugfix: claimed memory for the VOF itself
>>>> * ditched OF_STACK_ADDR and allocate one instead, now it starts from 
>>>> 0x8000
>>>> because it is aligned to its size (no particular reason though)
>>>> * coding style
>>>> * moved nvram.bin up one level
>>>> * ditched bool in the firmware
>>>> * made debugging code conditional using trace_event_get_state() + 
>>>> qemu_loglevel_mask()
>>>> * renamed the CAS interface to SpaprVofInterface
>>>> * added "write" which for now dumps the message and ihandle via
>>>> trace point for early debug assistance
>>>> * commented on when we allocate of_instances in vof_build_dt()
>>>> * store fw_size in SpaprMachine to let spapr_vof_reset() claim it
>>>> * many small fixes from v14's review
>>>>
>>>> v14:
>>>> * check for truncates in readstr()
>>>> * ditched a separate vof_reset()
>>>> * spapr->vof is a pointer now, dropped the "on" field
>>>> * removed rtas_base from vof and updated comment why we allow 
>>>> setting it
>>>> * added myself to maintainers
>>>> * updated commit log about blockdev and other possible platforms
>>>> * added a note why new hcall is 0x5
>>>> * no in place endianness conversion in spapr_h_vof_client
>>>> * converted all cpu_physical_memory_read/write to address_space_rw
>>>> * git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c
>>>>
>>>> v13:
>>>> * rebase on latest ppc-for-6.0
>>>> * shuffled code around to touch spapr.c less
>>>>
>>>> v12:
>>>> * split VOF and SPAPR
>>>>
>>>> v11:
>>>> * added g_autofree
>>>> * fixed gcc warnings
>>>> * fixed few leaks
>>>> * added nvram image to make "nvram --print-config" not crash;
>>>> Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum 
>>>> size
>>>> is 16K, or it just does not work (empty output from "nvram")
>>>>
>>>> v10:
>>>> * now rebased to compile with meson
>>>>
>>>> v9:
>>>> * remove special handling of /rtas/rtas-size as now we always add it 
>>>> in QEMU
>>>> * removed leftovers from scsi/grub/stdout/stdin/...
>>>>
>>>> v8:
>>>> * no read/write/seek
>>>> * no @dev in instances
>>>> * the machine flag is "x-vof" for now
>>>>
>>>> v7:
>>>> * now we have a small firmware which loads at 0 as SLOF and starts from
>>>> 0x100 as SLOF
>>>> * no MBR/ELF/GRUB business in QEMU anymore
>>>> * blockdev is a separate patch
>>>> * networking is a separate patch
>>>>
>>>> v6:
>>>> * borrowed a big chunk of commit log introduction from David
>>>> * fixed initial stack pointer (points to the highest address of stack)
>>>> * traces for "interpret" and others
>>>> * disabled  translate_kernel_address() hack so grub can load (work in
>>>> progress)
>>>> * added "milliseconds" for grub
>>>> * fixed "claim" allocator again
>>>> * moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too 
>>>> for CAS
>>>> * moved the most code possible from spapr.c to spapr_of_client.c, 
>>>> such as
>>>> RTAS, prom entry and FDT build/finalize
>>>> * separated blobs
>>>> * GRUB now proceeds to its console prompt (there are still other 
>>>> issues)
>>>> * parse MBR/GPT to find PReP and load GRUB
>>>>
>>>> v5:
>>>> * made instances keep device and chardev pointers
>>>> * removed VIO dependencies
>>>> * print error if RTAS memory is not claimed as it should have been
>>>> * pack FDT as "quiesce"
>>>>
>>>> v4:
>>>> * fixed open
>>>> * validate ihandles in "call-method"
>>>>
>>>> v3:
>>>> * fixed phandles allocation
>>>> * s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
>>>> * fixed size of /chosen/stdout
>>>> * bunch of renames
>>>> * do not create rtas properties at all, let the client deal with it;
>>>> instead setprop allows changing these in the FDT
>>>> * no more packing FDT when bios=off - nobody needs it and getprop 
>>>> does not
>>>> work otherwise
>>>> * allow updating initramdisk device tree properties (for zImage)
>>>> * added instances
>>>> * fixed stdout on OF's "write"
>>>> * removed special handling for stdout in OF client, spapr-vty 
>>>> handles it
>>>> instead
>>>>
>>>> v2:
>>>> * fixed claim()
>>>> * added "setprop"
>>>> * cleaner client interface and RTAS blobs management
>>>> * boots to petitboot and further to the target system
>>>> * more trace points
>>>>
>>>> v20
>>>>
>>>> v20!
>>>> ---
>>>> configure               |    9 +
>>>> pc-bios/vof/Makefile    |   23 +
>>>> include/hw/ppc/spapr.h  |   25 +-
>>>> include/hw/ppc/vof.h    |   55 ++
>>>> pc-bios/vof/vof.h       |   43 ++
>>>> hw/ppc/spapr.c          |   87 +++-
>>>> hw/ppc/spapr_hcall.c    |   29 +-
>>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>>> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
>>>> pc-bios/vof/bootmem.c   |   14 +
>>>> pc-bios/vof/ci.c        |   91 ++++
>>>> pc-bios/vof/libc.c      |   92 ++++
>>>> pc-bios/vof/main.c      |   21 +
>>>> tests/qtest/rtas-test.c |   17 +-
>>>> MAINTAINERS             |   12 +
>>>> hw/ppc/Kconfig          |    3 +
>>>> hw/ppc/meson.build      |    3 +
>>>> hw/ppc/trace-events     |   24 +
>>>> meson.build             |    1 +
>>>> pc-bios/README          |    2 +
>>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>>> pc-bios/vof/entry.S     |   49 ++
>>>> pc-bios/vof/vof.lds     |   48 ++
>>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>>> create mode 100644 pc-bios/vof/Makefile
>>>> create mode 100644 include/hw/ppc/vof.h
>>>> create mode 100644 pc-bios/vof/vof.h
>>>> create mode 100644 hw/ppc/spapr_vof.c
>>>> create mode 100644 hw/ppc/vof.c
>>>> create mode 100644 pc-bios/vof/bootmem.c
>>>> create mode 100644 pc-bios/vof/ci.c
>>>> create mode 100644 pc-bios/vof/libc.c
>>>> create mode 100644 pc-bios/vof/main.c
>>>> create mode 100644 pc-bios/vof-nvram.bin
>>>> create mode 100755 pc-bios/vof.bin
>>>> create mode 100644 pc-bios/vof/entry.S
>>>> create mode 100644 pc-bios/vof/vof.lds
>>>>
>>>> diff --git a/configure b/configure
>>>> index 8dcb9965b24e..00dc29c027fa 100755
>>>> --- a/configure
>>>> +++ b/configure
>>>> @@ -445,6 +445,7 @@ fuse="auto"
>>>> fuse_lseek="auto"
>>>> multiprocess="auto"
>>>> slirp_smbd="$default_feature"
>>>> +vof="no"
>>>>
>>>> malloc_trim="auto"
>>>> gio="$default_feature"
>>>> @@ -1561,6 +1562,10 @@ for opt do
>>>>   ;;
>>>>   --disable-slirp-smbd) slirp_smbd=no
>>>>   ;;
>>>> +  --enable-vof) vof=yes
>>>> +  ;;
>>>> +  --disable-vof) vof=no
>>>> +  ;;
>>>>   *)
>>>>       echo "ERROR: unknown option $opt"
>>>>       echo "Try '$0 --help' for more information"
>>>> @@ -1940,6 +1945,7 @@ disabled with --disable-FEATURE, default is 
>>>> enabled if available
>>>>   multiprocess    Out of process device emulation support
>>>>   gio             libgio support
>>>>   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
>>>> +  vof             Virtual Open Firmware support (powerpc/pseries, 
>>>> experimental)
>>>>
>>>> NOTE: The object files are built at the place where configure is 
>>>> launched
>>>> EOF
>>>> @@ -5555,6 +5561,9 @@ if test "$slirp_smbd" = "yes" ; then
>>>>   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
>>>>   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
>>>> fi
>>>> +if test "$vof" = "yes" ; then
>>>> +  echo "CONFIG_VOF=y" >> $config_host_mak
>>>> +fi
>>>> if test "$vde" = "yes" ; then
>>>>   echo "CONFIG_VDE=y" >> $config_host_mak
>>>>   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
>>>
>>> In case I could not explain it clearly in my previous message I think 
>>> the solution we want here is to drop these configure changes and let 
>>> Kconfig configure this. The CONFIG_VOF option decides if vof itself 
>>> is built (adds vof.c) and pegasos2 will select this so it will 
>>> usually be yes by default. Your problem is that you're trying to use 
>>> this variable in spapr to make it off by default but that does not 
>>> work. You need to add another option for that (e.g. CONFIG_VOF_SPAPR 
>>> or CONFIG_SPAPR_VOF whichever makes more sense) then you can set that 
>>> to no despite CONFIG_VOF is yes and use that variable in spapr files 
>>> and to add spapr_vof.c. Then no configure option is needed which does 
>>> not even work for me: I get compile errors saying 'poisoning existing 
>>> macro "CONFIG_VOF"' if I try with --enable-vof or spapr fails to 
>>> build if I try without --enable-vof but select CONFIG_VOF from 
>>> pegasos2. I hope this makes sense now.
>>
>>
>> My problem is that I do not understand when we want VOF to be compiled 
>> in by default and when we do not. For a moment I thought we do not 
>> want it in by default but now it sounds like we do. If that is so, 
>> then CONFIG_VOF + selecting it from PSERIES and PEGASOS should do. Or 
>> I am missing the point again?
> 
> I don't know what spapr wants, for pegasos2 VOF will be the default 
> firmware and I want pegasos2 to be enabled by default (like other 
> boards) so this means CONFIG_VOF will also be enabled by default via 
> select VOF in CONFIG_PEGASOS2. So if the x-vof option in spapr is not 
> enough and you want to be able to configure it off for spapr then you 
> need another CONFIG_something option for that and cannot rely on 
> CONFIG_VOF for it because CONFIG_VOF is on if any board that uses VOF is 
> compiled. If you're OK with compiling it in but disabled by x-vof by 
> default then no need to have another option. In both cases you'll have 
> to select VOF somewhere for your board, either in CONFIG_PSERIES or in 
> the new option that decides if VOF is built for spapr. At least that's 
> how I understand Kconfig.
> 
>>>> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
>>>> new file mode 100644
>>>> index 000000000000..aa1678c4d889
>>>> --- /dev/null
>>>> +++ b/pc-bios/vof/Makefile
>>>> @@ -0,0 +1,23 @@
>>>> +all: build-all
>>>> +
>>>> +build-all: vof.bin
>>>> +
>>>> +CROSS ?=
>>>> +CC = $(CROSS)gcc
>>>> +LD = $(CROSS)ld
>>>> +OBJCOPY = $(CROSS)objcopy
>>>> +
>>>> +%.o: %.S
>>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
>>>> +
>>>> +%.o: %.c
>>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o 
>>>> $@ $<
>>>> +
>>>> +vof.elf: entry.o main.o ci.o bootmem.o libc.o
>>>> +    $(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
>>>> +
>>>> +%.bin: %.elf
>>>> +    $(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
>>>> +
>>>> +clean:
>>>> +    rm -f *.o vof.bin vof.elf *~
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index f05219f75ef6..39b5581ae650 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -12,6 +12,9 @@
>>>> #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>>>> #include "hw/ppc/xics.h"        /* For ICSState */
>>>> #include "hw/ppc/spapr_tpm_proxy.h"
>>>> +#ifdef CONFIG_VOF
>>>> +#include "hw/ppc/vof.h"
>>>> +#endif
>>>>
>>>> struct SpaprVioBus;
>>>> struct SpaprPhbState;
>>>> @@ -180,6 +183,9 @@ struct SpaprMachineState {
>>>>     uint64_t kernel_addr;
>>>>     uint32_t initrd_base;
>>>>     long initrd_size;
>>>> +#ifdef CONFIG_VOF
>>>
>>> So this can't be CONFIG_VOF here if you want to be able to set it to 
>>> no despite pegasos2 pulling in VOF so you need another SPAPR specific 
>>
>> If VOF is compiled in, why would I want it to be still disabled for 
>> PSERIES? The code is in, let it work.
> 
> That's something to decide for spapr maintainers, I just want to be able 
> to use CONFIG_VOF from CONFIG_PEGASOS2 and be it on by default.
> 
>>> option for that in spapr specific parts with CONFIG_VOF selecting if 
>>> vof itself is built if any board uses it. So CONFIG_PEGASOS2 has to 
>>> select CONFIG_VOF and your SPAPR_VOF option should too if it's 
>>> enabled that way vof.c will be added if either board is built but for 
>>> SPAPR only if its VOF option is on.
>>>
>>>> +    Vof *vof;
>>>> +#endif
>>>>     uint64_t rtc_offset; /* Now used only during incoming migration */
>>>>     struct PPCTimebase tb;
>>>>     bool has_graphics;
>>>> @@ -558,7 +564,9 @@ struct SpaprMachineState {
>>>> /* Client Architecture support */
>>>> #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>>>> #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>>>> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
>>>> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
>>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>>>>
>>>> /*
>>>>  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
>>>> @@ -956,4 +964,19 @@ bool spapr_check_pagesize(SpaprMachineState 
>>>> *spapr, hwaddr pagesize,
>>>> void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
>>>> hwaddr spapr_get_rtas_addr(void);
>>>> bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
>>>> +
>>>> +#ifdef CONFIG_VOF
>>>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>>>> +                     target_ulong *stack_ptr, Error **errp);
>>>> +void spapr_vof_quiesce(MachineState *ms);
>>>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const 
>>>> char *propname,
>>>> +                       void *val, int vallen);
>>>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>>>> *spapr,
>>>> +                                target_ulong opcode, target_ulong 
>>>> *args);
>>>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>>>> +                                                   CPUState *cs,
>>>> +                                                   target_ulong 
>>>> ovec_addr);
>>>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void 
>>>> *fdt);
>>>> +#endif
>>>> +
>>>> #endif /* HW_SPAPR_H */
>>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>>> new file mode 100644
>>>> index 000000000000..65ca2fed0d41
>>>> --- /dev/null
>>>> +++ b/include/hw/ppc/vof.h
>>>> @@ -0,0 +1,55 @@
>>>> +/*
>>>> + * Virtual Open Firmware
>>>> + *
>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>> + */
>>>> +#ifndef HW_VOF_H
>>>> +#define HW_VOF_H
>>>> +
>>>> +typedef struct Vof {
>>>> +    uint64_t top_addr; /* copied from rma_size */
>>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>>> +    uint64_t claimed_base;
>>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>>> +    uint32_t of_instance_last;
>>>> +    char *bootargs;
>>>> +    long fw_size;
>>>> +} Vof;
>>>> +
>>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>>> +                    target_ulong args_real);
>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>>>> align);
>>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>>> +void vof_cleanup(Vof *vof);
>>>> +void vof_build_dt(void *fdt, Vof *vof);
>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>>>> *nodename,
>>>> +                               const char *prop, const char *path);
>>>> +
>>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>>> +
>>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>>> TYPE_VOF_MACHINE_IF)
>>>> +
>>>> +struct VofMachineIfClass {
>>>> +    InterfaceClass parent;
>>>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>>>> CPUState *cs,
>>>> +                                                target_ulong vec);
>>>> +    void (*quiesce)(MachineState *ms);
>>>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>>>> *propname,
>>>> +                    void *val, int vallen);
>>>> +};
>>>> +
>>>> +/*
>>>> + * Initial stack size is from
>>>> + * 
>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>
>>>
>>> I wonder if it's better to quote the section number and the title of 
>>> the doc in case the URL here goes away in the future.
>>
>>
>> The binding (the URL clearly suggests it is a "binding") says 32K is 
>> the minimum, what else is here to quote? The doc does not explain why 
>> anyway.
> 
> I thought maybe saying "section x.x of OpenFirmware PPC binding says 
> minimum stack size is 32K" which can also be understood if the link 
> points to nowhere in the future. You can still add a link if you want 
> but just have the relevant info in the comment so one does not need to 
> read the whole doc to find it.


I did not read the whole doc - I opened the link, "ctrl-f", "stack" - 
and there was a local link to "8.2.2. Initial Stack" right there :)

I'll do what you suggested but I really see no value.


> 
>>
>>>> + */
>>>> +#define VOF_STACK_SIZE       0x8000
>>>> +
>>>> +#define VOF_MEM_READ(pa, buf, size) \
>>>> +    address_space_read(&address_space_memory, \
>>>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>>>> +#define VOF_MEM_WRITE(pa, buf, size) \
>>>> +    address_space_write(&address_space_memory, \
>>>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>>>> +
>>>> +#endif /* HW_VOF_H */
>>>> diff --git a/pc-bios/vof/vof.h b/pc-bios/vof/vof.h
>>>> new file mode 100644
>>>> index 000000000000..2d8958076907
>>>> --- /dev/null
>>>> +++ b/pc-bios/vof/vof.h
>>>> @@ -0,0 +1,43 @@
>>>> +/*
>>>> + * Virtual Open Firmware
>>>> + *
>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>> + */
>>>> +#include <stdarg.h>
>>>> +
>>>> +typedef unsigned char uint8_t;
>>>> +typedef unsigned short uint16_t;
>>>> +typedef unsigned long uint32_t;
>>>> +typedef unsigned long long uint64_t;
>>>> +#define NULL (0)
>>>> +#define PROM_ERROR (-1u)
>>>> +typedef unsigned long ihandle;
>>>> +typedef unsigned long phandle;
>>>> +typedef int size_t;
>>>> +typedef void client(void);
>>>> +
>>>> +/* globals */
>>>> +extern void _prom_entry(void); /* OF CI entry point (i.e. this 
>>>> firmware) */
>>>> +
>>>> +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
>>>> +
>>>> +/* libc */
>>>> +int strlen(const char *s);
>>>> +int strcmp(const char *s1, const char *s2);
>>>> +void *memcpy(void *dest, const void *src, size_t n);
>>>> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
>>>> +void *memmove(void *dest, const void *src, size_t n);
>>>> +void *memset(void *dest, int c, size_t size);
>>>> +
>>>> +/* CI wrappers */
>>>> +void ci_panic(const char *str);
>>>> +phandle ci_finddevice(const char *path);
>>>> +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, 
>>>> int len);
>>>> +
>>>> +/* booting from -kernel */
>>>> +void boot_from_memory(uint64_t initrd, uint64_t initrdsize);
>>>> +
>>>> +/* Entry points for CI and RTAS */
>>>> +extern uint32_t ci_entry(uint32_t params);
>>>> +extern unsigned long hv_rtas(unsigned long params);
>>>> +extern unsigned int hv_rtas_size;
>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>> index 4dd90b75cc52..6d747d72c614 100644
>>>> --- a/hw/ppc/spapr.c
>>>> +++ b/hw/ppc/spapr.c
>>>> @@ -101,6 +101,7 @@
>>>> #define FDT_MAX_ADDR            0x80000000 /* FDT must stay below 
>>>> that */
>>>> #define FW_MAX_SIZE             0x400000
>>>> #define FW_FILE_NAME            "slof.bin"
>>>> +#define FW_FILE_NAME_VOF        "vof.bin"
>>>> #define FW_OVERHEAD             0x2800000
>>>> #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
>>>>
>>>> @@ -1639,22 +1640,40 @@ static void spapr_machine_reset(MachineState 
>>>> *machine)
>>>>     fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;
>>>>
>>>>     fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
>>>> +#ifdef CONFIG_VOF
>>>> +    if (spapr->vof) {
>>>> +        target_ulong stack_ptr = 0;
>>>>
>>>> -    rc = fdt_pack(fdt);
>>>> +        spapr_vof_reset(spapr, fdt, &stack_ptr, &error_fatal);
>>>>
>>>> -    /* Should only fail if we've built a corrupted tree */
>>>> -    assert(rc == 0);
>>>> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
>>>> +                                  stack_ptr, spapr->initrd_base,
>>>> +                                  spapr->initrd_size);
>>>> +        /* VOF is 32bit BE so enforce MSR here */
>>>> +        first_ppc_cpu->env.msr &= ~((1ULL << MSR_SF) | (1ULL << 
>>>> MSR_LE));
>>>> +        /*
>>>> +         * Do not pack the FDT as the client may change properties.
>>>> +         * VOF client does not expect the FDT so we do not load it 
>>>> to the VM.
>>>> +         */
>>>> +    } else
>>>> +#endif
>>>> +    {
>>>> +        rc = fdt_pack(fdt);
>>>> +        /* Should only fail if we've built a corrupted tree */
>>>> +        assert(rc == 0);
>>>>
>>>> -    /* Load the fdt */
>>>> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
>>>> +                                  0, fdt_addr, 0);
>>>> +        cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>>>> +    }
>>>>     qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
>>>> -    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>>>> +
>>>>     g_free(spapr->fdt_blob);
>>>>     spapr->fdt_size = fdt_totalsize(fdt);
>>>>     spapr->fdt_initial_size = spapr->fdt_size;
>>>>     spapr->fdt_blob = fdt;
>>>>
>>>>     /* Set up the entry state */
>>>> -    spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, 
>>>> fdt_addr, 0);
>>>>     first_ppc_cpu->env.gpr[5] = 0;
>>>>
>>>>     spapr->fwnmi_system_reset_addr = -1;
>>>> @@ -2657,7 +2676,12 @@ static void spapr_machine_init(MachineState 
>>>> *machine)
>>>>     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
>>>>     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
>>>>     MachineClass *mc = MACHINE_GET_CLASS(machine);
>>>> -    const char *bios_name = machine->firmware ?: FW_FILE_NAME;
>>>> +    const char *bios_default =
>>>> +#ifdef CONFIG_VOF
>>>> +        !!spapr->vof ? FW_FILE_NAME_VOF :
>>>
>>> Does !! make sense here? I think testing for non-0 does not need it 
>>> so you could just write spapr->vof without !!.
>>
>>
>> I find c operator precedence confusing at times. Unary operators like 
>> "!" are easy to read though.
> 
> OK but it's not needed here at all. With or without !! you should get 
> the same result, !! is only needed if you need to make sure value is 
> bool and not some number which is not needed here, you just test if 
> spapr->vof is 0 or not. So writing just that is simpler and people not 
> familiar with !! won't be confused. (I had somebody ask about !! before 
> in one of my patches so I think this should only be used where 
> necessary.) Also fewer operators in an expression means less precedence 
> to care for ;-)

Okay okay :)


> 
>>
>>>
>>>> +#endif
>>>> +        FW_FILE_NAME;
>>>> +    const char *bios_name = machine->firmware ?: bios_default;
>>>>     const char *kernel_filename = machine->kernel_filename;
>>>>     const char *initrd_filename = machine->initrd_filename;
>>>>     PCIHostState *phb;
>>>> @@ -3014,6 +3038,12 @@ static void spapr_machine_init(MachineState 
>>>> *machine)
>>>>     }
>>>>
>>>>     qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
>>>> +#ifdef CONFIG_VOF
>>>> +    if (spapr->vof) {
>>>> +        spapr->vof->fw_size = fw_size; /* for claim() on itself */
>>>> +        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, 
>>>> spapr_h_vof_client);
>>>> +    }
>>>> +#endif
>>>> }
>>>>
>>>> #define DEFAULT_KVM_TYPE "auto"
>>>> @@ -3204,6 +3234,30 @@ static void spapr_set_resize_hpt(Object *obj, 
>>>> const char *value, Error **errp)
>>>>     }
>>>> }
>>>>
>>>> +#ifdef CONFIG_VOF
>>>> +static bool spapr_get_vof(Object *obj, Error **errp)
>>>> +{
>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>>>> +
>>>> +    return spapr->vof != NULL;
>>>> +}
>>>> +
>>>> +static void spapr_set_vof(Object *obj, bool value, Error **errp)
>>>> +{
>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>>>> +
>>>> +    if (spapr->vof) {
>>>> +        vof_cleanup(spapr->vof);
>>>> +        g_free(spapr->vof);
>>>> +        spapr->vof = NULL;
>>>> +    }
>>>> +    if (!value) {
>>>> +        return;
>>>> +    }
>>>> +    spapr->vof = g_malloc0(sizeof(*spapr->vof));
>>>> +}
>>>> +#endif
>>>> +
>>>> static char *spapr_get_ic_mode(Object *obj, Error **errp)
>>>> {
>>>>     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>>>> @@ -3329,6 +3383,12 @@ static void spapr_instance_init(Object *obj)
>>>>                                     stringify(KERNEL_LOAD_ADDR)
>>>>                                     " for -kernel is the default");
>>>>     spapr->kernel_addr = KERNEL_LOAD_ADDR;
>>>> +#ifdef CONFIG_VOF
>>>> +    object_property_add_bool(obj, "x-vof", spapr_get_vof, 
>>>> spapr_set_vof);
>>>> +    object_property_set_description(obj, "x-vof",
>>>> +                                    "Enable Virtual Open Firmware 
>>>> (experimental)");
>>>> +#endif
>>>> +
>>>>     /* The machine class defines the default interrupt controller 
>>>> mode */
>>>>     spapr->irq = smc->irq;
>>>>     object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
>>>> @@ -4580,6 +4640,16 @@ static void 
>>>> spapr_machine_class_init(ObjectClass *oc, void *data)
>>>>     smc->smp_threads_vsmt = true;
>>>>     smc->nr_xirqs = SPAPR_NR_XIRQS;
>>>>     xfc->match_nvt = spapr_match_nvt;
>>>> +
>>>> +#ifdef CONFIG_VOF
>>>> +    {
>>>> +        VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
>>>> +        vmc->client_architecture_support =
>>>> +            spapr_vof_client_architecture_support;
>>>> +        vmc->quiesce = spapr_vof_quiesce;
>>>> +        vmc->setprop = spapr_vof_setprop;
>>>> +    }
>>>> +#endif
>>>> }
>>>>
>>>> static const TypeInfo spapr_machine_info = {
>>>> @@ -4599,6 +4669,9 @@ static const TypeInfo spapr_machine_info = {
>>>>         { TYPE_XICS_FABRIC },
>>>>         { TYPE_INTERRUPT_STATS_PROVIDER },
>>>>         { TYPE_XIVE_FABRIC },
>>>> +#ifdef CONFIG_VOF
>>>> +        { TYPE_VOF_MACHINE_IF },
>>>> +#endif
>>>>         { }
>>>>     },
>>>> };
>>>> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
>>>> index f25014afda40..986a4de34128 100644
>>>> --- a/hw/ppc/spapr_hcall.c
>>>> +++ b/hw/ppc/spapr_hcall.c
>>>> @@ -1080,7 +1080,7 @@ target_ulong 
>>>> do_client_architecture_support(PowerPCCPU *cpu,
>>>>     SpaprOptionVector *ov1_guest, *ov5_guest;
>>>>     bool guest_radix;
>>>>     bool raw_mode_supported = false;
>>>> -    bool guest_xive;
>>>> +    bool guest_xive, reset_fdt = false;
>>>>     CPUState *cs;
>>>>     void *fdt;
>>>>     uint32_t max_compat = spapr->max_compat_pvr;
>>>> @@ -1233,8 +1233,10 @@ target_ulong 
>>>> do_client_architecture_support(PowerPCCPU *cpu,
>>>>         spapr_setup_hpt(spapr);
>>>>     }
>>>>
>>>> -    fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
>>>> -
>>>> +#ifdef CONFIG_VOF
>>>> +    reset_fdt = spapr->vof != NULL;
>>>
>>> (Here when storing to a bool !! could make sense but what you have is 
>>> better as it's clearer so I'm not suggesting to use !! here either.
>>
>> I prefer this way
> 
> I agree that this way writing with explicit != NULL is clear so I prefer 
> that here too. I just mentioned that !! might make more sense here 
> than above.
> 
>> and I would rather do this than "!!" but again precedence confuses me 
>> sometimes so up there I'd need braces for the condition and then 
>> folks start asking "why you need braces" :)
> 
> For precedence "man operator" has a quick table to help, it's hard to 
> remember. In my opinion !! is only useful if you need to convert 
> something like a flag to a bool like bool = !!(reg & BIT(x)) otherwise 
> it's probably clearer to do without it as it may confuse those not 
> familiar with it.
> 
>> I do not need braces here as "=" has the priority (although the fact 
>> that it returns a value is just bizarre).
> 
> Everything in C has practical reasons. I think assignment returning a 
> value is so you could write a = b = 0; although this is discouraged 
> usually.

Exactly.

> Also you can do while ((c = getc())) so this is sometimes 
> useful to have = return a value.


When I see such a "while", I feel the urge to kill ;) There is nothing practical 
about it imho.


> 
>>> It's rarely useful, maybe only if you need a bool but do not have 
>>> space to write the condition or it would be more confusing that way.)
>>
>>
>>
>>>
>>>> +#endif
>>>> +    fdt = spapr_build_fdt(spapr, reset_fdt, fdt_bufsize);
>>>>     g_free(spapr->fdt_blob);
>>>>     spapr->fdt_size = fdt_totalsize(fdt);
>>>>     spapr->fdt_initial_size = spapr->fdt_size;
>>>> @@ -1277,6 +1279,27 @@ static target_ulong 
>>>> h_client_architecture_support(PowerPCCPU *cpu,
>>>>     return ret;
>>>> }
>>>>
>>>> +#ifdef CONFIG_VOF
>>>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>>>> +                                                   CPUState *cs,
>>>> +                                                   target_ulong 
>>>> ovec_addr)
>>>> +{
>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
>>>> +
>>>> +    target_ulong ret = 
>>>> do_client_architecture_support(POWERPC_CPU(cs), spapr,
>>>> +                                                      ovec_addr, 
>>>> FDT_MAX_SIZE);
>>>> +
>>>> +    /*
>>>> +     * This adds stdout and generates phandles for boottime and CAS 
>>>> FDTs.
>>>> +     * It is alright to update the FDT here as 
>>>> do_client_architecture_support()
>>>> +     * does not pack it.
>>>> +     */
>>>> +    spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +#endif
>>>> +
>>>> static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
>>>>                                               SpaprMachineState *spapr,
>>>>                                               target_ulong opcode,
>>>> diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c
>>>> new file mode 100644
>>>> index 000000000000..653d376f38aa
>>>> --- /dev/null
>>>> +++ b/hw/ppc/spapr_vof.c
>>>> @@ -0,0 +1,153 @@
>>>> +/*
>>>> + * SPAPR machine hooks to Virtual Open Firmware,
>>>> + *
>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>> + */
>>>> +#include "qemu/osdep.h"
>>>> +#include "qemu-common.h"
>>>> +#include <sys/ioctl.h>
>>>> +#include "qapi/error.h"
>>>> +#include "hw/ppc/spapr.h"
>>>> +#include "hw/ppc/spapr_vio.h"
>>>> +#include "hw/ppc/fdt.h"
>>>> +#include "sysemu/sysemu.h"
>>>> +#include "qom/qom-qobject.h"
>>>> +#include "trace.h"
>>>> +
>>>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>>>> *spapr,
>>>> +                                target_ulong opcode, target_ulong 
>>>> *_args)
>>>> +{
>>>> +    int ret = vof_client_call(MACHINE(spapr), spapr->vof, 
>>>> spapr->fdt_blob,
>>>> +                              ppc64_phys_to_real(_args[0]));
>>>> +
>>>> +    if (ret) {
>>>> +        return H_PARAMETER;
>>>> +    }
>>>> +    return H_SUCCESS;
>>>> +}
>>>> +
>>>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt)
>>>> +{
>>>> +    char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
>>>> +    int chosen;
>>>> +
>>>> +    vof_build_dt(fdt, spapr->vof);
>>>> +
>>>> +    _FDT(chosen = fdt_path_offset(fdt, "/chosen"));
>>>> +    _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
>>>> +                            spapr->vof->bootargs ? : ""));
>>>> +
>>>> +    /*
>>>> +     * SLOF-less setup requires an open instance of stdout for early
>>>> +     * kernel printk. By now all phandles are settled so we can open
>>>> +     * the default serial console.
>>>> +     */
>>>> +    if (stdout_path) {
>>>> +        _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", 
>>>> "stdout",
>>>> +                                   stdout_path));
>>>> +    }
>>>> +}
>>>> +
>>>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>>>> +                     target_ulong *stack_ptr, Error **errp)
>>>> +{
>>>> +    Vof *vof = spapr->vof;
>>>> +
>>>> +    vof_init(vof, spapr->rma_size, errp);
>>>> +
>>>> +    *stack_ptr = vof_claim(vof, 0, VOF_STACK_SIZE, VOF_STACK_SIZE);
>>>> +    if (*stack_ptr == -1) {
>>>> +        error_setg(errp, "Memory allocation for stack failed");
>>>> +        return;
>>>> +    }
>>>> +    /* Stack grows downwards plus reserve space for the minimum 
>>>> stack frame */
>>>> +    *stack_ptr += VOF_STACK_SIZE - 0x20;
>>>> +
>>>> +    if (spapr->kernel_size &&
>>>> +        vof_claim(vof, spapr->kernel_addr, spapr->kernel_size, 0) 
>>>> == -1) {
>>>> +        error_setg(errp, "Memory for kernel is in use");
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (spapr->initrd_size &&
>>>> +        vof_claim(vof, spapr->initrd_base, spapr->initrd_size, 0) 
>>>> == -1) {
>>>> +        error_setg(errp, "Memory for initramdisk is in use");
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    spapr_vof_client_dt_finalize(spapr, fdt);
>>>> +
>>>> +    /*
>>>> +     * At this point the expected allocation map is:
>>>> +     *
>>>> +     * 0..c38 - the initial firmware
>>>> +     * 8000..10000 - stack
>>>> +     * 400000.. - kernel
>>>> +     * 3ea0000.. - initramdisk
>>>> +     *
>>>> +     * We skip writing FDT as nothing expects it; OF client 
>>>> interface is
>>>> +     * going to be used for reading the device tree.
>>>> +     */
>>>> +}
>>>> +
>>>> +void spapr_vof_quiesce(MachineState *ms)
>>>> +{
>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
>>>> +
>>>> +    spapr->fdt_size = fdt_totalsize(spapr->fdt_blob);
>>>> +    spapr->fdt_initial_size = spapr->fdt_size;
>>>> +}
>>>> +
>>>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const 
>>>> char *propname,
>>>> +                       void *val, int vallen)
>>>> +{
>>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
>>>> +
>>>> +    /*
>>>> +     * We only allow changing properties which we know how to 
>>>> update in QEMU
>>>> +     * OR
>>>> +     * the ones which we know that they need to survive during 
>>>> "quiesce".
>>>> +     */
>>>> +
>>>> +    if (strcmp(path, "/rtas") == 0) {
>>>> +        if (strcmp(propname, "linux,rtas-base") == 0 ||
>>>> +            strcmp(propname, "linux,rtas-entry") == 0) {
>>>> +            /* These need to survive quiesce so let them store in 
>>>> the FDT */
>>>> +            return true;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    if (strcmp(path, "/chosen") == 0) {
>>>> +        if (strcmp(propname, "bootargs") == 0) {
>>>> +            Vof *vof = spapr->vof;
>>>> +
>>>> +            g_free(vof->bootargs);
>>>> +            vof->bootargs = g_strndup(val, vallen);
>>>> +            return true;
>>>> +        }
>>>> +        if (strcmp(propname, "linux,initrd-start") == 0) {
>>>> +            if (vallen == sizeof(uint32_t)) {
>>>> +                spapr->initrd_base = ldl_be_p(val);
>>>> +                return true;
>>>> +            }
>>>> +            if (vallen == sizeof(uint64_t)) {
>>>> +                spapr->initrd_base = ldq_be_p(val);
>>>> +                return true;
>>>> +            }
>>>> +            return false;
>>>> +        }
>>>> +        if (strcmp(propname, "linux,initrd-end") == 0) {
>>>> +            if (vallen == sizeof(uint32_t)) {
>>>> +                spapr->initrd_size = ldl_be_p(val) - 
>>>> spapr->initrd_base;
>>>> +                return true;
>>>> +            }
>>>> +            if (vallen == sizeof(uint64_t)) {
>>>> +                spapr->initrd_size = ldq_be_p(val) - 
>>>> spapr->initrd_base;
>>>> +                return true;
>>>> +            }
>>>> +            return false;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return true;
>>>> +}
>>>> diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
>>>> new file mode 100644
>>>> index 000000000000..1068a1e58388
>>>> --- /dev/null
>>>> +++ b/hw/ppc/vof.c
>>>> @@ -0,0 +1,1052 @@
>>>> +/*
>>>> + * QEMU PowerPC Virtual Open Firmware.
>>>> + *
>>>> + * This implements client interface from OpenFirmware IEEE1275 on 
>>>> the QEMU
>>>> + * side to leave only a very basic firmware in the VM.
>>>> + *
>>>> + * Copyright (c) 2021 IBM Corporation.
>>>> + *
>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>> + */
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qemu-common.h"
>>>> +#include "qemu/timer.h"
>>>> +#include "qemu/range.h"
>>>> +#include "qemu/units.h"
>>>> +#include "qapi/error.h"
>>>> +#include <sys/ioctl.h>
>>>> +#include "exec/ram_addr.h"
>>>> +#include "exec/address-spaces.h"
>>>> +#include "hw/ppc/vof.h"
>>>> +#include "hw/ppc/fdt.h"
>>>> +#include "sysemu/runstate.h"
>>>> +#include "qom/qom-qobject.h"
>>>> +#include "trace.h"
>>>> +
>>>> +#include <libfdt.h>
>>>> +
>>>> +/*
>>>> + * OF 1275 "nextprop" description suggests is it 32 bytes max but
>>>> + * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 
>>>> chars long.
>>>> + */
>>>> +#define OF_PROPNAME_LEN_MAX 64
>>>> +
>>>> +#define VOF_MAX_PATH        256
>>>> +#define VOF_MAX_SETPROPLEN  2048
>>>> +#define VOF_MAX_METHODLEN   256
>>>> +#define VOF_MAX_FORTHCODE   256
>>>> +#define VOF_VTY_BUF_SIZE    256
>>>> +
>>>> +typedef struct {
>>>> +    uint64_t start;
>>>> +    uint64_t size;
>>>> +} OfClaimed;
>>>> +
>>>> +typedef struct {
>>>> +    char *path; /* the path used to open the instance */
>>>> +    uint32_t phandle;
>>>> +} OfInstance;
>>>> +
>>>> +static int readstr(hwaddr pa, char *buf, int size)
>>>> +{
>>>> +    if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) {
>>>> +        return -1;
>>>> +    }
>>>> +    if (strnlen(buf, size) == size) {
>>>> +        buf[size - 1] = '\0';
>>>> +        trace_vof_error_str_truncated(buf, size);
>>>> +        return -1;
>>>> +    }
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static bool cmpservice(const char *s, unsigned nargs, unsigned nret,
>>>> +                       const char *s1, unsigned nargscheck, 
>>>> unsigned nretcheck)
>>>> +{
>>>> +    if (strcmp(s, s1)) {
>>>> +        return false;
>>>> +    }
>>>> +    if ((nargscheck && (nargs != nargscheck)) ||
>>>> +        (nretcheck && (nret != nretcheck))) {
>>>> +        trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret);
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    return true;
>>>> +}
>>>> +
>>>> +static void prop_format(char *tval, int tlen, const void *prop, int 
>>>> len)
>>>> +{
>>>> +    int i;
>>>> +    const unsigned char *c;
>>>> +    char *t;
>>>> +    const char bin[] = "...";
>>>> +
>>>> +    for (i = 0, c = prop; i < len; ++i, ++c) {
>>>> +        if (*c == '\0' && i == len - 1) {
>>>> +            strncpy(tval, prop, tlen - 1);
>>>> +            return;
>>>> +        }
>>>> +        if (*c < 0x20 || *c >= 0x80) {
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    for (i = 0, c = prop, t = tval; i < len; ++i, ++c) {
>>>> +        if (t >= tval + tlen - sizeof(bin) - 1 - 2 - 1) {
>>>> +            strcpy(t, bin);
>>>> +            return;
>>>> +        }
>>>> +        if (i && i % 4 == 0 && i != len - 1) {
>>>> +            strcat(t, " ");
>>>> +            ++t;
>>>> +        }
>>>> +        t += sprintf(t, "%02X", *c & 0xFF);
>>>> +    }
>>>> +}
>>>> +
>>>> +static int get_path(const void *fdt, int offset, char *buf, int len)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    ret = fdt_get_path(fdt, offset, buf, len - 1);
>>>> +    if (ret < 0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    buf[len - 1] = '\0';
>>>> +
>>>> +    return strlen(buf) + 1;
>>>> +}
>>>> +
>>>> +static int phandle_to_path(const void *fdt, uint32_t ph, char *buf, 
>>>> int len)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    ret = fdt_node_offset_by_phandle(fdt, ph);
>>>> +    if (ret < 0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    return get_path(fdt, ret, buf, len);
>>>> +}
>>>> +
>>>> +static int path_offset(const void *fdt, const char *path)
>>>> +{
>>>> +    g_autofree char *p = NULL;
>>>> +    char *at;
>>>> +
>>>> +    /*
>>>> +     * The addresses in node names are expected to in the lower 
>>>> case as per
>>>
>>> There's some grammar problem with this sentence. I think it should be 
>>> "are expected to be in lower case" but ask a native speaker.
>>
>> Definitely missed "be". Thanks for spotting.
>>
>>
>>>
>>>> +     * 
>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>> +     */
>>>> +    at = strchr(path, '@');
>>>> +    if (!at) {
>>>> +        return fdt_path_offset(fdt, path);
>>>> +    }
>>>> +
>>>> +    p = g_strdup(path);
>>>> +    for (at = at - path + p + 1; *at; ++at) {
>>>> +        *at = tolower(*at);
>>>> +    }
>>>> +    return fdt_path_offset(fdt, p);
>>>> +}
>>>> +
>>>> +static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr)
>>>> +{
>>>> +    char fullnode[VOF_MAX_PATH];
>>>> +    uint32_t ret = -1;
>>>> +    int offset;
>>>> +
>>>> +    if (readstr(nodeaddr, fullnode, sizeof(fullnode))) {
>>>> +        return (uint32_t) ret;
>>>> +    }
>>>> +
>>>> +    offset = path_offset(fdt, fullnode);
>>>> +    if (offset >= 0) {
>>>> +        ret = fdt_get_phandle(fdt, offset);
>>>> +    }
>>>> +    trace_vof_finddevice(fullnode, ret);
>>>> +    return (uint32_t) ret;
>>>> +}
>>>> +
>>>> +static const void *getprop(const void *fdt, int nodeoff, const char 
>>>> *propname,
>>>> +                           int *proplen, bool *write0)
>>>> +{
>>>> +    const char *unit, *prop;
>>>> +    const void *ret = fdt_getprop(fdt, nodeoff, propname, proplen);
>>>> +
>>>> +    if (ret) {
>>>> +        if (write0) {
>>>> +            *write0 = false;
>>>> +        }
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * The "name" property is not actually expected as a property 
>>>> in the FDT
>>>> +     * (although some platform may create those in "/" so we try 
>>>> getprop first),
>>>
>>> Not only in "/" but anywhere. MorphOS walks the tree with nextprop 
>>> and expects to get a name property for most nodes without ever 
>>> explicitly querying "name". I've tested this with both the board 
>>> firmware and VOF and with the board firmware a name property appears 
>>> in most nodes but not all so I think at least SmartFirmware does the 
>>> same and explicitly sets name on some nodes and otherwise returns 
>>> the name from path if such property does not exist but is queried. With 
>>> this in VOF I can do the same and get same results so the change 
>>> should be OK but the comment may be misleading now. Better to just 
>>> say we return a value for "name" from path if queried but property 
>>> does not exist which seems to be what OF does too.
>>
>>
>> Fair point, after checking with o1275 and 
>> devicetree-specification-v0.2.pdf, I'll do what you said.
> 
> Just to avoid misunderstandings: only change the comment not the code, 
> it works this way just the comment could be adjusted to describe it better.

Yup, agree.


>>
>>>> +     * we emulate it by returning a pointer to the node's name and 
>>>> adjust
>>>> +     * proplen to include only the name but not the unit.
>>>> +     */
>>>> +    if (strcmp(propname, "name")) {
>>>> +        return NULL;
>>>> +    }
>>>> +    prop = fdt_get_name(fdt, nodeoff, proplen);
>>>> +    if (!prop) {
>>>> +        *proplen = 0;
>>>> +        return NULL;
>>>> +    }
>>>> +
>>>> +    unit = memchr(prop, '@', *proplen);
>>>> +    if (unit) {
>>>> +        *proplen = unit - prop;
>>>> +    }
>>>> +    *proplen += 1;
>>>> +
>>>> +    /*
>>>> +     * Since it might be cut at "@" and there will be no trailing zero
>>>> +     * in the prop buffer, tell the caller to write zero at the end.
>>>> +     */
>>>> +    if (write0) {
>>>> +        *write0 = true;
>>>> +    }
>>>> +    return prop;
>>>> +}
>>>> +
>>>> +static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, 
>>>> uint32_t pname,
>>>> +                            uint32_t valaddr, uint32_t vallen)
>>>> +{
>>>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>>>> +    uint32_t ret = 0;
>>>> +    int proplen = 0;
>>>> +    const void *prop;
>>>> +    char trval[64] = "";
>>>> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
>>>> +    bool write0;
>>>> +
>>>> +    if (nodeoff < 0) {
>>>> +        return -1;
>>>> +    }
>>>> +    if (readstr(pname, propname, sizeof(propname))) {
>>>> +        return -1;
>>>> +    }
>>>> +    prop = getprop(fdt, nodeoff, propname, &proplen, &write0);
>>>> +    if (prop) {
>>>> +        const char zero = 0;
>>>> +        int cb = MIN(proplen, vallen);
>>>> +
>>>> +        if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK ||
>>>> +            /* if that was "name" with a unit address, overwrite 
>>>> '@' with '0' */
>>>> +            (write0 &&
>>>> +             cb == proplen &&
>>>> +             VOF_MEM_WRITE(valaddr + cb - 1, &zero, 1) != MEMTX_OK)) {
>>>> +            ret = -1;
>>>> +        } else {
>>>> +            /*
>>>> +             * OF1275 says:
>>>> +             * "Size is either the actual size of the property, or 
>>>> -1 if name
>>>> +             * does not exist", hence returning proplen instead of cb.
>>>> +             */
>>>> +            ret = proplen;
>>>> +            /* Do not format a value if tracepoint is silent, for 
>>>> performance */
>>>> +            if (trace_event_get_state(TRACE_VOF_GETPROP) &&
>>>> +                qemu_loglevel_mask(LOG_TRACE)) {
>>>> +                prop_format(trval, sizeof(trval), prop, ret);
>>>> +            }
>>>> +        }
>>>> +    } else {
>>>> +        ret = -1;
>>>> +    }
>>>> +    trace_vof_getprop(nodeph, propname, ret, trval);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, 
>>>> uint32_t pname)
>>>> +{
>>>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>>>> +    uint32_t ret = 0;
>>>> +    int proplen = 0;
>>>> +    const void *prop;
>>>> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
>>>> +
>>>> +    if (nodeoff < 0) {
>>>> +        return -1;
>>>> +    }
>>>> +    if (readstr(pname, propname, sizeof(propname))) {
>>>> +        return -1;
>>>> +    }
>>>> +    prop = getprop(fdt, nodeoff, propname, &proplen, NULL);
>>>> +    if (prop) {
>>>> +        ret = proplen;
>>>> +    } else {
>>>> +        ret = -1;
>>>> +    }
>>>> +    trace_vof_getproplen(nodeph, propname, ret);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_setprop(MachineState *ms, void *fdt, Vof *vof,
>>>> +                            uint32_t nodeph, uint32_t pname,
>>>> +                            uint32_t valaddr, uint32_t vallen)
>>>> +{
>>>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>>>> +    uint32_t ret = -1;
>>>> +    int offset;
>>>> +    char trval[64] = "";
>>>> +    char nodepath[VOF_MAX_PATH] = "";
>>>> +    Object *vmo = object_dynamic_cast(OBJECT(ms), 
>>>> TYPE_VOF_MACHINE_IF);
>>>> +    g_autofree char *val = NULL;
>>>> +
>>>> +    if (vallen > VOF_MAX_SETPROPLEN) {
>>>> +        goto trace_exit;
>>>> +    }
>>>> +    if (readstr(pname, propname, sizeof(propname))) {
>>>> +        goto trace_exit;
>>>> +    }
>>>> +    offset = fdt_node_offset_by_phandle(fdt, nodeph);
>>>> +    if (offset < 0) {
>>>> +        goto trace_exit;
>>>> +    }
>>>> +    ret = get_path(fdt, offset, nodepath, sizeof(nodepath));
>>>> +    if (ret <= 0) {
>>>> +        goto trace_exit;
>>>> +    }
>>>> +
>>>> +    val = g_malloc0(vallen);
>>>> +    if (VOF_MEM_READ(valaddr, val, vallen) != MEMTX_OK) {
>>>> +        goto trace_exit;
>>>> +    }
>>>> +
>>>> +    if (vmo) {
>>>> +        VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo);
>>>> +
>>>> +        if (vmc->setprop &&
>>>> +            !vmc->setprop(ms, nodepath, propname, val, vallen)) {
>>>> +            goto trace_exit;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    ret = fdt_setprop(fdt, offset, propname, val, vallen);
>>>> +    if (ret) {
>>>> +        goto trace_exit;
>>>> +    }
>>>> +
>>>> +    if (trace_event_get_state(TRACE_VOF_SETPROP) &&
>>>> +        qemu_loglevel_mask(LOG_TRACE)) {
>>>> +        prop_format(trval, sizeof(trval), val, vallen);
>>>> +    }
>>>> +    ret = vallen;
>>>> +
>>>> +trace_exit:
>>>> +    trace_vof_setprop(nodeph, propname, trval, vallen, ret);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_nextprop(const void *fdt, uint32_t phandle,
>>>> +                             uint32_t prevaddr, uint32_t nameaddr)
>>>> +{
>>>> +    int offset, nodeoff = fdt_node_offset_by_phandle(fdt, phandle);
>>>> +    char prev[OF_PROPNAME_LEN_MAX + 1];
>>>> +    const char *tmp;
>>>> +
>>>> +    if (readstr(prevaddr, prev, sizeof(prev))) {
>>>> +        return -1;
>>>> +    }
>>>> +
>>>> +    fdt_for_each_property_offset(offset, fdt, nodeoff) {
>>>> +        if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
>>>> +            return 0;
>>>> +        }
>>>> +        if (prev[0] == '\0' || strcmp(prev, tmp) == 0) {
>>>> +            if (prev[0] != '\0') {
>>>> +                offset = fdt_next_property_offset(fdt, offset);
>>>> +                if (offset < 0) {
>>>> +                    return 0;
>>>> +                }
>>>> +            }
>>>> +            if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
>>>> +                return 0;
>>>> +            }
>>>> +
>>>> +            if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != 
>>>> MEMTX_OK) {
>>>> +                return -1;
>>>> +            }
>>>> +            return 1;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static uint32_t vof_peer(const void *fdt, uint32_t phandle)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    if (phandle == 0) {
>>>> +        ret = fdt_path_offset(fdt, "/");
>>>> +    } else {
>>>> +        ret = fdt_next_subnode(fdt, fdt_node_offset_by_phandle(fdt, 
>>>> phandle));
>>>> +    }
>>>> +
>>>> +    if (ret < 0) {
>>>> +        ret = 0;
>>>> +    } else {
>>>> +        ret = fdt_get_phandle(fdt, ret);
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_child(const void *fdt, uint32_t phandle)
>>>> +{
>>>> +    int ret = fdt_first_subnode(fdt, 
>>>> fdt_node_offset_by_phandle(fdt, phandle));
>>>> +
>>>> +    if (ret < 0) {
>>>> +        ret = 0;
>>>> +    } else {
>>>> +        ret = fdt_get_phandle(fdt, ret);
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_parent(const void *fdt, uint32_t phandle)
>>>> +{
>>>> +    int ret = fdt_parent_offset(fdt, 
>>>> fdt_node_offset_by_phandle(fdt, phandle));
>>>> +
>>>> +    if (ret < 0) {
>>>> +        ret = 0;
>>>> +    } else {
>>>> +        ret = fdt_get_phandle(fdt, ret);
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_do_open(void *fdt, Vof *vof, int offset, const 
>>>> char *path)
>>>> +{
>>>> +    uint32_t ret = -1;
>>>> +    OfInstance *inst = NULL;
>>>> +
>>>> +    if (vof->of_instance_last == 0xFFFFFFFF) {
>>>> +        /* We do not recycle ihandles yet */
>>>> +        goto trace_exit;
>>>> +    }
>>>> +
>>>> +    inst = g_new0(OfInstance, 1);
>>>> +    inst->phandle = fdt_get_phandle(fdt, offset);
>>>> +    g_assert(inst->phandle);
>>>> +    ++vof->of_instance_last;
>>>> +
>>>> +    inst->path = g_strdup(path);
>>>> +    g_hash_table_insert(vof->of_instances,
>>>> +                        GINT_TO_POINTER(vof->of_instance_last),
>>>> +                        inst);
>>>> +    ret = vof->of_instance_last;
>>>> +
>>>> +trace_exit:
>>>> +    trace_vof_open(path, inst ? inst->phandle : 0, ret);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>>>> *nodename,
>>>> +                               const char *prop, const char *path)
>>>> +{
>>>> +    int node = fdt_path_offset(fdt, nodename);
>>>> +    int inst, offset;
>>>> +
>>>> +    offset = fdt_path_offset(fdt, path);
>>>> +    if (offset < 0) {
>>>> +        trace_vof_error_unknown_path(path);
>>>> +        return offset;
>>>> +    }
>>>> +
>>>> +    inst = vof_do_open(fdt, vof, offset, path);
>>>> +
>>>> +    return fdt_setprop_cell(fdt, node, prop, inst);
>>>> +}
>>>> +
>>>> +static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr)
>>>> +{
>>>> +    char path[VOF_MAX_PATH];
>>>> +    int offset;
>>>> +
>>>> +    if (readstr(pathaddr, path, sizeof(path))) {
>>>> +        return -1;
>>>> +    }
>>>> +
>>>> +    offset = path_offset(fdt, path);
>>>> +    if (offset < 0) {
>>>> +        trace_vof_error_unknown_path(path);
>>>> +        return offset;
>>>> +    }
>>>> +
>>>> +    return vof_do_open(fdt, vof, offset, path);
>>>> +}
>>>> +
>>>> +static void vof_close(Vof *vof, uint32_t ihandle)
>>>> +{
>>>> +    if (!g_hash_table_remove(vof->of_instances, 
>>>> GINT_TO_POINTER(ihandle))) {
>>>> +        trace_vof_error_unknown_ihandle_close(ihandle);
>>>> +    }
>>>> +}
>>>> +
>>>> +static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle)
>>>> +{
>>>> +    gpointer instp = g_hash_table_lookup(vof->of_instances,
>>>> +                                         GINT_TO_POINTER(ihandle));
>>>> +    uint32_t ret = -1;
>>>> +
>>>> +    if (instp) {
>>>> +        ret = ((OfInstance *)instp)->phandle;
>>>> +    }
>>>> +    trace_vof_instance_to_package(ihandle, ret);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle,
>>>> +                                    uint32_t buf, uint32_t len)
>>>> +{
>>>> +    uint32_t ret = -1;
>>>> +    char tmp[VOF_MAX_PATH] = "";
>>>> +
>>>> +    ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
>>>> +    if (ret > 0) {
>>>> +        if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
>>>> +            ret = -1;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    trace_vof_package_to_path(phandle, tmp, ret);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t 
>>>> ihandle,
>>>> +                                     uint32_t buf, uint32_t len)
>>>> +{
>>>> +    uint32_t ret = -1;
>>>> +    uint32_t phandle = vof_instance_to_package(vof, ihandle);
>>>> +    char tmp[VOF_MAX_PATH] = "";
>>>> +
>>>> +    if (phandle != -1) {
>>>> +        ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
>>>> +        if (ret > 0) {
>>>> +            if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
>>>> +                ret = -1;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +    trace_vof_instance_to_path(ihandle, phandle, tmp, ret);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_write(Vof *vof, uint32_t ihandle, uint32_t buf,
>>>> +                          uint32_t len)
>>>> +{
>>>> +    char tmp[VOF_VTY_BUF_SIZE];
>>>> +    unsigned cb;
>>>> +    OfInstance *inst = (OfInstance *)
>>>> +        g_hash_table_lookup(vof->of_instances, 
>>>> GINT_TO_POINTER(ihandle));
>>>> +
>>>> +    if (!inst) {
>>>> +        trace_vof_error_write(ihandle);
>>>> +        return -1;
>>>> +    }
>>>> +
>>>> +    for ( ; len > 0; len -= cb) {
>>>> +        cb = MIN(len, sizeof(tmp) - 1);
>>>> +        if (VOF_MEM_READ(buf, tmp, cb) != MEMTX_OK) {
>>>> +            return -1;
>>>> +        }
>>>> +
>>>> +        /* FIXME: there is no backend(s) yet so just call a trace */
>>>> +        if (trace_event_get_state(TRACE_VOF_WRITE) &&
>>>> +            qemu_loglevel_mask(LOG_TRACE)) {
>>>> +            tmp[cb] = '\0';
>>>> +            trace_vof_write(ihandle, cb, tmp);
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return len;
>>>> +}
>>>> +
>>>> +static void vof_claimed_dump(GArray *claimed)
>>>> +{
>>>> +    int i;
>>>> +    OfClaimed c;
>>>> +
>>>> +    if (trace_event_get_state(TRACE_VOF_CLAIMED) &&
>>>> +        qemu_loglevel_mask(LOG_TRACE)) {
>>>> +
>>>> +        for (i = 0; i < claimed->len; ++i) {
>>>> +            c = g_array_index(claimed, OfClaimed, i);
>>>> +            trace_vof_claimed(c.start, c.start + c.size, c.size);
>>>> +        }
>>>> +    }
>>>> +}
>>>> +
>>>> +static bool vof_claim_avail(GArray *claimed, uint64_t virt, 
>>>> uint64_t size)
>>>> +{
>>>> +    int i;
>>>> +    OfClaimed c;
>>>> +
>>>> +    for (i = 0; i < claimed->len; ++i) {
>>>> +        c = g_array_index(claimed, OfClaimed, i);
>>>> +        if (ranges_overlap(c.start, c.size, virt, size)) {
>>>> +            return false;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return true;
>>>> +}
>>>> +
>>>> +static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t 
>>>> size)
>>>> +{
>>>> +    OfClaimed newclaim;
>>>> +
>>>> +    newclaim.start = virt;
>>>> +    newclaim.size = size;
>>>> +    g_array_append_val(claimed, newclaim);
>>>> +}
>>>> +
>>>> +static gint of_claimed_compare_func(gconstpointer a, gconstpointer b)
>>>> +{
>>>> +    return ((OfClaimed *)a)->start - ((OfClaimed *)b)->start;
>>>> +}
>>>> +
>>>> +static void vof_dt_memory_available(void *fdt, GArray *claimed, 
>>>> uint64_t base)
>>>> +{
>>>> +    int i, n, offset, proplen = 0, sc, ac;
>>>> +    target_ulong mem0_end;
>>>> +    const uint8_t *mem0_reg;
>>>> +    g_autofree uint8_t *avail = NULL;
>>>> +    uint8_t *availcur;
>>>> +
>>>> +    if (!fdt || !claimed) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    offset = fdt_path_offset(fdt, "/");
>>>> +    _FDT(offset);
>>>> +    ac = fdt_address_cells(fdt, offset);
>>>> +    g_assert(ac == 1 || ac == 2);
>>>> +    sc = fdt_size_cells(fdt, offset);
>>>> +    g_assert(sc == 1 || sc == 2);
>>>> +
>>>> +    offset = fdt_path_offset(fdt, "/memory@0");
>>>> +    _FDT(offset);
>>>> +
>>>> +    mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen);
>>>> +    g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc));
>>>> +    if (sc == 2) {
>>>> +        mem0_end = be64_to_cpu(*(uint64_t *)(mem0_reg + 
>>>> sizeof(uint32_t) * ac));
>>>> +    } else {
>>>> +        mem0_end = be32_to_cpu(*(uint32_t *)(mem0_reg + 
>>>> sizeof(uint32_t) * ac));
>>>> +    }
>>>> +
>>>> +    g_array_sort(claimed, of_claimed_compare_func);
>>>> +    vof_claimed_dump(claimed);
>>>> +
>>>> +    /*
>>>> +     * VOF resides in the first page so we do not need to check if 
>>>> there is
>>>> +     * available memory before the first claimed block
>>>> +     */
>>>> +    g_assert(claimed->len && (g_array_index(claimed, OfClaimed, 
>>>> 0).start == 0));
>>>> +
>>>> +    avail = g_malloc0(sizeof(uint32_t) * (ac + sc) * claimed->len);
>>>> +    for (i = 0, n = 0, availcur = avail; i < claimed->len; ++i) {
>>>> +        OfClaimed c = g_array_index(claimed, OfClaimed, i);
>>>> +        uint64_t start, size;
>>>> +
>>>> +        start = c.start + c.size;
>>>> +        if (i < claimed->len - 1) {
>>>> +            OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1);
>>>> +
>>>> +            size = cn.start - start;
>>>> +        } else {
>>>> +            size = mem0_end - start;
>>>> +        }
>>>> +
>>>> +        if (ac == 2) {
>>>> +            *(uint64_t *) availcur = cpu_to_be64(start);
>>>> +        } else {
>>>> +            *(uint32_t *) availcur = cpu_to_be32(start);
>>>> +        }
>>>> +        availcur += sizeof(uint32_t) * ac;
>>>> +        if (sc == 2) {
>>>> +            *(uint64_t *) availcur = cpu_to_be64(size);
>>>> +        } else {
>>>> +            *(uint32_t *) availcur = cpu_to_be32(size);
>>>> +        }
>>>> +        availcur += sizeof(uint32_t) * sc;
>>>> +
>>>> +        if (size) {
>>>> +            trace_vof_avail(c.start + c.size, c.start + c.size + 
>>>> size, size);
>>>> +            ++n;
>>>> +        }
>>>> +    }
>>>> +    _FDT((fdt_setprop(fdt, offset, "available", avail, availcur - 
>>>> avail)));
>>>> +}
>>>> +
>>>> +/*
>>>> + * OF1275:
>>>> + * "Allocates size bytes of memory. If align is zero, the allocated 
>>>> range
>>>> + * begins at the virtual address virt. Otherwise, an aligned 
>>>> address is
>>>> + * automatically chosen and the input argument virt is ignored".
>>>> + *
>>>> + * In other words, exactly one of @virt and @align is non-zero.
>>>> + */
>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size,
>>>> +                   uint64_t align)
>>>> +{
>>>> +    uint64_t ret;
>>>> +
>>>> +    if (size == 0) {
>>>> +        ret = -1;
>>>> +    } else if (align == 0) {
>>>> +        if (!vof_claim_avail(vof->claimed, virt, size)) {
>>>> +            ret = -1;
>>>> +        } else {
>>>> +            ret = virt;
>>>> +        }
>>>> +    } else {
>>>> +        vof->claimed_base = QEMU_ALIGN_UP(vof->claimed_base, align);
>>>> +        while (1) {
>>>> +            if (vof->claimed_base >= vof->top_addr) {
>>>> +                error_report("Out of RMA memory for the OF client");
>>>> +                return -1;
>>>> +            }
>>>> +            if (vof_claim_avail(vof->claimed, vof->claimed_base, 
>>>> size)) {
>>>> +                break;
>>>> +            }
>>>> +            vof->claimed_base += size;
>>>> +        }
>>>> +        ret = vof->claimed_base;
>>>> +    }
>>>> +
>>>> +    if (ret != -1) {
>>>> +        vof->claimed_base = MAX(vof->claimed_base, ret + size);
>>>> +        vof_claim_add(vof->claimed, ret, size);
>>>> +    }
>>>> +    trace_vof_claim(virt, size, align, ret);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static uint32_t vof_release(Vof *vof, uint64_t virt, uint64_t size)
>>>> +{
>>>> +    uint32_t ret = -1;
>>>> +    int i;
>>>> +    GArray *claimed = vof->claimed;
>>>> +    OfClaimed c;
>>>> +
>>>> +    for (i = 0; i < claimed->len; ++i) {
>>>> +        c = g_array_index(claimed, OfClaimed, i);
>>>> +        if (c.start == virt && c.size == size) {
>>>> +            g_array_remove_index(claimed, i);
>>>> +            ret = 0;
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    trace_vof_release(virt, size, ret);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static void vof_instantiate_rtas(Error **errp)
>>>> +{
>>>> +    error_setg(errp, "The firmware should have instantiated RTAS");
>>>> +}
>>>> +
>>>> +static uint32_t vof_call_method(MachineState *ms, Vof *vof, 
>>>> uint32_t methodaddr,
>>>> +                                uint32_t ihandle, uint32_t param1,
>>>> +                                uint32_t param2, uint32_t param3,
>>>> +                                uint32_t param4, uint32_t *ret2)
>>>> +{
>>>> +    uint32_t ret = -1;
>>>> +    char method[VOF_MAX_METHODLEN] = "";
>>>> +    OfInstance *inst;
>>>> +
>>>> +    if (!ihandle) {
>>>> +        goto trace_exit;
>>>> +    }
>>>> +
>>>> +    inst = (OfInstance *) g_hash_table_lookup(vof->of_instances,
>>>> +                                              
>>>> GINT_TO_POINTER(ihandle));
>>>
>>> I think you should not have space in type casts but checkpatch does 
>>> not seem to mind. You have this at several places.
>>
>>
>> checkpatch does mind because it is truly ugly. I tried:
>>
>>
>> ERROR: "(foo*)" should be "(foo *)"
>> #2029: FILE: pc-bios/vof/ci.c:46:
>> +    if (prom_handle((void*)(unsigned long) args)) {
>>
>> total: 1 errors, 0 warnings, 2030 lines checked
> 
> That one yes, but I meant writing
> 
> (OfInstance *)g_hash_table_lookup
> 
> instead of
> 
> (OfInstance *) g_hash_table_lookup
> 
> which checkpatch doesn't seem to mind and I did not find anything about 
> this in coding style doc so maybe both are OK? I prefer casts without a 
> space so it's clear what is being cast but if there's no style for that 
> and checkpatch accepts it then use what you like.

Oh that. No, it is not enforced anywhere. But I like my spaces :) But
your way (git grep "\w \*)\w") seems to be a 3 times more popular choice
in QEMU than mine ("\w \*)\s\w"). Hm. Maybe.
BALATON Zoltan June 17, 2021, 9:16 a.m. UTC | #9
On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
> On 16/06/2021 20:34, BALATON Zoltan wrote:
>> On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
>>> On 6/15/21 20:29, BALATON Zoltan wrote:
>>>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>>>> The PAPR platform describes an OS environment that's presented by
>>>>> a combination of a hypervisor and firmware. The features it specifies
>>>>> require collaboration between the firmware and the hypervisor.
>>>>> 
>>>>> Since the beginning, the runtime component of the firmware (RTAS) has
>>>>> been implemented as a 20 byte shim which simply forwards it to
>>>>> a hypercall implemented in qemu. The boot time firmware component is
>>>>> SLOF - but a build that's specific to qemu, and has always needed to be
>>>>> updated in sync with it. Even though we've managed to limit the amount
>>>>> of runtime communication we need between qemu and SLOF, there's some,
>>>>> and it has become increasingly awkward to handle as we've implemented
>>>>> new features.
>>>>> 
>>>>> This implements a boot time OF client interface (CI) which is
>>>>> enabled by a new "x-vof" pseries machine option (stands for "Virtual 
>>>>> Open
>>>>> Firmware"). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>>>>> which implements Open Firmware Client Interface (OF CI). This allows
>>>>> using a smaller stateless firmware which does not have to manage
>>>>> the device tree.
>>>>> 
>>>>> The new "vof.bin" firmware image is included with source code under
>>>>> pc-bios/. It also includes RTAS blob.
>>>>> 
>>>>> This implements a handful of CI methods just to get -kernel/-initrd
>>>>> working. In particular, this implements the device tree fetching and
>>>>> simple memory allocator - "claim" (an OF CI memory allocator) and 
>>>>> updates
>>>>> "/memory@0/available" to report the client about available memory.
>>>>> 
>>>>> This implements changing some device tree properties which we know how
>>>>> to deal with, the rest is ignored. To allow changes, this skips
>>>>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>>>>> appending.
>>>>> 
>>>>> In absence of SLOF, this assigns phandles to device tree nodes to make
>>>>> device tree traversing work.
>>>>> 
>>>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>>>> 
>>>>> This adds basic instances support which are managed by a hash map
>>>>> ihandle -> [phandle].
>>>>> 
>>>>> Before the guest started, the used memory is:
>>>>> 0..e60 - the initial firmware
>>>>> 8000..10000 - stack
>>>>> 400000.. - kernel
>>>>> 3ea0000.. - initramdisk
>>>>> 
>>>>> This OF CI does not implement "interpret".
>>>>> 
>>>>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>>>>> includes a disk image with pre-formatted nvram.
>>>>> 
>>>>> With this basic support, this can only boot into kernel directly.
>>>>> However this is just enough for the petitboot kernel and initramdisk to
>>>>> boot from any possible source. Note this requires reasonably recent 
>>>>> guest
>>>>> kernel with:
>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>>>> The immediate benefit is much faster booting time, which is especially
>>>>> crucial with fully emulated early CPU bring-up environments. Also this
>>>>> may come in handy when/if GRUB-in-the-userspace sees the light of day.
>>>>> 
>>>>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>>>>> other POWERPC boards which do not support pSeries.
>>>>> 
>>>>> This makes VOF optional; it is disabled by default, add --enable-vof
>>>>> to ./configure to enable it.
>>>>> 
>>>>> This assumes potential support for booting from QEMU backends
>>>>> such as blockdev or netdev without devices/drivers used.
>>>>> 
>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>> ---
>>>>> 
>>>>> The example command line is:
>>>>> 
>>>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>>>> -nodefaults \
>>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>>>> -nographic \
>>>>> -vga none \
>>>>> -enable-kvm \
>>>>> -m 8G \
>>>>> -machine 
>>>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>>>> \
>>>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>>>> -initrd pb/rootfs.cpio.xz \
>>>>> -drive 
>>>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>>>> \
>>>>> -global spapr-nvram.drive=DRIVE0 \
>>>>> -snapshot \
>>>>> -smp 8,threads=8 \
>>>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>>>> -trace events=qemu_trace_events \
>>>>> -d guest_errors \
>>>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>>>> -mon chardev=SOCKET0,mode=control
>>>> 
>>>> I haven't looked at it in detail yet, just some quick comments I have on 
>>>> first skim through.
>>>> 
>>>>> ---
>>>>> Changes:
>>>>> v21:
>>>>> * s/ld/ldz/ in entry.S
>>>> 
>>>> Typo? Has this become lwz?
>>> 
>>> Yup, lwz.
>>> 
>>>> 
>>>>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>>>>> Kconfig
>>>>> * made CONFIG_VOF optional
>>>> 
>>>> This won't work for pegasos2, see below.
>>>> 
>>>>> * s/l.lds/vof.lds/
>>>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>>>> * added checks for non-null methods of VofMachineIfClass
>>>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better 
>>>>> comment
>>>>> * added  path_offset wrapper for handling mixed case for addresses
>>>>> after "@" in node names
>>>>> * changed getprop() to check for actual "name" property in the fdt
>>>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike 
>>>>> similar
>>>>> rtas_ld/ldl_be_*) they return error codes
>>>>> * VOF_MEM_READ uses now address_space_read (it was 
>>>>> address_space_read_full
>>>>> before, not sure why)
>>>> [...]
>>>>> ---
>>>>> configure               |    9 +
>>>>> pc-bios/vof/Makefile    |   23 +
>>>>> include/hw/ppc/spapr.h  |   25 +-
>>>>> include/hw/ppc/vof.h    |   55 ++
>>>>> pc-bios/vof/vof.h       |   43 ++
>>>>> hw/ppc/spapr.c          |   87 +++-
>>>>> hw/ppc/spapr_hcall.c    |   29 +-
>>>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>>>> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
>>>>> pc-bios/vof/bootmem.c   |   14 +
>>>>> pc-bios/vof/ci.c        |   91 ++++
>>>>> pc-bios/vof/libc.c      |   92 ++++
>>>>> pc-bios/vof/main.c      |   21 +
>>>>> tests/qtest/rtas-test.c |   17 +-
>>>>> MAINTAINERS             |   12 +
>>>>> hw/ppc/Kconfig          |    3 +
>>>>> hw/ppc/meson.build      |    3 +
>>>>> hw/ppc/trace-events     |   24 +
>>>>> meson.build             |    1 +
>>>>> pc-bios/README          |    2 +
>>>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>>>> pc-bios/vof/entry.S     |   49 ++
>>>>> pc-bios/vof/vof.lds     |   48 ++
>>>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>>>> create mode 100644 pc-bios/vof/Makefile
>>>>> create mode 100644 include/hw/ppc/vof.h
>>>>> create mode 100644 pc-bios/vof/vof.h
>>>>> create mode 100644 hw/ppc/spapr_vof.c
>>>>> create mode 100644 hw/ppc/vof.c
>>>>> create mode 100644 pc-bios/vof/bootmem.c
>>>>> create mode 100644 pc-bios/vof/ci.c
>>>>> create mode 100644 pc-bios/vof/libc.c
>>>>> create mode 100644 pc-bios/vof/main.c
>>>>> create mode 100644 pc-bios/vof-nvram.bin
>>>>> create mode 100755 pc-bios/vof.bin
>>>>> create mode 100644 pc-bios/vof/entry.S
>>>>> create mode 100644 pc-bios/vof/vof.lds
>>>>> 
[...]
>>>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>>>> new file mode 100644
>>>>> index 000000000000..65ca2fed0d41
>>>>> --- /dev/null
>>>>> +++ b/include/hw/ppc/vof.h
>>>>> @@ -0,0 +1,55 @@
>>>>> +/*
>>>>> + * Virtual Open Firmware
>>>>> + *
>>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>>> + */
>>>>> +#ifndef HW_VOF_H
>>>>> +#define HW_VOF_H
>>>>> +
>>>>> +typedef struct Vof {
>>>>> +    uint64_t top_addr; /* copied from rma_size */
>>>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>>>> +    uint64_t claimed_base;
>>>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>>>> +    uint32_t of_instance_last;
>>>>> +    char *bootargs;
>>>>> +    long fw_size;
>>>>> +} Vof;
>>>>> +
>>>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>>>> +                    target_ulong args_real);
>>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>>>>> align);
>>>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>>>> +void vof_cleanup(Vof *vof);
>>>>> +void vof_build_dt(void *fdt, Vof *vof);
>>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>>>>> *nodename,
>>>>> +                               const char *prop, const char *path);
>>>>> +
>>>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>>>> +
>>>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>>>> TYPE_VOF_MACHINE_IF)
>>>>> +
>>>>> +struct VofMachineIfClass {
>>>>> +    InterfaceClass parent;
>>>>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>>>>> CPUState *cs,
>>>>> +                                                target_ulong vec);
>>>>> +    void (*quiesce)(MachineState *ms);
>>>>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>>>>> *propname,
>>>>> +                    void *val, int vallen);
>>>>> +};
>>>>> +
>>>>> +/*
>>>>> + * Initial stack size is from
>>>>> + * 
>>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>>> + */
>>>>> +#define VOF_STACK_SIZE       0x8000
>>>> 
>>>> Maybe also add a define for RTAS_SIZE here? We'll need to put that in the 
>>>> device tree but it depends on the rtas shim size that's part of VOF so it 
>>>> should be defined here instead of hardcoding it in boards that use VOF so 
>>>> it can be updated later at one place if needed.
>>> 
>>> This is rtas-size for pseries:
>>> 
>>> _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
>>>          ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t)));
>>> 
>>> => depends on cpus => depends on the command line.
>>> 
>>> 
>>> RTAS_SIZE is not used by anything in pseries anymore, I'll send a patch to 
>>> ditch it.
>> 
>> I mean you need to have at least the size of code in pc-bios/vof/entry.S 
>> hv_rtas where also hv_rtas_size is defined but that value is not available 
>> in QEMU where one needs to add it to the device tree. So a define for that 
>> should be here in vof.h. Currently I've counted instructions and have
>>
>>      qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20);
>> 
>> in pegasos2.c but that 20 should be some VOF_RTAS_SIZE instead that you 
>> define corresponding to hv_rtas_size. You'll probably need the same even 
>> after changing above rtas size calculation in spapr because client has to 
>> allocate memory for instantiate-rtas.
>
>
> Ah fair point. I do not like "20" here and I think the right thing will be 
> adding whatever number of bytes to rtas-size in the firmware itself and
> updating it in QEMU via "setprop" as we do for "linux,rtas-base". And then do
> the same in SLOF.

This is not the base address but the size of the shim with the hypercall 
that instantiate-rtas copies. Why does it need to be updated? And why does 
it need to be more bytes than necessary? I don't know what you do for
spapr or why you need a larger rtas-size than this, but for pegasos2 this
/rtas/rtas-size property is only used by guests to allocate memory for 
rtas so all I need is how many bytes are needed for hv_rtas in 
pc-bios/vof/entry.S which is what should be #defined in vof.h. I've found 
20 is just enough so you could add that to vof.h.

Regards,
BALATON Zoltan
BALATON Zoltan June 17, 2021, 9:21 a.m. UTC | #10
On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
> On 16/06/2021 20:26, BALATON Zoltan wrote:
>> On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
>>> On 6/16/21 07:09, BALATON Zoltan wrote:
>>>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>>>> The PAPR platform describes an OS environment that's presented by
>>>>> a combination of a hypervisor and firmware. The features it specifies
>>>>> require collaboration between the firmware and the hypervisor.
>>>>> 
>>>>> Since the beginning, the runtime component of the firmware (RTAS) has
>>>>> been implemented as a 20 byte shim which simply forwards it to
>>>>> a hypercall implemented in qemu. The boot time firmware component is
>>>>> SLOF - but a build that's specific to qemu, and has always needed to be
>>>>> updated in sync with it. Even though we've managed to limit the amount
>>>>> of runtime communication we need between qemu and SLOF, there's some,
>>>>> and it has become increasingly awkward to handle as we've implemented
>>>>> new features.
>>>>> 
>>>>> This implements a boot time OF client interface (CI) which is
>>>>> enabled by a new "x-vof" pseries machine option (stands for "Virtual 
>>>>> Open
>>>>> Firmware"). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>>>>> which implements Open Firmware Client Interface (OF CI). This allows
>>>>> using a smaller stateless firmware which does not have to manage
>>>>> the device tree.
>>>>> 
>>>>> The new "vof.bin" firmware image is included with source code under
>>>>> pc-bios/. It also includes RTAS blob.
>>>>> 
>>>>> This implements a handful of CI methods just to get -kernel/-initrd
>>>>> working. In particular, this implements the device tree fetching and
>>>>> simple memory allocator - "claim" (an OF CI memory allocator) and 
>>>>> updates
>>>>> "/memory@0/available" to report the client about available memory.
>>>>> 
>>>>> This implements changing some device tree properties which we know how
>>>>> to deal with, the rest is ignored. To allow changes, this skips
>>>>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>>>>> appending.
>>>>> 
>>>>> In absence of SLOF, this assigns phandles to device tree nodes to make
>>>>> device tree traversing work.
>>>>> 
>>>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>>>> 
>>>>> This adds basic instances support which are managed by a hash map
>>>>> ihandle -> [phandle].
>>>>> 
>>>>> Before the guest started, the used memory is:
>>>>> 0..e60 - the initial firmware
>>>>> 8000..10000 - stack
>>>>> 400000.. - kernel
>>>>> 3ea0000.. - initramdisk
>>>>> 
>>>>> This OF CI does not implement "interpret".
>>>>> 
>>>>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>>>>> includes a disk image with pre-formatted nvram.
>>>>> 
>>>>> With this basic support, this can only boot into kernel directly.
>>>>> However this is just enough for the petitboot kernel and initramdisk to
>>>>> boot from any possible source. Note this requires reasonably recent 
>>>>> guest
>>>>> kernel with:
>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>>>> The immediate benefit is much faster booting time, which is especially
>>>>> crucial with fully emulated early CPU bring-up environments. Also this
>>>>> may come in handy when/if GRUB-in-the-userspace sees the light of day.
>>>>> 
>>>>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>>>>> other POWERPC boards which do not support pSeries.
>>>>> 
>>>>> This makes VOF optional; it is disabled by default, add --enable-vof
>>>>> to ./configure to enable it.
>>>>> 
>>>>> This assumes potential support for booting from QEMU backends
>>>>> such as blockdev or netdev without devices/drivers used.
>>>>> 
>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>> ---
>>>>> 
>>>>> The example command line is:
>>>>> 
>>>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>>>> -nodefaults \
>>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>>>> -nographic \
>>>>> -vga none \
>>>>> -enable-kvm \
>>>>> -m 8G \
>>>>> -machine 
>>>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>>>> \
>>>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>>>> -initrd pb/rootfs.cpio.xz \
>>>>> -drive 
>>>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>>>> \
>>>>> -global spapr-nvram.drive=DRIVE0 \
>>>>> -snapshot \
>>>>> -smp 8,threads=8 \
>>>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>>>> -trace events=qemu_trace_events \
>>>>> -d guest_errors \
>>>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>>>> -mon chardev=SOCKET0,mode=control
>>>>> 
>>>>> ---
>>>>> Changes:
>>>>> v21:
>>>>> * s/ld/lwz/ in entry.S
>>>>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>>>>> Kconfig
>>>>> * made CONFIG_VOF optional
>>>>> * s/l.lds/vof.lds/
>>>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>>>> * added checks for non-null methods of VofMachineIfClass
>>>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better 
>>>>> comment
>>>>> * added  path_offset wrapper for handling mixed case for addresses
>>>>> after "@" in node names
>>>>> * changed getprop() to check for actual "name" property in the fdt
>>>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike 
>>>>> similar
>>>>> rtas_ld/ldl_be_*) they return error codes
>>>>> * VOF_MEM_READ uses now address_space_read (it was 
>>>>> address_space_read_full
>>>>> before, not sure why)
>>>>> 
>>>>> v20:
>>>>> * compile vof.bin with -mcpu=power4 for better compatibility
>>>>> * s/std/stw/ in entry.S to make it work on ppc32
>>>>> * fixed dt_available property to support both 32 and 64bit
>>>>> * shuffled prom_args handling code
>>>>> * do not enforce 32bit in MSR (again, to support 32bit platforms)
>>>>> 
>>>>> v19:
>>>>> * put bootargs in the FDT
>>>>> * moved setting properties to a VOF machine hook
>>>>> * moved fw_size and claim for it to vof_init()
>>>>> * added CROSS to the VOF's makefile
>>>>> * simplified phandles assigning
>>>>> * pass MachineState to all machine hooks instead of calling
>>>>> qdev_get_machine (following QOM)
>>>>> * bunch of smaller changes and added comments
>>>>> * added simple test to attempt to start with x-vof=on
>>>>> 
>>>>> v18:
>>>>> * fixed top addr (max address for "claim") on radix - it equals to 
>>>>> ram_size
>>>>> and vof->top_addr was uint32_t
>>>>> * fixed "available" property which got broken in v14 but it is only 
>>>>> visible
>>>>> to clients which care (== grub)
>>>>> * reshuffled vof_dt_memory_available() calls, added vof_init() to allow
>>>>> vof_claim() before rendering the FDT
>>>>> 
>>>>> v17:
>>>>> * mv hw/ppc/vof.h include/hw/ppc/vof.h
>>>>> * VofMachineIfClass -> VofMachineClass; it is not VofMachineInterface as
>>>>> nobody used this scheme, usually "Interface" is dropped, a couple of 
>>>>> times
>>>>> it is "xxxInterfaceClass" or "xxxIfClass", as used the latter as it is
>>>>> used by include/hw/vmstate-if.h
>>>>> * added SPDX
>>>>> * other fixes from v16 review
>>>>> 
>>>>> v16:
>>>>> * rebased on dwg/ppc-for-6.1
>>>>> * s/SpaprVofInterface/VofMachineInterface/
>>>>> 
>>>>> v15:
>>>>> * bugfix: claimed memory for the VOF itself
>>>>> * ditched OF_STACK_ADDR and allocate one instead, now it starts from 
>>>>> 0x8000
>>>>> because it is aligned to its size (no particular reason though)
>>>>> * coding style
>>>>> * moved nvram.bin up one level
>>>>> * ditched bool in the firmware
>>>>> * made debugging code conditional using trace_event_get_state() + 
>>>>> qemu_loglevel_mask()
>>>>> * renamed the CAS interface to SpaprVofInterface
>>>>> * added "write" which for now dumps the message and ihandle via
>>>>> trace point for early debug assistance
>>>>> * commented on when we allocate of_instances in vof_build_dt()
>>>>> * store fw_size in SpaprMachine to let spapr_vof_reset() claim it
>>>>> * many small fixes from v14's review
>>>>> 
>>>>> v14:
>>>>> * check for truncates in readstr()
>>>>> * ditched a separate vof_reset()
>>>>> * spapr->vof is a pointer now, dropped the "on" field
>>>>> * removed rtas_base from vof and updated comment why we allow setting it
>>>>> * added myself to maintainers
>>>>> * updated commit log about blockdev and other possible platforms
>>>>> * added a note why new hcall is 0x5
>>>>> * no in place endianness conversion in spapr_h_vof_client
>>>>> * converted all cpu_physical_memory_read/write to address_space_rw
>>>>> * git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c
>>>>> 
>>>>> v13:
>>>>> * rebase on latest ppc-for-6.0
>>>>> * shuffled code around to touch spapr.c less
>>>>> 
>>>>> v12:
>>>>> * split VOF and SPAPR
>>>>> 
>>>>> v11:
>>>>> * added g_autofree
>>>>> * fixed gcc warnings
>>>>> * fixed few leaks
>>>>> * added nvram image to make "nvram --print-config" not crash;
>>>>> Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum size
>>>>> is 16K, or it just does not work (empty output from "nvram")
>>>>> 
>>>>> v10:
>>>>> * now rebased to compile with meson
>>>>> 
>>>>> v9:
>>>>> * remove special handling of /rtas/rtas-size as now we always add it in 
>>>>> QEMU
>>>>> * removed leftovers from scsi/grub/stdout/stdin/...
>>>>> 
>>>>> v8:
>>>>> * no read/write/seek
>>>>> * no @dev in instances
>>>>> * the machine flag is "x-vof" for now
>>>>> 
>>>>> v7:
>>>>> * now we have a small firmware which loads at 0 as SLOF and starts from
>>>>> 0x100 as SLOF
>>>>> * no MBR/ELF/GRUB business in QEMU anymore
>>>>> * blockdev is a separate patch
>>>>> * networking is a separate patch
>>>>> 
>>>>> v6:
>>>>> * borrowed a big chunk of commit log introduction from David
>>>>> * fixed initial stack pointer (points to the highest address of stack)
>>>>> * traces for "interpret" and others
>>>>> * disabled  translate_kernel_address() hack so grub can load (work in
>>>>> progress)
>>>>> * added "milliseconds" for grub
>>>>> * fixed "claim" allocator again
>>>>> * moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too for 
>>>>> CAS
>>>>> * moved the most code possible from spapr.c to spapr_of_client.c, such 
>>>>> as
>>>>> RTAS, prom entry and FDT build/finalize
>>>>> * separated blobs
>>>>> * GRUB now proceeds to its console prompt (there are still other issues)
>>>>> * parse MBR/GPT to find PReP and load GRUB
>>>>> 
>>>>> v5:
>>>>> * made instances keep device and chardev pointers
>>>>> * removed VIO dependencies
>>>>> * print error if RTAS memory is not claimed as it should have been
>>>>> * pack FDT as "quiesce"
>>>>> 
>>>>> v4:
>>>>> * fixed open
>>>>> * validate ihandles in "call-method"
>>>>> 
>>>>> v3:
>>>>> * fixed phandles allocation
>>>>> * s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
>>>>> * fixed size of /chosen/stdout
>>>>> * bunch of renames
>>>>> * do not create rtas properties at all, let the client deal with it;
>>>>> instead setprop allows changing these in the FDT
>>>>> * no more packing FDT when bios=off - nobody needs it and getprop does 
>>>>> not
>>>>> work otherwise
>>>>> * allow updating initramdisk device tree properties (for zImage)
>>>>> * added instances
>>>>> * fixed stdout on OF's "write"
>>>>> * removed special handling for stdout in OF client, spapr-vty handles it
>>>>> instead
>>>>> 
>>>>> v2:
>>>>> * fixed claim()
>>>>> * added "setprop"
>>>>> * cleaner client interface and RTAS blobs management
>>>>> * boots to petitboot and further to the target system
>>>>> * more trace points
>>>>> 
>>>>> v20
>>>>> 
>>>>> v20!
>>>>> ---
>>>>> configure               |    9 +
>>>>> pc-bios/vof/Makefile    |   23 +
>>>>> include/hw/ppc/spapr.h  |   25 +-
>>>>> include/hw/ppc/vof.h    |   55 ++
>>>>> pc-bios/vof/vof.h       |   43 ++
>>>>> hw/ppc/spapr.c          |   87 +++-
>>>>> hw/ppc/spapr_hcall.c    |   29 +-
>>>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>>>> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
>>>>> pc-bios/vof/bootmem.c   |   14 +
>>>>> pc-bios/vof/ci.c        |   91 ++++
>>>>> pc-bios/vof/libc.c      |   92 ++++
>>>>> pc-bios/vof/main.c      |   21 +
>>>>> tests/qtest/rtas-test.c |   17 +-
>>>>> MAINTAINERS             |   12 +
>>>>> hw/ppc/Kconfig          |    3 +
>>>>> hw/ppc/meson.build      |    3 +
>>>>> hw/ppc/trace-events     |   24 +
>>>>> meson.build             |    1 +
>>>>> pc-bios/README          |    2 +
>>>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>>>> pc-bios/vof/entry.S     |   49 ++
>>>>> pc-bios/vof/vof.lds     |   48 ++
>>>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>>>> create mode 100644 pc-bios/vof/Makefile
>>>>> create mode 100644 include/hw/ppc/vof.h
>>>>> create mode 100644 pc-bios/vof/vof.h
>>>>> create mode 100644 hw/ppc/spapr_vof.c
>>>>> create mode 100644 hw/ppc/vof.c
>>>>> create mode 100644 pc-bios/vof/bootmem.c
>>>>> create mode 100644 pc-bios/vof/ci.c
>>>>> create mode 100644 pc-bios/vof/libc.c
>>>>> create mode 100644 pc-bios/vof/main.c
>>>>> create mode 100644 pc-bios/vof-nvram.bin
>>>>> create mode 100755 pc-bios/vof.bin
>>>>> create mode 100644 pc-bios/vof/entry.S
>>>>> create mode 100644 pc-bios/vof/vof.lds
>>>>> 
>>>>> diff --git a/configure b/configure
>>>>> index 8dcb9965b24e..00dc29c027fa 100755
>>>>> --- a/configure
>>>>> +++ b/configure
>>>>> @@ -445,6 +445,7 @@ fuse="auto"
>>>>> fuse_lseek="auto"
>>>>> multiprocess="auto"
>>>>> slirp_smbd="$default_feature"
>>>>> +vof="no"
>>>>> 
>>>>> malloc_trim="auto"
>>>>> gio="$default_feature"
>>>>> @@ -1561,6 +1562,10 @@ for opt do
>>>>>   ;;
>>>>>   --disable-slirp-smbd) slirp_smbd=no
>>>>>   ;;
>>>>> +  --enable-vof) vof=yes
>>>>> +  ;;
>>>>> +  --disable-vof) vof=no
>>>>> +  ;;
>>>>>   *)
>>>>>       echo "ERROR: unknown option $opt"
>>>>>       echo "Try '$0 --help' for more information"
>>>>> @@ -1940,6 +1945,7 @@ disabled with --disable-FEATURE, default is 
>>>>> enabled if available
>>>>>   multiprocess    Out of process device emulation support
>>>>>   gio             libgio support
>>>>>   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
>>>>> +  vof             Virtual Open Firmware support (powerpc/pseries, 
>>>>> experimental)
>>>>> 
>>>>> NOTE: The object files are built at the place where configure is 
>>>>> launched
>>>>> EOF
>>>>> @@ -5555,6 +5561,9 @@ if test "$slirp_smbd" = "yes" ; then
>>>>>   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
>>>>>   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
>>>>> fi
>>>>> +if test "$vof" = "yes" ; then
>>>>> +  echo "CONFIG_VOF=y" >> $config_host_mak
>>>>> +fi
>>>>> if test "$vde" = "yes" ; then
>>>>>   echo "CONFIG_VDE=y" >> $config_host_mak
>>>>>   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
>>>> 
>>>> In case I could not explain it clearly in my previous message I think the 
>>>> solution we want here is to drop these configure changes and let Kconfig 
>>>> configure this. The CONFIG_VOF option decides if vof itself is built 
>>>> (adds vof.c) and pegasos2 will select this so it will usually be yes by 
>>>> default. Your problem is that you're trying to use this variable in spapr 
>>>> to make it off by default but that does not work. You need to add another 
>>>> option for that (e.g. CONFIG_VOF_SPAPR or CONFIG_SPAPR_VOF whichever 
>>>> makes more sense) then you can set that to no despite CONFIG_VOF is yes 
>>>> and use that variable in spapr files and to add spapr_vof.c. Then no 
>>>> configure option is needed which does not even work for me: I get compile 
>>>> errors saying 'poisoning existing macro "CONFIG_VOF"' if I try with 
>>>> --enable-vof or spapr fails to build if I try without --enable-vof but 
>>>> select CONFIG_VOF from pegasos2. I hope this makes sense now.
>>> 
>>> 
>>> My problem is that I do not understand when we want VOF to be compiled in 
>>> by default and when we do not. For a moment I thought we do not want it in 
>>> by default but now it sounds like we do. If that is so, then CONFIG_VOF + 
>>> selecting it from PSERIES and PEGASOS should do. Or I am missing the point 
>>> again?
>> 
>> I don't know what spapr wants, for pegasos2 VOF will be the default 
>> firmware and I want pegasos2 to be enabled by default (like other boards) 
>> so this means CONFIG_VOF will also be enabled by default via select VOF in 
>> CONFIG_PEGASOS2. So if the x-vof option in spapr is not enough and you want 
>> to be able to configure it off for spapr then you need another 
>> CONFIG_something option for that and cannot rely on CONFIG_VOF for it 
>> because CONFIG_VOF is on if any board that uses VOF is compiled. If you're 
>> OK with compiling it in but disabled by x-vof by default then no need to 
>> have another option. In both cases you'll have to select VOF somewhere for 
>> your board, either in CONFIG_PSERIES or in the new option that decides if 
>> VOF is built for spapr. At least that's how I understand Kconfig.
>> 
>>>>> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
>>>>> new file mode 100644
>>>>> index 000000000000..aa1678c4d889
>>>>> --- /dev/null
>>>>> +++ b/pc-bios/vof/Makefile
>>>>> @@ -0,0 +1,23 @@
>>>>> +all: build-all
>>>>> +
>>>>> +build-all: vof.bin
>>>>> +
>>>>> +CROSS ?=
>>>>> +CC = $(CROSS)gcc
>>>>> +LD = $(CROSS)ld
>>>>> +OBJCOPY = $(CROSS)objcopy
>>>>> +
>>>>> +%.o: %.S
>>>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
>>>>> +
>>>>> +%.o: %.c
>>>>> +    $(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o $@ 
>>>>> $<
>>>>> +
>>>>> +vof.elf: entry.o main.o ci.o bootmem.o libc.o
>>>>> +    $(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
>>>>> +
>>>>> +%.bin: %.elf
>>>>> +    $(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
>>>>> +
>>>>> +clean:
>>>>> +    rm -f *.o vof.bin vof.elf *~
>>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>>> index f05219f75ef6..39b5581ae650 100644
>>>>> --- a/include/hw/ppc/spapr.h
>>>>> +++ b/include/hw/ppc/spapr.h
>>>>> @@ -12,6 +12,9 @@
>>>>> #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>>>>> #include "hw/ppc/xics.h"        /* For ICSState */
>>>>> #include "hw/ppc/spapr_tpm_proxy.h"
>>>>> +#ifdef CONFIG_VOF
>>>>> +#include "hw/ppc/vof.h"
>>>>> +#endif
>>>>> 
>>>>> struct SpaprVioBus;
>>>>> struct SpaprPhbState;
>>>>> @@ -180,6 +183,9 @@ struct SpaprMachineState {
>>>>>     uint64_t kernel_addr;
>>>>>     uint32_t initrd_base;
>>>>>     long initrd_size;
>>>>> +#ifdef CONFIG_VOF
>>>> 
>>>> So this can't be CONFIG_VOF here if you want to be able to set it to no 
>>>> despite pegasos2 pulling in VOF so you need another SPAPR specific 
>>> 
>>> If VOF is compiled in, why would I want it to be still disabled for 
>>> PSERIES? The code is in, let it work.
>> 
>> That's something to decide for spapr maintainers, I just want to be able to 
>> use CONFIG_VOF from CONFIG_PEGASOS2 and be it on by default.
>> 
>>>> option for that in spapr specific parts with CONFIG_VOF selecting if vof 
>>>> itself is built if any board uses it. So CONFIG_PEGASOS2 has to select 
>>>> CONFIG_VOF and your SPAPR_VOF option should too if it's enabled that way 
>>>> vof.c will be added if either board is built but for SPAPR only if its 
>>>> VOF option is on.
>>>> 
>>>>> +    Vof *vof;
>>>>> +#endif
>>>>>     uint64_t rtc_offset; /* Now used only during incoming migration */
>>>>>     struct PPCTimebase tb;
>>>>>     bool has_graphics;
>>>>> @@ -558,7 +564,9 @@ struct SpaprMachineState {
>>>>> /* Client Architecture support */
>>>>> #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>>>>> #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>>>>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>>>>> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
>>>>> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
>>>>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>>>>> 
>>>>> /*
>>>>>  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
>>>>> @@ -956,4 +964,19 @@ bool spapr_check_pagesize(SpaprMachineState *spapr, 
>>>>> hwaddr pagesize,
>>>>> void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
>>>>> hwaddr spapr_get_rtas_addr(void);
>>>>> bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
>>>>> +
>>>>> +#ifdef CONFIG_VOF
>>>>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>>>>> +                     target_ulong *stack_ptr, Error **errp);
>>>>> +void spapr_vof_quiesce(MachineState *ms);
>>>>> +bool spapr_vof_setprop(MachineState *ms, const char *path, const char 
>>>>> *propname,
>>>>> +                       void *val, int vallen);
>>>>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState 
>>>>> *spapr,
>>>>> +                                target_ulong opcode, target_ulong 
>>>>> *args);
>>>>> +target_ulong spapr_vof_client_architecture_support(MachineState *ms,
>>>>> +                                                   CPUState *cs,
>>>>> +                                                   target_ulong 
>>>>> ovec_addr);
>>>>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
>>>>> +#endif
>>>>> +
>>>>> #endif /* HW_SPAPR_H */
>>>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>>>> new file mode 100644
>>>>> index 000000000000..65ca2fed0d41
>>>>> --- /dev/null
>>>>> +++ b/include/hw/ppc/vof.h
>>>>> @@ -0,0 +1,55 @@
>>>>> +/*
>>>>> + * Virtual Open Firmware
>>>>> + *
>>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>>> + */
>>>>> +#ifndef HW_VOF_H
>>>>> +#define HW_VOF_H
>>>>> +
>>>>> +typedef struct Vof {
>>>>> +    uint64_t top_addr; /* copied from rma_size */
>>>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>>>> +    uint64_t claimed_base;
>>>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>>>> +    uint32_t of_instance_last;
>>>>> +    char *bootargs;
>>>>> +    long fw_size;
>>>>> +} Vof;
>>>>> +
>>>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>>>> +                    target_ulong args_real);
>>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>>>>> align);
>>>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>>>> +void vof_cleanup(Vof *vof);
>>>>> +void vof_build_dt(void *fdt, Vof *vof);
>>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>>>>> *nodename,
>>>>> +                               const char *prop, const char *path);
>>>>> +
>>>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>>>> +
>>>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>>>> TYPE_VOF_MACHINE_IF)
>>>>> +
>>>>> +struct VofMachineIfClass {
>>>>> +    InterfaceClass parent;
>>>>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>>>>> CPUState *cs,
>>>>> +                                                target_ulong vec);
>>>>> +    void (*quiesce)(MachineState *ms);
>>>>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>>>>> *propname,
>>>>> +                    void *val, int vallen);
>>>>> +};
>>>>> +
>>>>> +/*
>>>>> + * Initial stack size is from
>>>>> + * 
>>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>> 
>>>> 
>>>> I wonder if it's better to quote the section number and the title of the 
>>>> doc in case the URL here goes away in the future.
>>> 
>>> 
>>> The binding (the URL clearly suggests it is a "binding") says 32K is the 
>>> minimum, what else is here to quote? The doc does not explain why anyway.
>> 
>> I thought maybe saying "section x.x of OpenFirmware PPC binding says 
>> minimum stack size is 32K" which can also be understood if the link points 
>> to nowhere in the future. You can still add a link if you want but just 
>> have the relevant info in the comment so one does not need to read the 
>> whole doc to find it.
>
>
> I did not read the whole doc - I opened the link, "ctrl-f", "stack" - and 
> there was a local link to "8.2.2. Initial Stack" right there :)
>
> I'll do what you suggested but I really see no value.

The value simply is that in the future if www.devicetree.org is gone or 
the document is moved without redirect and the URL just gives you 404 or 
server not found then you can still know what the comment means if it says 
that according to OF PPC binding, Section 8.2.2. Initial Stack this should 
be the correct size. This is clear even when the URL is not available but 
with just the URL like above it's hard to verify in that case.

Regards,
BALATON Zoltan
Alexey Kardashevskiy June 17, 2021, 10:28 a.m. UTC | #11
On 17/06/2021 19:16, BALATON Zoltan wrote:
> On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
>> On 16/06/2021 20:34, BALATON Zoltan wrote:
>>> On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
>>>> On 6/15/21 20:29, BALATON Zoltan wrote:
>>>>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>>>>> The PAPR platform describes an OS environment that's presented by
>>>>>> a combination of a hypervisor and firmware. The features it specifies
>>>>>> require collaboration between the firmware and the hypervisor.
>>>>>>
>>>>>> Since the beginning, the runtime component of the firmware (RTAS) has
>>>>>> been implemented as a 20 byte shim which simply forwards it to
>>>>>> a hypercall implemented in qemu. The boot time firmware component is
>>>>>> SLOF - but a build that's specific to qemu, and has always needed 
>>>>>> to be
>>>>>> updated in sync with it. Even though we've managed to limit the 
>>>>>> amount
>>>>>> of runtime communication we need between qemu and SLOF, there's some,
>>>>>> and it has become increasingly awkward to handle as we've implemented
>>>>>> new features.
>>>>>>
>>>>>> This implements a boot time OF client interface (CI) which is
>>>>>> enabled by a new "x-vof" pseries machine option (stands for 
>>>>>> "Virtual Open
>>>>>> Firmware"). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>>>>>> which implements Open Firmware Client Interface (OF CI). This allows
>>>>>> using a smaller stateless firmware which does not have to manage
>>>>>> the device tree.
>>>>>>
>>>>>> The new "vof.bin" firmware image is included with source code under
>>>>>> pc-bios/. It also includes RTAS blob.
>>>>>>
>>>>>> This implements a handful of CI methods just to get -kernel/-initrd
>>>>>> working. In particular, this implements the device tree fetching and
>>>>>> simple memory allocator - "claim" (an OF CI memory allocator) and 
>>>>>> updates
>>>>>> "/memory@0/available" to report the client about available memory.
>>>>>>
>>>>>> This implements changing some device tree properties which we know 
>>>>>> how
>>>>>> to deal with, the rest is ignored. To allow changes, this skips
>>>>>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>>>>>> appending.
>>>>>>
>>>>>> In absence of SLOF, this assigns phandles to device tree nodes to 
>>>>>> make
>>>>>> device tree traversing work.
>>>>>>
>>>>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>>>>>
>>>>>> This adds basic instances support which are managed by a hash map
>>>>>> ihandle -> [phandle].
>>>>>>
>>>>>> Before the guest started, the used memory is:
>>>>>> 0..e60 - the initial firmware
>>>>>> 8000..10000 - stack
>>>>>> 400000.. - kernel
>>>>>> 3ea0000.. - initramdisk
>>>>>>
>>>>>> This OF CI does not implement "interpret".
>>>>>>
>>>>>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>>>>>> includes a disk image with pre-formatted nvram.
>>>>>>
>>>>>> With this basic support, this can only boot into kernel directly.
>>>>>> However this is just enough for the petitboot kernel and 
>>>>>> initramdisk to
>>>>>> boot from any possible source. Note this requires reasonably 
>>>>>> recent guest
>>>>>> kernel with:
>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>>>>> The immediate benefit is much faster booting time which is especially
>>>>>> crucial with fully emulated early CPU bring up environments. Also 
>>>>>> this
>>>>>> may come handy when/if GRUB-in-the-userspace sees light of the day.
>>>>>>
>>>>>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>>>>>> other POWERPC boards which do not support pSeries.
>>>>>>
>>>>>> This makes VOF optional, it is disabled by default, add --enable-vof
>>>>>> to ./configure to enable it.
>>>>>>
>>>>>> This assumes potential support for booting from QEMU backends
>>>>>> such as blockdev or netdev without devices/drivers used.
>>>>>>
>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>> ---
>>>>>>
>>>>>> The example command line is:
>>>>>>
>>>>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>>>>> -nodefaults \
>>>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>>>>> -nographic \
>>>>>> -vga none \
>>>>>> -enable-kvm \
>>>>>> -m 8G \
>>>>>> -machine 
>>>>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>>>>> \
>>>>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>>>>> -initrd pb/rootfs.cpio.xz \
>>>>>> -drive 
>>>>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>>>>> \
>>>>>> -global spapr-nvram.drive=DRIVE0 \
>>>>>> -snapshot \
>>>>>> -smp 8,threads=8 \
>>>>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>>>>> -trace events=qemu_trace_events \
>>>>>> -d guest_errors \
>>>>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>>>>> -mon chardev=SOCKET0,mode=control
>>>>>
>>>>> I haven't looked at it in detail yet, just some quick comments I 
>>>>> have on first skim through.
>>>>>
>>>>>> ---
>>>>>> Changes:
>>>>>> v21:
>>>>>> * s/ld/ldz/ in entry.S
>>>>>
>>>>> Typo? Has this become lwz?
>>>>
>>>> Yup, lwz.
>>>>
>>>>>
>>>>>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak 
>>>>>> to Kconfig
>>>>>> * made CONFIG_VOF optional
>>>>>
>>>>> This won't work for pegasos2, see below.
>>>>>
>>>>>> * s/l.lds/vof.lds/
>>>>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>>>>> * added checks for non-null methods of VofMachineIfClass
>>>>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better 
>>>>>> comment
>>>>>> * added  path_offset wrapper for handling mixed case for addresses
>>>>>> after "@" in node names
>>>>>> * changed getprop() to check for actual "name" property in the fdt
>>>>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike 
>>>>>> similar
>>>>>> rtas_ld/ldl_be_*) they return error codes
>>>>>> * VOF_MEM_READ uses now address_space_read (it was 
>>>>>> address_space_read_full
>>>>>> before, not sure why)
>>>>> [...]
>>>>>> ---
>>>>>> configure               |    9 +
>>>>>> pc-bios/vof/Makefile    |   23 +
>>>>>> include/hw/ppc/spapr.h  |   25 +-
>>>>>> include/hw/ppc/vof.h    |   55 ++
>>>>>> pc-bios/vof/vof.h       |   43 ++
>>>>>> hw/ppc/spapr.c          |   87 +++-
>>>>>> hw/ppc/spapr_hcall.c    |   29 +-
>>>>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>>>>> hw/ppc/vof.c            | 1052 
>>>>>> +++++++++++++++++++++++++++++++++++++++
>>>>>> pc-bios/vof/bootmem.c   |   14 +
>>>>>> pc-bios/vof/ci.c        |   91 ++++
>>>>>> pc-bios/vof/libc.c      |   92 ++++
>>>>>> pc-bios/vof/main.c      |   21 +
>>>>>> tests/qtest/rtas-test.c |   17 +-
>>>>>> MAINTAINERS             |   12 +
>>>>>> hw/ppc/Kconfig          |    3 +
>>>>>> hw/ppc/meson.build      |    3 +
>>>>>> hw/ppc/trace-events     |   24 +
>>>>>> meson.build             |    1 +
>>>>>> pc-bios/README          |    2 +
>>>>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>>>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>>>>> pc-bios/vof/entry.S     |   49 ++
>>>>>> pc-bios/vof/vof.lds     |   48 ++
>>>>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>>>>> create mode 100644 pc-bios/vof/Makefile
>>>>>> create mode 100644 include/hw/ppc/vof.h
>>>>>> create mode 100644 pc-bios/vof/vof.h
>>>>>> create mode 100644 hw/ppc/spapr_vof.c
>>>>>> create mode 100644 hw/ppc/vof.c
>>>>>> create mode 100644 pc-bios/vof/bootmem.c
>>>>>> create mode 100644 pc-bios/vof/ci.c
>>>>>> create mode 100644 pc-bios/vof/libc.c
>>>>>> create mode 100644 pc-bios/vof/main.c
>>>>>> create mode 100644 pc-bios/vof-nvram.bin
>>>>>> create mode 100755 pc-bios/vof.bin
>>>>>> create mode 100644 pc-bios/vof/entry.S
>>>>>> create mode 100644 pc-bios/vof/vof.lds
>>>>>>
> [...]
>>>>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>>>>> new file mode 100644
>>>>>> index 000000000000..65ca2fed0d41
>>>>>> --- /dev/null
>>>>>> +++ b/include/hw/ppc/vof.h
>>>>>> @@ -0,0 +1,55 @@
>>>>>> +/*
>>>>>> + * Virtual Open Firmware
>>>>>> + *
>>>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>>>> + */
>>>>>> +#ifndef HW_VOF_H
>>>>>> +#define HW_VOF_H
>>>>>> +
>>>>>> +typedef struct Vof {
>>>>>> +    uint64_t top_addr; /* copied from rma_size */
>>>>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>>>>> +    uint64_t claimed_base;
>>>>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>>>>> +    uint32_t of_instance_last;
>>>>>> +    char *bootargs;
>>>>>> +    long fw_size;
>>>>>> +} Vof;
>>>>>> +
>>>>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>>>>> +                    target_ulong args_real);
>>>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, 
>>>>>> uint64_t align);
>>>>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>>>>> +void vof_cleanup(Vof *vof);
>>>>>> +void vof_build_dt(void *fdt, Vof *vof);
>>>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>>>>>> *nodename,
>>>>>> +                               const char *prop, const char *path);
>>>>>> +
>>>>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>>>>> +
>>>>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>>>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>>>>> TYPE_VOF_MACHINE_IF)
>>>>>> +
>>>>>> +struct VofMachineIfClass {
>>>>>> +    InterfaceClass parent;
>>>>>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>>>>>> CPUState *cs,
>>>>>> +                                                target_ulong vec);
>>>>>> +    void (*quiesce)(MachineState *ms);
>>>>>> +    bool (*setprop)(MachineState *ms, const char *path, const 
>>>>>> char *propname,
>>>>>> +                    void *val, int vallen);
>>>>>> +};
>>>>>> +
>>>>>> +/*
>>>>>> + * Initial stack size is from
>>>>>> + * 
>>>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>>>> + */
>>>>>> +#define VOF_STACK_SIZE       0x8000
>>>>>
>>>>> Maybe also add a define for RTAS_SIZE here? We'll need to put that 
>>>>> in the device tree but it depends on the rtas shim size that's part 
>>>>> of VOF so it should be defined here instead of hardcoding it in 
>>>>> boards that use VOF so it can be updated later at one place if needed.
>>>>
>>>> This is rtas-size for pseries:
>>>>
>>>> _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
>>>>          ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t)));
>>>>
>>>> => depends on cpus => depends on the command line.
>>>>
>>>>
>>>> RTAS_SIZE is not used by anything in pseries anymore, I'll send a 
>>>> patch to ditch it.
>>>
>>> I mean you need to have at least the size of code in 
>>> pc-bios/vof/entry.S hv_rtas where also hv_rtas_size is defined but 
>>> that value is not available in QEMU where one needs to add it to the 
>>> device tree. So a define for that should be here in vof.h. Currently 
>>> I've counted instructions and have
>>>
>>>      qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20);
>>>
>>> in pegasos2.c but that 20 should be some VOF_RTAS_SIZE instead that 
>>> you define corresponding to hv_rtas_size. You'll probably need the 
>>> same even after changing above rtas size calculation in spapr because 
>>> client has to allocate memory for instantiate-rtas.
>>
>>
>> Ah fair point. I do not like "20" here and I think the right thing 
>> will be adding whatever number of bytes to rtas-size in the firmware 
>> itself and update it in QEMU via "setprop" as we do for 
>> "linux,rtas-base". And then do the same in SLOF.
> 
> This is not the base address but the size of the shim with the hypercall 
> that instantiate-rtas copies. Why does it need to be updated?

The vm kernel allocates the space for it.

> And why 
> does it need to be more bytes than necessary?

What is necessary? It is definitely way more than 20 bytes.

> I don't know what you do 
> for spapr and why do you need larger rtas-size than this but for 
> pegasos2 this /rtas/rtas-size property is only used by guests to 
> allocate memory for rtas so all I need is how many bytes are needed for 
> hv_rtas in pc-bios/vof/entry.S which is what should be #defined in 
> vof.h. I've found 20 is just enough so you could add that to vof.h.

I am thinking now that maybe the property should be created by vof.bin 
and not QEMU, QEMU just has to tell how many bytes on top it needs.
BALATON Zoltan June 17, 2021, 11:29 a.m. UTC | #12
On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
> On 17/06/2021 19:16, BALATON Zoltan wrote:
>> On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
>>> On 16/06/2021 20:34, BALATON Zoltan wrote:
>>>> On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
>>>>> On 6/15/21 20:29, BALATON Zoltan wrote:
>>>>>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>>>>>> The PAPR platform describes an OS environment that's presented by
>>>>>>> a combination of a hypervisor and firmware. The features it specifies
>>>>>>> require collaboration between the firmware and the hypervisor.
>>>>>>> 
>>>>>>> Since the beginning, the runtime component of the firmware (RTAS) has
>>>>>>> been implemented as a 20 byte shim which simply forwards it to
>>>>>>> a hypercall implemented in qemu. The boot time firmware component is
>>>>>>> SLOF - but a build that's specific to qemu, and has always needed to 
>>>>>>> be
>>>>>>> updated in sync with it. Even though we've managed to limit the amount
>>>>>>> of runtime communication we need between qemu and SLOF, there's some,
>>>>>>> and it has become increasingly awkward to handle as we've implemented
>>>>>>> new features.
>>>>>>> 
>>>>>>> This implements a boot time OF client interface (CI) which is
>>>>>>> enabled by a new "x-vof" pseries machine option (stands for "Virtual 
>>>>>>> Open
>>>>>>> Firmware). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>>>>>>> which implements Open Firmware Client Interface (OF CI). This allows
>>>>>>> using a smaller stateless firmware which does not have to manage
>>>>>>> the device tree.
>>>>>>> 
>>>>>>> The new "vof.bin" firmware image is included with source code under
>>>>>>> pc-bios/. It also includes RTAS blob.
>>>>>>> 
>>>>>>> This implements a handful of CI methods just to get -kernel/-initrd
>>>>>>> working. In particular, this implements the device tree fetching and
>>>>>>> simple memory allocator - "claim" (an OF CI memory allocator) and 
>>>>>>> updates
>>>>>>> "/memory@0/available" to report the client about available memory.
>>>>>>> 
>>>>>>> This implements changing some device tree properties which we know how
>>>>>>> to deal with, the rest is ignored. To allow changes, this skips
>>>>>>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>>>>>>> appending.
>>>>>>> 
>>>>>>> In absence of SLOF, this assigns phandles to device tree nodes to make
>>>>>>> device tree traversing work.
>>>>>>> 
>>>>>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>>>>>> 
>>>>>>> This adds basic instances support which are managed by a hash map
>>>>>>> ihandle -> [phandle].
>>>>>>> 
>>>>>>> Before the guest started, the used memory is:
>>>>>>> 0..e60 - the initial firmware
>>>>>>> 8000..10000 - stack
>>>>>>> 400000.. - kernel
>>>>>>> 3ea0000.. - initramdisk
>>>>>>> 
>>>>>>> This OF CI does not implement "interpret".
>>>>>>> 
>>>>>>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>>>>>>> includes a disk image with pre-formatted nvram.
>>>>>>> 
>>>>>>> With this basic support, this can only boot into kernel directly.
>>>>>>> However this is just enough for the petitboot kernel and initradmdisk 
>>>>>>> to
>>>>>>> boot from any possible source. Note this requires reasonably recent 
>>>>>>> guest
>>>>>>> kernel with:
>>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>>>>>> The immediate benefit is much faster booting time which especially
>>>>>>> crucial with fully emulated early CPU bring up environments. Also this
>>>>>>> may come handy when/if GRUB-in-the-userspace sees light of the day.
>>>>>>> 
>>>>>>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>>>>>>> other POWERPC boards which do not support pSeries.
>>>>>>> 
>>>>>>> This make VOF optional, it is disabled by default, add --enable-vof
>>>>>>> to ./configure to enable it.
>>>>>>> 
>>>>>>> This assumes potential support for booting from QEMU backends
>>>>>>> such as blockdev or netdev without devices/drivers used.
>>>>>>> 
>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>> ---
>>>>>>> 
>>>>>>> The example command line is:
>>>>>>> 
>>>>>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>>>>>> -nodefaults \
>>>>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>>>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>>>>>> -nographic \
>>>>>>> -vga none \
>>>>>>> -enable-kvm \
>>>>>>> -m 8G \
>>>>>>> -machine 
>>>>>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>>>>>> \
>>>>>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>>>>>> -initrd pb/rootfs.cpio.xz \
>>>>>>> -drive 
>>>>>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>>>>>> \
>>>>>>> -global spapr-nvram.drive=DRIVE0 \
>>>>>>> -snapshot \
>>>>>>> -smp 8,threads=8 \
>>>>>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>>>>>> -trace events=qemu_trace_events \
>>>>>>> -d guest_errors \
>>>>>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>>>>>> -mon chardev=SOCKET0,mode=control
>>>>>> 
>>>>>> I haven't looked at it in detail yet, just some quick comments I have 
>>>>>> on first skim through.
>>>>>> 
>>>>>>> ---
>>>>>>> Changes:
>>>>>>> v21:
>>>>>>> * s/ld/ldz/ in entry.S
>>>>>> 
>>>>>> Typo? Has this become lwz?
>>>>> 
>>>>> Yup, lwz.
>>>>> 
>>>>>> 
>>>>>>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>>>>>>> Kconfig
>>>>>>> * made CONFIG_VOF optional
>>>>>> 
>>>>>> This won't work for pegasos2, see below.
>>>>>> 
>>>>>>> * s/l.lds/vof.lds/
>>>>>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>>>>>> * added checks for non-null methods of VofMachineIfClass
>>>>>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better 
>>>>>>> comment
>>>>>>> * added  path_offset wrapper for handling mixed case for addresses
>>>>>>> after "@" in node names
>>>>>>> * changed getprop() to check for actual "name" property in the fdt
>>>>>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike 
>>>>>>> similar
>>>>>>> rtas_ld/ldl_be_*) they return error codes
>>>>>>> * VOF_MEM_READ uses now address_space_read (it was 
>>>>>>> address_space_read_full
>>>>>>> before, not sure why)
>>>>>> [...]
>>>>>>> ---
>>>>>>> configure               |    9 +
>>>>>>> pc-bios/vof/Makefile    |   23 +
>>>>>>> include/hw/ppc/spapr.h  |   25 +-
>>>>>>> include/hw/ppc/vof.h    |   55 ++
>>>>>>> pc-bios/vof/vof.h       |   43 ++
>>>>>>> hw/ppc/spapr.c          |   87 +++-
>>>>>>> hw/ppc/spapr_hcall.c    |   29 +-
>>>>>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>>>>>> hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
>>>>>>> pc-bios/vof/bootmem.c   |   14 +
>>>>>>> pc-bios/vof/ci.c        |   91 ++++
>>>>>>> pc-bios/vof/libc.c      |   92 ++++
>>>>>>> pc-bios/vof/main.c      |   21 +
>>>>>>> tests/qtest/rtas-test.c |   17 +-
>>>>>>> MAINTAINERS             |   12 +
>>>>>>> hw/ppc/Kconfig          |    3 +
>>>>>>> hw/ppc/meson.build      |    3 +
>>>>>>> hw/ppc/trace-events     |   24 +
>>>>>>> meson.build             |    1 +
>>>>>>> pc-bios/README          |    2 +
>>>>>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>>>>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>>>>>> pc-bios/vof/entry.S     |   49 ++
>>>>>>> pc-bios/vof/vof.lds     |   48 ++
>>>>>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>>>>>> create mode 100644 pc-bios/vof/Makefile
>>>>>>> create mode 100644 include/hw/ppc/vof.h
>>>>>>> create mode 100644 pc-bios/vof/vof.h
>>>>>>> create mode 100644 hw/ppc/spapr_vof.c
>>>>>>> create mode 100644 hw/ppc/vof.c
>>>>>>> create mode 100644 pc-bios/vof/bootmem.c
>>>>>>> create mode 100644 pc-bios/vof/ci.c
>>>>>>> create mode 100644 pc-bios/vof/libc.c
>>>>>>> create mode 100644 pc-bios/vof/main.c
>>>>>>> create mode 100644 pc-bios/vof-nvram.bin
>>>>>>> create mode 100755 pc-bios/vof.bin
>>>>>>> create mode 100644 pc-bios/vof/entry.S
>>>>>>> create mode 100644 pc-bios/vof/vof.lds
>>>>>>> 
>> [...]
>>>>>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>>>>>> new file mode 100644
>>>>>>> index 000000000000..65ca2fed0d41
>>>>>>> --- /dev/null
>>>>>>> +++ b/include/hw/ppc/vof.h
>>>>>>> @@ -0,0 +1,55 @@
>>>>>>> +/*
>>>>>>> + * Virtual Open Firmware
>>>>>>> + *
>>>>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>>>>> + */
>>>>>>> +#ifndef HW_VOF_H
>>>>>>> +#define HW_VOF_H
>>>>>>> +
>>>>>>> +typedef struct Vof {
>>>>>>> +    uint64_t top_addr; /* copied from rma_size */
>>>>>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>>>>>> +    uint64_t claimed_base;
>>>>>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>>>>>> +    uint32_t of_instance_last;
>>>>>>> +    char *bootargs;
>>>>>>> +    long fw_size;
>>>>>>> +} Vof;
>>>>>>> +
>>>>>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>>>>>> +                    target_ulong args_real);
>>>>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>>>>>>> align);
>>>>>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>>>>>> +void vof_cleanup(Vof *vof);
>>>>>>> +void vof_build_dt(void *fdt, Vof *vof);
>>>>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>>>>>>> *nodename,
>>>>>>> +                               const char *prop, const char *path);
>>>>>>> +
>>>>>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>>>>>> +
>>>>>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>>>>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>>>>>> TYPE_VOF_MACHINE_IF)
>>>>>>> +
>>>>>>> +struct VofMachineIfClass {
>>>>>>> +    InterfaceClass parent;
>>>>>>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>>>>>>> CPUState *cs,
>>>>>>> +                                                target_ulong vec);
>>>>>>> +    void (*quiesce)(MachineState *ms);
>>>>>>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>>>>>>> *propname,
>>>>>>> +                    void *val, int vallen);
>>>>>>> +};
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Initial stack size is from
>>>>>>> + * 
>>>>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>>>>> + */
>>>>>>> +#define VOF_STACK_SIZE       0x8000
>>>>>> 
>>>>>> Maybe also add a define for RTAS_SIZE here? We'll need to put that in 
>>>>>> the device tree but it depends on the rtas shim size that's part of VOF 
>>>>>> so it should be defined here instead of hardcoding it in boards that 
>>>>>> use VOF so it can be updated later at one place if needed.
>>>>> 
>>>>> This is rtas-size for pseries:
>>>>> 
>>>>> _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
>>>>>          ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t)));
>>>>> 
>>>>> => depends on cpus => depends on the command line.
>>>>> 
>>>>> 
>>>>> RTAS_SIZE is not used by anything in pseries anymore, I'll send a patch 
>>>>> to ditch it.
>>>> 
>>>> I mean you need to have at least the size of code in pc-bios/vof/entry.S 
>>>> hv_rtas where also hv_rtas_size is defined but that value is not 
>>>> available in QEMU where one needs to add it to the device tree. So a 
>>>> define for that should be here in vof.h. Currently I've counted 
>>>> instructions and have
>>>> 
>>>>      qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20);
>>>> 
>>>> in pegasos2.c but that 20 should be some VOF_RTAS_SIZE instead that you 
>>>> define corresponding to hv_rtas_size. You'll probably need the same even 
>>>> after changing above rtas size calculation in spapr because client has to 
>>>> allocate memory for instantiate-rtas.
>>> 
>>> 
>>> Ah fair point. I do not like "20" here and I think the right thing will be 
>>> adding whatever number of bytes to rtas-size in the firmware itself and 
>>> update it in QEMU via "setprop" as we do for "linux,rtas-base". And then 
>>> do the same in SLOF.
>> 
>> This is not the base address but the size of the shim with the hypercall 
>> that instantiate-rtas copies. Why does it need to be updated?
>
> The vm kernel allocates the space for it.
>
>> And why does it need to be more bytes than necessary?
>
> What is necessary? It is definitely way more than 20 bytes.

I thought instantiate-rtas only copies the hv_rtas routine as the comment 
in qemu/pc-bios/vof/entry.S says and that routine is 20 bytes. What else 
is needed? If that's not enough then we need a define for it even more, as 
boards using VOF have no idea otherwise.

>> I don't know what you do for spapr and why do you need larger rtas-size 
>> than this but for pegasos2 this /rtas/rtas-size property is only used by 
>> guests to allocate memory for rtas so all I need is how many bytes are 
>> needed for hv_rtas in pc-bios/vof/entry.S which is what should be #defined 
>> in vof.h. I've found 20 is just enough so you could add that to vof.h.
>
> I am thinking now that may be the property should be created by vof.bin and 
> not QEMU, QEMU just has to tell how many bytes on top it needs.

Maybe. If it's always in /rtas/rtas-size on every OF implementation (if 
that path is kind of standard for rtas) then that could also work or you 
could have a vof_init_rtas() function or similar that sets this, maybe 
pass it "/rtas" as path argument or even the whole property path 
("/rtas/rtas-size") to avoid hard coding it and let the board tell it 
where it expects this property, then the value can be set by this function 
so that's within VOF then. But I think just adding a define for it in 
vof.h is enough and simple. Then boards can add whatever they need and put 
that in the property where they like.

Regards,
BALATON Zoltan
Alexey Kardashevskiy June 18, 2021, 3:19 a.m. UTC | #13
On 6/17/21 21:29, BALATON Zoltan wrote:
> On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
>> On 17/06/2021 19:16, BALATON Zoltan wrote:
>>> On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
>>>> On 16/06/2021 20:34, BALATON Zoltan wrote:
>>>>> On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
>>>>>> On 6/15/21 20:29, BALATON Zoltan wrote:
>>>>>>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>>>>>>> The PAPR platform describes an OS environment that's presented by
>>>>>>>> a combination of a hypervisor and firmware. The features it 
>>>>>>>> specifies
>>>>>>>> require collaboration between the firmware and the hypervisor.
>>>>>>>>
>>>>>>>> Since the beginning, the runtime component of the firmware 
>>>>>>>> (RTAS) has
>>>>>>>> been implemented as a 20 byte shim which simply forwards it to
>>>>>>>> a hypercall implemented in qemu. The boot time firmware 
>>>>>>>> component is
>>>>>>>> SLOF - but a build that's specific to qemu, and has always 
>>>>>>>> needed to be
>>>>>>>> updated in sync with it. Even though we've managed to limit the 
>>>>>>>> amount
>>>>>>>> of runtime communication we need between qemu and SLOF, there's 
>>>>>>>> some,
>>>>>>>> and it has become increasingly awkward to handle as we've 
>>>>>>>> implemented
>>>>>>>> new features.
>>>>>>>>
>>>>>>>> This implements a boot time OF client interface (CI) which is
>>>>>>>> enabled by a new "x-vof" pseries machine option (stands for 
>>>>>>>> "Virtual Open
>>>>>>>> Firmware). When enabled, QEMU implements the custom H_OF_CLIENT 
>>>>>>>> hcall
>>>>>>>> which implements Open Firmware Client Interface (OF CI). This 
>>>>>>>> allows
>>>>>>>> using a smaller stateless firmware which does not have to manage
>>>>>>>> the device tree.
>>>>>>>>
>>>>>>>> The new "vof.bin" firmware image is included with source code under
>>>>>>>> pc-bios/. It also includes RTAS blob.
>>>>>>>>
>>>>>>>> This implements a handful of CI methods just to get -kernel/-initrd
>>>>>>>> working. In particular, this implements the device tree fetching 
>>>>>>>> and
>>>>>>>> simple memory allocator - "claim" (an OF CI memory allocator) 
>>>>>>>> and updates
>>>>>>>> "/memory@0/available" to report the client about available memory.
>>>>>>>>
>>>>>>>> This implements changing some device tree properties which we 
>>>>>>>> know how
>>>>>>>> to deal with, the rest is ignored. To allow changes, this skips
>>>>>>>> fdt_pack() when x-vof=on as not packing the blob leaves some 
>>>>>>>> room for
>>>>>>>> appending.
>>>>>>>>
>>>>>>>> In absence of SLOF, this assigns phandles to device tree nodes 
>>>>>>>> to make
>>>>>>>> device tree traversing work.
>>>>>>>>
>>>>>>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a 
>>>>>>>> tree.
>>>>>>>>
>>>>>>>> This adds basic instances support which are managed by a hash map
>>>>>>>> ihandle -> [phandle].
>>>>>>>>
>>>>>>>> Before the guest started, the used memory is:
>>>>>>>> 0..e60 - the initial firmware
>>>>>>>> 8000..10000 - stack
>>>>>>>> 400000.. - kernel
>>>>>>>> 3ea0000.. - initramdisk
>>>>>>>>
>>>>>>>> This OF CI does not implement "interpret".
>>>>>>>>
>>>>>>>> Unlike SLOF, this does not format uninitialized nvram. Instead, 
>>>>>>>> this
>>>>>>>> includes a disk image with pre-formatted nvram.
>>>>>>>>
>>>>>>>> With this basic support, this can only boot into kernel directly.
>>>>>>>> However this is just enough for the petitboot kernel and 
>>>>>>>> initradmdisk to
>>>>>>>> boot from any possible source. Note this requires reasonably 
>>>>>>>> recent guest
>>>>>>>> kernel with:
>>>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>>>>>>> The immediate benefit is much faster booting time which especially
>>>>>>>> crucial with fully emulated early CPU bring up environments. 
>>>>>>>> Also this
>>>>>>>> may come handy when/if GRUB-in-the-userspace sees light of the day.
>>>>>>>>
>>>>>>>> This separates VOF and sPAPR in a hope that VOF bits may be 
>>>>>>>> reused by
>>>>>>>> other POWERPC boards which do not support pSeries.
>>>>>>>>
>>>>>>>> This make VOF optional, it is disabled by default, add --enable-vof
>>>>>>>> to ./configure to enable it.
>>>>>>>>
>>>>>>>> This assumes potential support for booting from QEMU backends
>>>>>>>> such as blockdev or netdev without devices/drivers used.
>>>>>>>>
>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>>> ---
>>>>>>>>
>>>>>>>> The example command line is:
>>>>>>>>
>>>>>>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>>>>>>> -nodefaults \
>>>>>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>>>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>>>>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>>>>>>> -nographic \
>>>>>>>> -vga none \
>>>>>>>> -enable-kvm \
>>>>>>>> -m 8G \
>>>>>>>> -machine 
>>>>>>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>>>>>>> \
>>>>>>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>>>>>>> -initrd pb/rootfs.cpio.xz \
>>>>>>>> -drive 
>>>>>>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>>>>>>> \
>>>>>>>> -global spapr-nvram.drive=DRIVE0 \
>>>>>>>> -snapshot \
>>>>>>>> -smp 8,threads=8 \
>>>>>>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>>>>>>> -trace events=qemu_trace_events \
>>>>>>>> -d guest_errors \
>>>>>>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>>>>>>> -mon chardev=SOCKET0,mode=control
>>>>>>>
>>>>>>> I haven't looked at it in detail yet, just some quick comments I 
>>>>>>> have on first skim through.
>>>>>>>
>>>>>>>> ---
>>>>>>>> Changes:
>>>>>>>> v21:
>>>>>>>> * s/ld/ldz/ in entry.S
>>>>>>>
>>>>>>> Typo? Has this become lwz?
>>>>>>
>>>>>> Yup, lwz.
>>>>>>
>>>>>>>
>>>>>>>> * moved CONFIG_VOF from 
>>>>>>>> default-configs/devices/ppc64-softmmu.mak to Kconfig
>>>>>>>> * made CONFIG_VOF optional
>>>>>>>
>>>>>>> This won't work for pegasos2, see below.
>>>>>>>
>>>>>>>> * s/l.lds/vof.lds/
>>>>>>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>>>>>>> * added checks for non-null methods of VofMachineIfClass
>>>>>>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a 
>>>>>>>> better comment
>>>>>>>> * added  path_offset wrapper for handling mixed case for addresses
>>>>>>>> after "@" in node names
>>>>>>>> * changed getprop() to check for actual "name" property in the fdt
>>>>>>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as 
>>>>>>>> (unlike similar
>>>>>>>> rtas_ld/ldl_be_*) they return error codes
>>>>>>>> * VOF_MEM_READ uses now address_space_read (it was 
>>>>>>>> address_space_read_full
>>>>>>>> before, not sure why)
>>>>>>> [...]
>>>>>>>> ---
>>>>>>>> configure               |    9 +
>>>>>>>> pc-bios/vof/Makefile    |   23 +
>>>>>>>> include/hw/ppc/spapr.h  |   25 +-
>>>>>>>> include/hw/ppc/vof.h    |   55 ++
>>>>>>>> pc-bios/vof/vof.h       |   43 ++
>>>>>>>> hw/ppc/spapr.c          |   87 +++-
>>>>>>>> hw/ppc/spapr_hcall.c    |   29 +-
>>>>>>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>>>>>>> hw/ppc/vof.c            | 1052 
>>>>>>>> +++++++++++++++++++++++++++++++++++++++
>>>>>>>> pc-bios/vof/bootmem.c   |   14 +
>>>>>>>> pc-bios/vof/ci.c        |   91 ++++
>>>>>>>> pc-bios/vof/libc.c      |   92 ++++
>>>>>>>> pc-bios/vof/main.c      |   21 +
>>>>>>>> tests/qtest/rtas-test.c |   17 +-
>>>>>>>> MAINTAINERS             |   12 +
>>>>>>>> hw/ppc/Kconfig          |    3 +
>>>>>>>> hw/ppc/meson.build      |    3 +
>>>>>>>> hw/ppc/trace-events     |   24 +
>>>>>>>> meson.build             |    1 +
>>>>>>>> pc-bios/README          |    2 +
>>>>>>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>>>>>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>>>>>>> pc-bios/vof/entry.S     |   49 ++
>>>>>>>> pc-bios/vof/vof.lds     |   48 ++
>>>>>>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>>>>>>> create mode 100644 pc-bios/vof/Makefile
>>>>>>>> create mode 100644 include/hw/ppc/vof.h
>>>>>>>> create mode 100644 pc-bios/vof/vof.h
>>>>>>>> create mode 100644 hw/ppc/spapr_vof.c
>>>>>>>> create mode 100644 hw/ppc/vof.c
>>>>>>>> create mode 100644 pc-bios/vof/bootmem.c
>>>>>>>> create mode 100644 pc-bios/vof/ci.c
>>>>>>>> create mode 100644 pc-bios/vof/libc.c
>>>>>>>> create mode 100644 pc-bios/vof/main.c
>>>>>>>> create mode 100644 pc-bios/vof-nvram.bin
>>>>>>>> create mode 100755 pc-bios/vof.bin
>>>>>>>> create mode 100644 pc-bios/vof/entry.S
>>>>>>>> create mode 100644 pc-bios/vof/vof.lds
>>>>>>>>
>>> [...]
>>>>>>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>>>>>>> new file mode 100644
>>>>>>>> index 000000000000..65ca2fed0d41
>>>>>>>> --- /dev/null
>>>>>>>> +++ b/include/hw/ppc/vof.h
>>>>>>>> @@ -0,0 +1,55 @@
>>>>>>>> +/*
>>>>>>>> + * Virtual Open Firmware
>>>>>>>> + *
>>>>>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>>>>>> + */
>>>>>>>> +#ifndef HW_VOF_H
>>>>>>>> +#define HW_VOF_H
>>>>>>>> +
>>>>>>>> +typedef struct Vof {
>>>>>>>> +    uint64_t top_addr; /* copied from rma_size */
>>>>>>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>>>>>>> +    uint64_t claimed_base;
>>>>>>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>>>>>>> +    uint32_t of_instance_last;
>>>>>>>> +    char *bootargs;
>>>>>>>> +    long fw_size;
>>>>>>>> +} Vof;
>>>>>>>> +
>>>>>>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>>>>>>> +                    target_ulong args_real);
>>>>>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, 
>>>>>>>> uint64_t align);
>>>>>>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>>>>>>> +void vof_cleanup(Vof *vof);
>>>>>>>> +void vof_build_dt(void *fdt, Vof *vof);
>>>>>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>>>>>>>> *nodename,
>>>>>>>> +                               const char *prop, const char 
>>>>>>>> *path);
>>>>>>>> +
>>>>>>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>>>>>>> +
>>>>>>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>>>>>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>>>>>>> TYPE_VOF_MACHINE_IF)
>>>>>>>> +
>>>>>>>> +struct VofMachineIfClass {
>>>>>>>> +    InterfaceClass parent;
>>>>>>>> +    target_ulong (*client_architecture_support)(MachineState 
>>>>>>>> *ms, CPUState *cs,
>>>>>>>> +                                                target_ulong vec);
>>>>>>>> +    void (*quiesce)(MachineState *ms);
>>>>>>>> +    bool (*setprop)(MachineState *ms, const char *path, const 
>>>>>>>> char *propname,
>>>>>>>> +                    void *val, int vallen);
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * Initial stack size is from
>>>>>>>> + * 
>>>>>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>>>>>> + */
>>>>>>>> +#define VOF_STACK_SIZE       0x8000
>>>>>>>
>>>>>>> Maybe also add a define for RTAS_SIZE here? We'll need to put 
>>>>>>> that in the device tree but it depends on the rtas shim size 
>>>>>>> that's part of VOF so it should be defined here instead of 
>>>>>>> hardcoding it in boards that use VOF so it can be updated later 
>>>>>>> at one place if needed.
>>>>>>
>>>>>> This is rtas-size for pseries:
>>>>>>
>>>>>> _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
>>>>>>          ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t)));
>>>>>>
>>>>>> => depends on cpus => depends on the command line.
>>>>>>
>>>>>>
>>>>>> RTAS_SIZE is not used by anything in pseries anymore, I'll send a 
>>>>>> patch to ditch it.
>>>>>
>>>>> I mean you need to have at least the size of code in 
>>>>> pc-bios/vof/entry.S hv_rtas where also hv_rtas_size is defined but 
>>>>> that value is not available in QEMU where one needs to add it to 
>>>>> the device tree. So a define for that should be here in vof.h. 
>>>>> Currently I've counted instructions and have
>>>>>
>>>>>      qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20);
>>>>>
>>>>> in pegasos2.c but that 20 should be some VOF_RTAS_SIZE instead that 
>>>>> you define corresponding to hv_rtas_size. You'll probably need the 
>>>>> same even after changing above rtas size calculation in spapr 
>>>>> because client has to allocate memory for instantiate-rtas.
>>>>
>>>>
>>>> Ah fair point. I do not like "20" here and I think the right thing 
>>>> will be adding whatever number of bytes to rtas-size in the firmware 
>>>> itself and update it in QEMU via "setprop" as we do for 
>>>> "linux,rtas-base". And then do the same in SLOF.
>>>
>>> This is not the base address but the size of the shim with the 
>>> hypercall that instantiate-rtas copies. Why does it need to be updated?
>>
>> The vm kernel allocates the space for it.
>>
>>> And why does it need to be more bytes than necessary?
>>
>> What is necessary? It is definitely way more than 20 bytes.
> 
> I thought instantiate-rtas only copies the hv_rtas routine as the 
> comment in qemu/pc-bios/vof/entry.S says

It does only copy the code, correct.

> and that routine is 20 bytes. 


There is no "#define XXX 20" anywhere though. QEMU does not know and 
does not need to know that it is 20, it does not manage the RTAS blob.


> What else is needed? If that's not enough then we even more need a 
> define for it as boards using VOF have no idea otherwise.
> 
>>> I don't know what you do for spapr and why do you need larger 
>>> rtas-size than this but for pegasos2 this /rtas/rtas-size property is 
>>> only used by guests to allocate memory for rtas so all I need is how 
>>> many bytes are needed for hv_rtas in pc-bios/vof/entry.S which is 
>>> what should be #defined in vof.h. I've found 20 is just enough so you 
>>> could add that to vof.h.
>>
>> I am thinking now that may be the property should be created by 
>> vof.bin and not QEMU, QEMU just has to tell how many bytes on top it 
>> needs.
> 
> Maybe. If it's always in /rtas/rtas-size on every OF implementation (if 
> that path is kind of standard for rtas) then that could also work or you 
> could have an vof_init_rtas() function or similar that sets this, maybe 
> pass it "/rtas" as path argument or even the whole property path 
> ("/rtas/rtas-size") to avoid hard coding it and let the board tell it 
> where it expects this property, then the value can be set by this 
> function so that's within VOF then. But I think just adding a define for 
> it in vof.h is enough and simple. Then boards can add whatever they need 
> and put that in the property where they like.


My idea is that boards like pegasos put a zero in such a property and VOF 
then adjusts it to whatever it is + 20.
BALATON Zoltan June 18, 2021, 10:13 a.m. UTC | #14
On Fri, 18 Jun 2021, Alexey Kardashevskiy wrote:
> On 6/17/21 21:29, BALATON Zoltan wrote:
>> On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
>>> On 17/06/2021 19:16, BALATON Zoltan wrote:
>>>> On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
>>>>> On 16/06/2021 20:34, BALATON Zoltan wrote:
>>>>>> On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
>>>>>>> On 6/15/21 20:29, BALATON Zoltan wrote:
>>>>>>>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>>>>>>>> The PAPR platform describes an OS environment that's presented by
>>>>>>>>> a combination of a hypervisor and firmware. The features it 
>>>>>>>>> specifies
>>>>>>>>> require collaboration between the firmware and the hypervisor.
>>>>>>>>> 
>>>>>>>>> Since the beginning, the runtime component of the firmware (RTAS) 
>>>>>>>>> has
>>>>>>>>> been implemented as a 20 byte shim which simply forwards it to
>>>>>>>>> a hypercall implemented in qemu. The boot time firmware component is
>>>>>>>>> SLOF - but a build that's specific to qemu, and has always needed to 
>>>>>>>>> be
>>>>>>>>> updated in sync with it. Even though we've managed to limit the 
>>>>>>>>> amount
>>>>>>>>> of runtime communication we need between qemu and SLOF, there's 
>>>>>>>>> some,
>>>>>>>>> and it has become increasingly awkward to handle as we've 
>>>>>>>>> implemented
>>>>>>>>> new features.
>>>>>>>>> 
>>>>>>>>> This implements a boot time OF client interface (CI) which is
>>>>>>>>> enabled by a new "x-vof" pseries machine option (stands for "Virtual 
>>>>>>>>> Open
>>>>>>>>> Firmware). When enabled, QEMU implements the custom H_OF_CLIENT 
>>>>>>>>> hcall
>>>>>>>>> which implements Open Firmware Client Interface (OF CI). This allows
>>>>>>>>> using a smaller stateless firmware which does not have to manage
>>>>>>>>> the device tree.
>>>>>>>>> 
>>>>>>>>> The new "vof.bin" firmware image is included with source code under
>>>>>>>>> pc-bios/. It also includes RTAS blob.
>>>>>>>>> 
>>>>>>>>> This implements a handful of CI methods just to get -kernel/-initrd
>>>>>>>>> working. In particular, this implements the device tree fetching and
>>>>>>>>> simple memory allocator - "claim" (an OF CI memory allocator) and 
>>>>>>>>> updates
>>>>>>>>> "/memory@0/available" to report the client about available memory.
>>>>>>>>> 
>>>>>>>>> This implements changing some device tree properties which we know 
>>>>>>>>> how
>>>>>>>>> to deal with, the rest is ignored. To allow changes, this skips
>>>>>>>>> fdt_pack() when x-vof=on as not packing the blob leaves some room 
>>>>>>>>> for
>>>>>>>>> appending.
>>>>>>>>> 
>>>>>>>>> In absence of SLOF, this assigns phandles to device tree nodes to 
>>>>>>>>> make
>>>>>>>>> device tree traversing work.
>>>>>>>>> 
>>>>>>>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a 
>>>>>>>>> tree.
>>>>>>>>> 
>>>>>>>>> This adds basic instances support which are managed by a hash map
>>>>>>>>> ihandle -> [phandle].
>>>>>>>>> 
>>>>>>>>> Before the guest started, the used memory is:
>>>>>>>>> 0..e60 - the initial firmware
>>>>>>>>> 8000..10000 - stack
>>>>>>>>> 400000.. - kernel
>>>>>>>>> 3ea0000.. - initramdisk
>>>>>>>>> 
>>>>>>>>> This OF CI does not implement "interpret".
>>>>>>>>> 
>>>>>>>>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>>>>>>>>> includes a disk image with pre-formatted nvram.
>>>>>>>>> 
>>>>>>>>> With this basic support, this can only boot into kernel directly.
>>>>>>>>> However this is just enough for the petitboot kernel and 
>>>>>>>>> initradmdisk to
>>>>>>>>> boot from any possible source. Note this requires reasonably recent 
>>>>>>>>> guest
>>>>>>>>> kernel with:
>>>>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>>>>>>>> The immediate benefit is much faster booting time which especially
>>>>>>>>> crucial with fully emulated early CPU bring up environments. Also 
>>>>>>>>> this
>>>>>>>>> may come handy when/if GRUB-in-the-userspace sees light of the day.
>>>>>>>>> 
>>>>>>>>> This separates VOF and sPAPR in a hope that VOF bits may be reused 
>>>>>>>>> by
>>>>>>>>> other POWERPC boards which do not support pSeries.
>>>>>>>>> 
>>>>>>>>> This makes VOF optional; it is disabled by default, add --enable-vof
>>>>>>>>> to ./configure to enable it.
>>>>>>>>> 
>>>>>>>>> This assumes potential support for booting from QEMU backends
>>>>>>>>> such as blockdev or netdev without devices/drivers used.
>>>>>>>>> 
>>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>>>> ---
>>>>>>>>> 
>>>>>>>>> The example command line is:
>>>>>>>>> 
>>>>>>>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>>>>>>>>> -nodefaults \
>>>>>>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>>>>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>>>>>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>>>>>>>> -nographic \
>>>>>>>>> -vga none \
>>>>>>>>> -enable-kvm \
>>>>>>>>> -m 8G \
>>>>>>>>> -machine 
>>>>>>>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>>>>>>>> \
>>>>>>>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>>>>>>>> -initrd pb/rootfs.cpio.xz \
>>>>>>>>> -drive 
>>>>>>>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>>>>>>>> \
>>>>>>>>> -global spapr-nvram.drive=DRIVE0 \
>>>>>>>>> -snapshot \
>>>>>>>>> -smp 8,threads=8 \
>>>>>>>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>>>>>>>> -trace events=qemu_trace_events \
>>>>>>>>> -d guest_errors \
>>>>>>>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>>>>>>>> -mon chardev=SOCKET0,mode=control
>>>>>>>> 
>>>>>>>> I haven't looked at it in detail yet, just some quick comments I have 
>>>>>>>> on first skim through.
>>>>>>>> 
>>>>>>>>> ---
>>>>>>>>> Changes:
>>>>>>>>> v21:
>>>>>>>>> * s/ld/ldz/ in entry.S
>>>>>>>> 
>>>>>>>> Typo? Has this become lwz?
>>>>>>> 
>>>>>>> Yup, lwz.
>>>>>>> 
>>>>>>>> 
>>>>>>>>> * moved CONFIG_VOF from default-configs/devices/ppc64-softmmu.mak to 
>>>>>>>>> Kconfig
>>>>>>>>> * made CONFIG_VOF optional
>>>>>>>> 
>>>>>>>> This won't work for pegasos2, see below.
>>>>>>>> 
>>>>>>>>> * s/l.lds/vof.lds/
>>>>>>>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>>>>>>>> * added checks for non-null methods of VofMachineIfClass
>>>>>>>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a better 
>>>>>>>>> comment
>>>>>>>>> * added  path_offset wrapper for handling mixed case for addresses
>>>>>>>>> after "@" in node names
>>>>>>>>> * changed getprop() to check for actual "name" property in the fdt
>>>>>>>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as (unlike 
>>>>>>>>> similar
>>>>>>>>> rtas_ld/ldl_be_*) they return error codes
>>>>>>>>> * VOF_MEM_READ uses now address_space_read (it was 
>>>>>>>>> address_space_read_full
>>>>>>>>> before, not sure why)
>>>>>>>> [...]
>>>>>>>>> ---
>>>>>>>>> configure               |    9 +
>>>>>>>>> pc-bios/vof/Makefile    |   23 +
>>>>>>>>> include/hw/ppc/spapr.h  |   25 +-
>>>>>>>>> include/hw/ppc/vof.h    |   55 ++
>>>>>>>>> pc-bios/vof/vof.h       |   43 ++
>>>>>>>>> hw/ppc/spapr.c          |   87 +++-
>>>>>>>>> hw/ppc/spapr_hcall.c    |   29 +-
>>>>>>>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>>>>>>>> hw/ppc/vof.c            | 1052 
>>>>>>>>> +++++++++++++++++++++++++++++++++++++++
>>>>>>>>> pc-bios/vof/bootmem.c   |   14 +
>>>>>>>>> pc-bios/vof/ci.c        |   91 ++++
>>>>>>>>> pc-bios/vof/libc.c      |   92 ++++
>>>>>>>>> pc-bios/vof/main.c      |   21 +
>>>>>>>>> tests/qtest/rtas-test.c |   17 +-
>>>>>>>>> MAINTAINERS             |   12 +
>>>>>>>>> hw/ppc/Kconfig          |    3 +
>>>>>>>>> hw/ppc/meson.build      |    3 +
>>>>>>>>> hw/ppc/trace-events     |   24 +
>>>>>>>>> meson.build             |    1 +
>>>>>>>>> pc-bios/README          |    2 +
>>>>>>>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>>>>>>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>>>>>>>> pc-bios/vof/entry.S     |   49 ++
>>>>>>>>> pc-bios/vof/vof.lds     |   48 ++
>>>>>>>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>>>>>>>> create mode 100644 pc-bios/vof/Makefile
>>>>>>>>> create mode 100644 include/hw/ppc/vof.h
>>>>>>>>> create mode 100644 pc-bios/vof/vof.h
>>>>>>>>> create mode 100644 hw/ppc/spapr_vof.c
>>>>>>>>> create mode 100644 hw/ppc/vof.c
>>>>>>>>> create mode 100644 pc-bios/vof/bootmem.c
>>>>>>>>> create mode 100644 pc-bios/vof/ci.c
>>>>>>>>> create mode 100644 pc-bios/vof/libc.c
>>>>>>>>> create mode 100644 pc-bios/vof/main.c
>>>>>>>>> create mode 100644 pc-bios/vof-nvram.bin
>>>>>>>>> create mode 100755 pc-bios/vof.bin
>>>>>>>>> create mode 100644 pc-bios/vof/entry.S
>>>>>>>>> create mode 100644 pc-bios/vof/vof.lds
>>>>>>>>> 
>>>> [...]
>>>>>>>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>>>>>>>> new file mode 100644
>>>>>>>>> index 000000000000..65ca2fed0d41
>>>>>>>>> --- /dev/null
>>>>>>>>> +++ b/include/hw/ppc/vof.h
>>>>>>>>> @@ -0,0 +1,55 @@
>>>>>>>>> +/*
>>>>>>>>> + * Virtual Open Firmware
>>>>>>>>> + *
>>>>>>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>>>>>>> + */
>>>>>>>>> +#ifndef HW_VOF_H
>>>>>>>>> +#define HW_VOF_H
>>>>>>>>> +
>>>>>>>>> +typedef struct Vof {
>>>>>>>>> +    uint64_t top_addr; /* copied from rma_size */
>>>>>>>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>>>>>>>> +    uint64_t claimed_base;
>>>>>>>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>>>>>>>> +    uint32_t of_instance_last;
>>>>>>>>> +    char *bootargs;
>>>>>>>>> +    long fw_size;
>>>>>>>>> +} Vof;
>>>>>>>>> +
>>>>>>>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>>>>>>>> +                    target_ulong args_real);
>>>>>>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t 
>>>>>>>>> align);
>>>>>>>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>>>>>>>> +void vof_cleanup(Vof *vof);
>>>>>>>>> +void vof_build_dt(void *fdt, Vof *vof);
>>>>>>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char 
>>>>>>>>> *nodename,
>>>>>>>>> +                               const char *prop, const char *path);
>>>>>>>>> +
>>>>>>>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>>>>>>>> +
>>>>>>>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>>>>>>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>>>>>>>> TYPE_VOF_MACHINE_IF)
>>>>>>>>> +
>>>>>>>>> +struct VofMachineIfClass {
>>>>>>>>> +    InterfaceClass parent;
>>>>>>>>> +    target_ulong (*client_architecture_support)(MachineState *ms, 
>>>>>>>>> CPUState *cs,
>>>>>>>>> +                                                target_ulong vec);
>>>>>>>>> +    void (*quiesce)(MachineState *ms);
>>>>>>>>> +    bool (*setprop)(MachineState *ms, const char *path, const char 
>>>>>>>>> *propname,
>>>>>>>>> +                    void *val, int vallen);
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +/*
>>>>>>>>> + * Initial stack size is from
>>>>>>>>> + * 
>>>>>>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>>>>>>> + */
>>>>>>>>> +#define VOF_STACK_SIZE       0x8000
>>>>>>>> 
>>>>>>>> Maybe also add a define for RTAS_SIZE here? We'll need to put that in 
>>>>>>>> the device tree but it depends on the rtas shim size that's part of 
>>>>>>>> VOF so it should be defined here instead of hardcoding it in boards 
>>>>>>>> that use VOF so it can be updated later at one place if needed.
>>>>>>> 
>>>>>>> This is rtas-size for pseries:
>>>>>>> 
>>>>>>> _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
>>>>>>>          ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t)));
>>>>>>> 
>>>>>>> => depends on cpus => depends on the command line.
>>>>>>> 
>>>>>>> 
>>>>>>> RTAS_SIZE is not used by anything in pseries anymore, I'll send a 
>>>>>>> patch to ditch it.
>>>>>> 
>>>>>> I mean you need to have at least the size of code in 
>>>>>> pc-bios/vof/entry.S hv_rtas where also hv_rtas_size is defined but that 
>>>>>> value is not available in QEMU where one needs to add it to the device 
>>>>>> tree. So a define for that should be here in vof.h. Currently I've 
>>>>>> counted instructions and have
>>>>>> 
>>>>>>      qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20);
>>>>>> 
>>>>>> in pegasos2.c but that 20 should be some VOF_RTAS_SIZE instead that you 
>>>>>> define corresponding to hv_rtas_size. You'll probably need the same 
>>>>>> even after changing above rtas size calculation in spapr because client 
>>>>>> has to allocate memory for instantiate-rtas.
>>>>> 
>>>>> 
>>>>> Ah fair point. I do not like "20" here and I think the right thing will 
>>>>> be adding whatever number of bytes to rtas-size in the firmware itself 
>>>>> and update it in QEMU via "setprop" as we do for "linux,rtas-base". And 
>>>>> then do the same in SLOF.
>>>> 
>>>> This is not the base address but the size of the shim with the hypercall 
>>>> that instantiate-rtas copies. Why does it need to be updated?
>>> 
>>> The vm kernel allocates the space for it.
>>> 
>>>> And why does it need to be more bytes than necessary?
>>> 
>>> What is necessary? It is definitely way more than 20 bytes.
>> 
>> I thought instantiate-rtas only copies the hv_rtas routine as the comment 
>> in qemu/pc-bios/vof/entry.S says
>
> It does only copy the code, correct.
>
>> and that routine is 20 bytes. 
>
>
> There is no "#define XXX 20" anywhere though. QEMU does not know and does not 
> need to know that it is 20, it does not manage the RTAS blob.

But it manages the rtas-size property which has to be at least the RTAS 
blob size so that's why I thought VOF should share this define in vof.h.

>
>> What else is needed? If that's not enough then we need a define for it 
>> even more, as boards using VOF have no idea otherwise.
>> 
>>>> I don't know what you do for spapr and why do you need larger rtas-size 
>>>> than this but for pegasos2 this /rtas/rtas-size property is only used by 
>>>> guests to allocate memory for rtas so all I need is how many bytes are 
>>>> needed for hv_rtas in pc-bios/vof/entry.S which is what should be 
>>>> #defined in vof.h. I've found 20 is just enough so you could add that to 
>>>> vof.h.
>>> 
>>> I am thinking now that maybe the property should be created by vof.bin 
>>> and not QEMU, QEMU just has to tell how many bytes on top it needs.
>> 
>> Maybe. If it's always in /rtas/rtas-size on every OF implementation (if 
>> that path is kind of standard for rtas) then that could also work or you 
>> could have a vof_init_rtas() function or similar that sets this, maybe 
>> pass it "/rtas" as path argument or even the whole property path 
>> ("/rtas/rtas-size") to avoid hard coding it and let the board tell it where 
>> it expects this property, then the value can be set by this function so 
>> that's within VOF then. But I think just adding a define for it in vof.h is 
>> enough and simple. Then boards can add whatever they need and put that in 
>> the property where they like.
>
>
> My idea is that boards like pegasos put a zero in such property and VOF then 
> adjusts it to whatever it is + 20.

That could work too if VOF knows how to find this property. If it's the 
same on every board then it does not have to look through the whole tree 
for it.

Regards,
BALATON Zoltan
David Gibson June 19, 2021, 9:28 a.m. UTC | #15
On Thu, Jun 17, 2021 at 11:16:19AM +0200, BALATON Zoltan wrote:
> On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
> > On 16/06/2021 20:34, BALATON Zoltan wrote:
> > > On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
> > > > On 6/15/21 20:29, BALATON Zoltan wrote:
> > > > > On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
> > > > > > The PAPR platform describes an OS environment that's presented by
> > > > > > a combination of a hypervisor and firmware. The features it specifies
> > > > > > require collaboration between the firmware and the hypervisor.
> > > > > > 
> > > > > > Since the beginning, the runtime component of the firmware (RTAS) has
> > > > > > been implemented as a 20 byte shim which simply forwards it to
> > > > > > a hypercall implemented in qemu. The boot time firmware component is
> > > > > > SLOF - but a build that's specific to qemu, and has always needed to be
> > > > > > updated in sync with it. Even though we've managed to limit the amount
> > > > > > of runtime communication we need between qemu and SLOF, there's some,
> > > > > > and it has become increasingly awkward to handle as we've implemented
> > > > > > new features.
> > > > > > 
> > > > > > This implements a boot time OF client interface (CI) which is
> > > > > > enabled by a new "x-vof" pseries machine option (stands
> > > > > > for "Virtual Open
> > > > > > > Firmware"). When enabled, QEMU implements the custom H_OF_CLIENT hcall
> > > > > > which implements Open Firmware Client Interface (OF CI). This allows
> > > > > > using a smaller stateless firmware which does not have to manage
> > > > > > the device tree.
> > > > > > 
> > > > > > The new "vof.bin" firmware image is included with source code under
> > > > > > pc-bios/. It also includes RTAS blob.
> > > > > > 
> > > > > > This implements a handful of CI methods just to get -kernel/-initrd
> > > > > > working. In particular, this implements the device tree fetching and
> > > > > > simple memory allocator - "claim" (an OF CI memory
> > > > > > allocator) and updates
> > > > > > "/memory@0/available" to report the client about available memory.
> > > > > > 
> > > > > > This implements changing some device tree properties which we know how
> > > > > > to deal with, the rest is ignored. To allow changes, this skips
> > > > > > fdt_pack() when x-vof=on as not packing the blob leaves some room for
> > > > > > appending.
> > > > > > 
> > > > > > In absence of SLOF, this assigns phandles to device tree nodes to make
> > > > > > device tree traversing work.
> > > > > > 
> > > > > > When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
> > > > > > 
> > > > > > This adds basic instances support which are managed by a hash map
> > > > > > ihandle -> [phandle].
> > > > > > 
> > > > > > Before the guest started, the used memory is:
> > > > > > 0..e60 - the initial firmware
> > > > > > 8000..10000 - stack
> > > > > > 400000.. - kernel
> > > > > > 3ea0000.. - initramdisk
> > > > > > 
> > > > > > This OF CI does not implement "interpret".
> > > > > > 
> > > > > > Unlike SLOF, this does not format uninitialized nvram. Instead, this
> > > > > > includes a disk image with pre-formatted nvram.
> > > > > > 
> > > > > > With this basic support, this can only boot into kernel directly.
> > > > > > > However this is just enough for the petitboot kernel and initramdisk to
> > > > > > boot from any possible source. Note this requires
> > > > > > reasonably recent guest
> > > > > > kernel with:
> > > > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735
> > > > > > The immediate benefit is much faster booting time which
> > > > > > > is especially
> > > > > > crucial with fully emulated early CPU bring up environments. Also this
> > > > > > > may come in handy when/if GRUB-in-the-userspace sees the light of day.
> > > > > > 
> > > > > > This separates VOF and sPAPR in a hope that VOF bits may be reused by
> > > > > > other POWERPC boards which do not support pSeries.
> > > > > > 
> > > > > > > This makes VOF optional; it is disabled by default, add --enable-vof
> > > > > > to ./configure to enable it.
> > > > > > 
> > > > > > This assumes potential support for booting from QEMU backends
> > > > > > such as blockdev or netdev without devices/drivers used.
> > > > > > 
> > > > > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > > > > ---
> > > > > > 
> > > > > > The example command line is:
> > > > > > 
> > > > > > /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
> > > > > > -nodefaults \
> > > > > > -chardev stdio,id=STDIO0,signal=off,mux=on \
> > > > > > -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
> > > > > > -mon id=MON0,chardev=STDIO0,mode=readline \
> > > > > > -nographic \
> > > > > > -vga none \
> > > > > > -enable-kvm \
> > > > > > -m 8G \
> > > > > > -machine pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off
> > > > > > \
> > > > > > -kernel pbuild/kernel-le-guest/vmlinux \
> > > > > > -initrd pb/rootfs.cpio.xz \
> > > > > > -drive id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw
> > > > > > \
> > > > > > -global spapr-nvram.drive=DRIVE0 \
> > > > > > -snapshot \
> > > > > > -smp 8,threads=8 \
> > > > > > -L /home/aik/t/qemu-ppc64-bios/ \
> > > > > > -trace events=qemu_trace_events \
> > > > > > -d guest_errors \
> > > > > > -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
> > > > > > -mon chardev=SOCKET0,mode=control
> > > > > 
> > > > > I haven't looked at it in detail yet, just some quick
> > > > > comments I have on first skim through.
> > > > > 
> > > > > > ---
> > > > > > Changes:
> > > > > > v21:
> > > > > > * s/ld/ldz/ in entry.S
> > > > > 
> > > > > Typo? Has this become lwz?
> > > > 
> > > > Yup, lwz.
> > > > 
> > > > > 
> > > > > > * moved CONFIG_VOF from
> > > > > > default-configs/devices/ppc64-softmmu.mak to Kconfig
> > > > > > * made CONFIG_VOF optional
> > > > > 
> > > > > This won't work for pegasos2, see below.
> > > > > 
> > > > > > * s/l.lds/vof.lds/
> > > > > > * force 32 BE in spapr_machine_reset() instead of the firmware
> > > > > > * added checks for non-null methods of VofMachineIfClass
> > > > > > * moved OF_STACK_SIZE to vof.h, renamed to VOF_...,
> > > > > > added a better comment
> > > > > > * added  path_offset wrapper for handling mixed case for addresses
> > > > > > after "@" in node names
> > > > > > * changed getprop() to check for actual "name" property in the fdt
> > > > > > * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing
> > > > > > as (unlike similar
> > > > > > rtas_ld/ldl_be_*) they return error codes
> > > > > > * VOF_MEM_READ uses now address_space_read (it was
> > > > > > address_space_read_full
> > > > > > before, not sure why)
> > > > > [...]
> > > > > > ---
> > > > > > configure               |    9 +
> > > > > > pc-bios/vof/Makefile    |   23 +
> > > > > > include/hw/ppc/spapr.h  |   25 +-
> > > > > > include/hw/ppc/vof.h    |   55 ++
> > > > > > pc-bios/vof/vof.h       |   43 ++
> > > > > > hw/ppc/spapr.c          |   87 +++-
> > > > > > hw/ppc/spapr_hcall.c    |   29 +-
> > > > > > hw/ppc/spapr_vof.c      |  153 ++++++
> > > > > > hw/ppc/vof.c            | 1052 +++++++++++++++++++++++++++++++++++++++
> > > > > > pc-bios/vof/bootmem.c   |   14 +
> > > > > > pc-bios/vof/ci.c        |   91 ++++
> > > > > > pc-bios/vof/libc.c      |   92 ++++
> > > > > > pc-bios/vof/main.c      |   21 +
> > > > > > tests/qtest/rtas-test.c |   17 +-
> > > > > > MAINTAINERS             |   12 +
> > > > > > hw/ppc/Kconfig          |    3 +
> > > > > > hw/ppc/meson.build      |    3 +
> > > > > > hw/ppc/trace-events     |   24 +
> > > > > > meson.build             |    1 +
> > > > > > pc-bios/README          |    2 +
> > > > > > pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
> > > > > > pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
> > > > > > pc-bios/vof/entry.S     |   49 ++
> > > > > > pc-bios/vof/vof.lds     |   48 ++
> > > > > > 24 files changed, 1840 insertions(+), 13 deletions(-)
> > > > > > create mode 100644 pc-bios/vof/Makefile
> > > > > > create mode 100644 include/hw/ppc/vof.h
> > > > > > create mode 100644 pc-bios/vof/vof.h
> > > > > > create mode 100644 hw/ppc/spapr_vof.c
> > > > > > create mode 100644 hw/ppc/vof.c
> > > > > > create mode 100644 pc-bios/vof/bootmem.c
> > > > > > create mode 100644 pc-bios/vof/ci.c
> > > > > > create mode 100644 pc-bios/vof/libc.c
> > > > > > create mode 100644 pc-bios/vof/main.c
> > > > > > create mode 100644 pc-bios/vof-nvram.bin
> > > > > > create mode 100755 pc-bios/vof.bin
> > > > > > create mode 100644 pc-bios/vof/entry.S
> > > > > > create mode 100644 pc-bios/vof/vof.lds
> > > > > > 
> [...]
> > > > > > diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
> > > > > > new file mode 100644
> > > > > > index 000000000000..65ca2fed0d41
> > > > > > --- /dev/null
> > > > > > +++ b/include/hw/ppc/vof.h
> > > > > > @@ -0,0 +1,55 @@
> > > > > > +/*
> > > > > > + * Virtual Open Firmware
> > > > > > + *
> > > > > > + * SPDX-License-Identifier: GPL-2.0-or-later
> > > > > > + */
> > > > > > +#ifndef HW_VOF_H
> > > > > > +#define HW_VOF_H
> > > > > > +
> > > > > > +typedef struct Vof {
> > > > > > +    uint64_t top_addr; /* copied from rma_size */
> > > > > > +    GArray *claimed; /* array of SpaprOfClaimed */
> > > > > > +    uint64_t claimed_base;
> > > > > > +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
> > > > > > +    uint32_t of_instance_last;
> > > > > > +    char *bootargs;
> > > > > > +    long fw_size;
> > > > > > +} Vof;
> > > > > > +
> > > > > > +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
> > > > > > +                    target_ulong args_real);
> > > > > > +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t
> > > > > > size, uint64_t align);
> > > > > > +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
> > > > > > +void vof_cleanup(Vof *vof);
> > > > > > +void vof_build_dt(void *fdt, Vof *vof);
> > > > > > +uint32_t vof_client_open_store(void *fdt, Vof *vof,
> > > > > > const char *nodename,
> > > > > > +                               const char *prop, const char *path);
> > > > > > +
> > > > > > +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
> > > > > > +
> > > > > > +typedef struct VofMachineIfClass VofMachineIfClass;
> > > > > > +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE,
> > > > > > TYPE_VOF_MACHINE_IF)
> > > > > > +
> > > > > > +struct VofMachineIfClass {
> > > > > > +    InterfaceClass parent;
> > > > > > +    target_ulong
> > > > > > (*client_architecture_support)(MachineState *ms,
> > > > > > CPUState *cs,
> > > > > > +                                                target_ulong vec);
> > > > > > +    void (*quiesce)(MachineState *ms);
> > > > > > +    bool (*setprop)(MachineState *ms, const char *path,
> > > > > > const char *propname,
> > > > > > +                    void *val, int vallen);
> > > > > > +};
> > > > > > +
> > > > > > +/*
> > > > > > + * Initial stack size is from
> > > > > > + * https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html
> > > > > > + */
> > > > > > +#define VOF_STACK_SIZE       0x8000
> > > > > 
> > > > > Maybe also add a define for RTAS_SIZE here? We'll need to
> > > > > put that in the device tree but it depends on the rtas shim
> > > > > size that's part of VOF so it should be defined here instead
> > > > > of hardcoding it in boards that use VOF so it can be updated
> > > > > later at one place if needed.
> > > > 
> > > > This is rtas-size for pseries:
> > > > 
> > > > _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
> > > >          ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t)));
> > > > 
> > > > => depends on cpus => depends on the command line.
> > > > 
> > > > 
> > > > RTAS_SIZE is not used by anything in pseries anymore, I'll send
> > > > a patch to ditch it.
> > > 
> > > I mean you need to have at least the size of code in
> > > pc-bios/vof/entry.S hv_rtas where also hv_rtas_size is defined but
> > > that value is not available in QEMU where one needs to add it to the
> > > device tree. So a define for that should be here in vof.h. Currently
> > > I've counted instructions and have
> > > 
> > >      qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20);
> > > 
> > > in pegasos2.c but that 20 should be some VOF_RTAS_SIZE instead that
> > > you define corresponding to hv_rtas_size. You'll probably need the
> > > same even after changing above rtas size calculation in spapr
> > > because client has to allocate memory for instantiate-rtas.
> > 
> > 
> > Ah fair point. I do not like "20" here and I think the right thing will
> > be adding whatever number of bytes to rtas-size in the firmware itself
> > and update it in QEMU via "setprop" as we do for "linux,rtas-base". And
> > then do the same in SLOF.
> 
> This is not the base address but the size of the shim with the hypercall
> that instantiate-rtas copies. Why does it need to be updated? And why does
> it need to be more bytes than necessary? I don't know what you do for spapr
> and why do you need larger rtas-size than this but for pegasos2 this
> /rtas/rtas-size property is only used by guests to allocate memory for rtas
> so all I need is how many bytes are needed for hv_rtas in
> pc-bios/vof/entry.S which is what should be #defined in vof.h. I've found 20
> is just enough so you could add that to vof.h.

Because spapr has fwnmi, a firmware-assisted dump mechanism which logs
things into part of the RTAS space.  It's a stupid interface, but
that's how PAPR specifies it.  You neither need nor want that for
Pegasos, so you can just have the rtas-size be the 20 bytes for the
tiny hypercall shim.
Alexey Kardashevskiy June 22, 2021, 7:49 a.m. UTC | #16
On 6/18/21 20:13, BALATON Zoltan wrote:
> On Fri, 18 Jun 2021, Alexey Kardashevskiy wrote:
>> On 6/17/21 21:29, BALATON Zoltan wrote:
>>> On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
>>>> On 17/06/2021 19:16, BALATON Zoltan wrote:
>>>>> On Thu, 17 Jun 2021, Alexey Kardashevskiy wrote:
>>>>>> On 16/06/2021 20:34, BALATON Zoltan wrote:
>>>>>>> On Wed, 16 Jun 2021, Alexey Kardashevskiy wrote:
>>>>>>>> On 6/15/21 20:29, BALATON Zoltan wrote:
>>>>>>>>> On Tue, 15 Jun 2021, Alexey Kardashevskiy wrote:
>>>>>>>>>> The PAPR platform describes an OS environment that's presented by
>>>>>>>>>> a combination of a hypervisor and firmware. The features it 
>>>>>>>>>> specifies
>>>>>>>>>> require collaboration between the firmware and the hypervisor.
>>>>>>>>>>
>>>>>>>>>> Since the beginning, the runtime component of the firmware 
>>>>>>>>>> (RTAS) has
>>>>>>>>>> been implemented as a 20 byte shim which simply forwards it to
>>>>>>>>>> a hypercall implemented in qemu. The boot time firmware 
>>>>>>>>>> component is
>>>>>>>>>> SLOF - but a build that's specific to qemu, and has always 
>>>>>>>>>> needed to be
>>>>>>>>>> updated in sync with it. Even though we've managed to limit 
>>>>>>>>>> the amount
>>>>>>>>>> of runtime communication we need between qemu and SLOF, 
>>>>>>>>>> there's some,
>>>>>>>>>> and it has become increasingly awkward to handle as we've 
>>>>>>>>>> implemented
>>>>>>>>>> new features.
>>>>>>>>>>
>>>>>>>>>> This implements a boot time OF client interface (CI) which is
>>>>>>>>>> enabled by a new "x-vof" pseries machine option (stands for 
>>>>>>>>>> "Virtual Open
>>>>>>>>>> Firmware"). When enabled, QEMU implements the custom 
>>>>>>>>>> H_OF_CLIENT hcall
>>>>>>>>>> which implements Open Firmware Client Interface (OF CI). This 
>>>>>>>>>> allows
>>>>>>>>>> using a smaller stateless firmware which does not have to manage
>>>>>>>>>> the device tree.
>>>>>>>>>>
>>>>>>>>>> The new "vof.bin" firmware image is included with source code 
>>>>>>>>>> under
>>>>>>>>>> pc-bios/. It also includes RTAS blob.
>>>>>>>>>>
>>>>>>>>>> This implements a handful of CI methods just to get 
>>>>>>>>>> -kernel/-initrd
>>>>>>>>>> working. In particular, this implements the device tree 
>>>>>>>>>> fetching and
>>>>>>>>>> simple memory allocator - "claim" (an OF CI memory allocator) 
>>>>>>>>>> and updates
>>>>>>>>>> "/memory@0/available" to report the client about available 
>>>>>>>>>> memory.
>>>>>>>>>>
>>>>>>>>>> This implements changing some device tree properties which we 
>>>>>>>>>> know how
>>>>>>>>>> to deal with, the rest is ignored. To allow changes, this skips
>>>>>>>>>> fdt_pack() when x-vof=on as not packing the blob leaves some 
>>>>>>>>>> room for
>>>>>>>>>> appending.
>>>>>>>>>>
>>>>>>>>>> In absence of SLOF, this assigns phandles to device tree nodes 
>>>>>>>>>> to make
>>>>>>>>>> device tree traversing work.
>>>>>>>>>>
>>>>>>>>>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds 
>>>>>>>>>> a tree.
>>>>>>>>>>
>>>>>>>>>> This adds basic instances support which are managed by a hash map
>>>>>>>>>> ihandle -> [phandle].
>>>>>>>>>>
>>>>>>>>>> Before the guest started, the used memory is:
>>>>>>>>>> 0..e60 - the initial firmware
>>>>>>>>>> 8000..10000 - stack
>>>>>>>>>> 400000.. - kernel
>>>>>>>>>> 3ea0000.. - initramdisk
>>>>>>>>>>
>>>>>>>>>> This OF CI does not implement "interpret".
>>>>>>>>>>
>>>>>>>>>> Unlike SLOF, this does not format uninitialized nvram. 
>>>>>>>>>> Instead, this
>>>>>>>>>> includes a disk image with pre-formatted nvram.
>>>>>>>>>>
>>>>>>>>>> With this basic support, this can only boot into kernel directly.
>>>>>>>>>> However this is just enough for the petitboot kernel and 
>>>>>>>>>> initramdisk to
>>>>>>>>>> boot from any possible source. Note this requires reasonably 
>>>>>>>>>> recent guest
>>>>>>>>>> kernel with:
>>>>>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735 
>>>>>>>>>> The immediate benefit is much faster booting time which 
>>>>>>>>>> is especially
>>>>>>>>>> crucial with fully emulated early CPU bring up environments. 
>>>>>>>>>> Also this
>>>>>>>>>> may come in handy when/if GRUB-in-the-userspace sees the light of 
>>>>>>>>>> day.
>>>>>>>>>>
>>>>>>>>>> This separates VOF and sPAPR in a hope that VOF bits may be 
>>>>>>>>>> reused by
>>>>>>>>>> other POWERPC boards which do not support pSeries.
>>>>>>>>>>
>>>>>>>>>> This makes VOF optional; it is disabled by default, add 
>>>>>>>>>> --enable-vof
>>>>>>>>>> to ./configure to enable it.
>>>>>>>>>>
>>>>>>>>>> This assumes potential support for booting from QEMU backends
>>>>>>>>>> such as blockdev or netdev without devices/drivers used.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>>>>> ---
>>>>>>>>>>
>>>>>>>>>> The example command line is:
>>>>>>>>>>
>>>>>>>>>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 
>>>>>>>>>> \
>>>>>>>>>> -nodefaults \
>>>>>>>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>>>>>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>>>>>>>> -mon id=MON0,chardev=STDIO0,mode=readline \
>>>>>>>>>> -nographic \
>>>>>>>>>> -vga none \
>>>>>>>>>> -enable-kvm \
>>>>>>>>>> -m 8G \
>>>>>>>>>> -machine 
>>>>>>>>>> pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off 
>>>>>>>>>> \
>>>>>>>>>> -kernel pbuild/kernel-le-guest/vmlinux \
>>>>>>>>>> -initrd pb/rootfs.cpio.xz \
>>>>>>>>>> -drive 
>>>>>>>>>> id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof-nvram.bin,format=raw 
>>>>>>>>>> \
>>>>>>>>>> -global spapr-nvram.drive=DRIVE0 \
>>>>>>>>>> -snapshot \
>>>>>>>>>> -smp 8,threads=8 \
>>>>>>>>>> -L /home/aik/t/qemu-ppc64-bios/ \
>>>>>>>>>> -trace events=qemu_trace_events \
>>>>>>>>>> -d guest_errors \
>>>>>>>>>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>>>>>>>>>> -mon chardev=SOCKET0,mode=control
>>>>>>>>>
>>>>>>>>> I haven't looked at it in detail yet, just some quick comments 
>>>>>>>>> I have on first skim through.
>>>>>>>>>
>>>>>>>>>> ---
>>>>>>>>>> Changes:
>>>>>>>>>> v21:
>>>>>>>>>> * s/ld/ldz/ in entry.S
>>>>>>>>>
>>>>>>>>> Typo? Has this become lwz?
>>>>>>>>
>>>>>>>> Yup, lwz.
>>>>>>>>
>>>>>>>>>
>>>>>>>>>> * moved CONFIG_VOF from 
>>>>>>>>>> default-configs/devices/ppc64-softmmu.mak to Kconfig
>>>>>>>>>> * made CONFIG_VOF optional
>>>>>>>>>
>>>>>>>>> This won't work for pegasos2, see below.
>>>>>>>>>
>>>>>>>>>> * s/l.lds/vof.lds/
>>>>>>>>>> * force 32 BE in spapr_machine_reset() instead of the firmware
>>>>>>>>>> * added checks for non-null methods of VofMachineIfClass
>>>>>>>>>> * moved OF_STACK_SIZE to vof.h, renamed to VOF_..., added a 
>>>>>>>>>> better comment
>>>>>>>>>> * added  path_offset wrapper for handling mixed case for 
>>>>>>>>>> addresses
>>>>>>>>>> after "@" in node names
>>>>>>>>>> * changed getprop() to check for actual "name" property in the 
>>>>>>>>>> fdt
>>>>>>>>>> * moved VOF_MEM_READ/VOF_MEM_WRITE to vof.h for sharing as 
>>>>>>>>>> (unlike similar
>>>>>>>>>> rtas_ld/ldl_be_*) they return error codes
>>>>>>>>>> * VOF_MEM_READ uses now address_space_read (it was 
>>>>>>>>>> address_space_read_full
>>>>>>>>>> before, not sure why)
>>>>>>>>> [...]
>>>>>>>>>> ---
>>>>>>>>>> configure               |    9 +
>>>>>>>>>> pc-bios/vof/Makefile    |   23 +
>>>>>>>>>> include/hw/ppc/spapr.h  |   25 +-
>>>>>>>>>> include/hw/ppc/vof.h    |   55 ++
>>>>>>>>>> pc-bios/vof/vof.h       |   43 ++
>>>>>>>>>> hw/ppc/spapr.c          |   87 +++-
>>>>>>>>>> hw/ppc/spapr_hcall.c    |   29 +-
>>>>>>>>>> hw/ppc/spapr_vof.c      |  153 ++++++
>>>>>>>>>> hw/ppc/vof.c            | 1052 
>>>>>>>>>> +++++++++++++++++++++++++++++++++++++++
>>>>>>>>>> pc-bios/vof/bootmem.c   |   14 +
>>>>>>>>>> pc-bios/vof/ci.c        |   91 ++++
>>>>>>>>>> pc-bios/vof/libc.c      |   92 ++++
>>>>>>>>>> pc-bios/vof/main.c      |   21 +
>>>>>>>>>> tests/qtest/rtas-test.c |   17 +-
>>>>>>>>>> MAINTAINERS             |   12 +
>>>>>>>>>> hw/ppc/Kconfig          |    3 +
>>>>>>>>>> hw/ppc/meson.build      |    3 +
>>>>>>>>>> hw/ppc/trace-events     |   24 +
>>>>>>>>>> meson.build             |    1 +
>>>>>>>>>> pc-bios/README          |    2 +
>>>>>>>>>> pc-bios/vof-nvram.bin   |  Bin 0 -> 16384 bytes
>>>>>>>>>> pc-bios/vof.bin         |  Bin 0 -> 3784 bytes
>>>>>>>>>> pc-bios/vof/entry.S     |   49 ++
>>>>>>>>>> pc-bios/vof/vof.lds     |   48 ++
>>>>>>>>>> 24 files changed, 1840 insertions(+), 13 deletions(-)
>>>>>>>>>> create mode 100644 pc-bios/vof/Makefile
>>>>>>>>>> create mode 100644 include/hw/ppc/vof.h
>>>>>>>>>> create mode 100644 pc-bios/vof/vof.h
>>>>>>>>>> create mode 100644 hw/ppc/spapr_vof.c
>>>>>>>>>> create mode 100644 hw/ppc/vof.c
>>>>>>>>>> create mode 100644 pc-bios/vof/bootmem.c
>>>>>>>>>> create mode 100644 pc-bios/vof/ci.c
>>>>>>>>>> create mode 100644 pc-bios/vof/libc.c
>>>>>>>>>> create mode 100644 pc-bios/vof/main.c
>>>>>>>>>> create mode 100644 pc-bios/vof-nvram.bin
>>>>>>>>>> create mode 100755 pc-bios/vof.bin
>>>>>>>>>> create mode 100644 pc-bios/vof/entry.S
>>>>>>>>>> create mode 100644 pc-bios/vof/vof.lds
>>>>>>>>>>
>>>>> [...]
>>>>>>>>>> diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
>>>>>>>>>> new file mode 100644
>>>>>>>>>> index 000000000000..65ca2fed0d41
>>>>>>>>>> --- /dev/null
>>>>>>>>>> +++ b/include/hw/ppc/vof.h
>>>>>>>>>> @@ -0,0 +1,55 @@
>>>>>>>>>> +/*
>>>>>>>>>> + * Virtual Open Firmware
>>>>>>>>>> + *
>>>>>>>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>>>>>>>> + */
>>>>>>>>>> +#ifndef HW_VOF_H
>>>>>>>>>> +#define HW_VOF_H
>>>>>>>>>> +
>>>>>>>>>> +typedef struct Vof {
>>>>>>>>>> +    uint64_t top_addr; /* copied from rma_size */
>>>>>>>>>> +    GArray *claimed; /* array of SpaprOfClaimed */
>>>>>>>>>> +    uint64_t claimed_base;
>>>>>>>>>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>>>>>>>>>> +    uint32_t of_instance_last;
>>>>>>>>>> +    char *bootargs;
>>>>>>>>>> +    long fw_size;
>>>>>>>>>> +} Vof;
>>>>>>>>>> +
>>>>>>>>>> +int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
>>>>>>>>>> +                    target_ulong args_real);
>>>>>>>>>> +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, 
>>>>>>>>>> uint64_t align);
>>>>>>>>>> +void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
>>>>>>>>>> +void vof_cleanup(Vof *vof);
>>>>>>>>>> +void vof_build_dt(void *fdt, Vof *vof);
>>>>>>>>>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const 
>>>>>>>>>> char *nodename,
>>>>>>>>>> +                               const char *prop, const char 
>>>>>>>>>> *path);
>>>>>>>>>> +
>>>>>>>>>> +#define TYPE_VOF_MACHINE_IF "vof-machine-if"
>>>>>>>>>> +
>>>>>>>>>> +typedef struct VofMachineIfClass VofMachineIfClass;
>>>>>>>>>> +DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, 
>>>>>>>>>> TYPE_VOF_MACHINE_IF)
>>>>>>>>>> +
>>>>>>>>>> +struct VofMachineIfClass {
>>>>>>>>>> +    InterfaceClass parent;
>>>>>>>>>> +    target_ulong (*client_architecture_support)(MachineState 
>>>>>>>>>> *ms, CPUState *cs,
>>>>>>>>>> +                                                target_ulong 
>>>>>>>>>> vec);
>>>>>>>>>> +    void (*quiesce)(MachineState *ms);
>>>>>>>>>> +    bool (*setprop)(MachineState *ms, const char *path, const 
>>>>>>>>>> char *propname,
>>>>>>>>>> +                    void *val, int vallen);
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>> +/*
>>>>>>>>>> + * Initial stack size is from
>>>>>>>>>> + * 
>>>>>>>>>> https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html 
>>>>>>>>>> + */
>>>>>>>>>> +#define VOF_STACK_SIZE       0x8000
>>>>>>>>>
>>>>>>>>> Maybe also add a define for RTAS_SIZE here? We'll need to put 
>>>>>>>>> that in the device tree but it depends on the rtas shim size 
>>>>>>>>> that's part of VOF so it should be defined here instead of 
>>>>>>>>> hardcoding it in boards that use VOF so it can be updated later 
>>>>>>>>> at one place if needed.
>>>>>>>>
>>>>>>>> This is rtas-size for pseries:
>>>>>>>>
>>>>>>>> _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX +
>>>>>>>>          ms->smp.max_cpus * sizeof(uint64_t)*2 + 
>>>>>>>> sizeof(uint64_t)));
>>>>>>>>
>>>>>>>> => depends on cpus => depends on the command line.
>>>>>>>>
>>>>>>>>
>>>>>>>> RTAS_SIZE is not used by anything in pseries anymore, I'll send 
>>>>>>>> a patch to ditch it.
>>>>>>>
>>>>>>> I mean you need to have at least the size of code in 
>>>>>>> pc-bios/vof/entry.S hv_rtas where also hv_rtas_size is defined 
>>>>>>> but that value is not available in QEMU where one needs to add it 
>>>>>>> to the device tree. So a define for that should be here in vof.h. 
>>>>>>> Currently I've counted instructions and have
>>>>>>>
>>>>>>>      qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20);
>>>>>>>
>>>>>>> in pegasos2.c but that 20 should be some VOF_RTAS_SIZE instead 
>>>>>>> that you define corresponding to hv_rtas_size. You'll probably 
>>>>>>> need the same even after changing above rtas size calculation in 
>>>>>>> spapr because client has to allocate memory for instantiate-rtas.
>>>>>>
>>>>>>
>>>>>> Ah fair point. I do not like "20" here and I think the right thing 
>>>>>> will be adding whatever number of bytes to rtas-size in the 
>>>>>> firmware itself and update it in QEMU via "setprop" as we do for 
>>>>>> "linux,rtas-base". And then do the same in SLOF.
>>>>>
>>>>> This is not the base address but the size of the shim with the 
>>>>> hypercall that instantiate-rtas copies. Why does it need to be 
>>>>> updated?
>>>>
>>>> The vm kernel allocates the space for it.
>>>>
>>>>> And why does it need to be more bytes than necessary?
>>>>
>>>> What is necessary? It is definitely way more than 20 bytes.
>>>
>>> I thought instantiate-rtas only copies the hv_rtas routine as the 
>>> comment in qemu/pc-bios/vof/entry.S says
>>
>> It does only copy the code, correct.
>>
>>> and that routine is 20 bytes. 
>>
>>
>> There is no "#define XXX 20" anywhere though. QEMU does not know and 
>> does not need to know that it is 20, it does not manage the RTAS blob.
> 
> But it manages the rtas-size property which has to be at least the RTAS 
> blob size so that's why I thought VOF should share this define in vof.h.
> 
>>
>>> What else is needed? If that's not enough then we even more need a 
>>> define for it as boards using VOF have no idea otherwise.
>>>
>>>>> I don't know what you do for spapr and why do you need larger 
>>>>> rtas-size than this but for pegasos2 this /rtas/rtas-size property 
>>>>> is only used by guests to allocate memory for rtas so all I need is 
>>>>> how many bytes are needed for hv_rtas in pc-bios/vof/entry.S which 
>>>>> is what should be #defined in vof.h. I've found 20 is just enough 
>>>>> so you could add that to vof.h.
>>>>
>>>> I am thinking now that may be the property should be created by 
>>>> vof.bin and not QEMU, QEMU just has to tell how many bytes on top it 
>>>> needs.
>>>
>>> Maybe. If it's always in /rtas/rtas-size on every OF implementation 
>>> (if that path is kind of standard for rtas) then that could also work 
>>> or you could have an vof_init_rtas() function or similar that sets 
>>> this, maybe pass it "/rtas" as path argument or even the whole 
>>> property path ("/rtas/rtas-size") to avoid hard coding it and let the 
>>> board tell it where it expects this property, then the value can be 
>>> set by this function so that's within VOF then. But I think just 
>>> adding a define for it in vof.h is enough and simple. Then boards can 
>>> add whatever they need and put that in the property where they like.
>>
>>
>> My idea is that boards like pegasos put a zero in such property and 
>> VOF then adjusts it to whatever it is + 20.
> 
> That could work too if VOF knows how to find this property. If it's the 
> same on every board then it does not have to look through the whole tree 
> for it.


After some thinking, I guess you just have to use 20 or some safe
future-proof number (64 bytes or similar) as I cannot reliably define the RTAS
blob size in QEMU. Hacking the firmware seems even worse as the firmware
does not really care. Well, I can say in QEMU it is 20 but the same code
in spapr_rtas.c is supposed to work with the RTAS blob provided by VOF
and by SLOF and even though these are identical now, this is not
enforced anyhow and not checked either.
diff mbox series

Patch

diff --git a/configure b/configure
index 8dcb9965b24e..00dc29c027fa 100755
--- a/configure
+++ b/configure
@@ -445,6 +445,7 @@  fuse="auto"
 fuse_lseek="auto"
 multiprocess="auto"
 slirp_smbd="$default_feature"
+vof="no"
 
 malloc_trim="auto"
 gio="$default_feature"
@@ -1561,6 +1562,10 @@  for opt do
   ;;
   --disable-slirp-smbd) slirp_smbd=no
   ;;
+  --enable-vof) vof=yes
+  ;;
+  --disable-vof) vof=no
+  ;;
   *)
       echo "ERROR: unknown option $opt"
       echo "Try '$0 --help' for more information"
@@ -1940,6 +1945,7 @@  disabled with --disable-FEATURE, default is enabled if available
   multiprocess    Out of process device emulation support
   gio             libgio support
   slirp-smbd      use smbd (at path --smbd=*) in slirp networking
+  vof             Virtual Open Firmware support (powerpc/pseries, experimental)
 
 NOTE: The object files are built at the place where configure is launched
 EOF
@@ -5555,6 +5561,9 @@  if test "$slirp_smbd" = "yes" ; then
   echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak
   echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak
 fi
+if test "$vof" = "yes" ; then
+  echo "CONFIG_VOF=y" >> $config_host_mak
+fi
 if test "$vde" = "yes" ; then
   echo "CONFIG_VDE=y" >> $config_host_mak
   echo "VDE_LIBS=$vde_libs" >> $config_host_mak
diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
new file mode 100644
index 000000000000..aa1678c4d889
--- /dev/null
+++ b/pc-bios/vof/Makefile
@@ -0,0 +1,23 @@ 
+all: build-all
+
+build-all: vof.bin
+
+CROSS ?=
+CC = $(CROSS)gcc
+LD = $(CROSS)ld
+OBJCOPY = $(CROSS)objcopy
+
+%.o: %.S
+	$(CC) -m32 -mbig-endian -mcpu=power4 -c -o $@ $<
+
+%.o: %.c
+	$(CC) -m32 -mbig-endian -mcpu=power4 -c -fno-stack-protector -o $@ $<
+
+vof.elf: entry.o main.o ci.o bootmem.o libc.o
+	$(LD) -nostdlib -e_start -Tvof.lds -EB -o $@ $^
+
+%.bin: %.elf
+	$(OBJCOPY) -O binary -j .text -j .data -j .toc -j .got2 $^ $@
+
+clean:
+	rm -f *.o vof.bin vof.elf *~
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index f05219f75ef6..39b5581ae650 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -12,6 +12,9 @@ 
 #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
 #include "hw/ppc/xics.h"        /* For ICSState */
 #include "hw/ppc/spapr_tpm_proxy.h"
+#ifdef CONFIG_VOF
+#include "hw/ppc/vof.h"
+#endif
 
 struct SpaprVioBus;
 struct SpaprPhbState;
@@ -180,6 +183,9 @@  struct SpaprMachineState {
     uint64_t kernel_addr;
     uint32_t initrd_base;
     long initrd_size;
+#ifdef CONFIG_VOF
+    Vof *vof;
+#endif
     uint64_t rtc_offset; /* Now used only during incoming migration */
     struct PPCTimebase tb;
     bool has_graphics;
@@ -558,7 +564,9 @@  struct SpaprMachineState {
 /* Client Architecture support */
 #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
 #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
-#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
+/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
+#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
+#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
 
 /*
  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
@@ -956,4 +964,19 @@  bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize,
 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
 hwaddr spapr_get_rtas_addr(void);
 bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
+
+#ifdef CONFIG_VOF
+void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
+                     target_ulong *stack_ptr, Error **errp);
+void spapr_vof_quiesce(MachineState *ms);
+bool spapr_vof_setprop(MachineState *ms, const char *path, const char *propname,
+                       void *val, int vallen);
+target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
+                                target_ulong opcode, target_ulong *args);
+target_ulong spapr_vof_client_architecture_support(MachineState *ms,
+                                                   CPUState *cs,
+                                                   target_ulong ovec_addr);
+void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
+#endif
+
 #endif /* HW_SPAPR_H */
diff --git a/include/hw/ppc/vof.h b/include/hw/ppc/vof.h
new file mode 100644
index 000000000000..65ca2fed0d41
--- /dev/null
+++ b/include/hw/ppc/vof.h
@@ -0,0 +1,55 @@ 
+/*
+ * Virtual Open Firmware
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#ifndef HW_VOF_H
+#define HW_VOF_H
+
+typedef struct Vof {
+    uint64_t top_addr; /* copied from rma_size */
+    GArray *claimed; /* array of OfClaimed (see hw/ppc/vof.c) */
+    uint64_t claimed_base;
+    GHashTable *of_instances; /* ihandle -> OfInstance (see hw/ppc/vof.c) */
+    uint32_t of_instance_last;
+    char *bootargs; /* "/chosen/bootargs" set by the client via setprop */
+    long fw_size; /* firmware image size, so the firmware can claim itself */
+} Vof;
+
+int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
+                    target_ulong args_real);
+uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, uint64_t align);
+void vof_init(Vof *vof, uint64_t top_addr, Error **errp);
+void vof_cleanup(Vof *vof);
+void vof_build_dt(void *fdt, Vof *vof);
+uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
+                               const char *prop, const char *path);
+
+#define TYPE_VOF_MACHINE_IF "vof-machine-if"
+
+typedef struct VofMachineIfClass VofMachineIfClass;
+DECLARE_CLASS_CHECKERS(VofMachineIfClass, VOF_MACHINE, TYPE_VOF_MACHINE_IF)
+
+struct VofMachineIfClass {
+    InterfaceClass parent;
+    target_ulong (*client_architecture_support)(MachineState *ms, CPUState *cs,
+                                                target_ulong vec);
+    void (*quiesce)(MachineState *ms);
+    bool (*setprop)(MachineState *ms, const char *path, const char *propname,
+                    void *val, int vallen);
+};
+
+/*
+ * Initial stack size is from
+ * https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html
+ */
+#define VOF_STACK_SIZE       0x8000
+
+#define VOF_MEM_READ(pa, buf, size) \
+    address_space_read(&address_space_memory, \
+    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
+#define VOF_MEM_WRITE(pa, buf, size) \
+    address_space_write(&address_space_memory, \
+    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
+
+#endif /* HW_VOF_H */
diff --git a/pc-bios/vof/vof.h b/pc-bios/vof/vof.h
new file mode 100644
index 000000000000..2d8958076907
--- /dev/null
+++ b/pc-bios/vof/vof.h
@@ -0,0 +1,43 @@ 
+/*
+ * Virtual Open Firmware
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include <stdarg.h>
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned long uint32_t;
+typedef unsigned long long uint64_t;
+#define NULL (0)
+#define PROM_ERROR (-1u)
+typedef unsigned long ihandle;
+typedef unsigned long phandle;
+typedef int size_t;
+typedef void client(void);
+
+/* globals */
+extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) */
+
+void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
+
+/* libc */
+int strlen(const char *s);
+int strcmp(const char *s1, const char *s2);
+void *memcpy(void *dest, const void *src, size_t n);
+int memcmp(const void *ptr1, const void *ptr2, size_t n);
+void *memmove(void *dest, const void *src, size_t n);
+void *memset(void *dest, int c, size_t size);
+
+/* CI wrappers */
+void ci_panic(const char *str);
+phandle ci_finddevice(const char *path);
+uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len);
+
+/* booting from -kernel */
+void boot_from_memory(uint64_t initrd, uint64_t initrdsize);
+
+/* Entry points for CI and RTAS */
+extern uint32_t ci_entry(uint32_t params);
+extern unsigned long hv_rtas(unsigned long params);
+extern unsigned int hv_rtas_size;
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 4dd90b75cc52..6d747d72c614 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -101,6 +101,7 @@ 
 #define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
 #define FW_MAX_SIZE             0x400000
 #define FW_FILE_NAME            "slof.bin"
+#define FW_FILE_NAME_VOF        "vof.bin"
 #define FW_OVERHEAD             0x2800000
 #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
 
@@ -1639,22 +1640,40 @@  static void spapr_machine_reset(MachineState *machine)
     fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;
 
     fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
+#ifdef CONFIG_VOF
+    if (spapr->vof) {
+        target_ulong stack_ptr = 0;
 
-    rc = fdt_pack(fdt);
+        spapr_vof_reset(spapr, fdt, &stack_ptr, &error_fatal);
 
-    /* Should only fail if we've built a corrupted tree */
-    assert(rc == 0);
+        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
+                                  stack_ptr, spapr->initrd_base,
+                                  spapr->initrd_size);
+        /* VOF is 32bit BE so enforce MSR here */
+        first_ppc_cpu->env.msr &= ~((1ULL << MSR_SF) | (1ULL << MSR_LE));
+        /*
+         * Do not pack the FDT as the client may change properties.
+         * VOF client does not expect the FDT so we do not load it to the VM.
+         */
+    } else
+#endif
+    {
+        rc = fdt_pack(fdt);
+        /* Should only fail if we've built a corrupted tree */
+        assert(rc == 0);
 
-    /* Load the fdt */
+        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
+                                  0, fdt_addr, 0);
+        cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
+    }
     qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
-    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
+
     g_free(spapr->fdt_blob);
     spapr->fdt_size = fdt_totalsize(fdt);
     spapr->fdt_initial_size = spapr->fdt_size;
     spapr->fdt_blob = fdt;
 
     /* Set up the entry state */
-    spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, fdt_addr, 0);
     first_ppc_cpu->env.gpr[5] = 0;
 
     spapr->fwnmi_system_reset_addr = -1;
@@ -2657,7 +2676,12 @@  static void spapr_machine_init(MachineState *machine)
     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
     MachineClass *mc = MACHINE_GET_CLASS(machine);
-    const char *bios_name = machine->firmware ?: FW_FILE_NAME;
+    const char *bios_default =
+#ifdef CONFIG_VOF
+        !!spapr->vof ? FW_FILE_NAME_VOF :
+#endif
+        FW_FILE_NAME;
+    const char *bios_name = machine->firmware ?: bios_default;
     const char *kernel_filename = machine->kernel_filename;
     const char *initrd_filename = machine->initrd_filename;
     PCIHostState *phb;
@@ -3014,6 +3038,12 @@  static void spapr_machine_init(MachineState *machine)
     }
 
     qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
+#ifdef CONFIG_VOF
+    if (spapr->vof) {
+        spapr->vof->fw_size = fw_size; /* for claim() on itself */
+        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
+    }
+#endif
 }
 
 #define DEFAULT_KVM_TYPE "auto"
@@ -3204,6 +3234,30 @@  static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
     }
 }
 
+#ifdef CONFIG_VOF
+static bool spapr_get_vof(Object *obj, Error **errp)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
+
+    return !!spapr->vof; /* "x-vof" reads as on iff a Vof is allocated */
+}
+
+static void spapr_set_vof(Object *obj, bool value, Error **errp)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
+
+    /* Always drop the previous state first, whatever the new value is */
+    if (spapr->vof) {
+        vof_cleanup(spapr->vof);
+        g_free(spapr->vof);
+        spapr->vof = NULL;
+    }
+    if (value) {
+        spapr->vof = g_malloc0(sizeof(*spapr->vof));
+    }
+}
+#endif
+
 static char *spapr_get_ic_mode(Object *obj, Error **errp)
 {
     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
@@ -3329,6 +3383,12 @@  static void spapr_instance_init(Object *obj)
                                     stringify(KERNEL_LOAD_ADDR)
                                     " for -kernel is the default");
     spapr->kernel_addr = KERNEL_LOAD_ADDR;
+#ifdef CONFIG_VOF
+    object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
+    object_property_set_description(obj, "x-vof",
+                                    "Enable Virtual Open Firmware (experimental)");
+#endif
+
     /* The machine class defines the default interrupt controller mode */
     spapr->irq = smc->irq;
     object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
@@ -4580,6 +4640,16 @@  static void spapr_machine_class_init(ObjectClass *oc, void *data)
     smc->smp_threads_vsmt = true;
     smc->nr_xirqs = SPAPR_NR_XIRQS;
     xfc->match_nvt = spapr_match_nvt;
+
+#ifdef CONFIG_VOF
+    {
+        VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
+        vmc->client_architecture_support =
+            spapr_vof_client_architecture_support;
+        vmc->quiesce = spapr_vof_quiesce;
+        vmc->setprop = spapr_vof_setprop;
+    }
+#endif
 }
 
 static const TypeInfo spapr_machine_info = {
@@ -4599,6 +4669,9 @@  static const TypeInfo spapr_machine_info = {
         { TYPE_XICS_FABRIC },
         { TYPE_INTERRUPT_STATS_PROVIDER },
         { TYPE_XIVE_FABRIC },
+#ifdef CONFIG_VOF
+        { TYPE_VOF_MACHINE_IF },
+#endif
         { }
     },
 };
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index f25014afda40..986a4de34128 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1080,7 +1080,7 @@  target_ulong do_client_architecture_support(PowerPCCPU *cpu,
     SpaprOptionVector *ov1_guest, *ov5_guest;
     bool guest_radix;
     bool raw_mode_supported = false;
-    bool guest_xive;
+    bool guest_xive, reset_fdt = false;
     CPUState *cs;
     void *fdt;
     uint32_t max_compat = spapr->max_compat_pvr;
@@ -1233,8 +1233,10 @@  target_ulong do_client_architecture_support(PowerPCCPU *cpu,
         spapr_setup_hpt(spapr);
     }
 
-    fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
-
+#ifdef CONFIG_VOF
+    reset_fdt = spapr->vof != NULL;
+#endif
+    fdt = spapr_build_fdt(spapr, reset_fdt, fdt_bufsize);
     g_free(spapr->fdt_blob);
     spapr->fdt_size = fdt_totalsize(fdt);
     spapr->fdt_initial_size = spapr->fdt_size;
@@ -1277,6 +1279,27 @@  static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
     return ret;
 }
 
+#ifdef CONFIG_VOF
+target_ulong spapr_vof_client_architecture_support(MachineState *ms,
+                                                   CPUState *cs,
+                                                   target_ulong ovec_addr)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
+    target_ulong rc;
+
+    rc = do_client_architecture_support(POWERPC_CPU(cs), spapr, ovec_addr,
+                                        FDT_MAX_SIZE);
+    /*
+     * do_client_architecture_support() rebuilds spapr->fdt_blob and does not
+     * pack it, so it is alright to update it here: this adds stdout and
+     * generates phandles, as is done for the boottime FDT.
+     */
+    spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob);
+
+    return rc;
+}
+#endif
+
 static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
                                               SpaprMachineState *spapr,
                                               target_ulong opcode,
diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c
new file mode 100644
index 000000000000..653d376f38aa
--- /dev/null
+++ b/hw/ppc/spapr_vof.c
@@ -0,0 +1,153 @@ 
+/*
+ * SPAPR machine hooks to Virtual Open Firmware,
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include <sys/ioctl.h>
+#include "qapi/error.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_vio.h"
+#include "hw/ppc/fdt.h"
+#include "sysemu/sysemu.h"
+#include "qom/qom-qobject.h"
+#include "trace.h"
+
+/*
+ * KVMPPC_H_VOF_CLIENT handler: forward the call described by args[0] to VOF.
+ */
+target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
+                                target_ulong opcode, target_ulong *_args)
+{
+    int rc = vof_client_call(MACHINE(spapr), spapr->vof, spapr->fdt_blob,
+                             ppc64_phys_to_real(_args[0]));
+
+    return rc ? H_PARAMETER : H_SUCCESS;
+}
+
+/* Apply VOF additions to @fdt: vof_build_dt(), bootargs and open stdout. */
+void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt)
+{
+    g_autofree char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
+    int chosen;
+
+    vof_build_dt(fdt, spapr->vof);
+
+    _FDT(chosen = fdt_path_offset(fdt, "/chosen"));
+    _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
+                            spapr->vof->bootargs ? : ""));
+
+    /*
+     * Without SLOF, early kernel printk needs an open stdout instance; all
+     * phandles are settled by now so the default console can be opened.
+     */
+    if (stdout_path) {
+        _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", "stdout",
+                                   stdout_path));
+    }
+}
+
+void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
+                     target_ulong *stack_ptr, Error **errp)
+{
+    Vof *vof = spapr->vof;
+
+    vof_init(vof, spapr->rma_size, errp); /* NOTE(review): errp not checked */
+
+    *stack_ptr = vof_claim(vof, 0, VOF_STACK_SIZE, VOF_STACK_SIZE);
+    if (*stack_ptr == -1) {
+        error_setg(errp, "Memory allocation for stack failed");
+        return;
+    }
+    /* Stack grows downwards plus reserve space for the minimum stack frame */
+    *stack_ptr += VOF_STACK_SIZE - 0x20;
+
+    if (spapr->kernel_size &&
+        vof_claim(vof, spapr->kernel_addr, spapr->kernel_size, 0) == -1) {
+        error_setg(errp, "Memory for kernel is in use");
+        return;
+    }
+
+    if (spapr->initrd_size &&
+        vof_claim(vof, spapr->initrd_base, spapr->initrd_size, 0) == -1) {
+        error_setg(errp, "Memory for initramdisk is in use");
+        return;
+    }
+
+    spapr_vof_client_dt_finalize(spapr, fdt);
+
+    /*
+     * At this point the expected allocation map is (sizes vary with images):
+     *
+     * 0..c38 - the initial firmware
+     * 8000..10000 - stack
+     * 400000.. - kernel
+     * 3ea0000.. - initramdisk
+     *
+     * We skip writing FDT as nothing expects it; OF client interface is
+     * going to be used for reading the device tree.
+     */
+}
+
+void spapr_vof_quiesce(MachineState *ms)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
+
+    /* Record the blob's current total size as both current and initial size */
+    spapr->fdt_initial_size = spapr->fdt_size = fdt_totalsize(spapr->fdt_blob);
+}
+
+bool spapr_vof_setprop(MachineState *ms, const char *path, const char *propname,
+                       void *val, int vallen)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(ms);
+
+    /*
+     * We only allow changing properties which we know how to update in QEMU
+     * OR
+     * the ones which we know that they need to survive during "quiesce".
+     */
+
+    if (strcmp(path, "/rtas") == 0) {
+        if (strcmp(propname, "linux,rtas-base") == 0 ||
+            strcmp(propname, "linux,rtas-entry") == 0) {
+            /* These need to survive quiesce so let them store in the FDT */
+            return true;
+        }
+    }
+
+    if (strcmp(path, "/chosen") == 0) {
+        if (strcmp(propname, "bootargs") == 0) {
+            Vof *vof = spapr->vof;
+
+            g_free(vof->bootargs);
+            vof->bootargs = g_strndup(val, vallen);
+            return true;
+        }
+        if (strcmp(propname, "linux,initrd-start") == 0) {
+            if (vallen == sizeof(uint32_t)) {
+                spapr->initrd_base = ldl_be_p(val);
+                return true;
+            }
+            if (vallen == sizeof(uint64_t)) {
+                spapr->initrd_base = ldq_be_p(val);
+                return true;
+            }
+            return false;
+        }
+        if (strcmp(propname, "linux,initrd-end") == 0) {
+            if (vallen == sizeof(uint32_t)) {
+                spapr->initrd_size = ldl_be_p(val) - spapr->initrd_base;
+                return true;
+            }
+            if (vallen == sizeof(uint64_t)) {
+                spapr->initrd_size = ldq_be_p(val) - spapr->initrd_base;
+                return true;
+            }
+            return false;
+        }
+    }
+
+    return true; /* NOTE(review): all other properties are accepted too */
+}
diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
new file mode 100644
index 000000000000..1068a1e58388
--- /dev/null
+++ b/hw/ppc/vof.c
@@ -0,0 +1,1052 @@ 
+/*
+ * QEMU PowerPC Virtual Open Firmware.
+ *
+ * This implements client interface from OpenFirmware IEEE1275 on the QEMU
+ * side to leave only a very basic firmware in the VM.
+ *
+ * Copyright (c) 2021 IBM Corporation.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/timer.h"
+#include "qemu/range.h"
+#include "qemu/units.h"
+#include "qapi/error.h"
+#include <sys/ioctl.h>
+#include "exec/ram_addr.h"
+#include "exec/address-spaces.h"
+#include "hw/ppc/vof.h"
+#include "hw/ppc/fdt.h"
+#include "sysemu/runstate.h"
+#include "qom/qom-qobject.h"
+#include "trace.h"
+
+#include <libfdt.h>
+
+/*
+ * OF 1275 "nextprop" description suggests it is 32 bytes max but
+ * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 chars long.
+ */
+#define OF_PROPNAME_LEN_MAX 64
+
+#define VOF_MAX_PATH        256
+#define VOF_MAX_SETPROPLEN  2048
+#define VOF_MAX_METHODLEN   256
+#define VOF_MAX_FORTHCODE   256
+#define VOF_VTY_BUF_SIZE    256
+
+/* One claimed memory range covering [start, start + size) */
+typedef struct {
+    uint64_t start;
+    uint64_t size;
+} OfClaimed;
+
+/* An opened package instance; stored in Vof::of_instances keyed by ihandle */
+typedef struct {
+    char *path; /* the path used to open the instance */
+    uint32_t phandle; /* phandle of the node the path resolved to */
+} OfInstance;
+
+/*
+ * Copies @size bytes from guest address @pa into @buf and checks that the
+ * result holds a NUL-terminated string.
+ *
+ * Returns 0 on success; -1 if the memory read failed or no terminator was
+ * found within @size bytes (in which case @buf is forcibly terminated and
+ * the truncation is traced).
+ */
+static int readstr(hwaddr pa, char *buf, int size)
+{
+    bool terminated;
+
+    if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) {
+        return -1;
+    }
+
+    terminated = memchr(buf, '\0', size) != NULL;
+    if (terminated) {
+        return 0;
+    }
+
+    buf[size - 1] = '\0';
+    trace_vof_error_str_truncated(buf, size);
+    return -1;
+}
+
+/*
+ * Checks whether the requested service @s (called with @nargs/@nret) is the
+ * service @s1. A zero @nargscheck or @nretcheck disables the corresponding
+ * count check; a name match with a wrong count is traced and rejected.
+ */
+static bool cmpservice(const char *s, unsigned nargs, unsigned nret,
+                       const char *s1, unsigned nargscheck, unsigned nretcheck)
+{
+    bool nargs_ok, nret_ok;
+
+    if (strcmp(s, s1) != 0) {
+        return false;
+    }
+
+    nargs_ok = !nargscheck || nargs == nargscheck;
+    nret_ok = !nretcheck || nret == nretcheck;
+    if (nargs_ok && nret_ok) {
+        return true;
+    }
+
+    trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret);
+    return false;
+}
+
+/*
+ * Formats property value @prop of @len bytes into @tval (capacity @tlen) for
+ * tracing.
+ *
+ * A value consisting of printable ASCII with a single trailing NUL is copied
+ * as a string; anything else is dumped as upper case hex bytes with a space
+ * after every 4 bytes, ending in "..." if it does not fit.
+ * The output is always NUL-terminated, whatever @tval contained on entry.
+ */
+static void prop_format(char *tval, int tlen, const void *prop, int len)
+{
+    int i;
+    const unsigned char *c;
+    char *t;
+    const char bin[] = "...";
+
+    if (tlen <= 0) {
+        return;
+    }
+    tval[0] = '\0';
+
+    for (i = 0, c = prop; i < len; ++i, ++c) {
+        if (*c == '\0' && i == len - 1) {
+            /* strncpy() does not terminate on truncation, do it here */
+            strncpy(tval, prop, tlen - 1);
+            tval[tlen - 1] = '\0';
+            return;
+        }
+        if (*c < 0x20 || *c >= 0x80) {
+            break;
+        }
+    }
+
+    for (i = 0, c = prop, t = tval; i < len; ++i, ++c) {
+        size_t used = t - tval;
+
+        /*
+         * Keep room for a separator, two hex digits, "..." and the NUL.
+         * Compare offsets rather than pointers so that a small @tlen cannot
+         * form a pointer before the start of the buffer.
+         */
+        if (used + sizeof(bin) + 1 + 2 + 1 >= (size_t)tlen) {
+            if (used + sizeof(bin) <= (size_t)tlen) {
+                memcpy(t, bin, sizeof(bin));
+            }
+            return;
+        }
+        if (i && i % 4 == 0 && i != len - 1) {
+            *t++ = ' ';
+        }
+        t += sprintf(t, "%02X", *c & 0xFF);
+    }
+}
+
+/*
+ * Stores the full path of the node at @offset in @buf (capacity @len).
+ * Returns the path length including the terminating NUL, or the negative
+ * libfdt error reported by fdt_get_path().
+ */
+static int get_path(const void *fdt, int offset, char *buf, int len)
+{
+    int rc = fdt_get_path(fdt, offset, buf, len - 1);
+
+    if (rc >= 0) {
+        /* Terminate unconditionally before measuring the string */
+        buf[len - 1] = '\0';
+        rc = strlen(buf) + 1;
+    }
+
+    return rc;
+}
+
+/*
+ * Resolves phandle @ph to a node and stores its full path in @buf.
+ * Returns what get_path() returns, or the negative libfdt error if the
+ * phandle lookup fails.
+ */
+static int phandle_to_path(const void *fdt, uint32_t ph, char *buf, int len)
+{
+    int nodeoff = fdt_node_offset_by_phandle(fdt, ph);
+
+    return nodeoff < 0 ? nodeoff : get_path(fdt, nodeoff, buf, len);
+}
+
+/*
+ * Looks up @path in @fdt, tolerating upper case hex digits in unit addresses.
+ * Returns the node offset or a negative libfdt error.
+ */
+static int path_offset(const void *fdt, const char *path)
+{
+    g_autofree char *p = NULL;
+    char *at;
+
+    /*
+     * The addresses in node names are expected to be in lower case as per
+     * https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html
+     */
+    if (!strchr(path, '@')) {
+        return fdt_path_offset(fdt, path);
+    }
+
+    p = g_strdup(path);
+    for (at = strchr(p, '@'); at; at = strchr(at, '@')) {
+        /*
+         * Only touch the unit address, i.e. what sits between '@' and the
+         * next '/': the names of the following nodes are case sensitive.
+         * The cast keeps tolower() defined for bytes >= 0x80.
+         */
+        for (++at; *at && *at != '/'; ++at) {
+            *at = tolower((unsigned char)*at);
+        }
+    }
+    return fdt_path_offset(fdt, p);
+}
+
+/*
+ * "finddevice": reads a node path from guest address @nodeaddr and returns
+ * the phandle of that node, or -1 if the string cannot be read or the path
+ * is not found.
+ */
+static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr)
+{
+    char fullnode[VOF_MAX_PATH];
+    uint32_t phandle = -1;
+    int nodeoff;
+
+    if (readstr(nodeaddr, fullnode, sizeof(fullnode)) != 0) {
+        return phandle;
+    }
+
+    nodeoff = path_offset(fdt, fullnode);
+    if (nodeoff >= 0) {
+        phandle = fdt_get_phandle(fdt, nodeoff);
+    }
+    trace_vof_finddevice(fullnode, phandle);
+
+    return phandle;
+}
+
+/*
+ * Returns a pointer to the value of @propname in the node at @nodeoff and
+ * stores its length in @proplen, or returns NULL if there is no such property.
+ *
+ * A missing "name" property is emulated from the node name with the unit
+ * address cut off; the returned length then counts one extra byte for the
+ * terminator. If @write0 is not NULL, it is set to true for the emulated
+ * "name" (the caller must write the trailing zero itself) and to false for a
+ * real property; it is left untouched when NULL is returned.
+ */
+static const void *getprop(const void *fdt, int nodeoff, const char *propname,
+                           int *proplen, bool *write0)
+{
+    const char *unit, *prop;
+    const void *ret = fdt_getprop(fdt, nodeoff, propname, proplen);
+
+    if (ret) {
+        if (write0) {
+            *write0 = false;
+        }
+        return ret;
+    }
+
+    /*
+     * The "name" property is not actually expected as a property in the FDT
+     * (although some platform may create those in "/" so we try getprop first),
+     * we emulate it by returning a pointer to the node's name and adjust
+     * proplen to include only the name but not the unit.
+     */
+    if (strcmp(propname, "name")) {
+        return NULL;
+    }
+    prop = fdt_get_name(fdt, nodeoff, proplen);
+    if (!prop) {
+        *proplen = 0;
+        return NULL;
+    }
+
+    unit = memchr(prop, '@', *proplen);
+    if (unit) {
+        *proplen = unit - prop;
+    }
+    /* Account for the terminator byte which follows the name */
+    *proplen += 1;
+
+    /*
+     * Since it might be cut at "@" and there will be no trailing zero
+     * in the prop buffer, tell the caller to write zero at the end.
+     */
+    if (write0) {
+        *write0 = true;
+    }
+    return prop;
+}
+
+/*
+ * "getprop": copies at most @vallen bytes of property @pname (a guest address
+ * of the name string) of node @nodeph to guest address @valaddr.
+ *
+ * Returns the full property length, or -1 if the node or the property does
+ * not exist, the name cannot be read or the guest memory write fails.
+ */
+static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, uint32_t pname,
+                            uint32_t valaddr, uint32_t vallen)
+{
+    char propname[OF_PROPNAME_LEN_MAX + 1];
+    uint32_t ret = 0;
+    int proplen = 0;
+    const void *prop;
+    char trval[64] = "";
+    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
+    bool write0;
+
+    if (nodeoff < 0) {
+        return -1;
+    }
+    if (readstr(pname, propname, sizeof(propname))) {
+        return -1;
+    }
+    /* write0 is only set by getprop() when it returns non-NULL */
+    prop = getprop(fdt, nodeoff, propname, &proplen, &write0);
+    if (prop) {
+        const char zero = 0;
+        int cb = MIN(proplen, vallen);
+
+        if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK ||
+            /* if that was "name" with a unit address, overwrite '@' with NUL */
+            (write0 &&
+             cb == proplen &&
+             VOF_MEM_WRITE(valaddr + cb - 1, &zero, 1) != MEMTX_OK)) {
+            ret = -1;
+        } else {
+            /*
+             * OF1275 says:
+             * "Size is either the actual size of the property, or -1 if name
+             * does not exist", hence returning proplen instead of cb.
+             */
+            ret = proplen;
+            /* Do not format a value if tracepoint is silent, for performance */
+            if (trace_event_get_state(TRACE_VOF_GETPROP) &&
+                qemu_loglevel_mask(LOG_TRACE)) {
+                prop_format(trval, sizeof(trval), prop, ret);
+            }
+        }
+    } else {
+        ret = -1;
+    }
+    trace_vof_getprop(nodeph, propname, ret, trval);
+
+    return ret;
+}
+
+/*
+ * "getproplen": returns the length of property @pname (a guest address of
+ * the name string) of node @nodeph, or -1 if the node or the property does
+ * not exist or the name cannot be read.
+ */
+static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, uint32_t pname)
+{
+    char propname[OF_PROPNAME_LEN_MAX + 1];
+    int proplen = 0;
+    uint32_t ret;
+    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
+
+    /* The name is only fetched once the node is known to exist */
+    if (nodeoff < 0 || readstr(pname, propname, sizeof(propname))) {
+        return -1;
+    }
+
+    if (getprop(fdt, nodeoff, propname, &proplen, NULL)) {
+        ret = proplen;
+    } else {
+        ret = -1;
+    }
+    trace_vof_getproplen(nodeph, propname, ret);
+
+    return ret;
+}
+
+/*
+ * "setprop": sets property @pname (a guest address of the name string) of
+ * node @nodeph to the @vallen bytes found at guest address @valaddr.
+ *
+ * The machine, if it implements the VOF interface with a setprop hook, may
+ * veto the change. Returns @vallen on success and -1 on any failure.
+ */
+static uint32_t vof_setprop(MachineState *ms, void *fdt, Vof *vof,
+                            uint32_t nodeph, uint32_t pname,
+                            uint32_t valaddr, uint32_t vallen)
+{
+    /* Initialised as the exit trace prints it even on early failures */
+    char propname[OF_PROPNAME_LEN_MAX + 1] = "";
+    uint32_t ret = -1;
+    /*
+     * libfdt results are kept in a signed variable: storing them in @ret
+     * would hide negative errors and leak intermediate values to the client.
+     */
+    int offset, rc;
+    char trval[64] = "";
+    char nodepath[VOF_MAX_PATH] = "";
+    Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF);
+    g_autofree char *val = NULL;
+
+    if (vallen > VOF_MAX_SETPROPLEN) {
+        goto trace_exit;
+    }
+    if (readstr(pname, propname, sizeof(propname))) {
+        goto trace_exit;
+    }
+    offset = fdt_node_offset_by_phandle(fdt, nodeph);
+    if (offset < 0) {
+        goto trace_exit;
+    }
+    rc = get_path(fdt, offset, nodepath, sizeof(nodepath));
+    if (rc <= 0) {
+        goto trace_exit;
+    }
+
+    val = g_malloc0(vallen);
+    if (VOF_MEM_READ(valaddr, val, vallen) != MEMTX_OK) {
+        goto trace_exit;
+    }
+
+    if (vmo) {
+        VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo);
+
+        if (vmc->setprop &&
+            !vmc->setprop(ms, nodepath, propname, val, vallen)) {
+            goto trace_exit;
+        }
+    }
+
+    rc = fdt_setprop(fdt, offset, propname, val, vallen);
+    if (rc) {
+        goto trace_exit;
+    }
+
+    /* Do not format a value if tracepoint is silent, for performance */
+    if (trace_event_get_state(TRACE_VOF_SETPROP) &&
+        qemu_loglevel_mask(LOG_TRACE)) {
+        prop_format(trval, sizeof(trval), val, vallen);
+    }
+    ret = vallen;
+
+trace_exit:
+    trace_vof_setprop(nodeph, propname, trval, vallen, ret);
+
+    return ret;
+}
+
+/*
+ * "nextprop": writes to guest address @nameaddr the name of the property
+ * which follows the one named by the string at guest address @prevaddr in
+ * node @phandle; an empty previous name selects the first property.
+ *
+ * Returns 1 if a name was written, 0 if there is no next property (or the
+ * previous one was not found), -1 if guest memory cannot be accessed.
+ */
+static uint32_t vof_nextprop(const void *fdt, uint32_t phandle,
+                             uint32_t prevaddr, uint32_t nameaddr)
+{
+    int offset, nodeoff = fdt_node_offset_by_phandle(fdt, phandle);
+    char prev[OF_PROPNAME_LEN_MAX + 1];
+    const char *tmp;
+
+    if (readstr(prevaddr, prev, sizeof(prev))) {
+        return -1;
+    }
+
+    fdt_for_each_property_offset(offset, fdt, nodeoff) {
+        if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
+            return 0;
+        }
+        if (prev[0] == '\0' || strcmp(prev, tmp) == 0) {
+            /* Found @prev: step to its successor unless the first is wanted */
+            if (prev[0] != '\0') {
+                offset = fdt_next_property_offset(fdt, offset);
+                if (offset < 0) {
+                    return 0;
+                }
+            }
+            if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
+                return 0;
+            }
+
+            if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != MEMTX_OK) {
+                return -1;
+            }
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * "peer": returns the phandle of the next sibling of node @phandle, or of
+ * the root node if @phandle is 0. Returns 0 if there is no such node or
+ * @phandle is unknown.
+ */
+static uint32_t vof_peer(const void *fdt, uint32_t phandle)
+{
+    int offset;
+
+    if (phandle == 0) {
+        offset = fdt_path_offset(fdt, "/");
+    } else {
+        offset = fdt_node_offset_by_phandle(fdt, phandle);
+        /*
+         * Do not feed a lookup error into the tree walker: a negative
+         * offset such as -1 means "start from the beginning" to
+         * fdt_next_node() rather than "invalid node".
+         */
+        if (offset >= 0) {
+            offset = fdt_next_subnode(fdt, offset);
+        }
+    }
+
+    if (offset < 0) {
+        return 0;
+    }
+
+    return fdt_get_phandle(fdt, offset);
+}
+
+/*
+ * "child": returns the phandle of the first child of node @phandle, or 0 if
+ * the node has no children or @phandle is unknown.
+ */
+static uint32_t vof_child(const void *fdt, uint32_t phandle)
+{
+    int offset = fdt_node_offset_by_phandle(fdt, phandle);
+
+    /*
+     * Do not feed a lookup error into the tree walker: a negative offset
+     * such as -1 means "start from the beginning" to fdt_next_node(), which
+     * would make an unknown phandle resolve to the root node.
+     */
+    if (offset >= 0) {
+        offset = fdt_first_subnode(fdt, offset);
+    }
+
+    if (offset < 0) {
+        return 0;
+    }
+
+    return fdt_get_phandle(fdt, offset);
+}
+
+/*
+ * "parent": returns the phandle of the parent of node @phandle, or 0 if
+ * libfdt cannot provide one.
+ */
+static uint32_t vof_parent(const void *fdt, uint32_t phandle)
+{
+    int nodeoff = fdt_node_offset_by_phandle(fdt, phandle);
+    int parentoff = fdt_parent_offset(fdt, nodeoff);
+
+    if (parentoff < 0) {
+        return 0;
+    }
+
+    return fdt_get_phandle(fdt, parentoff);
+}
+
+/*
+ * Creates an instance for the node at @offset, remembering @path as the path
+ * it was opened with, and registers it under a freshly allocated ihandle.
+ * Returns the ihandle, or -1 once the ihandle space is exhausted.
+ */
+static uint32_t vof_do_open(void *fdt, Vof *vof, int offset, const char *path)
+{
+    uint32_t ihandle = -1;
+    OfInstance *inst = NULL;
+
+    /* We do not recycle ihandles yet */
+    if (vof->of_instance_last != 0xFFFFFFFF) {
+        inst = g_new0(OfInstance, 1);
+        inst->phandle = fdt_get_phandle(fdt, offset);
+        g_assert(inst->phandle);
+
+        vof->of_instance_last += 1;
+        inst->path = g_strdup(path);
+        g_hash_table_insert(vof->of_instances,
+                            GINT_TO_POINTER(vof->of_instance_last),
+                            inst);
+        ihandle = vof->of_instance_last;
+    }
+
+    trace_vof_open(path, inst ? inst->phandle : 0, ihandle);
+
+    return ihandle;
+}
+
+/*
+ * Opens @path and stores the resulting ihandle as a cell in property @prop
+ * of node @nodename. Returns the fdt_setprop_cell() result, or the negative
+ * libfdt error if @path cannot be resolved.
+ */
+uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
+                               const char *prop, const char *path)
+{
+    int node = fdt_path_offset(fdt, nodename);
+    int target = fdt_path_offset(fdt, path);
+    int ihandle;
+
+    if (target >= 0) {
+        ihandle = vof_do_open(fdt, vof, target, path);
+        return fdt_setprop_cell(fdt, node, prop, ihandle);
+    }
+
+    trace_vof_error_unknown_path(path);
+    return target;
+}
+
+/*
+ * "open": opens the node whose path is the string at guest address
+ * @pathaddr. Returns the new ihandle, or -1 if the path cannot be read or
+ * resolved, or no ihandle could be allocated.
+ */
+static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr)
+{
+    char path[VOF_MAX_PATH];
+    int offset;
+
+    if (readstr(pathaddr, path, sizeof(path))) {
+        return -1;
+    }
+
+    offset = path_offset(fdt, path);
+    if (offset < 0) {
+        trace_vof_error_unknown_path(path);
+        /* Do not expose the raw libfdt error code as a bogus ihandle */
+        return -1;
+    }
+
+    return vof_do_open(fdt, vof, offset, path);
+}
+
+/* "close": drops the instance registered as @ihandle, tracing unknown ones */
+static void vof_close(Vof *vof, uint32_t ihandle)
+{
+    gboolean removed = g_hash_table_remove(vof->of_instances,
+                                           GINT_TO_POINTER(ihandle));
+
+    if (removed) {
+        return;
+    }
+    trace_vof_error_unknown_ihandle_close(ihandle);
+}
+
+/*
+ * "instance-to-package": returns the phandle recorded for @ihandle, or -1 if
+ * there is no such instance.
+ */
+static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle)
+{
+    OfInstance *inst = g_hash_table_lookup(vof->of_instances,
+                                           GINT_TO_POINTER(ihandle));
+    uint32_t phandle = -1;
+
+    if (inst != NULL) {
+        phandle = inst->phandle;
+    }
+    trace_vof_instance_to_package(ihandle, phandle);
+
+    return phandle;
+}
+
+/*
+ * "package-to-path": stores the full path of node @phandle in the guest
+ * buffer @buf of @len bytes; characters which do not fit are not stored.
+ *
+ * Returns the path length including the terminating NUL, or -1 if @phandle
+ * is unknown or the guest memory write fails.
+ */
+static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle,
+                                    uint32_t buf, uint32_t len)
+{
+    uint32_t ret = -1;
+    char tmp[VOF_MAX_PATH] = "";
+    /*
+     * Signed on purpose: a negative libfdt error stored in an unsigned
+     * variable would pass the "> 0" test and be used as a huge copy size.
+     */
+    int rc = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
+
+    if (rc > 0 &&
+        VOF_MEM_WRITE(buf, tmp, MIN((uint32_t)rc, len)) == MEMTX_OK) {
+        ret = rc;
+    }
+
+    trace_vof_package_to_path(phandle, tmp, ret);
+
+    return ret;
+}
+
+/*
+ * "instance-to-path": stores the full path of the node behind @ihandle in
+ * the guest buffer @buf of @len bytes; characters which do not fit are not
+ * stored.
+ *
+ * Returns the path length including the terminating NUL, or -1 if @ihandle
+ * is unknown, its node cannot be found or the guest memory write fails.
+ */
+static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t ihandle,
+                                     uint32_t buf, uint32_t len)
+{
+    uint32_t ret = -1;
+    uint32_t phandle = vof_instance_to_package(vof, ihandle);
+    char tmp[VOF_MAX_PATH] = "";
+
+    if (phandle != -1) {
+        /*
+         * Signed on purpose: a negative libfdt error stored in an unsigned
+         * variable would pass the "> 0" test and be used as a huge copy size.
+         */
+        int rc = phandle_to_path(fdt, phandle, tmp, sizeof(tmp));
+
+        if (rc > 0 &&
+            VOF_MEM_WRITE(buf, tmp, MIN((uint32_t)rc, len)) == MEMTX_OK) {
+            ret = rc;
+        }
+    }
+    trace_vof_instance_to_path(ihandle, phandle, tmp, ret);
+
+    return ret;
+}
+
+/*
+ * "write": consumes @len bytes from guest address @buf for instance
+ * @ihandle. There is no backend yet so the data only goes to the trace log.
+ *
+ * Returns the number of bytes consumed, or -1 if @ihandle is unknown or the
+ * guest memory cannot be read.
+ */
+static uint32_t vof_write(Vof *vof, uint32_t ihandle, uint32_t buf,
+                          uint32_t len)
+{
+    char tmp[VOF_VTY_BUF_SIZE];
+    unsigned cb;
+    uint32_t done = 0;
+    OfInstance *inst = (OfInstance *)
+        g_hash_table_lookup(vof->of_instances, GINT_TO_POINTER(ihandle));
+
+    if (!inst) {
+        trace_vof_error_write(ihandle);
+        return -1;
+    }
+
+    while (done < len) {
+        /* One byte of @tmp is reserved for the terminator added below */
+        cb = MIN(len - done, sizeof(tmp) - 1);
+        /* Advance through the guest buffer instead of rereading its start */
+        if (VOF_MEM_READ(buf + done, tmp, cb) != MEMTX_OK) {
+            return -1;
+        }
+
+        /* FIXME: there is no backend(s) yet so just call a trace */
+        if (trace_event_get_state(TRACE_VOF_WRITE) &&
+            qemu_loglevel_mask(LOG_TRACE)) {
+            tmp[cb] = '\0';
+            trace_vof_write(ihandle, cb, tmp);
+        }
+        done += cb;
+    }
+
+    return done;
+}
+
+/* Traces every entry of @claimed; does nothing if the tracepoint is silent */
+static void vof_claimed_dump(GArray *claimed)
+{
+    guint idx;
+
+    if (!trace_event_get_state(TRACE_VOF_CLAIMED) ||
+        !qemu_loglevel_mask(LOG_TRACE)) {
+        return;
+    }
+
+    for (idx = 0; idx < claimed->len; ++idx) {
+        const OfClaimed *entry = &g_array_index(claimed, OfClaimed, idx);
+
+        trace_vof_claimed(entry->start, entry->start + entry->size,
+                          entry->size);
+    }
+}
+
+/*
+ * Returns true if the range of @size bytes at @virt overlaps none of the
+ * ranges recorded in @claimed.
+ */
+static bool vof_claim_avail(GArray *claimed, uint64_t virt, uint64_t size)
+{
+    guint idx = 0;
+
+    while (idx < claimed->len) {
+        const OfClaimed *entry = &g_array_index(claimed, OfClaimed, idx);
+
+        if (ranges_overlap(entry->start, entry->size, virt, size)) {
+            return false;
+        }
+        ++idx;
+    }
+
+    return true;
+}
+
+/* Appends the range of @size bytes at @virt to @claimed */
+static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t size)
+{
+    OfClaimed newclaim = {
+        .start = virt,
+        .size = size,
+    };
+
+    g_array_append_val(claimed, newclaim);
+}
+
+/*
+ * GCompareFunc ordering OfClaimed entries by ascending start address.
+ *
+ * The addresses are compared explicitly: squeezing a 64bit difference into
+ * a gint drops the upper bits and may flip the sign.
+ */
+static gint of_claimed_compare_func(gconstpointer a, gconstpointer b)
+{
+    const OfClaimed *ca = a;
+    const OfClaimed *cb = b;
+
+    return (ca->start > cb->start) - (ca->start < cb->start);
+}
+
+/*
+ * Rebuilds the "available" property of "/memory@0" from @claimed: one
+ * (address, size) pair per gap which follows a claimed range, the last gap
+ * ending where the "reg" of "/memory@0" says the memory ends.
+ *
+ * Sorts @claimed by start address as a side effect. Does nothing if @fdt or
+ * @claimed is NULL. @base is currently unused.
+ */
+static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base)
+{
+    int i, offset, proplen = 0, sc, ac;
+    target_ulong mem0_end;
+    const uint8_t *mem0_reg;
+    g_autofree uint8_t *avail = NULL;
+    uint8_t *availcur;
+
+    if (!fdt || !claimed) {
+        return;
+    }
+
+    offset = fdt_path_offset(fdt, "/");
+    _FDT(offset);
+    ac = fdt_address_cells(fdt, offset);
+    g_assert(ac == 1 || ac == 2);
+    sc = fdt_size_cells(fdt, offset);
+    g_assert(sc == 1 || sc == 2);
+
+    offset = fdt_path_offset(fdt, "/memory@0");
+    _FDT(offset);
+
+    mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen);
+    g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc));
+    /*
+     * Property data is only guaranteed to be 4 byte aligned, hence the
+     * alignment-safe accessors instead of dereferencing casted pointers.
+     */
+    if (sc == 2) {
+        mem0_end = ldq_be_p(mem0_reg + sizeof(uint32_t) * ac);
+    } else {
+        mem0_end = (uint32_t)ldl_be_p(mem0_reg + sizeof(uint32_t) * ac);
+    }
+
+    g_array_sort(claimed, of_claimed_compare_func);
+    vof_claimed_dump(claimed);
+
+    /*
+     * VOF resides in the first page so we do not need to check if there is
+     * available memory before the first claimed block
+     */
+    g_assert(claimed->len && (g_array_index(claimed, OfClaimed, 0).start == 0));
+
+    avail = g_malloc0(sizeof(uint32_t) * (ac + sc) * claimed->len);
+    for (i = 0, availcur = avail; i < claimed->len; ++i) {
+        OfClaimed c = g_array_index(claimed, OfClaimed, i);
+        uint64_t start, size;
+
+        start = c.start + c.size;
+        if (i < claimed->len - 1) {
+            OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1);
+
+            size = cn.start - start;
+        } else {
+            /* A claim may end beyond the memory, do not let this wrap */
+            size = mem0_end > start ? mem0_end - start : 0;
+        }
+
+        if (ac == 2) {
+            stq_be_p(availcur, start);
+        } else {
+            stl_be_p(availcur, start);
+        }
+        availcur += sizeof(uint32_t) * ac;
+        if (sc == 2) {
+            stq_be_p(availcur, size);
+        } else {
+            stl_be_p(availcur, size);
+        }
+        availcur += sizeof(uint32_t) * sc;
+
+        if (size) {
+            trace_vof_avail(c.start + c.size, c.start + c.size + size, size);
+        }
+    }
+    _FDT((fdt_setprop(fdt, offset, "available", avail, availcur - avail)));
+}
+
+/*
+ * OF1275:
+ * "Allocates size bytes of memory. If align is zero, the allocated range
+ * begins at the virtual address virt. Otherwise, an aligned address is
+ * automatically chosen and the input argument virt is ignored".
+ *
+ * In other words, a zero @align requests the exact address @virt (which may
+ * be 0, this is how the firmware image itself gets claimed) and a non-zero
+ * @align requests any free @align-aligned block which fits below top_addr.
+ *
+ * Returns the start of the claimed block, or -1 if @size is zero, the fixed
+ * range overlaps an existing claim or no suitable block is left.
+ */
+uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size,
+                   uint64_t align)
+{
+    uint64_t ret = -1;
+
+    if (size == 0) {
+        /* Nothing to claim, report a failure */
+    } else if (align == 0) {
+        if (vof_claim_avail(vof->claimed, virt, size)) {
+            ret = virt;
+        }
+    } else {
+        uint64_t cur = QEMU_ALIGN_UP(vof->claimed_base, align);
+
+        /* The whole block, not just its first byte, must fit under the top */
+        while (cur < vof->top_addr && size <= vof->top_addr - cur) {
+            if (vof_claim_avail(vof->claimed, cur, size)) {
+                ret = cur;
+                break;
+            }
+            /* Skipping a busy range must not break the alignment */
+            cur = QEMU_ALIGN_UP(cur + size, align);
+        }
+        if (ret == -1) {
+            error_report("Out of RMA memory for the OF client");
+        }
+    }
+
+    if (ret != -1) {
+        vof->claimed_base = MAX(vof->claimed_base, ret + size);
+        vof_claim_add(vof->claimed, ret, size);
+    }
+    trace_vof_claim(virt, size, align, ret);
+
+    return ret;
+}
+
+/*
+ * "release": forgets the claim which exactly matches @virt and @size.
+ * Returns 0 if such a claim was found and removed, -1 otherwise.
+ */
+static uint32_t vof_release(Vof *vof, uint64_t virt, uint64_t size)
+{
+    GArray *claimed = vof->claimed;
+    uint32_t ret = -1;
+    guint idx;
+
+    for (idx = 0; idx < claimed->len; ++idx) {
+        const OfClaimed *entry = &g_array_index(claimed, OfClaimed, idx);
+
+        if (entry->start != virt || entry->size != size) {
+            continue;
+        }
+        g_array_remove_index(claimed, idx);
+        ret = 0;
+        break;
+    }
+
+    trace_vof_release(virt, size, ret);
+
+    return ret;
+}
+
+/*
+ * Reports through @errp that "instantiate-rtas" has reached QEMU although
+ * the firmware is expected to handle it itself.
+ */
+static void vof_instantiate_rtas(Error **errp)
+{
+    error_setg(errp, "The firmware should have instantiated RTAS");
+}
+
+/*
+ * "call-method": runs the method named by the string at guest address
+ * @methodaddr on the instance @ihandle.
+ *
+ * Only "ibm,client-architecture-support" on "/" (forwarded to the machine
+ * with @param1) and "instantiate-rtas" on "/rtas" are recognised; @param2
+ * to @param4 are unused. Returns the method result or -1 on failure; the
+ * second return value is stored in @ret2.
+ */
+static uint32_t vof_call_method(MachineState *ms, Vof *vof, uint32_t methodaddr,
+                                uint32_t ihandle, uint32_t param1,
+                                uint32_t param2, uint32_t param3,
+                                uint32_t param4, uint32_t *ret2)
+{
+    uint32_t ret = -1;
+    char method[VOF_MAX_METHODLEN] = "";
+    OfInstance *inst;
+
+    if (!ihandle) {
+        goto trace_exit;
+    }
+
+    inst = (OfInstance *) g_hash_table_lookup(vof->of_instances,
+                                              GINT_TO_POINTER(ihandle));
+    if (!inst) {
+        goto trace_exit;
+    }
+
+    if (readstr(methodaddr, method, sizeof(method))) {
+        goto trace_exit;
+    }
+
+    /* Dispatch on the path the instance was opened with */
+    if (strcmp(inst->path, "/") == 0) {
+        if (strcmp(method, "ibm,client-architecture-support") == 0) {
+            Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF);
+
+            if (vmo) {
+                VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo);
+
+                g_assert(vmc->client_architecture_support);
+                ret = vmc->client_architecture_support(ms, first_cpu, param1);
+            }
+
+            *ret2 = 0;
+        }
+    } else if (strcmp(inst->path, "/rtas") == 0) {
+        if (strcmp(method, "instantiate-rtas") == 0) {
+            /* Sets an error in error_fatal, the firmware should handle this */
+            vof_instantiate_rtas(&error_fatal);
+            ret = 0;
+            *ret2 = param1; /* rtas-base */
+        }
+    } else {
+        /* Only reached for instances other than "/" and "/rtas" */
+        trace_vof_error_unknown_method(method);
+    }
+
+trace_exit:
+    trace_vof_method(ihandle, method, param1, ret, *ret2);
+
+    return ret;
+}
+
+/*
+ * "interpret": not implemented; the command string at guest address
+ * @cmdaddr is only traced and -1 is always returned. @ret2 is read for the
+ * trace but never written.
+ */
+static uint32_t vof_call_interpret(uint32_t cmdaddr, uint32_t param1,
+                                   uint32_t param2, uint32_t *ret2)
+{
+    uint32_t ret = -1;
+    char cmd[VOF_MAX_FORTHCODE] = "";
+
+    /* No interpret implemented so just call a trace */
+    /* A readstr() failure is ignored on purpose: @cmd is for tracing only */
+    readstr(cmdaddr, cmd, sizeof(cmd));
+    trace_vof_interpret(cmd, param1, param2, ret, *ret2);
+
+    return ret;
+}
+
+/*
+ * "quiesce": packs the FDT, lets the machine run its quiesce hook (if it
+ * has one) and dumps the claimed ranges to the trace log.
+ */
+static void vof_quiesce(MachineState *ms, void *fdt, Vof *vof)
+{
+    Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF);
+    VofMachineIfClass *vmc;
+    /* After "quiesce", no change is expected to the FDT, pack FDT to ensure */
+    int rc = fdt_pack(fdt);
+
+    assert(rc == 0);
+
+    vmc = vmo ? VOF_MACHINE_GET_CLASS(vmo) : NULL;
+    if (vmc && vmc->quiesce) {
+        vmc->quiesce(ms);
+    }
+
+    vof_claimed_dump(vof->claimed);
+}
+
+/*
+ * Dispatches the client interface call @service with its @nargs arguments
+ * in @args to the matching handler.
+ *
+ * Returns the first return value of the call (-1 for an unknown service or
+ * a wrong number of arguments/returns); further return values, if any, are
+ * stored in @rets.
+ */
+static uint32_t vof_client_handle(MachineState *ms, void *fdt, Vof *vof,
+                                  const char *service,
+                                  uint32_t *args, unsigned nargs,
+                                  uint32_t *rets, unsigned nrets)
+{
+    uint32_t ret = 0;
+
+    /* @nrets includes the value which this function returns */
+    /* A zero count in cmpserv() means "do not check that count" */
+#define cmpserv(s, a, r) \
+    cmpservice(service, nargs, nrets, (s), (a), (r))
+
+    if (cmpserv("finddevice", 1, 1)) {
+        ret = vof_finddevice(fdt, args[0]);
+    } else if (cmpserv("getprop", 4, 1)) {
+        ret = vof_getprop(fdt, args[0], args[1], args[2], args[3]);
+    } else if (cmpserv("getproplen", 2, 1)) {
+        ret = vof_getproplen(fdt, args[0], args[1]);
+    } else if (cmpserv("setprop", 4, 1)) {
+        ret = vof_setprop(ms, fdt, vof, args[0], args[1], args[2], args[3]);
+    } else if (cmpserv("nextprop", 3, 1)) {
+        ret = vof_nextprop(fdt, args[0], args[1], args[2]);
+    } else if (cmpserv("peer", 1, 1)) {
+        ret = vof_peer(fdt, args[0]);
+    } else if (cmpserv("child", 1, 1)) {
+        ret = vof_child(fdt, args[0]);
+    } else if (cmpserv("parent", 1, 1)) {
+        ret = vof_parent(fdt, args[0]);
+    } else if (cmpserv("open", 1, 1)) {
+        ret = vof_open(fdt, vof, args[0]);
+    } else if (cmpserv("close", 1, 0)) {
+        vof_close(vof, args[0]);
+    } else if (cmpserv("instance-to-package", 1, 1)) {
+        ret = vof_instance_to_package(vof, args[0]);
+    } else if (cmpserv("package-to-path", 3, 1)) {
+        ret = vof_package_to_path(fdt, args[0], args[1], args[2]);
+    } else if (cmpserv("instance-to-path", 3, 1)) {
+        ret = vof_instance_to_path(fdt, vof, args[0], args[1], args[2]);
+    } else if (cmpserv("write", 3, 1)) {
+        ret = vof_write(vof, args[0], args[1], args[2]);
+    } else if (cmpserv("claim", 3, 1)) {
+        ret = vof_claim(vof, args[0], args[1], args[2]);
+        /* Any change to the claims is reflected in "/memory@0/available" */
+        if (ret != -1) {
+            vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
+        }
+    } else if (cmpserv("release", 2, 0)) {
+        ret = vof_release(vof, args[0], args[1]);
+        if (ret != -1) {
+            vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
+        }
+    } else if (cmpserv("call-method", 0, 0)) {
+        /*
+         * NOTE(review): the argument count is not checked here while
+         * args[0..5] are read - confirm the caller provides at least 6
+         * initialised entries.
+         */
+        ret = vof_call_method(ms, vof, args[0], args[1], args[2], args[3],
+                              args[4], args[5], rets);
+    } else if (cmpserv("interpret", 0, 0)) {
+        ret = vof_call_interpret(args[0], args[1], args[2], rets);
+    } else if (cmpserv("milliseconds", 0, 1)) {
+        ret = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
+    } else if (cmpserv("quiesce", 0, 0)) {
+        vof_quiesce(ms, fdt, vof);
+    } else if (cmpserv("exit", 0, 0)) {
+        error_report("Stopped as the VM requested \"exit\"");
+        vm_stop(RUN_STATE_PAUSED);
+    } else {
+        trace_vof_error_unknown_service(service, nargs, nrets);
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/*
+ * The client interface argument array as it sits in guest memory.
+ * Defined as Big Endian.
+ */
+struct prom_args {
+    uint32_t service; /* guest address of the service name string */
+    uint32_t nargs;   /* number of input cells in args[] */
+    uint32_t nret;    /* number of return cells, they follow the inputs */
+    uint32_t args[10];
+} QEMU_PACKED;
+
+/*
+ * Entry point of the client interface: reads the argument array from guest
+ * address @args_real, runs the requested service and writes the return
+ * values back right after the input arguments.
+ *
+ * Returns 0 once the call has been dispatched (the status of the service
+ * itself goes to the guest via the first return cell), -EINVAL if the
+ * argument array is malformed or guest memory cannot be accessed.
+ */
+int vof_client_call(MachineState *ms, Vof *vof, void *fdt,
+                    target_ulong args_real)
+{
+    struct prom_args args_be;
+    /* Zeroed as some handlers read more arguments than the client passed */
+    uint32_t args[ARRAY_SIZE(args_be.args)] = { 0 };
+    uint32_t rets[ARRAY_SIZE(args_be.args)] = { 0 }, ret;
+    char service[64];
+    unsigned nargs, nret, i;
+
+    if (address_space_rw(&address_space_memory, args_real,
+                         MEMTXATTRS_UNSPECIFIED, &args_be, sizeof(args_be),
+                         false) != MEMTX_OK) {
+        return -EINVAL;
+    }
+    nargs = be32_to_cpu(args_be.nargs);
+    if (nargs >= ARRAY_SIZE(args_be.args)) {
+        return -EINVAL;
+    }
+    /*
+     * The return cells share args[] with the input ones. @nret comes from
+     * the guest so it must be checked before it is used as an index and as
+     * a copy size below.
+     */
+    nret = be32_to_cpu(args_be.nret);
+    if (nret > ARRAY_SIZE(args_be.args) - nargs) {
+        return -EINVAL;
+    }
+
+    if (address_space_rw(&address_space_memory, be32_to_cpu(args_be.service),
+                         MEMTXATTRS_UNSPECIFIED, service, sizeof(service),
+                         false) != MEMTX_OK) {
+        return -EINVAL;
+    }
+    if (strnlen(service, sizeof(service)) == sizeof(service)) {
+        /* Too long service name */
+        return -EINVAL;
+    }
+
+    for (i = 0; i < nargs; ++i) {
+        args[i] = be32_to_cpu(args_be.args[i]);
+    }
+
+    ret = vof_client_handle(ms, fdt, vof, service, args, nargs, rets, nret);
+    if (!nret) {
+        return 0;
+    }
+
+    /* The first return cell is the service result, the rest is in rets[] */
+    args_be.args[nargs] = cpu_to_be32(ret);
+    for (i = 1; i < nret; ++i) {
+        args_be.args[nargs + i] = cpu_to_be32(rets[i - 1]);
+    }
+
+    if (address_space_rw(&address_space_memory,
+                         args_real + offsetof(struct prom_args, args[nargs]),
+                         MEMTXATTRS_UNSPECIFIED, args_be.args + nargs,
+                         sizeof(args_be.args[0]) * nret, true) != MEMTX_OK) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+/* Value destructor of the instance table: frees the path and the instance */
+static void vof_instance_free(gpointer data)
+{
+    OfInstance *instance = data;
+    char *opened_path = instance->path;
+
+    g_free(opened_path);
+    g_free(instance);
+}
+
+/*
+ * (Re)initialises @vof: drops the previous state via vof_cleanup(), creates
+ * an empty instance table and claim list, caps the allocation ceiling at
+ * min(@top_addr, 4GiB) and claims the first vof->fw_size bytes at address 0
+ * for the firmware. Sets @errp if that claim fails.
+ *
+ * NOTE(review): claimed_base and of_instance_last are not reset here -
+ * confirm the caller resets them before re-initialising.
+ */
+void vof_init(Vof *vof, uint64_t top_addr, Error **errp)
+{
+    vof_cleanup(vof);
+
+    /* Keys are ihandles stored directly in the pointer, values are owned */
+    vof->of_instances = g_hash_table_new_full(g_direct_hash, g_direct_equal,
+                                              NULL, vof_instance_free);
+    vof->claimed = g_array_new(false, false, sizeof(OfClaimed));
+
+    /* Keep allocations in 32bit as CLI ABI can only return cells==32bit */
+    vof->top_addr = MIN(top_addr, 4 * GiB);
+    if (vof_claim(vof, 0, vof->fw_size, 0) == -1) {
+        error_setg(errp, "Memory for firmware is in use");
+    }
+}
+
+/*
+ * Releases the claim list and the instance table of @vof, if present, and
+ * leaves both pointers NULL.
+ */
+void vof_cleanup(Vof *vof)
+{
+    GArray *claimed = vof->claimed;
+    GHashTable *instances = vof->of_instances;
+
+    vof->claimed = NULL;
+    vof->of_instances = NULL;
+
+    if (claimed != NULL) {
+        g_array_unref(claimed);
+    }
+    if (instances != NULL) {
+        g_hash_table_unref(instances);
+    }
+}
+
+/*
+ * Finalises @fdt for the client: gives every node which lacks a "phandle"
+ * property a fresh one, counting up from the highest phandle in use, and
+ * publishes the available memory.
+ */
+void vof_build_dt(void *fdt, Vof *vof)
+{
+    uint32_t phandle = fdt_get_max_phandle(fdt);
+    int offset = fdt_next_node(fdt, -1, NULL);
+
+    /* Assign phandles to nodes without predefined phandles (like XICS/XIVE) */
+    while (offset >= 0) {
+        int proplen = 0;
+
+        if (!fdt_getprop(fdt, offset, "phandle", &proplen)) {
+            ++phandle;
+            _FDT(fdt_setprop_cell(fdt, offset, "phandle", phandle));
+        }
+        offset = fdt_next_node(fdt, offset, NULL);
+    }
+
+    vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
+}
+
+/* QOM interface type through which a machine provides its VOF hooks */
+static const TypeInfo vof_machine_if_info = {
+    .name = TYPE_VOF_MACHINE_IF,
+    .parent = TYPE_INTERFACE,
+    .class_size = sizeof(VofMachineIfClass),
+};
+
+/* Registers the interface type above; hooked up via type_init() below */
+static void vof_machine_if_register_types(void)
+{
+    type_register_static(&vof_machine_if_info);
+}
+type_init(vof_machine_if_register_types)
diff --git a/pc-bios/vof/bootmem.c b/pc-bios/vof/bootmem.c
new file mode 100644
index 000000000000..771b9e95f95d
--- /dev/null
+++ b/pc-bios/vof/bootmem.c
@@ -0,0 +1,14 @@ 
+#include "vof.h"
+
+/*
+ * Boots the kernel described by the "qemu,boot-kernel" property of
+ * "/chosen", handing it @initrd and @initrdsize. Returns without booting if
+ * the property does not have the expected size.
+ */
+void boot_from_memory(uint64_t initrd, uint64_t initrdsize)
+{
+    uint64_t kern[2];
+    phandle chosen = ci_finddevice("/chosen");
+    uint32_t got = ci_getprop(chosen, "qemu,boot-kernel", kern, sizeof(kern));
+
+    if (got == sizeof(kern)) {
+        /* Only the first cell is passed on */
+        do_boot(kern[0], initrd, initrdsize);
+    }
+}
diff --git a/pc-bios/vof/ci.c b/pc-bios/vof/ci.c
new file mode 100644
index 000000000000..a80806580dd0
--- /dev/null
+++ b/pc-bios/vof/ci.c
@@ -0,0 +1,91 @@ 
+#include "vof.h"
+
+/* Client interface argument array as handed to ci_entry() */
+struct prom_args {
+    uint32_t service; /* address of the service name string */
+    uint32_t nargs;   /* number of input cells in args[] */
+    uint32_t nret;    /* number of return cells, they follow the inputs */
+    uint32_t args[10];
+};
+
+typedef unsigned long prom_arg_t;
+
+#define ADDR(x) (uint32_t)(x)
+
+/*
+ * Handles the "call-method" of "instantiate-rtas" inside the firmware by
+ * copying the RTAS blob to the address passed in args[2].
+ *
+ * Returns 0 if the call was handled here (return cells filled in), -1 if
+ * it is some other call or "rtas-size" is too small for the blob.
+ */
+static int prom_handle(struct prom_args *pargs)
+{
+    void *rtasbase;
+    uint32_t rtassize = 0;
+    phandle rtas;
+
+    if (strcmp("call-method", (void *)(unsigned long) pargs->service)) {
+        return -1;
+    }
+
+    if (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0])) {
+        return -1;
+    }
+
+    rtas = ci_finddevice("/rtas");
+    /* rtas-size is set by QEMU depending of FWNMI support */
+    /* If the property cannot be read, rtassize stays 0 and the check fails */
+    ci_getprop(rtas, "rtas-size", &rtassize, sizeof(rtassize));
+    if (rtassize < hv_rtas_size) {
+        return -1;
+    }
+
+    rtasbase = (void *)(unsigned long) pargs->args[2];
+
+    memcpy(rtasbase, hv_rtas, hv_rtas_size);
+    /* First return cell: 0; second return cell: the base just used */
+    pargs->args[pargs->nargs] = 0;
+    pargs->args[pargs->nargs + 1] = pargs->args[2];
+
+    return 0;
+}
+
+/*
+ * Client interface entry: calls which the firmware does not handle itself
+ * are forwarded to ci_entry().
+ */
+void prom_entry(uint32_t args)
+{
+    struct prom_args *pargs = (void *)(unsigned long) args;
+
+    if (prom_handle(pargs) != 0) {
+        ci_entry(args);
+    }
+}
+
+/*
+ * Builds the argument array for @service from the @nargs variadic arguments
+ * (each fetched as prom_arg_t), zeroes the @nret return cells and passes the
+ * array to ci_entry().
+ *
+ * Returns PROM_ERROR if ci_entry() reports a negative value, otherwise the
+ * first return cell (0 if @nret is 0).
+ *
+ * NOTE(review): @nargs + @nret is not checked against the size of args[] -
+ * callers must keep it within 10 cells.
+ */
+static int call_ci(const char *service, int nargs, int nret, ...)
+{
+    int i;
+    struct prom_args args;
+    va_list list;
+
+    args.service = ADDR(service);
+    args.nargs = nargs;
+    args.nret = nret;
+
+    va_start(list, nret);
+    for (i = 0; i < nargs; i++) {
+        args.args[i] = va_arg(list, prom_arg_t);
+    }
+    va_end(list);
+
+    for (i = 0; i < nret; i++) {
+        args.args[nargs + i] = 0;
+    }
+
+    if (ci_entry((uint32_t)(&args)) < 0) {
+        return PROM_ERROR;
+    }
+
+    return (nret > 0) ? args.args[nargs] : 0;
+}
+
+/* Issues the "exit" client call; @str is currently unused */
+void ci_panic(const char *str)
+{
+    call_ci("exit", 0, 0);
+}
+
+/* Issues the "finddevice" client call for @path and returns its result */
+phandle ci_finddevice(const char *path)
+{
+    return call_ci("finddevice", 1, 1, path);
+}
+
+/*
+ * Issues the "getprop" client call: asks for property @propname of node @ph
+ * to be stored in @prop (@len bytes) and returns the result of the call.
+ */
+uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len)
+{
+    return call_ci("getprop", 4, 1, ph, propname, prop, len);
+}
diff --git a/pc-bios/vof/libc.c b/pc-bios/vof/libc.c
new file mode 100644
index 000000000000..00c10e6e7da1
--- /dev/null
+++ b/pc-bios/vof/libc.c
@@ -0,0 +1,92 @@ 
+#include "vof.h"
+
+/* Count the characters in @s, not including the terminating NUL. */
+int strlen(const char *s)
+{
+    int count;
+
+    for (count = 0; s[count] != 0; count++) {
+        /* nothing: the loop header does all the work */
+    }
+
+    return count;
+}
+
+/* Compare NUL-terminated strings as unsigned char: <0, 0 or >0 (C rules). */
+int strcmp(const char *s1, const char *s2)
+{
+    const unsigned char *a = (const unsigned char *)s1;
+    const unsigned char *b = (const unsigned char *)s2;
+
+    while (*a != 0 && *a == *b) {
+        a += 1;
+        b += 1;
+    }
+    return *a - *b;
+}
+
+/* Copy @n bytes from @src to @dest front to back; returns @dest. */
+void *memcpy(void *dest, const void *src, size_t n)
+{
+    const char *from = src;
+    char *to = dest;
+    size_t i;
+
+    for (i = 0; i < n; i++) {
+        to[i] = from[i];
+    }
+    return dest;
+}
+
+/*
+ * Compare the first @n bytes of two buffers as unsigned chars. Returns the
+ * difference of the first mismatching pair, or 0 if all @n bytes match.
+ */
+int memcmp(const void *ptr1, const void *ptr2, size_t n)
+{
+    const unsigned char *a = ptr1, *b = ptr2;
+
+    for (; n > 0; n--, a++, b++) {
+        if (*a != *b) {
+            return *a - *b;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Copy @n bytes from @src to @dest, which may overlap; returns @dest.
+ * The byte counter is a size_t to match @n and the arithmetic is done on
+ * char pointers rather than relying on GNU void-pointer arithmetic.
+ */
+void *memmove(void *dest, const void *src, size_t n)
+{
+    char *cdest = dest;
+    const char *csrc = src;
+    size_t i;
+
+    if (csrc < cdest && csrc + n > cdest) {
+        /* @dest starts inside @src: copy from the last byte backwards */
+        for (i = n; i > 0; i--) {
+            cdest[i - 1] = csrc[i - 1];
+        }
+    } else {
+        /* A plain forward copy cannot overwrite unread source bytes */
+        for (i = 0; i < n; i++) {
+            cdest[i] = csrc[i];
+        }
+    }
+
+    return dest;
+}
+
+/* Fill the first @size bytes of @dest with (unsigned char)@c. */
+void *memset(void *dest, int c, size_t size)
+{
+    unsigned char *p = dest;
+
+    for (; size > 0; size--) {
+        *p++ = (unsigned char)c;
+    }
+    return dest;
+}
diff --git a/pc-bios/vof/main.c b/pc-bios/vof/main.c
new file mode 100644
index 000000000000..9fc30d2d0957
--- /dev/null
+++ b/pc-bios/vof/main.c
@@ -0,0 +1,21 @@ 
+#include "vof.h"
+
+void do_boot(unsigned long addr, unsigned long _r3, unsigned long _r4)
+{
+    register unsigned long r3 __asm__("r3") = _r3;
+    register unsigned long r4 __asm__("r4") = _r4;
+    register unsigned long r5 __asm__("r5") = (unsigned long) _prom_entry;
+    /* Call the client at @addr; r3-r5 above are presumably its arguments */
+    ((client *)(uint32_t)addr)();
+}
+
+void entry_c(void)
+{
+    register unsigned long r3 __asm__("r3");
+    register unsigned long r4 __asm__("r4");
+    register unsigned long r5 __asm__("r5"); /* NOTE(review): never read */
+    uint64_t initrd = r3, initrdsize = r4; /* taken as found in r3/r4 on entry */
+    /* Reached only if boot_from_memory() returns */
+    boot_from_memory(initrd, initrdsize);
+    ci_panic("*** No boot target ***\n");
+}
diff --git a/tests/qtest/rtas-test.c b/tests/qtest/rtas-test.c
index 16751dbd2f55..5b8e3d240921 100644
--- a/tests/qtest/rtas-test.c
+++ b/tests/qtest/rtas-test.c
@@ -5,7 +5,7 @@ 
 #include "libqos/libqos-spapr.h"
 #include "libqos/rtas.h"
 
-static void test_rtas_get_time_of_day(void)
+static void run_test_rtas_get_time_of_day(const char *machine)
 {
     QOSState *qs;
     struct tm tm;
@@ -13,7 +13,7 @@  static void test_rtas_get_time_of_day(void)
     uint64_t ret;
     time_t t1, t2;
 
-    qs = qtest_spapr_boot("-machine pseries");
+    qs = qtest_spapr_boot(machine);
 
     t1 = time(NULL);
     ret = qrtas_get_time_of_day(qs->qts, &qs->alloc, &tm, &ns);
@@ -24,6 +24,18 @@  static void test_rtas_get_time_of_day(void)
     qtest_shutdown(qs);
 }
 
+static void test_rtas_get_time_of_day(void)
+{
+    run_test_rtas_get_time_of_day("-machine pseries"); /* without x-vof */
+}
+
+static void test_rtas_get_time_of_day_vof(void)
+{
+#ifdef CONFIG_VOF /* without CONFIG_VOF this test body is empty */
+    run_test_rtas_get_time_of_day("-machine pseries,x-vof=on");
+#endif
+}
+
 int main(int argc, char *argv[])
 {
     const char *arch = qtest_get_arch();
@@ -35,6 +47,7 @@  int main(int argc, char *argv[])
         exit(EXIT_FAILURE);
     }
     qtest_add_func("rtas/get-time-of-day", test_rtas_get_time_of_day);
+    qtest_add_func("rtas/get-time-of-day-vof", test_rtas_get_time_of_day_vof);
 
     return g_test_run();
 }
diff --git a/MAINTAINERS b/MAINTAINERS
index 7d9cd2904264..6fb202f99e90 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1356,6 +1356,18 @@  F: hw/pci-host/mv64361.c
 F: hw/pci-host/mv643xx.h
 F: include/hw/pci-host/mv64361.h
 
+Virtual Open Firmware (VOF)
+M: Alexey Kardashevskiy <aik@ozlabs.ru>
+M: David Gibson <david@gibson.dropbear.id.au>
+M: Greg Kurz <groug@kaod.org>
+L: qemu-ppc@nongnu.org
+S: Maintained
+F: hw/ppc/spapr_vof*
+F: hw/ppc/vof*
+F: include/hw/ppc/vof*
+F: pc-bios/vof/*
+F: pc-bios/vof*
+
 RISC-V Machines
 ---------------
 OpenTitan
diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig
index 66e0b15d9efd..b895720b28b2 100644
--- a/hw/ppc/Kconfig
+++ b/hw/ppc/Kconfig
@@ -144,3 +144,6 @@  config FW_CFG_PPC
 
 config FDT_PPC
     bool
+
+config VOF
+    bool
diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build
index 597d974dd4ff..aa4c8e6a2eac 100644
--- a/hw/ppc/meson.build
+++ b/hw/ppc/meson.build
@@ -84,4 +84,7 @@  ppc_ss.add(when: 'CONFIG_VIRTEX', if_true: files('virtex_ml507.c'))
 # Pegasos2
 ppc_ss.add(when: 'CONFIG_PEGASOS2', if_true: files('pegasos2.c'))
 
+ppc_ss.add(when: 'CONFIG_VOF', if_true: files('vof.c'))
+ppc_ss.add(when: ['CONFIG_VOF', 'CONFIG_PSERIES'], if_true: files('spapr_vof.c'))
+
 hw_arch += {'ppc': ppc_ss}
diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
index 0ba3e403533f..6e90a0107247 100644
--- a/hw/ppc/trace-events
+++ b/hw/ppc/trace-events
@@ -71,6 +71,30 @@  spapr_rtas_ibm_configure_connector_invalid(uint32_t index) "DRC index: 0x%"PRIx3
 spapr_vio_h_reg_crq(uint64_t reg, uint64_t queue_addr, uint64_t queue_len) "CRQ for dev 0x%" PRIx64 " registered at 0x%" PRIx64 "/0x%" PRIx64
 spapr_vio_free_crq(uint32_t reg) "CRQ for dev 0x%" PRIx32 " freed"
 
+# vof.c
+vof_error_str_truncated(const char *s, int len) "%s truncated to %d"
+vof_error_param(const char *method, int nargscheck, int nretcheck, int nargs, int nret) "%s takes/returns %d/%d, not %d/%d"
+vof_error_unknown_service(const char *service, int nargs, int nret) "\"%s\" args=%d rets=%d"
+vof_error_unknown_method(const char *method) "\"%s\""
+vof_error_unknown_ihandle_close(uint32_t ih) "ih=0x%x"
+vof_error_unknown_path(const char *path) "\"%s\""
+vof_error_write(uint32_t ih) "ih=0x%x"
+vof_finddevice(const char *path, uint32_t ph) "\"%s\" => ph=0x%x"
+vof_claim(uint32_t virt, uint32_t size, uint32_t align, uint32_t ret) "virt=0x%x size=0x%x align=0x%x => 0x%x"
+vof_release(uint32_t virt, uint32_t size, uint32_t ret) "virt=0x%x size=0x%x => 0x%x"
+vof_method(uint32_t ihandle, const char *method, uint32_t param, uint32_t ret, uint32_t ret2) "ih=0x%x \"%s\"(0x%x) => 0x%x 0x%x"
+vof_getprop(uint32_t ph, const char *prop, uint32_t ret, const char *val) "ph=0x%x \"%s\" => len=%d [%s]"
+vof_getproplen(uint32_t ph, const char *prop, uint32_t ret) "ph=0x%x \"%s\" => len=%d"
+vof_setprop(uint32_t ph, const char *prop, const char *val, uint32_t vallen, uint32_t ret) "ph=0x%x \"%s\" [%s] len=%d => ret=%d"
+vof_open(const char *path, uint32_t ph, uint32_t ih) "%s ph=0x%x => ih=0x%x"
+vof_interpret(const char *cmd, uint32_t param1, uint32_t param2, uint32_t ret, uint32_t ret2) "[%s] 0x%x 0x%x => 0x%x 0x%x"
+vof_package_to_path(uint32_t ph, const char *tmp, uint32_t ret) "ph=0x%x => %s len=%d"
+vof_instance_to_path(uint32_t ih, uint32_t ph, const char *tmp, uint32_t ret) "ih=0x%x ph=0x%x => %s len=%d"
+vof_instance_to_package(uint32_t ih, uint32_t ph) "ih=0x%x => ph=0x%x"
+vof_write(uint32_t ih, unsigned cb, const char *msg) "ih=0x%x [%u] \"%s\""
+vof_avail(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx64" size=0x%"PRIx64
+vof_claimed(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx64" size=0x%"PRIx64
+
 # ppc.c
 ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
 
diff --git a/meson.build b/meson.build
index 626cf932c1e5..ed011adca89a 100644
--- a/meson.build
+++ b/meson.build
@@ -2700,6 +2700,7 @@  summary_info += {'pixman':            pixman.found()}
 summary_info += {'VTE support':       config_host.has_key('CONFIG_VTE')}
 # TODO: add back version
 summary_info += {'slirp support':     slirp_opt == 'disabled' ? false : slirp_opt}
+summary_info += {'VOF support':        config_host.has_key('CONFIG_VOF')}
 summary_info += {'libtasn1':          config_host.has_key('CONFIG_TASN1')}
 summary_info += {'PAM':               config_host.has_key('CONFIG_AUTH_PAM')}
 summary_info += {'iconv support':     iconv.found()}
diff --git a/pc-bios/README b/pc-bios/README
index c101c9a04f8f..6e6556e91c92 100644
--- a/pc-bios/README
+++ b/pc-bios/README
@@ -16,6 +16,8 @@ 
   https://github.com/aik/SLOF, and the image currently in qemu is
   built from git tag qemu-slof-20210217.
 
+- vof is a minimalistic firmware to work with -machine pseries,x-vof=on.
+
 - sgabios (the Serial Graphics Adapter option ROM) provides a means for
   legacy x86 software to communicate with an attached serial console as
   if a video card were attached.  The master sources reside in a subversion
diff --git a/pc-bios/vof-nvram.bin b/pc-bios/vof-nvram.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d183901cf980a91d81c4348bb20487c7bb62a2ec
GIT binary patch
literal 16384
zcmeI%Jx;?g6bEpZJ8*)oSZeqZi&Z2pKnD)sI4{AHlNb4;RW}a70XPHaW57uo=-#R7
zKSLBhJJ0sdixY3IuY@hzo0r$OmE%T;XE9uh@s1k=AOHafKmY;|fB*y_009U<00Izz
z00bZa0SG_<0uX=z1Rwwb2tWV=XCbip6d#B4{{rX#XR%}$Bm^J;0SG|gWP$!?Aq=-I
zcT+0Ix{{?1q>9J8r+eW^JK1tYYZZMWQCUwW%0S*~w^p@wfkX-<yRFx)H*+YEt0RRd
zmn}6xtwbP`yp4O=>kxMAEA<~5@*g)@mb%KD5!;O~8c)>8rRQBx55=trhk#+1+T3J_
zaf*G4vZAduqy$qda{``6Gnc2DQg<Es<GLxL#9<Oj*zP!8ZSnwf@-j7l47!nFXQO$a
z^Hes6YU^_M<KsM*k~zwOSa+2g3Sx{*Eyu^XrB0FM5IJ-*?8`VvpBc4}vS(+_UKJ;=
xITAns0uX=z1Rwwb2tWV=5P-nt34DD||Nni|VfbXeJORuY0uX=z1R!vE0>7B^s4f5i

literal 0
HcmV?d00001

diff --git a/pc-bios/vof.bin b/pc-bios/vof.bin
new file mode 100755
index 0000000000000000000000000000000000000000..1ec670be82134adcb5ae128732aff6e371281360
GIT binary patch
literal 3784
zcmeHIL1-Lh6n>lC=%gLW9QLr#l}v1e-6f$B_K?xg-PA2?k`e+EC^WLWX18WB$*##L
zqwKgdDcGd6vY{31LDNGBdJsJHpr=+43D`ppJ*-fhdMkMGXwf=;Z|0vS*)$~>4}u-|
znVJ88^S<}K_ue-||L<!cO_V?RF=dHv`_EUqoW_-3XsZ68Z7oY}m-C>g{vfz^9((;=
zR2{WQtrd?N131V}{??u$da-4X{5^achQIqYsco$hpzbT*Q3SrDwbWtb<O;Rd*W1=o
z_bBk{z02W7outMJ$&H(&Sap)E6;kXI7MU-N7<P&zGag(UwL)DSk5(tuI!vLOgt&fm
zen0=K^QT!jYj7O<!@IOvD<3tMQzUCD5$;lL=@`F<#k*p09GetIED~aovlM!0eQ1TU
z6#Q}@F>HvOlgK%SoO^cXd?|*UBR(E-jM#A^vwOq<4r<c=3CGyeK{U9)aXJp_bXVBP
z!VdAY&pg5I27E|}^$q4#OOeuli$ct$^|>W;PMn9Gb2(NRw{|F-PFs(0bb%9Qj!*~J
zMtsklfRFZNVH1L%$aq|N#Y_x`wK#mF!RZi@e~!HX+t|YtwmXGw%(G=X=D<8QVxX>l
z2dnGFJDe|YCH6Ps{Pq5JZV<OL;ygOFkNWkA`rSeO)=|IPyX$vx3Rq{*vlHmqd}EWQ
z8xB>MJ4pv;#&V8o>5r(pnIfy1qs3;O8qF2@w0V=RH#g~vrbA!<)JfkU)-x{0DVo4t
z!FwKe?!(}<6s4IP+y_y^CiXM9^LbYE4eCh6CD_F}==njLy1{dn-?a`3j7!g*xYkK&
zzc#RC{~2wFLWnP8do_~yT<Zj9UQHBHJBjNCED1iD(+Ttl4O-6CZP-biP2f!8+Pi*W
zEBZj$)(5S=^1|2m$Vc{S^~YD7JNJ9qCYAA8{c)cn2Z`}Tzib?=Ulv!nU;3{T^;<-k
znaJrMUS2@o1oHH6=*(h<{X<6>BLIv5FrpsDqtCcEMIHy#vkP|_$JbBvVi^4KtSOcQ
zE#1No0bdpPFYFWlmY9VMW}zRmFtB?TE?B^i0lx>c&<Fe^a0op9%yFT{tUrp-kB~Qq
zPMq=4w@P|}deBFTU8k@)b(Zy>-;?I4*A3P=Gx!>2`Qf)*dr_D4Q|i9q>n*T-edRfc
z!tYierf}x<cLe*9CG?itdkYS0tg9caN3H)B_;&waKHu=0Wcwv=mSpzhYUu*ncqg7L
z@MPCs;6d9tF84cuf5Xqyw=)y&KhU)?<6~of&$Pt0BX86qidyg-MKQyS8R1!mZ`KG|
zOSrS`#==?w-IA7up*3C}sy5^}%Q;fr|Bvl#@b?0@m6N@2T|6HjxMO(U#S`-97$xSd
zHBX=0$2x>OwsF`R<LWkD*k6n}m-~qL_caiQ3T~pTU%0#b&{Xb6Rf{_g@9~+diCcTF
zJ##E{Q9;~&9>?I&<6hD_@Vt{_MdYHumUUSbnz)YdSyOlLx7+tq>$j(G_WMujh?u<j
z83Jz3{U=1nhw*=Ta=$e9!hx(lyPh%dpJX7BNT{O~_038}SM|c&TP0n^)*+fKOi%aD
zl=P`ek;=34dSO;C7xYr^oL-pcM2?FGzV*(RKPX|ZWR!J2J6E197EAAyC$R~K)wxQQ
zj!#Zi=1a5mPHE=d9!BeZyEHdjnr3^VfJ}5m!HQueu`qf^#;~8qI*sQEtRhww%fPb!
E2HT5NwEzGB

literal 0
HcmV?d00001

diff --git a/pc-bios/vof/entry.S b/pc-bios/vof/entry.S
new file mode 100644
index 000000000000..10a101fb6d71
--- /dev/null
+++ b/pc-bios/vof/entry.S
@@ -0,0 +1,49 @@ 
+#define LOAD32(rn, name)    /* rn = 32-bit value of name, in two halves */ \
+	lis     rn,name##@h;    \
+	ori     rn,rn,name##@l
+
+#define ENTRY(func_name)    /* global func_name and .func_name in .text */ \
+	.text;                  \
+	.align  2;              \
+	.globl  .func_name;     \
+	.func_name:             \
+	.globl  func_name;      \
+	func_name:
+
+#define KVMPPC_HCALL_BASE       0xf000
+#define KVMPPC_H_RTAS           (KVMPPC_HCALL_BASE + 0x0) /* used by hv_rtas */
+#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5) /* used by ci_entry */
+
+	. = 0x100 /* Do exactly as SLOF does */
+
+ENTRY(_start)
+	LOAD32(2, __toc_start)	/* r2 = __toc_start (from the linker script) */
+	b entry_c		/* branch without link: entry_c() takes over */
+
+ENTRY(_prom_entry)
+	LOAD32(2, __toc_start)	/* r2 = __toc_start */
+	stwu    %r1,-112(%r1)	/* push a 112-byte stack frame */
+	stw     %r31,104(%r1)	/* save r31 in the frame */
+	mflr    %r31		/* keep the return address in r31 across the call */
+	bl prom_entry		/* C handler; r3 is passed through untouched */
+	nop
+	mtlr    %r31		/* restore the return address */
+	lwz     %r31,104(%r1)	/* restore r31 */
+	addi    %r1,%r1,112	/* pop the frame */
+	blr
+
+ENTRY(ci_entry)
+	mr	4,3				/* r4 = caller's argument (from r3) */
+	LOAD32(3,KVMPPC_H_VOF_CLIENT)	/* r3 = hcall number */
+	sc	1				/* level-1 system call, i.e. a hypercall */
+	blr
+
+/* This is the actual RTAS blob copied to the OS at instantiate-rtas */
+ENTRY(hv_rtas)
+	mr      %r4,%r3			/* r4 = caller's argument (from r3) */
+	LOAD32(3,KVMPPC_H_RTAS)		/* r3 = hcall number */
+	sc	1			/* level-1 system call, i.e. a hypercall */
+	blr
+	.globl hv_rtas_size
+hv_rtas_size:
+	.long . - hv_rtas;		/* byte length of the code above */
diff --git a/pc-bios/vof/vof.lds b/pc-bios/vof/vof.lds
new file mode 100644
index 000000000000..1506ab4b0185
--- /dev/null
+++ b/pc-bios/vof/vof.lds
@@ -0,0 +1,48 @@ 
+OUTPUT_FORMAT("elf32-powerpc")
+OUTPUT_ARCH(powerpc:common)
+
+/* set the entry point: entry.S defines _start, there is no __start */
+ENTRY ( _start )
+
+SECTIONS {
+	__executable_start = .;
+
+	.text : {
+		*(.text)
+	}
+
+	__etext = .;
+
+	. = ALIGN(8);
+
+	.data : {
+		*(.data)
+		*(.rodata .rodata.*)	/* read-only data is placed in .data too */
+		*(.got1)
+		*(.sdata)
+		*(.opd)
+	}
+
+	/* FIXME bss at end ??? */
+
+	. = ALIGN(8);
+	__bss_start = .;
+	.bss : {
+		*(.sbss) *(.scommon)
+		*(.dynbss)
+		*(.bss)
+	}
+
+	. = ALIGN(8);
+	__bss_end = .;
+	__bss_size = (__bss_end - __bss_start);
+
+	. = ALIGN(256);
+	/* __toc_start is what entry.S loads into r2 */
+	__toc_start = DEFINED (.TOC.) ? .TOC. : ADDR (.got) + 0x8000;
+	.got :
+	{
+		 *(.toc .got)
+	}
+	. = ALIGN(8); __toc_end = .;
+}