diff mbox series

[qemu,v14] spapr: Implement Open Firmware client interface

Message ID 20210224054130.4540-1-aik@ozlabs.ru
State New
Headers show
Series [qemu,v14] spapr: Implement Open Firmware client interface | expand

Commit Message

Alexey Kardashevskiy Feb. 24, 2021, 5:41 a.m. UTC
The PAPR platform which describes an OS environment that's presented by
a combination of a hypervisor and firmware. The features it specifies
require collaboration between the firmware and the hypervisor.

Since the beginning, the runtime component of the firmware (RTAS) has
been implemented as a 20 byte shim which simply forwards it to
a hypercall implemented in qemu. The boot time firmware component is
SLOF - but a build that's specific to qemu, and has always needed to be
updated in sync with it. Even though we've managed to limit the amount
of runtime communication we need between qemu and SLOF, there's some,
and it has become increasingly awkward to handle as we've implemented
new features.

This implements a boot time OF client interface (CI) which is
enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
Firmware). When enabled, QEMU implements the custom H_OF_CLIENT hcall
which implements Open Firmware Client Interface (OF CI). This allows
using a smaller stateless firmware which does not have to manage
the device tree.

The new "vof.bin" firmware image is included with source code under
pc-bios/. It also includes RTAS blob.

This implements a handful of CI methods just to get -kernel/-initrd
working. In particular, this implements the device tree fetching and
simple memory allocator - "claim" (an OF CI memory allocator) and updates
"/memory@0/available" to report the client about available memory.

This implements changing some device tree properties which we know how
to deal with, the rest is ignored. To allow changes, this skips
fdt_pack() when x-vof=on as not packing the blob leaves some room for
appending.

In absence of SLOF, this assigns phandles to device tree nodes to make
device tree traversing work.

When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.

This adds basic instances support which are managed by a hash map
ihandle -> [phandle].

Before the guest started, the used memory is:
0..4000 - the initial firmware
10000..180000 - stack

This OF CI does not implement "interpret".

Unlike SLOF, this does not format uninitialized nvram. Instead, this
includes a disk image with pre-formatted nvram.

With this basic support, this can only boot into kernel directly.
However this is just enough for the petitboot kernel and initradmdisk to
boot from any possible source. Note this requires reasonably recent guest
kernel with:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735

The immediate benefit is much faster booting time which especially
crucial with fully emulated early CPU bring up environments. Also this
may come handy when/if GRUB-in-the-userspace sees light of the day.

This separates VOF and sPAPR in a hope that VOF bits may be reused by
other POWERPC boards which do not support pSeries.

This is coded in assumption that later on we might be adding support for
booting from QEMU backends (blockdev is the first candidate) without
devices/drivers in between as OF1275 does not require that and
it is quite easy to so.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

The example command line is:

/home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
-nodefaults \
-chardev stdio,id=STDIO0,signal=off,mux=on \
-device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
-mon id=MON0,chardev=STDIO0,mode=readline \
-nographic \
-vga none \
-enable-kvm \
-m 2G \
-machine pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off \
-kernel pbuild/kernel-le-guest/vmlinux \
-initrd pb/rootfs.cpio.xz \
-drive id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof/nvram.bin,format=raw \
-global spapr-nvram.drive=DRIVE0 \
-snapshot \
-smp 8,threads=8 \
-L /home/aik/t/qemu-ppc64-bios/ \
-trace events=qemu_trace_events \
-d guest_errors \
-chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
-mon chardev=SOCKET0,mode=control

---
Changes:
v14:
* check for truncates in readstr()
* ditched a separate vof_reset()
* spapr->vof is a pointer now, dropped the "on" field
* removed rtas_base from vof and updated comment why we allow setting it
* added myself to maintainers
* updated commit log about blockdev and other possible platforms
* added a note why new hcall is 0x5
* no in place endianness convertion in spapr_h_vof_client
* converted all cpu_physical_memory_read/write to address_space_rw
* git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c

v13:
* rebase on latest ppc-for-6.0
* shuffled code around to touch spapr.c less

v12:
* split VOF and SPAPR

v11:
* added g_autofree
* fixed gcc warnings
* fixed few leaks
* added nvram image to make "nvram --print-config" not crash;
Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum size
is 16K, or it just does not work (empty output from "nvram")

v10:
* now rebased to compile with meson

v9:
* remove special handling of /rtas/rtas-size as now we always add it in QEMU
* removed leftovers from scsi/grub/stdout/stdin/...

v8:
* no read/write/seek
* no @dev in instances
* the machine flag is "x-vof" for now

v7:
* now we have a small firmware which loads at 0 as SLOF and starts from
0x100 as SLOF
* no MBR/ELF/GRUB business in QEMU anymore
* blockdev is a separate patch
* networking is a separate patch

v6:
* borrowed a big chunk of commit log introduction from David
* fixed initial stack pointer (points to the highest address of stack)
* traces for "interpret" and others
* disabled  translate_kernel_address() hack so grub can load (work in
progress)
* added "milliseconds" for grub
* fixed "claim" allocator again
* moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too for CAS
* moved the most code possible from spapr.c to spapr_of_client.c, such as
RTAS, prom entry and FDT build/finalize
* separated blobs
* GRUB now proceeds to its console prompt (there are still other issues)
* parse MBR/GPT to find PReP and load GRUB

v5:
* made instances keep device and chardev pointers
* removed VIO dependencies
* print error if RTAS memory is not claimed as it should have been
* pack FDT as "quiesce"

v4:
* fixed open
* validate ihandles in "call-method"

v3:
* fixed phandles allocation
* s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
* fixed size of /chosen/stdout
* bunch of renames
* do not create rtas properties at all, let the client deal with it;
instead setprop allows changing these in the FDT
* no more packing FDT when bios=off - nobody needs it and getprop does not
work otherwise
* allow updating initramdisk device tree properties (for zImage)
* added instances
* fixed stdout on OF's "write"
* removed special handling for stdout in OF client, spapr-vty handles it
instead

v2:
* fixed claim()
* added "setprop"
* cleaner client interface and RTAS blobs management
* boots to petitboot and further to the target system
* more trace points
---
 pc-bios/vof/Makefile   |  18 +
 hw/ppc/vof.h           |  42 ++
 include/hw/ppc/spapr.h |  22 +-
 pc-bios/vof/vof.h      |  44 +++
 hw/ppc/spapr.c         |  78 +++-
 hw/ppc/spapr_hcall.c   |  26 +-
 hw/ppc/spapr_vof.c     | 138 +++++++
 hw/ppc/vof.c           | 864 +++++++++++++++++++++++++++++++++++++++++
 pc-bios/vof/bootmem.c  |  13 +
 pc-bios/vof/ci.c       | 108 ++++++
 pc-bios/vof/libc.c     |  91 +++++
 pc-bios/vof/main.c     |  22 ++
 MAINTAINERS            |  11 +
 hw/ppc/meson.build     |   2 +
 hw/ppc/trace-events    |  21 +
 pc-bios/README         |   2 +
 pc-bios/vof.bin        | Bin 0 -> 3680 bytes
 pc-bios/vof/entry.S    |  51 +++
 pc-bios/vof/l.lds      |  48 +++
 pc-bios/vof/nvram.bin  | Bin 0 -> 16384 bytes
 20 files changed, 1592 insertions(+), 9 deletions(-)
 create mode 100644 pc-bios/vof/Makefile
 create mode 100644 hw/ppc/vof.h
 create mode 100644 pc-bios/vof/vof.h
 create mode 100644 hw/ppc/spapr_vof.c
 create mode 100644 hw/ppc/vof.c
 create mode 100644 pc-bios/vof/bootmem.c
 create mode 100644 pc-bios/vof/ci.c
 create mode 100644 pc-bios/vof/libc.c
 create mode 100644 pc-bios/vof/main.c
 create mode 100755 pc-bios/vof.bin
 create mode 100644 pc-bios/vof/entry.S
 create mode 100644 pc-bios/vof/l.lds
 create mode 100644 pc-bios/vof/nvram.bin

GIT binary patch
literal 16384
zcmeI%Jx;?g6bEpZJ8*)oSZeqZi&Z2pKnD)sI4{AHlNb4;RW}a70XPHaW57uo=-#R7
zKSLBhJJ0sdixY3IuY@hzo0r$OmE%T;XE9uh@s1k=AOHafKmY;|fB*y_009U<00Izz
z00bZa0SG_<0uX=z1Rwwb2tWV=XCbip6d#B4{{rX#XR%}$Bm^J;0SG|gWP$!?Aq=-I
zcT+0Ix{{?1q>9J8r+eW^JK1tYYZZMWQCUwW%0S*~w^p@wfkX-<yRFx)H*+YEt0RRd
zmn}6xtwbP`yp4O=>kxMAEA<~5@*g)@mb%KD5!;O~8c)>8rRQBx55=trhk#+1+T3J_
zaf*G4vZAduqy$qda{``6Gnc2DQg<Es<GLxL#9<Oj*zP!8ZSnwf@-j7l47!nFXQO$a
z^Hes6YU^_M<KsM*k~zwOSa+2g3Sx{*Eyu^XrB0FM5IJ-*?8`VvpBc4}vS(+_UKJ;=
xITAns0uX=z1Rwwb2tWV=5P-nt34DD||Nni|VfbXeJORuY0uX=z1R!vE0>7B^s4f5i

literal 0
HcmV?d00001

Comments

no-reply@patchew.org Feb. 24, 2021, 5:48 a.m. UTC | #1
Patchew URL: https://patchew.org/QEMU/20210224054130.4540-1-aik@ozlabs.ru/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20210224054130.4540-1-aik@ozlabs.ru
Subject: [PATCH qemu v14] spapr: Implement Open Firmware client interface

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag]         patchew/20210224054130.4540-1-aik@ozlabs.ru -> patchew/20210224054130.4540-1-aik@ozlabs.ru
Switched to a new branch 'test'
3fc539b spapr: Implement Open Firmware client interface

=== OUTPUT BEGIN ===
WARNING: line over 80 characters
#268: FILE: hw/ppc/spapr.c:4463:
+    ClientArchitectureSupportClass *casc = CLIENT_ARCHITECTURE_SUPPORT_CLASS(oc);

WARNING: line over 80 characters
#1431: FILE: hw/ppc/vof.h:29:
+    INTERFACE_CHECK(ClientArchitectureSupport, (obj), TYPE_CLIENT_ARCHITECTURE_SUPPORT)

ERROR: code indent should never use tabs
#1548: FILE: pc-bios/vof/bootmem.c:5:
+^Iuint64_t kern[2];$

ERROR: code indent should never use tabs
#1549: FILE: pc-bios/vof/bootmem.c:6:
+^Iphandle chosen = ci_finddevice("/chosen");$

ERROR: code indent should never use tabs
#1551: FILE: pc-bios/vof/bootmem.c:8:
+^Iif (ci_getprop(chosen, "qemu,boot-kernel", kern, sizeof(kern)) !=$

ERROR: code indent should never use tabs
#1552: FILE: pc-bios/vof/bootmem.c:9:
+^I^I^Isizeof(kern))$

ERROR: code indent should never use tabs
#1553: FILE: pc-bios/vof/bootmem.c:10:
+^I^Ireturn;$

ERROR: code indent should never use tabs
#1555: FILE: pc-bios/vof/bootmem.c:12:
+^Ido_boot(kern[0], initrd, initrdsize);$

ERROR: externs should be avoided in .c files
#1574: FILE: pc-bios/vof/ci.c:12:
+extern uint32_t ci_entry(uint32_t params);

ERROR: externs should be avoided in .c files
#1576: FILE: pc-bios/vof/ci.c:14:
+extern unsigned long hv_rtas(unsigned long params);

ERROR: externs should be avoided in .c files
#1577: FILE: pc-bios/vof/ci.c:15:
+extern unsigned int hv_rtas_size;

ERROR: code indent should never use tabs
#1581: FILE: pc-bios/vof/ci.c:19:
+^Ivoid *rtasbase;$

ERROR: code indent should never use tabs
#1582: FILE: pc-bios/vof/ci.c:20:
+^Iuint32_t rtassize = 0;$

ERROR: code indent should never use tabs
#1583: FILE: pc-bios/vof/ci.c:21:
+^Iphandle rtas;$

ERROR: code indent should never use tabs
#1585: FILE: pc-bios/vof/ci.c:23:
+^Iif (strcmp("call-method", (void *)(unsigned long) pargs->service))$

ERROR: braces {} are necessary for all arms of this statement
#1585: FILE: pc-bios/vof/ci.c:23:
+       if (strcmp("call-method", (void *)(unsigned long) pargs->service))
[...]

ERROR: code indent should never use tabs
#1586: FILE: pc-bios/vof/ci.c:24:
+^I^Ireturn false;$

ERROR: code indent should never use tabs
#1588: FILE: pc-bios/vof/ci.c:26:
+^Iif (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0]))$

ERROR: braces {} are necessary for all arms of this statement
#1588: FILE: pc-bios/vof/ci.c:26:
+       if (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0]))
[...]

ERROR: code indent should never use tabs
#1589: FILE: pc-bios/vof/ci.c:27:
+^I^Ireturn false;$

ERROR: code indent should never use tabs
#1591: FILE: pc-bios/vof/ci.c:29:
+^Irtas = ci_finddevice("/rtas");$

ERROR: code indent should never use tabs
#1592: FILE: pc-bios/vof/ci.c:30:
+^Ici_getprop(rtas, "rtas-size", &rtassize, sizeof(rtassize));$

ERROR: code indent should never use tabs
#1593: FILE: pc-bios/vof/ci.c:31:
+^Iif (rtassize < hv_rtas_size)$

ERROR: braces {} are necessary for all arms of this statement
#1593: FILE: pc-bios/vof/ci.c:31:
+       if (rtassize < hv_rtas_size)
[...]

ERROR: code indent should never use tabs
#1594: FILE: pc-bios/vof/ci.c:32:
+^I^Ireturn false;$

ERROR: code indent should never use tabs
#1596: FILE: pc-bios/vof/ci.c:34:
+^Irtasbase = (void *)(unsigned long) pargs->args[2];$

ERROR: code indent should never use tabs
#1598: FILE: pc-bios/vof/ci.c:36:
+^Imemcpy(rtasbase, hv_rtas, hv_rtas_size);$

ERROR: code indent should never use tabs
#1599: FILE: pc-bios/vof/ci.c:37:
+^Ipargs->args[pargs->nargs] = 0;$

ERROR: code indent should never use tabs
#1600: FILE: pc-bios/vof/ci.c:38:
+^Ipargs->args[pargs->nargs + 1] = pargs->args[2];$

ERROR: code indent should never use tabs
#1602: FILE: pc-bios/vof/ci.c:40:
+^Ireturn true;$

ERROR: code indent should never use tabs
#1607: FILE: pc-bios/vof/ci.c:45:
+^Iif (!prom_handle((void *)(unsigned long) args))$

ERROR: braces {} are necessary for all arms of this statement
#1607: FILE: pc-bios/vof/ci.c:45:
+       if (!prom_handle((void *)(unsigned long) args))
[...]

ERROR: code indent should never use tabs
#1608: FILE: pc-bios/vof/ci.c:46:
+^I^Ici_entry(args);$

ERROR: braces {} are necessary for all arms of this statement
#1622: FILE: pc-bios/vof/ci.c:60:
+        for (i = 0; i < nargs; i++)
[...]

ERROR: braces {} are necessary for all arms of this statement
#1626: FILE: pc-bios/vof/ci.c:64:
+        for (i = 0; i < nret; i++)
[...]

ERROR: spaces required around that '+' (ctx:VxV)
#1627: FILE: pc-bios/vof/ci.c:65:
+                args.args[nargs+i] = 0;
                                ^

ERROR: braces {} are necessary for all arms of this statement
#1629: FILE: pc-bios/vof/ci.c:67:
+        if (ci_entry((uint32_t)(&args)) < 0)
[...]

ERROR: code indent should never use tabs
#1637: FILE: pc-bios/vof/ci.c:75:
+^Icall_prom("exit", 0, 0);$

ERROR: code indent should never use tabs
#1642: FILE: pc-bios/vof/ci.c:80:
+^Ireturn call_prom("finddevice", 1, 1, path);$

ERROR: code indent should never use tabs
#1647: FILE: pc-bios/vof/ci.c:85:
+^Ireturn call_prom("getprop", 4, 1, ph, propname, prop, len);$

ERROR: code indent should never use tabs
#1652: FILE: pc-bios/vof/ci.c:90:
+^Ireturn call_prom("open", 1, 1, path);$

ERROR: code indent should never use tabs
#1657: FILE: pc-bios/vof/ci.c:95:
+^Icall_prom("close", 1, 0, ih);$

ERROR: code indent should never use tabs
#1662: FILE: pc-bios/vof/ci.c:100:
+^Iuint32_t ret = call_prom("claim", 3, 1, ADDR(virt), size, align);$

ERROR: code indent should never use tabs
#1664: FILE: pc-bios/vof/ci.c:102:
+^Ireturn (void *) (unsigned long) ret;$

ERROR: code indent should never use tabs
#1669: FILE: pc-bios/vof/ci.c:107:
+^Ireturn call_prom("release", 2, 1, ADDR(virt), size);$

ERROR: code indent should never use tabs
#1792: FILE: pc-bios/vof/libc.c:5:
+^Iint len = 0;$

ERROR: code indent should never use tabs
#1794: FILE: pc-bios/vof/libc.c:7:
+^Iwhile (*s != 0) {$

ERROR: code indent should never use tabs
#1795: FILE: pc-bios/vof/libc.c:8:
+^I^Ilen += 1;$

ERROR: code indent should never use tabs
#1796: FILE: pc-bios/vof/libc.c:9:
+^I^Is += 1;$

ERROR: code indent should never use tabs
#1797: FILE: pc-bios/vof/libc.c:10:
+^I}$

ERROR: code indent should never use tabs
#1799: FILE: pc-bios/vof/libc.c:12:
+^Ireturn len;$

ERROR: braces {} are necessary for all arms of this statement
#1805: FILE: pc-bios/vof/libc.c:18:
+                if (*s1 != *s2)
[...]

ERROR: braces {} are necessary for all arms of this statement
#1833: FILE: pc-bios/vof/libc.c:46:
+                if (*p1 != *p2)
[...]

ERROR: return is not a function, parentheses are not required
#1834: FILE: pc-bios/vof/libc.c:47:
+                        return (*p1 - *p2);

ERROR: else should follow close brace '}'
#1857: FILE: pc-bios/vof/libc.c:70:
+        }
+        else {

ERROR: code indent should never use tabs
#1890: FILE: pc-bios/vof/main.c:6:
+^Iregister unsigned long r3 __asm__("r3") = _r3;$

ERROR: code indent should never use tabs
#1891: FILE: pc-bios/vof/main.c:7:
+^Iregister unsigned long r4 __asm__("r4") = _r4;$

ERROR: code indent should never use tabs
#1892: FILE: pc-bios/vof/main.c:8:
+^Iregister unsigned long r5 __asm__("r5") = (unsigned long) _prom_entry;$

ERROR: code indent should never use tabs
#1894: FILE: pc-bios/vof/main.c:10:
+^I((client *)(uint32_t)addr)();$

ERROR: code indent should never use tabs
#1899: FILE: pc-bios/vof/main.c:15:
+^Iregister unsigned long r3 __asm__("r3");$

ERROR: code indent should never use tabs
#1900: FILE: pc-bios/vof/main.c:16:
+^Iregister unsigned long r4 __asm__("r4");$

ERROR: code indent should never use tabs
#1901: FILE: pc-bios/vof/main.c:17:
+^Iregister unsigned long r5 __asm__("r5");$

ERROR: code indent should never use tabs
#1902: FILE: pc-bios/vof/main.c:18:
+^Iuint64_t initrd = r3, initrdsize = r4;$

ERROR: code indent should never use tabs
#1904: FILE: pc-bios/vof/main.c:20:
+^Iboot_from_memory(initrd, initrdsize);$

ERROR: code indent should never use tabs
#1905: FILE: pc-bios/vof/main.c:21:
+^Ici_panic("*** No boot target ***\n");$

total: 63 errors, 2 warnings, 1738 lines checked

Commit 3fc539b07428 (spapr: Implement Open Firmware client interface) has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20210224054130.4540-1-aik@ozlabs.ru/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-devel@redhat.com
David Gibson March 2, 2021, 2:17 a.m. UTC | #2
On Tue, Feb 23, 2021 at 09:48:56PM -0800, no-reply@patchew.org wrote:
> Patchew URL: https://patchew.org/QEMU/20210224054130.4540-1-aik@ozlabs.ru/
> 
> 
> 
> Hi,
> 
> This series seems to have some coding style problems. See output below for
> more information:
> 
> Type: series
> Message-id: 20210224054130.4540-1-aik@ozlabs.ru
> Subject: [PATCH qemu v14] spapr: Implement Open Firmware client interface
> 
> === TEST SCRIPT BEGIN ===
> #!/bin/bash
> git rev-parse base > /dev/null || exit 0
> git config --local diff.renamelimit 0
> git config --local diff.renames True
> git config --local diff.algorithm histogram
> ./scripts/checkpatch.pl --mailback base..
> === TEST SCRIPT END ===
> 
> Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
> From https://github.com/patchew-project/qemu
>  * [new tag]         patchew/20210224054130.4540-1-aik@ozlabs.ru -> patchew/20210224054130.4540-1-aik@ozlabs.ru
> Switched to a new branch 'test'
> 3fc539b spapr: Implement Open Firmware client interface
> 
> === OUTPUT BEGIN ===
> WARNING: line over 80 characters
> #268: FILE: hw/ppc/spapr.c:4463:
> +    ClientArchitectureSupportClass *casc = CLIENT_ARCHITECTURE_SUPPORT_CLASS(oc);

These style warnings in the qemu code proper will need to be fixed.

> WARNING: line over 80 characters
> #1431: FILE: hw/ppc/vof.h:29:
> +    INTERFACE_CHECK(ClientArchitectureSupport, (obj), TYPE_CLIENT_ARCHITECTURE_SUPPORT)
> 
> ERROR: code indent should never use tabs
> #1548: FILE: pc-bios/vof/bootmem.c:5:
> +^Iuint64_t kern[2];$

I'm a bit torn about these ones in the vof code.  I think it might be
simpler to go to non-tab indenting there (for .c, not .S) just to
avoid checkpatch whining all the time.

Or if you really don't want to update the coding style in VOF, it
would probably be good to include a patch altering checkpatch so it
excludes the VOF code (as it already does for the code imported into
linux-headers).

> 
> ERROR: code indent should never use tabs
> #1549: FILE: pc-bios/vof/bootmem.c:6:
> +^Iphandle chosen = ci_finddevice("/chosen");$
> 
> ERROR: code indent should never use tabs
> #1551: FILE: pc-bios/vof/bootmem.c:8:
> +^Iif (ci_getprop(chosen, "qemu,boot-kernel", kern, sizeof(kern)) !=$
> 
> ERROR: code indent should never use tabs
> #1552: FILE: pc-bios/vof/bootmem.c:9:
> +^I^I^Isizeof(kern))$
> 
> ERROR: code indent should never use tabs
> #1553: FILE: pc-bios/vof/bootmem.c:10:
> +^I^Ireturn;$
> 
> ERROR: code indent should never use tabs
> #1555: FILE: pc-bios/vof/bootmem.c:12:
> +^Ido_boot(kern[0], initrd, initrdsize);$
> 
> ERROR: externs should be avoided in .c files
> #1574: FILE: pc-bios/vof/ci.c:12:
> +extern uint32_t ci_entry(uint32_t params);
> 
> ERROR: externs should be avoided in .c files
> #1576: FILE: pc-bios/vof/ci.c:14:
> +extern unsigned long hv_rtas(unsigned long params);
> 
> ERROR: externs should be avoided in .c files
> #1577: FILE: pc-bios/vof/ci.c:15:
> +extern unsigned int hv_rtas_size;
> 
> ERROR: code indent should never use tabs
> #1581: FILE: pc-bios/vof/ci.c:19:
> +^Ivoid *rtasbase;$
> 
> ERROR: code indent should never use tabs
> #1582: FILE: pc-bios/vof/ci.c:20:
> +^Iuint32_t rtassize = 0;$
> 
> ERROR: code indent should never use tabs
> #1583: FILE: pc-bios/vof/ci.c:21:
> +^Iphandle rtas;$
> 
> ERROR: code indent should never use tabs
> #1585: FILE: pc-bios/vof/ci.c:23:
> +^Iif (strcmp("call-method", (void *)(unsigned long) pargs->service))$
> 
> ERROR: braces {} are necessary for all arms of this statement
> #1585: FILE: pc-bios/vof/ci.c:23:
> +       if (strcmp("call-method", (void *)(unsigned long) pargs->service))
> [...]
> 
> ERROR: code indent should never use tabs
> #1586: FILE: pc-bios/vof/ci.c:24:
> +^I^Ireturn false;$
> 
> ERROR: code indent should never use tabs
> #1588: FILE: pc-bios/vof/ci.c:26:
> +^Iif (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0]))$
> 
> ERROR: braces {} are necessary for all arms of this statement
> #1588: FILE: pc-bios/vof/ci.c:26:
> +       if (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0]))
> [...]
> 
> ERROR: code indent should never use tabs
> #1589: FILE: pc-bios/vof/ci.c:27:
> +^I^Ireturn false;$
> 
> ERROR: code indent should never use tabs
> #1591: FILE: pc-bios/vof/ci.c:29:
> +^Irtas = ci_finddevice("/rtas");$
> 
> ERROR: code indent should never use tabs
> #1592: FILE: pc-bios/vof/ci.c:30:
> +^Ici_getprop(rtas, "rtas-size", &rtassize, sizeof(rtassize));$
> 
> ERROR: code indent should never use tabs
> #1593: FILE: pc-bios/vof/ci.c:31:
> +^Iif (rtassize < hv_rtas_size)$
> 
> ERROR: braces {} are necessary for all arms of this statement
> #1593: FILE: pc-bios/vof/ci.c:31:
> +       if (rtassize < hv_rtas_size)
> [...]
> 
> ERROR: code indent should never use tabs
> #1594: FILE: pc-bios/vof/ci.c:32:
> +^I^Ireturn false;$
> 
> ERROR: code indent should never use tabs
> #1596: FILE: pc-bios/vof/ci.c:34:
> +^Irtasbase = (void *)(unsigned long) pargs->args[2];$
> 
> ERROR: code indent should never use tabs
> #1598: FILE: pc-bios/vof/ci.c:36:
> +^Imemcpy(rtasbase, hv_rtas, hv_rtas_size);$
> 
> ERROR: code indent should never use tabs
> #1599: FILE: pc-bios/vof/ci.c:37:
> +^Ipargs->args[pargs->nargs] = 0;$
> 
> ERROR: code indent should never use tabs
> #1600: FILE: pc-bios/vof/ci.c:38:
> +^Ipargs->args[pargs->nargs + 1] = pargs->args[2];$
> 
> ERROR: code indent should never use tabs
> #1602: FILE: pc-bios/vof/ci.c:40:
> +^Ireturn true;$
> 
> ERROR: code indent should never use tabs
> #1607: FILE: pc-bios/vof/ci.c:45:
> +^Iif (!prom_handle((void *)(unsigned long) args))$
> 
> ERROR: braces {} are necessary for all arms of this statement
> #1607: FILE: pc-bios/vof/ci.c:45:
> +       if (!prom_handle((void *)(unsigned long) args))
> [...]
> 
> ERROR: code indent should never use tabs
> #1608: FILE: pc-bios/vof/ci.c:46:
> +^I^Ici_entry(args);$
> 
> ERROR: braces {} are necessary for all arms of this statement
> #1622: FILE: pc-bios/vof/ci.c:60:
> +        for (i = 0; i < nargs; i++)
> [...]
> 
> ERROR: braces {} are necessary for all arms of this statement
> #1626: FILE: pc-bios/vof/ci.c:64:
> +        for (i = 0; i < nret; i++)
> [...]
> 
> ERROR: spaces required around that '+' (ctx:VxV)
> #1627: FILE: pc-bios/vof/ci.c:65:
> +                args.args[nargs+i] = 0;
>                                 ^
> 
> ERROR: braces {} are necessary for all arms of this statement
> #1629: FILE: pc-bios/vof/ci.c:67:
> +        if (ci_entry((uint32_t)(&args)) < 0)
> [...]
> 
> ERROR: code indent should never use tabs
> #1637: FILE: pc-bios/vof/ci.c:75:
> +^Icall_prom("exit", 0, 0);$
> 
> ERROR: code indent should never use tabs
> #1642: FILE: pc-bios/vof/ci.c:80:
> +^Ireturn call_prom("finddevice", 1, 1, path);$
> 
> ERROR: code indent should never use tabs
> #1647: FILE: pc-bios/vof/ci.c:85:
> +^Ireturn call_prom("getprop", 4, 1, ph, propname, prop, len);$
> 
> ERROR: code indent should never use tabs
> #1652: FILE: pc-bios/vof/ci.c:90:
> +^Ireturn call_prom("open", 1, 1, path);$
> 
> ERROR: code indent should never use tabs
> #1657: FILE: pc-bios/vof/ci.c:95:
> +^Icall_prom("close", 1, 0, ih);$
> 
> ERROR: code indent should never use tabs
> #1662: FILE: pc-bios/vof/ci.c:100:
> +^Iuint32_t ret = call_prom("claim", 3, 1, ADDR(virt), size, align);$
> 
> ERROR: code indent should never use tabs
> #1664: FILE: pc-bios/vof/ci.c:102:
> +^Ireturn (void *) (unsigned long) ret;$
> 
> ERROR: code indent should never use tabs
> #1669: FILE: pc-bios/vof/ci.c:107:
> +^Ireturn call_prom("release", 2, 1, ADDR(virt), size);$
> 
> ERROR: code indent should never use tabs
> #1792: FILE: pc-bios/vof/libc.c:5:
> +^Iint len = 0;$
> 
> ERROR: code indent should never use tabs
> #1794: FILE: pc-bios/vof/libc.c:7:
> +^Iwhile (*s != 0) {$
> 
> ERROR: code indent should never use tabs
> #1795: FILE: pc-bios/vof/libc.c:8:
> +^I^Ilen += 1;$
> 
> ERROR: code indent should never use tabs
> #1796: FILE: pc-bios/vof/libc.c:9:
> +^I^Is += 1;$
> 
> ERROR: code indent should never use tabs
> #1797: FILE: pc-bios/vof/libc.c:10:
> +^I}$
> 
> ERROR: code indent should never use tabs
> #1799: FILE: pc-bios/vof/libc.c:12:
> +^Ireturn len;$
> 
> ERROR: braces {} are necessary for all arms of this statement
> #1805: FILE: pc-bios/vof/libc.c:18:
> +                if (*s1 != *s2)
> [...]
> 
> ERROR: braces {} are necessary for all arms of this statement
> #1833: FILE: pc-bios/vof/libc.c:46:
> +                if (*p1 != *p2)
> [...]
> 
> ERROR: return is not a function, parentheses are not required
> #1834: FILE: pc-bios/vof/libc.c:47:
> +                        return (*p1 - *p2);
> 
> ERROR: else should follow close brace '}'
> #1857: FILE: pc-bios/vof/libc.c:70:
> +        }
> +        else {
> 
> ERROR: code indent should never use tabs
> #1890: FILE: pc-bios/vof/main.c:6:
> +^Iregister unsigned long r3 __asm__("r3") = _r3;$
> 
> ERROR: code indent should never use tabs
> #1891: FILE: pc-bios/vof/main.c:7:
> +^Iregister unsigned long r4 __asm__("r4") = _r4;$
> 
> ERROR: code indent should never use tabs
> #1892: FILE: pc-bios/vof/main.c:8:
> +^Iregister unsigned long r5 __asm__("r5") = (unsigned long) _prom_entry;$
> 
> ERROR: code indent should never use tabs
> #1894: FILE: pc-bios/vof/main.c:10:
> +^I((client *)(uint32_t)addr)();$
> 
> ERROR: code indent should never use tabs
> #1899: FILE: pc-bios/vof/main.c:15:
> +^Iregister unsigned long r3 __asm__("r3");$
> 
> ERROR: code indent should never use tabs
> #1900: FILE: pc-bios/vof/main.c:16:
> +^Iregister unsigned long r4 __asm__("r4");$
> 
> ERROR: code indent should never use tabs
> #1901: FILE: pc-bios/vof/main.c:17:
> +^Iregister unsigned long r5 __asm__("r5");$
> 
> ERROR: code indent should never use tabs
> #1902: FILE: pc-bios/vof/main.c:18:
> +^Iuint64_t initrd = r3, initrdsize = r4;$
> 
> ERROR: code indent should never use tabs
> #1904: FILE: pc-bios/vof/main.c:20:
> +^Iboot_from_memory(initrd, initrdsize);$
> 
> ERROR: code indent should never use tabs
> #1905: FILE: pc-bios/vof/main.c:21:
> +^Ici_panic("*** No boot target ***\n");$
> 
> total: 63 errors, 2 warnings, 1738 lines checked
> 
> Commit 3fc539b07428 (spapr: Implement Open Firmware client interface) has style problems, please review.  If any of these errors
> are false positives report them to the maintainer, see
> CHECKPATCH in MAINTAINERS.
> === OUTPUT END ===
> 
> Test command exited with code: 1
> 
> 
> The full log is available at
> http://patchew.org/logs/20210224054130.4540-1-aik@ozlabs.ru/testing.checkpatch/?type=message.
> ---
> Email generated automatically by Patchew [https://patchew.org/].
> Please send your feedback to patchew-devel@redhat.com
David Gibson March 2, 2021, 3:35 a.m. UTC | #3
On Wed, Feb 24, 2021 at 04:41:30PM +1100, Alexey Kardashevskiy wrote:
> The PAPR platform which describes an OS environment that's presented by
> a combination of a hypervisor and firmware. The features it specifies
> require collaboration between the firmware and the hypervisor.
> 
> Since the beginning, the runtime component of the firmware (RTAS) has
> been implemented as a 20 byte shim which simply forwards it to
> a hypercall implemented in qemu. The boot time firmware component is
> SLOF - but a build that's specific to qemu, and has always needed to be
> updated in sync with it. Even though we've managed to limit the amount
> of runtime communication we need between qemu and SLOF, there's some,
> and it has become increasingly awkward to handle as we've implemented
> new features.
> 
> This implements a boot time OF client interface (CI) which is
> enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
> Firmware). When enabled, QEMU implements the custom H_OF_CLIENT hcall
> which implements Open Firmware Client Interface (OF CI). This allows
> using a smaller stateless firmware which does not have to manage
> the device tree.
> 
> The new "vof.bin" firmware image is included with source code under
> pc-bios/. It also includes RTAS blob.
> 
> This implements a handful of CI methods just to get -kernel/-initrd
> working. In particular, this implements the device tree fetching and
> simple memory allocator - "claim" (an OF CI memory allocator) and updates
> "/memory@0/available" to report the client about available memory.
> 
> This implements changing some device tree properties which we know how
> to deal with, the rest is ignored. To allow changes, this skips
> fdt_pack() when x-vof=on as not packing the blob leaves some room for
> appending.
> 
> In absence of SLOF, this assigns phandles to device tree nodes to make
> device tree traversing work.
> 
> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
> 
> This adds basic instances support which are managed by a hash map
> ihandle -> [phandle].
> 
> Before the guest started, the used memory is:
> 0..4000 - the initial firmware
> 10000..180000 - stack
> 
> This OF CI does not implement "interpret".
> 
> Unlike SLOF, this does not format uninitialized nvram. Instead, this
> includes a disk image with pre-formatted nvram.

I think we'll need to improve this, but that can be a later patch.

> With this basic support, this can only boot into kernel directly.
> However this is just enough for the petitboot kernel and initradmdisk to
> boot from any possible source. Note this requires reasonably recent guest
> kernel with:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735
> 
> The immediate benefit is much faster booting time which especially
> crucial with fully emulated early CPU bring up environments. Also this
> may come handy when/if GRUB-in-the-userspace sees light of the day.
> 
> This separates VOF and sPAPR in a hope that VOF bits may be reused by
> other POWERPC boards which do not support pSeries.
> 
> This is coded in assumption that later on we might be adding support for
> booting from QEMU backends (blockdev is the first candidate) without
> devices/drivers in between as OF1275 does not require that and
> it is quite easy to so.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> 
> The example command line is:
> 
> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
> -nodefaults \
> -chardev stdio,id=STDIO0,signal=off,mux=on \
> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
> -mon id=MON0,chardev=STDIO0,mode=readline \
> -nographic \
> -vga none \
> -enable-kvm \
> -m 2G \
> -machine pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off \
> -kernel pbuild/kernel-le-guest/vmlinux \
> -initrd pb/rootfs.cpio.xz \
> -drive id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof/nvram.bin,format=raw \
> -global spapr-nvram.drive=DRIVE0 \
> -snapshot \
> -smp 8,threads=8 \
> -L /home/aik/t/qemu-ppc64-bios/ \
> -trace events=qemu_trace_events \
> -d guest_errors \
> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
> -mon chardev=SOCKET0,mode=control
> 
> ---
> Changes:
> v14:
> * check for truncates in readstr()
> * ditched a separate vof_reset()
> * spapr->vof is a pointer now, dropped the "on" field
> * removed rtas_base from vof and updated comment why we allow setting it
> * added myself to maintainers
> * updated commit log about blockdev and other possible platforms
> * added a note why new hcall is 0x5
> * no in place endianness convertion in spapr_h_vof_client
> * converted all cpu_physical_memory_read/write to address_space_rw
> * git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c
> 
> v13:
> * rebase on latest ppc-for-6.0
> * shuffled code around to touch spapr.c less
> 
> v12:
> * split VOF and SPAPR
> 
> v11:
> * added g_autofree
> * fixed gcc warnings
> * fixed few leaks
> * added nvram image to make "nvram --print-config" not crash;
> Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum size
> is 16K, or it just does not work (empty output from "nvram")
> 
> v10:
> * now rebased to compile with meson
> 
> v9:
> * remove special handling of /rtas/rtas-size as now we always add it in QEMU
> * removed leftovers from scsi/grub/stdout/stdin/...
> 
> v8:
> * no read/write/seek
> * no @dev in instances
> * the machine flag is "x-vof" for now
> 
> v7:
> * now we have a small firmware which loads at 0 as SLOF and starts from
> 0x100 as SLOF
> * no MBR/ELF/GRUB business in QEMU anymore
> * blockdev is a separate patch
> * networking is a separate patch
> 
> v6:
> * borrowed a big chunk of commit log introduction from David
> * fixed initial stack pointer (points to the highest address of stack)
> * traces for "interpret" and others
> * disabled  translate_kernel_address() hack so grub can load (work in
> progress)
> * added "milliseconds" for grub
> * fixed "claim" allocator again
> * moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too for CAS
> * moved the most code possible from spapr.c to spapr_of_client.c, such as
> RTAS, prom entry and FDT build/finalize
> * separated blobs
> * GRUB now proceeds to its console prompt (there are still other issues)
> * parse MBR/GPT to find PReP and load GRUB
> 
> v5:
> * made instances keep device and chardev pointers
> * removed VIO dependencies
> * print error if RTAS memory is not claimed as it should have been
> * pack FDT as "quiesce"
> 
> v4:
> * fixed open
> * validate ihandles in "call-method"
> 
> v3:
> * fixed phandles allocation
> * s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
> * fixed size of /chosen/stdout
> * bunch of renames
> * do not create rtas properties at all, let the client deal with it;
> instead setprop allows changing these in the FDT
> * no more packing FDT when bios=off - nobody needs it and getprop does not
> work otherwise
> * allow updating initramdisk device tree properties (for zImage)
> * added instances
> * fixed stdout on OF's "write"
> * removed special handling for stdout in OF client, spapr-vty handles it
> instead
> 
> v2:
> * fixed claim()
> * added "setprop"
> * cleaner client interface and RTAS blobs management
> * boots to petitboot and further to the target system
> * more trace points
> ---
>  pc-bios/vof/Makefile   |  18 +
>  hw/ppc/vof.h           |  42 ++
>  include/hw/ppc/spapr.h |  22 +-
>  pc-bios/vof/vof.h      |  44 +++
>  hw/ppc/spapr.c         |  78 +++-
>  hw/ppc/spapr_hcall.c   |  26 +-
>  hw/ppc/spapr_vof.c     | 138 +++++++
>  hw/ppc/vof.c           | 864 +++++++++++++++++++++++++++++++++++++++++
>  pc-bios/vof/bootmem.c  |  13 +
>  pc-bios/vof/ci.c       | 108 ++++++
>  pc-bios/vof/libc.c     |  91 +++++
>  pc-bios/vof/main.c     |  22 ++
>  MAINTAINERS            |  11 +
>  hw/ppc/meson.build     |   2 +
>  hw/ppc/trace-events    |  21 +
>  pc-bios/README         |   2 +
>  pc-bios/vof.bin        | Bin 0 -> 3680 bytes
>  pc-bios/vof/entry.S    |  51 +++
>  pc-bios/vof/l.lds      |  48 +++
>  pc-bios/vof/nvram.bin  | Bin 0 -> 16384 bytes
>  20 files changed, 1592 insertions(+), 9 deletions(-)
>  create mode 100644 pc-bios/vof/Makefile
>  create mode 100644 hw/ppc/vof.h
>  create mode 100644 pc-bios/vof/vof.h
>  create mode 100644 hw/ppc/spapr_vof.c
>  create mode 100644 hw/ppc/vof.c
>  create mode 100644 pc-bios/vof/bootmem.c
>  create mode 100644 pc-bios/vof/ci.c
>  create mode 100644 pc-bios/vof/libc.c
>  create mode 100644 pc-bios/vof/main.c
>  create mode 100755 pc-bios/vof.bin
>  create mode 100644 pc-bios/vof/entry.S
>  create mode 100644 pc-bios/vof/l.lds
>  create mode 100644 pc-bios/vof/nvram.bin
> 
> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
> new file mode 100644
> index 000000000000..49f7e240eeff
> --- /dev/null
> +++ b/pc-bios/vof/Makefile
> @@ -0,0 +1,18 @@
> +all: build-all
> +
> +build-all: vof.bin
> +
> +%.o: %.S
> +	cc -m32 -mbig-endian -c -o $@ $<
> +
> +%.o: %.c
> +	cc -m32 -mbig-endian -c -fno-stack-protector -Wno-builtin-declaration-mismatch -o $@ $<
> +
> +vof.elf: entry.o main.o libc.o ci.o bootmem.o
> +	ld -nostdlib -e_start -Tl.lds -EB -o $@ $^
> +
> +%.bin: %.elf
> +	objcopy -O binary -j .text -j .data -j .toc -j .got2 $^ $@
> +
> +clean:
> +	rm -f *.o *.bin *.elf *~
> diff --git a/hw/ppc/vof.h b/hw/ppc/vof.h
> new file mode 100644
> index 000000000000..c8fadf23ea5b
> --- /dev/null
> +++ b/hw/ppc/vof.h
> @@ -0,0 +1,42 @@
> + /* Virtual Open Firmware */
> +#ifndef HW_VOF_H
> +#define HW_VOF_H
> +
> +typedef struct Vof {
> +    uint32_t top_addr; /* copied from rma_size */
> +    GArray *claimed; /* array of SpaprOfClaimed */
> +    uint64_t claimed_base;
> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
> +    uint32_t of_instance_last;
> +    char *bootargs;
> +    uint32_t initrd_base; /* Updated in spapr at CAS */
> +    long initrd_size; /* Updated in spapr at CAS */
> +} Vof;
> +
> +uint32_t vof_client_call(void *fdt, Vof *vof, const char *service,
> +                         uint32_t *args, unsigned nargs,
> +                         uint32_t *rets, unsigned nrets);
> +uint64_t vof_claim(void *fdt, Vof *vof, uint64_t virt, uint64_t size,
> +                   uint64_t align);
> +void vof_cleanup(Vof *vof);
> +void vof_build_dt(void *fdt, Vof *vof, uint32_t top_addr);
> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
> +                               const char *prop, const char *path);
> +
> +/* ibm,client-architecture-support */
> +#define TYPE_CLIENT_ARCHITECTURE_SUPPORT "client-architecture-support"
> +#define CLIENT_ARCHITECTURE_SUPPORT(obj) \
> +    INTERFACE_CHECK(ClientArchitectureSupport, (obj), TYPE_CLIENT_ARCHITECTURE_SUPPORT)
> +
> +typedef struct ClientArchitectureSupportClass ClientArchitectureSupportClass;
> +DECLARE_CLASS_CHECKERS(ClientArchitectureSupportClass,
> +                       CLIENT_ARCHITECTURE_SUPPORT,
> +                       TYPE_CLIENT_ARCHITECTURE_SUPPORT)
> +
> +struct ClientArchitectureSupportClass {
> +    InterfaceClass parent;
> +    target_ulong (*cas)(CPUState *cs, target_ulong vec);
> +    void (*quiesce)(void);

Is there actually any real connection of quiesce behaviour to cas
behaviour?  Basically, I'm wondering if this is not so much about
client-architecture-support fundamentally as just about
machine-specific parts of the VOF behaviour.  Which would be fine, but
suggests a different name for the interface.

> +};
> +
> +#endif /* HW_VOF_H */
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index ccbeeca1de84..4896b9fae784 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -12,6 +12,7 @@
>  #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>  #include "hw/ppc/xics.h"        /* For ICSState */
>  #include "hw/ppc/spapr_tpm_proxy.h"
> +#include "hw/ppc/vof.h"
>  
>  struct SpaprVioBus;
>  struct SpaprPhbState;
> @@ -180,6 +181,7 @@ struct SpaprMachineState {
>      uint64_t kernel_addr;
>      uint32_t initrd_base;
>      long initrd_size;
> +    Vof *vof;
>      uint64_t rtc_offset; /* Now used only during incoming migration */
>      struct PPCTimebase tb;
>      bool has_graphics;
> @@ -554,7 +556,9 @@ struct SpaprMachineState {
>  /* Client Architecture support */
>  #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>  #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>  
>  /*
>   * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
> @@ -944,4 +948,20 @@ bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize,
>  void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
>  hwaddr spapr_get_rtas_addr(void);
>  bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
> +
> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
> +                     target_ulong *stack_ptr);
> +void spapr_vof_quiesce(void);
> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                target_ulong opcode, target_ulong *args);

Alexey or Zoltan, any thoughts on how non-PAPR versions of this would
call into qemu to get the non-guest parts of VOF to execute?

> +target_ulong spapr_vof_client_architecture_support(CPUState *cs,
> +                                                   target_ulong ovec_addr);
> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
> +
> +/* Copied from SLOF, and 4K is definitely not enough for GRUB */
> +#define OF_STACK_SIZE       0x8000
> +
> +/* 0..10000 is reserved for the VOF fw */
> +#define OF_STACK_ADDR       0x10000
> +
>  #endif /* HW_SPAPR_H */
> diff --git a/pc-bios/vof/vof.h b/pc-bios/vof/vof.h
> new file mode 100644
> index 000000000000..cd5989952a98
> --- /dev/null
> +++ b/pc-bios/vof/vof.h
> @@ -0,0 +1,44 @@
> +#include <stdarg.h>
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned short uint16_t;
> +typedef unsigned long uint32_t;
> +typedef unsigned long long uint64_t;
> +#define NULL (0)
> +#define PROM_ERROR (-1u)
> +typedef unsigned char bool;
> +typedef unsigned long ihandle;
> +typedef unsigned long phandle;
> +#define false ((bool)0)
> +#define true ((bool)1)

This is actually kinda risky if 'bool' is aliased to unsigned char.
So, it's probably worth figuring out how to bind it instead to the
_Bool builtin which is available in modern compilers.

> +typedef int size_t;
> +typedef void client(void);
> +
> +/* globals */
> +extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) */
> +
> +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
> +
> +/* libc */
> +int strlen(const char *s);
> +int strcmp(const char *s1, const char *s2);
> +void *memcpy(void *dest, const void *src, size_t n);
> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
> +void *memmove(void *dest, const void *src, size_t n);
> +void *memset(void *dest, int c, size_t size);
> +
> +/* Prom */
> +typedef unsigned long prom_arg_t;
> +int call_prom(const char *service, int nargs, int nret, ...);

AIUI this isn't so much about calling the PROM, since this *is* the
PROM code, but rather about calling the parts that are implemented on
the qemu side.  Different names might clarify that.

> +
> +/* CI wrappers */
> +void ci_panic(const char *str);
> +phandle ci_finddevice(const char *path);
> +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len);
> +ihandle ci_open(const char *path);
> +void ci_close(ihandle ih);
> +void *ci_claim(void *virt, uint32_t size, uint32_t align);
> +uint32_t ci_release(void *virt, uint32_t size);
> +
> +/* booting from -kernel */
> +void boot_from_memory(uint64_t initrd, uint64_t initrdsize);
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 85fe65f89476..3c20af115627 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -102,6 +102,7 @@
>  #define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
>  #define FW_MAX_SIZE             0x400000
>  #define FW_FILE_NAME            "slof.bin"
> +#define FW_FILE_NAME_VOF        "vof.bin"
>  #define FW_OVERHEAD             0x2800000
>  #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
>  
> @@ -1562,6 +1563,7 @@ static void spapr_machine_reset(MachineState *machine)
>      SpaprMachineState *spapr = SPAPR_MACHINE(machine);
>      PowerPCCPU *first_ppc_cpu;
>      hwaddr fdt_addr;
> +    target_ulong stack_ptr = 0;
>      void *fdt;
>      int rc;
>  
> @@ -1624,22 +1626,41 @@ static void spapr_machine_reset(MachineState *machine)
>  
>      fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
>  
> -    rc = fdt_pack(fdt);
> +    if (spapr->vof) {
> +        /*
> +         * Claims initramdisk and stack which changes "available" so
> +         * doing it befofe packing.
> +         */
> +        spapr_vof_reset(spapr, fdt, &stack_ptr);
>  
> -    /* Should only fail if we've built a corrupted tree */
> -    assert(rc == 0);
> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
> +                                  stack_ptr, spapr->initrd_base,
> +                                  spapr->initrd_size);
> +        /*
> +         * We do not pack the FDT as the client may change properties and
> +         * do not write FDT to the VM as the client does not expect it.
> +         */
> +    } else {
> +        rc = fdt_pack(fdt);
> +        /* Should only fail if we've built a corrupted tree */
> +        assert(rc == 0);
>  
> -    /* Load the fdt */
> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
> +                                  0, fdt_addr, 0);
> +    }
>      qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
> -    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
> +
>      g_free(spapr->fdt_blob);
>      spapr->fdt_size = fdt_totalsize(fdt);
>      spapr->fdt_initial_size = spapr->fdt_size;
>      spapr->fdt_blob = fdt;
>  
>      /* Set up the entry state */
> -    spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, fdt_addr, 0);
>      first_ppc_cpu->env.gpr[5] = 0;
> +    if (!spapr->vof) {
> +        /* Load the fdt */
> +        cpu_physical_memory_write(fdt_addr, spapr->fdt_blob, spapr->fdt_size);
> +    }
>  
>      spapr->fwnmi_system_reset_addr = -1;
>      spapr->fwnmi_machine_check_addr = -1;
> @@ -2639,7 +2660,8 @@ static void spapr_machine_init(MachineState *machine)
>      SpaprMachineState *spapr = SPAPR_MACHINE(machine);
>      SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
>      MachineClass *mc = MACHINE_GET_CLASS(machine);
> -    const char *bios_name = machine->firmware ?: FW_FILE_NAME;
> +    const char *bios_default = !!spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME;
> +    const char *bios_name = machine->firmware ?: bios_default;
>      const char *kernel_filename = machine->kernel_filename;
>      const char *initrd_filename = machine->initrd_filename;
>      PCIHostState *phb;
> @@ -2996,6 +3018,10 @@ static void spapr_machine_init(MachineState *machine)
>      }
>  
>      qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
> +
> +    if (spapr->vof) {
> +        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
> +    }
>  }
>  
>  #define DEFAULT_KVM_TYPE "auto"
> @@ -3186,6 +3212,28 @@ static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
>      }
>  }
>  
> +static bool spapr_get_vof(Object *obj, Error **errp)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    return spapr->vof != NULL;
> +}
> +
> +static void spapr_set_vof(Object *obj, bool value, Error **errp)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    if (spapr->vof) {
> +        vof_cleanup(spapr->vof);
> +        g_free(spapr->vof);
> +        spapr->vof = NULL;
> +    }
> +    if (!value) {
> +        return;
> +    }
> +    spapr->vof = g_malloc0(sizeof(*spapr->vof));
> +}
> +
>  static char *spapr_get_ic_mode(Object *obj, Error **errp)
>  {
>      SpaprMachineState *spapr = SPAPR_MACHINE(obj);
> @@ -3311,6 +3359,10 @@ static void spapr_instance_init(Object *obj)
>                                      stringify(KERNEL_LOAD_ADDR)
>                                      " for -kernel is the default");
>      spapr->kernel_addr = KERNEL_LOAD_ADDR;
> +    object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
> +    object_property_set_description(obj, "x-vof",
> +                                    "Enable Virtual Open Firmware");
> +
>      /* The machine class defines the default interrupt controller mode */
>      spapr->irq = smc->irq;
>      object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
> @@ -4408,6 +4460,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>      XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
>      InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
>      XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc);
> +    ClientArchitectureSupportClass *casc = CLIENT_ARCHITECTURE_SUPPORT_CLASS(oc);
>  
>      mc->desc = "pSeries Logical Partition (PAPR compliant)";
>      mc->ignore_boot_device_suffixes = true;
> @@ -4487,6 +4540,9 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>      smc->smp_threads_vsmt = true;
>      smc->nr_xirqs = SPAPR_NR_XIRQS;
>      xfc->match_nvt = spapr_match_nvt;
> +
> +    casc->cas = spapr_vof_client_architecture_support;
> +    casc->quiesce = spapr_vof_quiesce;
>  }
>  
>  static const TypeInfo spapr_machine_info = {
> @@ -4506,6 +4562,7 @@ static const TypeInfo spapr_machine_info = {
>          { TYPE_XICS_FABRIC },
>          { TYPE_INTERRUPT_STATS_PROVIDER },
>          { TYPE_XIVE_FABRIC },
> +        { TYPE_CLIENT_ARCHITECTURE_SUPPORT },
>          { }
>      },
>  };
> @@ -4974,9 +5031,16 @@ static void spapr_machine_2_1_class_options(MachineClass *mc)
>  }
>  DEFINE_SPAPR_MACHINE(2_1, "2.1", false);
>  
> +static const TypeInfo client_archivecture_support_info = {

s/archivecture/architecture/

> +    .name = TYPE_CLIENT_ARCHITECTURE_SUPPORT,
> +    .parent = TYPE_INTERFACE,
> +    .class_size = sizeof(ClientArchitectureSupportClass),
> +};
> +
>  static void spapr_machine_register_types(void)
>  {
>      type_register_static(&spapr_machine_info);
> +    type_register_static(&client_archivecture_support_info);
>  }
>  
>  type_init(spapr_machine_register_types)
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 7b5cd3553c26..0cdf90af6afb 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -1806,7 +1806,13 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu,
>          spapr_setup_hpt(spapr);
>      }
>  
> -    fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
> +    if (spapr->vof && spapr->vof->initrd_base && spapr->vof->initrd_size) {
> +        /* Update initramdisk location so the right area gets reserved below */
> +        spapr->initrd_base = spapr->vof->initrd_base;
> +        spapr->initrd_size = spapr->vof->initrd_size;
> +    }
> +
> +    fdt = spapr_build_fdt(spapr, spapr->vof != NULL, fdt_bufsize);
>  
>      g_free(spapr->fdt_blob);
>      spapr->fdt_size = fdt_totalsize(fdt);
> @@ -1850,6 +1856,24 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
>      return ret;
>  }
>  
> +target_ulong spapr_vof_client_architecture_support(CPUState *cs,
> +                                                  target_ulong ovec_addr)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +
> +    target_ulong ret = do_client_architecture_support(POWERPC_CPU(cs), spapr,
> +                                                      ovec_addr, FDT_MAX_SIZE);
> +
> +    /*
> +     * This adds stdout and generates phandles for boottime and CAS FDTs.
> +     * It is alright to update the FDT here as do_client_architecture_support()
> +     * does not pack it.
> +     */
> +    spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob);
> +
> +    return ret;
> +}
> +
>  static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
>                                                SpaprMachineState *spapr,
>                                                target_ulong opcode,
> diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c
> new file mode 100644
> index 000000000000..f2978d830da5
> --- /dev/null
> +++ b/hw/ppc/spapr_vof.c
> @@ -0,0 +1,138 @@
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include <sys/ioctl.h>
> +#include "qapi/error.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_vio.h"
> +#include "hw/ppc/fdt.h"
> +#include "sysemu/sysemu.h"
> +#include "qom/qom-qobject.h"
> +#include "trace.h"
> +
> +/* Defined as Big Endian */
> +struct prom_args {
> +    uint32_t service;
> +    uint32_t nargs;
> +    uint32_t nret;
> +    uint32_t args[10];
> +} QEMU_PACKED;
> +
> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                target_ulong opcode, target_ulong *_args)
> +{
> +    target_ulong args_real = ppc64_phys_to_real(_args[0]);
> +    struct prom_args args_be;
> +    uint32_t args[ARRAY_SIZE(args_be.args)];
> +    uint32_t rets[ARRAY_SIZE(args_be.args)] = { 0 }, ret;
> +    char service[64];
> +    unsigned nargs, nret, i;
> +
> +    if (address_space_rw(&address_space_memory, args_real,
> +                         MEMTXATTRS_UNSPECIFIED, &args_be, sizeof(args_be),
> +                         false) != MEMTX_OK) {
> +        return H_HARDWARE;
> +    }
> +    nargs = be32_to_cpu(args_be.nargs);
> +    if (nargs >= ARRAY_SIZE(args_be.args)) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (address_space_rw(&address_space_memory, be32_to_cpu(args_be.service),
> +                         MEMTXATTRS_UNSPECIFIED, service, sizeof(service),
> +                         false) != MEMTX_OK) {
> +        return H_HARDWARE;
> +    }
> +    if (strnlen(service, sizeof(service)) == sizeof(service)) {
> +        /* Too long service name */
> +        return H_PARAMETER;
> +    }
> +
> +    for (i = 0; i < nargs; ++i) {
> +        args[i] = be32_to_cpu(args_be.args[i]);
> +    }
> +
> +    nret = be32_to_cpu(args_be.nret);
> +    ret = vof_client_call(spapr->fdt_blob, spapr->vof, service,
> +                          args, nargs, rets, nret);
> +    if (!nret) {
> +        return H_SUCCESS;
> +    }
> +
> +    args_be.args[nargs] = cpu_to_be32(ret);
> +    for (i = 1; i < nret; ++i) {
> +        args_be.args[nargs + i] = cpu_to_be32(rets[i - 1]);
> +    }
> +
> +    if (address_space_rw(&address_space_memory,
> +                         args_real + offsetof(struct prom_args, args[nargs]),
> +                         MEMTXATTRS_UNSPECIFIED, args_be.args + nargs,
> +                         sizeof(args_be.args[0]) * nret, true) != MEMTX_OK) {
> +        return H_HARDWARE;
> +    }
> +
> +    return H_SUCCESS;
> +}
> +
> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt)
> +{
> +    char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
> +
> +    vof_build_dt(fdt, spapr->vof, spapr->rma_size);
> +
> +    /*
> +     * SLOF-less setup requires an open instance of stdout for early
> +     * kernel printk. By now all phandles are settled so we can open
> +     * the default serial console.
> +     */
> +    if (stdout_path) {
> +        _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", "stdout",
> +                                   stdout_path));
> +    }
> +}
> +
> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
> +                     target_ulong *stack_ptr)
> +{
> +    Vof *vof = spapr->vof;
> +
> +    spapr_vof_client_dt_finalize(spapr, fdt);
> +
> +    *stack_ptr = vof_claim(spapr->fdt_blob, vof, OF_STACK_ADDR, OF_STACK_SIZE,
> +                           OF_STACK_SIZE);
> +    if (*stack_ptr == -1) {
> +        error_report("Memory allocation for stack failed");
> +        exit(1);

Might make things a little cleaner to add an Error ** parameter, and
pass in &error_fatal in the caller.

> +    }
> +    /*
> +     * Stack grows downwards and we also reserve here space for
> +     * the minimum stack frame.
> +     */
> +    *stack_ptr += OF_STACK_SIZE - 0x20;
> +
> +    if (spapr->kernel_size &&
> +        vof_claim(spapr->fdt_blob, vof, spapr->kernel_addr, spapr->kernel_size,
> +                  0) == -1) {
> +        error_report("Memory for kernel is in use");
> +        exit(1);
> +    }
> +
> +    if (spapr->initrd_size &&
> +        vof_claim(spapr->fdt_blob, vof, spapr->initrd_base, spapr->initrd_size,
> +                  0) == -1) {
> +        error_report("Memory for initramdisk is in use");
> +        exit(1);
> +    }
> +
> +    /*
> +     * We skip writing FDT as nothing expects it; OF client interface is
> +     * going to be used for reading the device tree.
> +     */
> +}
> +
> +void spapr_vof_quiesce(void)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +
> +    spapr->fdt_size = fdt_totalsize(spapr->fdt_blob);
> +    spapr->fdt_initial_size = spapr->fdt_size;
> +}
> diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
> new file mode 100644
> index 000000000000..9c76891e668c
> --- /dev/null
> +++ b/hw/ppc/vof.c
> @@ -0,0 +1,864 @@
> +/*
> + * QEMU PowerPC Virtual Open Firmware.
> + *
> + * This implements client interface from OpenFirmware IEEE1275 on the QEMU
> + * side to leave only a very basic firmware in the VM.
> + *
> + * Copyright (c) 2020 IBM Corporation.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include <sys/ioctl.h>
> +#include "exec/ram_addr.h"
> +#include "exec/address-spaces.h"
> +#include "qemu/timer.h"
> +#include "qemu/range.h"
> +#include "hw/ppc/vof.h"
> +#include "hw/ppc/fdt.h"
> +#include "sysemu/runstate.h"
> +#include "qom/qom-qobject.h"
> +#include "trace.h"
> +
> +#include <libfdt.h>
> +
> +/*
> + * OF 1275 "nextprop" description suggests is it 32 bytes max but
> + * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 chars long.
> + */
> +#define OF_PROPNAME_LEN_MAX 64
> +
> +typedef struct {
> +    uint64_t start;
> +    uint64_t size;
> +} OfClaimed;
> +
> +typedef struct {
> +    char *path; /* the path used to open the instance */
> +    uint32_t phandle;
> +} OfInstance;
> +
> +#define VOF_MEM_READ(pa, buf, size) \
> +    address_space_read_full(&address_space_memory, \
> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
> +#define VOF_MEM_WRITE(pa, buf, size) \
> +    address_space_write(&address_space_memory, \
> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
> +
> +static int readstr(hwaddr pa, char *buf, int size)
> +{
> +    if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) {
> +        return -1;
> +    }
> +    if (strnlen(buf, size) == size) {
> +        buf[size - 1] = '\0';
> +        trace_vof_error_str_truncated(buf, size);
> +        return -1;
> +    }
> +    return 0;
> +}
> +
> +static bool cmpservice(const char *s, unsigned nargs, unsigned nret,
> +                       const char *s1, unsigned nargscheck, unsigned nretcheck)
> +{
> +    if (strcmp(s, s1)) {
> +        return false;
> +    }
> +    if ((nargscheck && (nargs != nargscheck)) ||
> +        (nretcheck && (nret != nretcheck))) {
> +        trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret);
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +static void prop_format(char *tval, int tlen, const void *prop, int len)
> +{
> +    int i;
> +    const unsigned char *c;
> +    char *t;
> +    const char bin[] = "...";
> +
> +    for (i = 0, c = prop; i < len; ++i, ++c) {
> +        if (*c == '\0' && i == len - 1) {
> +            strncpy(tval, prop, tlen - 1);
> +            return;
> +        }
> +        if (*c < 0x20 || *c >= 0x80) {
> +            break;
> +        }
> +    }
> +
> +    for (i = 0, c = prop, t = tval; i < len; ++i, ++c) {
> +        if (t >= tval + tlen - sizeof(bin) - 1 - 2 - 1) {
> +            strcpy(t, bin);
> +            return;
> +        }
> +        if (i && i % 4 == 0 && i != len - 1) {
> +            strcat(t, " ");
> +            ++t;
> +        }
> +        t += sprintf(t, "%02X", *c & 0xFF);
> +    }
> +}
> +
> +static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr)
> +{
> +    char fullnode[1024];
> +    uint32_t ret = -1;
> +    int offset;
> +
> +    if (readstr(nodeaddr, fullnode, sizeof(fullnode))) {
> +        return (uint32_t) ret;
> +    }
> +
> +    offset = fdt_path_offset(fdt, fullnode);
> +    if (offset >= 0) {
> +        ret = fdt_get_phandle(fdt, offset);
> +    }
> +    trace_vof_finddevice(fullnode, ret);
> +    return (uint32_t) ret;
> +}
> +
> +static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, uint32_t pname,
> +                            uint32_t valaddr, uint32_t vallen)
> +{
> +    char propname[OF_PROPNAME_LEN_MAX + 1];
> +    uint32_t ret = 0;
> +    int proplen = 0;
> +    const void *prop;
> +    char trval[64] = "";
> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
> +
> +    if (readstr(pname, propname, sizeof(propname))) {
> +        return -1;
> +    }
> +    if (strcmp(propname, "name") == 0) {
> +        prop = fdt_get_name(fdt, nodeoff, &proplen);
> +        proplen += 1;

This isn't quite right, I don't think.  fdt_get_name() returns the
name *including* unit address, but the 'name' property will omit the
unit address.

> +    } else {
> +        prop = fdt_getprop(fdt, nodeoff, propname, &proplen);
> +    }
> +
> +    if (prop) {
> +        int cb = MIN(proplen, vallen);
> +
> +        if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK) {
> +            ret = -1;
> +        } else {
> +            /*
> +             * OF1275 says:
> +             * "Size is either the actual size of the property, or -1 if name
> +             * does not exist", hence returning proplen instead of cb.
> +             */
> +            ret = proplen;
> +            prop_format(trval, sizeof(trval), prop, ret);

It would be nice if we could elide this when tracing isn't enabled
:/.  Guess that can be a later optimization, though.

> +        }
> +    } else {
> +        ret = -1;
> +    }
> +    trace_vof_getprop(nodeph, propname, ret, trval);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, uint32_t pname)
> +{
> +    char propname[OF_PROPNAME_LEN_MAX + 1];
> +    uint32_t ret = 0;
> +    int proplen = 0;
> +    const void *prop;
> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
> +
> +    if (readstr(pname, propname, sizeof(propname))) {
> +        return -1;
> +    }
> +    if (strcmp(propname, "name") == 0) {
> +        prop = fdt_get_name(fdt, nodeoff, &proplen);
> +        proplen += 1;
> +    } else {
> +        prop = fdt_getprop(fdt, nodeoff, propname, &proplen);
> +    }
> +
> +    if (prop) {
> +        ret = proplen;
> +    } else {
> +        ret = -1;
> +    }
> +    trace_vof_getproplen(nodeph, propname, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_setprop(void *fdt, Vof *vof,
> +                            uint32_t nodeph, uint32_t pname,
> +                            uint32_t valaddr, uint32_t vallen)
> +{
> +    char propname[OF_PROPNAME_LEN_MAX + 1];
> +    uint32_t ret = -1;
> +    int offset;
> +    char trval[64] = "";
> +
> +    if (readstr(pname, propname, sizeof(propname))) {
> +        return -1;
> +    }
> +    /*
> +     * We only allow changing properties which we know how to update in QEMU
> +     * OR
> +     * the ones which we know that they need to survive during "quiesce".
> +     */

Should we be checking the node these are being written to as well?  I
think they're all in /chosen.

> +    if (vallen == sizeof(uint32_t)) {
> +        uint32_t val32 = ldl_be_phys(first_cpu->as, valaddr);
> +
> +        if ((strcmp(propname, "linux,rtas-base") == 0) ||
> +            (strcmp(propname, "linux,rtas-entry") == 0)) {
> +            /* These need to survive quiesce so let them store in the FDT */
> +        } else if (strcmp(propname, "linux,initrd-start") == 0) {
> +            vof->initrd_base = val32;
> +        } else if (strcmp(propname, "linux,initrd-end") == 0) {
> +            vof->initrd_size = val32 - vof->initrd_base;
> +        } else {
> +            goto trace_exit;
> +        }
> +    } else if (vallen == sizeof(uint64_t)) {
> +        uint64_t val64 = ldq_be_phys(first_cpu->as, valaddr);
> +
> +        if (strcmp(propname, "linux,initrd-start") == 0) {
> +            vof->initrd_base = val64;
> +        } else if (strcmp(propname, "linux,initrd-end") == 0) {
> +            vof->initrd_size = val64 - vof->initrd_base;
> +        } else {
> +            goto trace_exit;
> +        }
> +    } else if (strcmp(propname, "bootargs") == 0) {
> +        char val[1024];
> +
> +        if (readstr(valaddr, val, sizeof(val))) {
> +            goto trace_exit;
> +        }
> +        g_free(vof->bootargs);
> +        vof->bootargs = g_strdup(val);

We should probably truncate vallen to the size of the loaded string,
yes?

> +    } else {
> +        goto trace_exit;
> +    }
> +
> +    offset = fdt_node_offset_by_phandle(fdt, nodeph);
> +    if (offset >= 0) {
> +        uint8_t data[vallen];

We probably want some limit on vallen so the guest can't force
allocation of arbitrary amounts of qemu stack space.

> +
> +        if ((VOF_MEM_READ(valaddr, data, vallen) == MEMTX_OK) &&
> +            !fdt_setprop(fdt, offset, propname, data, vallen)) {
> +            ret = vallen;
> +            prop_format(trval, sizeof(trval), data, ret);
> +        }
> +    }
> +
> +trace_exit:
> +    trace_vof_setprop(nodeph, propname, trval, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_nextprop(const void *fdt, uint32_t phandle,
> +                             uint32_t prevaddr, uint32_t nameaddr)
> +{
> +    int offset = fdt_node_offset_by_phandle(fdt, phandle);
> +    char prev[OF_PROPNAME_LEN_MAX + 1];
> +    const char *tmp;
> +
> +    if (readstr(prevaddr, prev, sizeof(prev))) {
> +        return -1;
> +    }
> +    for (offset = fdt_first_property_offset(fdt, offset);
> +         offset >= 0;
> +         offset = fdt_next_property_offset(fdt, offset)) {

I think you can use libfdt's for_each_property_offset macro here.

> +
> +        if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
> +            return 0;
> +        }
> +        if (prev[0] == '\0' || strcmp(prev, tmp) == 0) {
> +            if (prev[0] != '\0') {
> +                offset = fdt_next_property_offset(fdt, offset);
> +                if (offset < 0) {
> +                    return 0;
> +                }
> +            }
> +            if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
> +                return 0;
> +            }
> +
> +            if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != MEMTX_OK) {
> +                return -1;
> +            }
> +            return 1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static uint32_t vof_peer(const void *fdt, uint32_t phandle)
> +{
> +    int ret;
> +
> +    if (phandle == 0) {
> +        ret = fdt_path_offset(fdt, "/");
> +    } else {
> +        ret = fdt_next_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle));
> +    }
> +
> +    if (ret < 0) {
> +        ret = 0;
> +    } else {
> +        ret = fdt_get_phandle(fdt, ret);
> +    }
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_child(const void *fdt, uint32_t phandle)
> +{
> +    int ret = fdt_first_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle));
> +
> +    if (ret < 0) {
> +        ret = 0;
> +    } else {
> +        ret = fdt_get_phandle(fdt, ret);
> +    }
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_parent(const void *fdt, uint32_t phandle)
> +{
> +    int ret = fdt_parent_offset(fdt, fdt_node_offset_by_phandle(fdt, phandle));
> +
> +    if (ret < 0) {
> +        ret = 0;
> +    } else {
> +        ret = fdt_get_phandle(fdt, ret);
> +    }
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_do_open(void *fdt, Vof *vof, const char *path)
> +{
> +    int offset;
> +    uint32_t ret = 0;
> +    OfInstance *inst = NULL;
> +
> +    if (vof->of_instance_last == 0xFFFFFFFF) {
> +        /* We do not recycle ihandles yet */
> +        goto trace_exit;
> +    }
> +
> +    offset = fdt_path_offset(fdt, path);
> +    if (offset < 0) {
> +        trace_vof_error_unknown_path(path);
> +        goto trace_exit;
> +    }
> +
> +    inst = g_new0(OfInstance, 1);
> +    inst->phandle = fdt_get_phandle(fdt, offset);
> +    g_assert(inst->phandle);
> +    ++vof->of_instance_last;
> +
> +    inst->path = g_strdup(path);
> +    g_hash_table_insert(vof->of_instances,
> +                        GINT_TO_POINTER(vof->of_instance_last),
> +                        inst);
> +    ret = vof->of_instance_last;
> +
> +trace_exit:
> +    trace_vof_open(path, inst ? inst->phandle : 0, ret);
> +
> +    return ret;
> +}
> +
> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
> +                               const char *prop, const char *path)
> +{
> +    int node = fdt_path_offset(fdt, nodename);
> +    uint32_t inst = vof_do_open(fdt, vof, path);
> +
> +    return fdt_setprop_cell(fdt, node, prop, inst);
> +}
> +
> +static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr)
> +{
> +    char path[256];
> +
> +    if (readstr(pathaddr, path, sizeof(path))) {
> +        return -1;
> +    }
> +
> +    return vof_do_open(fdt, vof, path);
> +}
> +
> +static void vof_close(Vof *vof, uint32_t ihandle)
> +{
> +    if (!g_hash_table_remove(vof->of_instances, GINT_TO_POINTER(ihandle))) {
> +        trace_vof_error_unknown_ihandle_close(ihandle);
> +    }
> +}
> +
> +static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle)
> +{
> +    gpointer instp = g_hash_table_lookup(vof->of_instances,
> +                                         GINT_TO_POINTER(ihandle));
> +    uint32_t ret = -1;
> +
> +    if (instp) {
> +        ret = ((OfInstance *)instp)->phandle;
> +    }
> +    trace_vof_instance_to_package(ihandle, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle,
> +                                    uint32_t buf, uint32_t len)
> +{
> +    uint32_t ret = -1;
> +    char tmp[256] = "";
> +
> +    if (!fdt_get_path(fdt, fdt_node_offset_by_phandle(fdt, phandle), tmp,
> +                      sizeof(tmp))) {
> +        tmp[sizeof(tmp) - 1] = 0;
> +        ret = MIN(len, strlen(tmp) + 1);
> +        if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
> +            ret = -1;
> +        }
> +    }
> +
> +    trace_vof_package_to_path(phandle, tmp, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t ihandle,
> +                                     uint32_t buf, uint32_t len)
> +{
> +    uint32_t ret = -1;
> +    uint32_t phandle = vof_instance_to_package(vof, ihandle);
> +    char tmp[256] = "";


There are a bunch of these hardcoded 256 values, and in other places
1024.  Probably worth defining a VOF_PATH_MAX or whatever you want to
call it for consistency.

> +
> +    if (phandle != -1) {
> +        if (!fdt_get_path(fdt, fdt_node_offset_by_phandle(fdt, phandle),
> +                          tmp, sizeof(tmp))) {
> +            tmp[sizeof(tmp) - 1] = 0;
> +            ret = MIN(len, strlen(tmp) + 1);
> +            if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
> +                ret = -1;
> +            }
> +        }
> +    }
> +    trace_vof_instance_to_path(ihandle, phandle, tmp, ret);
> +
> +    return ret;
> +}
> +
> +static void vof_claimed_dump(GArray *claimed)
> +{
> +#ifdef DEBUG
> +    int i;
> +    OfClaimed c;
> +
> +    for (i = 0; i < claimed->len; ++i) {
> +        c = g_array_index(claimed, OfClaimed, i);
> +        error_printf("CLAIMED %lx..%lx size=%ld\n", c.start, c.start + c.size,
> +                     c.size);
> +    }
> +#endif
> +}
> +
> +static bool vof_claim_avail(GArray *claimed, uint64_t virt, uint64_t size)
> +{
> +    int i;
> +    OfClaimed c;
> +
> +    for (i = 0; i < claimed->len; ++i) {
> +        c = g_array_index(claimed, OfClaimed, i);
> +        if (ranges_overlap(c.start, c.size, virt, size)) {
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}
> +
> +static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t size)
> +{
> +    OfClaimed newclaim;
> +
> +    newclaim.start = virt;
> +    newclaim.size = size;
> +    g_array_append_val(claimed, newclaim);
> +}
> +
> +static gint of_claimed_compare_func(gconstpointer a, gconstpointer b)
> +{
> +    return ((OfClaimed *)a)->start - ((OfClaimed *)b)->start;
> +}
> +
> +static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base)
> +{
> +    int i, n, offset, proplen = 0;
> +    uint64_t *mem0_reg;
> +    struct { uint64_t start, size; } *avail;
> +
> +    if (!fdt || !claimed) {
> +        return;
> +    }
> +
> +    offset = fdt_path_offset(fdt, "/memory@0");
> +    _FDT(offset);
> +
> +    mem0_reg = (uint64_t *) fdt_getprop(fdt, offset, "reg", &proplen);
> +    if (!mem0_reg || proplen != 2 * sizeof(uint64_t)) {
> +        return;
> +    }
> +
> +    g_array_sort(claimed, of_claimed_compare_func);
> +    vof_claimed_dump(claimed);
> +
> +    avail = g_malloc0(sizeof(uint64_t) * 2 * claimed->len);

Using sizeof(avail[0]) * claimed->len would make this a little bit
more robust.

> +    for (i = 0, n = 0; i < claimed->len; ++i) {
> +        OfClaimed c = g_array_index(claimed, OfClaimed, i);
> +
> +        avail[n].start = c.start + c.size;
> +        if (i < claimed->len - 1) {
> +            OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1);
> +
> +            avail[n].size = cn.start - avail[n].start;
> +        } else {
> +            avail[n].size = be64_to_cpu(mem0_reg[1]) - avail[n].start;
> +        }
> +
> +        if (avail[n].size) {
> +#ifdef DEBUG
> +            error_printf("AVAIL %lx..%lx size=%ld\n", avail[n].start,
> +                         avail[n].start + avail[n].size, avail[n].size);
> +#endif
> +            avail[n].start = cpu_to_be64(avail[n].start);
> +            avail[n].size = cpu_to_be64(avail[n].size);

I'd prefer to do the endian switches as you first write to the array.

> +            ++n;
> +        }
> +    }
> +    _FDT((fdt_setprop(fdt, offset, "available", avail,
> +                      sizeof(uint64_t) * 2 * n)));
> +    g_free(avail);
> +}
> +
> +/*
> + * OF1275:
> + * "Allocates size bytes of memory. If align is zero, the allocated range
> + * begins at the virtual address virt. Otherwise, an aligned address is
> + * automatically chosen and the input argument virt is ignored".
> + *
> + * In other words, exactly one of @virt and @align is non-zero.
> + */
> +uint64_t vof_claim(void *fdt, Vof *vof, uint64_t virt, uint64_t size,
> +                   uint64_t align)
> +{
> +    uint64_t ret;
> +
> +    if (size == 0) {
> +        ret = -1;
> +    } else if (align == 0) {
> +        if (!vof_claim_avail(vof->claimed, virt, size)) {
> +            ret = -1;
> +        } else {
> +            ret = virt;
> +        }
> +    } else {
> +        vof->claimed_base = QEMU_ALIGN_UP(vof->claimed_base, align);
> +        while (1) {
> +            if (vof->claimed_base >= vof->top_addr) {
> +                error_report("Out of RMA memory for the OF client");
> +                return -1;
> +            }
> +            if (vof_claim_avail(vof->claimed, vof->claimed_base, size)) {
> +                break;
> +            }
> +            vof->claimed_base += size;
> +        }
> +        ret = vof->claimed_base;
> +    }
> +
> +    if (ret != -1) {
> +        vof->claimed_base = MAX(vof->claimed_base, ret + size);
> +        vof_claim_add(vof->claimed, ret, size);
> +        /* The client reads "/memory@0/available" to know where it can claim */
> +        vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
> +    }
> +    trace_vof_claim(virt, size, align, ret);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_release(void *fdt, Vof *vof, uint64_t virt, uint64_t size)
> +{
> +    uint32_t ret = -1;
> +    int i;
> +    GArray *claimed = vof->claimed;
> +    OfClaimed c;
> +
> +    for (i = 0; i < claimed->len; ++i) {
> +        c = g_array_index(claimed, OfClaimed, i);
> +        if (c.start == virt && c.size == size) {
> +            g_array_remove_index(claimed, i);
> +            vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
> +            ret = 0;
> +            break;
> +        }
> +    }
> +
> +    trace_vof_release(virt, size, ret);
> +
> +    return ret;
> +}
> +
> +static void vof_instantiate_rtas(void)
> +{
> +    error_report("The firmware should have instantiated RTAS");
> +    exit(1);
> +}
> +
> +static uint32_t vof_call_method(Vof *vof, uint32_t methodaddr,
> +                                uint32_t ihandle,
> +                                uint32_t param1, uint32_t param2,
> +                                uint32_t param3, uint32_t param4,
> +                                uint32_t *ret2)
> +{
> +    uint32_t ret = -1;
> +    char method[256] = "";
> +    OfInstance *inst;
> +
> +    if (!ihandle) {
> +        goto trace_exit;
> +    }
> +
> +    inst = (OfInstance *) g_hash_table_lookup(vof->of_instances,
> +                                              GINT_TO_POINTER(ihandle));
> +    if (!inst) {
> +        goto trace_exit;
> +    }
> +
> +    if (readstr(methodaddr, method, sizeof(method))) {
> +        goto trace_exit;
> +    }
> +
> +    if (strcmp(inst->path, "/") == 0) {
> +        if (strcmp(method, "ibm,client-architecture-support") == 0) {
> +            Object *cas_if = object_dynamic_cast(
> +                    qdev_get_machine(), TYPE_CLIENT_ARCHITECTURE_SUPPORT);
> +
> +            if (cas_if) {
> +                ClientArchitectureSupportClass *casc =
> +                    CLIENT_ARCHITECTURE_SUPPORT_GET_CLASS(cas_if);
> +
> +                ret = casc->cas(first_cpu, param1);
> +            }
> +
> +            *ret2 = 0;
> +        }
> +    } else if (strcmp(inst->path, "/rtas") == 0) {
> +        if (strcmp(method, "instantiate-rtas") == 0) {
> +            vof_instantiate_rtas();
> +            ret = 0;
> +            *ret2 = param1; /* rtas-base */
> +        }
> +    } else {
> +        trace_vof_error_unknown_method(method);
> +    }
> +
> +trace_exit:
> +    trace_vof_method(ihandle, method, param1, ret, *ret2);
> +
> +    return ret;
> +}
> +
> +static uint32_t vof_call_interpret(uint32_t cmdaddr, uint32_t param1,
> +                                   uint32_t param2, uint32_t *ret2)
> +{
> +    uint32_t ret = -1;
> +    char cmd[256] = "";
> +
> +    /* No interpret implemented */
> +    readstr(cmdaddr, cmd, sizeof(cmd));
> +    trace_vof_interpret(cmd, param1, param2, ret, *ret2);
> +
> +    return ret;
> +}
> +
> +static void vof_quiesce(void *fdt, Vof *vof)
> +{
> +    Object *cas_if = object_dynamic_cast(
> +        qdev_get_machine(), TYPE_CLIENT_ARCHITECTURE_SUPPORT);
> +
> +    int rc = fdt_pack(fdt);
> +
> +    assert(rc == 0);
> +
> +    if (cas_if) {
> +        ClientArchitectureSupportClass *casc =
> +            CLIENT_ARCHITECTURE_SUPPORT_GET_CLASS(cas_if);
> +
> +        casc->quiesce();
> +    }
> +
> +    vof_claimed_dump(vof->claimed);
> +}
> +
> +uint32_t vof_client_call(void *fdt, Vof *vof, const char *service,
> +                         uint32_t *args, unsigned nargs,
> +                         uint32_t *rets, unsigned nrets)
> +{
> +    uint32_t ret = 0;
> +
> +    /* @nrets includes the value which this function returns */
> +#define cmpserv(s, a, r) \
> +    cmpservice(service, nargs, nrets, (s), (a), (r))
> +
> +    if (cmpserv("finddevice", 1, 1)) {
> +        ret = vof_finddevice(fdt, args[0]);
> +    } else if (cmpserv("getprop", 4, 1)) {
> +        ret = vof_getprop(fdt, args[0], args[1], args[2], args[3]);
> +    } else if (cmpserv("getproplen", 2, 1)) {
> +        ret = vof_getproplen(fdt, args[0], args[1]);
> +    } else if (cmpserv("setprop", 4, 1)) {
> +        ret = vof_setprop(fdt, vof, args[0], args[1], args[2], args[3]);
> +    } else if (cmpserv("nextprop", 3, 1)) {
> +        ret = vof_nextprop(fdt, args[0], args[1], args[2]);
> +    } else if (cmpserv("peer", 1, 1)) {
> +        ret = vof_peer(fdt, args[0]);
> +    } else if (cmpserv("child", 1, 1)) {
> +        ret = vof_child(fdt, args[0]);
> +    } else if (cmpserv("parent", 1, 1)) {
> +        ret = vof_parent(fdt, args[0]);
> +    } else if (cmpserv("open", 1, 1)) {
> +        ret = vof_open(fdt, vof, args[0]);
> +    } else if (cmpserv("close", 1, 0)) {
> +        vof_close(vof, args[0]);
> +    } else if (cmpserv("instance-to-package", 1, 1)) {
> +        ret = vof_instance_to_package(vof, args[0]);
> +    } else if (cmpserv("package-to-path", 3, 1)) {
> +        ret = vof_package_to_path(fdt, args[0], args[1], args[2]);
> +    } else if (cmpserv("instance-to-path", 3, 1)) {
> +        ret = vof_instance_to_path(fdt, vof, args[0], args[1], args[2]);
> +    } else if (cmpserv("claim", 3, 1)) {
> +        ret = vof_claim(fdt, vof, args[0], args[1], args[2]);
> +    } else if (cmpserv("release", 2, 0)) {
> +        ret = vof_release(fdt, vof, args[0], args[1]);
> +    } else if (cmpserv("call-method", 0, 0)) {
> +        ret = vof_call_method(vof, args[0], args[1], args[2], args[3], args[4],
> +                              args[5], rets);
> +    } else if (cmpserv("interpret", 0, 0)) {
> +        ret = vof_call_interpret(args[0], args[1], args[2], rets);
> +    } else if (cmpserv("milliseconds", 0, 1)) {
> +        ret = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> +    } else if (cmpserv("quiesce", 0, 0)) {
> +        vof_quiesce(fdt, vof);
> +    } else if (cmpserv("exit", 0, 0)) {
> +        error_report("Stopped as the VM requested \"exit\"");
> +        vm_stop(RUN_STATE_PAUSED); /* Or qemu_system_guest_panicked(NULL); ? */
> +    } else {
> +        trace_vof_error_unknown_service(service, nargs, nrets);
> +        ret = -1;
> +    }
> +
> +    return ret;
> +}
> +
> +static void of_instance_free(gpointer data)
> +{
> +    OfInstance *inst = (OfInstance *) data;
> +
> +    g_free(inst->path);
> +    g_free(inst);
> +}
> +
> +void vof_cleanup(Vof *vof)
> +{
> +    if (vof->claimed) {
> +        g_array_unref(vof->claimed);
> +    }
> +    if (vof->of_instances) {
> +        g_hash_table_unref(vof->of_instances);
> +    }
> +}
> +
> +void vof_build_dt(void *fdt, Vof *vof, uint32_t top_addr)
> +{
> +    uint32_t phandle;
> +    int i, offset, proplen = 0;
> +    const void *prop;
> +    bool found = false;
> +    GArray *phandles = g_array_new(false, false, sizeof(uint32_t));
> +
> +    vof_cleanup(vof);
> +
> +    vof->claimed = g_array_new(false, false, sizeof(OfClaimed));
> +    vof->of_instances = g_hash_table_new_full(g_direct_hash, g_direct_equal,
> +                                              NULL, of_instance_free);
> +    vof->top_addr = top_addr;
> +
> +    /* Find all predefined phandles */
> +    for (offset = fdt_next_node(fdt, -1, NULL);
> +         offset >= 0;
> +         offset = fdt_next_node(fdt, offset, NULL)) {
> +        prop = fdt_getprop(fdt, offset, "phandle", &proplen);
> +        if (prop && proplen == sizeof(uint32_t)) {
> +            phandle = fdt32_ld(prop);
> +            g_array_append_val(phandles, phandle);
> +        }
> +    }
> +
> +    /* Assign phandles skipping the predefined ones */
> +    for (offset = fdt_next_node(fdt, -1, NULL), phandle = 1;
> +         offset >= 0;
> +         offset = fdt_next_node(fdt, offset, NULL), ++phandle) {
> +
> +        prop = fdt_getprop(fdt, offset, "phandle", &proplen);
> +        if (prop) {
> +            continue;
> +        }
> +        /* Check if the current phandle is not allocated already */
> +        for ( ; ; ++phandle) {
> +            for (i = 0, found = false; i < phandles->len; ++i) {
> +                if (phandle == g_array_index(phandles, uint32_t, i)) {
> +                    found = true;
> +                    break;
> +                }
> +            }
> +            if (!found) {
> +                break;
> +            }
> +        }
> +        _FDT(fdt_setprop_cell(fdt, offset, "phandle", phandle));
> +    }
> +    g_array_unref(phandles);
> +
> +    vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);

I still think using max_phandle would be simpler and therefore
preferable, but whatever.

> +}
> diff --git a/pc-bios/vof/bootmem.c b/pc-bios/vof/bootmem.c
> new file mode 100644
> index 000000000000..de7d5fc76431
> --- /dev/null
> +++ b/pc-bios/vof/bootmem.c
> @@ -0,0 +1,13 @@
> +#include "vof.h"
> +
> +void boot_from_memory(uint64_t initrd, uint64_t initrdsize)
> +{
> +	uint64_t kern[2];
> +	phandle chosen = ci_finddevice("/chosen");
> +
> +	if (ci_getprop(chosen, "qemu,boot-kernel", kern, sizeof(kern)) !=
> +			sizeof(kern))
> +		return;
> +
> +	do_boot(kern[0], initrd, initrdsize);
> +}
> diff --git a/pc-bios/vof/ci.c b/pc-bios/vof/ci.c
> new file mode 100644
> index 000000000000..4880b3d2047c
> --- /dev/null
> +++ b/pc-bios/vof/ci.c
> @@ -0,0 +1,108 @@
> +#include "vof.h"
> +
> +struct prom_args {
> +        uint32_t service;
> +        uint32_t nargs;
> +        uint32_t nret;
> +        uint32_t args[10];
> +};
> +
> +#define ADDR(x) (uint32_t)(x)
> +
> +extern uint32_t ci_entry(uint32_t params);
> +
> +extern unsigned long hv_rtas(unsigned long params);
> +extern unsigned int hv_rtas_size;
> +
> +bool prom_handle(struct prom_args *pargs)
> +{
> +	void *rtasbase;
> +	uint32_t rtassize = 0;
> +	phandle rtas;
> +
> +	if (strcmp("call-method", (void *)(unsigned long) pargs->service))
> +		return false;
> +
> +	if (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0]))
> +		return false;
> +
> +	rtas = ci_finddevice("/rtas");
> +	ci_getprop(rtas, "rtas-size", &rtassize, sizeof(rtassize));

Why do we need this?  IIRC the "rtas-size" property is a qemu
extension we used to use for the interaction between SLOF and qemu -
can't we just use hv_rtas_size directly for the VOF case?

> +	if (rtassize < hv_rtas_size)
> +		return false;
> +
> +	rtasbase = (void *)(unsigned long) pargs->args[2];
> +
> +	memcpy(rtasbase, hv_rtas, hv_rtas_size);
> +	pargs->args[pargs->nargs] = 0;
> +	pargs->args[pargs->nargs + 1] = pargs->args[2];
> +
> +	return true;
> +}
> +
> +void prom_entry(uint32_t args)
> +{
> +	if (!prom_handle((void *)(unsigned long) args))
> +		ci_entry(args);
> +}
> +
> +int call_prom(const char *service, int nargs, int nret, ...)
> +{
> +        int i;
> +        struct prom_args args;
> +        va_list list;
> +
> +        args.service = ADDR(service);
> +        args.nargs = nargs;
> +        args.nret = nret;
> +
> +        va_start(list, nret);
> +        for (i = 0; i < nargs; i++)
> +                args.args[i] = va_arg(list, prom_arg_t);
> +        va_end(list);
> +
> +        for (i = 0; i < nret; i++)
> +                args.args[nargs+i] = 0;
> +
> +        if (ci_entry((uint32_t)(&args)) < 0)
> +                return PROM_ERROR;
> +
> +        return (nret > 0) ? args.args[nargs] : 0;
> +}
> +
> +void ci_panic(const char *str)
> +{
> +	call_prom("exit", 0, 0);
> +}
> +
> +phandle ci_finddevice(const char *path)
> +{
> +	return call_prom("finddevice", 1, 1, path);
> +}
> +
> +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len)
> +{
> +	return call_prom("getprop", 4, 1, ph, propname, prop, len);
> +}
> +
> +ihandle ci_open(const char *path)
> +{
> +	return call_prom("open", 1, 1, path);
> +}
> +
> +void ci_close(ihandle ih)
> +{
> +	call_prom("close", 1, 0, ih);
> +}
> +
> +void *ci_claim(void *virt, uint32_t size, uint32_t align)
> +{
> +	uint32_t ret = call_prom("claim", 3, 1, ADDR(virt), size, align);
> +
> +	return (void *) (unsigned long) ret;
> +}
> +
> +uint32_t ci_release(void *virt, uint32_t size)
> +{
> +	return call_prom("release", 2, 1, ADDR(virt), size);
> +}
> diff --git a/pc-bios/vof/libc.c b/pc-bios/vof/libc.c
> new file mode 100644
> index 000000000000..8603aedcb32c
> --- /dev/null
> +++ b/pc-bios/vof/libc.c
> @@ -0,0 +1,91 @@
> +#include "vof.h"
> +
> +int strlen(const char *s)
> +{
> +	int len = 0;
> +
> +	while (*s != 0) {
> +		len += 1;
> +		s += 1;
> +	}
> +
> +	return len;
> +}
> +
> +int strcmp(const char *s1, const char *s2)
> +{
> +        while (*s1 != 0 && *s2 != 0) {
> +                if (*s1 != *s2)
> +                        break;
> +                s1 += 1;
> +                s2 += 1;
> +        }
> +
> +        return *s1 - *s2;
> +}
> +
> +void *memcpy(void *dest, const void *src, size_t n)
> +{
> +        char *cdest;
> +        const char *csrc = src;
> +
> +        cdest = dest;
> +        while (n-- > 0) {
> +                *cdest++ = *csrc++;
> +        }
> +
> +        return dest;
> +}
> +
> +int memcmp(const void *ptr1, const void *ptr2, size_t n)
> +{
> +        const unsigned char *p1 = ptr1;
> +        const unsigned char *p2 = ptr2;
> +
> +        while (n-- > 0) {
> +                if (*p1 != *p2)
> +                        return (*p1 - *p2);
> +                p1 += 1;
> +                p2 += 1;
> +        }
> +
> +        return 0;
> +}

I believe there are gcc builtins for a number of these - could you use
those rather than having to open code them?

> +void *memmove(void *dest, const void *src, size_t n)
> +{
> +        char *cdest;
> +        const char *csrc;
> +        int i;
> +
> +        /* Do the buffers overlap in a bad way? */
> +        if (src < dest && src + n >= dest) {
> +                /* Copy from end to start */
> +                cdest = dest + n - 1;
> +                csrc = src + n - 1;
> +                for (i = 0; i < n; i++) {
> +                        *cdest-- = *csrc--;
> +                }
> +        }
> +        else {
> +                /* Normal copy is possible */
> +                cdest = dest;
> +                csrc = src;
> +                for (i = 0; i < n; i++) {
> +                        *cdest++ = *csrc++;
> +                }
> +        }
> +
> +        return dest;
> +}
> +
> +void *memset(void *dest, int c, size_t size)
> +{
> +        unsigned char *d = (unsigned char *)dest;
> +
> +        while (size-- > 0) {
> +                *d++ = (unsigned char)c;
> +        }
> +
> +        return dest;
> +}
> diff --git a/pc-bios/vof/main.c b/pc-bios/vof/main.c
> new file mode 100644
> index 000000000000..34299a9cc5ad
> --- /dev/null
> +++ b/pc-bios/vof/main.c
> @@ -0,0 +1,22 @@
> +#include "vof.h"
> +
> +
> +void do_boot(unsigned long addr, unsigned long _r3, unsigned long _r4)
> +{
> +	register unsigned long r3 __asm__("r3") = _r3;
> +	register unsigned long r4 __asm__("r4") = _r4;
> +	register unsigned long r5 __asm__("r5") = (unsigned long) _prom_entry;
> +
> +	((client *)(uint32_t)addr)();
> +}
> +
> +void entry_c(void)
> +{
> +	register unsigned long r3 __asm__("r3");
> +	register unsigned long r4 __asm__("r4");
> +	register unsigned long r5 __asm__("r5");
> +	uint64_t initrd = r3, initrdsize = r4;
> +
> +	boot_from_memory(initrd, initrdsize);
> +	ci_panic("*** No boot target ***\n");
> +}
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 8201f12271b7..469b76b36b2a 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1335,6 +1335,17 @@ F: pc-bios/canyonlands.dt[sb]
>  F: pc-bios/u-boot-sam460ex-20100605.bin
>  F: roms/u-boot-sam460ex
>  
> +VOF

I'd expand this to Virtual Open Firmware, which at least gives a very
faint idea of what it is.

> +M: Alexey Kardashevskiy <aik@ozlabs.ru>
> +M: David Gibson <david@gibson.dropbear.id.au>
> +M: Greg Kurz <groug@kaod.org>
> +L: qemu-ppc@nongnu.org
> +S: Maintained
> +F: hw/ppc/spapr_vof*
> +F: hw/ppc/vof*
> +F: pc-bios/vof/*
> +F: pc-bios/vof*
> +
>  RISC-V Machines
>  ---------------
>  OpenTitan
> diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build
> index 218631c883be..24427d3f51c1 100644
> --- a/hw/ppc/meson.build
> +++ b/hw/ppc/meson.build
> @@ -28,6 +28,8 @@ ppc_ss.add(when: 'CONFIG_PSERIES', if_true: files(
>    'spapr_rtas_ddw.c',
>    'spapr_numa.c',
>    'pef.c',
> +  'spapr_vof.c',
> +  'vof.c',
>  ))
>  ppc_ss.add(when: 'CONFIG_SPAPR_RNG', if_true: files('spapr_rng.c'))
>  ppc_ss.add(when: ['CONFIG_PSERIES', 'CONFIG_LINUX'], if_true: files(
> diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
> index 1e91984526a3..017c48624f96 100644
> --- a/hw/ppc/trace-events
> +++ b/hw/ppc/trace-events
> @@ -71,6 +71,27 @@ spapr_rtas_ibm_configure_connector_invalid(uint32_t index) "DRC index: 0x%"PRIx3
>  spapr_vio_h_reg_crq(uint64_t reg, uint64_t queue_addr, uint64_t queue_len) "CRQ for dev 0x%" PRIx64 " registered at 0x%" PRIx64 "/0x%" PRIx64
>  spapr_vio_free_crq(uint32_t reg) "CRQ for dev 0x%" PRIx32 " freed"
>  
> +# vof.c
> +vof_error_str_truncated(const char *s, int len) "%s truncated to %d"
> +vof_error_param(const char *method, int nargscheck, int nretcheck, int nargs, int nret) "%s takes/returns %d/%d, not %d/%d"
> +vof_error_unknown_service(const char *service, int nargs, int nret) "\"%s\" args=%d rets=%d"
> +vof_error_unknown_method(const char *method) "\"%s\""
> +vof_error_unknown_ihandle_close(uint32_t ih) "ih=0x%x"
> +vof_error_unknown_path(const char *path) "\"%s\""
> +vof_finddevice(const char *path, uint32_t ph) "\"%s\" => ph=0x%x"
> +vof_canon(const char *path) "\"%s\""
> +vof_claim(uint32_t virt, uint32_t size, uint32_t align, uint32_t ret) "virt=0x%x size=0x%x align=0x%x => 0x%x"
> +vof_release(uint32_t virt, uint32_t size, uint32_t ret) "virt=0x%x size=0x%x => 0x%x"
> +vof_method(uint32_t ihandle, const char *method, uint32_t param, uint32_t ret, uint32_t ret2) "ih=0x%x \"%s\"(0x%x) => 0x%x 0x%x"
> +vof_getprop(uint32_t ph, const char *prop, uint32_t ret, const char *val) "ph=0x%x \"%s\" => len=%d [%s]"
> +vof_getproplen(uint32_t ph, const char *prop, uint32_t ret) "ph=0x%x \"%s\" => len=%d"
> +vof_setprop(uint32_t ph, const char *prop, const char *val, uint32_t ret) "ph=0x%x \"%s\" [%s] => len=%d"
> +vof_open(const char *path, uint32_t ph, uint32_t ih) "%s ph=0x%x => ih=0x%x"
> +vof_interpret(const char *cmd, uint32_t param1, uint32_t param2, uint32_t ret, uint32_t ret2) "[%s] 0x%x 0x%x => 0x%x 0x%x"
> +vof_package_to_path(uint32_t ph, const char *tmp, uint32_t ret) "ph=0x%x => %s len=%d"
> +vof_instance_to_path(uint32_t ih, uint32_t ph, const char *tmp, uint32_t ret) "ih=0x%x ph=0x%x => %s len=%d"
> +vof_instance_to_package(uint32_t ih, uint32_t ph) "ih=0x%x => ph=0x%x"
> +
>  # ppc.c
>  ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
>  
> diff --git a/pc-bios/README b/pc-bios/README
> index db7129ef6484..176587da8ea5 100644
> --- a/pc-bios/README
> +++ b/pc-bios/README
> @@ -16,6 +16,8 @@
>    https://github.com/aik/SLOF, and the image currently in qemu is
>    built from git tag qemu-slof-20200717.
>  
> +- vof is a minimalistic firmware to work with -machine pseries,x-vof=on.
> +
>  - sgabios (the Serial Graphics Adapter option ROM) provides a means for
>    legacy x86 software to communicate with an attached serial console as
>    if a video card were attached.  The master sources reside in a subversion
> diff --git a/pc-bios/vof.bin b/pc-bios/vof.bin
> new file mode 100755
> index 0000000000000000000000000000000000000000..0606d9451c6bff39b32879c2a3369406a6a0d07d
> GIT binary patch
> literal 3680
> zcmd^BUuauZ82{2_+ue2@_aP6FMSANxX==+V^r6wxq_aZW%u<<!iXBNzt}d80wap}x
> z#N4z}S8<TCJvivU2Pyj2hY5XKg&_s^u!ji|bnD&*d)b3vWu4dGckj6_{g=h<(ZJ!H
> zob#RU_kF+b&$&eZ?_Xe@C~goH)*Xq?AW`(>B?=MUwAULA#>}I8bhdE6ZYf0j&qP62
> z;l6X}0rol?XtZA^HJ3P=?ZiPe{%~Ip(aYQR$lk+M_mu|YR!*v0DmkeA@{iPX`5v|8
> z0*A9F7ieE$t!^oBha7L*`MOjpkW^YAxpbWry+E?QKuWPor~L77wiqBO?{hBo>#hd2
> z$6uf}{V=(5l5=VDXJcdXfw7s7y`N<}YQ$4>$)nks0Lgxp2(y%%KF0su;=iJ^9e2qM
> zTO`;bhv?8H{d3(Fq5~Idv7ype_~x;3z_pkC@%zZZxLA9{+QaK<AJK_vqTsyoo9Xwl
> zaF+eu#m{wMc)+KZ`Ly|i{&v1M-Aja7W_^XPLc!oX1$n;$R~Bp8loz<&^QL{~Jz>W+
> z=#<!JTQ1*h#>AA#`vwN`mW}EzHm<_P4%pZ!Z1f<;Uc^|1E-&+s*~2^lqp=u4Jeo~j
> zOq=`%v95<NeMDC+e^h_3Va!BGW!t*vj9YwP;0OOBzKLtw0sH~1ckl*2*j9;o-0ru{
> z^&XsVxAj}jfoHNcV#dT{#&MQPutx<~-h6YP8?(9JJx5$8s^4cAyvJuEzwA9zo;kO3
> z9QL<aoP$#<7LxxU)>}DOgntTT6~=%a)IDpg;r=lGZ{cyL?yB+JQ#b2<Ca#1%u+6Ho
> zUf@SiXW(nFZ-3Vwc~pK1UJX6$%i9%o&OCXFoHLE#Ecyn8jm>ws=a_jrZ^ZyMa*eZ`
> zEaY5}^X(#c4LVRG&NE!+{s1W*tK{O|sL$)Njy;@xZ^oxfR~T31iWzI*WCRZ9gp9sw
> z%82{i<7D6t3K<vlcm{pS>zqS~z3dO7NA{Za&$hZaA6TZ?NA6<1kZrPLSq?ei5V5P)
> zr`N^1&{)~Ww!RBfI{OQ|B-R%;{=Q&ygMMcE9X9?t&%FsfQmyhPv<a-+z*<8;y1DZW
> znsNhc2>odo{V7_yOR17Uqo{=pILppNC>NZf#&Ur4a)d6H3sfpE&^P7lv{=4N-<1to
> z{<WS~VC#^mrD4pR2T{!Yt~jp-D9H7Jxf?MoW4{M8pNB-<a34%eL+;gSI)Iwyo*3f)
> z+zSUK<ZnF%?btw;_4oLD$p!m*idKvyKKIvyGb<*Fh~1OkA2B%{7Il230=pe}T+<TA
> z-qZtGkq1ge9&FbY6}oP~AK8l4pTD#3<{L6+_R!(&`Z!PFgT(kEUv5>2VGa56#Q^6^
> z$0AXOPK28Aocigt3FM6<N>9y^8uU*-Wc~OqU<?N^9Ki5e7?0;TH-#V7a<PV9GI^14
> z?C&Auq8nUtttq;Jynll40=_EnU#h@=&bphFcJZ^(!rA<x0;igvHSjsHSb@VaV?LV%
> z&f7J_Y*o~CPvv*U)*dp^35?3`4i9)#!J{|-N;oUx=mc(yh5P6i_764rZ{I<02|iz5
> z7FX<ea&2)8E3oT;-6pW#tC8Df;rF}nJA{6AYG*$y@;6}<a;OFUtPMHT4jjBgk6&b4
> zP^bKcDB=xp>Da@W#`x$x3i@jcY~?rKoe!PnZ;KYJ+w*Uo&hlH8cfN^!{Ks*wUlFH)
> zy*NVYQFC*%+MiZGNT)MuCN}nwmQk^Fh~lwSs`Z?fIh#&Ua%4Oc8_6VN8Lf3J6C3Ay
> zC&YrM*74+L8uqkoGQ(#dCPxwp?bBo&n@~79mL8?_s5Zi9@l<-8W#g$>@*L(dEv3b<
> zb0U5g2P5=}cJ6!&ThscnHa4QA_zsc7L-axxql7Vwv5c{TeVjXnu)cwD6Qhi=j&UF3
> NA;t#L5!^o_{R4q+IM@IH
> 
> literal 0
> HcmV?d00001
> 
> diff --git a/pc-bios/vof/entry.S b/pc-bios/vof/entry.S
> new file mode 100644
> index 000000000000..90f4b859a059
> --- /dev/null
> +++ b/pc-bios/vof/entry.S
> @@ -0,0 +1,51 @@
> +#define LOAD32(rn, name)    \
> +	lis     rn,name##@h;    \
> +	ori     rn,rn,name##@l
> +
> +#define ENTRY(func_name)    \
> +	.text;                  \
> +	.align  2;              \
> +	.globl  .func_name;     \
> +	.func_name:             \
> +	.globl  func_name;      \
> +	func_name:
> +
> +#define KVMPPC_HCALL_BASE       0xf000
> +#define KVMPPC_H_RTAS           (KVMPPC_HCALL_BASE + 0x0)
> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
> +
> +	. = 0x100 /* Do exactly as SLOF does */
> +
> +ENTRY(_start)
> +	LOAD32(%r31, 0) /* Go 32bit mode */
> +	mtmsrd %r31,0
> +	LOAD32(2, __toc_start)
> +	b entry_c
> +
> +ENTRY(_prom_entry)
> +	LOAD32(2, __toc_start)
> +	stdu    %r1,-112(%r1)
> +	std     %r31,104(%r1)
> +	mflr    %r31
> +	bl prom_entry
> +	nop
> +	mtlr    %r31
> +	ld      %r31,104(%r1)
> +	addi    %r1,%r1,112
> +	blr
> +
> +ENTRY(ci_entry)
> +	mr	4,3
> +	LOAD32(3,KVMPPC_H_VOF_CLIENT)
> +	sc	1
> +	blr
> +
> +/* This is the actual RTAS blob copied to the OS at instantiate-rtas */
> +ENTRY(hv_rtas)
> +	mr      %r4,%r3
> +	LOAD32(3,KVMPPC_H_RTAS)
> +	sc	1
> +	blr
> +	.globl hv_rtas_size
> +hv_rtas_size:
> +	.long . - hv_rtas;
> diff --git a/pc-bios/vof/l.lds b/pc-bios/vof/l.lds
> new file mode 100644
> index 000000000000..10b557a81f78
> --- /dev/null
> +++ b/pc-bios/vof/l.lds
> @@ -0,0 +1,48 @@
> +OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc")
> +OUTPUT_ARCH(powerpc:common)
> +
> +/* set the entry point */
> +ENTRY ( __start )
> +
> +SECTIONS {
> +	__executable_start = .;
> +
> +	.text : {
> +		*(.text)
> +	}
> +
> +	__etext = .;
> +
> +	. = ALIGN(8);
> +
> +	.data : {
> +		*(.data)
> +		*(.rodata .rodata.*)
> +		*(.got1)
> +		*(.sdata)
> +		*(.opd)
> +	}
> +
> +	/* FIXME bss at end ??? */
> +
> +	. = ALIGN(8);
> +	__bss_start = .;
> +	.bss : {
> +		*(.sbss) *(.scommon)
> +		*(.dynbss)
> +		*(.bss)
> +	}
> +
> +	. = ALIGN(8);
> +	__bss_end = .;
> +	__bss_size = (__bss_end - __bss_start);
> +
> +	. = ALIGN(256);
> +	__toc_start = DEFINED (.TOC.) ? .TOC. : ADDR (.got) + 0x8000;
> +	.got :
> +	{
> +		 *(.toc .got)
> +	}
> +	. = ALIGN(8);
> +	__toc_end = .;
> +}
> diff --git a/pc-bios/vof/nvram.bin b/pc-bios/vof/nvram.bin
> new file mode 100644
> index 0000000000000000000000000000000000000000..d183901cf980a91d81c4348bb20487c7bb62a2ec
> GIT binary patch
> literal 16384
> zcmeI%Jx;?g6bEpZJ8*)oSZeqZi&Z2pKnD)sI4{AHlNb4;RW}a70XPHaW57uo=-#R7
> zKSLBhJJ0sdixY3IuY@hzo0r$OmE%T;XE9uh@s1k=AOHafKmY;|fB*y_009U<00Izz
> z00bZa0SG_<0uX=z1Rwwb2tWV=XCbip6d#B4{{rX#XR%}$Bm^J;0SG|gWP$!?Aq=-I
> zcT+0Ix{{?1q>9J8r+eW^JK1tYYZZMWQCUwW%0S*~w^p@wfkX-<yRFx)H*+YEt0RRd
> zmn}6xtwbP`yp4O=>kxMAEA<~5@*g)@mb%KD5!;O~8c)>8rRQBx55=trhk#+1+T3J_
> zaf*G4vZAduqy$qda{``6Gnc2DQg<Es<GLxL#9<Oj*zP!8ZSnwf@-j7l47!nFXQO$a
> z^Hes6YU^_M<KsM*k~zwOSa+2g3Sx{*Eyu^XrB0FM5IJ-*?8`VvpBc4}vS(+_UKJ;=
> xITAns0uX=z1Rwwb2tWV=5P-nt34DD||Nni|VfbXeJORuY0uX=z1R!vE0>7B^s4f5i
> 
> literal 0
> HcmV?d00001
>
Alexey Kardashevskiy March 2, 2021, 7:21 a.m. UTC | #4
On 02/03/2021 14:35, David Gibson wrote:
> On Wed, Feb 24, 2021 at 04:41:30PM +1100, Alexey Kardashevskiy wrote:
>> The PAPR platform which describes an OS environment that's presented by
>> a combination of a hypervisor and firmware. The features it specifies
>> require collaboration between the firmware and the hypervisor.
>>
>> Since the beginning, the runtime component of the firmware (RTAS) has
>> been implemented as a 20 byte shim which simply forwards it to
>> a hypercall implemented in qemu. The boot time firmware component is
>> SLOF - but a build that's specific to qemu, and has always needed to be
>> updated in sync with it. Even though we've managed to limit the amount
>> of runtime communication we need between qemu and SLOF, there's some,
>> and it has become increasingly awkward to handle as we've implemented
>> new features.
>>
>> This implements a boot time OF client interface (CI) which is
>> enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
>> Firmware). When enabled, QEMU implements the custom H_OF_CLIENT hcall
>> which implements Open Firmware Client Interface (OF CI). This allows
>> using a smaller stateless firmware which does not have to manage
>> the device tree.
>>
>> The new "vof.bin" firmware image is included with source code under
>> pc-bios/. It also includes RTAS blob.
>>
>> This implements a handful of CI methods just to get -kernel/-initrd
>> working. In particular, this implements the device tree fetching and
>> simple memory allocator - "claim" (an OF CI memory allocator) and updates
>> "/memory@0/available" to report the client about available memory.
>>
>> This implements changing some device tree properties which we know how
>> to deal with, the rest is ignored. To allow changes, this skips
>> fdt_pack() when x-vof=on as not packing the blob leaves some room for
>> appending.
>>
>> In absence of SLOF, this assigns phandles to device tree nodes to make
>> device tree traversing work.
>>
>> When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
>>
>> This adds basic instances support which are managed by a hash map
>> ihandle -> [phandle].
>>
>> Before the guest started, the used memory is:
>> 0..4000 - the initial firmware
>> 10000..180000 - stack
>>
>> This OF CI does not implement "interpret".
>>
>> Unlike SLOF, this does not format uninitialized nvram. Instead, this
>> includes a disk image with pre-formatted nvram.
> 
> I think we'll need to improve this, but that can be a later patch.
> 
>> With this basic support, this can only boot into kernel directly.
>> However this is just enough for the petitboot kernel and initradmdisk to
>> boot from any possible source. Note this requires reasonably recent guest
>> kernel with:
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735
>>
>> The immediate benefit is much faster booting time which especially
>> crucial with fully emulated early CPU bring up environments. Also this
>> may come handy when/if GRUB-in-the-userspace sees light of the day.
>>
>> This separates VOF and sPAPR in a hope that VOF bits may be reused by
>> other POWERPC boards which do not support pSeries.
>>
>> This is coded in assumption that later on we might be adding support for
>> booting from QEMU backends (blockdev is the first candidate) without
>> devices/drivers in between as OF1275 does not require that and
>> it is quite easy to so.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>
>> The example command line is:
>>
>> /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
>> -nodefaults \
>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>> -mon id=MON0,chardev=STDIO0,mode=readline \
>> -nographic \
>> -vga none \
>> -enable-kvm \
>> -m 2G \
>> -machine pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off \
>> -kernel pbuild/kernel-le-guest/vmlinux \
>> -initrd pb/rootfs.cpio.xz \
>> -drive id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof/nvram.bin,format=raw \
>> -global spapr-nvram.drive=DRIVE0 \
>> -snapshot \
>> -smp 8,threads=8 \
>> -L /home/aik/t/qemu-ppc64-bios/ \
>> -trace events=qemu_trace_events \
>> -d guest_errors \
>> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
>> -mon chardev=SOCKET0,mode=control
>>
>> ---
>> Changes:
>> v14:
>> * check for truncates in readstr()
>> * ditched a separate vof_reset()
>> * spapr->vof is a pointer now, dropped the "on" field
>> * removed rtas_base from vof and updated comment why we allow setting it
>> * added myself to maintainers
>> * updated commit log about blockdev and other possible platforms
>> * added a note why new hcall is 0x5
>> * no in place endianness convertion in spapr_h_vof_client
>> * converted all cpu_physical_memory_read/write to address_space_rw
>> * git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c
>>
>> v13:
>> * rebase on latest ppc-for-6.0
>> * shuffled code around to touch spapr.c less
>>
>> v12:
>> * split VOF and SPAPR
>>
>> v11:
>> * added g_autofree
>> * fixed gcc warnings
>> * fixed few leaks
>> * added nvram image to make "nvram --print-config" not crash;
>> Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum size
>> is 16K, or it just does not work (empty output from "nvram")
>>
>> v10:
>> * now rebased to compile with meson
>>
>> v9:
>> * remove special handling of /rtas/rtas-size as now we always add it in QEMU
>> * removed leftovers from scsi/grub/stdout/stdin/...
>>
>> v8:
>> * no read/write/seek
>> * no @dev in instances
>> * the machine flag is "x-vof" for now
>>
>> v7:
>> * now we have a small firmware which loads at 0 as SLOF and starts from
>> 0x100 as SLOF
>> * no MBR/ELF/GRUB business in QEMU anymore
>> * blockdev is a separate patch
>> * networking is a separate patch
>>
>> v6:
>> * borrowed a big chunk of commit log introduction from David
>> * fixed initial stack pointer (points to the highest address of stack)
>> * traces for "interpret" and others
>> * disabled  translate_kernel_address() hack so grub can load (work in
>> progress)
>> * added "milliseconds" for grub
>> * fixed "claim" allocator again
>> * moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too for CAS
>> * moved the most code possible from spapr.c to spapr_of_client.c, such as
>> RTAS, prom entry and FDT build/finalize
>> * separated blobs
>> * GRUB now proceeds to its console prompt (there are still other issues)
>> * parse MBR/GPT to find PReP and load GRUB
>>
>> v5:
>> * made instances keep device and chardev pointers
>> * removed VIO dependencies
>> * print error if RTAS memory is not claimed as it should have been
>> * pack FDT as "quiesce"
>>
>> v4:
>> * fixed open
>> * validate ihandles in "call-method"
>>
>> v3:
>> * fixed phandles allocation
>> * s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
>> * fixed size of /chosen/stdout
>> * bunch of renames
>> * do not create rtas properties at all, let the client deal with it;
>> instead setprop allows changing these in the FDT
>> * no more packing FDT when bios=off - nobody needs it and getprop does not
>> work otherwise
>> * allow updating initramdisk device tree properties (for zImage)
>> * added instances
>> * fixed stdout on OF's "write"
>> * removed special handling for stdout in OF client, spapr-vty handles it
>> instead
>>
>> v2:
>> * fixed claim()
>> * added "setprop"
>> * cleaner client interface and RTAS blobs management
>> * boots to petitboot and further to the target system
>> * more trace points
>> ---
>>   pc-bios/vof/Makefile   |  18 +
>>   hw/ppc/vof.h           |  42 ++
>>   include/hw/ppc/spapr.h |  22 +-
>>   pc-bios/vof/vof.h      |  44 +++
>>   hw/ppc/spapr.c         |  78 +++-
>>   hw/ppc/spapr_hcall.c   |  26 +-
>>   hw/ppc/spapr_vof.c     | 138 +++++++
>>   hw/ppc/vof.c           | 864 +++++++++++++++++++++++++++++++++++++++++
>>   pc-bios/vof/bootmem.c  |  13 +
>>   pc-bios/vof/ci.c       | 108 ++++++
>>   pc-bios/vof/libc.c     |  91 +++++
>>   pc-bios/vof/main.c     |  22 ++
>>   MAINTAINERS            |  11 +
>>   hw/ppc/meson.build     |   2 +
>>   hw/ppc/trace-events    |  21 +
>>   pc-bios/README         |   2 +
>>   pc-bios/vof.bin        | Bin 0 -> 3680 bytes
>>   pc-bios/vof/entry.S    |  51 +++
>>   pc-bios/vof/l.lds      |  48 +++
>>   pc-bios/vof/nvram.bin  | Bin 0 -> 16384 bytes
>>   20 files changed, 1592 insertions(+), 9 deletions(-)
>>   create mode 100644 pc-bios/vof/Makefile
>>   create mode 100644 hw/ppc/vof.h
>>   create mode 100644 pc-bios/vof/vof.h
>>   create mode 100644 hw/ppc/spapr_vof.c
>>   create mode 100644 hw/ppc/vof.c
>>   create mode 100644 pc-bios/vof/bootmem.c
>>   create mode 100644 pc-bios/vof/ci.c
>>   create mode 100644 pc-bios/vof/libc.c
>>   create mode 100644 pc-bios/vof/main.c
>>   create mode 100755 pc-bios/vof.bin
>>   create mode 100644 pc-bios/vof/entry.S
>>   create mode 100644 pc-bios/vof/l.lds
>>   create mode 100644 pc-bios/vof/nvram.bin
>>
>> diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
>> new file mode 100644
>> index 000000000000..49f7e240eeff
>> --- /dev/null
>> +++ b/pc-bios/vof/Makefile
>> @@ -0,0 +1,18 @@
>> +all: build-all
>> +
>> +build-all: vof.bin
>> +
>> +%.o: %.S
>> +	cc -m32 -mbig-endian -c -o $@ $<
>> +
>> +%.o: %.c
>> +	cc -m32 -mbig-endian -c -fno-stack-protector -Wno-builtin-declaration-mismatch -o $@ $<
>> +
>> +vof.elf: entry.o main.o libc.o ci.o bootmem.o
>> +	ld -nostdlib -e_start -Tl.lds -EB -o $@ $^
>> +
>> +%.bin: %.elf
>> +	objcopy -O binary -j .text -j .data -j .toc -j .got2 $^ $@
>> +
>> +clean:
>> +	rm -f *.o *.bin *.elf *~
>> diff --git a/hw/ppc/vof.h b/hw/ppc/vof.h
>> new file mode 100644
>> index 000000000000..c8fadf23ea5b
>> --- /dev/null
>> +++ b/hw/ppc/vof.h
>> @@ -0,0 +1,42 @@
>> + /* Virtual Open Firmware */
>> +#ifndef HW_VOF_H
>> +#define HW_VOF_H
>> +
>> +typedef struct Vof {
>> +    uint32_t top_addr; /* copied from rma_size */
>> +    GArray *claimed; /* array of SpaprOfClaimed */
>> +    uint64_t claimed_base;
>> +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
>> +    uint32_t of_instance_last;
>> +    char *bootargs;
>> +    uint32_t initrd_base; /* Updated in spapr at CAS */
>> +    long initrd_size; /* Updated in spapr at CAS */
>> +} Vof;
>> +
>> +uint32_t vof_client_call(void *fdt, Vof *vof, const char *service,
>> +                         uint32_t *args, unsigned nargs,
>> +                         uint32_t *rets, unsigned nrets);
>> +uint64_t vof_claim(void *fdt, Vof *vof, uint64_t virt, uint64_t size,
>> +                   uint64_t align);
>> +void vof_cleanup(Vof *vof);
>> +void vof_build_dt(void *fdt, Vof *vof, uint32_t top_addr);
>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
>> +                               const char *prop, const char *path);
>> +
>> +/* ibm,client-architecture-support */
>> +#define TYPE_CLIENT_ARCHITECTURE_SUPPORT "client-architecture-support"
>> +#define CLIENT_ARCHITECTURE_SUPPORT(obj) \
>> +    INTERFACE_CHECK(ClientArchitectureSupport, (obj), TYPE_CLIENT_ARCHITECTURE_SUPPORT)
>> +
>> +typedef struct ClientArchitectureSupportClass ClientArchitectureSupportClass;
>> +DECLARE_CLASS_CHECKERS(ClientArchitectureSupportClass,
>> +                       CLIENT_ARCHITECTURE_SUPPORT,
>> +                       TYPE_CLIENT_ARCHITECTURE_SUPPORT)
>> +
>> +struct ClientArchitectureSupportClass {
>> +    InterfaceClass parent;
>> +    target_ulong (*cas)(CPUState *cs, target_ulong vec);
>> +    void (*quiesce)(void);
> 
> Is there actually any real connection of quiesce behaviour to cas
> behaviour?  Basically, I'm wondering if this is not so much about
> client-architecture-support fundamentally as just about
> machine-specific parts of the VOF behaviour.  Which would be fine, but
> suggests a different name for the interface.


The most canonical way would be having 2 interfaces. I thought it would 
be too much and left one. Then I thought may be the name should be PAPR 
but "quiesce" is ... I actually cannot spot where it came from, I do not 
see it neither in OF1275 nor PAPR. So the new name is fine but which 
one? I can make it

struct VofSupportClass {
  target_ulong callmethod(const char *method, int nargs, target_ulong 
*args, int nret, target_ulong *rets);
}

but it looks too vague and makes it harder for the reader.


> 
>> +};
>> +
>> +#endif /* HW_VOF_H */
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index ccbeeca1de84..4896b9fae784 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -12,6 +12,7 @@
>>   #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>>   #include "hw/ppc/xics.h"        /* For ICSState */
>>   #include "hw/ppc/spapr_tpm_proxy.h"
>> +#include "hw/ppc/vof.h"
>>   
>>   struct SpaprVioBus;
>>   struct SpaprPhbState;
>> @@ -180,6 +181,7 @@ struct SpaprMachineState {
>>       uint64_t kernel_addr;
>>       uint32_t initrd_base;
>>       long initrd_size;
>> +    Vof *vof;
>>       uint64_t rtc_offset; /* Now used only during incoming migration */
>>       struct PPCTimebase tb;
>>       bool has_graphics;
>> @@ -554,7 +556,9 @@ struct SpaprMachineState {
>>   /* Client Architecture support */
>>   #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
>>   #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>> -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
>> +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
>> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
>> +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
>>   
>>   /*
>>    * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
>> @@ -944,4 +948,20 @@ bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize,
>>   void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
>>   hwaddr spapr_get_rtas_addr(void);
>>   bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
>> +
>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>> +                     target_ulong *stack_ptr);
>> +void spapr_vof_quiesce(void);
>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
>> +                                target_ulong opcode, target_ulong *args);
> 
> Alexey or Zoltan, any thoughts on how non-PAPR versions of this would
> call into qemu to get the non-guest parts of VOF to execute?

Non-PAPR could do it as we do it for soft breakpoints in KVM - some 
predefined illegal instruction which KVM knows that it is used for soft 
breakpoints.


>> +target_ulong spapr_vof_client_architecture_support(CPUState *cs,
>> +                                                   target_ulong ovec_addr);
>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
>> +
>> +/* Copied from SLOF, and 4K is definitely not enough for GRUB */
>> +#define OF_STACK_SIZE       0x8000
>> +
>> +/* 0..10000 is reserved for the VOF fw */
>> +#define OF_STACK_ADDR       0x10000
>> +
>>   #endif /* HW_SPAPR_H */
>> diff --git a/pc-bios/vof/vof.h b/pc-bios/vof/vof.h
>> new file mode 100644
>> index 000000000000..cd5989952a98
>> --- /dev/null
>> +++ b/pc-bios/vof/vof.h
>> @@ -0,0 +1,44 @@
>> +#include <stdarg.h>
>> +
>> +typedef unsigned char uint8_t;
>> +typedef unsigned short uint16_t;
>> +typedef unsigned long uint32_t;
>> +typedef unsigned long long uint64_t;
>> +#define NULL (0)
>> +#define PROM_ERROR (-1u)
>> +typedef unsigned char bool;
>> +typedef unsigned long ihandle;
>> +typedef unsigned long phandle;
>> +#define false ((bool)0)
>> +#define true ((bool)1)
> 
> This is actually kinda risky if 'bool' is aliased to unsigned char.
> So, it's probably worth figuring out how to bind it instead to the
> _Bool builtin which is available in modern compilers.

I do not need "bool" now so I better off ditching it for now.


>> +typedef int size_t;
>> +typedef void client(void);
>> +
>> +/* globals */
>> +extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) */
>> +
>> +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
>> +
>> +/* libc */
>> +int strlen(const char *s);
>> +int strcmp(const char *s1, const char *s2);
>> +void *memcpy(void *dest, const void *src, size_t n);
>> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
>> +void *memmove(void *dest, const void *src, size_t n);
>> +void *memset(void *dest, int c, size_t size);
>> +
>> +/* Prom */
>> +typedef unsigned long prom_arg_t;
>> +int call_prom(const char *service, int nargs, int nret, ...);
> 
> AIUI this isn't so much about calling the PROM, since this *is* the
> PROM code, but rather about calling the parts that are implemented on
> the qemu side.  Different names might clarify that.

"call_ci"?


>> +
>> +/* CI wrappers */
>> +void ci_panic(const char *str);
>> +phandle ci_finddevice(const char *path);
>> +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len);
>> +ihandle ci_open(const char *path);
>> +void ci_close(ihandle ih);
>> +void *ci_claim(void *virt, uint32_t size, uint32_t align);
>> +uint32_t ci_release(void *virt, uint32_t size);
>> +
>> +/* booting from -kernel */
>> +void boot_from_memory(uint64_t initrd, uint64_t initrdsize);
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 85fe65f89476..3c20af115627 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -102,6 +102,7 @@
>>   #define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
>>   #define FW_MAX_SIZE             0x400000
>>   #define FW_FILE_NAME            "slof.bin"
>> +#define FW_FILE_NAME_VOF        "vof.bin"
>>   #define FW_OVERHEAD             0x2800000
>>   #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
>>   
>> @@ -1562,6 +1563,7 @@ static void spapr_machine_reset(MachineState *machine)
>>       SpaprMachineState *spapr = SPAPR_MACHINE(machine);
>>       PowerPCCPU *first_ppc_cpu;
>>       hwaddr fdt_addr;
>> +    target_ulong stack_ptr = 0;
>>       void *fdt;
>>       int rc;
>>   
>> @@ -1624,22 +1626,41 @@ static void spapr_machine_reset(MachineState *machine)
>>   
>>       fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
>>   
>> -    rc = fdt_pack(fdt);
>> +    if (spapr->vof) {
>> +        /*
>> +         * Claims initramdisk and stack which changes "available" so
>> +         * doing it befofe packing.
>> +         */
>> +        spapr_vof_reset(spapr, fdt, &stack_ptr);
>>   
>> -    /* Should only fail if we've built a corrupted tree */
>> -    assert(rc == 0);
>> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
>> +                                  stack_ptr, spapr->initrd_base,
>> +                                  spapr->initrd_size);
>> +        /*
>> +         * We do not pack the FDT as the client may change properties and
>> +         * do not write FDT to the VM as the client does not expect it.
>> +         */
>> +    } else {
>> +        rc = fdt_pack(fdt);
>> +        /* Should only fail if we've built a corrupted tree */
>> +        assert(rc == 0);
>>   
>> -    /* Load the fdt */
>> +        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
>> +                                  0, fdt_addr, 0);
>> +    }
>>       qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
>> -    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
>> +
>>       g_free(spapr->fdt_blob);
>>       spapr->fdt_size = fdt_totalsize(fdt);
>>       spapr->fdt_initial_size = spapr->fdt_size;
>>       spapr->fdt_blob = fdt;
>>   
>>       /* Set up the entry state */
>> -    spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, fdt_addr, 0);
>>       first_ppc_cpu->env.gpr[5] = 0;
>> +    if (!spapr->vof) {
>> +        /* Load the fdt */
>> +        cpu_physical_memory_write(fdt_addr, spapr->fdt_blob, spapr->fdt_size);
>> +    }
>>   
>>       spapr->fwnmi_system_reset_addr = -1;
>>       spapr->fwnmi_machine_check_addr = -1;
>> @@ -2639,7 +2660,8 @@ static void spapr_machine_init(MachineState *machine)
>>       SpaprMachineState *spapr = SPAPR_MACHINE(machine);
>>       SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
>>       MachineClass *mc = MACHINE_GET_CLASS(machine);
>> -    const char *bios_name = machine->firmware ?: FW_FILE_NAME;
>> +    const char *bios_default = !!spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME;
>> +    const char *bios_name = machine->firmware ?: bios_default;
>>       const char *kernel_filename = machine->kernel_filename;
>>       const char *initrd_filename = machine->initrd_filename;
>>       PCIHostState *phb;
>> @@ -2996,6 +3018,10 @@ static void spapr_machine_init(MachineState *machine)
>>       }
>>   
>>       qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
>> +
>> +    if (spapr->vof) {
>> +        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
>> +    }
>>   }
>>   
>>   #define DEFAULT_KVM_TYPE "auto"
>> @@ -3186,6 +3212,28 @@ static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
>>       }
>>   }
>>   
>> +static bool spapr_get_vof(Object *obj, Error **errp)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>> +
>> +    return spapr->vof != NULL;
>> +}
>> +
>> +static void spapr_set_vof(Object *obj, bool value, Error **errp)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>> +
>> +    if (spapr->vof) {
>> +        vof_cleanup(spapr->vof);
>> +        g_free(spapr->vof);
>> +        spapr->vof = NULL;
>> +    }
>> +    if (!value) {
>> +        return;
>> +    }
>> +    spapr->vof = g_malloc0(sizeof(*spapr->vof));
>> +}
>> +
>>   static char *spapr_get_ic_mode(Object *obj, Error **errp)
>>   {
>>       SpaprMachineState *spapr = SPAPR_MACHINE(obj);
>> @@ -3311,6 +3359,10 @@ static void spapr_instance_init(Object *obj)
>>                                       stringify(KERNEL_LOAD_ADDR)
>>                                       " for -kernel is the default");
>>       spapr->kernel_addr = KERNEL_LOAD_ADDR;
>> +    object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
>> +    object_property_set_description(obj, "x-vof",
>> +                                    "Enable Virtual Open Firmware");
>> +
>>       /* The machine class defines the default interrupt controller mode */
>>       spapr->irq = smc->irq;
>>       object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
>> @@ -4408,6 +4460,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>>       XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
>>       InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
>>       XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc);
>> +    ClientArchitectureSupportClass *casc = CLIENT_ARCHITECTURE_SUPPORT_CLASS(oc);
>>   
>>       mc->desc = "pSeries Logical Partition (PAPR compliant)";
>>       mc->ignore_boot_device_suffixes = true;
>> @@ -4487,6 +4540,9 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>>       smc->smp_threads_vsmt = true;
>>       smc->nr_xirqs = SPAPR_NR_XIRQS;
>>       xfc->match_nvt = spapr_match_nvt;
>> +
>> +    casc->cas = spapr_vof_client_architecture_support;
>> +    casc->quiesce = spapr_vof_quiesce;
>>   }
>>   
>>   static const TypeInfo spapr_machine_info = {
>> @@ -4506,6 +4562,7 @@ static const TypeInfo spapr_machine_info = {
>>           { TYPE_XICS_FABRIC },
>>           { TYPE_INTERRUPT_STATS_PROVIDER },
>>           { TYPE_XIVE_FABRIC },
>> +        { TYPE_CLIENT_ARCHITECTURE_SUPPORT },
>>           { }
>>       },
>>   };
>> @@ -4974,9 +5031,16 @@ static void spapr_machine_2_1_class_options(MachineClass *mc)
>>   }
>>   DEFINE_SPAPR_MACHINE(2_1, "2.1", false);
>>   
>> +static const TypeInfo client_archivecture_support_info = {
> 
> s/archivecture/architecture/
> 
>> +    .name = TYPE_CLIENT_ARCHITECTURE_SUPPORT,
>> +    .parent = TYPE_INTERFACE,
>> +    .class_size = sizeof(ClientArchitectureSupportClass),
>> +};
>> +
>>   static void spapr_machine_register_types(void)
>>   {
>>       type_register_static(&spapr_machine_info);
>> +    type_register_static(&client_archivecture_support_info);
>>   }
>>   
>>   type_init(spapr_machine_register_types)
>> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
>> index 7b5cd3553c26..0cdf90af6afb 100644
>> --- a/hw/ppc/spapr_hcall.c
>> +++ b/hw/ppc/spapr_hcall.c
>> @@ -1806,7 +1806,13 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu,
>>           spapr_setup_hpt(spapr);
>>       }
>>   
>> -    fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
>> +    if (spapr->vof && spapr->vof->initrd_base && spapr->vof->initrd_size) {
>> +        /* Update initramdisk location so the right area gets reserved below */
>> +        spapr->initrd_base = spapr->vof->initrd_base;
>> +        spapr->initrd_size = spapr->vof->initrd_size;
>> +    }
>> +
>> +    fdt = spapr_build_fdt(spapr, spapr->vof != NULL, fdt_bufsize);
>>   
>>       g_free(spapr->fdt_blob);
>>       spapr->fdt_size = fdt_totalsize(fdt);
>> @@ -1850,6 +1856,24 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
>>       return ret;
>>   }
>>   
>> +target_ulong spapr_vof_client_architecture_support(CPUState *cs,
>> +                                                  target_ulong ovec_addr)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +
>> +    target_ulong ret = do_client_architecture_support(POWERPC_CPU(cs), spapr,
>> +                                                      ovec_addr, FDT_MAX_SIZE);
>> +
>> +    /*
>> +     * This adds stdout and generates phandles for boottime and CAS FDTs.
>> +     * It is alright to update the FDT here as do_client_architecture_support()
>> +     * does not pack it.
>> +     */
>> +    spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob);
>> +
>> +    return ret;
>> +}
>> +
>>   static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
>>                                                 SpaprMachineState *spapr,
>>                                                 target_ulong opcode,
>> diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c
>> new file mode 100644
>> index 000000000000..f2978d830da5
>> --- /dev/null
>> +++ b/hw/ppc/spapr_vof.c
>> @@ -0,0 +1,138 @@
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +#include <sys/ioctl.h>
>> +#include "qapi/error.h"
>> +#include "hw/ppc/spapr.h"
>> +#include "hw/ppc/spapr_vio.h"
>> +#include "hw/ppc/fdt.h"
>> +#include "sysemu/sysemu.h"
>> +#include "qom/qom-qobject.h"
>> +#include "trace.h"
>> +
>> +/* Defined as Big Endian */
>> +struct prom_args {
>> +    uint32_t service;
>> +    uint32_t nargs;
>> +    uint32_t nret;
>> +    uint32_t args[10];
>> +} QEMU_PACKED;
>> +
>> +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
>> +                                target_ulong opcode, target_ulong *_args)
>> +{
>> +    target_ulong args_real = ppc64_phys_to_real(_args[0]);
>> +    struct prom_args args_be;
>> +    uint32_t args[ARRAY_SIZE(args_be.args)];
>> +    uint32_t rets[ARRAY_SIZE(args_be.args)] = { 0 }, ret;
>> +    char service[64];
>> +    unsigned nargs, nret, i;
>> +
>> +    if (address_space_rw(&address_space_memory, args_real,
>> +                         MEMTXATTRS_UNSPECIFIED, &args_be, sizeof(args_be),
>> +                         false) != MEMTX_OK) {
>> +        return H_HARDWARE;
>> +    }
>> +    nargs = be32_to_cpu(args_be.nargs);
>> +    if (nargs >= ARRAY_SIZE(args_be.args)) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    if (address_space_rw(&address_space_memory, be32_to_cpu(args_be.service),
>> +                         MEMTXATTRS_UNSPECIFIED, service, sizeof(service),
>> +                         false) != MEMTX_OK) {
>> +        return H_HARDWARE;
>> +    }
>> +    if (strnlen(service, sizeof(service)) == sizeof(service)) {
>> +        /* Too long service name */
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    for (i = 0; i < nargs; ++i) {
>> +        args[i] = be32_to_cpu(args_be.args[i]);
>> +    }
>> +
>> +    nret = be32_to_cpu(args_be.nret);
>> +    ret = vof_client_call(spapr->fdt_blob, spapr->vof, service,
>> +                          args, nargs, rets, nret);
>> +    if (!nret) {
>> +        return H_SUCCESS;
>> +    }
>> +
>> +    args_be.args[nargs] = cpu_to_be32(ret);
>> +    for (i = 1; i < nret; ++i) {
>> +        args_be.args[nargs + i] = cpu_to_be32(rets[i - 1]);
>> +    }
>> +
>> +    if (address_space_rw(&address_space_memory,
>> +                         args_real + offsetof(struct prom_args, args[nargs]),
>> +                         MEMTXATTRS_UNSPECIFIED, args_be.args + nargs,
>> +                         sizeof(args_be.args[0]) * nret, true) != MEMTX_OK) {
>> +        return H_HARDWARE;
>> +    }
>> +
>> +    return H_SUCCESS;
>> +}
>> +
>> +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt)
>> +{
>> +    char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
>> +
>> +    vof_build_dt(fdt, spapr->vof, spapr->rma_size);
>> +
>> +    /*
>> +     * SLOF-less setup requires an open instance of stdout for early
>> +     * kernel printk. By now all phandles are settled so we can open
>> +     * the default serial console.
>> +     */
>> +    if (stdout_path) {
>> +        _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", "stdout",
>> +                                   stdout_path));
>> +    }
>> +}
>> +
>> +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
>> +                     target_ulong *stack_ptr)
>> +{
>> +    Vof *vof = spapr->vof;
>> +
>> +    spapr_vof_client_dt_finalize(spapr, fdt);
>> +
>> +    *stack_ptr = vof_claim(spapr->fdt_blob, vof, OF_STACK_ADDR, OF_STACK_SIZE,
>> +                           OF_STACK_SIZE);
>> +    if (*stack_ptr == -1) {
>> +        error_report("Memory allocation for stack failed");
>> +        exit(1);
> 
> Might make things a little cleaner to add an Error ** parameter, and
> pass in &error_fatal in the caller.

Ok!


>> +    }
>> +    /*
>> +     * Stack grows downwards and we also reserve here space for
>> +     * the minimum stack frame.
>> +     */
>> +    *stack_ptr += OF_STACK_SIZE - 0x20;
>> +
>> +    if (spapr->kernel_size &&
>> +        vof_claim(spapr->fdt_blob, vof, spapr->kernel_addr, spapr->kernel_size,
>> +                  0) == -1) {
>> +        error_report("Memory for kernel is in use");
>> +        exit(1);
>> +    }
>> +
>> +    if (spapr->initrd_size &&
>> +        vof_claim(spapr->fdt_blob, vof, spapr->initrd_base, spapr->initrd_size,
>> +                  0) == -1) {
>> +        error_report("Memory for initramdisk is in use");
>> +        exit(1);
>> +    }
>> +
>> +    /*
>> +     * We skip writing FDT as nothing expects it; OF client interface is
>> +     * going to be used for reading the device tree.
>> +     */
>> +}
>> +
>> +void spapr_vof_quiesce(void)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +
>> +    spapr->fdt_size = fdt_totalsize(spapr->fdt_blob);
>> +    spapr->fdt_initial_size = spapr->fdt_size;
>> +}
>> diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
>> new file mode 100644
>> index 000000000000..9c76891e668c
>> --- /dev/null
>> +++ b/hw/ppc/vof.c
>> @@ -0,0 +1,864 @@
>> +/*
>> + * QEMU PowerPC Virtual Open Firmware.
>> + *
>> + * This implements client interface from OpenFirmware IEEE1275 on the QEMU
>> + * side to leave only a very basic firmware in the VM.
>> + *
>> + * Copyright (c) 2020 IBM Corporation.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a copy
>> + * of this software and associated documentation files (the "Software"), to deal
>> + * in the Software without restriction, including without limitation the rights
>> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
>> + * copies of the Software, and to permit persons to whom the Software is
>> + * furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>> + * THE SOFTWARE.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +#include <sys/ioctl.h>
>> +#include "exec/ram_addr.h"
>> +#include "exec/address-spaces.h"
>> +#include "qemu/timer.h"
>> +#include "qemu/range.h"
>> +#include "hw/ppc/vof.h"
>> +#include "hw/ppc/fdt.h"
>> +#include "sysemu/runstate.h"
>> +#include "qom/qom-qobject.h"
>> +#include "trace.h"
>> +
>> +#include <libfdt.h>
>> +
>> +/*
>> + * OF 1275 "nextprop" description suggests is it 32 bytes max but
>> + * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 chars long.
>> + */
>> +#define OF_PROPNAME_LEN_MAX 64
>> +
>> +typedef struct {
>> +    uint64_t start;
>> +    uint64_t size;
>> +} OfClaimed;
>> +
>> +typedef struct {
>> +    char *path; /* the path used to open the instance */
>> +    uint32_t phandle;
>> +} OfInstance;
>> +
>> +#define VOF_MEM_READ(pa, buf, size) \
>> +    address_space_read_full(&address_space_memory, \
>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>> +#define VOF_MEM_WRITE(pa, buf, size) \
>> +    address_space_write(&address_space_memory, \
>> +    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
>> +
>> +static int readstr(hwaddr pa, char *buf, int size)
>> +{
>> +    if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) {
>> +        return -1;
>> +    }
>> +    if (strnlen(buf, size) == size) {
>> +        buf[size - 1] = '\0';
>> +        trace_vof_error_str_truncated(buf, size);
>> +        return -1;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static bool cmpservice(const char *s, unsigned nargs, unsigned nret,
>> +                       const char *s1, unsigned nargscheck, unsigned nretcheck)
>> +{
>> +    if (strcmp(s, s1)) {
>> +        return false;
>> +    }
>> +    if ((nargscheck && (nargs != nargscheck)) ||
>> +        (nretcheck && (nret != nretcheck))) {
>> +        trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret);
>> +        return false;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static void prop_format(char *tval, int tlen, const void *prop, int len)
>> +{
>> +    int i;
>> +    const unsigned char *c;
>> +    char *t;
>> +    const char bin[] = "...";
>> +
>> +    for (i = 0, c = prop; i < len; ++i, ++c) {
>> +        if (*c == '\0' && i == len - 1) {
>> +            strncpy(tval, prop, tlen - 1);
>> +            return;
>> +        }
>> +        if (*c < 0x20 || *c >= 0x80) {
>> +            break;
>> +        }
>> +    }
>> +
>> +    for (i = 0, c = prop, t = tval; i < len; ++i, ++c) {
>> +        if (t >= tval + tlen - sizeof(bin) - 1 - 2 - 1) {
>> +            strcpy(t, bin);
>> +            return;
>> +        }
>> +        if (i && i % 4 == 0 && i != len - 1) {
>> +            strcat(t, " ");
>> +            ++t;
>> +        }
>> +        t += sprintf(t, "%02X", *c & 0xFF);
>> +    }
>> +}
>> +
>> +static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr)
>> +{
>> +    char fullnode[1024];
>> +    uint32_t ret = -1;
>> +    int offset;
>> +
>> +    if (readstr(nodeaddr, fullnode, sizeof(fullnode))) {
>> +        return (uint32_t) ret;
>> +    }
>> +
>> +    offset = fdt_path_offset(fdt, fullnode);
>> +    if (offset >= 0) {
>> +        ret = fdt_get_phandle(fdt, offset);
>> +    }
>> +    trace_vof_finddevice(fullnode, ret);
>> +    return (uint32_t) ret;
>> +}
>> +
>> +static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, uint32_t pname,
>> +                            uint32_t valaddr, uint32_t vallen)
>> +{
>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>> +    uint32_t ret = 0;
>> +    int proplen = 0;
>> +    const void *prop;
>> +    char trval[64] = "";
>> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
>> +
>> +    if (readstr(pname, propname, sizeof(propname))) {
>> +        return -1;
>> +    }
>> +    if (strcmp(propname, "name") == 0) {
>> +        prop = fdt_get_name(fdt, nodeoff, &proplen);
>> +        proplen += 1;
> 
> This isn't quite right, I don't think.  fdt_get_name() returns the
> name *including* unit address, but the 'name' property will omit the
> unit address.

Ok, will fix. Is there a helper to chop off the unit address btw, since 
I gave up my split()?


>> +    } else {
>> +        prop = fdt_getprop(fdt, nodeoff, propname, &proplen);
>> +    }
>> +
>> +    if (prop) {
>> +        int cb = MIN(proplen, vallen);
>> +
>> +        if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK) {
>> +            ret = -1;
>> +        } else {
>> +            /*
>> +             * OF1275 says:
>> +             * "Size is either the actual size of the property, or -1 if name
>> +             * does not exist", hence returning proplen instead of cb.
>> +             */
>> +            ret = proplen;
>> +            prop_format(trval, sizeof(trval), prop, ret);
> 
> It would be nice if we could elide this when tracing isn't enabled
> :/.  Guess that can be a later optimization, though.


At the time I did not see how to tell if a specific trace point is 
enabled, may.


> 
>> +        }
>> +    } else {
>> +        ret = -1;
>> +    }
>> +    trace_vof_getprop(nodeph, propname, ret, trval);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, uint32_t pname)
>> +{
>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>> +    uint32_t ret = 0;
>> +    int proplen = 0;
>> +    const void *prop;
>> +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
>> +
>> +    if (readstr(pname, propname, sizeof(propname))) {
>> +        return -1;
>> +    }
>> +    if (strcmp(propname, "name") == 0) {
>> +        prop = fdt_get_name(fdt, nodeoff, &proplen);
>> +        proplen += 1;
>> +    } else {
>> +        prop = fdt_getprop(fdt, nodeoff, propname, &proplen);
>> +    }
>> +
>> +    if (prop) {
>> +        ret = proplen;
>> +    } else {
>> +        ret = -1;
>> +    }
>> +    trace_vof_getproplen(nodeph, propname, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_setprop(void *fdt, Vof *vof,
>> +                            uint32_t nodeph, uint32_t pname,
>> +                            uint32_t valaddr, uint32_t vallen)
>> +{
>> +    char propname[OF_PROPNAME_LEN_MAX + 1];
>> +    uint32_t ret = -1;
>> +    int offset;
>> +    char trval[64] = "";
>> +
>> +    if (readstr(pname, propname, sizeof(propname))) {
>> +        return -1;
>> +    }
>> +    /*
>> +     * We only allow changing properties which we know how to update in QEMU
>> +     * OR
>> +     * the ones which we know that they need to survive during "quiesce".
>> +     */
> 
> Should we be checking the node these are being written to as well?  I
> think they're all in /chosen.

We should, I'll fix it.

>> +    if (vallen == sizeof(uint32_t)) {
>> +        uint32_t val32 = ldl_be_phys(first_cpu->as, valaddr);
>> +
>> +        if ((strcmp(propname, "linux,rtas-base") == 0) ||
>> +            (strcmp(propname, "linux,rtas-entry") == 0)) {
>> +            /* These need to survive quiesce so let them store in the FDT */
>> +        } else if (strcmp(propname, "linux,initrd-start") == 0) {
>> +            vof->initrd_base = val32;
>> +        } else if (strcmp(propname, "linux,initrd-end") == 0) {
>> +            vof->initrd_size = val32 - vof->initrd_base;
>> +        } else {
>> +            goto trace_exit;
>> +        }
>> +    } else if (vallen == sizeof(uint64_t)) {
>> +        uint64_t val64 = ldq_be_phys(first_cpu->as, valaddr);
>> +
>> +        if (strcmp(propname, "linux,initrd-start") == 0) {
>> +            vof->initrd_base = val64;
>> +        } else if (strcmp(propname, "linux,initrd-end") == 0) {
>> +            vof->initrd_size = val64 - vof->initrd_base;
>> +        } else {
>> +            goto trace_exit;
>> +        }
>> +    } else if (strcmp(propname, "bootargs") == 0) {
>> +        char val[1024];
>> +
>> +        if (readstr(valaddr, val, sizeof(val))) {
>> +            goto trace_exit;
>> +        }
>> +        g_free(vof->bootargs);
>> +        vof->bootargs = g_strdup(val);
> 
> We should probably truncate vallen to the size of the loaded string,
> yes?

Not sure. I can imagine a (hackish) firmware reserving space in the FDT 
blob. May be.


>> +    } else {
>> +        goto trace_exit;
>> +    }
>> +
>> +    offset = fdt_node_offset_by_phandle(fdt, nodeph);
>> +    if (offset >= 0) {
>> +        uint8_t data[vallen];
> 
> We probably want some limit on vallen so the guest can't force
> allocation of arbitrary amounts of qemu stack space.

Indeed. 1024 + 1?


> 
>> +
>> +        if ((VOF_MEM_READ(valaddr, data, vallen) == MEMTX_OK) &&
>> +            !fdt_setprop(fdt, offset, propname, data, vallen)) {
>> +            ret = vallen;
>> +            prop_format(trval, sizeof(trval), data, ret);
>> +        }
>> +    }
>> +
>> +trace_exit:
>> +    trace_vof_setprop(nodeph, propname, trval, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_nextprop(const void *fdt, uint32_t phandle,
>> +                             uint32_t prevaddr, uint32_t nameaddr)
>> +{
>> +    int offset = fdt_node_offset_by_phandle(fdt, phandle);
>> +    char prev[OF_PROPNAME_LEN_MAX + 1];
>> +    const char *tmp;
>> +
>> +    if (readstr(prevaddr, prev, sizeof(prev))) {
>> +        return -1;
>> +    }
>> +    for (offset = fdt_first_property_offset(fdt, offset);
>> +         offset >= 0;
>> +         offset = fdt_next_property_offset(fdt, offset)) {
> 
> I think you can use libfdt's for_each_property_offset macro here.

Ok.

> 
>> +
>> +        if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
>> +            return 0;
>> +        }
>> +        if (prev[0] == '\0' || strcmp(prev, tmp) == 0) {
>> +            if (prev[0] != '\0') {
>> +                offset = fdt_next_property_offset(fdt, offset);
>> +                if (offset < 0) {
>> +                    return 0;
>> +                }
>> +            }
>> +            if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
>> +                return 0;
>> +            }
>> +
>> +            if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != MEMTX_OK) {
>> +                return -1;
>> +            }
>> +            return 1;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static uint32_t vof_peer(const void *fdt, uint32_t phandle)
>> +{
>> +    int ret;
>> +
>> +    if (phandle == 0) {
>> +        ret = fdt_path_offset(fdt, "/");
>> +    } else {
>> +        ret = fdt_next_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle));
>> +    }
>> +
>> +    if (ret < 0) {
>> +        ret = 0;
>> +    } else {
>> +        ret = fdt_get_phandle(fdt, ret);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_child(const void *fdt, uint32_t phandle)
>> +{
>> +    int ret = fdt_first_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle));
>> +
>> +    if (ret < 0) {
>> +        ret = 0;
>> +    } else {
>> +        ret = fdt_get_phandle(fdt, ret);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_parent(const void *fdt, uint32_t phandle)
>> +{
>> +    int ret = fdt_parent_offset(fdt, fdt_node_offset_by_phandle(fdt, phandle));
>> +
>> +    if (ret < 0) {
>> +        ret = 0;
>> +    } else {
>> +        ret = fdt_get_phandle(fdt, ret);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_do_open(void *fdt, Vof *vof, const char *path)
>> +{
>> +    int offset;
>> +    uint32_t ret = 0;
>> +    OfInstance *inst = NULL;
>> +
>> +    if (vof->of_instance_last == 0xFFFFFFFF) {
>> +        /* We do not recycle ihandles yet */
>> +        goto trace_exit;
>> +    }
>> +
>> +    offset = fdt_path_offset(fdt, path);
>> +    if (offset < 0) {
>> +        trace_vof_error_unknown_path(path);
>> +        goto trace_exit;
>> +    }
>> +
>> +    inst = g_new0(OfInstance, 1);
>> +    inst->phandle = fdt_get_phandle(fdt, offset);
>> +    g_assert(inst->phandle);
>> +    ++vof->of_instance_last;
>> +
>> +    inst->path = g_strdup(path);
>> +    g_hash_table_insert(vof->of_instances,
>> +                        GINT_TO_POINTER(vof->of_instance_last),
>> +                        inst);
>> +    ret = vof->of_instance_last;
>> +
>> +trace_exit:
>> +    trace_vof_open(path, inst ? inst->phandle : 0, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
>> +                               const char *prop, const char *path)
>> +{
>> +    int node = fdt_path_offset(fdt, nodename);
>> +    uint32_t inst = vof_do_open(fdt, vof, path);
>> +
>> +    return fdt_setprop_cell(fdt, node, prop, inst);
>> +}
>> +
>> +static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr)
>> +{
>> +    char path[256];
>> +
>> +    if (readstr(pathaddr, path, sizeof(path))) {
>> +        return -1;
>> +    }
>> +
>> +    return vof_do_open(fdt, vof, path);
>> +}
>> +
>> +static void vof_close(Vof *vof, uint32_t ihandle)
>> +{
>> +    if (!g_hash_table_remove(vof->of_instances, GINT_TO_POINTER(ihandle))) {
>> +        trace_vof_error_unknown_ihandle_close(ihandle);
>> +    }
>> +}
>> +
>> +static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle)
>> +{
>> +    gpointer instp = g_hash_table_lookup(vof->of_instances,
>> +                                         GINT_TO_POINTER(ihandle));
>> +    uint32_t ret = -1;
>> +
>> +    if (instp) {
>> +        ret = ((OfInstance *)instp)->phandle;
>> +    }
>> +    trace_vof_instance_to_package(ihandle, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle,
>> +                                    uint32_t buf, uint32_t len)
>> +{
>> +    uint32_t ret = -1;
>> +    char tmp[256] = "";
>> +
>> +    if (!fdt_get_path(fdt, fdt_node_offset_by_phandle(fdt, phandle), tmp,
>> +                      sizeof(tmp))) {
>> +        tmp[sizeof(tmp) - 1] = 0;
>> +        ret = MIN(len, strlen(tmp) + 1);
>> +        if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
>> +            ret = -1;
>> +        }
>> +    }
>> +
>> +    trace_vof_package_to_path(phandle, tmp, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t ihandle,
>> +                                     uint32_t buf, uint32_t len)
>> +{
>> +    uint32_t ret = -1;
>> +    uint32_t phandle = vof_instance_to_package(vof, ihandle);
>> +    char tmp[256] = "";
> 
> 
> There are a bunch of these hardcoded 256 values, and in other places
> 1024.  Probably worth defining a VOF_PATH_MAX or whatever you want to
> call it for consistency.

Ok, I'll add the macros.


> 
>> +
>> +    if (phandle != -1) {
>> +        if (!fdt_get_path(fdt, fdt_node_offset_by_phandle(fdt, phandle),
>> +                          tmp, sizeof(tmp))) {
>> +            tmp[sizeof(tmp) - 1] = 0;
>> +            ret = MIN(len, strlen(tmp) + 1);
>> +            if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
>> +                ret = -1;
>> +            }
>> +        }
>> +    }
>> +    trace_vof_instance_to_path(ihandle, phandle, tmp, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static void vof_claimed_dump(GArray *claimed)
>> +{
>> +#ifdef DEBUG
>> +    int i;
>> +    OfClaimed c;
>> +
>> +    for (i = 0; i < claimed->len; ++i) {
>> +        c = g_array_index(claimed, OfClaimed, i);
>> +        error_printf("CLAIMED %lx..%lx size=%ld\n", c.start, c.start + c.size,
>> +                     c.size);
>> +    }
>> +#endif
>> +}
>> +
>> +static bool vof_claim_avail(GArray *claimed, uint64_t virt, uint64_t size)
>> +{
>> +    int i;
>> +    OfClaimed c;
>> +
>> +    for (i = 0; i < claimed->len; ++i) {
>> +        c = g_array_index(claimed, OfClaimed, i);
>> +        if (ranges_overlap(c.start, c.size, virt, size)) {
>> +            return false;
>> +        }
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t size)
>> +{
>> +    OfClaimed newclaim;
>> +
>> +    newclaim.start = virt;
>> +    newclaim.size = size;
>> +    g_array_append_val(claimed, newclaim);
>> +}
>> +
>> +static gint of_claimed_compare_func(gconstpointer a, gconstpointer b)
>> +{
>> +    return ((OfClaimed *)a)->start - ((OfClaimed *)b)->start;
>> +}
>> +
>> +static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base)
>> +{
>> +    int i, n, offset, proplen = 0;
>> +    uint64_t *mem0_reg;
>> +    struct { uint64_t start, size; } *avail;
>> +
>> +    if (!fdt || !claimed) {
>> +        return;
>> +    }
>> +
>> +    offset = fdt_path_offset(fdt, "/memory@0");
>> +    _FDT(offset);
>> +
>> +    mem0_reg = (uint64_t *) fdt_getprop(fdt, offset, "reg", &proplen);
>> +    if (!mem0_reg || proplen != 2 * sizeof(uint64_t)) {
>> +        return;
>> +    }
>> +
>> +    g_array_sort(claimed, of_claimed_compare_func);
>> +    vof_claimed_dump(claimed);
>> +
>> +    avail = g_malloc0(sizeof(uint64_t) * 2 * claimed->len);
> 
> Using sizeof(avail[0]) * claimed->len would make this a little bit
> more robust.

Uff, I missed this.


> 
>> +    for (i = 0, n = 0; i < claimed->len; ++i) {
>> +        OfClaimed c = g_array_index(claimed, OfClaimed, i);
>> +
>> +        avail[n].start = c.start + c.size;
>> +        if (i < claimed->len - 1) {
>> +            OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1);
>> +
>> +            avail[n].size = cn.start - avail[n].start;
>> +        } else {
>> +            avail[n].size = be64_to_cpu(mem0_reg[1]) - avail[n].start;
>> +        }
>> +
>> +        if (avail[n].size) {
>> +#ifdef DEBUG
>> +            error_printf("AVAIL %lx..%lx size=%ld\n", avail[n].start,
>> +                         avail[n].start + avail[n].size, avail[n].size);
>> +#endif
>> +            avail[n].start = cpu_to_be64(avail[n].start);
>> +            avail[n].size = cpu_to_be64(avail[n].size);
> 
> I'd prefer to do the endian switches as you first write to the array.

Ok.


> 
>> +            ++n;
>> +        }
>> +    }
>> +    _FDT((fdt_setprop(fdt, offset, "available", avail,
>> +                      sizeof(uint64_t) * 2 * n)));
>> +    g_free(avail);
>> +}
>> +
>> +/*
>> + * OF1275:
>> + * "Allocates size bytes of memory. If align is zero, the allocated range
>> + * begins at the virtual address virt. Otherwise, an aligned address is
>> + * automatically chosen and the input argument virt is ignored".
>> + *
>> + * In other words, exactly one of @virt and @align is non-zero.
>> + */
>> +uint64_t vof_claim(void *fdt, Vof *vof, uint64_t virt, uint64_t size,
>> +                   uint64_t align)
>> +{
>> +    uint64_t ret;
>> +
>> +    if (size == 0) {
>> +        ret = -1;
>> +    } else if (align == 0) {
>> +        if (!vof_claim_avail(vof->claimed, virt, size)) {
>> +            ret = -1;
>> +        } else {
>> +            ret = virt;
>> +        }
>> +    } else {
>> +        vof->claimed_base = QEMU_ALIGN_UP(vof->claimed_base, align);
>> +        while (1) {
>> +            if (vof->claimed_base >= vof->top_addr) {
>> +                error_report("Out of RMA memory for the OF client");
>> +                return -1;
>> +            }
>> +            if (vof_claim_avail(vof->claimed, vof->claimed_base, size)) {
>> +                break;
>> +            }
>> +            vof->claimed_base += size;
>> +        }
>> +        ret = vof->claimed_base;
>> +    }
>> +
>> +    if (ret != -1) {
>> +        vof->claimed_base = MAX(vof->claimed_base, ret + size);
>> +        vof_claim_add(vof->claimed, ret, size);
>> +        /* The client reads "/memory@0/available" to know where it can claim */
>> +        vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
>> +    }
>> +    trace_vof_claim(virt, size, align, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_release(void *fdt, Vof *vof, uint64_t virt, uint64_t size)
>> +{
>> +    uint32_t ret = -1;
>> +    int i;
>> +    GArray *claimed = vof->claimed;
>> +    OfClaimed c;
>> +
>> +    for (i = 0; i < claimed->len; ++i) {
>> +        c = g_array_index(claimed, OfClaimed, i);
>> +        if (c.start == virt && c.size == size) {
>> +            g_array_remove_index(claimed, i);
>> +            vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
>> +            ret = 0;
>> +            break;
>> +        }
>> +    }
>> +
>> +    trace_vof_release(virt, size, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static void vof_instantiate_rtas(void)
>> +{
>> +    error_report("The firmware should have instantiated RTAS");
>> +    exit(1);
>> +}
>> +
>> +static uint32_t vof_call_method(Vof *vof, uint32_t methodaddr,
>> +                                uint32_t ihandle,
>> +                                uint32_t param1, uint32_t param2,
>> +                                uint32_t param3, uint32_t param4,
>> +                                uint32_t *ret2)
>> +{
>> +    uint32_t ret = -1;
>> +    char method[256] = "";
>> +    OfInstance *inst;
>> +
>> +    if (!ihandle) {
>> +        goto trace_exit;
>> +    }
>> +
>> +    inst = (OfInstance *) g_hash_table_lookup(vof->of_instances,
>> +                                              GINT_TO_POINTER(ihandle));
>> +    if (!inst) {
>> +        goto trace_exit;
>> +    }
>> +
>> +    if (readstr(methodaddr, method, sizeof(method))) {
>> +        goto trace_exit;
>> +    }
>> +
>> +    if (strcmp(inst->path, "/") == 0) {
>> +        if (strcmp(method, "ibm,client-architecture-support") == 0) {
>> +            Object *cas_if = object_dynamic_cast(
>> +                    qdev_get_machine(), TYPE_CLIENT_ARCHITECTURE_SUPPORT);
>> +
>> +            if (cas_if) {
>> +                ClientArchitectureSupportClass *casc =
>> +                    CLIENT_ARCHITECTURE_SUPPORT_GET_CLASS(cas_if);
>> +
>> +                ret = casc->cas(first_cpu, param1);
>> +            }
>> +
>> +            *ret2 = 0;
>> +        }
>> +    } else if (strcmp(inst->path, "/rtas") == 0) {
>> +        if (strcmp(method, "instantiate-rtas") == 0) {
>> +            vof_instantiate_rtas();
>> +            ret = 0;
>> +            *ret2 = param1; /* rtas-base */
>> +        }
>> +    } else {
>> +        trace_vof_error_unknown_method(method);
>> +    }
>> +
>> +trace_exit:
>> +    trace_vof_method(ihandle, method, param1, ret, *ret2);
>> +
>> +    return ret;
>> +}
>> +
>> +static uint32_t vof_call_interpret(uint32_t cmdaddr, uint32_t param1,
>> +                                   uint32_t param2, uint32_t *ret2)
>> +{
>> +    uint32_t ret = -1;
>> +    char cmd[256] = "";
>> +
>> +    /* No interpret implemented */
>> +    readstr(cmdaddr, cmd, sizeof(cmd));
>> +    trace_vof_interpret(cmd, param1, param2, ret, *ret2);
>> +
>> +    return ret;
>> +}
>> +
>> +static void vof_quiesce(void *fdt, Vof *vof)
>> +{
>> +    Object *cas_if = object_dynamic_cast(
>> +        qdev_get_machine(), TYPE_CLIENT_ARCHITECTURE_SUPPORT);
>> +
>> +    int rc = fdt_pack(fdt);
>> +
>> +    assert(rc == 0);
>> +
>> +    if (cas_if) {
>> +        ClientArchitectureSupportClass *casc =
>> +            CLIENT_ARCHITECTURE_SUPPORT_GET_CLASS(cas_if);
>> +
>> +        casc->quiesce();
>> +    }
>> +
>> +    vof_claimed_dump(vof->claimed);
>> +}
>> +
>> +uint32_t vof_client_call(void *fdt, Vof *vof, const char *service,
>> +                         uint32_t *args, unsigned nargs,
>> +                         uint32_t *rets, unsigned nrets)
>> +{
>> +    uint32_t ret = 0;
>> +
>> +    /* @nrets includes the value which this function returns */
>> +#define cmpserv(s, a, r) \
>> +    cmpservice(service, nargs, nrets, (s), (a), (r))
>> +
>> +    if (cmpserv("finddevice", 1, 1)) {
>> +        ret = vof_finddevice(fdt, args[0]);
>> +    } else if (cmpserv("getprop", 4, 1)) {
>> +        ret = vof_getprop(fdt, args[0], args[1], args[2], args[3]);
>> +    } else if (cmpserv("getproplen", 2, 1)) {
>> +        ret = vof_getproplen(fdt, args[0], args[1]);
>> +    } else if (cmpserv("setprop", 4, 1)) {
>> +        ret = vof_setprop(fdt, vof, args[0], args[1], args[2], args[3]);
>> +    } else if (cmpserv("nextprop", 3, 1)) {
>> +        ret = vof_nextprop(fdt, args[0], args[1], args[2]);
>> +    } else if (cmpserv("peer", 1, 1)) {
>> +        ret = vof_peer(fdt, args[0]);
>> +    } else if (cmpserv("child", 1, 1)) {
>> +        ret = vof_child(fdt, args[0]);
>> +    } else if (cmpserv("parent", 1, 1)) {
>> +        ret = vof_parent(fdt, args[0]);
>> +    } else if (cmpserv("open", 1, 1)) {
>> +        ret = vof_open(fdt, vof, args[0]);
>> +    } else if (cmpserv("close", 1, 0)) {
>> +        vof_close(vof, args[0]);
>> +    } else if (cmpserv("instance-to-package", 1, 1)) {
>> +        ret = vof_instance_to_package(vof, args[0]);
>> +    } else if (cmpserv("package-to-path", 3, 1)) {
>> +        ret = vof_package_to_path(fdt, args[0], args[1], args[2]);
>> +    } else if (cmpserv("instance-to-path", 3, 1)) {
>> +        ret = vof_instance_to_path(fdt, vof, args[0], args[1], args[2]);
>> +    } else if (cmpserv("claim", 3, 1)) {
>> +        ret = vof_claim(fdt, vof, args[0], args[1], args[2]);
>> +    } else if (cmpserv("release", 2, 0)) {
>> +        ret = vof_release(fdt, vof, args[0], args[1]);
>> +    } else if (cmpserv("call-method", 0, 0)) {
>> +        ret = vof_call_method(vof, args[0], args[1], args[2], args[3], args[4],
>> +                              args[5], rets);
>> +    } else if (cmpserv("interpret", 0, 0)) {
>> +        ret = vof_call_interpret(args[0], args[1], args[2], rets);
>> +    } else if (cmpserv("milliseconds", 0, 1)) {
>> +        ret = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
>> +    } else if (cmpserv("quiesce", 0, 0)) {
>> +        vof_quiesce(fdt, vof);
>> +    } else if (cmpserv("exit", 0, 0)) {
>> +        error_report("Stopped as the VM requested \"exit\"");
>> +        vm_stop(RUN_STATE_PAUSED); /* Or qemu_system_guest_panicked(NULL); ? */
>> +    } else {
>> +        trace_vof_error_unknown_service(service, nargs, nrets);
>> +        ret = -1;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static void of_instance_free(gpointer data)
>> +{
>> +    OfInstance *inst = (OfInstance *) data;
>> +
>> +    g_free(inst->path);
>> +    g_free(inst);
>> +}
>> +
>> +void vof_cleanup(Vof *vof)
>> +{
>> +    if (vof->claimed) {
>> +        g_array_unref(vof->claimed);
>> +    }
>> +    if (vof->of_instances) {
>> +        g_hash_table_unref(vof->of_instances);
>> +    }
>> +}
>> +
>> +void vof_build_dt(void *fdt, Vof *vof, uint32_t top_addr)
>> +{
>> +    uint32_t phandle;
>> +    int i, offset, proplen = 0;
>> +    const void *prop;
>> +    bool found = false;
>> +    GArray *phandles = g_array_new(false, false, sizeof(uint32_t));
>> +
>> +    vof_cleanup(vof);
>> +
>> +    vof->claimed = g_array_new(false, false, sizeof(OfClaimed));
>> +    vof->of_instances = g_hash_table_new_full(g_direct_hash, g_direct_equal,
>> +                                              NULL, of_instance_free);
>> +    vof->top_addr = top_addr;
>> +
>> +    /* Find all predefined phandles */
>> +    for (offset = fdt_next_node(fdt, -1, NULL);
>> +         offset >= 0;
>> +         offset = fdt_next_node(fdt, offset, NULL)) {
>> +        prop = fdt_getprop(fdt, offset, "phandle", &proplen);
>> +        if (prop && proplen == sizeof(uint32_t)) {
>> +            phandle = fdt32_ld(prop);
>> +            g_array_append_val(phandles, phandle);
>> +        }
>> +    }
>> +
>> +    /* Assign phandles skipping the predefined ones */
>> +    for (offset = fdt_next_node(fdt, -1, NULL), phandle = 1;
>> +         offset >= 0;
>> +         offset = fdt_next_node(fdt, offset, NULL), ++phandle) {
>> +
>> +        prop = fdt_getprop(fdt, offset, "phandle", &proplen);
>> +        if (prop) {
>> +            continue;
>> +        }
>> +        /* Check if the current phandle is not allocated already */
>> +        for ( ; ; ++phandle) {
>> +            for (i = 0, found = false; i < phandles->len; ++i) {
>> +                if (phandle == g_array_index(phandles, uint32_t, i)) {
>> +                    found = true;
>> +                    break;
>> +                }
>> +            }
>> +            if (!found) {
>> +                break;
>> +            }
>> +        }
>> +        _FDT(fdt_setprop_cell(fdt, offset, "phandle", phandle));
>> +    }
>> +    g_array_unref(phandles);
>> +
>> +    vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
> 
> I still think using max_phandle would be simpler and therefore
> preferable, but whatever.
> 
>> +}
>> diff --git a/pc-bios/vof/bootmem.c b/pc-bios/vof/bootmem.c
>> new file mode 100644
>> index 000000000000..de7d5fc76431
>> --- /dev/null
>> +++ b/pc-bios/vof/bootmem.c
>> @@ -0,0 +1,13 @@
>> +#include "vof.h"
>> +
>> +void boot_from_memory(uint64_t initrd, uint64_t initrdsize)
>> +{
>> +	uint64_t kern[2];
>> +	phandle chosen = ci_finddevice("/chosen");
>> +
>> +	if (ci_getprop(chosen, "qemu,boot-kernel", kern, sizeof(kern)) !=
>> +			sizeof(kern))
>> +		return;
>> +
>> +	do_boot(kern[0], initrd, initrdsize);
>> +}
>> diff --git a/pc-bios/vof/ci.c b/pc-bios/vof/ci.c
>> new file mode 100644
>> index 000000000000..4880b3d2047c
>> --- /dev/null
>> +++ b/pc-bios/vof/ci.c
>> @@ -0,0 +1,108 @@
>> +#include "vof.h"
>> +
>> +struct prom_args {
>> +        uint32_t service;
>> +        uint32_t nargs;
>> +        uint32_t nret;
>> +        uint32_t args[10];
>> +};
>> +
>> +#define ADDR(x) (uint32_t)(x)
>> +
>> +extern uint32_t ci_entry(uint32_t params);
>> +
>> +extern unsigned long hv_rtas(unsigned long params);
>> +extern unsigned int hv_rtas_size;
>> +
>> +bool prom_handle(struct prom_args *pargs)
>> +{
>> +	void *rtasbase;
>> +	uint32_t rtassize = 0;
>> +	phandle rtas;
>> +
>> +	if (strcmp("call-method", (void *)(unsigned long) pargs->service))
>> +		return false;
>> +
>> +	if (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0]))
>> +		return false;
>> +
>> +	rtas = ci_finddevice("/rtas");
>> +	ci_getprop(rtas, "rtas-size", &rtassize, sizeof(rtassize));
> 
> Why do we need this?  IIRC the "rtas-size" property is a qemu
> extension we used to use for the interaction between SLOF and qemu -
> can't we just use hv_rtas_size directly for the VOF case?


It depends on the FWNMI capability, it is either RTAS or RTAS + log size.


> 
>> +	if (rtassize < hv_rtas_size)
>> +		return false;
>> +
>> +	rtasbase = (void *)(unsigned long) pargs->args[2];
>> +
>> +	memcpy(rtasbase, hv_rtas, hv_rtas_size);
>> +	pargs->args[pargs->nargs] = 0;
>> +	pargs->args[pargs->nargs + 1] = pargs->args[2];
>> +
>> +	return true;
>> +}
>> +
>> +void prom_entry(uint32_t args)
>> +{
>> +	if (!prom_handle((void *)(unsigned long) args))
>> +		ci_entry(args);
>> +}
>> +
>> +int call_prom(const char *service, int nargs, int nret, ...)
>> +{
>> +        int i;
>> +        struct prom_args args;
>> +        va_list list;
>> +
>> +        args.service = ADDR(service);
>> +        args.nargs = nargs;
>> +        args.nret = nret;
>> +
>> +        va_start(list, nret);
>> +        for (i = 0; i < nargs; i++)
>> +                args.args[i] = va_arg(list, prom_arg_t);
>> +        va_end(list);
>> +
>> +        for (i = 0; i < nret; i++)
>> +                args.args[nargs+i] = 0;
>> +
>> +        if (ci_entry((uint32_t)(&args)) < 0)
>> +                return PROM_ERROR;
>> +
>> +        return (nret > 0) ? args.args[nargs] : 0;
>> +}
>> +
>> +void ci_panic(const char *str)
>> +{
>> +	call_prom("exit", 0, 0);
>> +}
>> +
>> +phandle ci_finddevice(const char *path)
>> +{
>> +	return call_prom("finddevice", 1, 1, path);
>> +}
>> +
>> +uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len)
>> +{
>> +	return call_prom("getprop", 4, 1, ph, propname, prop, len);
>> +}
>> +
>> +ihandle ci_open(const char *path)
>> +{
>> +	return call_prom("open", 1, 1, path);
>> +}
>> +
>> +void ci_close(ihandle ih)
>> +{
>> +	call_prom("close", 1, 0, ih);
>> +}
>> +
>> +void *ci_claim(void *virt, uint32_t size, uint32_t align)
>> +{
>> +	uint32_t ret = call_prom("claim", 3, 1, ADDR(virt), size, align);
>> +
>> +	return (void *) (unsigned long) ret;
>> +}
>> +
>> +uint32_t ci_release(void *virt, uint32_t size)
>> +{
>> +	return call_prom("release", 2, 1, ADDR(virt), size);
>> +}
>> diff --git a/pc-bios/vof/libc.c b/pc-bios/vof/libc.c
>> new file mode 100644
>> index 000000000000..8603aedcb32c
>> --- /dev/null
>> +++ b/pc-bios/vof/libc.c
>> @@ -0,0 +1,91 @@
>> +#include "vof.h"
>> +
>> +int strlen(const char *s)
>> +{
>> +	int len = 0;
>> +
>> +	while (*s != 0) {
>> +		len += 1;
>> +		s += 1;
>> +	}
>> +
>> +	return len;
>> +}
>> +
>> +int strcmp(const char *s1, const char *s2)
>> +{
>> +        while (*s1 != 0 && *s2 != 0) {
>> +                if (*s1 != *s2)
>> +                        break;
>> +                s1 += 1;
>> +                s2 += 1;
>> +        }
>> +
>> +        return *s1 - *s2;
>> +}
>> +
>> +void *memcpy(void *dest, const void *src, size_t n)
>> +{
>> +        char *cdest;
>> +        const char *csrc = src;
>> +
>> +        cdest = dest;
>> +        while (n-- > 0) {
>> +                *cdest++ = *csrc++;
>> +        }
>> +
>> +        return dest;
>> +}
>> +
>> +int memcmp(const void *ptr1, const void *ptr2, size_t n)
>> +{
>> +        const unsigned char *p1 = ptr1;
>> +        const unsigned char *p2 = ptr2;
>> +
>> +        while (n-- > 0) {
>> +                if (*p1 != *p2)
>> +                        return (*p1 - *p2);
>> +                p1 += 1;
>> +                p2 += 1;
>> +        }
>> +
>> +        return 0;
>> +}
> 
> I believe there are gcc builtins for a number of these - could you use
> those rather than having to open code them?

Huh, I did not realize how many builtins gcc has. I'll check them out.


>> +void *memmove(void *dest, const void *src, size_t n)
>> +{
>> +        char *cdest;
>> +        const char *csrc;
>> +        int i;
>> +
>> +        /* Do the buffers overlap in a bad way? */
>> +        if (src < dest && src + n >= dest) {
>> +                /* Copy from end to start */
>> +                cdest = dest + n - 1;
>> +                csrc = src + n - 1;
>> +                for (i = 0; i < n; i++) {
>> +                        *cdest-- = *csrc--;
>> +                }
>> +        }
>> +        else {
>> +                /* Normal copy is possible */
>> +                cdest = dest;
>> +                csrc = src;
>> +                for (i = 0; i < n; i++) {
>> +                        *cdest++ = *csrc++;
>> +                }
>> +        }
>> +
>> +        return dest;
>> +}
>> +
>> +void *memset(void *dest, int c, size_t size)
>> +{
>> +        unsigned char *d = (unsigned char *)dest;
>> +
>> +        while (size-- > 0) {
>> +                *d++ = (unsigned char)c;
>> +        }
>> +
>> +        return dest;
>> +}
>> diff --git a/pc-bios/vof/main.c b/pc-bios/vof/main.c
>> new file mode 100644
>> index 000000000000..34299a9cc5ad
>> --- /dev/null
>> +++ b/pc-bios/vof/main.c
>> @@ -0,0 +1,22 @@
>> +#include "vof.h"
>> +
>> +
>> +void do_boot(unsigned long addr, unsigned long _r3, unsigned long _r4)
>> +{
>> +	register unsigned long r3 __asm__("r3") = _r3;
>> +	register unsigned long r4 __asm__("r4") = _r4;
>> +	register unsigned long r5 __asm__("r5") = (unsigned long) _prom_entry;
>> +
>> +	((client *)(uint32_t)addr)();
>> +}
>> +
>> +void entry_c(void)
>> +{
>> +	register unsigned long r3 __asm__("r3");
>> +	register unsigned long r4 __asm__("r4");
>> +	register unsigned long r5 __asm__("r5");
>> +	uint64_t initrd = r3, initrdsize = r4;
>> +
>> +	boot_from_memory(initrd, initrdsize);
>> +	ci_panic("*** No boot target ***\n");
>> +}
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 8201f12271b7..469b76b36b2a 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -1335,6 +1335,17 @@ F: pc-bios/canyonlands.dt[sb]
>>   F: pc-bios/u-boot-sam460ex-20100605.bin
>>   F: roms/u-boot-sam460ex
>>   
>> +VOF
> 
> I'd expand this to Virtual Open Firmware, which at least gives a very
> faint idea of what it is.

Ok.

> 
>> +M: Alexey Kardashevskiy <aik@ozlabs.ru>
>> +M: David Gibson <david@gibson.dropbear.id.au>
>> +M: Greg Kurz <groug@kaod.org>
>> +L: qemu-ppc@nongnu.org
>> +S: Maintained
>> +F: hw/ppc/spapr_vof*
>> +F: hw/ppc/vof*
>> +F: pc-bios/vof/*
>> +F: pc-bios/vof*
>> +
>>   RISC-V Machines
>>   ---------------
>>   OpenTitan
>> diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build
>> index 218631c883be..24427d3f51c1 100644
>> --- a/hw/ppc/meson.build
>> +++ b/hw/ppc/meson.build
>> @@ -28,6 +28,8 @@ ppc_ss.add(when: 'CONFIG_PSERIES', if_true: files(
>>     'spapr_rtas_ddw.c',
>>     'spapr_numa.c',
>>     'pef.c',
>> +  'spapr_vof.c',
>> +  'vof.c',
>>   ))
>>   ppc_ss.add(when: 'CONFIG_SPAPR_RNG', if_true: files('spapr_rng.c'))
>>   ppc_ss.add(when: ['CONFIG_PSERIES', 'CONFIG_LINUX'], if_true: files(
>> diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
>> index 1e91984526a3..017c48624f96 100644
>> --- a/hw/ppc/trace-events
>> +++ b/hw/ppc/trace-events
>> @@ -71,6 +71,27 @@ spapr_rtas_ibm_configure_connector_invalid(uint32_t index) "DRC index: 0x%"PRIx3
>>   spapr_vio_h_reg_crq(uint64_t reg, uint64_t queue_addr, uint64_t queue_len) "CRQ for dev 0x%" PRIx64 " registered at 0x%" PRIx64 "/0x%" PRIx64
>>   spapr_vio_free_crq(uint32_t reg) "CRQ for dev 0x%" PRIx32 " freed"
>>   
>> +# vof.c
>> +vof_error_str_truncated(const char *s, int len) "%s truncated to %d"
>> +vof_error_param(const char *method, int nargscheck, int nretcheck, int nargs, int nret) "%s takes/returns %d/%d, not %d/%d"
>> +vof_error_unknown_service(const char *service, int nargs, int nret) "\"%s\" args=%d rets=%d"
>> +vof_error_unknown_method(const char *method) "\"%s\""
>> +vof_error_unknown_ihandle_close(uint32_t ih) "ih=0x%x"
>> +vof_error_unknown_path(const char *path) "\"%s\""
>> +vof_finddevice(const char *path, uint32_t ph) "\"%s\" => ph=0x%x"
>> +vof_canon(const char *path) "\"%s\""
>> +vof_claim(uint32_t virt, uint32_t size, uint32_t align, uint32_t ret) "virt=0x%x size=0x%x align=0x%x => 0x%x"
>> +vof_release(uint32_t virt, uint32_t size, uint32_t ret) "virt=0x%x size=0x%x => 0x%x"
>> +vof_method(uint32_t ihandle, const char *method, uint32_t param, uint32_t ret, uint32_t ret2) "ih=0x%x \"%s\"(0x%x) => 0x%x 0x%x"
>> +vof_getprop(uint32_t ph, const char *prop, uint32_t ret, const char *val) "ph=0x%x \"%s\" => len=%d [%s]"
>> +vof_getproplen(uint32_t ph, const char *prop, uint32_t ret) "ph=0x%x \"%s\" => len=%d"
>> +vof_setprop(uint32_t ph, const char *prop, const char *val, uint32_t ret) "ph=0x%x \"%s\" [%s] => len=%d"
>> +vof_open(const char *path, uint32_t ph, uint32_t ih) "%s ph=0x%x => ih=0x%x"
>> +vof_interpret(const char *cmd, uint32_t param1, uint32_t param2, uint32_t ret, uint32_t ret2) "[%s] 0x%x 0x%x => 0x%x 0x%x"
>> +vof_package_to_path(uint32_t ph, const char *tmp, uint32_t ret) "ph=0x%x => %s len=%d"
>> +vof_instance_to_path(uint32_t ih, uint32_t ph, const char *tmp, uint32_t ret) "ih=0x%x ph=0x%x => %s len=%d"
>> +vof_instance_to_package(uint32_t ih, uint32_t ph) "ih=0x%x => ph=0x%x"
>> +
>>   # ppc.c
>>   ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
>>   
>> diff --git a/pc-bios/README b/pc-bios/README
>> index db7129ef6484..176587da8ea5 100644
>> --- a/pc-bios/README
>> +++ b/pc-bios/README
>> @@ -16,6 +16,8 @@
>>     https://github.com/aik/SLOF, and the image currently in qemu is
>>     built from git tag qemu-slof-20200717.
>>   
>> +- vof is a minimalistic firmware to work with -machine pseries,x-vof=on.
>> +
>>   - sgabios (the Serial Graphics Adapter option ROM) provides a means for
>>     legacy x86 software to communicate with an attached serial console as
>>     if a video card were attached.  The master sources reside in a subversion
>> diff --git a/pc-bios/vof.bin b/pc-bios/vof.bin
>> new file mode 100755
>> index 0000000000000000000000000000000000000000..0606d9451c6bff39b32879c2a3369406a6a0d07d
>> GIT binary patch
>> literal 3680
>> zcmd^BUuauZ82{2_+ue2@_aP6FMSANxX==+V^r6wxq_aZW%u<<!iXBNzt}d80wap}x
>> z#N4z}S8<TCJvivU2Pyj2hY5XKg&_s^u!ji|bnD&*d)b3vWu4dGckj6_{g=h<(ZJ!H
>> zob#RU_kF+b&$&eZ?_Xe@C~goH)*Xq?AW`(>B?=MUwAULA#>}I8bhdE6ZYf0j&qP62
>> z;l6X}0rol?XtZA^HJ3P=?ZiPe{%~Ip(aYQR$lk+M_mu|YR!*v0DmkeA@{iPX`5v|8
>> z0*A9F7ieE$t!^oBha7L*`MOjpkW^YAxpbWry+E?QKuWPor~L77wiqBO?{hBo>#hd2
>> z$6uf}{V=(5l5=VDXJcdXfw7s7y`N<}YQ$4>$)nks0Lgxp2(y%%KF0su;=iJ^9e2qM
>> zTO`;bhv?8H{d3(Fq5~Idv7ype_~x;3z_pkC@%zZZxLA9{+QaK<AJK_vqTsyoo9Xwl
>> zaF+eu#m{wMc)+KZ`Ly|i{&v1M-Aja7W_^XPLc!oX1$n;$R~Bp8loz<&^QL{~Jz>W+
>> z=#<!JTQ1*h#>AA#`vwN`mW}EzHm<_P4%pZ!Z1f<;Uc^|1E-&+s*~2^lqp=u4Jeo~j
>> zOq=`%v95<NeMDC+e^h_3Va!BGW!t*vj9YwP;0OOBzKLtw0sH~1ckl*2*j9;o-0ru{
>> z^&XsVxAj}jfoHNcV#dT{#&MQPutx<~-h6YP8?(9JJx5$8s^4cAyvJuEzwA9zo;kO3
>> z9QL<aoP$#<7LxxU)>}DOgntTT6~=%a)IDpg;r=lGZ{cyL?yB+JQ#b2<Ca#1%u+6Ho
>> zUf@SiXW(nFZ-3Vwc~pK1UJX6$%i9%o&OCXFoHLE#Ecyn8jm>ws=a_jrZ^ZyMa*eZ`
>> zEaY5}^X(#c4LVRG&NE!+{s1W*tK{O|sL$)Njy;@xZ^oxfR~T31iWzI*WCRZ9gp9sw
>> z%82{i<7D6t3K<vlcm{pS>zqS~z3dO7NA{Za&$hZaA6TZ?NA6<1kZrPLSq?ei5V5P)
>> zr`N^1&{)~Ww!RBfI{OQ|B-R%;{=Q&ygMMcE9X9?t&%FsfQmyhPv<a-+z*<8;y1DZW
>> znsNhc2>odo{V7_yOR17Uqo{=pILppNC>NZf#&Ur4a)d6H3sfpE&^P7lv{=4N-<1to
>> z{<WS~VC#^mrD4pR2T{!Yt~jp-D9H7Jxf?MoW4{M8pNB-<a34%eL+;gSI)Iwyo*3f)
>> z+zSUK<ZnF%?btw;_4oLD$p!m*idKvyKKIvyGb<*Fh~1OkA2B%{7Il230=pe}T+<TA
>> z-qZtGkq1ge9&FbY6}oP~AK8l4pTD#3<{L6+_R!(&`Z!PFgT(kEUv5>2VGa56#Q^6^
>> z$0AXOPK28Aocigt3FM6<N>9y^8uU*-Wc~OqU<?N^9Ki5e7?0;TH-#V7a<PV9GI^14
>> z?C&Auq8nUtttq;Jynll40=_EnU#h@=&bphFcJZ^(!rA<x0;igvHSjsHSb@VaV?LV%
>> z&f7J_Y*o~CPvv*U)*dp^35?3`4i9)#!J{|-N;oUx=mc(yh5P6i_764rZ{I<02|iz5
>> z7FX<ea&2)8E3oT;-6pW#tC8Df;rF}nJA{6AYG*$y@;6}<a;OFUtPMHT4jjBgk6&b4
>> zP^bKcDB=xp>Da@W#`x$x3i@jcY~?rKoe!PnZ;KYJ+w*Uo&hlH8cfN^!{Ks*wUlFH)
>> zy*NVYQFC*%+MiZGNT)MuCN}nwmQk^Fh~lwSs`Z?fIh#&Ua%4Oc8_6VN8Lf3J6C3Ay
>> zC&YrM*74+L8uqkoGQ(#dCPxwp?bBo&n@~79mL8?_s5Zi9@l<-8W#g$>@*L(dEv3b<
>> zb0U5g2P5=}cJ6!&ThscnHa4QA_zsc7L-axxql7Vwv5c{TeVjXnu)cwD6Qhi=j&UF3
>> NA;t#L5!^o_{R4q+IM@IH
>>
>> literal 0
>> HcmV?d00001
>>
>> diff --git a/pc-bios/vof/entry.S b/pc-bios/vof/entry.S
>> new file mode 100644
>> index 000000000000..90f4b859a059
>> --- /dev/null
>> +++ b/pc-bios/vof/entry.S
>> @@ -0,0 +1,51 @@
>> +#define LOAD32(rn, name)    \
>> +	lis     rn,name##@h;    \
>> +	ori     rn,rn,name##@l
>> +
>> +#define ENTRY(func_name)    \
>> +	.text;                  \
>> +	.align  2;              \
>> +	.globl  .func_name;     \
>> +	.func_name:             \
>> +	.globl  func_name;      \
>> +	func_name:
>> +
>> +#define KVMPPC_HCALL_BASE       0xf000
>> +#define KVMPPC_H_RTAS           (KVMPPC_HCALL_BASE + 0x0)
>> +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
>> +
>> +	. = 0x100 /* Do exactly as SLOF does */
>> +
>> +ENTRY(_start)
>> +	LOAD32(%r31, 0) /* Go 32bit mode */
>> +	mtmsrd %r31,0
>> +	LOAD32(2, __toc_start)
>> +	b entry_c
>> +
>> +ENTRY(_prom_entry)
>> +	LOAD32(2, __toc_start)
>> +	stdu    %r1,-112(%r1)
>> +	std     %r31,104(%r1)
>> +	mflr    %r31
>> +	bl prom_entry
>> +	nop
>> +	mtlr    %r31
>> +	ld      %r31,104(%r1)
>> +	addi    %r1,%r1,112
>> +	blr
>> +
>> +ENTRY(ci_entry)
>> +	mr	4,3
>> +	LOAD32(3,KVMPPC_H_VOF_CLIENT)
>> +	sc	1
>> +	blr
>> +
>> +/* This is the actual RTAS blob copied to the OS at instantiate-rtas */
>> +ENTRY(hv_rtas)
>> +	mr      %r4,%r3
>> +	LOAD32(3,KVMPPC_H_RTAS)
>> +	sc	1
>> +	blr
>> +	.globl hv_rtas_size
>> +hv_rtas_size:
>> +	.long . - hv_rtas;
>> diff --git a/pc-bios/vof/l.lds b/pc-bios/vof/l.lds
>> new file mode 100644
>> index 000000000000..10b557a81f78
>> --- /dev/null
>> +++ b/pc-bios/vof/l.lds
>> @@ -0,0 +1,48 @@
>> +OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc")
>> +OUTPUT_ARCH(powerpc:common)
>> +
>> +/* set the entry point */
>> +ENTRY ( __start )
>> +
>> +SECTIONS {
>> +	__executable_start = .;
>> +
>> +	.text : {
>> +		*(.text)
>> +	}
>> +
>> +	__etext = .;
>> +
>> +	. = ALIGN(8);
>> +
>> +	.data : {
>> +		*(.data)
>> +		*(.rodata .rodata.*)
>> +		*(.got1)
>> +		*(.sdata)
>> +		*(.opd)
>> +	}
>> +
>> +	/* FIXME bss at end ??? */
>> +
>> +	. = ALIGN(8);
>> +	__bss_start = .;
>> +	.bss : {
>> +		*(.sbss) *(.scommon)
>> +		*(.dynbss)
>> +		*(.bss)
>> +	}
>> +
>> +	. = ALIGN(8);
>> +	__bss_end = .;
>> +	__bss_size = (__bss_end - __bss_start);
>> +
>> +	. = ALIGN(256);
>> +	__toc_start = DEFINED (.TOC.) ? .TOC. : ADDR (.got) + 0x8000;
>> +	.got :
>> +	{
>> +		 *(.toc .got)
>> +	}
>> +	. = ALIGN(8);
>> +	__toc_end = .;
>> +}
>> diff --git a/pc-bios/vof/nvram.bin b/pc-bios/vof/nvram.bin
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..d183901cf980a91d81c4348bb20487c7bb62a2ec
>> GIT binary patch
>> literal 16384
>> zcmeI%Jx;?g6bEpZJ8*)oSZeqZi&Z2pKnD)sI4{AHlNb4;RW}a70XPHaW57uo=-#R7
>> zKSLBhJJ0sdixY3IuY@hzo0r$OmE%T;XE9uh@s1k=AOHafKmY;|fB*y_009U<00Izz
>> z00bZa0SG_<0uX=z1Rwwb2tWV=XCbip6d#B4{{rX#XR%}$Bm^J;0SG|gWP$!?Aq=-I
>> zcT+0Ix{{?1q>9J8r+eW^JK1tYYZZMWQCUwW%0S*~w^p@wfkX-<yRFx)H*+YEt0RRd
>> zmn}6xtwbP`yp4O=>kxMAEA<~5@*g)@mb%KD5!;O~8c)>8rRQBx55=trhk#+1+T3J_
>> zaf*G4vZAduqy$qda{``6Gnc2DQg<Es<GLxL#9<Oj*zP!8ZSnwf@-j7l47!nFXQO$a
>> z^Hes6YU^_M<KsM*k~zwOSa+2g3Sx{*Eyu^XrB0FM5IJ-*?8`VvpBc4}vS(+_UKJ;=
>> xITAns0uX=z1Rwwb2tWV=5P-nt34DD||Nni|VfbXeJORuY0uX=z1R!vE0>7B^s4f5i
>>
>> literal 0
>> HcmV?d00001
>>
>
BALATON Zoltan March 2, 2021, 9:37 a.m. UTC | #5
On Tue, 2 Mar 2021, Alexey Kardashevskiy wrote:
> On 02/03/2021 14:35, David Gibson wrote:
>> Alexey or Zoltan, any thoughts on how non-PAPR versions of this would
>> call into qemu to get the non-guest parts of VOF to execute?
>
> Non-PAPR could do it as we do it for soft breakpoints in KVM - some 
> predefined illegal instruction which KVM knows that it is used for soft 
> breakpoints.

So far I've thought I'd need to implement TYPE_PPC_VIRTUAL_HYPERVISOR 
interface in the machine for the code in target/ppc/excp_helper.c to 
forward client syscalls to QEMU where I could handle the VOF related calls 
but not sure this is the best way. This could be OK for the initial boot 
when nothing else should use syscalls yet but with RTAS this may not work 
as the guest OS could also use syscalls so to avoid conflicts we may need 
to shut down the virtual hypervisor on quiesce which means I may need a 
minimal guest only rtas for pegasos2 (which would be OK as I think it's 
only used for shutdown/reboot anyway). Also may need some changes to allow 
empty callbacks in vhyp to be ignored when I only want to implement 
hypercall method but that's just adding checks to only call non-NULL 
callbacks in PPCVirtualHypervisorClass.

There's also an old patch from Benjamin Herrenschmidt to add MOL OSI which 
is a similar hypercall interface:
https://github.com/ozbenh/qemu/commit/6dc8803641e323030ffd01ad8ce0dcf081896698
This might also be useful later to use MOL paravirtual drivers to speed up 
MacOSX emulation. but I haven't looked at the details yet.

Any other ideas?

Regards,
BALATON Zoltan
David Gibson March 9, 2021, 5:29 a.m. UTC | #6
On Tue, Mar 02, 2021 at 06:21:51PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 02/03/2021 14:35, David Gibson wrote:
> > On Wed, Feb 24, 2021 at 04:41:30PM +1100, Alexey Kardashevskiy wrote:
> > > The PAPR platform which describes an OS environment that's presented by
> > > a combination of a hypervisor and firmware. The features it specifies
> > > require collaboration between the firmware and the hypervisor.
> > > 
> > > Since the beginning, the runtime component of the firmware (RTAS) has
> > > been implemented as a 20 byte shim which simply forwards it to
> > > a hypercall implemented in qemu. The boot time firmware component is
> > > SLOF - but a build that's specific to qemu, and has always needed to be
> > > updated in sync with it. Even though we've managed to limit the amount
> > > of runtime communication we need between qemu and SLOF, there's some,
> > > and it has become increasingly awkward to handle as we've implemented
> > > new features.
> > > 
> > > This implements a boot time OF client interface (CI) which is
> > > enabled by a new "x-vof" pseries machine option (stands for "Virtual Open
> > > Firmware). When enabled, QEMU implements the custom H_OF_CLIENT hcall
> > > which implements Open Firmware Client Interface (OF CI). This allows
> > > using a smaller stateless firmware which does not have to manage
> > > the device tree.
> > > 
> > > The new "vof.bin" firmware image is included with source code under
> > > pc-bios/. It also includes RTAS blob.
> > > 
> > > This implements a handful of CI methods just to get -kernel/-initrd
> > > working. In particular, this implements the device tree fetching and
> > > simple memory allocator - "claim" (an OF CI memory allocator) and updates
> > > "/memory@0/available" to report the client about available memory.
> > > 
> > > This implements changing some device tree properties which we know how
> > > to deal with, the rest is ignored. To allow changes, this skips
> > > fdt_pack() when x-vof=on as not packing the blob leaves some room for
> > > appending.
> > > 
> > > In absence of SLOF, this assigns phandles to device tree nodes to make
> > > device tree traversing work.
> > > 
> > > When x-vof=on, this adds "/chosen" every time QEMU (re)builds a tree.
> > > 
> > > This adds basic instances support which are managed by a hash map
> > > ihandle -> [phandle].
> > > 
> > > Before the guest started, the used memory is:
> > > 0..4000 - the initial firmware
> > > 10000..180000 - stack
> > > 
> > > This OF CI does not implement "interpret".
> > > 
> > > Unlike SLOF, this does not format uninitialized nvram. Instead, this
> > > includes a disk image with pre-formatted nvram.
> > 
> > I think we'll need to improve this, but that can be a later patch.
> > 
> > > With this basic support, this can only boot into kernel directly.
> > > However this is just enough for the petitboot kernel and initradmdisk to
> > > boot from any possible source. Note this requires reasonably recent guest
> > > kernel with:
> > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df5be5be8735
> > > 
> > > The immediate benefit is much faster booting time which especially
> > > crucial with fully emulated early CPU bring up environments. Also this
> > > may come handy when/if GRUB-in-the-userspace sees light of the day.
> > > 
> > > This separates VOF and sPAPR in a hope that VOF bits may be reused by
> > > other POWERPC boards which do not support pSeries.
> > > 
> > > This is coded in assumption that later on we might be adding support for
> > > booting from QEMU backends (blockdev is the first candidate) without
> > > devices/drivers in between as OF1275 does not require that and
> > > it is quite easy to so.
> > > 
> > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > ---
> > > 
> > > The example command line is:
> > > 
> > > /home/aik/pbuild/qemu-killslof-localhost-ppc64/qemu-system-ppc64 \
> > > -nodefaults \
> > > -chardev stdio,id=STDIO0,signal=off,mux=on \
> > > -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
> > > -mon id=MON0,chardev=STDIO0,mode=readline \
> > > -nographic \
> > > -vga none \
> > > -enable-kvm \
> > > -m 2G \
> > > -machine pseries,x-vof=on,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken,cap-ccf-assist=off \
> > > -kernel pbuild/kernel-le-guest/vmlinux \
> > > -initrd pb/rootfs.cpio.xz \
> > > -drive id=DRIVE0,if=none,file=./p/qemu-killslof/pc-bios/vof/nvram.bin,format=raw \
> > > -global spapr-nvram.drive=DRIVE0 \
> > > -snapshot \
> > > -smp 8,threads=8 \
> > > -L /home/aik/t/qemu-ppc64-bios/ \
> > > -trace events=qemu_trace_events \
> > > -d guest_errors \
> > > -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.tmux26 \
> > > -mon chardev=SOCKET0,mode=control
> > > 
> > > ---
> > > Changes:
> > > v14:
> > > * check for truncates in readstr()
> > > * ditched a separate vof_reset()
> > > * spapr->vof is a pointer now, dropped the "on" field
> > > * removed rtas_base from vof and updated comment why we allow setting it
> > > * added myself to maintainers
> > > * updated commit log about blockdev and other possible platforms
> > > * added a note why new hcall is 0x5
> > > * no in place endianness convertion in spapr_h_vof_client
> > > * converted all cpu_physical_memory_read/write to address_space_rw
> > > * git mv hw/ppc/spapr_vof_client.c hw/ppc/spapr_vof.c
> > > 
> > > v13:
> > > * rebase on latest ppc-for-6.0
> > > * shuffled code around to touch spapr.c less
> > > 
> > > v12:
> > > * split VOF and SPAPR
> > > 
> > > v11:
> > > * added g_autofree
> > > * fixed gcc warnings
> > > * fixed few leaks
> > > * added nvram image to make "nvram --print-config" not crash;
> > > Note that contrary to  MIN_NVRAM_SIZE (8 * KiB), the actual minimum size
> > > is 16K, or it just does not work (empty output from "nvram")
> > > 
> > > v10:
> > > * now rebased to compile with meson
> > > 
> > > v9:
> > > * remove special handling of /rtas/rtas-size as now we always add it in QEMU
> > > * removed leftovers from scsi/grub/stdout/stdin/...
> > > 
> > > v8:
> > > * no read/write/seek
> > > * no @dev in instances
> > > * the machine flag is "x-vof" for now
> > > 
> > > v7:
> > > * now we have a small firmware which loads at 0 as SLOF and starts from
> > > 0x100 as SLOF
> > > * no MBR/ELF/GRUB business in QEMU anymore
> > > * blockdev is a separate patch
> > > * networking is a separate patch
> > > 
> > > v6:
> > > * borrowed a big chunk of commit log introduction from David
> > > * fixed initial stack pointer (points to the highest address of stack)
> > > * traces for "interpret" and others
> > > * disabled  translate_kernel_address() hack so grub can load (work in
> > > progress)
> > > * added "milliseconds" for grub
> > > * fixed "claim" allocator again
> > > * moved FDT_MAX_SIZE to spapr.h as spapr_of_client.c wants it too for CAS
> > > * moved the most code possible from spapr.c to spapr_of_client.c, such as
> > > RTAS, prom entry and FDT build/finalize
> > > * separated blobs
> > > * GRUB now proceeds to its console prompt (there are still other issues)
> > > * parse MBR/GPT to find PReP and load GRUB
> > > 
> > > v5:
> > > * made instances keep device and chardev pointers
> > > * removed VIO dependencies
> > > * print error if RTAS memory is not claimed as it should have been
> > > * pack FDT as "quiesce"
> > > 
> > > v4:
> > > * fixed open
> > > * validate ihandles in "call-method"
> > > 
> > > v3:
> > > * fixed phandles allocation
> > > * s/__be32/uint32_t/ as we do not normally have __be32 type in qemu
> > > * fixed size of /chosen/stdout
> > > * bunch of renames
> > > * do not create rtas properties at all, let the client deal with it;
> > > instead setprop allows changing these in the FDT
> > > * no more packing FDT when bios=off - nobody needs it and getprop does not
> > > work otherwise
> > > * allow updating initramdisk device tree properties (for zImage)
> > > * added instances
> > > * fixed stdout on OF's "write"
> > > * removed special handling for stdout in OF client, spapr-vty handles it
> > > instead
> > > 
> > > v2:
> > > * fixed claim()
> > > * added "setprop"
> > > * cleaner client interface and RTAS blobs management
> > > * boots to petitboot and further to the target system
> > > * more trace points
> > > ---
> > >   pc-bios/vof/Makefile   |  18 +
> > >   hw/ppc/vof.h           |  42 ++
> > >   include/hw/ppc/spapr.h |  22 +-
> > >   pc-bios/vof/vof.h      |  44 +++
> > >   hw/ppc/spapr.c         |  78 +++-
> > >   hw/ppc/spapr_hcall.c   |  26 +-
> > >   hw/ppc/spapr_vof.c     | 138 +++++++
> > >   hw/ppc/vof.c           | 864 +++++++++++++++++++++++++++++++++++++++++
> > >   pc-bios/vof/bootmem.c  |  13 +
> > >   pc-bios/vof/ci.c       | 108 ++++++
> > >   pc-bios/vof/libc.c     |  91 +++++
> > >   pc-bios/vof/main.c     |  22 ++
> > >   MAINTAINERS            |  11 +
> > >   hw/ppc/meson.build     |   2 +
> > >   hw/ppc/trace-events    |  21 +
> > >   pc-bios/README         |   2 +
> > >   pc-bios/vof.bin        | Bin 0 -> 3680 bytes
> > >   pc-bios/vof/entry.S    |  51 +++
> > >   pc-bios/vof/l.lds      |  48 +++
> > >   pc-bios/vof/nvram.bin  | Bin 0 -> 16384 bytes
> > >   20 files changed, 1592 insertions(+), 9 deletions(-)
> > >   create mode 100644 pc-bios/vof/Makefile
> > >   create mode 100644 hw/ppc/vof.h
> > >   create mode 100644 pc-bios/vof/vof.h
> > >   create mode 100644 hw/ppc/spapr_vof.c
> > >   create mode 100644 hw/ppc/vof.c
> > >   create mode 100644 pc-bios/vof/bootmem.c
> > >   create mode 100644 pc-bios/vof/ci.c
> > >   create mode 100644 pc-bios/vof/libc.c
> > >   create mode 100644 pc-bios/vof/main.c
> > >   create mode 100755 pc-bios/vof.bin
> > >   create mode 100644 pc-bios/vof/entry.S
> > >   create mode 100644 pc-bios/vof/l.lds
> > >   create mode 100644 pc-bios/vof/nvram.bin
> > > 
> > > diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
> > > new file mode 100644
> > > index 000000000000..49f7e240eeff
> > > --- /dev/null
> > > +++ b/pc-bios/vof/Makefile
> > > @@ -0,0 +1,18 @@
> > > +all: build-all
> > > +
> > > +build-all: vof.bin
> > > +
> > > +%.o: %.S
> > > +	cc -m32 -mbig-endian -c -o $@ $<
> > > +
> > > +%.o: %.c
> > > +	cc -m32 -mbig-endian -c -fno-stack-protector -Wno-builtin-declaration-mismatch -o $@ $<
> > > +
> > > +vof.elf: entry.o main.o libc.o ci.o bootmem.o
> > > +	ld -nostdlib -e_start -Tl.lds -EB -o $@ $^
> > > +
> > > +%.bin: %.elf
> > > +	objcopy -O binary -j .text -j .data -j .toc -j .got2 $^ $@
> > > +
> > > +clean:
> > > +	rm -f *.o *.bin *.elf *~
> > > diff --git a/hw/ppc/vof.h b/hw/ppc/vof.h
> > > new file mode 100644
> > > index 000000000000..c8fadf23ea5b
> > > --- /dev/null
> > > +++ b/hw/ppc/vof.h
> > > @@ -0,0 +1,42 @@
> > > + /* Virtual Open Firmware */
> > > +#ifndef HW_VOF_H
> > > +#define HW_VOF_H
> > > +
> > > +typedef struct Vof {
> > > +    uint32_t top_addr; /* copied from rma_size */
> > > +    GArray *claimed; /* array of SpaprOfClaimed */
> > > +    uint64_t claimed_base;
> > > +    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
> > > +    uint32_t of_instance_last;
> > > +    char *bootargs;
> > > +    uint32_t initrd_base; /* Updated in spapr at CAS */
> > > +    long initrd_size; /* Updated in spapr at CAS */
> > > +} Vof;
> > > +
> > > +uint32_t vof_client_call(void *fdt, Vof *vof, const char *service,
> > > +                         uint32_t *args, unsigned nargs,
> > > +                         uint32_t *rets, unsigned nrets);
> > > +uint64_t vof_claim(void *fdt, Vof *vof, uint64_t virt, uint64_t size,
> > > +                   uint64_t align);
> > > +void vof_cleanup(Vof *vof);
> > > +void vof_build_dt(void *fdt, Vof *vof, uint32_t top_addr);
> > > +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
> > > +                               const char *prop, const char *path);
> > > +
> > > +/* ibm,client-architecture-support */
> > > +#define TYPE_CLIENT_ARCHITECTURE_SUPPORT "client-architecture-support"
> > > +#define CLIENT_ARCHITECTURE_SUPPORT(obj) \
> > > +    INTERFACE_CHECK(ClientArchitectureSupport, (obj), TYPE_CLIENT_ARCHITECTURE_SUPPORT)
> > > +
> > > +typedef struct ClientArchitectureSupportClass ClientArchitectureSupportClass;
> > > +DECLARE_CLASS_CHECKERS(ClientArchitectureSupportClass,
> > > +                       CLIENT_ARCHITECTURE_SUPPORT,
> > > +                       TYPE_CLIENT_ARCHITECTURE_SUPPORT)
> > > +
> > > +struct ClientArchitectureSupportClass {
> > > +    InterfaceClass parent;
> > > +    target_ulong (*cas)(CPUState *cs, target_ulong vec);
> > > +    void (*quiesce)(void);
> > 
> > Is there actually any real connection of quiesce behaviour to cas
> > behaviour?  Basically, I'm wondering if this is not so much about
> > client-architecture-support fundamentally as just about
> > machine-specific parts of the VOF behaviour.  Which would be fine, but
> > suggests a different name for the interface.
> 
> The most canonical way would be having 2 interfaces.

Why?  I don't see any reason these shouldn't be a single interface, it
just has a bad name.

> I thought it would be
> too much and left one. Then I thought may be the name should be PAPR but
> "quiesce" is ... I actually cannot spot where it came from, I do not see it
> neither in OF1275 nor PAPR. So the new name is fine but which one? I can
> make it

Huh, weird.  I'm pretty sure it's not PAPR specific - it was used on
ppc based Macs as well.  But as you say, it doesn't appear in IEEE1275
(although the concept of a quiescent state is mentioned with the
'reset' method).  My guess would be it's described in one of those
early OF extension documents that we basically assume is always
implemented (like the interrupt mapping bindings or the generic names
convention).

> struct VofSupportClass {
>  target_ulong callmethod(const char *method, int nargs, target_ulong *args,
> int nret, target_ulong *rets);
> }
> 
> but it looks too vague and makes it harder for the reader.

No, that would be silly.  Just have separate methods in the one
interface for cas and quiesce.

> > > +};
> > > +
> > > +#endif /* HW_VOF_H */
> > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > > index ccbeeca1de84..4896b9fae784 100644
> > > --- a/include/hw/ppc/spapr.h
> > > +++ b/include/hw/ppc/spapr.h
> > > @@ -12,6 +12,7 @@
> > >   #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
> > >   #include "hw/ppc/xics.h"        /* For ICSState */
> > >   #include "hw/ppc/spapr_tpm_proxy.h"
> > > +#include "hw/ppc/vof.h"
> > >   struct SpaprVioBus;
> > >   struct SpaprPhbState;
> > > @@ -180,6 +181,7 @@ struct SpaprMachineState {
> > >       uint64_t kernel_addr;
> > >       uint32_t initrd_base;
> > >       long initrd_size;
> > > +    Vof *vof;
> > >       uint64_t rtc_offset; /* Now used only during incoming migration */
> > >       struct PPCTimebase tb;
> > >       bool has_graphics;
> > > @@ -554,7 +556,9 @@ struct SpaprMachineState {
> > >   /* Client Architecture support */
> > >   #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
> > >   #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
> > > -#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
> > > +/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
> > > +#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
> > > +#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
> > >   /*
> > >    * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
> > > @@ -944,4 +948,20 @@ bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize,
> > >   void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
> > >   hwaddr spapr_get_rtas_addr(void);
> > >   bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
> > > +
> > > +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
> > > +                     target_ulong *stack_ptr);
> > > +void spapr_vof_quiesce(void);
> > > +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
> > > +                                target_ulong opcode, target_ulong *args);
> > 
> > Alexey or Zoltan, any thoughts on how non-PAPR versions of this would
> > call into qemu to get the non-guest parts of VOF to execute?
> 
> Non-PAPR could do it as we do it for soft breakpoints in KVM - some
> predefined illegal instruction which KVM knows that it is used for soft
> breakpoints.

Yeah, I guess.

[snip]
> > > +typedef int size_t;
> > > +typedef void client(void);
> > > +
> > > +/* globals */
> > > +extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) */
> > > +
> > > +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
> > > +
> > > +/* libc */
> > > +int strlen(const char *s);
> > > +int strcmp(const char *s1, const char *s2);
> > > +void *memcpy(void *dest, const void *src, size_t n);
> > > +int memcmp(const void *ptr1, const void *ptr2, size_t n);
> > > +void *memmove(void *dest, const void *src, size_t n);
> > > +void *memset(void *dest, int c, size_t size);
> > > +
> > > +/* Prom */
> > > +typedef unsigned long prom_arg_t;
> > > +int call_prom(const char *service, int nargs, int nret, ...);
> > 
> > AIUI this isn't so much about calling the PROM, since this *is* the
> > PROM code, but rather about calling the parts that are implemented on
> > the qemu side.  Different names might clarify that.
> 
> "call_ci"?

Works for me.

[snip]
> > > +static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, uint32_t pname,
> > > +                            uint32_t valaddr, uint32_t vallen)
> > > +{
> > > +    char propname[OF_PROPNAME_LEN_MAX + 1];
> > > +    uint32_t ret = 0;
> > > +    int proplen = 0;
> > > +    const void *prop;
> > > +    char trval[64] = "";
> > > +    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
> > > +
> > > +    if (readstr(pname, propname, sizeof(propname))) {
> > > +        return -1;
> > > +    }
> > > +    if (strcmp(propname, "name") == 0) {
> > > +        prop = fdt_get_name(fdt, nodeoff, &proplen);
> > > +        proplen += 1;
> > 
> > This isn't quite right, I don't think.  fdt_get_name() returns the
> > name *including* unit address, but the 'name' property will omit the
> > unit address.
> 
> Ok, will fix. Is there a helper to chop off the unit address btw, since I
> gave up my split()?

I'm afraid not.  It's not really feasible to "chop" strings in libfdt,
since we don't use malloc().  fdt_get_name() just returns a pointer
into the dtb at the relevant place, which will include the unit
address.

[snip]
> > > +    } else {
> > > +        goto trace_exit;
> > > +    }
> > > +
> > > +    offset = fdt_node_offset_by_phandle(fdt, nodeph);
> > > +    if (offset >= 0) {
> > > +        uint8_t data[vallen];
> > 
> > We probably want some limit on vallen so the guest can't force
> > allocation of arbitrary amounts of qemu stack space.
> 
> Indeed. 1024 + 1?

Seems reasonable for now.

[snip]
> > > diff --git a/pc-bios/vof/ci.c b/pc-bios/vof/ci.c
> > > new file mode 100644
> > > index 000000000000..4880b3d2047c
> > > --- /dev/null
> > > +++ b/pc-bios/vof/ci.c
> > > @@ -0,0 +1,108 @@
> > > +#include "vof.h"
> > > +
> > > +struct prom_args {
> > > +        uint32_t service;
> > > +        uint32_t nargs;
> > > +        uint32_t nret;
> > > +        uint32_t args[10];
> > > +};
> > > +
> > > +#define ADDR(x) (uint32_t)(x)
> > > +
> > > +extern uint32_t ci_entry(uint32_t params);
> > > +
> > > +extern unsigned long hv_rtas(unsigned long params);
> > > +extern unsigned int hv_rtas_size;
> > > +
> > > +bool prom_handle(struct prom_args *pargs)
> > > +{
> > > +	void *rtasbase;
> > > +	uint32_t rtassize = 0;
> > > +	phandle rtas;
> > > +
> > > +	if (strcmp("call-method", (void *)(unsigned long) pargs->service))
> > > +		return false;
> > > +
> > > +	if (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0]))
> > > +		return false;
> > > +
> > > +	rtas = ci_finddevice("/rtas");
> > > +	ci_getprop(rtas, "rtas-size", &rtassize, sizeof(rtassize));
> > 
> > Why do we need this?  IIRC the "rtas-size" property is a qemu
> > extension we used to use for the interaction between SLOF and qemu -
> > can't we just use hv_rtas_size directly for the VOF case?
> 
> 
> It depends on the FWNMI capability, it is either RTAS or RTAS + log
> size.

Ah, right.
David Gibson March 9, 2021, 5:33 a.m. UTC | #7
On Tue, Mar 02, 2021 at 10:37:59AM +0100, BALATON Zoltan wrote:
> On Tue, 2 Mar 2021, Alexey Kardashevskiy wrote:
> > On 02/03/2021 14:35, David Gibson wrote:
> > > Alexey or Zoltan, any thoughts on how non-PAPR versions of this would
> > > call into qemu to get the non-guest parts of VOF to execute?
> > 
> > Non-PAPR could do it as we do it for soft breakpoints in KVM - some
> > predefined illegal instruction which KVM knows that it is used for soft
> > breakpoints.
> 
> So far I've thought I'd need to implement TYPE_PPC_VIRTUAL_HYPERVISOR
> interface in the machine for the code in target/ppc/excp_helper.c to forward
> client syscalls to QEMU where I could handle the VOF related calls but not
> sure this is the best way.

I'm not sure that will work on its own.  VIRTUAL_HYPERVISOR just traps
the "sc 1" (hypecall instruction).  If that's an illegal instruction
for CPU, we can theoretically still use it, but I suspect TCG will
flag it as an illegal instruction and trap before we even get to the
VIRTUAL_HYPERVISOR dispatch point.  You'll need to investigate.

> This could be OK for the initial boot when
> nothing else should use syscalls yet but with RTAS this may not work as the
> guest OS could also use syscalls so to avoid conflicts we may need to shut
> down the virtual hypervisor on quiesce

Uh.. no.  VIRTUAL_HYPERVISOR doesn't intercept normal system calls,
only "level 1" system calls which are explicitly designated for
hypercalls.

> which means I may need a minimal
> guest only rtas for pegasos2 (which would be OK as I think it's only used
> for shutdown/reboot anyway). Also may need some changes to allow empty
> callbacks in vhyp to be ignored when I only want to implement hypercall
> method but that's just adding checks to only call non-NULL callbacks in
> PPCVirtualHypervisorClass.
> 
> There's also an old patch from Benjamin Herrenschmidt to add MOL OSI which
> is a similar hypercall interface:
> https://github.com/ozbenh/qemu/commit/6dc8803641e323030ffd01ad8ce0dcf081896698
> This might also be useful later to use MOL paravirtual drivers to speed up
> MacOSX emulation. but I haven't looked at the details yet.
> 
> Any other ideas?
> 
> Regards,
> BALATON Zoltan
>
Alexey Kardashevskiy March 9, 2021, 7:28 a.m. UTC | #8
On 09/03/2021 16:29, David Gibson wrote:


>>>> +struct ClientArchitectureSupportClass {
>>>> +    InterfaceClass parent;
>>>> +    target_ulong (*cas)(CPUState *cs, target_ulong vec);
>>>> +    void (*quiesce)(void);
>>>
>>> Is there actually any real connection of quiesce behaviour to cas
>>> behaviour?  Basically, I'm wondering if this is not so much about
>>> client-architecture-support fundamentally as just about
>>> machine-specific parts of the VOF behaviour.  Which would be fine, but
>>> suggests a different name for the interface.
>>
>> The most canonical way would be having 2 interfaces.
> 
> Why?  I don't see any reason these shouldn't be a single interface, it
> just has a bad name.

I renamed it to SpaprVofInterface for now.


> [snip]
>>>> +typedef int size_t;
>>>> +typedef void client(void);
>>>> +
>>>> +/* globals */
>>>> +extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) */
>>>> +
>>>> +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
>>>> +
>>>> +/* libc */
>>>> +int strlen(const char *s);
>>>> +int strcmp(const char *s1, const char *s2);
>>>> +void *memcpy(void *dest, const void *src, size_t n);
>>>> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
>>>> +void *memmove(void *dest, const void *src, size_t n);
>>>> +void *memset(void *dest, int c, size_t size);
>>>> +
>>>> +/* Prom */
>>>> +typedef unsigned long prom_arg_t;
>>>> +int call_prom(const char *service, int nargs, int nret, ...);
>>>
>>> AIUI this isn't so much about calling the PROM, since this *is* the
>>> PROM code, but rather about calling the parts that are implemented on
>>> the qemu side.  Different names might clarify that.
>>
>> "call_ci"?
> 
> Works for me.

call_ci() it is then.

About builtins such as memcmp() - turns out these are not really 
builtins as they are not inlined and gcc/ld still want to link against 
libc which is trickier for such firmware (not quite sure how to do this 
and keep it small and not pull other libc stuff in), gcc just knows 
about them a bit more. This is different from, for example, 
__builtin_ctz which is inlined. So I am keeping my libc.o for now.
BALATON Zoltan March 9, 2021, 2 p.m. UTC | #9
On Tue, 9 Mar 2021, Alexey Kardashevskiy wrote:
> On 09/03/2021 16:29, David Gibson wrote:
>>>>> +struct ClientArchitectureSupportClass {
>>>>> +    InterfaceClass parent;
>>>>> +    target_ulong (*cas)(CPUState *cs, target_ulong vec);
>>>>> +    void (*quiesce)(void);
>>>> 
>>>> Is there actually any real connection of quiesce behaviour to cas
>>>> behaviour?  Basically, I'm wondering if this is not so much about
>>>> client-architecture-support fundamentally as just about
>>>> machine-specific parts of the VOF behaviour.  Which would be fine, but
>>>> suggests a different name for the interface.
>>> 
>>> The most canonical way would be having 2 interfaces.
>> 
>> Why?  I don't see any reason these shouldn't be a single interface, it
>> just has a bad name.
>
> I renamed it to SpaprVofInterface for now.
>
>
>> [snip]
>>>>> +typedef int size_t;
>>>>> +typedef void client(void);
>>>>> +
>>>>> +/* globals */
>>>>> +extern void _prom_entry(void); /* OF CI entry point (i.e. this 
>>>>> firmware) */
>>>>> +
>>>>> +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
>>>>> +
>>>>> +/* libc */
>>>>> +int strlen(const char *s);
>>>>> +int strcmp(const char *s1, const char *s2);
>>>>> +void *memcpy(void *dest, const void *src, size_t n);
>>>>> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
>>>>> +void *memmove(void *dest, const void *src, size_t n);
>>>>> +void *memset(void *dest, int c, size_t size);
>>>>> +
>>>>> +/* Prom */
>>>>> +typedef unsigned long prom_arg_t;
>>>>> +int call_prom(const char *service, int nargs, int nret, ...);
>>>> 
>>>> AIUI this isn't so much about calling the PROM, since this *is* the
>>>> PROM code, but rather about calling the parts that are implemented on
>>>> the qemu side.  Different names might clarify that.
>>> 
>>> "call_ci"?
>> 
>> Works for me.
>
> call_ci() it is then.
>
> About builtins such as memcmp() - turns out these are not really builtins as 
> they are not inlined and gcc/ld still want to link against libc which is 
> trickier for such firmware (not quite sure how to do this and keep it small 
> and not pull other libc stuff in), gcc just knows about them a bit more. This 
> is different from, for example, __builtin_ctz which is inlined. So I am 
> keeping my libc.o for now.

Do they really want libc or they are in libgcc for which there's 
--static-libgcc I think to avoid needing it in runtime but not sure what 
clang has for these.

Regards,
BALATON Zoltan
Alexey Kardashevskiy March 10, 2021, 1:55 a.m. UTC | #10
On 10/03/2021 01:00, BALATON Zoltan wrote:
> On Tue, 9 Mar 2021, Alexey Kardashevskiy wrote:
>> On 09/03/2021 16:29, David Gibson wrote:
>>>>>> +struct ClientArchitectureSupportClass {
>>>>>> +    InterfaceClass parent;
>>>>>> +    target_ulong (*cas)(CPUState *cs, target_ulong vec);
>>>>>> +    void (*quiesce)(void);
>>>>>
>>>>> Is there actually any real connection of quiesce behaviour to cas
>>>>> behaviour?  Basically, I'm wondering if this is not so much about
>>>>> client-architecture-support fundamentally as just about
>>>>> machine-specific parts of the VOF behaviour.  Which would be fine, but
>>>>> suggests a different name for the interface.
>>>>
>>>> The most canonical way would be having 2 interfaces.
>>>
>>> Why?  I don't see any reason these shouldn't be a single interface, it
>>> just has a bad name.
>>
>> I renamed it to SpaprVofInterface for now.
>>
>>
>>> [snip]
>>>>>> +typedef int size_t;
>>>>>> +typedef void client(void);
>>>>>> +
>>>>>> +/* globals */
>>>>>> +extern void _prom_entry(void); /* OF CI entry point (i.e. this 
>>>>>> firmware) */
>>>>>> +
>>>>>> +void do_boot(unsigned long addr, unsigned long r3, unsigned long 
>>>>>> r4);
>>>>>> +
>>>>>> +/* libc */
>>>>>> +int strlen(const char *s);
>>>>>> +int strcmp(const char *s1, const char *s2);
>>>>>> +void *memcpy(void *dest, const void *src, size_t n);
>>>>>> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
>>>>>> +void *memmove(void *dest, const void *src, size_t n);
>>>>>> +void *memset(void *dest, int c, size_t size);
>>>>>> +
>>>>>> +/* Prom */
>>>>>> +typedef unsigned long prom_arg_t;
>>>>>> +int call_prom(const char *service, int nargs, int nret, ...);
>>>>>
>>>>> AIUI this isn't so much about calling the PROM, since this *is* the
>>>>> PROM code, but rather about calling the parts that are implemented on
>>>>> the qemu side.  Different names might clarify that.
>>>>
>>>> "call_ci"?
>>>
>>> Works for me.
>>
>> call_ci() it is then.
>>
>> About builtins such as memcmp() - turns out these are not really 
>> builtins as they are not inlined and gcc/ld still want to link against 
>> libc which is trickier for such firmware (not quite sure how to do 
>> this and keep it small and not pull other libc stuff in), gcc just 
>> knows about them a bit more. This is different from, for example, 
>> __builtin_ctz which is inlined. So I am keeping my libc.o for now.
> 
> Do they really want libc or they are in libgcc for which there's 
> --static-libgcc I think to avoid needing it in runtime but not sure what 
> clang has for these.

I was getting "unresolved symbol `memcmp`" when I tried calling memcmp() 
or __builtin_memcmp() and "-lc" did not help (installed some multilib 
packages, did not help either). I figured if I cannot get it compile in 
3 minutes, I should not probably be posting this and better off simply 
keeping the existing small libc.
David Gibson March 10, 2021, 2:40 a.m. UTC | #11
On Wed, Mar 10, 2021 at 12:55:07PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 10/03/2021 01:00, BALATON Zoltan wrote:
> > On Tue, 9 Mar 2021, Alexey Kardashevskiy wrote:
> > > On 09/03/2021 16:29, David Gibson wrote:
> > > > > > > +struct ClientArchitectureSupportClass {
> > > > > > > +    InterfaceClass parent;
> > > > > > > +    target_ulong (*cas)(CPUState *cs, target_ulong vec);
> > > > > > > +    void (*quiesce)(void);
> > > > > > 
> > > > > > Is there actually any real connection of quiesce behaviour to cas
> > > > > > behaviour?  Basically, I'm wondering if this is not so much about
> > > > > > client-architecture-support fundamentally as just about
> > > > > > machine-specific parts of the VOF behaviour.  Which would be fine, but
> > > > > > suggests a different name for the interface.
> > > > > 
> > > > > The most canonical way would be having 2 interfaces.
> > > > 
> > > > Why?  I don't see any reason these shouldn't be a single interface, it
> > > > just has a bad name.
> > > 
> > > I renamed it to SpaprVofInterface for now.
> > > 
> > > 
> > > > [snip]
> > > > > > > +typedef int size_t;
> > > > > > > +typedef void client(void);
> > > > > > > +
> > > > > > > +/* globals */
> > > > > > > +extern void _prom_entry(void); /* OF CI entry point
> > > > > > > (i.e. this firmware) */
> > > > > > > +
> > > > > > > +void do_boot(unsigned long addr, unsigned long r3,
> > > > > > > unsigned long r4);
> > > > > > > +
> > > > > > > +/* libc */
> > > > > > > +int strlen(const char *s);
> > > > > > > +int strcmp(const char *s1, const char *s2);
> > > > > > > +void *memcpy(void *dest, const void *src, size_t n);
> > > > > > > +int memcmp(const void *ptr1, const void *ptr2, size_t n);
> > > > > > > +void *memmove(void *dest, const void *src, size_t n);
> > > > > > > +void *memset(void *dest, int c, size_t size);
> > > > > > > +
> > > > > > > +/* Prom */
> > > > > > > +typedef unsigned long prom_arg_t;
> > > > > > > +int call_prom(const char *service, int nargs, int nret, ...);
> > > > > > 
> > > > > > AIUI this isn't so much about calling the PROM, since this *is* the
> > > > > > PROM code, but rather about calling the parts that are implemented on
> > > > > > the qemu side.  Different names might clarify that.
> > > > > 
> > > > > "call_ci"?
> > > > 
> > > > Works for me.
> > > 
> > > call_ci() it is then.
> > > 
> > > About builtins such as memcmp() - turns out these are not really
> > > builtins as they are not inlined and gcc/ld still want to link
> > > against libc which is trickier for such firmware (not quite sure how
> > > to do this and keep it small and not pull other libc stuff in), gcc
> > > just knows about them a bit more. This is different from, for
> > > example, __builtin_ctz which is inlined. So I am keeping my libc.o
> > > for now.
> > 
> > Do they really want libc or they are in libgcc for which there's
> > --static-libgcc I think to avoid needing it in runtime but not sure what
> > clang has for these.
> 
> I was getting "unresolved symbol `memcmp`" when I tried calling memcmp() or
> __builtin_memcmp() and "-lc" did not help (installed some multilib packages,

Yeah, you'll need -lgcc rather than -lc, libgcc is the one with the
builtin helpers.

> did not help either). I figured if I cannot get it compile in 3 minutes, I
> should not probably be posting this and better off simply keeping the
> existing small libc.

Fair point.
David Gibson March 10, 2021, 4:11 a.m. UTC | #12
On Tue, Mar 09, 2021 at 06:28:44PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 09/03/2021 16:29, David Gibson wrote:
> 
> 
> > > > > +struct ClientArchitectureSupportClass {
> > > > > +    InterfaceClass parent;
> > > > > +    target_ulong (*cas)(CPUState *cs, target_ulong vec);
> > > > > +    void (*quiesce)(void);
> > > > 
> > > > Is there actually any real connection of quiesce behaviour to cas
> > > > behaviour?  Basically, I'm wondering if this is not so much about
> > > > client-architecture-support fundamentally as just about
> > > > machine-specific parts of the VOF behaviour.  Which would be fine, but
> > > > suggests a different name for the interface.
> > > 
> > > The most canonical way would be having 2 interfaces.
> > 
> > Why?  I don't see any reason these shouldn't be a single interface, it
> > just has a bad name.
> 
> I renamed it to SpaprVofInterface for now.

It doesn't really have anything to do with PAPR, though.  Well, I
guess the CAS part does, but quiesce doesn't.  I'd suggest
"VofMachineInterface" - it represents where VOF needs to interact with
machine type specifics.

> > [snip]
> > > > > +typedef int size_t;
> > > > > +typedef void client(void);
> > > > > +
> > > > > +/* globals */
> > > > > +extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) */
> > > > > +
> > > > > +void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
> > > > > +
> > > > > +/* libc */
> > > > > +int strlen(const char *s);
> > > > > +int strcmp(const char *s1, const char *s2);
> > > > > +void *memcpy(void *dest, const void *src, size_t n);
> > > > > +int memcmp(const void *ptr1, const void *ptr2, size_t n);
> > > > > +void *memmove(void *dest, const void *src, size_t n);
> > > > > +void *memset(void *dest, int c, size_t size);
> > > > > +
> > > > > +/* Prom */
> > > > > +typedef unsigned long prom_arg_t;
> > > > > +int call_prom(const char *service, int nargs, int nret, ...);
> > > > 
> > > > AIUI this isn't so much about calling the PROM, since this *is* the
> > > > PROM code, but rather about calling the parts that are implemented on
> > > > the qemu side.  Different names might clarify that.
> > > 
> > > "call_ci"?
> > 
> > Works for me.
> 
> call_ci() it is then.
> 
> About builtins such as memcmp() - turns out these are not really builtins as
> they are not inlined and gcc/ld still want to link against libc which is
> trickier for such firmware (not quite sure how to do this and keep it small
> and not pull other libc stuff in), gcc just knows about them a bit more.
> This is different from, for example, __builtin_ctz which is inlined. So I am
> keeping my libc.o for now.
> 
> 
>
Alexey Kardashevskiy March 10, 2021, 4:52 a.m. UTC | #13
On 10/03/2021 13:40, David Gibson wrote:
> On Wed, Mar 10, 2021 at 12:55:07PM +1100, Alexey Kardashevskiy wrote:
>>
>>
>> On 10/03/2021 01:00, BALATON Zoltan wrote:
>>> On Tue, 9 Mar 2021, Alexey Kardashevskiy wrote:
>>>> On 09/03/2021 16:29, David Gibson wrote:
>>>>>>>> +struct ClientArchitectureSupportClass {
>>>>>>>> +    InterfaceClass parent;
>>>>>>>> +    target_ulong (*cas)(CPUState *cs, target_ulong vec);
>>>>>>>> +    void (*quiesce)(void);
>>>>>>>
>>>>>>> Is there actually any real connection of quiesce behaviour to cas
>>>>>>> behaviour?  Basically, I'm wondering if this is not so much about
>>>>>>> client-architecture-support fundamentally as just about
>>>>>>> machine-specific parts of the VOF behaviour.  Which would be fine, but
>>>>>>> suggests a different name for the interface.
>>>>>>
>>>>>> The most canonical way would be having 2 interfaces.
>>>>>
>>>>> Why?  I don't see any reason these shouldn't be a single interface, it
>>>>> just has a bad name.
>>>>
>>>> I renamed it to SpaprVofInterface for now.
>>>>
>>>>
>>>>> [snip]
>>>>>>>> +typedef int size_t;
>>>>>>>> +typedef void client(void);
>>>>>>>> +
>>>>>>>> +/* globals */
>>>>>>>> +extern void _prom_entry(void); /* OF CI entry point
>>>>>>>> (i.e. this firmware) */
>>>>>>>> +
>>>>>>>> +void do_boot(unsigned long addr, unsigned long r3,
>>>>>>>> unsigned long r4);
>>>>>>>> +
>>>>>>>> +/* libc */
>>>>>>>> +int strlen(const char *s);
>>>>>>>> +int strcmp(const char *s1, const char *s2);
>>>>>>>> +void *memcpy(void *dest, const void *src, size_t n);
>>>>>>>> +int memcmp(const void *ptr1, const void *ptr2, size_t n);
>>>>>>>> +void *memmove(void *dest, const void *src, size_t n);
>>>>>>>> +void *memset(void *dest, int c, size_t size);
>>>>>>>> +
>>>>>>>> +/* Prom */
>>>>>>>> +typedef unsigned long prom_arg_t;
>>>>>>>> +int call_prom(const char *service, int nargs, int nret, ...);
>>>>>>>
>>>>>>> AIUI this isn't so much about calling the PROM, since this *is* the
>>>>>>> PROM code, but rather about calling the parts that are implemented on
>>>>>>> the qemu side.  Different names might clarify that.
>>>>>>
>>>>>> "call_ci"?
>>>>>
>>>>> Works for me.
>>>>
>>>> call_ci() it is then.
>>>>
>>>> About builtins such as memcmp() - turns out these are not really
>>>> builtins as they are not inlined and gcc/ld still want to link
>>>> against libc which is trickier for such firmware (not quite sure how
>>>> to do this and keep it small and not pull other libc stuff in), gcc
>>>> just knows about them a bit more. This is different from, for
>>>> example, __builtin_ctz which is inlined. So I am keeping my libc.o
>>>> for now.
>>>
>>> Do they really want libc or they are in libgcc for which there's
>>> --static-libgcc I think to avoid needing it in runtime but not sure what
>>> clang has for these.
>>
>> I was getting "unresolved symbol `memcmp`" when I tried calling memcmp() or
>> __builtin_memcmp() and "-lc" did not help (installed some multilib packages,
> 
> Yeah, you'll need -lgcc rather than -lc, libgcc is the one with the
> builtin helpers.

Tried that:
===
ld -nostdlib -e_start -Tl.lds -EB -lgcc -o vof.elf entry.o main.o ci.o 
bootmem.o
ld: cannot find -lgcc
===

I tried playing with the cmdline but to no avail.

I also looked at libgcc and it does not look like it has the libc 
implementations:

===
[fstn1-p1 qemu-killslof]$ find /lib -iname "libgcc*"
/lib/powerpc64le-linux-gnu/libgcc_s.so.1

[fstn1-p1 qemu-killslof]$ objdump -D 
/lib/powerpc64le-linux-gnu/libgcc_s.so.1  | egrep '(memcmp|memcpy)'
0000000000010750 <memcpy@plt>:

[fstn1-p1 qemu-killslof]$
===


>> did not help either). I figured if I cannot get it compile in 3 minutes, I
>> should not probably be posting this and better off simply keeping the
>> existing small libc.
> 
> Fair point.

Yup.
diff mbox series

Patch

diff --git a/pc-bios/vof/Makefile b/pc-bios/vof/Makefile
new file mode 100644
index 000000000000..49f7e240eeff
--- /dev/null
+++ b/pc-bios/vof/Makefile
@@ -0,0 +1,18 @@ 
+all: build-all
+
+build-all: vof.bin
+
+%.o: %.S
+	cc -m32 -mbig-endian -c -o $@ $<
+
+%.o: %.c
+	cc -m32 -mbig-endian -c -fno-stack-protector -Wno-builtin-declaration-mismatch -o $@ $<
+
+vof.elf: entry.o main.o libc.o ci.o bootmem.o
+	ld -nostdlib -e_start -Tl.lds -EB -o $@ $^
+
+%.bin: %.elf
+	objcopy -O binary -j .text -j .data -j .toc -j .got2 $^ $@
+
+clean:
+	rm -f *.o *.bin *.elf *~
diff --git a/hw/ppc/vof.h b/hw/ppc/vof.h
new file mode 100644
index 000000000000..c8fadf23ea5b
--- /dev/null
+++ b/hw/ppc/vof.h
@@ -0,0 +1,42 @@ 
+ /* Virtual Open Firmware */
+#ifndef HW_VOF_H
+#define HW_VOF_H
+
+typedef struct Vof {
+    uint32_t top_addr; /* copied from rma_size */
+    GArray *claimed; /* array of SpaprOfClaimed */
+    uint64_t claimed_base;
+    GHashTable *of_instances; /* ihandle -> SpaprOfInstance */
+    uint32_t of_instance_last;
+    char *bootargs;
+    uint32_t initrd_base; /* Updated in spapr at CAS */
+    long initrd_size; /* Updated in spapr at CAS */
+} Vof;
+
+uint32_t vof_client_call(void *fdt, Vof *vof, const char *service,
+                         uint32_t *args, unsigned nargs,
+                         uint32_t *rets, unsigned nrets);
+uint64_t vof_claim(void *fdt, Vof *vof, uint64_t virt, uint64_t size,
+                   uint64_t align);
+void vof_cleanup(Vof *vof);
+void vof_build_dt(void *fdt, Vof *vof, uint32_t top_addr);
+uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
+                               const char *prop, const char *path);
+
+/* ibm,client-architecture-support */
+#define TYPE_CLIENT_ARCHITECTURE_SUPPORT "client-architecture-support"
+#define CLIENT_ARCHITECTURE_SUPPORT(obj) \
+    INTERFACE_CHECK(ClientArchitectureSupport, (obj), TYPE_CLIENT_ARCHITECTURE_SUPPORT)
+
+typedef struct ClientArchitectureSupportClass ClientArchitectureSupportClass;
+DECLARE_CLASS_CHECKERS(ClientArchitectureSupportClass,
+                       CLIENT_ARCHITECTURE_SUPPORT,
+                       TYPE_CLIENT_ARCHITECTURE_SUPPORT)
+
+struct ClientArchitectureSupportClass {
+    InterfaceClass parent;
+    target_ulong (*cas)(CPUState *cs, target_ulong vec);
+    void (*quiesce)(void);
+};
+
+#endif /* HW_VOF_H */
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index ccbeeca1de84..4896b9fae784 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -12,6 +12,7 @@ 
 #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
 #include "hw/ppc/xics.h"        /* For ICSState */
 #include "hw/ppc/spapr_tpm_proxy.h"
+#include "hw/ppc/vof.h"
 
 struct SpaprVioBus;
 struct SpaprPhbState;
@@ -180,6 +181,7 @@  struct SpaprMachineState {
     uint64_t kernel_addr;
     uint32_t initrd_base;
     long initrd_size;
+    Vof *vof;
     uint64_t rtc_offset; /* Now used only during incoming migration */
     struct PPCTimebase tb;
     bool has_graphics;
@@ -554,7 +556,9 @@  struct SpaprMachineState {
 /* Client Architecture support */
 #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
 #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
-#define KVMPPC_HCALL_MAX        KVMPPC_H_UPDATE_DT
+/* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
+#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
+#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
 
 /*
  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
@@ -944,4 +948,20 @@  bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize,
 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask);
 hwaddr spapr_get_rtas_addr(void);
 bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr);
+
+void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
+                     target_ulong *stack_ptr);
+void spapr_vof_quiesce(void);
+target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
+                                target_ulong opcode, target_ulong *args);
+target_ulong spapr_vof_client_architecture_support(CPUState *cs,
+                                                   target_ulong ovec_addr);
+void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt);
+
+/* Copied from SLOF, and 4K is definitely not enough for GRUB */
+#define OF_STACK_SIZE       0x8000
+
+/* 0..10000 is reserved for the VOF fw */
+#define OF_STACK_ADDR       0x10000
+
 #endif /* HW_SPAPR_H */
diff --git a/pc-bios/vof/vof.h b/pc-bios/vof/vof.h
new file mode 100644
index 000000000000..cd5989952a98
--- /dev/null
+++ b/pc-bios/vof/vof.h
@@ -0,0 +1,44 @@ 
+#include <stdarg.h>
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned long uint32_t;
+typedef unsigned long long uint64_t;
+#define NULL (0)
+#define PROM_ERROR (-1u)
+typedef unsigned char bool;
+typedef unsigned long ihandle;
+typedef unsigned long phandle;
+#define false ((bool)0)
+#define true ((bool)1)
+typedef int size_t;
+typedef void client(void);
+
+/* globals */
+extern void _prom_entry(void); /* OF CI entry point (i.e. this firmware) */
+
+void do_boot(unsigned long addr, unsigned long r3, unsigned long r4);
+
+/* libc */
+int strlen(const char *s);
+int strcmp(const char *s1, const char *s2);
+void *memcpy(void *dest, const void *src, size_t n);
+int memcmp(const void *ptr1, const void *ptr2, size_t n);
+void *memmove(void *dest, const void *src, size_t n);
+void *memset(void *dest, int c, size_t size);
+
+/* Prom */
+typedef unsigned long prom_arg_t;
+int call_prom(const char *service, int nargs, int nret, ...);
+
+/* CI wrappers */
+void ci_panic(const char *str);
+phandle ci_finddevice(const char *path);
+uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len);
+ihandle ci_open(const char *path);
+void ci_close(ihandle ih);
+void *ci_claim(void *virt, uint32_t size, uint32_t align);
+uint32_t ci_release(void *virt, uint32_t size);
+
+/* booting from -kernel */
+void boot_from_memory(uint64_t initrd, uint64_t initrdsize);
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 85fe65f89476..3c20af115627 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -102,6 +102,7 @@ 
 #define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
 #define FW_MAX_SIZE             0x400000
 #define FW_FILE_NAME            "slof.bin"
+#define FW_FILE_NAME_VOF        "vof.bin"
 #define FW_OVERHEAD             0x2800000
 #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
 
@@ -1562,6 +1563,7 @@  static void spapr_machine_reset(MachineState *machine)
     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
     PowerPCCPU *first_ppc_cpu;
     hwaddr fdt_addr;
+    target_ulong stack_ptr = 0;
     void *fdt;
     int rc;
 
@@ -1624,22 +1626,41 @@  static void spapr_machine_reset(MachineState *machine)
 
     fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
 
-    rc = fdt_pack(fdt);
+    if (spapr->vof) {
+        /*
+         * Claims initramdisk and stack which changes "available" so
+         * doing it befofe packing.
+         */
+        spapr_vof_reset(spapr, fdt, &stack_ptr);
 
-    /* Should only fail if we've built a corrupted tree */
-    assert(rc == 0);
+        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
+                                  stack_ptr, spapr->initrd_base,
+                                  spapr->initrd_size);
+        /*
+         * We do not pack the FDT as the client may change properties and
+         * do not write FDT to the VM as the client does not expect it.
+         */
+    } else {
+        rc = fdt_pack(fdt);
+        /* Should only fail if we've built a corrupted tree */
+        assert(rc == 0);
 
-    /* Load the fdt */
+        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
+                                  0, fdt_addr, 0);
+    }
     qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
-    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
+
     g_free(spapr->fdt_blob);
     spapr->fdt_size = fdt_totalsize(fdt);
     spapr->fdt_initial_size = spapr->fdt_size;
     spapr->fdt_blob = fdt;
 
     /* Set up the entry state */
-    spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, fdt_addr, 0);
     first_ppc_cpu->env.gpr[5] = 0;
+    if (!spapr->vof) {
+        /* Load the fdt */
+        cpu_physical_memory_write(fdt_addr, spapr->fdt_blob, spapr->fdt_size);
+    }
 
     spapr->fwnmi_system_reset_addr = -1;
     spapr->fwnmi_machine_check_addr = -1;
@@ -2639,7 +2660,8 @@  static void spapr_machine_init(MachineState *machine)
     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
     MachineClass *mc = MACHINE_GET_CLASS(machine);
-    const char *bios_name = machine->firmware ?: FW_FILE_NAME;
+    const char *bios_default = !!spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME;
+    const char *bios_name = machine->firmware ?: bios_default;
     const char *kernel_filename = machine->kernel_filename;
     const char *initrd_filename = machine->initrd_filename;
     PCIHostState *phb;
@@ -2996,6 +3018,10 @@  static void spapr_machine_init(MachineState *machine)
     }
 
     qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
+
+    if (spapr->vof) {
+        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
+    }
 }
 
 #define DEFAULT_KVM_TYPE "auto"
@@ -3186,6 +3212,28 @@  static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
     }
 }
 
+static bool spapr_get_vof(Object *obj, Error **errp)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
+
+    return spapr->vof != NULL;
+}
+
+static void spapr_set_vof(Object *obj, bool value, Error **errp)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
+
+    if (spapr->vof) {
+        vof_cleanup(spapr->vof);
+        g_free(spapr->vof);
+        spapr->vof = NULL;
+    }
+    if (!value) {
+        return;
+    }
+    spapr->vof = g_malloc0(sizeof(*spapr->vof));
+}
+
 static char *spapr_get_ic_mode(Object *obj, Error **errp)
 {
     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
@@ -3311,6 +3359,10 @@  static void spapr_instance_init(Object *obj)
                                     stringify(KERNEL_LOAD_ADDR)
                                     " for -kernel is the default");
     spapr->kernel_addr = KERNEL_LOAD_ADDR;
+    object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
+    object_property_set_description(obj, "x-vof",
+                                    "Enable Virtual Open Firmware");
+
     /* The machine class defines the default interrupt controller mode */
     spapr->irq = smc->irq;
     object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
@@ -4408,6 +4460,7 @@  static void spapr_machine_class_init(ObjectClass *oc, void *data)
     XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
     InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
     XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc);
+    ClientArchitectureSupportClass *casc = CLIENT_ARCHITECTURE_SUPPORT_CLASS(oc);
 
     mc->desc = "pSeries Logical Partition (PAPR compliant)";
     mc->ignore_boot_device_suffixes = true;
@@ -4487,6 +4540,9 @@  static void spapr_machine_class_init(ObjectClass *oc, void *data)
     smc->smp_threads_vsmt = true;
     smc->nr_xirqs = SPAPR_NR_XIRQS;
     xfc->match_nvt = spapr_match_nvt;
+
+    casc->cas = spapr_vof_client_architecture_support;
+    casc->quiesce = spapr_vof_quiesce;
 }
 
 static const TypeInfo spapr_machine_info = {
@@ -4506,6 +4562,7 @@  static const TypeInfo spapr_machine_info = {
         { TYPE_XICS_FABRIC },
         { TYPE_INTERRUPT_STATS_PROVIDER },
         { TYPE_XIVE_FABRIC },
+        { TYPE_CLIENT_ARCHITECTURE_SUPPORT },
         { }
     },
 };
@@ -4974,9 +5031,16 @@  static void spapr_machine_2_1_class_options(MachineClass *mc)
 }
 DEFINE_SPAPR_MACHINE(2_1, "2.1", false);
 
+static const TypeInfo client_archivecture_support_info = {
+    .name = TYPE_CLIENT_ARCHITECTURE_SUPPORT,
+    .parent = TYPE_INTERFACE,
+    .class_size = sizeof(ClientArchitectureSupportClass),
+};
+
 static void spapr_machine_register_types(void)
 {
     type_register_static(&spapr_machine_info);
+    type_register_static(&client_archivecture_support_info);
 }
 
 type_init(spapr_machine_register_types)
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 7b5cd3553c26..0cdf90af6afb 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1806,7 +1806,13 @@  target_ulong do_client_architecture_support(PowerPCCPU *cpu,
         spapr_setup_hpt(spapr);
     }
 
-    fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
+    if (spapr->vof && spapr->vof->initrd_base && spapr->vof->initrd_size) {
+        /* Update initramdisk location so the right area gets reserved below */
+        spapr->initrd_base = spapr->vof->initrd_base;
+        spapr->initrd_size = spapr->vof->initrd_size;
+    }
+
+    fdt = spapr_build_fdt(spapr, spapr->vof != NULL, fdt_bufsize);
 
     g_free(spapr->fdt_blob);
     spapr->fdt_size = fdt_totalsize(fdt);
@@ -1850,6 +1856,24 @@  static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
     return ret;
 }
 
+target_ulong spapr_vof_client_architecture_support(CPUState *cs,
+                                                  target_ulong ovec_addr)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+
+    target_ulong ret = do_client_architecture_support(POWERPC_CPU(cs), spapr,
+                                                      ovec_addr, FDT_MAX_SIZE);
+
+    /*
+     * This adds stdout and generates phandles for boottime and CAS FDTs.
+     * It is alright to update the FDT here as do_client_architecture_support()
+     * does not pack it.
+     */
+    spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob);
+
+    return ret;
+}
+
 static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
                                               SpaprMachineState *spapr,
                                               target_ulong opcode,
diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c
new file mode 100644
index 000000000000..f2978d830da5
--- /dev/null
+++ b/hw/ppc/spapr_vof.c
@@ -0,0 +1,138 @@ 
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include <sys/ioctl.h>
+#include "qapi/error.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_vio.h"
+#include "hw/ppc/fdt.h"
+#include "sysemu/sysemu.h"
+#include "qom/qom-qobject.h"
+#include "trace.h"
+
+/* Defined as Big Endian */
+struct prom_args {
+    uint32_t service;
+    uint32_t nargs;
+    uint32_t nret;
+    uint32_t args[10];
+} QEMU_PACKED;
+
+target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr,
+                                target_ulong opcode, target_ulong *_args)
+{
+    target_ulong args_real = ppc64_phys_to_real(_args[0]);
+    struct prom_args args_be;
+    uint32_t args[ARRAY_SIZE(args_be.args)];
+    uint32_t rets[ARRAY_SIZE(args_be.args)] = { 0 }, ret;
+    char service[64];
+    unsigned nargs, nret, i;
+
+    if (address_space_rw(&address_space_memory, args_real,
+                         MEMTXATTRS_UNSPECIFIED, &args_be, sizeof(args_be),
+                         false) != MEMTX_OK) {
+        return H_HARDWARE;
+    }
+    nargs = be32_to_cpu(args_be.nargs);
+    if (nargs >= ARRAY_SIZE(args_be.args)) {
+        return H_PARAMETER;
+    }
+
+    if (address_space_rw(&address_space_memory, be32_to_cpu(args_be.service),
+                         MEMTXATTRS_UNSPECIFIED, service, sizeof(service),
+                         false) != MEMTX_OK) {
+        return H_HARDWARE;
+    }
+    if (strnlen(service, sizeof(service)) == sizeof(service)) {
+        /* Too long service name */
+        return H_PARAMETER;
+    }
+
+    for (i = 0; i < nargs; ++i) {
+        args[i] = be32_to_cpu(args_be.args[i]);
+    }
+
+    nret = be32_to_cpu(args_be.nret);
+    ret = vof_client_call(spapr->fdt_blob, spapr->vof, service,
+                          args, nargs, rets, nret);
+    if (!nret) {
+        return H_SUCCESS;
+    }
+
+    args_be.args[nargs] = cpu_to_be32(ret);
+    for (i = 1; i < nret; ++i) {
+        args_be.args[nargs + i] = cpu_to_be32(rets[i - 1]);
+    }
+
+    if (address_space_rw(&address_space_memory,
+                         args_real + offsetof(struct prom_args, args[nargs]),
+                         MEMTXATTRS_UNSPECIFIED, args_be.args + nargs,
+                         sizeof(args_be.args[0]) * nret, true) != MEMTX_OK) {
+        return H_HARDWARE;
+    }
+
+    return H_SUCCESS;
+}
+
+void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt)
+{
+    char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
+
+    vof_build_dt(fdt, spapr->vof, spapr->rma_size);
+
+    /*
+     * SLOF-less setup requires an open instance of stdout for early
+     * kernel printk. By now all phandles are settled so we can open
+     * the default serial console.
+     */
+    if (stdout_path) {
+        _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", "stdout",
+                                   stdout_path));
+    }
+}
+
+void spapr_vof_reset(SpaprMachineState *spapr, void *fdt,
+                     target_ulong *stack_ptr)
+{
+    Vof *vof = spapr->vof;
+
+    spapr_vof_client_dt_finalize(spapr, fdt);
+
+    *stack_ptr = vof_claim(spapr->fdt_blob, vof, OF_STACK_ADDR, OF_STACK_SIZE,
+                           OF_STACK_SIZE);
+    if (*stack_ptr == -1) {
+        error_report("Memory allocation for stack failed");
+        exit(1);
+    }
+    /*
+     * Stack grows downwards and we also reserve here space for
+     * the minimum stack frame.
+     */
+    *stack_ptr += OF_STACK_SIZE - 0x20;
+
+    if (spapr->kernel_size &&
+        vof_claim(spapr->fdt_blob, vof, spapr->kernel_addr, spapr->kernel_size,
+                  0) == -1) {
+        error_report("Memory for kernel is in use");
+        exit(1);
+    }
+
+    if (spapr->initrd_size &&
+        vof_claim(spapr->fdt_blob, vof, spapr->initrd_base, spapr->initrd_size,
+                  0) == -1) {
+        error_report("Memory for initramdisk is in use");
+        exit(1);
+    }
+
+    /*
+     * We skip writing FDT as nothing expects it; OF client interface is
+     * going to be used for reading the device tree.
+     */
+}
+
+void spapr_vof_quiesce(void)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+
+    spapr->fdt_size = fdt_totalsize(spapr->fdt_blob);
+    spapr->fdt_initial_size = spapr->fdt_size;
+}
diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c
new file mode 100644
index 000000000000..9c76891e668c
--- /dev/null
+++ b/hw/ppc/vof.c
@@ -0,0 +1,864 @@ 
+/*
+ * QEMU PowerPC Virtual Open Firmware.
+ *
+ * This implements client interface from OpenFirmware IEEE1275 on the QEMU
+ * side to leave only a very basic firmware in the VM.
+ *
+ * Copyright (c) 2020 IBM Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include <sys/ioctl.h>
+#include "exec/ram_addr.h"
+#include "exec/address-spaces.h"
+#include "qemu/timer.h"
+#include "qemu/range.h"
+#include "hw/ppc/vof.h"
+#include "hw/ppc/fdt.h"
+#include "sysemu/runstate.h"
+#include "qom/qom-qobject.h"
+#include "trace.h"
+
+#include <libfdt.h>
+
+/*
+ * OF 1275 "nextprop" description suggests is it 32 bytes max but
+ * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 chars long.
+ */
+#define OF_PROPNAME_LEN_MAX 64
+
+typedef struct {
+    uint64_t start;
+    uint64_t size;
+} OfClaimed;
+
+typedef struct {
+    char *path; /* the path used to open the instance */
+    uint32_t phandle;
+} OfInstance;
+
+#define VOF_MEM_READ(pa, buf, size) \
+    address_space_read_full(&address_space_memory, \
+    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
+#define VOF_MEM_WRITE(pa, buf, size) \
+    address_space_write(&address_space_memory, \
+    (pa), MEMTXATTRS_UNSPECIFIED, (buf), (size))
+
+static int readstr(hwaddr pa, char *buf, int size)
+{
+    if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) {
+        return -1;
+    }
+    if (strnlen(buf, size) == size) {
+        buf[size - 1] = '\0';
+        trace_vof_error_str_truncated(buf, size);
+        return -1;
+    }
+    return 0;
+}
+
+static bool cmpservice(const char *s, unsigned nargs, unsigned nret,
+                       const char *s1, unsigned nargscheck, unsigned nretcheck)
+{
+    if (strcmp(s, s1)) {
+        return false;
+    }
+    if ((nargscheck && (nargs != nargscheck)) ||
+        (nretcheck && (nret != nretcheck))) {
+        trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret);
+        return false;
+    }
+
+    return true;
+}
+
+static void prop_format(char *tval, int tlen, const void *prop, int len)
+{
+    int i;
+    const unsigned char *c;
+    char *t;
+    const char bin[] = "...";
+
+    for (i = 0, c = prop; i < len; ++i, ++c) {
+        if (*c == '\0' && i == len - 1) {
+            strncpy(tval, prop, tlen - 1);
+            return;
+        }
+        if (*c < 0x20 || *c >= 0x80) {
+            break;
+        }
+    }
+
+    for (i = 0, c = prop, t = tval; i < len; ++i, ++c) {
+        if (t >= tval + tlen - sizeof(bin) - 1 - 2 - 1) {
+            strcpy(t, bin);
+            return;
+        }
+        if (i && i % 4 == 0 && i != len - 1) {
+            strcat(t, " ");
+            ++t;
+        }
+        t += sprintf(t, "%02X", *c & 0xFF);
+    }
+}
+
+static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr)
+{
+    char fullnode[1024];
+    uint32_t ret = -1;
+    int offset;
+
+    if (readstr(nodeaddr, fullnode, sizeof(fullnode))) {
+        return (uint32_t) ret;
+    }
+
+    offset = fdt_path_offset(fdt, fullnode);
+    if (offset >= 0) {
+        ret = fdt_get_phandle(fdt, offset);
+    }
+    trace_vof_finddevice(fullnode, ret);
+    return (uint32_t) ret;
+}
+
+static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, uint32_t pname,
+                            uint32_t valaddr, uint32_t vallen)
+{
+    char propname[OF_PROPNAME_LEN_MAX + 1];
+    uint32_t ret = 0;
+    int proplen = 0;
+    const void *prop;
+    char trval[64] = "";
+    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
+
+    if (readstr(pname, propname, sizeof(propname))) {
+        return -1;
+    }
+    if (strcmp(propname, "name") == 0) {
+        prop = fdt_get_name(fdt, nodeoff, &proplen);
+        proplen += 1;
+    } else {
+        prop = fdt_getprop(fdt, nodeoff, propname, &proplen);
+    }
+
+    if (prop) {
+        int cb = MIN(proplen, vallen);
+
+        if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK) {
+            ret = -1;
+        } else {
+            /*
+             * OF1275 says:
+             * "Size is either the actual size of the property, or -1 if name
+             * does not exist", hence returning proplen instead of cb.
+             */
+            ret = proplen;
+            prop_format(trval, sizeof(trval), prop, ret);
+        }
+    } else {
+        ret = -1;
+    }
+    trace_vof_getprop(nodeph, propname, ret, trval);
+
+    return ret;
+}
+
+static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, uint32_t pname)
+{
+    char propname[OF_PROPNAME_LEN_MAX + 1];
+    uint32_t ret = 0;
+    int proplen = 0;
+    const void *prop;
+    int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph);
+
+    if (readstr(pname, propname, sizeof(propname))) {
+        return -1;
+    }
+    if (strcmp(propname, "name") == 0) {
+        prop = fdt_get_name(fdt, nodeoff, &proplen);
+        proplen += 1;
+    } else {
+        prop = fdt_getprop(fdt, nodeoff, propname, &proplen);
+    }
+
+    if (prop) {
+        ret = proplen;
+    } else {
+        ret = -1;
+    }
+    trace_vof_getproplen(nodeph, propname, ret);
+
+    return ret;
+}
+
+static uint32_t vof_setprop(void *fdt, Vof *vof,
+                            uint32_t nodeph, uint32_t pname,
+                            uint32_t valaddr, uint32_t vallen)
+{
+    char propname[OF_PROPNAME_LEN_MAX + 1];
+    uint32_t ret = -1;
+    int offset;
+    char trval[64] = "";
+
+    if (readstr(pname, propname, sizeof(propname))) {
+        return -1;
+    }
+    /*
+     * We only allow changing properties which we know how to update in QEMU
+     * OR
+     * the ones which we know that they need to survive during "quiesce".
+     */
+    if (vallen == sizeof(uint32_t)) {
+        uint32_t val32 = ldl_be_phys(first_cpu->as, valaddr);
+
+        if ((strcmp(propname, "linux,rtas-base") == 0) ||
+            (strcmp(propname, "linux,rtas-entry") == 0)) {
+            /* These need to survive quiesce so let them store in the FDT */
+        } else if (strcmp(propname, "linux,initrd-start") == 0) {
+            vof->initrd_base = val32;
+        } else if (strcmp(propname, "linux,initrd-end") == 0) {
+            vof->initrd_size = val32 - vof->initrd_base;
+        } else {
+            goto trace_exit;
+        }
+    } else if (vallen == sizeof(uint64_t)) {
+        uint64_t val64 = ldq_be_phys(first_cpu->as, valaddr);
+
+        if (strcmp(propname, "linux,initrd-start") == 0) {
+            vof->initrd_base = val64;
+        } else if (strcmp(propname, "linux,initrd-end") == 0) {
+            vof->initrd_size = val64 - vof->initrd_base;
+        } else {
+            goto trace_exit;
+        }
+    } else if (strcmp(propname, "bootargs") == 0) {
+        char val[1024];
+
+        if (readstr(valaddr, val, sizeof(val))) {
+            goto trace_exit;
+        }
+        g_free(vof->bootargs);
+        vof->bootargs = g_strdup(val);
+    } else {
+        goto trace_exit;
+    }
+
+    offset = fdt_node_offset_by_phandle(fdt, nodeph);
+    if (offset >= 0) {
+        uint8_t data[vallen];
+
+        if ((VOF_MEM_READ(valaddr, data, vallen) == MEMTX_OK) &&
+            !fdt_setprop(fdt, offset, propname, data, vallen)) {
+            ret = vallen;
+            prop_format(trval, sizeof(trval), data, ret);
+        }
+    }
+
+trace_exit:
+    trace_vof_setprop(nodeph, propname, trval, ret);
+
+    return ret;
+}
+
+static uint32_t vof_nextprop(const void *fdt, uint32_t phandle,
+                             uint32_t prevaddr, uint32_t nameaddr)
+{
+    int offset = fdt_node_offset_by_phandle(fdt, phandle);
+    char prev[OF_PROPNAME_LEN_MAX + 1];
+    const char *tmp;
+
+    if (readstr(prevaddr, prev, sizeof(prev))) {
+        return -1;
+    }
+    for (offset = fdt_first_property_offset(fdt, offset);
+         offset >= 0;
+         offset = fdt_next_property_offset(fdt, offset)) {
+
+        if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
+            return 0;
+        }
+        if (prev[0] == '\0' || strcmp(prev, tmp) == 0) {
+            if (prev[0] != '\0') {
+                offset = fdt_next_property_offset(fdt, offset);
+                if (offset < 0) {
+                    return 0;
+                }
+            }
+            if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) {
+                return 0;
+            }
+
+            if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != MEMTX_OK) {
+                return -1;
+            }
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+static uint32_t vof_peer(const void *fdt, uint32_t phandle)
+{
+    int ret;
+
+    if (phandle == 0) {
+        ret = fdt_path_offset(fdt, "/");
+    } else {
+        ret = fdt_next_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle));
+    }
+
+    if (ret < 0) {
+        ret = 0;
+    } else {
+        ret = fdt_get_phandle(fdt, ret);
+    }
+
+    return ret;
+}
+
+static uint32_t vof_child(const void *fdt, uint32_t phandle)
+{
+    int ret = fdt_first_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle));
+
+    if (ret < 0) {
+        ret = 0;
+    } else {
+        ret = fdt_get_phandle(fdt, ret);
+    }
+
+    return ret;
+}
+
+static uint32_t vof_parent(const void *fdt, uint32_t phandle)
+{
+    int ret = fdt_parent_offset(fdt, fdt_node_offset_by_phandle(fdt, phandle));
+
+    if (ret < 0) {
+        ret = 0;
+    } else {
+        ret = fdt_get_phandle(fdt, ret);
+    }
+
+    return ret;
+}
+
+static uint32_t vof_do_open(void *fdt, Vof *vof, const char *path)
+{
+    int offset;
+    uint32_t ret = 0;
+    OfInstance *inst = NULL;
+
+    if (vof->of_instance_last == 0xFFFFFFFF) {
+        /* We do not recycle ihandles yet */
+        goto trace_exit;
+    }
+
+    offset = fdt_path_offset(fdt, path);
+    if (offset < 0) {
+        trace_vof_error_unknown_path(path);
+        goto trace_exit;
+    }
+
+    inst = g_new0(OfInstance, 1);
+    inst->phandle = fdt_get_phandle(fdt, offset);
+    g_assert(inst->phandle);
+    ++vof->of_instance_last;
+
+    inst->path = g_strdup(path);
+    g_hash_table_insert(vof->of_instances,
+                        GINT_TO_POINTER(vof->of_instance_last),
+                        inst);
+    ret = vof->of_instance_last;
+
+trace_exit:
+    trace_vof_open(path, inst ? inst->phandle : 0, ret);
+
+    return ret;
+}
+
+uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename,
+                               const char *prop, const char *path)
+{
+    int node = fdt_path_offset(fdt, nodename);
+    uint32_t inst = vof_do_open(fdt, vof, path);
+
+    return fdt_setprop_cell(fdt, node, prop, inst);
+}
+
+static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr)
+{
+    char path[256];
+
+    if (readstr(pathaddr, path, sizeof(path))) {
+        return -1;
+    }
+
+    return vof_do_open(fdt, vof, path);
+}
+
+static void vof_close(Vof *vof, uint32_t ihandle)
+{
+    if (!g_hash_table_remove(vof->of_instances, GINT_TO_POINTER(ihandle))) {
+        trace_vof_error_unknown_ihandle_close(ihandle);
+    }
+}
+
+static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle)
+{
+    gpointer instp = g_hash_table_lookup(vof->of_instances,
+                                         GINT_TO_POINTER(ihandle));
+    uint32_t ret = -1;
+
+    if (instp) {
+        ret = ((OfInstance *)instp)->phandle;
+    }
+    trace_vof_instance_to_package(ihandle, ret);
+
+    return ret;
+}
+
+static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle,
+                                    uint32_t buf, uint32_t len)
+{
+    uint32_t ret = -1;
+    char tmp[256] = "";
+
+    if (!fdt_get_path(fdt, fdt_node_offset_by_phandle(fdt, phandle), tmp,
+                      sizeof(tmp))) {
+        tmp[sizeof(tmp) - 1] = 0;
+        ret = MIN(len, strlen(tmp) + 1);
+        if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
+            ret = -1;
+        }
+    }
+
+    trace_vof_package_to_path(phandle, tmp, ret);
+
+    return ret;
+}
+
+static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t ihandle,
+                                     uint32_t buf, uint32_t len)
+{
+    uint32_t ret = -1;
+    uint32_t phandle = vof_instance_to_package(vof, ihandle);
+    char tmp[256] = "";
+
+    if (phandle != -1) {
+        if (!fdt_get_path(fdt, fdt_node_offset_by_phandle(fdt, phandle),
+                          tmp, sizeof(tmp))) {
+            tmp[sizeof(tmp) - 1] = 0;
+            ret = MIN(len, strlen(tmp) + 1);
+            if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) {
+                ret = -1;
+            }
+        }
+    }
+    trace_vof_instance_to_path(ihandle, phandle, tmp, ret);
+
+    return ret;
+}
+
+static void vof_claimed_dump(GArray *claimed)
+{
+#ifdef DEBUG
+    int i;
+    OfClaimed c;
+
+    for (i = 0; i < claimed->len; ++i) {
+        c = g_array_index(claimed, OfClaimed, i);
+        error_printf("CLAIMED %lx..%lx size=%ld\n", c.start, c.start + c.size,
+                     c.size);
+    }
+#endif
+}
+
+static bool vof_claim_avail(GArray *claimed, uint64_t virt, uint64_t size)
+{
+    int i;
+    OfClaimed c;
+
+    for (i = 0; i < claimed->len; ++i) {
+        c = g_array_index(claimed, OfClaimed, i);
+        if (ranges_overlap(c.start, c.size, virt, size)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t size)
+{
+    OfClaimed newclaim;
+
+    newclaim.start = virt;
+    newclaim.size = size;
+    g_array_append_val(claimed, newclaim);
+}
+
+static gint of_claimed_compare_func(gconstpointer a, gconstpointer b)
+{
+    return ((OfClaimed *)a)->start - ((OfClaimed *)b)->start;
+}
+
+static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base)
+{
+    int i, n, offset, proplen = 0;
+    uint64_t *mem0_reg;
+    struct { uint64_t start, size; } *avail;
+
+    if (!fdt || !claimed) {
+        return;
+    }
+
+    offset = fdt_path_offset(fdt, "/memory@0");
+    _FDT(offset);
+
+    mem0_reg = (uint64_t *) fdt_getprop(fdt, offset, "reg", &proplen);
+    if (!mem0_reg || proplen != 2 * sizeof(uint64_t)) {
+        return;
+    }
+
+    g_array_sort(claimed, of_claimed_compare_func);
+    vof_claimed_dump(claimed);
+
+    avail = g_malloc0(sizeof(uint64_t) * 2 * claimed->len);
+    for (i = 0, n = 0; i < claimed->len; ++i) {
+        OfClaimed c = g_array_index(claimed, OfClaimed, i);
+
+        avail[n].start = c.start + c.size;
+        if (i < claimed->len - 1) {
+            OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1);
+
+            avail[n].size = cn.start - avail[n].start;
+        } else {
+            avail[n].size = be64_to_cpu(mem0_reg[1]) - avail[n].start;
+        }
+
+        if (avail[n].size) {
+#ifdef DEBUG
+            error_printf("AVAIL %lx..%lx size=%ld\n", avail[n].start,
+                         avail[n].start + avail[n].size, avail[n].size);
+#endif
+            avail[n].start = cpu_to_be64(avail[n].start);
+            avail[n].size = cpu_to_be64(avail[n].size);
+            ++n;
+        }
+    }
+    _FDT((fdt_setprop(fdt, offset, "available", avail,
+                      sizeof(uint64_t) * 2 * n)));
+    g_free(avail);
+}
+
+/*
+ * OF1275:
+ * "Allocates size bytes of memory. If align is zero, the allocated range
+ * begins at the virtual address virt. Otherwise, an aligned address is
+ * automatically chosen and the input argument virt is ignored".
+ *
+ * In other words, exactly one of @virt and @align is non-zero.
+ */
+uint64_t vof_claim(void *fdt, Vof *vof, uint64_t virt, uint64_t size,
+                   uint64_t align)
+{
+    uint64_t ret;
+
+    if (size == 0) {
+        ret = -1;
+    } else if (align == 0) {
+        if (!vof_claim_avail(vof->claimed, virt, size)) {
+            ret = -1;
+        } else {
+            ret = virt;
+        }
+    } else {
+        vof->claimed_base = QEMU_ALIGN_UP(vof->claimed_base, align);
+        while (1) {
+            if (vof->claimed_base >= vof->top_addr) {
+                error_report("Out of RMA memory for the OF client");
+                return -1;
+            }
+            if (vof_claim_avail(vof->claimed, vof->claimed_base, size)) {
+                break;
+            }
+            vof->claimed_base += size;
+        }
+        ret = vof->claimed_base;
+    }
+
+    if (ret != -1) {
+        vof->claimed_base = MAX(vof->claimed_base, ret + size);
+        vof_claim_add(vof->claimed, ret, size);
+        /* The client reads "/memory@0/available" to know where it can claim */
+        vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
+    }
+    trace_vof_claim(virt, size, align, ret);
+
+    return ret;
+}
+
+static uint32_t vof_release(void *fdt, Vof *vof, uint64_t virt, uint64_t size)
+{
+    uint32_t ret = -1;
+    int i;
+    GArray *claimed = vof->claimed;
+    OfClaimed c;
+
+    for (i = 0; i < claimed->len; ++i) {
+        c = g_array_index(claimed, OfClaimed, i);
+        if (c.start == virt && c.size == size) {
+            g_array_remove_index(claimed, i);
+            vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
+            ret = 0;
+            break;
+        }
+    }
+
+    trace_vof_release(virt, size, ret);
+
+    return ret;
+}
+
+static void vof_instantiate_rtas(void)
+{
+    error_report("The firmware should have instantiated RTAS");
+    exit(1);
+}
+
+static uint32_t vof_call_method(Vof *vof, uint32_t methodaddr,
+                                uint32_t ihandle,
+                                uint32_t param1, uint32_t param2,
+                                uint32_t param3, uint32_t param4,
+                                uint32_t *ret2)
+{
+    uint32_t ret = -1;
+    char method[256] = "";
+    OfInstance *inst;
+
+    if (!ihandle) {
+        goto trace_exit;
+    }
+
+    inst = (OfInstance *) g_hash_table_lookup(vof->of_instances,
+                                              GINT_TO_POINTER(ihandle));
+    if (!inst) {
+        goto trace_exit;
+    }
+
+    if (readstr(methodaddr, method, sizeof(method))) {
+        goto trace_exit;
+    }
+
+    if (strcmp(inst->path, "/") == 0) {
+        if (strcmp(method, "ibm,client-architecture-support") == 0) {
+            Object *cas_if = object_dynamic_cast(
+                    qdev_get_machine(), TYPE_CLIENT_ARCHITECTURE_SUPPORT);
+
+            if (cas_if) {
+                ClientArchitectureSupportClass *casc =
+                    CLIENT_ARCHITECTURE_SUPPORT_GET_CLASS(cas_if);
+
+                ret = casc->cas(first_cpu, param1);
+            }
+
+            *ret2 = 0;
+        }
+    } else if (strcmp(inst->path, "/rtas") == 0) {
+        if (strcmp(method, "instantiate-rtas") == 0) {
+            vof_instantiate_rtas();
+            ret = 0;
+            *ret2 = param1; /* rtas-base */
+        }
+    } else {
+        trace_vof_error_unknown_method(method);
+    }
+
+trace_exit:
+    trace_vof_method(ihandle, method, param1, ret, *ret2);
+
+    return ret;
+}
+
+static uint32_t vof_call_interpret(uint32_t cmdaddr, uint32_t param1,
+                                   uint32_t param2, uint32_t *ret2)
+{
+    uint32_t ret = -1;
+    char cmd[256] = "";
+
+    /* No interpret implemented */
+    readstr(cmdaddr, cmd, sizeof(cmd));
+    trace_vof_interpret(cmd, param1, param2, ret, *ret2);
+
+    return ret;
+}
+
+static void vof_quiesce(void *fdt, Vof *vof)
+{
+    Object *cas_if = object_dynamic_cast(
+        qdev_get_machine(), TYPE_CLIENT_ARCHITECTURE_SUPPORT);
+
+    int rc = fdt_pack(fdt);
+
+    assert(rc == 0);
+
+    if (cas_if) {
+        ClientArchitectureSupportClass *casc =
+            CLIENT_ARCHITECTURE_SUPPORT_GET_CLASS(cas_if);
+
+        casc->quiesce();
+    }
+
+    vof_claimed_dump(vof->claimed);
+}
+
+uint32_t vof_client_call(void *fdt, Vof *vof, const char *service,
+                         uint32_t *args, unsigned nargs,
+                         uint32_t *rets, unsigned nrets)
+{
+    uint32_t ret = 0;
+
+    /* @nrets includes the value which this function returns */
+#define cmpserv(s, a, r) \
+    cmpservice(service, nargs, nrets, (s), (a), (r))
+
+    if (cmpserv("finddevice", 1, 1)) {
+        ret = vof_finddevice(fdt, args[0]);
+    } else if (cmpserv("getprop", 4, 1)) {
+        ret = vof_getprop(fdt, args[0], args[1], args[2], args[3]);
+    } else if (cmpserv("getproplen", 2, 1)) {
+        ret = vof_getproplen(fdt, args[0], args[1]);
+    } else if (cmpserv("setprop", 4, 1)) {
+        ret = vof_setprop(fdt, vof, args[0], args[1], args[2], args[3]);
+    } else if (cmpserv("nextprop", 3, 1)) {
+        ret = vof_nextprop(fdt, args[0], args[1], args[2]);
+    } else if (cmpserv("peer", 1, 1)) {
+        ret = vof_peer(fdt, args[0]);
+    } else if (cmpserv("child", 1, 1)) {
+        ret = vof_child(fdt, args[0]);
+    } else if (cmpserv("parent", 1, 1)) {
+        ret = vof_parent(fdt, args[0]);
+    } else if (cmpserv("open", 1, 1)) {
+        ret = vof_open(fdt, vof, args[0]);
+    } else if (cmpserv("close", 1, 0)) {
+        vof_close(vof, args[0]);
+    } else if (cmpserv("instance-to-package", 1, 1)) {
+        ret = vof_instance_to_package(vof, args[0]);
+    } else if (cmpserv("package-to-path", 3, 1)) {
+        ret = vof_package_to_path(fdt, args[0], args[1], args[2]);
+    } else if (cmpserv("instance-to-path", 3, 1)) {
+        ret = vof_instance_to_path(fdt, vof, args[0], args[1], args[2]);
+    } else if (cmpserv("claim", 3, 1)) {
+        ret = vof_claim(fdt, vof, args[0], args[1], args[2]);
+    } else if (cmpserv("release", 2, 0)) {
+        ret = vof_release(fdt, vof, args[0], args[1]);
+    } else if (cmpserv("call-method", 0, 0)) {
+        ret = vof_call_method(vof, args[0], args[1], args[2], args[3], args[4],
+                              args[5], rets);
+    } else if (cmpserv("interpret", 0, 0)) {
+        ret = vof_call_interpret(args[0], args[1], args[2], rets);
+    } else if (cmpserv("milliseconds", 0, 1)) {
+        ret = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
+    } else if (cmpserv("quiesce", 0, 0)) {
+        vof_quiesce(fdt, vof);
+    } else if (cmpserv("exit", 0, 0)) {
+        error_report("Stopped as the VM requested \"exit\"");
+        vm_stop(RUN_STATE_PAUSED); /* Or qemu_system_guest_panicked(NULL); ? */
+    } else {
+        trace_vof_error_unknown_service(service, nargs, nrets);
+        ret = -1;
+    }
+
+    return ret;
+}
+
+static void of_instance_free(gpointer data)
+{
+    OfInstance *inst = (OfInstance *) data;
+
+    g_free(inst->path);
+    g_free(inst);
+}
+
+void vof_cleanup(Vof *vof)
+{
+    if (vof->claimed) {
+        g_array_unref(vof->claimed);
+    }
+    if (vof->of_instances) {
+        g_hash_table_unref(vof->of_instances);
+    }
+}
+
+void vof_build_dt(void *fdt, Vof *vof, uint32_t top_addr)
+{
+    uint32_t phandle;
+    int i, offset, proplen = 0;
+    const void *prop;
+    bool found = false;
+    GArray *phandles = g_array_new(false, false, sizeof(uint32_t));
+
+    vof_cleanup(vof);
+
+    vof->claimed = g_array_new(false, false, sizeof(OfClaimed));
+    vof->of_instances = g_hash_table_new_full(g_direct_hash, g_direct_equal,
+                                              NULL, of_instance_free);
+    vof->top_addr = top_addr;
+
+    /* Find all predefined phandles */
+    for (offset = fdt_next_node(fdt, -1, NULL);
+         offset >= 0;
+         offset = fdt_next_node(fdt, offset, NULL)) {
+        prop = fdt_getprop(fdt, offset, "phandle", &proplen);
+        if (prop && proplen == sizeof(uint32_t)) {
+            phandle = fdt32_ld(prop);
+            g_array_append_val(phandles, phandle);
+        }
+    }
+
+    /* Assign phandles skipping the predefined ones */
+    for (offset = fdt_next_node(fdt, -1, NULL), phandle = 1;
+         offset >= 0;
+         offset = fdt_next_node(fdt, offset, NULL), ++phandle) {
+
+        prop = fdt_getprop(fdt, offset, "phandle", &proplen);
+        if (prop) {
+            continue;
+        }
+        /* Check if the current phandle is not allocated already */
+        for ( ; ; ++phandle) {
+            for (i = 0, found = false; i < phandles->len; ++i) {
+                if (phandle == g_array_index(phandles, uint32_t, i)) {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) {
+                break;
+            }
+        }
+        _FDT(fdt_setprop_cell(fdt, offset, "phandle", phandle));
+    }
+    g_array_unref(phandles);
+
+    vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base);
+}
diff --git a/pc-bios/vof/bootmem.c b/pc-bios/vof/bootmem.c
new file mode 100644
index 000000000000..de7d5fc76431
--- /dev/null
+++ b/pc-bios/vof/bootmem.c
@@ -0,0 +1,13 @@ 
+#include "vof.h"
+
+void boot_from_memory(uint64_t initrd, uint64_t initrdsize)
+{
+	uint64_t kern[2];
+	phandle chosen = ci_finddevice("/chosen");
+
+	if (ci_getprop(chosen, "qemu,boot-kernel", kern, sizeof(kern)) !=
+			sizeof(kern))
+		return;
+
+	do_boot(kern[0], initrd, initrdsize);
+}
diff --git a/pc-bios/vof/ci.c b/pc-bios/vof/ci.c
new file mode 100644
index 000000000000..4880b3d2047c
--- /dev/null
+++ b/pc-bios/vof/ci.c
@@ -0,0 +1,108 @@ 
+#include "vof.h"
+
+struct prom_args {
+        uint32_t service;
+        uint32_t nargs;
+        uint32_t nret;
+        uint32_t args[10];
+};
+
+#define ADDR(x) (uint32_t)(x)
+
+extern uint32_t ci_entry(uint32_t params);
+
+extern unsigned long hv_rtas(unsigned long params);
+extern unsigned int hv_rtas_size;
+
+bool prom_handle(struct prom_args *pargs)
+{
+	void *rtasbase;
+	uint32_t rtassize = 0;
+	phandle rtas;
+
+	if (strcmp("call-method", (void *)(unsigned long) pargs->service))
+		return false;
+
+	if (strcmp("instantiate-rtas", (void *)(unsigned long) pargs->args[0]))
+		return false;
+
+	rtas = ci_finddevice("/rtas");
+	ci_getprop(rtas, "rtas-size", &rtassize, sizeof(rtassize));
+	if (rtassize < hv_rtas_size)
+		return false;
+
+	rtasbase = (void *)(unsigned long) pargs->args[2];
+
+	memcpy(rtasbase, hv_rtas, hv_rtas_size);
+	pargs->args[pargs->nargs] = 0;
+	pargs->args[pargs->nargs + 1] = pargs->args[2];
+
+	return true;
+}
+
+void prom_entry(uint32_t args)
+{
+	if (!prom_handle((void *)(unsigned long) args))
+		ci_entry(args);
+}
+
+int call_prom(const char *service, int nargs, int nret, ...)
+{
+        int i;
+        struct prom_args args;
+        va_list list;
+
+        args.service = ADDR(service);
+        args.nargs = nargs;
+        args.nret = nret;
+
+        va_start(list, nret);
+        for (i = 0; i < nargs; i++)
+                args.args[i] = va_arg(list, prom_arg_t);
+        va_end(list);
+
+        for (i = 0; i < nret; i++)
+                args.args[nargs+i] = 0;
+
+        if (ci_entry((uint32_t)(&args)) < 0)
+                return PROM_ERROR;
+
+        return (nret > 0) ? args.args[nargs] : 0;
+}
+
+void ci_panic(const char *str)
+{
+	call_prom("exit", 0, 0);
+}
+
+phandle ci_finddevice(const char *path)
+{
+	return call_prom("finddevice", 1, 1, path);
+}
+
+uint32_t ci_getprop(phandle ph, const char *propname, void *prop, int len)
+{
+	return call_prom("getprop", 4, 1, ph, propname, prop, len);
+}
+
+ihandle ci_open(const char *path)
+{
+	return call_prom("open", 1, 1, path);
+}
+
+void ci_close(ihandle ih)
+{
+	call_prom("close", 1, 0, ih);
+}
+
+void *ci_claim(void *virt, uint32_t size, uint32_t align)
+{
+	uint32_t ret = call_prom("claim", 3, 1, ADDR(virt), size, align);
+
+	return (void *) (unsigned long) ret;
+}
+
+uint32_t ci_release(void *virt, uint32_t size)
+{
+	return call_prom("release", 2, 1, ADDR(virt), size);
+}
diff --git a/pc-bios/vof/libc.c b/pc-bios/vof/libc.c
new file mode 100644
index 000000000000..8603aedcb32c
--- /dev/null
+++ b/pc-bios/vof/libc.c
@@ -0,0 +1,91 @@ 
+#include "vof.h"
+
+int strlen(const char *s)
+{
+	int len = 0;
+
+	while (*s != 0) {
+		len += 1;
+		s += 1;
+	}
+
+	return len;
+}
+
+int strcmp(const char *s1, const char *s2)
+{
+        while (*s1 != 0 && *s2 != 0) {
+                if (*s1 != *s2)
+                        break;
+                s1 += 1;
+                s2 += 1;
+        }
+
+        return *s1 - *s2;
+}
+
+void *memcpy(void *dest, const void *src, size_t n)
+{
+        char *cdest;
+        const char *csrc = src;
+
+        cdest = dest;
+        while (n-- > 0) {
+                *cdest++ = *csrc++;
+        }
+
+        return dest;
+}
+
+int memcmp(const void *ptr1, const void *ptr2, size_t n)
+{
+        const unsigned char *p1 = ptr1;
+        const unsigned char *p2 = ptr2;
+
+        while (n-- > 0) {
+                if (*p1 != *p2)
+                        return (*p1 - *p2);
+                p1 += 1;
+                p2 += 1;
+        }
+
+        return 0;
+}
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+        char *cdest;
+        const char *csrc;
+        int i;
+
+        /* Do the buffers overlap in a bad way? */
+        if (src < dest && src + n >= dest) {
+                /* Copy from end to start */
+                cdest = dest + n - 1;
+                csrc = src + n - 1;
+                for (i = 0; i < n; i++) {
+                        *cdest-- = *csrc--;
+                }
+        }
+        else {
+                /* Normal copy is possible */
+                cdest = dest;
+                csrc = src;
+                for (i = 0; i < n; i++) {
+                        *cdest++ = *csrc++;
+                }
+        }
+
+        return dest;
+}
+
+void *memset(void *dest, int c, size_t size)
+{
+        unsigned char *d = (unsigned char *)dest;
+
+        while (size-- > 0) {
+                *d++ = (unsigned char)c;
+        }
+
+        return dest;
+}
diff --git a/pc-bios/vof/main.c b/pc-bios/vof/main.c
new file mode 100644
index 000000000000..34299a9cc5ad
--- /dev/null
+++ b/pc-bios/vof/main.c
@@ -0,0 +1,22 @@ 
+#include "vof.h"
+
+
+void do_boot(unsigned long addr, unsigned long _r3, unsigned long _r4)
+{
+	register unsigned long r3 __asm__("r3") = _r3;
+	register unsigned long r4 __asm__("r4") = _r4;
+	register unsigned long r5 __asm__("r5") = (unsigned long) _prom_entry;
+
+	((client *)(uint32_t)addr)();
+}
+
+void entry_c(void)
+{
+	register unsigned long r3 __asm__("r3");
+	register unsigned long r4 __asm__("r4");
+	register unsigned long r5 __asm__("r5");
+	uint64_t initrd = r3, initrdsize = r4;
+
+	boot_from_memory(initrd, initrdsize);
+	ci_panic("*** No boot target ***\n");
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index 8201f12271b7..469b76b36b2a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1335,6 +1335,17 @@  F: pc-bios/canyonlands.dt[sb]
 F: pc-bios/u-boot-sam460ex-20100605.bin
 F: roms/u-boot-sam460ex
 
+VOF
+M: Alexey Kardashevskiy <aik@ozlabs.ru>
+M: David Gibson <david@gibson.dropbear.id.au>
+M: Greg Kurz <groug@kaod.org>
+L: qemu-ppc@nongnu.org
+S: Maintained
+F: hw/ppc/spapr_vof*
+F: hw/ppc/vof*
+F: pc-bios/vof/*
+F: pc-bios/vof*
+
 RISC-V Machines
 ---------------
 OpenTitan
diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build
index 218631c883be..24427d3f51c1 100644
--- a/hw/ppc/meson.build
+++ b/hw/ppc/meson.build
@@ -28,6 +28,8 @@  ppc_ss.add(when: 'CONFIG_PSERIES', if_true: files(
   'spapr_rtas_ddw.c',
   'spapr_numa.c',
   'pef.c',
+  'spapr_vof.c',
+  'vof.c',
 ))
 ppc_ss.add(when: 'CONFIG_SPAPR_RNG', if_true: files('spapr_rng.c'))
 ppc_ss.add(when: ['CONFIG_PSERIES', 'CONFIG_LINUX'], if_true: files(
diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
index 1e91984526a3..017c48624f96 100644
--- a/hw/ppc/trace-events
+++ b/hw/ppc/trace-events
@@ -71,6 +71,27 @@  spapr_rtas_ibm_configure_connector_invalid(uint32_t index) "DRC index: 0x%"PRIx3
 spapr_vio_h_reg_crq(uint64_t reg, uint64_t queue_addr, uint64_t queue_len) "CRQ for dev 0x%" PRIx64 " registered at 0x%" PRIx64 "/0x%" PRIx64
 spapr_vio_free_crq(uint32_t reg) "CRQ for dev 0x%" PRIx32 " freed"
 
+# vof.c
+vof_error_str_truncated(const char *s, int len) "%s truncated to %d"
+vof_error_param(const char *method, int nargscheck, int nretcheck, int nargs, int nret) "%s takes/returns %d/%d, not %d/%d"
+vof_error_unknown_service(const char *service, int nargs, int nret) "\"%s\" args=%d rets=%d"
+vof_error_unknown_method(const char *method) "\"%s\""
+vof_error_unknown_ihandle_close(uint32_t ih) "ih=0x%x"
+vof_error_unknown_path(const char *path) "\"%s\""
+vof_finddevice(const char *path, uint32_t ph) "\"%s\" => ph=0x%x"
+vof_canon(const char *path) "\"%s\""
+vof_claim(uint32_t virt, uint32_t size, uint32_t align, uint32_t ret) "virt=0x%x size=0x%x align=0x%x => 0x%x"
+vof_release(uint32_t virt, uint32_t size, uint32_t ret) "virt=0x%x size=0x%x => 0x%x"
+vof_method(uint32_t ihandle, const char *method, uint32_t param, uint32_t ret, uint32_t ret2) "ih=0x%x \"%s\"(0x%x) => 0x%x 0x%x"
+vof_getprop(uint32_t ph, const char *prop, uint32_t ret, const char *val) "ph=0x%x \"%s\" => len=%d [%s]"
+vof_getproplen(uint32_t ph, const char *prop, uint32_t ret) "ph=0x%x \"%s\" => len=%d"
+vof_setprop(uint32_t ph, const char *prop, const char *val, uint32_t ret) "ph=0x%x \"%s\" [%s] => len=%d"
+vof_open(const char *path, uint32_t ph, uint32_t ih) "%s ph=0x%x => ih=0x%x"
+vof_interpret(const char *cmd, uint32_t param1, uint32_t param2, uint32_t ret, uint32_t ret2) "[%s] 0x%x 0x%x => 0x%x 0x%x"
+vof_package_to_path(uint32_t ph, const char *tmp, uint32_t ret) "ph=0x%x => %s len=%d"
+vof_instance_to_path(uint32_t ih, uint32_t ph, const char *tmp, uint32_t ret) "ih=0x%x ph=0x%x => %s len=%d"
+vof_instance_to_package(uint32_t ih, uint32_t ph) "ih=0x%x => ph=0x%x"
+
 # ppc.c
 ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
 
diff --git a/pc-bios/README b/pc-bios/README
index db7129ef6484..176587da8ea5 100644
--- a/pc-bios/README
+++ b/pc-bios/README
@@ -16,6 +16,8 @@ 
   https://github.com/aik/SLOF, and the image currently in qemu is
   built from git tag qemu-slof-20200717.
 
+- vof is a minimalistic firmware to work with -machine pseries,x-vof=on.
+
 - sgabios (the Serial Graphics Adapter option ROM) provides a means for
   legacy x86 software to communicate with an attached serial console as
   if a video card were attached.  The master sources reside in a subversion
diff --git a/pc-bios/vof.bin b/pc-bios/vof.bin
new file mode 100755
index 0000000000000000000000000000000000000000..0606d9451c6bff39b32879c2a3369406a6a0d07d
GIT binary patch
literal 3680
zcmd^BUuauZ82{2_+ue2@_aP6FMSANxX==+V^r6wxq_aZW%u<<!iXBNzt}d80wap}x
z#N4z}S8<TCJvivU2Pyj2hY5XKg&_s^u!ji|bnD&*d)b3vWu4dGckj6_{g=h<(ZJ!H
zob#RU_kF+b&$&eZ?_Xe@C~goH)*Xq?AW`(>B?=MUwAULA#>}I8bhdE6ZYf0j&qP62
z;l6X}0rol?XtZA^HJ3P=?ZiPe{%~Ip(aYQR$lk+M_mu|YR!*v0DmkeA@{iPX`5v|8
z0*A9F7ieE$t!^oBha7L*`MOjpkW^YAxpbWry+E?QKuWPor~L77wiqBO?{hBo>#hd2
z$6uf}{V=(5l5=VDXJcdXfw7s7y`N<}YQ$4>$)nks0Lgxp2(y%%KF0su;=iJ^9e2qM
zTO`;bhv?8H{d3(Fq5~Idv7ype_~x;3z_pkC@%zZZxLA9{+QaK<AJK_vqTsyoo9Xwl
zaF+eu#m{wMc)+KZ`Ly|i{&v1M-Aja7W_^XPLc!oX1$n;$R~Bp8loz<&^QL{~Jz>W+
z=#<!JTQ1*h#>AA#`vwN`mW}EzHm<_P4%pZ!Z1f<;Uc^|1E-&+s*~2^lqp=u4Jeo~j
zOq=`%v95<NeMDC+e^h_3Va!BGW!t*vj9YwP;0OOBzKLtw0sH~1ckl*2*j9;o-0ru{
z^&XsVxAj}jfoHNcV#dT{#&MQPutx<~-h6YP8?(9JJx5$8s^4cAyvJuEzwA9zo;kO3
z9QL<aoP$#<7LxxU)>}DOgntTT6~=%a)IDpg;r=lGZ{cyL?yB+JQ#b2<Ca#1%u+6Ho
zUf@SiXW(nFZ-3Vwc~pK1UJX6$%i9%o&OCXFoHLE#Ecyn8jm>ws=a_jrZ^ZyMa*eZ`
zEaY5}^X(#c4LVRG&NE!+{s1W*tK{O|sL$)Njy;@xZ^oxfR~T31iWzI*WCRZ9gp9sw
z%82{i<7D6t3K<vlcm{pS>zqS~z3dO7NA{Za&$hZaA6TZ?NA6<1kZrPLSq?ei5V5P)
zr`N^1&{)~Ww!RBfI{OQ|B-R%;{=Q&ygMMcE9X9?t&%FsfQmyhPv<a-+z*<8;y1DZW
znsNhc2>odo{V7_yOR17Uqo{=pILppNC>NZf#&Ur4a)d6H3sfpE&^P7lv{=4N-<1to
z{<WS~VC#^mrD4pR2T{!Yt~jp-D9H7Jxf?MoW4{M8pNB-<a34%eL+;gSI)Iwyo*3f)
z+zSUK<ZnF%?btw;_4oLD$p!m*idKvyKKIvyGb<*Fh~1OkA2B%{7Il230=pe}T+<TA
z-qZtGkq1ge9&FbY6}oP~AK8l4pTD#3<{L6+_R!(&`Z!PFgT(kEUv5>2VGa56#Q^6^
z$0AXOPK28Aocigt3FM6<N>9y^8uU*-Wc~OqU<?N^9Ki5e7?0;TH-#V7a<PV9GI^14
z?C&Auq8nUtttq;Jynll40=_EnU#h@=&bphFcJZ^(!rA<x0;igvHSjsHSb@VaV?LV%
z&f7J_Y*o~CPvv*U)*dp^35?3`4i9)#!J{|-N;oUx=mc(yh5P6i_764rZ{I<02|iz5
z7FX<ea&2)8E3oT;-6pW#tC8Df;rF}nJA{6AYG*$y@;6}<a;OFUtPMHT4jjBgk6&b4
zP^bKcDB=xp>Da@W#`x$x3i@jcY~?rKoe!PnZ;KYJ+w*Uo&hlH8cfN^!{Ks*wUlFH)
zy*NVYQFC*%+MiZGNT)MuCN}nwmQk^Fh~lwSs`Z?fIh#&Ua%4Oc8_6VN8Lf3J6C3Ay
zC&YrM*74+L8uqkoGQ(#dCPxwp?bBo&n@~79mL8?_s5Zi9@l<-8W#g$>@*L(dEv3b<
zb0U5g2P5=}cJ6!&ThscnHa4QA_zsc7L-axxql7Vwv5c{TeVjXnu)cwD6Qhi=j&UF3
NA;t#L5!^o_{R4q+IM@IH

literal 0
HcmV?d00001

diff --git a/pc-bios/vof/entry.S b/pc-bios/vof/entry.S
new file mode 100644
index 000000000000..90f4b859a059
--- /dev/null
+++ b/pc-bios/vof/entry.S
@@ -0,0 +1,51 @@ 
+#define LOAD32(rn, name)    \
+	lis     rn,name##@h;    \
+	ori     rn,rn,name##@l
+
+#define ENTRY(func_name)    \
+	.text;                  \
+	.align  2;              \
+	.globl  .func_name;     \
+	.func_name:             \
+	.globl  func_name;      \
+	func_name:
+
+#define KVMPPC_HCALL_BASE       0xf000
+#define KVMPPC_H_RTAS           (KVMPPC_HCALL_BASE + 0x0)
+#define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
+
+	. = 0x100 /* Do exactly as SLOF does */
+
+ENTRY(_start)
+	LOAD32(%r31, 0) /* Go 32bit mode */
+	mtmsrd %r31,0
+	LOAD32(2, __toc_start)
+	b entry_c
+
+ENTRY(_prom_entry)
+	LOAD32(2, __toc_start)
+	stdu    %r1,-112(%r1)
+	std     %r31,104(%r1)
+	mflr    %r31
+	bl prom_entry
+	nop
+	mtlr    %r31
+	ld      %r31,104(%r1)
+	addi    %r1,%r1,112
+	blr
+
+ENTRY(ci_entry)
+	mr	4,3
+	LOAD32(3,KVMPPC_H_VOF_CLIENT)
+	sc	1
+	blr
+
+/* This is the actual RTAS blob copied to the OS at instantiate-rtas */
+ENTRY(hv_rtas)
+	mr      %r4,%r3
+	LOAD32(3,KVMPPC_H_RTAS)
+	sc	1
+	blr
+	.globl hv_rtas_size
+hv_rtas_size:
+	.long . - hv_rtas;
diff --git a/pc-bios/vof/l.lds b/pc-bios/vof/l.lds
new file mode 100644
index 000000000000..10b557a81f78
--- /dev/null
+++ b/pc-bios/vof/l.lds
@@ -0,0 +1,48 @@ 
+OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc")
+OUTPUT_ARCH(powerpc:common)
+
+/* set the entry point */
+ENTRY ( __start )
+
+SECTIONS {
+	__executable_start = .;
+
+	.text : {
+		*(.text)
+	}
+
+	__etext = .;
+
+	. = ALIGN(8);
+
+	.data : {
+		*(.data)
+		*(.rodata .rodata.*)
+		*(.got1)
+		*(.sdata)
+		*(.opd)
+	}
+
+	/* FIXME bss at end ??? */
+
+	. = ALIGN(8);
+	__bss_start = .;
+	.bss : {
+		*(.sbss) *(.scommon)
+		*(.dynbss)
+		*(.bss)
+	}
+
+	. = ALIGN(8);
+	__bss_end = .;
+	__bss_size = (__bss_end - __bss_start);
+
+	. = ALIGN(256);
+	__toc_start = DEFINED (.TOC.) ? .TOC. : ADDR (.got) + 0x8000;
+	.got :
+	{
+		 *(.toc .got)
+	}
+	. = ALIGN(8);
+	__toc_end = .;
+}
diff --git a/pc-bios/vof/nvram.bin b/pc-bios/vof/nvram.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d183901cf980a91d81c4348bb20487c7bb62a2ec