
[v1,3/4] hw/arm/virt-acpi-build: patch guest SRAT for NUMA nodes

Message ID 20230915024559.6565-4-ankita@nvidia.com
State New
Series vfio: report NUMA nodes for device memory

Commit Message

Ankit Agrawal Sept. 15, 2023, 2:45 a.m. UTC
From: Ankit Agrawal <ankita@nvidia.com>

During bootup, the Linux kernel parses the ACPI SRAT to determine the
PXM ids and creates a NUMA node for each unique id.

Insert a series of unique PXM ids in the VM SRAT ACPI table. The range
of nodes is determined from the "dev_mem_pxm_start" and
"dev_mem_pxm_count" object properties associated with the device. These
nodes are marked MEM_AFFINITY_HOTPLUGGABLE, which allows the kernel to
create memoryless NUMA nodes at bootup to which a subrange (or the
entire range) of device memory can later be added/removed.

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 hw/arm/virt-acpi-build.c | 54 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
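
For reference, each entry this patch emits is an SRAT Memory Affinity
structure (ACPI spec, Section 5.2.16.2). A sketch of the 40-byte layout,
with comments marking the fields these zero-size entries set; this is
illustrative C only, not a QEMU type:

#include <stdint.h>

/* SRAT Memory Affinity structure, per ACPI spec Section 5.2.16.2 (sketch). */
struct srat_mem_affinity {
    uint8_t  type;              /* 1 = Memory Affinity */
    uint8_t  length;            /* always 40 */
    uint32_t proximity_domain;  /* PXM id: start_node + node_index here */
    uint16_t reserved1;
    uint64_t base_address;      /* 0 in the entries built by this patch */
    uint64_t range_length;      /* 0: no memory present at boot */
    uint32_t reserved2;
    uint32_t flags;             /* bit 0 Enabled, bit 1 Hot Pluggable */
    uint64_t reserved3;
} __attribute__((packed));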

Comments

Igor Mammedov Sept. 15, 2023, 2:52 p.m. UTC | #1
On Thu, 14 Sep 2023 19:45:58 -0700
<ankita@nvidia.com> wrote:

> From: Ankit Agrawal <ankita@nvidia.com>
> 
> During bootup, the Linux kernel parses the ACPI SRAT to determine the
> PXM ids and creates a NUMA node for each unique id.
> 
> Insert a series of unique PXM ids in the VM SRAT ACPI table. The range
> of nodes is determined from the "dev_mem_pxm_start" and
> "dev_mem_pxm_count" object properties associated with the device. These
> nodes are marked MEM_AFFINITY_HOTPLUGGABLE, which allows the kernel to
> create memoryless NUMA nodes at bootup to which a subrange (or the
> entire range) of device memory can later be added/removed.

QEMU already has 'memory devices'; perhaps this case belongs to the same
class. CCing David.

> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  hw/arm/virt-acpi-build.c | 54 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 54 insertions(+)
> 
> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> index 6b674231c2..6d1e3b6b8a 100644
> --- a/hw/arm/virt-acpi-build.c
> +++ b/hw/arm/virt-acpi-build.c
> @@ -46,6 +46,7 @@
>  #include "hw/acpi/hmat.h"
>  #include "hw/pci/pcie_host.h"
>  #include "hw/pci/pci.h"
> +#include "hw/vfio/pci.h"
>  #include "hw/pci/pci_bus.h"
>  #include "hw/pci-host/gpex.h"
>  #include "hw/arm/virt.h"
> @@ -515,6 +516,57 @@ build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>      acpi_table_end(linker, &table);
>  }
>  
> +static int devmem_device_list(Object *obj, void *opaque)
> +{
> +    GSList **list = opaque;
> +
> +    if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
> +        *list = g_slist_append(*list, DEVICE(obj));
> +    }
> +
> +    object_child_foreach(obj, devmem_device_list, opaque);
> +    return 0;
> +}
> +
> +static GSList *devmem_get_device_list(void)
> +{
> +    GSList *list = NULL;
> +
> +    object_child_foreach(qdev_get_machine(), devmem_device_list, &list);
> +    return list;
> +}
> +
> +static void build_srat_devmem(GArray *table_data)
> +{
> +    GSList *device_list, *list = devmem_get_device_list();
> +
> +    for (device_list = list; device_list; device_list = device_list->next) {
> +        DeviceState *dev = device_list->data;
> +        Object *obj = OBJECT(dev);
> +        VFIOPCIDevice *pcidev
> +            = ((VFIOPCIDevice *)object_dynamic_cast(OBJECT(obj),
> +               TYPE_VFIO_PCI));
> +
> +        if (pcidev->pdev.has_coherent_memory) {
> +            uint64_t start_node = object_property_get_uint(obj,
> +                                  "dev_mem_pxm_start", &error_abort);
> +            uint64_t node_count = object_property_get_uint(obj,
> +                                  "dev_mem_pxm_count", &error_abort);
> +            uint64_t node_index;
> +
> +            /*
> +             * Add the node_count PXM domains starting from start_node as
> +             * hot pluggable. The VM kernel parses the PXM domains and
> +             * creates NUMA nodes.
> +             */
> +            for (node_index = 0; node_index < node_count; node_index++)
> +                build_srat_memory(table_data, 0, 0, start_node + node_index,
> +                    MEM_AFFINITY_ENABLED | MEM_AFFINITY_HOTPLUGGABLE);
> +        }
> +    }
> +    g_slist_free(list);
> +}
> +
>  /*
>   * ACPI spec, Revision 5.1
>   * 5.2.16 System Resource Affinity Table (SRAT)
> @@ -569,6 +621,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>                            MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
>      }
>  
> +    build_srat_devmem(table_data);
> +
>      acpi_table_end(linker, &table);
>  }
>
David Hildenbrand Sept. 15, 2023, 3:49 p.m. UTC | #2
On 15.09.23 16:52, Igor Mammedov wrote:
> On Thu, 14 Sep 2023 19:45:58 -0700
> <ankita@nvidia.com> wrote:
> 
>> From: Ankit Agrawal <ankita@nvidia.com>
>>
>> During bootup, the Linux kernel parses the ACPI SRAT to determine the
>> PXM ids and creates a NUMA node for each unique id.
>>
>> Insert a series of unique PXM ids in the VM SRAT ACPI table. The range
>> of nodes is determined from the "dev_mem_pxm_start" and
>> "dev_mem_pxm_count" object properties associated with the device. These
>> nodes are marked MEM_AFFINITY_HOTPLUGGABLE, which allows the kernel to
>> create memoryless NUMA nodes at bootup to which a subrange (or the
>> entire range) of device memory can later be added/removed.
> 
> QEMU already has 'memory devices'; perhaps this case belongs to the same
> class. CCing David.

There is demand for something similar for memory devices (DIMMs, 
virtio-mem) as well.
Ankit Agrawal Sept. 22, 2023, 5:49 a.m. UTC | #3
Hi Jonathan

> > +        if (pcidev->pdev.has_coherent_memory) {
> > +            uint64_t start_node = object_property_get_uint(obj,
> > +                                  "dev_mem_pxm_start", &error_abort);
> > +            uint64_t node_count = object_property_get_uint(obj,
> > +                                  "dev_mem_pxm_count", &error_abort);
> > +            uint64_t node_index;
> > +
> > +            /*
> > +             * Add the node_count PXM domains starting from start_node as
> > +             * hot pluggable. The VM kernel parses the PXM domains and
> > +             * creates NUMA nodes.
> > +             */
> > +            for (node_index = 0; node_index < node_count; node_index++)
> > +                build_srat_memory(table_data, 0, 0, start_node + node_index,
> > +                    MEM_AFFINITY_ENABLED | MEM_AFFINITY_HOTPLUGGABLE);
> 
> 0 size SRAT entries for memory? That's not valid.

Can you explain in what sense these are invalid? The Linux kernel
accepts such a setting, and I have tested it.

> Seems like you've run into the same issue CXL has with dynamic addition of
> nodes to the kernel and all you want to do here is make sure it thinks there are
> enough nodes so it initializes various structures large enough.
>
Yes, exactly.
Jason Gunthorpe Sept. 25, 2023, 2:03 p.m. UTC | #4
On Mon, Sep 25, 2023 at 02:54:40PM +0100, Jonathan Cameron wrote:

> It's possible the ASWG folks would say this is fine and I'm reading too
> much into the spec, but I'd definitely suggest asking them via the
> appropriate path, or throwing in a code-first proposal for comment on
> this special case and seeing what response you get - my guess is it
> will be 'fix Linux' :(

The goal here is for qemu to emulate what the bare metal environment
is doing.

There may be a legitimate question as to whether what the bare metal FW
has done is valid (though, let's face it, there are lots of creative
ACPI things around), but I don't quite see how this is a qemu question?

Unless you are taking the position that qemu should not emulate this
HW?

Jason
Jason Gunthorpe Sept. 25, 2023, 4 p.m. UTC | #5
On Mon, Sep 25, 2023 at 03:53:51PM +0100, Jonathan Cameron wrote:
> On Mon, 25 Sep 2023 11:03:28 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Mon, Sep 25, 2023 at 02:54:40PM +0100, Jonathan Cameron wrote:
> > 
> > > It's possible the ASWG folks would say this is fine and I'm reading too
> > > much into the spec, but I'd definitely suggest asking them via the
> > > appropriate path, or throwing in a code-first proposal for comment on
> > > this special case and seeing what response you get - my guess is it
> > > will be 'fix Linux' :(
> > 
> > The goal here is for qemu to emulate what the bare metal environment
> > is doing.
> > 
> > There may be a legitimate question as to whether what the bare metal FW
> > has done is valid (though, let's face it, there are lots of creative
> > ACPI things around), but I don't quite see how this is a qemu question?
> > 
> > Unless you are taking the position that qemu should not emulate this
> > HW?
> 
> Ok. I'd failed to register that the bare metal code was doing this,
> though with hindsight I guess that is obvious. Though without more info
> or a bare metal example being given, it's also possible the BIOS was
> doing enumeration etc. (like CXL does for all < 2.0 devices) and hence
> was building SRAT with the necessary memory ranges in place - even if
> the driver then does the hot add dance later.

Ankit, maybe you can share some relevant ACPI dumps from the physical
hardware and explain how this compares?

> That's dubious and likely to break at some point unless the spec
> comprehends this use case, but meh, so are lots of other things and
> the hardware vendor gets to pick up the pieces and deal with grumpy
> customers.

Yes.

> I don't currently see this as a safe solution for the other proposed
> use cases, however, which are virtualization only.

So, how should that translate into a command line experience? Sounds
like the broad concept is general, but this actual specific HW is not?

Thanks,
Jason
Ankit Agrawal Sept. 26, 2023, 2:54 p.m. UTC | #6
> With an ACPI spec clarification agreed, I'm fine with using this for
> all the cases that have come up in this thread. Or a good argument
> that this is valid under the existing ACPI spec.

Hi Jonathan

I looked at Section 5.2.16 of the ACPI spec, but could not see any
mention that size == 0 is invalid or should be avoided.
https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf

If that is not convincing, do you have suggestions as to how I may
confirm this?
Ankit Agrawal Sept. 27, 2023, 7:06 a.m. UTC | #7
>> Ok. I'd failed to register that the bare metal code was doing this,
>> though with hindsight I guess that is obvious. Though without more info
>> or a bare metal example being given, it's also possible the BIOS was
>> doing enumeration etc. (like CXL does for all < 2.0 devices) and hence
>> was building SRAT with the necessary memory ranges in place - even if
>> the driver then does the hot add dance later.
>
> Ankit, maybe you can share some relevant ACPI dumps from the physical
> hardware and explain how this compares?

Yeah, we are pretty much emulating the bare metal behavior here (I
should have been clearer). The PXM domains 8-15 are the "device
associated" NUMA nodes on the host. Here are the kernel logs from the
bare metal system related to these NUMA nodes.

[    0.000000] ACPI: SRAT: Node 1 PXM 8 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 2 PXM 9 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 3 PXM 10 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 4 PXM 11 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 5 PXM 12 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 6 PXM 13 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 7 PXM 14 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 8 PXM 15 [mem 0x00000000-0xffffffffffffffff] hotplug

...

[    0.000000] Initmem setup node 2 as memoryless
[    0.000000] Initmem setup node 3 as memoryless
[    0.000000] Initmem setup node 4 as memoryless
[    0.000000] Initmem setup node 5 as memoryless
[    0.000000] Initmem setup node 6 as memoryless
[    0.000000] Initmem setup node 7 as memoryless
[    0.000000] Initmem setup node 8 as memoryless
[    0.000000] Initmem setup node 9 as memoryless



On the VM, it looks like the following, with PXM 2-9 associated with the device.

[    0.000000] ACPI: SRAT: Node 2 PXM 2 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 3 PXM 3 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 4 PXM 4 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 5 PXM 5 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 6 PXM 6 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 7 PXM 7 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 8 PXM 8 [mem 0x00000000-0xffffffffffffffff] hotplug
[    0.000000] ACPI: SRAT: Node 9 PXM 9 [mem 0x00000000-0xffffffffffffffff] hotplug
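
A side note on reading these logs: the odd-looking range
[mem 0x00000000-0xffffffffffffffff] is just how the kernel prints a
zero-size entry. It logs base through base + length - 1, and with base
and length both 0 the unsigned subtraction wraps around. A minimal
sketch of the arithmetic:

#include <stdint.h>
#include <stdio.h>

/*
 * Why a zero-size SRAT entry prints as [mem 0x00000000-0xffffffffffffffff]:
 * the logged end address is base + length - 1, which wraps to all-ones
 * in unsigned 64-bit arithmetic when base == length == 0.
 */
int main(void)
{
    uint64_t base = 0, length = 0;

    printf("[mem %#010llx-%#018llx]\n",
           (unsigned long long)base,
           (unsigned long long)(base + length - 1));
    return 0;
}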

Patch

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 6b674231c2..6d1e3b6b8a 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -46,6 +46,7 @@ 
 #include "hw/acpi/hmat.h"
 #include "hw/pci/pcie_host.h"
 #include "hw/pci/pci.h"
+#include "hw/vfio/pci.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci-host/gpex.h"
 #include "hw/arm/virt.h"
@@ -515,6 +516,57 @@  build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
     acpi_table_end(linker, &table);
 }
 
+static int devmem_device_list(Object *obj, void *opaque)
+{
+    GSList **list = opaque;
+
+    if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
+        *list = g_slist_append(*list, DEVICE(obj));
+    }
+
+    object_child_foreach(obj, devmem_device_list, opaque);
+    return 0;
+}
+
+static GSList *devmem_get_device_list(void)
+{
+    GSList *list = NULL;
+
+    object_child_foreach(qdev_get_machine(), devmem_device_list, &list);
+    return list;
+}
+
+static void build_srat_devmem(GArray *table_data)
+{
+    GSList *device_list, *list = devmem_get_device_list();
+
+    for (device_list = list; device_list; device_list = device_list->next) {
+        DeviceState *dev = device_list->data;
+        Object *obj = OBJECT(dev);
+        VFIOPCIDevice *pcidev
+            = ((VFIOPCIDevice *)object_dynamic_cast(OBJECT(obj),
+               TYPE_VFIO_PCI));
+
+        if (pcidev->pdev.has_coherent_memory) {
+            uint64_t start_node = object_property_get_uint(obj,
+                                  "dev_mem_pxm_start", &error_abort);
+            uint64_t node_count = object_property_get_uint(obj,
+                                  "dev_mem_pxm_count", &error_abort);
+            uint64_t node_index;
+
+            /*
+             * Add the node_count PXM domains starting from start_node as
+             * hot pluggable. The VM kernel parses the PXM domains and
+             * creates NUMA nodes.
+             */
+            for (node_index = 0; node_index < node_count; node_index++)
+                build_srat_memory(table_data, 0, 0, start_node + node_index,
+                    MEM_AFFINITY_ENABLED | MEM_AFFINITY_HOTPLUGGABLE);
+        }
+    }
+    g_slist_free(list);
+}
+
 /*
  * ACPI spec, Revision 5.1
  * 5.2.16 System Resource Affinity Table (SRAT)
@@ -569,6 +621,8 @@  build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
                           MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
     }
 
+    build_srat_devmem(table_data);
+
     acpi_table_end(linker, &table);
 }
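
As background on the "hot add dance" mentioned above: once the guest
boots with these memoryless nodes, a guest driver can later online the
device memory into one of them. A rough guest-kernel sketch, assuming an
approach along the lines of dax/kmem; the node id, address range, and
resource name below are made-up values for illustration, not taken from
this series:

#include <linux/memory_hotplug.h>
#include <linux/sizes.h>

/*
 * Illustrative sketch only: hot-add a subrange of device memory to one
 * of the memoryless NUMA nodes reserved by the SRAT entries above.
 * add_memory_driver_managed() expects a resource name of the form
 * "System RAM (driver)". All values here are hypothetical.
 */
static int example_hotplug_devmem(void)
{
    int nid = 2;                   /* one of the reserved PXM nodes */
    u64 start = 0x100000000000ULL; /* hypothetical device memory base */
    u64 size = SZ_2G;              /* hypothetical subrange to add */

    return add_memory_driver_managed(nid, start, size,
                                     "System RAM (example)", MHP_NONE);
}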