diff mbox series

[v5,6/8] hmat acpi: Build Memory Subsystem Address Range Structure(s) in ACPI HMAT

Message ID 20190614155626.27932-7-tao3.xu@intel.com
State New
Headers show
Series Build ACPI Heterogeneous Memory Attribute Table (HMAT) | expand

Commit Message

Tao Xu June 14, 2019, 3:56 p.m. UTC
From: Liu Jingqi <jingqi.liu@intel.com>

HMAT is defined in ACPI 6.2: 5.2.27 Heterogeneous Memory Attribute Table (HMAT).
The specification references below link:
http://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf

It describes the memory attributes, such as memory side cache
attributes and bandwidth and latency details, related to the
System Physical Address (SPA) Memory Ranges. The software is
expected to use this information as hint for optimization.

This structure describes the System Physical Address(SPA) range
occupied by memory subsystem and its associativity with processor
proximity domain as well as hint for memory usage.

Signed-off-by: Liu Jingqi <jingqi.liu@intel.com>
Signed-off-by: Tao Xu <tao3.xu@intel.com>
---

Changes in v5 -> v4:
    - Add more descriptions from ACPI spec (Igor)
    - Remove all the dependcy on PCMachineState (Igor)
---
 hw/acpi/Kconfig       |   5 ++
 hw/acpi/Makefile.objs |   1 +
 hw/acpi/hmat.c        | 153 ++++++++++++++++++++++++++++++++++++++++++
 hw/acpi/hmat.h        |  43 ++++++++++++
 hw/core/machine.c     |   2 +
 hw/i386/acpi-build.c  |   3 +
 include/sysemu/numa.h |   2 +
 numa.c                |   6 ++
 8 files changed, 215 insertions(+)
 create mode 100644 hw/acpi/hmat.c
 create mode 100644 hw/acpi/hmat.h

Comments

Jonathan Cameron June 27, 2019, 3:56 p.m. UTC | #1
On Fri, 14 Jun 2019 23:56:24 +0800
Tao Xu <tao3.xu@intel.com> wrote:

> From: Liu Jingqi <jingqi.liu@intel.com>
> 
> HMAT is defined in ACPI 6.2: 5.2.27 Heterogeneous Memory Attribute Table (HMAT).
> The specification references below link:
> http://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
> 
> It describes the memory attributes, such as memory side cache
> attributes and bandwidth and latency details, related to the
> System Physical Address (SPA) Memory Ranges. The software is
> expected to use this information as hint for optimization.
> 
> This structure describes the System Physical Address(SPA) range
> occupied by memory subsystem and its associativity with processor
> proximity domain as well as hint for memory usage.
> 
> Signed-off-by: Liu Jingqi <jingqi.liu@intel.com>
> Signed-off-by: Tao Xu <tao3.xu@intel.com>

Hi Tao,

Apologies if I missed an earlier discussion on this...

It's probably not letting an secrets out to say that there are very few
real hardware systems out there using the 6.2 version of HMAT.

Does it make sense to implement it rather than the somewhat tidied
up version in ACPI 6.3?

I would go so far as to say that one of the pushes behind making those
changes was that it shouldn't have much impact as no one was shipping
a firmware using the 6.2 version.  So any chance we can avoid
qemu effectively doing so, or at least defaulting to doing so?

I'm entirely in favor of the patch set in general btw as it's much
more useful than having to override with a hand crafted table, when
wanting to test unusual topologies.

Thanks,

Jonathan 

> ---
> 
> Changes in v5 -> v4:
>     - Add more descriptions from ACPI spec (Igor)
>     - Remove all the dependcy on PCMachineState (Igor)
> ---
>  hw/acpi/Kconfig       |   5 ++
>  hw/acpi/Makefile.objs |   1 +
>  hw/acpi/hmat.c        | 153 ++++++++++++++++++++++++++++++++++++++++++
>  hw/acpi/hmat.h        |  43 ++++++++++++
>  hw/core/machine.c     |   2 +
>  hw/i386/acpi-build.c  |   3 +
>  include/sysemu/numa.h |   2 +
>  numa.c                |   6 ++
>  8 files changed, 215 insertions(+)
>  create mode 100644 hw/acpi/hmat.c
>  create mode 100644 hw/acpi/hmat.h
> 
> diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
> index 7c59cf900b..039bb99efa 100644
> --- a/hw/acpi/Kconfig
> +++ b/hw/acpi/Kconfig
> @@ -7,6 +7,7 @@ config ACPI_X86
>      select ACPI_NVDIMM
>      select ACPI_CPU_HOTPLUG
>      select ACPI_MEMORY_HOTPLUG
> +    select ACPI_HMAT
>  
>  config ACPI_X86_ICH
>      bool
> @@ -31,3 +32,7 @@ config ACPI_VMGENID
>      bool
>      default y
>      depends on PC
> +
> +config ACPI_HMAT
> +    bool
> +    depends on ACPI
> diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
> index 661a9b8c2f..20cc2fb124 100644
> --- a/hw/acpi/Makefile.objs
> +++ b/hw/acpi/Makefile.objs
> @@ -6,6 +6,7 @@ common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o
>  common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o
>  common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
>  common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o
> +common-obj-$(CONFIG_ACPI_HMAT) += hmat.o
>  common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o
>  
>  common-obj-y += acpi_interface.o
> diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c
> new file mode 100644
> index 0000000000..6fd434c4d9
> --- /dev/null
> +++ b/hw/acpi/hmat.c
> @@ -0,0 +1,153 @@
> +/*
> + * HMAT ACPI Implementation
> + *
> + * Copyright(C) 2019 Intel Corporation.
> + *
> + * Author:
> + *  Liu jingqi <jingqi.liu@linux.intel.com>
> + *  Tao Xu <tao3.xu@intel.com>
> + *
> + * HMAT is defined in ACPI 6.2: 5.2.27 Heterogeneous Memory Attribute Table
> + * (HMAT)
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>
> + */
> +
> +#include "qemu/osdep.h"
> +#include "sysemu/numa.h"
> +#include "hw/acpi/hmat.h"
> +#include "hw/mem/pc-dimm.h"
> +
> +/* ACPI 6.2: 5.2.27.3 Memory Subsystem Address Range Structure: Table 5-141 */
> +static void build_hmat_spa(GArray *table_data, uint16_t flags,
> +                           uint64_t base, uint64_t length, int node)
> +{
> +
> +    /* Memory Subsystem Address Range Structure */
> +    /* Type */
> +    build_append_int_noprefix(table_data, 0, 2);
> +    /* Reserved */
> +    build_append_int_noprefix(table_data, 0, 2);
> +    /* Length */
> +    build_append_int_noprefix(table_data, 40, 4);
> +    /* Flags */
> +    build_append_int_noprefix(table_data, flags, 2);
> +    /* Reserved */
> +    build_append_int_noprefix(table_data, 0, 2);
> +    /* Process Proximity Domain */
> +    build_append_int_noprefix(table_data, node, 4);
> +    /* Memory Proximity Domain */
> +    build_append_int_noprefix(table_data, node, 4);
> +    /* Reserved */
> +    build_append_int_noprefix(table_data, 0, 4);
> +    /* System Physical Address Range Base */

These got scrapped in ACPI 6.3 because they didn't actually provide
any useful information that isn't always available from somewhere
else (SRAT mainly).

> +    build_append_int_noprefix(table_data, base, 8);
> +    /* System Physical Address Range Length */
> +    build_append_int_noprefix(table_data, length, 8);
> +}
> +
> +static int pc_dimm_device_list(Object *obj, void *opaque)
> +{
> +    GSList **list = opaque;
> +
> +    if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
> +        DeviceState *dev = DEVICE(obj);
> +        if (dev->realized) { /* only realized memory devices matter */
> +            *list = g_slist_append(*list, DEVICE(obj));
> +        }
> +    }
> +
> +    object_child_foreach(obj, pc_dimm_device_list, opaque);
> +    return 0;
> +}
> +
> +/* Build HMAT sub table structures */
> +static void hmat_build_table_structs(GArray *table_data, MachineState *ms)
> +{
> +    GSList *device_list = NULL;
> +    uint16_t flags;
> +    uint64_t mem_base, mem_len;
> +    int i;
> +    NumaState *nstat = ms->numa_state;
> +    NumaMemRange *mem_range;
> +
> +    Object *obj = object_resolve_path_type("", TYPE_ACPI_DEVICE_IF, NULL);
> +    AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
> +    AcpiDeviceIf *adev = ACPI_DEVICE_IF(obj);
> +
> +    /*
> +     * ACPI 6.2: 5.2.27.3 Memory Subsystem Address Range Structure:
> +     * Table 5-141. The Proximity Domain of System Physical Address
> +     * ranges defined in the HMAT, NFIT and SRAT tables shall match
> +     * each other.
> +     */
> +    if (nstat->num_nodes && !nstat->mem_ranges_num) {
> +        nstat->mem_ranges = g_array_new(false, true /* clear */,
> +                                        sizeof *mem_range);
> +        adevc->build_mem_ranges(adev, ms);
> +    }
> +
> +    for (i = 0; i < nstat->mem_ranges_num; i++) {
> +        mem_range = &g_array_index(nstat->mem_ranges, NumaMemRange, i);
> +        flags = 0;
> +
> +        if (nstat->nodes[mem_range->node].is_initiator) {
> +            flags |= HMAT_SPA_PROC_VALID;
> +        }
> +        if (nstat->nodes[mem_range->node].is_target) {
> +            flags |= HMAT_SPA_MEM_VALID;
> +        }
> +
> +        build_hmat_spa(table_data, flags, mem_range->base,
> +                       mem_range->length,
> +                       mem_range->node);
> +    }
> +
> +    /* Build HMAT SPA structures for PC-DIMM devices. */
> +    object_child_foreach(OBJECT(ms), pc_dimm_device_list, &device_list);
> +
> +    for (; device_list; device_list = device_list->next) {
> +        PCDIMMDevice *dimm = device_list->data;
> +        mem_base = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
> +                                            NULL);
> +        mem_len = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
> +                                           NULL);
> +        i = object_property_get_uint(OBJECT(dimm), PC_DIMM_NODE_PROP, NULL);
> +        flags = 0;
> +
> +        if (nstat->nodes[i].is_initiator) {
> +            flags |= HMAT_SPA_PROC_VALID;
> +        }
> +        if (nstat->nodes[i].is_target) {
> +            flags |= HMAT_SPA_MEM_VALID;
> +        }
> +        build_hmat_spa(table_data, flags, mem_base, mem_len, i);
> +    }
> +}
> +
> +void build_hmat(GArray *table_data, BIOSLinker *linker, MachineState *ms)
> +{
> +    uint64_t hmat_start;
> +
> +    hmat_start = table_data->len;
> +
> +    /* reserve space for HMAT header  */
> +    acpi_data_push(table_data, 40);
> +
> +    hmat_build_table_structs(table_data, ms);
> +
> +    build_header(linker, table_data,
> +                 (void *)(table_data->data + hmat_start),
> +                 "HMAT", table_data->len - hmat_start, 1, NULL, NULL);
> +}
> diff --git a/hw/acpi/hmat.h b/hw/acpi/hmat.h
> new file mode 100644
> index 0000000000..e24b673fad
> --- /dev/null
> +++ b/hw/acpi/hmat.h
> @@ -0,0 +1,43 @@
> +/*
> + * HMAT ACPI Implementation Header
> + *
> + * Copyright(C) 2019 Intel Corporation.
> + *
> + * Author:
> + *  Liu jingqi <jingqi.liu@linux.intel.com>
> + *  Tao Xu <tao3.xu@intel.com>
> + *
> + * HMAT is defined in ACPI 6.2.
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>
> + */
> +
> +#ifndef HMAT_H
> +#define HMAT_H
> +
> +#include "hw/acpi/acpi-defs.h"
> +#include "hw/acpi/acpi.h"
> +#include "hw/acpi/bios-linker-loader.h"
> +#include "hw/acpi/aml-build.h"
> +
> +/* the values of AcpiHmatSpaRange flag */
> +enum {
> +    HMAT_SPA_PROC_VALID       = 0x1,
> +    HMAT_SPA_MEM_VALID        = 0x2,
> +    HMAT_SPA_RESERVATION_HINT = 0x4,

Only the first bit ended up being kept for ACPI 6.3.

> +};
> +
> +void build_hmat(GArray *table_data, BIOSLinker *linker, MachineState *ms);
> +
> +#endif
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index 14b29de0a9..2ad09ec23e 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -646,6 +646,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
>                                 const CpuInstanceProperties *props, Error **errp)
>  {
>      MachineClass *mc = MACHINE_GET_CLASS(machine);
> +    NodeInfo *numa_info = machine->numa_state->nodes;
>      bool match = false;
>      int i;
>  
> @@ -706,6 +707,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
>          match = true;
>          slot->props.node_id = props->node_id;
>          slot->props.has_node_id = props->has_node_id;
> +        numa_info[props->node_id].is_initiator = true;
>      }
>  
>      if (!match) {
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 44dd447fa5..6584eac76e 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -66,6 +66,7 @@
>  #include "hw/i386/intel_iommu.h"
>  
>  #include "hw/acpi/ipmi.h"
> +#include "hw/acpi/hmat.h"
>  
>  /* These are used to size the ACPI tables for -M pc-i440fx-1.7 and
>   * -M pc-i440fx-2.0.  Even if the actual amount of AML generated grows
> @@ -2710,6 +2711,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
>              acpi_add_table(table_offsets, tables_blob);
>              build_slit(tables_blob, tables->linker, machine);
>          }
> +        acpi_add_table(table_offsets, tables_blob);
> +        build_hmat(tables_blob, tables->linker, machine);
>      }
>      if (acpi_get_mcfg(&mcfg)) {
>          acpi_add_table(table_offsets, tables_blob);
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index e3c85b77bc..13cff59112 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -10,6 +10,8 @@ struct NodeInfo {
>      uint64_t node_mem;
>      struct HostMemoryBackend *node_memdev;
>      bool present;
> +    bool is_initiator;
> +    bool is_target;
>      uint8_t distance[MAX_NODES];
>  };
>  
> diff --git a/numa.c b/numa.c
> index d23e130bce..5556d118c3 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -102,6 +102,10 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>          }
>      }
>  
> +    if (node->cpus) {
> +        numa_info[nodenr].is_initiator = true;
> +    }
> +
>      if (node->has_mem && node->has_memdev) {
>          error_setg(errp, "cannot specify both mem= and memdev=");
>          return;
> @@ -118,6 +122,7 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>  
>      if (node->has_mem) {
>          numa_info[nodenr].node_mem = node->mem;
> +        numa_info[nodenr].is_target = true;
>      }
>      if (node->has_memdev) {
>          Object *o;
> @@ -130,6 +135,7 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>          object_ref(o);
>          numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
>          numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
> +        numa_info[nodenr].is_target = true;
>      }
>      numa_info[nodenr].present = true;
>      max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
Tao Xu July 1, 2019, 12:58 a.m. UTC | #2
On 6/27/2019 11:56 PM, Jonathan Cameron wrote:
> On Fri, 14 Jun 2019 23:56:24 +0800
> Tao Xu <tao3.xu@intel.com> wrote:
> 
>> From: Liu Jingqi <jingqi.liu@intel.com>
>>
>> HMAT is defined in ACPI 6.2: 5.2.27 Heterogeneous Memory Attribute Table (HMAT).
>> The specification references below link:
>> http://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
>>
>> It describes the memory attributes, such as memory side cache
>> attributes and bandwidth and latency details, related to the
>> System Physical Address (SPA) Memory Ranges. The software is
>> expected to use this information as hint for optimization.
>>
>> This structure describes the System Physical Address(SPA) range
>> occupied by memory subsystem and its associativity with processor
>> proximity domain as well as hint for memory usage.
>>
>> Signed-off-by: Liu Jingqi <jingqi.liu@intel.com>
>> Signed-off-by: Tao Xu <tao3.xu@intel.com>
> 
> Hi Tao,
> 
> Apologies if I missed an earlier discussion on this...
> 
> It's probably not letting an secrets out to say that there are very few
> real hardware systems out there using the 6.2 version of HMAT.
> 
> Does it make sense to implement it rather than the somewhat tidied
> up version in ACPI 6.3?
> 
> I would go so far as to say that one of the pushes behind making those
> changes was that it shouldn't have much impact as no one was shipping
> a firmware using the 6.2 version.  So any chance we can avoid
> qemu effectively doing so, or at least defaulting to doing so?
> 
> I'm entirely in favor of the patch set in general btw as it's much
> more useful than having to override with a hand crafted table, when
> wanting to test unusual topologies.
> 
> Thanks,
> 
> Jonathan
> 
Thanks for your suggestion. After discussion, we decide to use ACPI 6.3 
in next version.
Igor Mammedov July 1, 2019, 11:25 a.m. UTC | #3
On Fri, 14 Jun 2019 23:56:24 +0800
Tao Xu <tao3.xu@intel.com> wrote:

> From: Liu Jingqi <jingqi.liu@intel.com>
> 
> HMAT is defined in ACPI 6.2: 5.2.27 Heterogeneous Memory Attribute Table (HMAT).
> The specification references below link:
> http://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
> 
> It describes the memory attributes, such as memory side cache
> attributes and bandwidth and latency details, related to the
> System Physical Address (SPA) Memory Ranges. The software is
> expected to use this information as hint for optimization.
> 
> This structure describes the System Physical Address(SPA) range
> occupied by memory subsystem and its associativity with processor
> proximity domain as well as hint for memory usage.
> 
> Signed-off-by: Liu Jingqi <jingqi.liu@intel.com>
> Signed-off-by: Tao Xu <tao3.xu@intel.com>
> ---
> 
> Changes in v5 -> v4:
>     - Add more descriptions from ACPI spec (Igor)
>     - Remove all the dependcy on PCMachineState (Igor)
> ---
>  hw/acpi/Kconfig       |   5 ++
>  hw/acpi/Makefile.objs |   1 +
>  hw/acpi/hmat.c        | 153 ++++++++++++++++++++++++++++++++++++++++++
>  hw/acpi/hmat.h        |  43 ++++++++++++
>  hw/core/machine.c     |   2 +
>  hw/i386/acpi-build.c  |   3 +
>  include/sysemu/numa.h |   2 +
>  numa.c                |   6 ++
>  8 files changed, 215 insertions(+)
>  create mode 100644 hw/acpi/hmat.c
>  create mode 100644 hw/acpi/hmat.h
> 
> diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
> index 7c59cf900b..039bb99efa 100644
> --- a/hw/acpi/Kconfig
> +++ b/hw/acpi/Kconfig
> @@ -7,6 +7,7 @@ config ACPI_X86
>      select ACPI_NVDIMM
>      select ACPI_CPU_HOTPLUG
>      select ACPI_MEMORY_HOTPLUG
> +    select ACPI_HMAT
>  
>  config ACPI_X86_ICH
>      bool
> @@ -31,3 +32,7 @@ config ACPI_VMGENID
>      bool
>      default y
>      depends on PC
> +
> +config ACPI_HMAT
> +    bool
> +    depends on ACPI
> diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
> index 661a9b8c2f..20cc2fb124 100644
> --- a/hw/acpi/Makefile.objs
> +++ b/hw/acpi/Makefile.objs
> @@ -6,6 +6,7 @@ common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o
>  common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o
>  common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
>  common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o
> +common-obj-$(CONFIG_ACPI_HMAT) += hmat.o
>  common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o
>  
>  common-obj-y += acpi_interface.o
> diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c
> new file mode 100644
> index 0000000000..6fd434c4d9
> --- /dev/null
> +++ b/hw/acpi/hmat.c
> @@ -0,0 +1,153 @@
> +/*
> + * HMAT ACPI Implementation
> + *
> + * Copyright(C) 2019 Intel Corporation.
> + *
> + * Author:
> + *  Liu jingqi <jingqi.liu@linux.intel.com>
> + *  Tao Xu <tao3.xu@intel.com>
> + *
> + * HMAT is defined in ACPI 6.2: 5.2.27 Heterogeneous Memory Attribute Table
> + * (HMAT)
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>
> + */
> +
> +#include "qemu/osdep.h"
> +#include "sysemu/numa.h"
> +#include "hw/acpi/hmat.h"
> +#include "hw/mem/pc-dimm.h"
> +
> +/* ACPI 6.2: 5.2.27.3 Memory Subsystem Address Range Structure: Table 5-141 */
> +static void build_hmat_spa(GArray *table_data, uint16_t flags,
> +                           uint64_t base, uint64_t length, int node)
> +{
> +
> +    /* Memory Subsystem Address Range Structure */
> +    /* Type */
> +    build_append_int_noprefix(table_data, 0, 2);
> +    /* Reserved */
> +    build_append_int_noprefix(table_data, 0, 2);
> +    /* Length */
> +    build_append_int_noprefix(table_data, 40, 4);
> +    /* Flags */
> +    build_append_int_noprefix(table_data, flags, 2);
> +    /* Reserved */
> +    build_append_int_noprefix(table_data, 0, 2);
> +    /* Process Proximity Domain */
> +    build_append_int_noprefix(table_data, node, 4);
> +    /* Memory Proximity Domain */
> +    build_append_int_noprefix(table_data, node, 4);
> +    /* Reserved */
> +    build_append_int_noprefix(table_data, 0, 4);
> +    /* System Physical Address Range Base */
> +    build_append_int_noprefix(table_data, base, 8);
> +    /* System Physical Address Range Length */
> +    build_append_int_noprefix(table_data, length, 8);
> +}
> +
> +static int pc_dimm_device_list(Object *obj, void *opaque)
> +{
> +    GSList **list = opaque;
> +
> +    if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
> +        DeviceState *dev = DEVICE(obj);
> +        if (dev->realized) { /* only realized memory devices matter */
> +            *list = g_slist_append(*list, DEVICE(obj));
> +        }
> +    }
> +
> +    object_child_foreach(obj, pc_dimm_device_list, opaque);
> +    return 0;
> +}
> +
> +/* Build HMAT sub table structures */
> +static void hmat_build_table_structs(GArray *table_data, MachineState *ms)
> +{
> +    GSList *device_list = NULL;
> +    uint16_t flags;
> +    uint64_t mem_base, mem_len;
> +    int i;
> +    NumaState *nstat = ms->numa_state;
> +    NumaMemRange *mem_range;
> +
> +    Object *obj = object_resolve_path_type("", TYPE_ACPI_DEVICE_IF, NULL);
> +    AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
> +    AcpiDeviceIf *adev = ACPI_DEVICE_IF(obj);
> +
> +    /*
> +     * ACPI 6.2: 5.2.27.3 Memory Subsystem Address Range Structure:
> +     * Table 5-141. The Proximity Domain of System Physical Address
> +     * ranges defined in the HMAT, NFIT and SRAT tables shall match
> +     * each other.
> +     */
> +    if (nstat->num_nodes && !nstat->mem_ranges_num) {
> +        nstat->mem_ranges = g_array_new(false, true /* clear */,
> +                                        sizeof *mem_range);
> +        adevc->build_mem_ranges(adev, ms);
another place you are tying to initialize nstat->mem_ranges
make initialization in generic numa init code

> +    }
> +
> +    for (i = 0; i < nstat->mem_ranges_num; i++) {
> +        mem_range = &g_array_index(nstat->mem_ranges, NumaMemRange, i);
> +        flags = 0;
> +
> +        if (nstat->nodes[mem_range->node].is_initiator) {
> +            flags |= HMAT_SPA_PROC_VALID;
> +        }
> +        if (nstat->nodes[mem_range->node].is_target) {
> +            flags |= HMAT_SPA_MEM_VALID;
> +        }
> +
> +        build_hmat_spa(table_data, flags, mem_range->base,
> +                       mem_range->length,
> +                       mem_range->node);
> +    }
> +
> +    /* Build HMAT SPA structures for PC-DIMM devices. */
> +    object_child_foreach(OBJECT(ms), pc_dimm_device_list, &device_list);
> +
> +    for (; device_list; device_list = device_list->next) {
> +        PCDIMMDevice *dimm = device_list->data;
> +        mem_base = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
> +                                            NULL);
> +        mem_len = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
> +                                           NULL);
> +        i = object_property_get_uint(OBJECT(dimm), PC_DIMM_NODE_PROP, NULL);
> +        flags = 0;
> +
> +        if (nstat->nodes[i].is_initiator) {
> +            flags |= HMAT_SPA_PROC_VALID;
> +        }
> +        if (nstat->nodes[i].is_target) {
> +            flags |= HMAT_SPA_MEM_VALID;
> +        }
> +        build_hmat_spa(table_data, flags, mem_base, mem_len, i);
> +    }
Don't you need to free device_list at this point?

> +}
> +
> +void build_hmat(GArray *table_data, BIOSLinker *linker, MachineState *ms)
> +{
> +    uint64_t hmat_start;
> +
> +    hmat_start = table_data->len;
> +
> +    /* reserve space for HMAT header  */
> +    acpi_data_push(table_data, 40);
> +
> +    hmat_build_table_structs(table_data, ms);
> +
> +    build_header(linker, table_data,
> +                 (void *)(table_data->data + hmat_start),
> +                 "HMAT", table_data->len - hmat_start, 1, NULL, NULL);
> +}
> diff --git a/hw/acpi/hmat.h b/hw/acpi/hmat.h
> new file mode 100644
> index 0000000000..e24b673fad
> --- /dev/null
> +++ b/hw/acpi/hmat.h
> @@ -0,0 +1,43 @@
> +/*
> + * HMAT ACPI Implementation Header
> + *
> + * Copyright(C) 2019 Intel Corporation.
> + *
> + * Author:
> + *  Liu jingqi <jingqi.liu@linux.intel.com>
> + *  Tao Xu <tao3.xu@intel.com>
> + *
> + * HMAT is defined in ACPI 6.2.
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>
> + */
> +
> +#ifndef HMAT_H
> +#define HMAT_H
> +
> +#include "hw/acpi/acpi-defs.h"
> +#include "hw/acpi/acpi.h"
> +#include "hw/acpi/bios-linker-loader.h"
> +#include "hw/acpi/aml-build.h"
> +
> +/* the values of AcpiHmatSpaRange flag */
> +enum {
> +    HMAT_SPA_PROC_VALID       = 0x1,
> +    HMAT_SPA_MEM_VALID        = 0x2,
> +    HMAT_SPA_RESERVATION_HINT = 0x4,
> +};
> +
> +void build_hmat(GArray *table_data, BIOSLinker *linker, MachineState *ms);
> +
> +#endif
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index 14b29de0a9..2ad09ec23e 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -646,6 +646,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
>                                 const CpuInstanceProperties *props, Error **errp)
>  {
>      MachineClass *mc = MACHINE_GET_CLASS(machine);
> +    NodeInfo *numa_info = machine->numa_state->nodes;
>      bool match = false;
>      int i;
>  
> @@ -706,6 +707,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
>          match = true;
>          slot->props.node_id = props->node_id;
>          slot->props.has_node_id = props->has_node_id;
> +        numa_info[props->node_id].is_initiator = true;
>      }
>  
>      if (!match) {
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 44dd447fa5..6584eac76e 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -66,6 +66,7 @@
>  #include "hw/i386/intel_iommu.h"
>  
>  #include "hw/acpi/ipmi.h"
> +#include "hw/acpi/hmat.h"
>  
>  /* These are used to size the ACPI tables for -M pc-i440fx-1.7 and
>   * -M pc-i440fx-2.0.  Even if the actual amount of AML generated grows
> @@ -2710,6 +2711,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
>              acpi_add_table(table_offsets, tables_blob);
>              build_slit(tables_blob, tables->linker, machine);
>          }
> +        acpi_add_table(table_offsets, tables_blob);
> +        build_hmat(tables_blob, tables->linker, machine);
I'm not sure if we should add it unconditionally.
Is this table used in any meaningful manner by guest when
it's incomplete (i.e. populated only with SPA records)?

>      }
>      if (acpi_get_mcfg(&mcfg)) {
>          acpi_add_table(table_offsets, tables_blob);
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index e3c85b77bc..13cff59112 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -10,6 +10,8 @@ struct NodeInfo {
>      uint64_t node_mem;
>      struct HostMemoryBackend *node_memdev;
>      bool present;
> +    bool is_initiator;
> +    bool is_target;
>      uint8_t distance[MAX_NODES];
>  };
>  
> diff --git a/numa.c b/numa.c
> index d23e130bce..5556d118c3 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -102,6 +102,10 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>          }
>      }
>  
> +    if (node->cpus) {
> +        numa_info[nodenr].is_initiator = true;
> +    }
> +
>      if (node->has_mem && node->has_memdev) {
>          error_setg(errp, "cannot specify both mem= and memdev=");
>          return;
> @@ -118,6 +122,7 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>  
>      if (node->has_mem) {
>          numa_info[nodenr].node_mem = node->mem;
> +        numa_info[nodenr].is_target = true;
>      }
>      if (node->has_memdev) {
>          Object *o;
> @@ -130,6 +135,7 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>          object_ref(o);
>          numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
>          numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
> +        numa_info[nodenr].is_target = true;
>      }
>      numa_info[nodenr].present = true;
>      max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
Tao Xu July 2, 2019, 1:14 a.m. UTC | #4
On 7/1/2019 7:25 PM, Igor Mammedov wrote:
> On Fri, 14 Jun 2019 23:56:24 +0800
> Tao Xu <tao3.xu@intel.com> wrote:
> 
>> From: Liu Jingqi <jingqi.liu@intel.com>
>>
>> HMAT is defined in ACPI 6.2: 5.2.27 Heterogeneous Memory Attribute Table (HMAT).
>> The specification references below link:
>> http://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
>>
>> It describes the memory attributes, such as memory side cache
>> attributes and bandwidth and latency details, related to the
>> System Physical Address (SPA) Memory Ranges. The software is
>> expected to use this information as hint for optimization.
>>
>> This structure describes the System Physical Address(SPA) range
>> occupied by memory subsystem and its associativity with processor
>> proximity domain as well as hint for memory usage.
>>
>> Signed-off-by: Liu Jingqi <jingqi.liu@intel.com>
>> Signed-off-by: Tao Xu <tao3.xu@intel.com>
>> ---
>>
>> Changes in v5 -> v4:
>>      - Add more descriptions from ACPI spec (Igor)
>>      - Remove all the dependcy on PCMachineState (Igor)
>> ---
>>   hw/acpi/Kconfig       |   5 ++
>>   hw/acpi/Makefile.objs |   1 +
>>   hw/acpi/hmat.c        | 153 ++++++++++++++++++++++++++++++++++++++++++
>>   hw/acpi/hmat.h        |  43 ++++++++++++
>>   hw/core/machine.c     |   2 +
>>   hw/i386/acpi-build.c  |   3 +
>>   include/sysemu/numa.h |   2 +
>>   numa.c                |   6 ++
>>   8 files changed, 215 insertions(+)
>>   create mode 100644 hw/acpi/hmat.c
>>   create mode 100644 hw/acpi/hmat.h
>>
>> diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
>> index 7c59cf900b..039bb99efa 100644
>> --- a/hw/acpi/Kconfig
>> +++ b/hw/acpi/Kconfig
>> @@ -7,6 +7,7 @@ config ACPI_X86
>>       select ACPI_NVDIMM
>>       select ACPI_CPU_HOTPLUG
>>       select ACPI_MEMORY_HOTPLUG
>> +    select ACPI_HMAT
>>   
>>   config ACPI_X86_ICH
>>       bool
>> @@ -31,3 +32,7 @@ config ACPI_VMGENID
>>       bool
>>       default y
>>       depends on PC
>> +
>> +config ACPI_HMAT
>> +    bool
>> +    depends on ACPI
>> diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
>> index 661a9b8c2f..20cc2fb124 100644
>> --- a/hw/acpi/Makefile.objs
>> +++ b/hw/acpi/Makefile.objs
>> @@ -6,6 +6,7 @@ common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o
>>   common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o
>>   common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
>>   common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o
>> +common-obj-$(CONFIG_ACPI_HMAT) += hmat.o
>>   common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o
>>   
>>   common-obj-y += acpi_interface.o
>> diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c
>> new file mode 100644
>> index 0000000000..6fd434c4d9
>> --- /dev/null
>> +++ b/hw/acpi/hmat.c
>> @@ -0,0 +1,153 @@
>> +/*
>> + * HMAT ACPI Implementation
>> + *
>> + * Copyright(C) 2019 Intel Corporation.
>> + *
>> + * Author:
>> + *  Liu jingqi <jingqi.liu@linux.intel.com>
>> + *  Tao Xu <tao3.xu@intel.com>
>> + *
>> + * HMAT is defined in ACPI 6.2: 5.2.27 Heterogeneous Memory Attribute Table
>> + * (HMAT)
>> + *
>> + * This library is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2 of the License, or (at your option) any later version.
>> + *
>> + * This library is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with this library; if not, see <http://www.gnu.org/licenses/>
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "sysemu/numa.h"
>> +#include "hw/acpi/hmat.h"
>> +#include "hw/mem/pc-dimm.h"
>> +
>> +/* ACPI 6.2: 5.2.27.3 Memory Subsystem Address Range Structure: Table 5-141 */
>> +static void build_hmat_spa(GArray *table_data, uint16_t flags,
>> +                           uint64_t base, uint64_t length, int node)
>> +{
>> +
>> +    /* Memory Subsystem Address Range Structure */
>> +    /* Type */
>> +    build_append_int_noprefix(table_data, 0, 2);
>> +    /* Reserved */
>> +    build_append_int_noprefix(table_data, 0, 2);
>> +    /* Length */
>> +    build_append_int_noprefix(table_data, 40, 4);
>> +    /* Flags */
>> +    build_append_int_noprefix(table_data, flags, 2);
>> +    /* Reserved */
>> +    build_append_int_noprefix(table_data, 0, 2);
>> +    /* Process Proximity Domain */
>> +    build_append_int_noprefix(table_data, node, 4);
>> +    /* Memory Proximity Domain */
>> +    build_append_int_noprefix(table_data, node, 4);
>> +    /* Reserved */
>> +    build_append_int_noprefix(table_data, 0, 4);
>> +    /* System Physical Address Range Base */
>> +    build_append_int_noprefix(table_data, base, 8);
>> +    /* System Physical Address Range Length */
>> +    build_append_int_noprefix(table_data, length, 8);
>> +}
>> +
>> +static int pc_dimm_device_list(Object *obj, void *opaque)
>> +{
>> +    GSList **list = opaque;
>> +
>> +    if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
>> +        DeviceState *dev = DEVICE(obj);
>> +        if (dev->realized) { /* only realized memory devices matter */
>> +            *list = g_slist_append(*list, DEVICE(obj));
>> +        }
>> +    }
>> +
>> +    object_child_foreach(obj, pc_dimm_device_list, opaque);
>> +    return 0;
>> +}
>> +
>> +/* Build HMAT sub table structures */
>> +static void hmat_build_table_structs(GArray *table_data, MachineState *ms)
>> +{
>> +    GSList *device_list = NULL;
>> +    uint16_t flags;
>> +    uint64_t mem_base, mem_len;
>> +    int i;
>> +    NumaState *nstat = ms->numa_state;
>> +    NumaMemRange *mem_range;
>> +
>> +    Object *obj = object_resolve_path_type("", TYPE_ACPI_DEVICE_IF, NULL);
>> +    AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
>> +    AcpiDeviceIf *adev = ACPI_DEVICE_IF(obj);
>> +
>> +    /*
>> +     * ACPI 6.2: 5.2.27.3 Memory Subsystem Address Range Structure:
>> +     * Table 5-141. The Proximity Domain of System Physical Address
>> +     * ranges defined in the HMAT, NFIT and SRAT tables shall match
>> +     * each other.
>> +     */
>> +    if (nstat->num_nodes && !nstat->mem_ranges_num) {
>> +        nstat->mem_ranges = g_array_new(false, true /* clear */,
>> +                                        sizeof *mem_range);
>> +        adevc->build_mem_ranges(adev, ms);
> another place you are tying to initialize nstat->mem_ranges
> make initialization in generic numa init code
> 
>> +    }
>> +
>> +    for (i = 0; i < nstat->mem_ranges_num; i++) {
>> +        mem_range = &g_array_index(nstat->mem_ranges, NumaMemRange, i);
>> +        flags = 0;
>> +
>> +        if (nstat->nodes[mem_range->node].is_initiator) {
>> +            flags |= HMAT_SPA_PROC_VALID;
>> +        }
>> +        if (nstat->nodes[mem_range->node].is_target) {
>> +            flags |= HMAT_SPA_MEM_VALID;
>> +        }
>> +
>> +        build_hmat_spa(table_data, flags, mem_range->base,
>> +                       mem_range->length,
>> +                       mem_range->node);
>> +    }
>> +
>> +    /* Build HMAT SPA structures for PC-DIMM devices. */
>> +    object_child_foreach(OBJECT(ms), pc_dimm_device_list, &device_list);
>> +
>> +    for (; device_list; device_list = device_list->next) {
>> +        PCDIMMDevice *dimm = device_list->data;
>> +        mem_base = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
>> +                                            NULL);
>> +        mem_len = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
>> +                                           NULL);
>> +        i = object_property_get_uint(OBJECT(dimm), PC_DIMM_NODE_PROP, NULL);
>> +        flags = 0;
>> +
>> +        if (nstat->nodes[i].is_initiator) {
>> +            flags |= HMAT_SPA_PROC_VALID;
>> +        }
>> +        if (nstat->nodes[i].is_target) {
>> +            flags |= HMAT_SPA_MEM_VALID;
>> +        }
>> +        build_hmat_spa(table_data, flags, mem_base, mem_len, i);
>> +    }
> Don't you need to free device_list at this point?
> 

Thank you for your suggestion, I will correct it.
>> +}
>> +
>> +void build_hmat(GArray *table_data, BIOSLinker *linker, MachineState *ms)
>> +{
>> +    uint64_t hmat_start;
>> +
>> +    hmat_start = table_data->len;
>> +
>> +    /* reserve space for HMAT header  */
>> +    acpi_data_push(table_data, 40);
>> +
>> +    hmat_build_table_structs(table_data, ms);
>> +
>> +    build_header(linker, table_data,
>> +                 (void *)(table_data->data + hmat_start),
>> +                 "HMAT", table_data->len - hmat_start, 1, NULL, NULL);
>> +}
>> diff --git a/hw/acpi/hmat.h b/hw/acpi/hmat.h
>> new file mode 100644
>> index 0000000000..e24b673fad
>> --- /dev/null
>> +++ b/hw/acpi/hmat.h
>> @@ -0,0 +1,43 @@
>> +/*
>> + * HMAT ACPI Implementation Header
>> + *
>> + * Copyright(C) 2019 Intel Corporation.
>> + *
>> + * Author:
>> + *  Liu jingqi <jingqi.liu@linux.intel.com>
>> + *  Tao Xu <tao3.xu@intel.com>
>> + *
>> + * HMAT is defined in ACPI 6.2.
>> + *
>> + * This library is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2 of the License, or (at your option) any later version.
>> + *
>> + * This library is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with this library; if not, see <http://www.gnu.org/licenses/>
>> + */
>> +
>> +#ifndef HMAT_H
>> +#define HMAT_H
>> +
>> +#include "hw/acpi/acpi-defs.h"
>> +#include "hw/acpi/acpi.h"
>> +#include "hw/acpi/bios-linker-loader.h"
>> +#include "hw/acpi/aml-build.h"
>> +
>> +/* the values of AcpiHmatSpaRange flag */
>> +enum {
>> +    HMAT_SPA_PROC_VALID       = 0x1,
>> +    HMAT_SPA_MEM_VALID        = 0x2,
>> +    HMAT_SPA_RESERVATION_HINT = 0x4,
>> +};
>> +
>> +void build_hmat(GArray *table_data, BIOSLinker *linker, MachineState *ms);
>> +
>> +#endif
>> diff --git a/hw/core/machine.c b/hw/core/machine.c
>> index 14b29de0a9..2ad09ec23e 100644
>> --- a/hw/core/machine.c
>> +++ b/hw/core/machine.c
>> @@ -646,6 +646,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
>>                                  const CpuInstanceProperties *props, Error **errp)
>>   {
>>       MachineClass *mc = MACHINE_GET_CLASS(machine);
>> +    NodeInfo *numa_info = machine->numa_state->nodes;
>>       bool match = false;
>>       int i;
>>   
>> @@ -706,6 +707,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
>>           match = true;
>>           slot->props.node_id = props->node_id;
>>           slot->props.has_node_id = props->has_node_id;
>> +        numa_info[props->node_id].is_initiator = true;
>>       }
>>   
>>       if (!match) {
>> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
>> index 44dd447fa5..6584eac76e 100644
>> --- a/hw/i386/acpi-build.c
>> +++ b/hw/i386/acpi-build.c
>> @@ -66,6 +66,7 @@
>>   #include "hw/i386/intel_iommu.h"
>>   
>>   #include "hw/acpi/ipmi.h"
>> +#include "hw/acpi/hmat.h"
>>   
>>   /* These are used to size the ACPI tables for -M pc-i440fx-1.7 and
>>    * -M pc-i440fx-2.0.  Even if the actual amount of AML generated grows
>> @@ -2710,6 +2711,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
>>               acpi_add_table(table_offsets, tables_blob);
>>               build_slit(tables_blob, tables->linker, machine);
>>           }
>> +        acpi_add_table(table_offsets, tables_blob);
>> +        build_hmat(tables_blob, tables->linker, machine);
> I'm not sure if we should add it unconditionally.
> Is this table used in any meaningful manner by guest when
> it's incomplete (i.e. populated only with SPA records)?
> 
>>       }
>>       if (acpi_get_mcfg(&mcfg)) {
>>           acpi_add_table(table_offsets, tables_blob);
>> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
>> index e3c85b77bc..13cff59112 100644
>> --- a/include/sysemu/numa.h
>> +++ b/include/sysemu/numa.h
>> @@ -10,6 +10,8 @@ struct NodeInfo {
>>       uint64_t node_mem;
>>       struct HostMemoryBackend *node_memdev;
>>       bool present;
>> +    bool is_initiator;
>> +    bool is_target;
>>       uint8_t distance[MAX_NODES];
>>   };
>>   
>> diff --git a/numa.c b/numa.c
>> index d23e130bce..5556d118c3 100644
>> --- a/numa.c
>> +++ b/numa.c
>> @@ -102,6 +102,10 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>>           }
>>       }
>>   
>> +    if (node->cpus) {
>> +        numa_info[nodenr].is_initiator = true;
>> +    }
>> +
>>       if (node->has_mem && node->has_memdev) {
>>           error_setg(errp, "cannot specify both mem= and memdev=");
>>           return;
>> @@ -118,6 +122,7 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>>   
>>       if (node->has_mem) {
>>           numa_info[nodenr].node_mem = node->mem;
>> +        numa_info[nodenr].is_target = true;
>>       }
>>       if (node->has_memdev) {
>>           Object *o;
>> @@ -130,6 +135,7 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>>           object_ref(o);
>>           numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
>>           numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
>> +        numa_info[nodenr].is_target = true;
>>       }
>>       numa_info[nodenr].present = true;
>>       max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
>
Tao Xu July 2, 2019, 8:50 a.m. UTC | #5
On 7/1/2019 7:25 PM, Igor Mammedov wrote:
> On Fri, 14 Jun 2019 23:56:24 +0800
> Tao Xu <tao3.xu@intel.com> wrote:
> 
...
>> @@ -2710,6 +2711,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
>>               acpi_add_table(table_offsets, tables_blob);
>>               build_slit(tables_blob, tables->linker, machine);
>>           }
>> +        acpi_add_table(table_offsets, tables_blob);
>> +        build_hmat(tables_blob, tables->linker, machine);
> I'm not sure if we should add it unconditionally.
> Is this table used in any meaningful manner by guest when
> it's incomplete (i.e. populated only with SPA records)?
> 
Hi Igor,

In ACPI 6.2, the linux kernel use it to show the memory ranges' 
node-id(Proximity Domain). In ACPI 6.3, the linux kernel use it to show 
the numa node's closest initiator(Generic Initiator or Processor, directly
attached). It is useful for a memory only numa node, because with 
SPA(renamed as "Memory Proximity Domain Attributes Structure" in ACPI 
6.3) the user-space can know the topology of hardware heterogeneous 
memory. I think I should add a doc to describe the use case in QEMU.

Therefore, the numa CLI may be lack of a input which can indicate the 
Initiator of a memory only numa node. Dan suggested me to add a new 
parameter for that[1].

Maybe like:
-numa node,mem=4G,nodeid=2,initiator=0

[1] https://patchwork.kernel.org/cover/10934417/

Thanks

Tao
Igor Mammedov July 8, 2019, 9:09 a.m. UTC | #6
On Tue, 2 Jul 2019 16:50:24 +0800
Tao Xu <tao3.xu@intel.com> wrote:

> On 7/1/2019 7:25 PM, Igor Mammedov wrote:
> > On Fri, 14 Jun 2019 23:56:24 +0800
> > Tao Xu <tao3.xu@intel.com> wrote:
> >   
> ...
> >> @@ -2710,6 +2711,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
> >>               acpi_add_table(table_offsets, tables_blob);
> >>               build_slit(tables_blob, tables->linker, machine);
> >>           }
> >> +        acpi_add_table(table_offsets, tables_blob);
> >> +        build_hmat(tables_blob, tables->linker, machine);  
> > I'm not sure if we should add it unconditionally.
> > Is this table used in any meaningful manner by guest when
> > it's incomplete (i.e. populated only with SPA records)?
> >   
> Hi Igor,
> 
> In ACPI 6.2, the linux kernel use it to show the memory ranges' 
> node-id(Proximity Domain). In ACPI 6.3, the linux kernel use it to show 
> the numa node's closest initiator(Generic Initiator or Processor, directly
> attached). It is useful for a memory only numa node, because with 
> SPA(renamed as "Memory Proximity Domain Attributes Structure" in ACPI 
> 6.3) the user-space can know the topology of hardware heterogeneous 
> memory. I think I should add a doc to describe the use case in QEMU.
Could you point out to me the specific kernel code that parses and uses HMAT?

> 
> Therefore, the numa CLI may be lack of a input which can indicate the 
> Initiator of a memory only numa node. Dan suggested me to add a new 
> parameter for that[1].
> 
> Maybe like:
> -numa node,mem=4G,nodeid=2,initiator=0
> 
> [1] https://patchwork.kernel.org/cover/10934417/
> 
> Thanks
> 
> Tao
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
>
Tao Xu July 9, 2019, 12:45 a.m. UTC | #7
On 7/8/2019 5:09 PM, Igor Mammedov wrote:
> On Tue, 2 Jul 2019 16:50:24 +0800
> Tao Xu <tao3.xu@intel.com> wrote:
> 
>> On 7/1/2019 7:25 PM, Igor Mammedov wrote:
>>> On Fri, 14 Jun 2019 23:56:24 +0800
>>> Tao Xu <tao3.xu@intel.com> wrote:
>>>    
>> ...
>>>> @@ -2710,6 +2711,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
>>>>                acpi_add_table(table_offsets, tables_blob);
>>>>                build_slit(tables_blob, tables->linker, machine);
>>>>            }
>>>> +        acpi_add_table(table_offsets, tables_blob);
>>>> +        build_hmat(tables_blob, tables->linker, machine);
>>> I'm not sure if we should add it unconditionally.
>>> Is this table used in any meaningful manner by guest when
>>> it's incomplete (i.e. populated only with SPA records)?
>>>    
>> Hi Igor,
>>
>> In ACPI 6.2, the linux kernel use it to show the memory ranges'
>> node-id(Proximity Domain). In ACPI 6.3, the linux kernel use it to show
>> the numa node's closest initiator(Generic Initiator or Processor, directly
>> attached). It is useful for a memory only numa node, because with
>> SPA(renamed as "Memory Proximity Domain Attributes Structure" in ACPI
>> 6.3) the user-space can know the topology of hardware heterogeneous
>> memory. I think I should add a doc to describe the use case in QEMU.
> Could you point out to me the specific kernel code that parses and uses HMAT?
> 

OK, it is in drivers/acpi/hmat/hmat.c

>>
>> Therefore, the numa CLI may be lack of a input which can indicate the
>> Initiator of a memory only numa node. Dan suggested me to add a new
>> parameter for that[1].
>>
>> Maybe like:
>> -numa node,mem=4G,nodeid=2,initiator=0
>>
>> [1] https://patchwork.kernel.org/cover/10934417/
>>
>> Thanks
>>
>> Tao
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>
diff mbox series

Patch

diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
index 7c59cf900b..039bb99efa 100644
--- a/hw/acpi/Kconfig
+++ b/hw/acpi/Kconfig
@@ -7,6 +7,7 @@  config ACPI_X86
     select ACPI_NVDIMM
     select ACPI_CPU_HOTPLUG
     select ACPI_MEMORY_HOTPLUG
+    select ACPI_HMAT
 
 config ACPI_X86_ICH
     bool
@@ -31,3 +32,7 @@  config ACPI_VMGENID
     bool
     default y
     depends on PC
+
+config ACPI_HMAT
+    bool
+    depends on ACPI
diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
index 661a9b8c2f..20cc2fb124 100644
--- a/hw/acpi/Makefile.objs
+++ b/hw/acpi/Makefile.objs
@@ -6,6 +6,7 @@  common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o
 common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o
 common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
 common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o
+common-obj-$(CONFIG_ACPI_HMAT) += hmat.o
 common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o
 
 common-obj-y += acpi_interface.o
diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c
new file mode 100644
index 0000000000..6fd434c4d9
--- /dev/null
+++ b/hw/acpi/hmat.c
@@ -0,0 +1,153 @@ 
+/*
+ * HMAT ACPI Implementation
+ *
+ * Copyright(C) 2019 Intel Corporation.
+ *
+ * Author:
+ *  Liu jingqi <jingqi.liu@linux.intel.com>
+ *  Tao Xu <tao3.xu@intel.com>
+ *
+ * HMAT is defined in ACPI 6.2: 5.2.27 Heterogeneous Memory Attribute Table
+ * (HMAT)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/numa.h"
+#include "hw/acpi/hmat.h"
+#include "hw/mem/pc-dimm.h"
+
+/* ACPI 6.2: 5.2.27.3 Memory Subsystem Address Range Structure: Table 5-141 */
+static void build_hmat_spa(GArray *table_data, uint16_t flags,
+                           uint64_t base, uint64_t length, int node)
+{
+
+    /* Memory Subsystem Address Range Structure */
+    /* Type */
+    build_append_int_noprefix(table_data, 0, 2);
+    /* Reserved */
+    build_append_int_noprefix(table_data, 0, 2);
+    /* Length */
+    build_append_int_noprefix(table_data, 40, 4);
+    /* Flags */
+    build_append_int_noprefix(table_data, flags, 2);
+    /* Reserved */
+    build_append_int_noprefix(table_data, 0, 2);
+    /* Process Proximity Domain */
+    build_append_int_noprefix(table_data, node, 4);
+    /* Memory Proximity Domain */
+    build_append_int_noprefix(table_data, node, 4);
+    /* Reserved */
+    build_append_int_noprefix(table_data, 0, 4);
+    /* System Physical Address Range Base */
+    build_append_int_noprefix(table_data, base, 8);
+    /* System Physical Address Range Length */
+    build_append_int_noprefix(table_data, length, 8);
+}
+
+static int pc_dimm_device_list(Object *obj, void *opaque)
+{
+    GSList **list = opaque;
+
+    if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
+        DeviceState *dev = DEVICE(obj);
+        if (dev->realized) { /* only realized memory devices matter */
+            *list = g_slist_append(*list, DEVICE(obj));
+        }
+    }
+
+    object_child_foreach(obj, pc_dimm_device_list, opaque);
+    return 0;
+}
+
+/* Build HMAT sub table structures */
+static void hmat_build_table_structs(GArray *table_data, MachineState *ms)
+{
+    GSList *device_list = NULL;
+    uint16_t flags;
+    uint64_t mem_base, mem_len;
+    int i;
+    NumaState *nstat = ms->numa_state;
+    NumaMemRange *mem_range;
+
+    Object *obj = object_resolve_path_type("", TYPE_ACPI_DEVICE_IF, NULL);
+    AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
+    AcpiDeviceIf *adev = ACPI_DEVICE_IF(obj);
+
+    /*
+     * ACPI 6.2: 5.2.27.3 Memory Subsystem Address Range Structure:
+     * Table 5-141. The Proximity Domain of System Physical Address
+     * ranges defined in the HMAT, NFIT and SRAT tables shall match
+     * each other.
+     */
+    if (nstat->num_nodes && !nstat->mem_ranges_num) {
+        nstat->mem_ranges = g_array_new(false, true /* clear */,
+                                        sizeof *mem_range);
+        adevc->build_mem_ranges(adev, ms);
+    }
+
+    for (i = 0; i < nstat->mem_ranges_num; i++) {
+        mem_range = &g_array_index(nstat->mem_ranges, NumaMemRange, i);
+        flags = 0;
+
+        if (nstat->nodes[mem_range->node].is_initiator) {
+            flags |= HMAT_SPA_PROC_VALID;
+        }
+        if (nstat->nodes[mem_range->node].is_target) {
+            flags |= HMAT_SPA_MEM_VALID;
+        }
+
+        build_hmat_spa(table_data, flags, mem_range->base,
+                       mem_range->length,
+                       mem_range->node);
+    }
+
+    /* Build HMAT SPA structures for PC-DIMM devices. */
+    object_child_foreach(OBJECT(ms), pc_dimm_device_list, &device_list);
+
+    for (; device_list; device_list = device_list->next) {
+        PCDIMMDevice *dimm = device_list->data;
+        mem_base = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
+                                            NULL);
+        mem_len = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
+                                           NULL);
+        i = object_property_get_uint(OBJECT(dimm), PC_DIMM_NODE_PROP, NULL);
+        flags = 0;
+
+        if (nstat->nodes[i].is_initiator) {
+            flags |= HMAT_SPA_PROC_VALID;
+        }
+        if (nstat->nodes[i].is_target) {
+            flags |= HMAT_SPA_MEM_VALID;
+        }
+        build_hmat_spa(table_data, flags, mem_base, mem_len, i);
+    }
+}
+
+void build_hmat(GArray *table_data, BIOSLinker *linker, MachineState *ms)
+{
+    uint64_t hmat_start;
+
+    hmat_start = table_data->len;
+
+    /* reserve space for HMAT header  */
+    acpi_data_push(table_data, 40);
+
+    hmat_build_table_structs(table_data, ms);
+
+    build_header(linker, table_data,
+                 (void *)(table_data->data + hmat_start),
+                 "HMAT", table_data->len - hmat_start, 1, NULL, NULL);
+}
diff --git a/hw/acpi/hmat.h b/hw/acpi/hmat.h
new file mode 100644
index 0000000000..e24b673fad
--- /dev/null
+++ b/hw/acpi/hmat.h
@@ -0,0 +1,43 @@ 
+/*
+ * HMAT ACPI Implementation Header
+ *
+ * Copyright(C) 2019 Intel Corporation.
+ *
+ * Author:
+ *  Liu jingqi <jingqi.liu@linux.intel.com>
+ *  Tao Xu <tao3.xu@intel.com>
+ *
+ * HMAT is defined in ACPI 6.2.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#ifndef HMAT_H
+#define HMAT_H
+
+#include "hw/acpi/acpi-defs.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/bios-linker-loader.h"
+#include "hw/acpi/aml-build.h"
+
+/* the values of AcpiHmatSpaRange flag */
+enum {
+    HMAT_SPA_PROC_VALID       = 0x1,
+    HMAT_SPA_MEM_VALID        = 0x2,
+    HMAT_SPA_RESERVATION_HINT = 0x4,
+};
+
+void build_hmat(GArray *table_data, BIOSLinker *linker, MachineState *ms);
+
+#endif
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 14b29de0a9..2ad09ec23e 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -646,6 +646,7 @@  void machine_set_cpu_numa_node(MachineState *machine,
                                const CpuInstanceProperties *props, Error **errp)
 {
     MachineClass *mc = MACHINE_GET_CLASS(machine);
+    NodeInfo *numa_info = machine->numa_state->nodes;
     bool match = false;
     int i;
 
@@ -706,6 +707,7 @@  void machine_set_cpu_numa_node(MachineState *machine,
         match = true;
         slot->props.node_id = props->node_id;
         slot->props.has_node_id = props->has_node_id;
+        numa_info[props->node_id].is_initiator = true;
     }
 
     if (!match) {
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 44dd447fa5..6584eac76e 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -66,6 +66,7 @@ 
 #include "hw/i386/intel_iommu.h"
 
 #include "hw/acpi/ipmi.h"
+#include "hw/acpi/hmat.h"
 
 /* These are used to size the ACPI tables for -M pc-i440fx-1.7 and
  * -M pc-i440fx-2.0.  Even if the actual amount of AML generated grows
@@ -2710,6 +2711,8 @@  void acpi_build(AcpiBuildTables *tables, MachineState *machine)
             acpi_add_table(table_offsets, tables_blob);
             build_slit(tables_blob, tables->linker, machine);
         }
+        acpi_add_table(table_offsets, tables_blob);
+        build_hmat(tables_blob, tables->linker, machine);
     }
     if (acpi_get_mcfg(&mcfg)) {
         acpi_add_table(table_offsets, tables_blob);
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index e3c85b77bc..13cff59112 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -10,6 +10,8 @@  struct NodeInfo {
     uint64_t node_mem;
     struct HostMemoryBackend *node_memdev;
     bool present;
+    bool is_initiator;
+    bool is_target;
     uint8_t distance[MAX_NODES];
 };
 
diff --git a/numa.c b/numa.c
index d23e130bce..5556d118c3 100644
--- a/numa.c
+++ b/numa.c
@@ -102,6 +102,10 @@  static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
         }
     }
 
+    if (node->cpus) {
+        numa_info[nodenr].is_initiator = true;
+    }
+
     if (node->has_mem && node->has_memdev) {
         error_setg(errp, "cannot specify both mem= and memdev=");
         return;
@@ -118,6 +122,7 @@  static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
 
     if (node->has_mem) {
         numa_info[nodenr].node_mem = node->mem;
+        numa_info[nodenr].is_target = true;
     }
     if (node->has_memdev) {
         Object *o;
@@ -130,6 +135,7 @@  static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
         object_ref(o);
         numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
+        numa_info[nodenr].is_target = true;
     }
     numa_info[nodenr].present = true;
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);