diff mbox series

[v8,13/13] acpi/ghes: check if the BIOS pointers for HEST are correct

Message ID 52e6058feba318d01f54da6dca427b40ea5c9435.1723793768.git.mchehab+huawei@kernel.org
State New
Headers show
Series Add ACPI CPER firmware first error injection on ARM emulation | expand

Commit Message

Mauro Carvalho Chehab Aug. 16, 2024, 7:37 a.m. UTC
The OS kernels navigate between the HEST, the error source structure
and the CPER by means of pointers. Double-check that such
pointers were properly initialized, ensuring that they match
the right address for the CPER.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 hw/acpi/ghes.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

Comments

Igor Mammedov Aug. 19, 2024, 2:07 p.m. UTC | #1
On Fri, 16 Aug 2024 09:37:45 +0200
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> The OS kernels navigate between HEST, error source struct
> and CPER by the usage of some pointers. Double-check if such
> pointers were properly initializing, ensuring that they match
> the right address for CPER.

As QEMU, we don't care about what the guest wrote into those addresses
(i.e. it's not the hardware's business), even if QEMU later tramples
on the wrong guest memory (it's the guest's responsibility to do the init right).

However, this patch introduces a use for hest_addr_le, which is what I was looking for.
See notes below.

> 
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> ---
>  hw/acpi/ghes.c | 30 +++++++++++++++++++++++++++++-
>  1 file changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index a822a5eafaa0..51e2e40e5a9c 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -85,6 +85,9 @@ enum AcpiHestSourceId {
>  #define HEST_GHES_V2_TABLE_SIZE  92
>  #define GHES_ACK_OFFSET          (64 + GAS_ADDR_OFFSET + ACPI_HEST_HEADER_SIZE)
>  
> +/* ACPI 6.2: 18.3.2.7: Generic Hardware Error Source */
> +#define GHES_ERR_ST_ADDR_OFFSET  (20 + GAS_ADDR_OFFSET + ACPI_HEST_HEADER_SIZE)
> +
>  /*
>   * Values for error_severity field
>   */
> @@ -425,7 +428,10 @@ NotifierList acpi_generic_error_notifiers =
>  void ghes_record_cper_errors(const void *cper, size_t len,
>                               enum AcpiGhesNotifyType notify, Error **errp)
>  {
> -    uint64_t cper_addr, read_ack_start_addr;
> +    uint64_t hest_read_ack_start_addr, read_ack_start_addr;
> +    uint64_t read_ack_start_addr_2, err_source_struct;
> +    uint64_t hest_err_block_addr, error_block_addr;
> +    uint64_t cper_addr, cper_addr_2;
>      enum AcpiHestSourceId source;
>      AcpiGedState *acpi_ged_state;
>      AcpiGhesState *ags;
> @@ -450,6 +456,28 @@ void ghes_record_cper_errors(const void *cper, size_t len,
>      cper_addr += ACPI_HEST_SRC_ID_COUNT * sizeof(uint64_t);
>      cper_addr += source * ACPI_GHES_MAX_RAW_DATA_LENGTH;
>  
> +    err_source_struct = le64_to_cpu(ags->hest_addr_le) +
> +                        source * HEST_GHES_V2_TABLE_SIZE;

There are no guarantees that the HEST table will contain only GHESv2 sources,
and once a source of another type is added this place becomes broken.

We need to iterate over the HEST taking that into account
and find only the GHESv2 structure with the source ID of interest.

This function (and acpi_ghes_record_errors() as well), taking source_id
as input, should be able to look up pointers from the HEST in guest RAM.
A very crude idea could look something like this:

typedef struct hest_source_type2len{
   uint16_t type
   int len
} hest_structure_type2len

hest_structure_type2len supported_hest_sources[] = {
    /* Table 18-344 Generic Hardware Error Source version 2 (GHESv2) Structure */
    {.type = 10, .len = 92},
}

uint64_t find_error_source(src_id) {
    uint32_t struct_offset = hest_header_size;
    uint16_t type, id
    do {
       addr = ags->hest_addr_le + struct_offset
 
       cpu_physical_memory_read(addr, &id)
       if (src_id == id)
         return addr

       cpu_physical_memory_read(addr, &type)
       struct_offset += get_len_from_supported_hest_sources(type)
    } while(struct_offset < hest_len)
    assert if not found
}

uint64_t get_error_status_block_addr(src_id) {
   struct_addr = find_error_source(src_id) 
   hest_err_block_addr =   struct_addr + GHES_ERR_ST_ADDR_OFFSET
   // read intermediate pointer to status block addr pointer in hw table
   cpu_physical_memory_read(hest_err_block_addr, &error_block_addr)
   // read actual pointer to status block
   cpu_physical_memory_read(error_block_addr, &error_status_block_addr)
   return error_status_block_addr
}
 
Ditto for read_ack, minus the extra indirection that we have for error_status_block_addr.

This way we can easily map source id to error status block
and find needed addresses using pointer info from guest RAM
without fragile pointer math and assumptions which might go wrong
when new error sources are added and regardless of the order they
are being added.

> +    /* Check if BIOS addr pointers were properly generated */
> +
> +    hest_err_block_addr = err_source_struct + GHES_ERR_ST_ADDR_OFFSET;
> +    hest_read_ack_start_addr = err_source_struct + GHES_ACK_OFFSET;
> +
> +    cpu_physical_memory_read(hest_err_block_addr, &error_block_addr,
> +                             sizeof(error_block_addr));
> +
> +    cpu_physical_memory_read(error_block_addr, &cper_addr_2,
> +                             sizeof(error_block_addr));
> +
> +    cpu_physical_memory_read(hest_read_ack_start_addr, &read_ack_start_addr_2,
> +			     sizeof(read_ack_start_addr_2));
> +
> +    assert(cper_addr == cper_addr_2);
> +    assert(read_ack_start_addr == read_ack_start_addr_2);
> +
> +    /* Update ACK offset to notify about a new error */
> +
>      cpu_physical_memory_read(read_ack_start_addr,
>                               &read_ack, sizeof(uint64_t));
>
Mauro Carvalho Chehab Aug. 24, 2024, 12:15 a.m. UTC | #2
Em Mon, 19 Aug 2024 16:07:33 +0200
Igor Mammedov <imammedo@redhat.com> escreveu:

> > +    err_source_struct = le64_to_cpu(ags->hest_addr_le) +
> > +                        source * HEST_GHES_V2_TABLE_SIZE;  
> 
> there is no guaranties that HEST table will contain only GHESv2 sources,
> and once such is added this place becomes broken.
> 
> we need to iterate over HEST taking that into account
> and find only ghesv2 structure with source id of interest.
> 
> This function (and acpi_ghes_record_errors() as well) taking source_id
> as input should be able to lookup pointers from HEST in guest RAM,
> very crude idea could look something like this:
> 
> typedef struct hest_source_type2len{
>    uint16_t type
>    int len
> } hest_structure_type2len
> 
> hest_structure_type2len supported_hest_sources[] = {
>     /* Table 18-344 Generic Hardware Error Source version 2 (GHESv2) Structure */
>     {.type = 10, .len = 92},
> }

Sounds interesting, but IMO it should be done only when other types besides
GHES are added, as:

1. Right now, the file is acpi/ghes.c. Adding non-type 10 HEST structures
   there would be a little weird. It should likely be renamed to acpi/hest.c
   when such time comes.

2. ACPI 6.5 has made clear that the above will only work up to type 11,
   as, from type 12 onward, the length is added to the error source
   structure, according to:

   https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#error-source-structure-header-type-12-onward

3. some types have variable size. Starting from the beginning, type 0, as
   defined at:
   https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#hardware-errors-and-error-sources

   has:

   size = 40 + 24 * Number of Hardware banks

   So, a simple table like the above with fixed sizes won't work.

   The code would need instead a switch if types are <= 11.

   Adding proper support for all 12 already-defined types sounds like a lot
   of work, as the code would need to calculate the size depending on the
   type, and we don't really initialize the HEST table with any types
   other than GHES.

Ok, we could still do something like this pseudo-code to get the
error source offset:

	#define ACPI_HEST_TYPE_GHESV2	10

	err_struct_offset = 0;
	for (i = 0; i < source_id_count; i++) {
		/* NOTE: Other types may have different sizes */
		assert(ghes[i].type == ACPI_HEST_TYPE_GHESV2);
		if (ghes[i].source_id == source_id)
			break;
		err_struct_offset += HEST_GHES_V2_TABLE_SIZE;
	}
	assert (i < source_id_count);

---

That said, maybe this will just add unwanted complexity, as QEMU
is already setting those offsets via bios_linker_loader_add_pointer().

So, an alternative is to merge the code in patch 13 with the one
in patch 5, dropping the math calculations there and relying on QEMU
to always handle the BIOS links properly.

See, the logic which constructs GHESv2 source IDs does this to create
the links between the HEST ACPI table and etc/hardware_errors:

with:

Per-source ID logic at build_ghes_v2():

    address_offset = table_data->len;
    /* Error Status Address */
    build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 0x40, 0,
                     4 /* QWord access */, 0);
    bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
                                   address_offset + GAS_ADDR_OFFSET,
                                   sizeof(uint64_t),
                                   ACPI_HW_ERROR_FW_CFG_FILE,
                                   source_id * sizeof(uint64_t));
...
    /*
     * Read Ack Register
     * ACPI 6.1: 18.3.2.8 Generic Hardware Error Source
     * version 2 (GHESv2 - Type 10)
     */
    address_offset = table_data->len;
    build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 0x40, 0,
                     4 /* QWord access */, 0);
    bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
                                   address_offset + GAS_ADDR_OFFSET,
                                   sizeof(uint64_t),
                                   ACPI_HW_ERROR_FW_CFG_FILE,
                                   (ACPI_HEST_SRC_ID_COUNT + source_id) *
                                   sizeof(uint64_t));

HEST table creation logic inside build_ghes_error_table():

    for (i = 0; i < ACPI_HEST_SRC_ID_COUNT; i++) {
        /*
         * Tell firmware to patch error_block_address entries to point to
         * corresponding "Generic Error Status Block"
         */
        bios_linker_loader_add_pointer(linker,
            ACPI_HW_ERROR_FW_CFG_FILE, sizeof(uint64_t) * i,
            sizeof(uint64_t), ACPI_HW_ERROR_FW_CFG_FILE,
            error_status_block_offset + i * ACPI_GHES_MAX_RAW_DATA_LENGTH);
    }

Using those, locating the CPER and ack addresses is easy and won't
require any math:

	/* GHESv2 CPER offset */
	cpu_physical_memory_read(hest_err_block_addr, &error_block_addr,
                                 sizeof(error_block_addr));
	cpu_physical_memory_read(error_block_addr, &cper_addr,
                                 sizeof(error_block_addr));

	/* GHESv2 ack offset */
	cpu_physical_memory_read(hest_read_ack_start_addr, &read_ack_start_addr,
			         sizeof(read_ack_start_addr));


Regards,
Mauro
Mauro Carvalho Chehab Aug. 25, 2024, 3:48 a.m. UTC | #3
Em Sat, 24 Aug 2024 02:15:10 +0200
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> escreveu:

> Ok, we could still do something like this pseudo-code to get the
> error source offset:
> 
> 	#define ACPI_HEST_TYPE_GHESV2	11
> 
> 	err_struct_offset = 0;
> 	for (i = 0; i < source_id_count; i++) {
> 		/* NOTE: Other types may have different sizes */
> 		assert(ghes[i].type == ACPI_HEST_TYPE_GHESV2);
> 		if (ghes[i].source_id == source_id)
> 			break;
> 		err_struct_offset += HEST_GHES_V2_TABLE_SIZE;
> 	}
> 	assert (i < source_id_count);

This is what I ended up implementing in v9.

Regards,
Mauro
diff mbox series

Patch

diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index a822a5eafaa0..51e2e40e5a9c 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -85,6 +85,9 @@  enum AcpiHestSourceId {
 #define HEST_GHES_V2_TABLE_SIZE  92
 #define GHES_ACK_OFFSET          (64 + GAS_ADDR_OFFSET + ACPI_HEST_HEADER_SIZE)
 
+/* ACPI 6.2: 18.3.2.7: Generic Hardware Error Source */
+#define GHES_ERR_ST_ADDR_OFFSET  (20 + GAS_ADDR_OFFSET + ACPI_HEST_HEADER_SIZE)
+
 /*
  * Values for error_severity field
  */
@@ -425,7 +428,10 @@  NotifierList acpi_generic_error_notifiers =
 void ghes_record_cper_errors(const void *cper, size_t len,
                              enum AcpiGhesNotifyType notify, Error **errp)
 {
-    uint64_t cper_addr, read_ack_start_addr;
+    uint64_t hest_read_ack_start_addr, read_ack_start_addr;
+    uint64_t read_ack_start_addr_2, err_source_struct;
+    uint64_t hest_err_block_addr, error_block_addr;
+    uint64_t cper_addr, cper_addr_2;
     enum AcpiHestSourceId source;
     AcpiGedState *acpi_ged_state;
     AcpiGhesState *ags;
@@ -450,6 +456,28 @@  void ghes_record_cper_errors(const void *cper, size_t len,
     cper_addr += ACPI_HEST_SRC_ID_COUNT * sizeof(uint64_t);
     cper_addr += source * ACPI_GHES_MAX_RAW_DATA_LENGTH;
 
+    err_source_struct = le64_to_cpu(ags->hest_addr_le) +
+                        source * HEST_GHES_V2_TABLE_SIZE;
+
+    /* Check if BIOS addr pointers were properly generated */
+
+    hest_err_block_addr = err_source_struct + GHES_ERR_ST_ADDR_OFFSET;
+    hest_read_ack_start_addr = err_source_struct + GHES_ACK_OFFSET;
+
+    cpu_physical_memory_read(hest_err_block_addr, &error_block_addr,
+                             sizeof(error_block_addr));
+
+    cpu_physical_memory_read(error_block_addr, &cper_addr_2,
+                             sizeof(error_block_addr));
+
+    cpu_physical_memory_read(hest_read_ack_start_addr, &read_ack_start_addr_2,
+			     sizeof(read_ack_start_addr_2));
+
+    assert(cper_addr == cper_addr_2);
+    assert(read_ack_start_addr == read_ack_start_addr_2);
+
+    /* Update ACK offset to notify about a new error */
+
     cpu_physical_memory_read(read_ack_start_addr,
                              &read_ack, sizeof(uint64_t));