diff mbox series

[v5,6/7] acpi/ghes: add support for generic error injection via QAPI

Message ID 20c491e357340e0062b6ff09867c1661ed4d2479.1722634602.git.mchehab+huawei@kernel.org
State New
Headers show
Series Add ACPI CPER firmware first error injection on ARM emulation | expand

Commit Message

Mauro Carvalho Chehab Aug. 2, 2024, 9:44 p.m. UTC
Provide a generic interface for error injection via GHESv2.

This patch is co-authored:
    - original ghes logic to inject a simple ARM record by Shiju Jose;
    - generic logic to handle block addresses by Jonathan Cameron;
    - generic GHESv2 error inject by Mauro Carvalho Chehab;

Co-authored-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Co-authored-by: Shiju Jose <shiju.jose@huawei.com>
Co-authored-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 hw/acpi/ghes.c         | 159 ++++++++++++++++++++++++++++++++++++++---
 hw/acpi/ghes_cper.c    |   2 +-
 include/hw/acpi/ghes.h |   3 +
 3 files changed, 152 insertions(+), 12 deletions(-)

Comments

Igor Mammedov Aug. 6, 2024, 2:31 p.m. UTC | #1
On Fri,  2 Aug 2024 23:44:01 +0200
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> Provide a generic interface for error injection via GHESv2.
> 
> This patch is co-authored:
>     - original ghes logic to inject a simple ARM record by Shiju Jose;
>     - generic logic to handle block addresses by Jonathan Cameron;
>     - generic GHESv2 error inject by Mauro Carvalho Chehab;
> 
> Co-authored-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Co-authored-by: Shiju Jose <shiju.jose@huawei.com>
> Co-authored-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Cc: Shiju Jose <shiju.jose@huawei.com>
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> ---
>  hw/acpi/ghes.c         | 159 ++++++++++++++++++++++++++++++++++++++---
>  hw/acpi/ghes_cper.c    |   2 +-
>  include/hw/acpi/ghes.h |   3 +
>  3 files changed, 152 insertions(+), 12 deletions(-)
> 
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index a745dcc7be5e..e125c9475773 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -395,23 +395,22 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
>      ags->present = true;
>  }
>  
> +static uint64_t ghes_get_state_start_address(void)

ghes_get_hardware_errors_address() might better reflect what address it will return

> +{
> +    AcpiGedState *acpi_ged_state =
> +        ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED, NULL));
> +    AcpiGhesState *ags = &acpi_ged_state->ghes_state;
> +
> +    return le64_to_cpu(ags->ghes_addr_le);
> +}
> +
>  int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
>  {
>      uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
> -    uint64_t start_addr;
> +    uint64_t start_addr = ghes_get_state_start_address();
>      bool ret = -1;
> -    AcpiGedState *acpi_ged_state;
> -    AcpiGhesState *ags;
> -
>      assert(source_id < ACPI_HEST_SRC_ID_RESERVED);
>  
> -    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
> -                                                       NULL));
> -    g_assert(acpi_ged_state);
> -    ags = &acpi_ged_state->ghes_state;
> -
> -    start_addr = le64_to_cpu(ags->ghes_addr_le);
> -
>      if (physical_address) {
>          start_addr += source_id * sizeof(uint64_t);

above should be a separate patch

>  
> @@ -448,9 +447,147 @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
>      return ret;
>  }
>  
> +/*
> + * Error register block data layout
> + *
> + * | +---------------------+ ges.ghes_addr_le
> + * | |error_block_address0 |
> + * | +---------------------+
> + * | |error_block_address1 |
> + * | +---------------------+ --+--
> + * | |    .............    | GHES_ADDRESS_SIZE
> + * | +---------------------+ --+--
> + * | |error_block_addressN |
> + * | +---------------------+
> + * | | read_ack0           |
> + * | +---------------------+ --+--
> + * | | read_ack1           | GHES_ADDRESS_SIZE
> + * | +---------------------+ --+--
> + * | |   .............     |
> + * | +---------------------+
> + * | | read_ackN           |
> + * | +---------------------+ --+--
> + * | |      CPER           |   |
> + * | |      ....           | GHES_MAX_RAW_DATA_LENGT
> + * | |      CPER           |   |
> + * | +---------------------+ --+--
> + * | |    ..........       |
> + * | +---------------------+
> + * | |      CPER           |
> + * | |      ....           |
> + * | |      CPER           |
> + * | +---------------------+
> + */

no need to duplicate docs/specs/acpi_hest_ghes.rst,
I'd just reffer to it and maybe add short comment as to why it's mentioned.

> +/* Map from uint32_t notify to entry offset in GHES */
> +static const uint8_t error_source_to_index[] = { 0xff, 0xff, 0xff, 0xff,
> +                                                 0xff, 0xff, 0xff, 1, 0};
> +
> +static bool ghes_get_addr(uint32_t notify, uint64_t *error_block_addr,
> +                          uint64_t *read_ack_addr)
> +{
> +    uint64_t base;
> +
> +    if (notify >= ACPI_GHES_NOTIFY_RESERVED) {
> +        return false;
> +    }
> +
> +    /* Find and check the source id for this new CPER */
> +    if (error_source_to_index[notify] == 0xff) {
> +        return false;
> +    }
> +
> +    base = ghes_get_state_start_address();
> +
> +    *read_ack_addr = base +
> +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> +        error_source_to_index[notify] * sizeof(uint64_t);
> +
> +    /* Could also be read back from the error_block_address register */
> +    *error_block_addr = base +
> +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> +        error_source_to_index[notify] * ACPI_GHES_MAX_RAW_DATA_LENGTH;
> +
> +    return true;
> +}

I don't like all this pointer math, which is basically a reverse engineered
QEMU actions on startup + guest provided etc/hardware_errors address.

For once, it assumes error_source_to_index[] matches order in which HEST
error sources were described, which is fragile.

2nd: migration-wive it's disaster, since old/new HEST/hardware_errors tables
in RAM migrated from older version might not match above assumptions
of target QEMU. 

I see 2 ways to rectify it:
  1st: preferred/cleanest would be to tell QEMU (via fw_cfg) address of HEST table
       in guest RAM, like we do with etc/hardware_errors, see
            build_ghes_error_table()
               ...
               tell firmware to write hardware_errors GPA into
       and then fetch from HEST table in RAM, the guest patched error/ack addresses
       for given source_id

       code-wise: relatively simple once one wraps their own head over
                 how this whole APEI thing works in QEMU
                 workflow  is described in docs/specs/acpi_hest_ghes.rst
                 look to me as sufficient to grasp it.
                 (but my view is very biased given my prior knowledge,
                  aka: docs/comments/examples wrt acpi patching are good enough)
                 (if it's not clear how to do it, ask me for pointers)

  2nd:  sort of hack based on build_ghes_v2() Error Status Address/Read Ack Register
        patching instructions
               bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,                
                   address_offset + GAS_ADDR_OFFSET, sizeof(uint64_t),                      
                   ACPI_GHES_ERRORS_FW_CFG_FILE, source_id * sizeof(uint64_t));
                                                 ^^^^^^^^^^^^^^^^^^^^^^^^^
        during build_ghes_v2() also store on a side mapping
             source_id -> error address offset : read ack address

        so when you are injecting error, you'd at least use offsets
        used at start time, to get rid of risk where injection code
        diverge from HEST:etc/hardware_errors layout at start time.

        However to make migration safe, one would need to add a fat
        comment not to change order ghest error sources in HEST _and_
        a dedicated unit test to make sure we catch it when that happens.
        bios_tables_test should be able to catch the change, but it won't
        say what's wrong, hence a test case that explicitly checks order
        and loudly & clear complains when we will break order assumptions.

        downside:
           * we are are limiting ways HEST could be composed/reshuffled in future
           * consumption of extra CI resources
           * and well, it relies on above duct tape holding all pieces together

>  NotifierList generic_error_notifiers =
>      NOTIFIER_LIST_INITIALIZER(error_device_notifiers);
>  
> +void ghes_record_cper_errors(AcpiGhesCper *cper, Error **errp,
> +                             uint32_t notify)
> +{
> +    int read_ack = 0;
       ^^^
[...]
> +    cpu_physical_memory_read(read_ack_addr,
> +                             &read_ack, sizeof(uint64_t));
                                                  ^^^^
it looks like possible stack corruption, isn't it?

> +    /* zero means OSPM does not acknowledge the error */
> +    if (!read_ack) {
> +        error_setg(errp,
> +                   "Last CPER record was not acknowledged yet");
> +        read_ack = 1;
> +        cpu_physical_memory_write(read_ack_addr,
> +                                  &read_ack, sizeof(uint64_t));
                                                        ^^^^^
and then who knows what we are writing back here

> +        return;
> +    }
> +
> +    read_ack = cpu_to_le64(0);
> +    cpu_physical_memory_write(read_ack_addr,
> +                              &read_ack, sizeof(uint64_t));
> +
> +    /* Build CPER record */
> +
> +    /*
> +     * Invalid fru id: ACPI 4.0: 17.3.2.6.1 Generic Error Data,
> +     * Table 17-13 Generic Error Data Entry
> +     */
> +    QemuUUID fru_id = {};
> +
> +    block = g_array_new(false, true /* clear */, 1);
> +    data_length = ACPI_GHES_DATA_LENGTH + cper->data_len;
> +
> +    /*
> +        * It should not run out of the preallocated memory if
> +        * adding a new generic error data entry
> +        */
> +    assert((data_length + ACPI_GHES_GESB_SIZE) <=
> +            ACPI_GHES_MAX_RAW_DATA_LENGTH);
it's better to error out gracefully here instead of crash
in case script generated too long record,
not the end of the world, but it's annoying to restart guest
on external mistake.

PS:
looking at the code, ACPI_GHES_MAX_RAW_DATA_LENGTH is 1K
and it is the total size of a error block for a error source.

However acpi_hest_ghes.rst (3) says it should be 4K,
am I mistaken? 


> +    /* Build the new generic error status block header */
> +    acpi_ghes_generic_error_status(block, ACPI_GEBS_UNCORRECTABLE,
> +                                    0, 0, data_length,
> +                                    ACPI_CPER_SEV_RECOVERABLE);
> +
> +    /* Build this new generic error data entry header */
> +    acpi_ghes_generic_error_data(block, cper->guid,
> +                                ACPI_CPER_SEV_RECOVERABLE, 0, 0,
> +                                cper->data_len, fru_id, 0);
> +

not that I mind, but I'd ax above calls with their hardcoded
assumptions and make script generate whole error block,
it's more flexible wrt ACPI_CPER_SEV_RECOVERABLE/ACPI_GEBS_UNCORRECTABLE
and then one can ditch from QAPI interface cper->guid.

basically inject whatever user provided via QAPI without any other assumptions.

> +    /* Add CPER data */
> +    for (i = 0; i < cper->data_len; i++) {
> +        build_append_int_noprefix(block, cper->data[i], 1);
> +    }
> +
> +    /* Write the generic error data entry into guest memory */
> +    cpu_physical_memory_write(error_block_addr, block->data, block->len);
> +
> +    g_array_free(block, true);
> +
> +    notifier_list_notify(&generic_error_notifiers, NULL);
> +}
> +
>  bool acpi_ghes_present(void)
>  {
>      AcpiGedState *acpi_ged_state;
> diff --git a/hw/acpi/ghes_cper.c b/hw/acpi/ghes_cper.c
> index 7aa7e71e90dc..d7ff7debee74 100644
> --- a/hw/acpi/ghes_cper.c
> +++ b/hw/acpi/ghes_cper.c
> @@ -39,7 +39,7 @@ void qmp_ghes_cper(CommonPlatformErrorRecord *qmp_cper,
>          return;
>      }
>  
> -    /* TODO: call a function at ghes */
> +    ghes_record_cper_errors(&cper, errp, ACPI_GHES_NOTIFY_GPIO);
>  
>      g_free(cper.data);
>  }
> diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
> index 06a5b8820cd5..ee6f6cd96911 100644
> --- a/include/hw/acpi/ghes.h
> +++ b/include/hw/acpi/ghes.h
> @@ -85,6 +85,9 @@ typedef struct AcpiGhesCper {
>      size_t data_len;
>  } AcpiGhesCper;
>  
> +void ghes_record_cper_errors(AcpiGhesCper *cper, Error **errp,
> +                             uint32_t notify);

maybe rename it to acpi_ghes_inject_error_block()

> +
>  /**
>   * acpi_ghes_present: Report whether ACPI GHES table is present
>   *
Mauro Carvalho Chehab Aug. 7, 2024, 7:47 a.m. UTC | #2
Em Tue, 6 Aug 2024 16:31:13 +0200
Igor Mammedov <imammedo@redhat.com> escreveu:

> PS:
> looking at the code, ACPI_GHES_MAX_RAW_DATA_LENGTH is 1K
> and it is the total size of a error block for a error source.
> 
> However acpi_hest_ghes.rst (3) says it should be 4K,
> am I mistaken?

Maybe Jonathan knows better, but I guess the 1K was just some
arbitrary limit to prevent a too big CPER. The 4K limit described
at acpi_hest_ghes.rst could be just some limit to cope with
the current bios implementation, but I didn't check myself how
this is implemented there. 

I was unable to find any limit at the specs. Yet, if you look at:

https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#arm-processor-error-section

The processor Error Information Structure, starting at offset
40, can go up to 255*32, meaning an offset of 8200, which is
bigger than 4K.

Going further, processor context can have up to 65535 (spec
actually says 65536, but that sounds a typo, as the size is
stored on an uint16_t), containing multiple register values
there (the spec calls its length as "P").

So, the CPER record could, in theory, have:
	8200 + (65535 * P) + sizeof(vendor-specicific-info)

The CPER length is stored in Section Length record, which is
uint32_t.

So, I'd say that the GHES record can theoretically be a lot
bigger than 4K.	

Thanks,
Mauro
Mauro Carvalho Chehab Aug. 7, 2024, 1:23 p.m. UTC | #3
Em Wed, 7 Aug 2024 10:34:36 +0100
Jonathan Cameron <Jonathan.Cameron@Huawei.com> escreveu:

> On Wed, 7 Aug 2024 09:47:50 +0200
> Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> 
> > Em Tue, 6 Aug 2024 16:31:13 +0200
> > Igor Mammedov <imammedo@redhat.com> escreveu:
> >   
> > > PS:
> > > looking at the code, ACPI_GHES_MAX_RAW_DATA_LENGTH is 1K
> > > and it is the total size of a error block for a error source.
> > > 
> > > However acpi_hest_ghes.rst (3) says it should be 4K,
> > > am I mistaken?    
> > 
> > Maybe Jonathan knows better, but I guess the 1K was just some
> > arbitrary limit to prevent a too big CPER. The 4K limit described
> > at acpi_hest_ghes.rst could be just some limit to cope with
> > the current bios implementation, but I didn't check myself how
> > this is implemented there. 
> > 
> > I was unable to find any limit at the specs. Yet, if you look at:
> > 
> > https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#arm-processor-error-section  
> 
> I think both limits are just made up.  You can in theory log huge
> error records.  Just not one does.

If both are made up, I would sync them, either patching the
documentation or the ghes driver.

> 
> > 
> > The processor Error Information Structure, starting at offset
> > 40, can go up to 255*32, meaning an offset of 8200, which is
> > bigger than 4K.
> > 
> > Going further, processor context can have up to 65535 (spec
> > actually says 65536, but that sounds a typo, as the size is
> > stored on an uint16_t), containing multiple register values
> > there (the spec calls its length as "P").
> > 
> > So, the CPER record could, in theory, have:
> > 	8200 + (65535 * P) + sizeof(vendor-specicific-info)
> > 
> > The CPER length is stored in Section Length record, which is
> > uint32_t.
> > 
> > So, I'd say that the GHES record can theoretically be a lot
> > bigger than 4K.	  
> Agreed - but I don't think we care for testing as long as it's
> big enough for plausible records.   Unless you really want
> to fuzz the limits?

Fuzz the limits could be interesting, but it is not on my
current plans.

Yet, 1K could be a little bit short for ARM CPER.

See: N.26 ARMv8 AArch64 GPRs (Type 4) has 256 bytes for
registers, plus 8 bytes for the header. So, a total size of
264 bytes, for a single context register dump. I would expect
that, in real life, type 4 to always be reported on aarch64,
on BIOS with context register support. Maybe other types could
also be dumped altogether (like context registers for EL1, 
EL2 and/or EL3).

If just one type 4 context is encoded, it means that, 1K has 
space for 23 errors (of a max limit of 255).

Just looking at the maximum number, my feeling is that 1K
might be too short to simulate some real life reports,
but that depends on how firmware is actually grouping
such events.

So, maybe this could be expanded to, let's say, 4K, thus
aligning with the ReST documentation.

Regards,
Mauro
Igor Mammedov Aug. 7, 2024, 1:28 p.m. UTC | #4
On Wed, 7 Aug 2024 10:34:36 +0100
Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:

> On Wed, 7 Aug 2024 09:47:50 +0200
> Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> 
> > Em Tue, 6 Aug 2024 16:31:13 +0200
> > Igor Mammedov <imammedo@redhat.com> escreveu:
> >   
> > > PS:
> > > looking at the code, ACPI_GHES_MAX_RAW_DATA_LENGTH is 1K
> > > and it is the total size of a error block for a error source.
> > > 
> > > However acpi_hest_ghes.rst (3) says it should be 4K,
> > > am I mistaken?    
> > 
> > Maybe Jonathan knows better, but I guess the 1K was just some
> > arbitrary limit to prevent a too big CPER. The 4K limit described
> > at acpi_hest_ghes.rst could be just some limit to cope with
> > the current bios implementation, but I didn't check myself how
> > this is implemented there. 
> > 
> > I was unable to find any limit at the specs. Yet, if you look at:
> > 
> > https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#arm-processor-error-section  
> 
> I think both limits are just made up.  You can in theory log huge
> error records.  Just not one does.

What I care about is what we actually allocate vs what we promised
in docs. Given that it's harder to change actual size (we would need
a compat handling here not to break old machine types), I  would vote
for syncing docs to match code.

A separate stand-alone patch for fixing it would do,
or it could be a part of series.

Also I'd like another pair of eyes to look at it to confirm actual
size we allocate, in case I'm not seeing it right.

> > The processor Error Information Structure, starting at offset
> > 40, can go up to 255*32, meaning an offset of 8200, which is
> > bigger than 4K.
> > 
> > Going further, processor context can have up to 65535 (spec
> > actually says 65536, but that sounds a typo, as the size is
> > stored on an uint16_t), containing multiple register values
> > there (the spec calls its length as "P").
> > 
> > So, the CPER record could, in theory, have:
> > 	8200 + (65535 * P) + sizeof(vendor-specicific-info)
> > 
> > The CPER length is stored in Section Length record, which is
> > uint32_t.
> > 
> > So, I'd say that the GHES record can theoretically be a lot
> > bigger than 4K.	  
> Agreed - but I don't think we care for testing as long as it's
> big enough for plausible records.   Unless you really want
> to fuzz the limits?
> 
> Jonathan
> 
> > 
> > Thanks,
> > Mauro  
>
Igor Mammedov Aug. 7, 2024, 1:43 p.m. UTC | #5
On Wed, 7 Aug 2024 15:23:57 +0200
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> Em Wed, 7 Aug 2024 10:34:36 +0100
> Jonathan Cameron <Jonathan.Cameron@Huawei.com> escreveu:
> 
> > On Wed, 7 Aug 2024 09:47:50 +0200
> > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> >   
> > > Em Tue, 6 Aug 2024 16:31:13 +0200
> > > Igor Mammedov <imammedo@redhat.com> escreveu:
> > >     
> > > > PS:
> > > > looking at the code, ACPI_GHES_MAX_RAW_DATA_LENGTH is 1K
> > > > and it is the total size of a error block for a error source.
> > > > 
> > > > However acpi_hest_ghes.rst (3) says it should be 4K,
> > > > am I mistaken?      
> > > 
> > > Maybe Jonathan knows better, but I guess the 1K was just some
> > > arbitrary limit to prevent a too big CPER. The 4K limit described
> > > at acpi_hest_ghes.rst could be just some limit to cope with
> > > the current bios implementation, but I didn't check myself how
> > > this is implemented there. 
> > > 
> > > I was unable to find any limit at the specs. Yet, if you look at:
> > > 
> > > https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#arm-processor-error-section    
> > 
> > I think both limits are just made up.  You can in theory log huge
> > error records.  Just not one does.  
> 
> If both are made up, I would sync them, either patching the
> documentation or the ghes driver.
> 
> >   
> > > 
> > > The processor Error Information Structure, starting at offset
> > > 40, can go up to 255*32, meaning an offset of 8200, which is
> > > bigger than 4K.
> > > 
> > > Going further, processor context can have up to 65535 (spec
> > > actually says 65536, but that sounds a typo, as the size is
> > > stored on an uint16_t), containing multiple register values
> > > there (the spec calls its length as "P").
> > > 
> > > So, the CPER record could, in theory, have:
> > > 	8200 + (65535 * P) + sizeof(vendor-specicific-info)
> > > 
> > > The CPER length is stored in Section Length record, which is
> > > uint32_t.
> > > 
> > > So, I'd say that the GHES record can theoretically be a lot
> > > bigger than 4K.	    
> > Agreed - but I don't think we care for testing as long as it's
> > big enough for plausible records.   Unless you really want
> > to fuzz the limits?  
> 
> Fuzz the limits could be interesting, but it is not on my
> current plans.
> 
> Yet, 1K could be a little bit short for ARM CPER.
> 
> See: N.26 ARMv8 AArch64 GPRs (Type 4) has 256 bytes for
> registers, plus 8 bytes for the header. So, a total size of
> 264 bytes, for a single context register dump. I would expect
> that, in real life, type 4 to always be reported on aarch64,
> on BIOS with context register support. Maybe other types could
> also be dumped altogether (like context registers for EL1, 
> EL2 and/or EL3).
> 
> If just one type 4 context is encoded, it means that, 1K has 
> space for 23 errors (of a max limit of 255).
> 
> Just looking at the maximum number, my feeling is that 1K
> might be too short to simulate some real life reports,
> but that depends on how firmware is actually grouping
> such events.

per my knowledge firmware is out of picture here, since all
it does in HEST case is allocate continuous space for
'etc/hardware_errors' blob as QEMU told it.

> 
> So, maybe this could be expanded to, let's say, 4K, thus
> aligning with the ReST documentation.
maybe to get moving, 1st get your series in with docs fixed
to today limit.
And then increase error_block size to desired value on top of that
as it's really not relevant to what you are doing here.

> Regards,
> Mauro
>
Igor Mammedov Aug. 8, 2024, 8:11 a.m. UTC | #6
On Wed, 7 Aug 2024 15:25:47 +0100
Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:

> On Tue, 6 Aug 2024 16:31:13 +0200
> Igor Mammedov <imammedo@redhat.com> wrote:
> 
> > On Fri,  2 Aug 2024 23:44:01 +0200
> > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> >   
> > > Provide a generic interface for error injection via GHESv2.
> > > 
> > > This patch is co-authored:
> > >     - original ghes logic to inject a simple ARM record by Shiju Jose;
> > >     - generic logic to handle block addresses by Jonathan Cameron;
> > >     - generic GHESv2 error inject by Mauro Carvalho Chehab;
> > > 
> > > Co-authored-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > Co-authored-by: Shiju Jose <shiju.jose@huawei.com>
> > > Co-authored-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > Cc: Shiju Jose <shiju.jose@huawei.com>
> > > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > ---
> > >  hw/acpi/ghes.c         | 159 ++++++++++++++++++++++++++++++++++++++---
> > >  hw/acpi/ghes_cper.c    |   2 +-
> > >  include/hw/acpi/ghes.h |   3 +
> > >  3 files changed, 152 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> > > index a745dcc7be5e..e125c9475773 100644
> > > --- a/hw/acpi/ghes.c
> > > +++ b/hw/acpi/ghes.c
> > > @@ -395,23 +395,22 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
> > >      ags->present = true;
> > >  }
> > >  
> > > +static uint64_t ghes_get_state_start_address(void)    
> > 
> > ghes_get_hardware_errors_address() might better reflect what address it will return
> >   
> > > +{
> > > +    AcpiGedState *acpi_ged_state =
> > > +        ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED, NULL));
> > > +    AcpiGhesState *ags = &acpi_ged_state->ghes_state;
> > > +
> > > +    return le64_to_cpu(ags->ghes_addr_le);
> > > +}
> > > +
> > >  int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
> > >  {
> > >      uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
> > > -    uint64_t start_addr;
> > > +    uint64_t start_addr = ghes_get_state_start_address();
> > >      bool ret = -1;
> > > -    AcpiGedState *acpi_ged_state;
> > > -    AcpiGhesState *ags;
> > > -
> > >      assert(source_id < ACPI_HEST_SRC_ID_RESERVED);
> > >  
> > > -    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
> > > -                                                       NULL));
> > > -    g_assert(acpi_ged_state);
> > > -    ags = &acpi_ged_state->ghes_state;
> > > -
> > > -    start_addr = le64_to_cpu(ags->ghes_addr_le);
> > > -
> > >      if (physical_address) {
> > >          start_addr += source_id * sizeof(uint64_t);    
> > 
> > above should be a separate patch
> >   
> > >  
> > > @@ -448,9 +447,147 @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
> > >      return ret;
> > >  }
> > >  
> > > +/*
> > > + * Error register block data layout
> > > + *
> > > + * | +---------------------+ ges.ghes_addr_le
> > > + * | |error_block_address0 |
> > > + * | +---------------------+
> > > + * | |error_block_address1 |
> > > + * | +---------------------+ --+--
> > > + * | |    .............    | GHES_ADDRESS_SIZE
> > > + * | +---------------------+ --+--
> > > + * | |error_block_addressN |
> > > + * | +---------------------+
> > > + * | | read_ack0           |
> > > + * | +---------------------+ --+--
> > > + * | | read_ack1           | GHES_ADDRESS_SIZE
> > > + * | +---------------------+ --+--
> > > + * | |   .............     |
> > > + * | +---------------------+
> > > + * | | read_ackN           |
> > > + * | +---------------------+ --+--
> > > + * | |      CPER           |   |
> > > + * | |      ....           | GHES_MAX_RAW_DATA_LENGT
> > > + * | |      CPER           |   |
> > > + * | +---------------------+ --+--
> > > + * | |    ..........       |
> > > + * | +---------------------+
> > > + * | |      CPER           |
> > > + * | |      ....           |
> > > + * | |      CPER           |
> > > + * | +---------------------+
> > > + */    
> > 
> > no need to duplicate docs/specs/acpi_hest_ghes.rst,
> > I'd just reffer to it and maybe add short comment as to why it's mentioned.
> >   
> > > +/* Map from uint32_t notify to entry offset in GHES */
> > > +static const uint8_t error_source_to_index[] = { 0xff, 0xff, 0xff, 0xff,
> > > +                                                 0xff, 0xff, 0xff, 1, 0};
> > > +
> > > +static bool ghes_get_addr(uint32_t notify, uint64_t *error_block_addr,
> > > +                          uint64_t *read_ack_addr)
> > > +{
> > > +    uint64_t base;
> > > +
> > > +    if (notify >= ACPI_GHES_NOTIFY_RESERVED) {
> > > +        return false;
> > > +    }
> > > +
> > > +    /* Find and check the source id for this new CPER */
> > > +    if (error_source_to_index[notify] == 0xff) {
> > > +        return false;
> > > +    }
> > > +
> > > +    base = ghes_get_state_start_address();
> > > +
> > > +    *read_ack_addr = base +
> > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > +        error_source_to_index[notify] * sizeof(uint64_t);
> > > +
> > > +    /* Could also be read back from the error_block_address register */
> > > +    *error_block_addr = base +
> > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > +        error_source_to_index[notify] * ACPI_GHES_MAX_RAW_DATA_LENGTH;
> > > +
> > > +    return true;
> > > +}    
> > 
> > I don't like all this pointer math, which is basically a reverse engineered
> > QEMU actions on startup + guest provided etc/hardware_errors address.
> > 
> > For once, it assumes error_source_to_index[] matches order in which HEST
> > error sources were described, which is fragile.
> > 
> > 2nd: migration-wive it's disaster, since old/new HEST/hardware_errors tables
> > in RAM migrated from older version might not match above assumptions
> > of target QEMU. 
> > 
> > I see 2 ways to rectify it:
> >   1st: preferred/cleanest would be to tell QEMU (via fw_cfg) address of HEST table
> >        in guest RAM, like we do with etc/hardware_errors, see
> >             build_ghes_error_table()
> >                ...
> >                tell firmware to write hardware_errors GPA into
> >        and then fetch from HEST table in RAM, the guest patched error/ack addresses
> >        for given source_id
> > 
> >        code-wise: relatively simple once one wraps their own head over
> >                  how this whole APEI thing works in QEMU
> >                  workflow  is described in docs/specs/acpi_hest_ghes.rst
> >                  look to me as sufficient to grasp it.
> >                  (but my view is very biased given my prior knowledge,
> >                   aka: docs/comments/examples wrt acpi patching are good enough)
> >                  (if it's not clear how to do it, ask me for pointers)  
> 
> Hi Igor, I think I follow what you mean but maybe this question will reveal
> otherwise.  HEST is currently in ACPI_BUILD_TABLE_FILE.
> Would you suggest splitting it to it's own file, or using table_offsets
> to get the offset in ACPI_BUILD_TABLE_FILE GPA?
yep, offset taken right before HEST is to be created
doc comment for bios_linker_loader_write_pointer() explains how it works

we need something like:
       bios_linker_loader_write_pointer(linker,
           ACPI_HEST_TABLE_ADDR_FW_CFG_FILE, 0, sizeof(uint64_t),
           ACPI_BUILD_TABLE_FILE, hest_offset_within_ACPI_BUILD_TABLE_FILE); 

to register new file see:
   a08a64627 ACPI: Record the Generic Error Status Block address
   and to avoid copy past error maybe
   136fc6aa2 ACPI: Avoid infinite recursion when dump-vmstat
for this needs to be limited to new machine types and keep
old ones without this new feature. (I'd use hw_compat_ machinery for that)

while at it we should rename
  ACPI_GHES_DATA_ADDR_FW_CFG_FILE -> ACPI_GHES_ERRORS_ADDR_FW_CFG_FILE
Mauro Carvalho Chehab Aug. 8, 2024, 12:11 p.m. UTC | #7
Em Tue, 6 Aug 2024 16:31:13 +0200
Igor Mammedov <imammedo@redhat.com> escreveu:

> > +    /* Could also be read back from the error_block_address register */
> > +    *error_block_addr = base +
> > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > +        error_source_to_index[notify] * ACPI_GHES_MAX_RAW_DATA_LENGTH;
> > +
> > +    return true;
> > +}  
> 
> I don't like all this pointer math, which is basically a reverse engineered
> QEMU actions on startup + guest provided etc/hardware_errors address.
> 
> For once, it assumes error_source_to_index[] matches order in which HEST
> error sources were described, which is fragile.
> 
> 2nd: migration-wive it's disaster, since old/new HEST/hardware_errors tables
> in RAM migrated from older version might not match above assumptions
> of target QEMU. 
> 
> I see 2 ways to rectify it:
>   1st: preferred/cleanest would be to tell QEMU (via fw_cfg) address of HEST table
>        in guest RAM, like we do with etc/hardware_errors, see
>             build_ghes_error_table()
>                ...
>                tell firmware to write hardware_errors GPA into
>        and then fetch from HEST table in RAM, the guest patched error/ack addresses
>        for given source_id
> 
>        code-wise: relatively simple once one wraps their own head over
>                  how this whole APEI thing works in QEMU
>                  workflow  is described in docs/specs/acpi_hest_ghes.rst
>                  look to me as sufficient to grasp it.
>                  (but my view is very biased given my prior knowledge,
>                   aka: docs/comments/examples wrt acpi patching are good enough)
>                  (if it's not clear how to do it, ask me for pointers)

That sounds a better approach, however...

>   2nd:  sort of hack based on build_ghes_v2() Error Status Address/Read Ack Register
>         patching instructions
>                bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,                
>                    address_offset + GAS_ADDR_OFFSET, sizeof(uint64_t),                      
>                    ACPI_GHES_ERRORS_FW_CFG_FILE, source_id * sizeof(uint64_t));
>                                                  ^^^^^^^^^^^^^^^^^^^^^^^^^
>         during build_ghes_v2() also store on a side mapping
>              source_id -> error address offset : read ack address
> 
>         so when you are injecting error, you'd at least use offsets
>         used at start time, to get rid of risk where injection code
>         diverge from HEST:etc/hardware_errors layout at start time.
> 
>         However to make migration safe, one would need to add a fat
>         comment not to change order ghest error sources in HEST _and_
>         a dedicated unit test to make sure we catch it when that happens.
>         bios_tables_test should be able to catch the change, but it won't
>         say what's wrong, hence a test case that explicitly checks order
>         and loudly & clear complains when we will break order assumptions.
> 
>         downside:
>            * we are are limiting ways HEST could be composed/reshuffled in future
>            * consumption of extra CI resources
>            * and well, it relies on above duct tape holding all pieces together

I ended opting to do approach (2) on this changeset, as the current code
is already using bios_linker_loader_add_pointer() for ghes, being deeply 
relying on the block address/ack and cper calculus.

To avoid troubles on this duct tape, I opted to move all offset math
to a single function at ghes.c:

	/*
	 * ID numbers used to fill HEST source ID field
	 */
	enum AcpiHestSourceId {
	    ACPI_HEST_SRC_ID_SEA,
	    ACPI_HEST_SRC_ID_GED,
	
	    /* Shall be the last one */
	    ACPI_HEST_SRC_ID_COUNT
	} AcpiHestSourceId;

	...

	static bool acpi_hest_address_offset(enum AcpiGhesNotifyType notify,
        	                             uint64_t *error_block_offset,
                	                     uint64_t *ack_offset,
                        	             uint64_t *cper_offset,
                                	     enum AcpiHestSourceId *source_id)
	{
	    enum AcpiHestSourceId source;
	    uint64_t offset;

	    switch (notify) {
	    case ACPI_GHES_NOTIFY_SEA:      /* Only on ARMv8 */
	        source = ACPI_HEST_SRC_ID_SEA;
	        break;
	    case ACPI_GHES_NOTIFY_GPIO:
	        source = ACPI_HEST_SRC_ID_GED;
	        break;
	    default:
	        return true;
	    }

	    if (source_id) {
	        *source_id = source;
	    }

	    /*
	     * Please see docs/specs/acpi_hest_ghes.rst for the memory layout.
	     * In summary, memory starts with error addresses, then acks and
	     * finally CPER blocks.
	     */

	    offset = source * sizeof(uint64_t);

	    if (error_block_offset) {
	        *error_block_offset = offset;
	    }
	    if (ack_offset) {
	        *ack_offset = offset + ACPI_HEST_SRC_ID_COUNT * sizeof(uint64_t);
	    }
	    if (cper_offset) {
	        *cper_offset = 2 * ACPI_HEST_SRC_ID_COUNT * sizeof(uint64_t) +
	                       source * ACPI_GHES_MAX_RAW_DATA_LENGTH;
	    }

	    return false;
	}

I also removed the anonymous enum with SEA/GPIO source IDs, using
only the ACPI notify type as arguments at the function calls.

As there's now a single point where the offsets from
docs/specs/acpi_hest_ghes.rst are enforced, this should be error
prone.

The code could later be changed to use approach (2), on a separate
cleanup.

Thanks,
Mauro
Igor Mammedov Aug. 8, 2024, 12:45 p.m. UTC | #8
On Thu, 8 Aug 2024 14:11:14 +0200
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> Em Tue, 6 Aug 2024 16:31:13 +0200
> Igor Mammedov <imammedo@redhat.com> escreveu:
> 
> > > +    /* Could also be read back from the error_block_address register */
> > > +    *error_block_addr = base +
> > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > +        error_source_to_index[notify] * ACPI_GHES_MAX_RAW_DATA_LENGTH;
> > > +
> > > +    return true;
> > > +}    
> > 
> > I don't like all this pointer math, which is basically a reverse engineered
> > QEMU actions on startup + guest provided etc/hardware_errors address.
> > 
> > For once, it assumes error_source_to_index[] matches order in which HEST
> > error sources were described, which is fragile.
> > 
> > 2nd: migration-wive it's disaster, since old/new HEST/hardware_errors tables
> > in RAM migrated from older version might not match above assumptions
> > of target QEMU. 
> > 
> > I see 2 ways to rectify it:
> >   1st: preferred/cleanest would be to tell QEMU (via fw_cfg) address of HEST table
> >        in guest RAM, like we do with etc/hardware_errors, see
> >             build_ghes_error_table()
> >                ...
> >                tell firmware to write hardware_errors GPA into
> >        and then fetch from HEST table in RAM, the guest patched error/ack addresses
> >        for given source_id
> > 
> >        code-wise: relatively simple once one wraps their own head over
> >                  how this whole APEI thing works in QEMU
> >                  workflow  is described in docs/specs/acpi_hest_ghes.rst
> >                  look to me as sufficient to grasp it.
> >                  (but my view is very biased given my prior knowledge,
> >                   aka: docs/comments/examples wrt acpi patching are good enough)
> >                  (if it's not clear how to do it, ask me for pointers)  
> 
> That sounds a better approach, however...
> 
> >   2nd:  sort of hack based on build_ghes_v2() Error Status Address/Read Ack Register
> >         patching instructions
> >                bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,                
> >                    address_offset + GAS_ADDR_OFFSET, sizeof(uint64_t),                      
> >                    ACPI_GHES_ERRORS_FW_CFG_FILE, source_id * sizeof(uint64_t));
> >                                                  ^^^^^^^^^^^^^^^^^^^^^^^^^
> >         during build_ghes_v2() also store on a side mapping
> >              source_id -> error address offset : read ack address
> > 
> >         so when you are injecting error, you'd at least use offsets
> >         used at start time, to get rid of risk where injection code
> >         diverge from HEST:etc/hardware_errors layout at start time.
> > 
> >         However to make migration safe, one would need to add a fat
> >         comment not to change order ghest error sources in HEST _and_
> >         a dedicated unit test to make sure we catch it when that happens.
> >         bios_tables_test should be able to catch the change, but it won't
> >         say what's wrong, hence a test case that explicitly checks order
> >         and loudly & clear complains when we will break order assumptions.
> > 
> >         downside:
> >            * we are are limiting ways HEST could be composed/reshuffled in future
> >            * consumption of extra CI resources
> >            * and well, it relies on above duct tape holding all pieces together  
> 
> I ended opting to do approach (2) on this changeset, as the current code
> is already using bios_linker_loader_add_pointer() for ghes, being deeply 
> relying on the block address/ack and cper calculus.

I consider (2) as a fallback in case (1) can't be done with reasonable effort.
At this point, (1) looks doable and I'm not convinced that duct tape
is necessary and that we badly need to rush in this series.
Hence I'd strongly prefer (1).

See my other reply to Jonathan, setting write pointer is not hard.
Parsing HEST doesn't have to be a full tables parser as long as
it respects table types/length/revision then we can cheat by using 
documented offsets from ACPI spec as is, for fields we
need to access.


> To avoid troubles on this duct tape, I opted to move all offset math
> to a single function at ghes.c:
> 
> 	/*
> 	 * ID numbers used to fill HEST source ID field
> 	 */
> 	enum AcpiHestSourceId {
> 	    ACPI_HEST_SRC_ID_SEA,
> 	    ACPI_HEST_SRC_ID_GED,
> 	
> 	    /* Shall be the last one */
> 	    ACPI_HEST_SRC_ID_COUNT
> 	} AcpiHestSourceId;
> 
> 	...
> 
> 	static bool acpi_hest_address_offset(enum AcpiGhesNotifyType notify,
>         	                             uint64_t *error_block_offset,
>                 	                     uint64_t *ack_offset,
>                         	             uint64_t *cper_offset,
>                                 	     enum AcpiHestSourceId *source_id)
> 	{
> 	    enum AcpiHestSourceId source;
> 	    uint64_t offset;
> 
> 	    switch (notify) {
> 	    case ACPI_GHES_NOTIFY_SEA:      /* Only on ARMv8 */
> 	        source = ACPI_HEST_SRC_ID_SEA;
> 	        break;
> 	    case ACPI_GHES_NOTIFY_GPIO:
> 	        source = ACPI_HEST_SRC_ID_GED;
> 	        break;
> 	    default:
> 	        return true;
> 	    }
> 
> 	    if (source_id) {
> 	        *source_id = source;
> 	    }
> 
> 	    /*
> 	     * Please see docs/specs/acpi_hest_ghes.rst for the memory layout.
> 	     * In summary, memory starts with error addresses, then acks and
> 	     * finally CPER blocks.
> 	     */
> 
> 	    offset = source * sizeof(uint64_t);
> 
> 	    if (error_block_offset) {
> 	        *error_block_offset = offset;
> 	    }
> 	    if (ack_offset) {
> 	        *ack_offset = offset + ACPI_HEST_SRC_ID_COUNT * sizeof(uint64_t);
> 	    }
> 	    if (cper_offset) {
> 	        *cper_offset = 2 * ACPI_HEST_SRC_ID_COUNT * sizeof(uint64_t) +
> 	                       source * ACPI_GHES_MAX_RAW_DATA_LENGTH;
> 	    }
> 
> 	    return false;
> 	}
> 
> I also removed the anonymous enum with SEA/GPIO source IDs, using
> only the ACPI notify type as arguments at the function calls.
> 
> As there's now a single point where the offsets from
> docs/specs/acpi_hest_ghes.rst are enforced, this should be error
> prone.
> 
> The code could later be changed to use approach (2), on a separate
> cleanup.
> 
> Thanks,
> Mauro
>
Mauro Carvalho Chehab Aug. 8, 2024, 6:19 p.m. UTC | #9
Em Thu, 8 Aug 2024 10:11:07 +0200
Igor Mammedov <imammedo@redhat.com> escreveu:

> On Wed, 7 Aug 2024 15:25:47 +0100
> Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:
> 
> > On Tue, 6 Aug 2024 16:31:13 +0200
> > Igor Mammedov <imammedo@redhat.com> wrote:
> >   
> > > On Fri,  2 Aug 2024 23:44:01 +0200
> > > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> > >     
> > > > Provide a generic interface for error injection via GHESv2.
> > > > 
> > > > This patch is co-authored:
> > > >     - original ghes logic to inject a simple ARM record by Shiju Jose;
> > > >     - generic logic to handle block addresses by Jonathan Cameron;
> > > >     - generic GHESv2 error inject by Mauro Carvalho Chehab;
> > > > 
> > > > Co-authored-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > Co-authored-by: Shiju Jose <shiju.jose@huawei.com>
> > > > Co-authored-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > > Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > Cc: Shiju Jose <shiju.jose@huawei.com>
> > > > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > > ---
> > > >  hw/acpi/ghes.c         | 159 ++++++++++++++++++++++++++++++++++++++---
> > > >  hw/acpi/ghes_cper.c    |   2 +-
> > > >  include/hw/acpi/ghes.h |   3 +
> > > >  3 files changed, 152 insertions(+), 12 deletions(-)
> > > > 
> > > > diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> > > > index a745dcc7be5e..e125c9475773 100644
> > > > --- a/hw/acpi/ghes.c
> > > > +++ b/hw/acpi/ghes.c
> > > > @@ -395,23 +395,22 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
> > > >      ags->present = true;
> > > >  }
> > > >  
> > > > +static uint64_t ghes_get_state_start_address(void)      
> > > 
> > > ghes_get_hardware_errors_address() might better reflect what address it will return
> > >     
> > > > +{
> > > > +    AcpiGedState *acpi_ged_state =
> > > > +        ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED, NULL));
> > > > +    AcpiGhesState *ags = &acpi_ged_state->ghes_state;
> > > > +
> > > > +    return le64_to_cpu(ags->ghes_addr_le);
> > > > +}
> > > > +
> > > >  int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
> > > >  {
> > > >      uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
> > > > -    uint64_t start_addr;
> > > > +    uint64_t start_addr = ghes_get_state_start_address();
> > > >      bool ret = -1;
> > > > -    AcpiGedState *acpi_ged_state;
> > > > -    AcpiGhesState *ags;
> > > > -
> > > >      assert(source_id < ACPI_HEST_SRC_ID_RESERVED);
> > > >  
> > > > -    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
> > > > -                                                       NULL));
> > > > -    g_assert(acpi_ged_state);
> > > > -    ags = &acpi_ged_state->ghes_state;
> > > > -
> > > > -    start_addr = le64_to_cpu(ags->ghes_addr_le);
> > > > -
> > > >      if (physical_address) {
> > > >          start_addr += source_id * sizeof(uint64_t);      
> > > 
> > > above should be a separate patch
> > >     
> > > >  
> > > > @@ -448,9 +447,147 @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
> > > >      return ret;
> > > >  }
> > > >  
> > > > +/*
> > > > + * Error register block data layout
> > > > + *
> > > > + * | +---------------------+ ges.ghes_addr_le
> > > > + * | |error_block_address0 |
> > > > + * | +---------------------+
> > > > + * | |error_block_address1 |
> > > > + * | +---------------------+ --+--
> > > > + * | |    .............    | GHES_ADDRESS_SIZE
> > > > + * | +---------------------+ --+--
> > > > + * | |error_block_addressN |
> > > > + * | +---------------------+
> > > > + * | | read_ack0           |
> > > > + * | +---------------------+ --+--
> > > > + * | | read_ack1           | GHES_ADDRESS_SIZE
> > > > + * | +---------------------+ --+--
> > > > + * | |   .............     |
> > > > + * | +---------------------+
> > > > + * | | read_ackN           |
> > > > + * | +---------------------+ --+--
> > > > + * | |      CPER           |   |
> > > > + * | |      ....           | GHES_MAX_RAW_DATA_LENGT
> > > > + * | |      CPER           |   |
> > > > + * | +---------------------+ --+--
> > > > + * | |    ..........       |
> > > > + * | +---------------------+
> > > > + * | |      CPER           |
> > > > + * | |      ....           |
> > > > + * | |      CPER           |
> > > > + * | +---------------------+
> > > > + */      
> > > 
> > > no need to duplicate docs/specs/acpi_hest_ghes.rst,
> > > I'd just reffer to it and maybe add short comment as to why it's mentioned.
> > >     
> > > > +/* Map from uint32_t notify to entry offset in GHES */
> > > > +static const uint8_t error_source_to_index[] = { 0xff, 0xff, 0xff, 0xff,
> > > > +                                                 0xff, 0xff, 0xff, 1, 0};
> > > > +
> > > > +static bool ghes_get_addr(uint32_t notify, uint64_t *error_block_addr,
> > > > +                          uint64_t *read_ack_addr)
> > > > +{
> > > > +    uint64_t base;
> > > > +
> > > > +    if (notify >= ACPI_GHES_NOTIFY_RESERVED) {
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    /* Find and check the source id for this new CPER */
> > > > +    if (error_source_to_index[notify] == 0xff) {
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    base = ghes_get_state_start_address();
> > > > +
> > > > +    *read_ack_addr = base +
> > > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > > +        error_source_to_index[notify] * sizeof(uint64_t);
> > > > +
> > > > +    /* Could also be read back from the error_block_address register */
> > > > +    *error_block_addr = base +
> > > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > > +        error_source_to_index[notify] * ACPI_GHES_MAX_RAW_DATA_LENGTH;
> > > > +
> > > > +    return true;
> > > > +}      
> > > 
> > > I don't like all this pointer math, which is basically a reverse engineered
> > > QEMU actions on startup + guest provided etc/hardware_errors address.
> > > 
> > > For once, it assumes error_source_to_index[] matches order in which HEST
> > > error sources were described, which is fragile.
> > > 
> > > 2nd: migration-wive it's disaster, since old/new HEST/hardware_errors tables
> > > in RAM migrated from older version might not match above assumptions
> > > of target QEMU. 
> > > 
> > > I see 2 ways to rectify it:
> > >   1st: preferred/cleanest would be to tell QEMU (via fw_cfg) address of HEST table
> > >        in guest RAM, like we do with etc/hardware_errors, see
> > >             build_ghes_error_table()
> > >                ...
> > >                tell firmware to write hardware_errors GPA into
> > >        and then fetch from HEST table in RAM, the guest patched error/ack addresses
> > >        for given source_id
> > > 
> > >        code-wise: relatively simple once one wraps their own head over
> > >                  how this whole APEI thing works in QEMU
> > >                  workflow  is described in docs/specs/acpi_hest_ghes.rst
> > >                  look to me as sufficient to grasp it.
> > >                  (but my view is very biased given my prior knowledge,
> > >                   aka: docs/comments/examples wrt acpi patching are good enough)
> > >                  (if it's not clear how to do it, ask me for pointers)    
> > 
> > Hi Igor, I think I follow what you mean but maybe this question will reveal
> > otherwise.  HEST is currently in ACPI_BUILD_TABLE_FILE.
> > Would you suggest splitting it to it's own file, or using table_offsets
> > to get the offset in ACPI_BUILD_TABLE_FILE GPA?  
> yep, offset taken right before HEST is to be created
> doc comment for bios_linker_loader_write_pointer() explains how it works
> 
> we need something like:
>        bios_linker_loader_write_pointer(linker,
>            ACPI_HEST_TABLE_ADDR_FW_CFG_FILE, 0, sizeof(uint64_t),
>            ACPI_BUILD_TABLE_FILE, hest_offset_within_ACPI_BUILD_TABLE_FILE); 
> 
> to register new file see:
>    a08a64627 ACPI: Record the Generic Error Status Block address
>    and to avoid copy past error maybe
>    136fc6aa2 ACPI: Avoid infinite recursion when dump-vmstat
> for this needs to be limited to new machine types and keep
> old ones without this new feature. (I'd use hw_compat_ machinery for that)

Not sure if I got it. The code, after this patch from my v6:

	https://lore.kernel.org/qemu-devel/5710c364d7ef6cdab6b2f1e127ef191bdf84e8c2.1723119423.git.mchehab+huawei@kernel.org/T/#u

Already stores two of the three address offsets via 
bios_linker_loader_add_pointer(), e. g. it is similar to the
code below (I simplified the code to make the example clearer):

<snip>
/* From hw/arm/virt-acpi-build.c */
static
void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
{
    ...
    if (vms->ras) {
        build_ghes_error_table(tables->hardware_errors, tables->linker);
        acpi_add_table(table_offsets, tables_blob);
	/* internally, call build_ghes_v2() for SEA and GED notification sources */
        acpi_build_hest(tables_blob, tables->linker, vms->oem_id,
                        vms->oem_table_id);
    }
    ...
}

/* From hw/acpi/ghes.c */
static void build_ghes_v2(GArray *table_data,
                          enum AcpiGhesNotifyType notify,
                          BIOSLinker *linker)
{
    uint64_t address_offset, ack_offset, block_addr_offset, cper_offset;
    enum AcpiHestSourceId source_id;

    /* 
     * Get offsets for either SEA or GED notification - easy to extend
     * to all mechanisms like MCE and SCI to better support x86
     */
    assert(!acpi_hest_address_offset(notify, &block_addr_offset, &ack_offset,
                                     &cper_offset, &source_id));

    bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
                                   address_offset + GAS_ADDR_OFFSET,
                                   sizeof(uint64_t),
                                   ACPI_GHES_ERRORS_FW_CFG_FILE,
                                   block_addr_offset);

    bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
                                   address_offset + GAS_ADDR_OFFSET,
                                   sizeof(uint64_t),
                                   ACPI_GHES_ERRORS_FW_CFG_FILE,
                                   ack_offset);

    /* Current code ignores &cper_offset when creating HEST */
}

void ghes_record_cper_errors(AcpiGhesCper *cper, Error **errp,
                             enum AcpiGhesNotifyType notify)
{
    uint64_t cper_addr, read_ack_start_addr;

    assert(!ghes_get_hardware_errors_address(notify, NULL, &read_ack_start_addr,
                                         &cper_addr, NULL));

    /*
     * Use cpu_physical_memory_read/write() to
     *  - read/store at read_ack_start_addr 
     *  - Write cper block GArray at cper_addr
     */
}
</snip>

We may also store cper_offset there via bios_linker_loader_add_pointer()
and/or use bios_linker_loader_write_pointer(), but I can't see how the
data stored there can be retrieved, nor any advantage of using it instead
of the current code, as, in the end, we'll have 3 addresses that will be
used:

	- an address where a pointer to CPER record will be stored;
	- an address where the ack will be stored;
	- an address where the actual CPER record will be stored.

And those are calculated on a single function and are all stored at the
ACPI table files.

What am I missing?

Thanks,
Mauro
Igor Mammedov Aug. 12, 2024, 9:39 a.m. UTC | #10
On Thu, 8 Aug 2024 20:19:03 +0200
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> Em Thu, 8 Aug 2024 10:11:07 +0200
> Igor Mammedov <imammedo@redhat.com> escreveu:
> 
> > On Wed, 7 Aug 2024 15:25:47 +0100
> > Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:
> >   
> > > On Tue, 6 Aug 2024 16:31:13 +0200
> > > Igor Mammedov <imammedo@redhat.com> wrote:
> > >     
> > > > On Fri,  2 Aug 2024 23:44:01 +0200
> > > > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> > > >       
> > > > > Provide a generic interface for error injection via GHESv2.
> > > > > 
> > > > > This patch is co-authored:
> > > > >     - original ghes logic to inject a simple ARM record by Shiju Jose;
> > > > >     - generic logic to handle block addresses by Jonathan Cameron;
> > > > >     - generic GHESv2 error inject by Mauro Carvalho Chehab;
> > > > > 
> > > > > Co-authored-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > > Co-authored-by: Shiju Jose <shiju.jose@huawei.com>
> > > > > Co-authored-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > > > Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > > Cc: Shiju Jose <shiju.jose@huawei.com>
> > > > > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > > > ---
> > > > >  hw/acpi/ghes.c         | 159 ++++++++++++++++++++++++++++++++++++++---
> > > > >  hw/acpi/ghes_cper.c    |   2 +-
> > > > >  include/hw/acpi/ghes.h |   3 +
> > > > >  3 files changed, 152 insertions(+), 12 deletions(-)
> > > > > 
> > > > > diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> > > > > index a745dcc7be5e..e125c9475773 100644
> > > > > --- a/hw/acpi/ghes.c
> > > > > +++ b/hw/acpi/ghes.c
> > > > > @@ -395,23 +395,22 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
> > > > >      ags->present = true;
> > > > >  }
> > > > >  
> > > > > +static uint64_t ghes_get_state_start_address(void)        
> > > > 
> > > > ghes_get_hardware_errors_address() might better reflect what address it will return
> > > >       
> > > > > +{
> > > > > +    AcpiGedState *acpi_ged_state =
> > > > > +        ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED, NULL));
> > > > > +    AcpiGhesState *ags = &acpi_ged_state->ghes_state;
> > > > > +
> > > > > +    return le64_to_cpu(ags->ghes_addr_le);
> > > > > +}
> > > > > +
> > > > >  int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
> > > > >  {
> > > > >      uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
> > > > > -    uint64_t start_addr;
> > > > > +    uint64_t start_addr = ghes_get_state_start_address();
> > > > >      bool ret = -1;
> > > > > -    AcpiGedState *acpi_ged_state;
> > > > > -    AcpiGhesState *ags;
> > > > > -
> > > > >      assert(source_id < ACPI_HEST_SRC_ID_RESERVED);
> > > > >  
> > > > > -    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
> > > > > -                                                       NULL));
> > > > > -    g_assert(acpi_ged_state);
> > > > > -    ags = &acpi_ged_state->ghes_state;
> > > > > -
> > > > > -    start_addr = le64_to_cpu(ags->ghes_addr_le);
> > > > > -
> > > > >      if (physical_address) {
> > > > >          start_addr += source_id * sizeof(uint64_t);        
> > > > 
> > > > above should be a separate patch
> > > >       
> > > > >  
> > > > > @@ -448,9 +447,147 @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
> > > > >      return ret;
> > > > >  }
> > > > >  
> > > > > +/*
> > > > > + * Error register block data layout
> > > > > + *
> > > > > + * | +---------------------+ ges.ghes_addr_le
> > > > > + * | |error_block_address0 |
> > > > > + * | +---------------------+
> > > > > + * | |error_block_address1 |
> > > > > + * | +---------------------+ --+--
> > > > > + * | |    .............    | GHES_ADDRESS_SIZE
> > > > > + * | +---------------------+ --+--
> > > > > + * | |error_block_addressN |
> > > > > + * | +---------------------+
> > > > > + * | | read_ack0           |
> > > > > + * | +---------------------+ --+--
> > > > > + * | | read_ack1           | GHES_ADDRESS_SIZE
> > > > > + * | +---------------------+ --+--
> > > > > + * | |   .............     |
> > > > > + * | +---------------------+
> > > > > + * | | read_ackN           |
> > > > > + * | +---------------------+ --+--
> > > > > + * | |      CPER           |   |
> > > > > + * | |      ....           | GHES_MAX_RAW_DATA_LENGT
> > > > > + * | |      CPER           |   |
> > > > > + * | +---------------------+ --+--
> > > > > + * | |    ..........       |
> > > > > + * | +---------------------+
> > > > > + * | |      CPER           |
> > > > > + * | |      ....           |
> > > > > + * | |      CPER           |
> > > > > + * | +---------------------+
> > > > > + */        
> > > > 
> > > > no need to duplicate docs/specs/acpi_hest_ghes.rst,
> > > > I'd just reffer to it and maybe add short comment as to why it's mentioned.
> > > >       
> > > > > +/* Map from uint32_t notify to entry offset in GHES */
> > > > > +static const uint8_t error_source_to_index[] = { 0xff, 0xff, 0xff, 0xff,
> > > > > +                                                 0xff, 0xff, 0xff, 1, 0};
> > > > > +
> > > > > +static bool ghes_get_addr(uint32_t notify, uint64_t *error_block_addr,
> > > > > +                          uint64_t *read_ack_addr)
> > > > > +{
> > > > > +    uint64_t base;
> > > > > +
> > > > > +    if (notify >= ACPI_GHES_NOTIFY_RESERVED) {
> > > > > +        return false;
> > > > > +    }
> > > > > +
> > > > > +    /* Find and check the source id for this new CPER */
> > > > > +    if (error_source_to_index[notify] == 0xff) {
> > > > > +        return false;
> > > > > +    }
> > > > > +
> > > > > +    base = ghes_get_state_start_address();
> > > > > +
> > > > > +    *read_ack_addr = base +
> > > > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > > > +        error_source_to_index[notify] * sizeof(uint64_t);
> > > > > +
> > > > > +    /* Could also be read back from the error_block_address register */
> > > > > +    *error_block_addr = base +
> > > > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > > > +        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
> > > > > +        error_source_to_index[notify] * ACPI_GHES_MAX_RAW_DATA_LENGTH;
> > > > > +
> > > > > +    return true;
> > > > > +}        
> > > > 
> > > > I don't like all this pointer math, which is basically a reverse engineered
> > > > QEMU actions on startup + guest provided etc/hardware_errors address.
> > > > 
> > > > For once, it assumes error_source_to_index[] matches order in which HEST
> > > > error sources were described, which is fragile.
> > > > 
> > > > 2nd: migration-wive it's disaster, since old/new HEST/hardware_errors tables
> > > > in RAM migrated from older version might not match above assumptions
> > > > of target QEMU. 
> > > > 
> > > > I see 2 ways to rectify it:
> > > >   1st: preferred/cleanest would be to tell QEMU (via fw_cfg) address of HEST table
> > > >        in guest RAM, like we do with etc/hardware_errors, see
> > > >             build_ghes_error_table()
> > > >                ...
> > > >                tell firmware to write hardware_errors GPA into
> > > >        and then fetch from HEST table in RAM, the guest patched error/ack addresses
> > > >        for given source_id
> > > > 
> > > >        code-wise: relatively simple once one wraps their own head over
> > > >                  how this whole APEI thing works in QEMU
> > > >                  workflow  is described in docs/specs/acpi_hest_ghes.rst
> > > >                  look to me as sufficient to grasp it.
> > > >                  (but my view is very biased given my prior knowledge,
> > > >                   aka: docs/comments/examples wrt acpi patching are good enough)
> > > >                  (if it's not clear how to do it, ask me for pointers)      
> > > 
> > > Hi Igor, I think I follow what you mean but maybe this question will reveal
> > > otherwise.  HEST is currently in ACPI_BUILD_TABLE_FILE.
> > > Would you suggest splitting it to it's own file, or using table_offsets
> > > to get the offset in ACPI_BUILD_TABLE_FILE GPA?    
> > yep, offset taken right before HEST is to be created
> > doc comment for bios_linker_loader_write_pointer() explains how it works
> > 
> > we need something like:
> >        bios_linker_loader_write_pointer(linker,
> >            ACPI_HEST_TABLE_ADDR_FW_CFG_FILE, 0, sizeof(uint64_t),
> >            ACPI_BUILD_TABLE_FILE, hest_offset_within_ACPI_BUILD_TABLE_FILE); 
> > 
> > to register new file see:
> >    a08a64627 ACPI: Record the Generic Error Status Block address
> >    and to avoid copy past error maybe
> >    136fc6aa2 ACPI: Avoid infinite recursion when dump-vmstat
> > for this needs to be limited to new machine types and keep
> > old ones without this new feature. (I'd use hw_compat_ machinery for that)  
> 
> Not sure if I got it. The code, after this patch from my v6:
> 
> 	https://lore.kernel.org/qemu-devel/5710c364d7ef6cdab6b2f1e127ef191bdf84e8c2.1723119423.git.mchehab+huawei@kernel.org/T/#u
> 
> Already stores two of the three address offsets via 
> bios_linker_loader_add_pointer(), e. g. it is similar to the
> code below (I simplified the code to make the example clearer):
> 
> <snip>
> /* From hw/arm/virt-acpi-build.c */
> static
> void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
> {
>     ...
>     if (vms->ras) {
>         build_ghes_error_table(tables->hardware_errors, tables->linker);
>         acpi_add_table(table_offsets, tables_blob);
> 	/* internally, call build_ghes_v2() for SEA and GED notification sources */
>         acpi_build_hest(tables_blob, tables->linker, vms->oem_id,
>                         vms->oem_table_id);
>     }
>     ...
> }
> 
> /* From hw/acpi/ghes.c */
> static void build_ghes_v2(GArray *table_data,
>                           enum AcpiGhesNotifyType notify,
>                           BIOSLinker *linker)
> {
>     uint64_t address_offset, ack_offset, block_addr_offset, cper_offset;
>     enum AcpiHestSourceId source_id;
> 
>     /* 
>      * Get offsets for either SEA or GED notification - easy to extend
>      * to all mechanisms like MCE and SCI to better support x86
>      */
>     assert(!acpi_hest_address_offset(notify, &block_addr_offset, &ack_offset,
>                                      &cper_offset, &source_id));
> 
>     bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
>                                    address_offset + GAS_ADDR_OFFSET,
>                                    sizeof(uint64_t),
>                                    ACPI_GHES_ERRORS_FW_CFG_FILE,
>                                    block_addr_offset);
> 
>     bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
>                                    address_offset + GAS_ADDR_OFFSET,
>                                    sizeof(uint64_t),
>                                    ACPI_GHES_ERRORS_FW_CFG_FILE,
>                                    ack_offset);
> 
>     /* Current code ignores &cper_offset when creating HEST */
> }
> 
> void ghes_record_cper_errors(AcpiGhesCper *cper, Error **errp,
>                              enum AcpiGhesNotifyType notify)
> {
>     uint64_t cper_addr, read_ack_start_addr;
> 
>     assert(!ghes_get_hardware_errors_address(notify, NULL, &read_ack_start_addr,
>                                          &cper_addr, NULL));
> 
>     /*
>      * Use cpu_physical_memory_read/write() to
>      *  - read/store at read_ack_start_addr 
>      *  - Write cper block GArray at cper_addr
>      */
> }
> </snip>
> 
> We may also store cper_offset there via bios_linker_loader_add_pointer()
> and/or use bios_linker_loader_write_pointer(), but I can't see how the
> data stored there can be retrieved, nor any advantage of using it instead
> of the current code, as, in the end, we'll have 3 addresses that will be
> used:
> 
> 	- an address where a pointer to CPER record will be stored;
> 	- an address where the ack will be stored;
> 	- an address where the actual CPER record will be stored.
> 
> And those are calculated on a single function and are all stored at the
> ACPI table files.
>
> What am I missing?

That's basically (2) approach and it works to some degree,
unfortunately it's fragile when we start talking about migration
and changing layout in the future.

Lets take as example increasing size of 1) 'Generic Error Status Block',
we are considering. Old QEMU will, tell firmware to allocate 1K buffer
for it and calculated offsets to [1] (that you've stored/calculated) will
include this assumption.
Then in newer we QEMU increase size of [1] and all hardcoded offsets will
account for new size, but if we migrate guest from old QEMU to this newer
one all HEST tables layout within guest will match old QEMU assumptions,
and as result newer QEMU with larger block size will write CPERs at wrong
address considering we are still running guest from old QEMU.
That's just one example.

To make it work there a number of ways, but the ultimate goal is to pick
one that's the least fragile and won't snowball in maintenance nightmare
as number of GHES sources increases over time.

This series tries to solve problem of mapping GHES source to
a corresponding 'Generic Error Status Block' and related registers.
However we are missing access to this mapping since it only
exists in guest patched HEST (i.e in guest RAM only).

The robust way to make it work would be for QEMU to get a pointer
to whole HEST table and then enumerate GHES sources and related
error/ack registers directly from guest RAM (sidestepping layout
change issues this way).

what I'm proposing is to use bios_linker_loader_write_pointer()
(only once) so that firmware could tell QEMU address of HEST table,
in which one can find a GHES source and always correct error/ack
pointers (regardless of table[s] layout changes).


> Thanks,
> Mauro
>
Mauro Carvalho Chehab Aug. 13, 2024, 6:59 p.m. UTC | #11
Em Mon, 12 Aug 2024 11:39:00 +0200
Igor Mammedov <imammedo@redhat.com> escreveu:

> > We may also store cper_offset there via bios_linker_loader_add_pointer()
> > and/or use bios_linker_loader_write_pointer(), but I can't see how the
> > data stored there can be retrieved, nor any advantage of using it instead
> > of the current code, as, in the end, we'll have 3 addresses that will be
> > used:
> > 
> > 	- an address where a pointer to CPER record will be stored;
> > 	- an address where the ack will be stored;
> > 	- an address where the actual CPER record will be stored.
> > 
> > And those are calculated on a single function and are all stored at the
> > ACPI table files.
> >
> > What am I missing?  
> 
> That's basically (2) approach and it works to some degree,
> unfortunately it's fragile when we start talking about migration
> and changing layout in the future.
> 
> Lets take as example increasing size of 1) 'Generic Error Status Block',
> we are considering. Old QEMU will, tell firmware to allocate 1K buffer
> for it and calculated offsets to [1] (that you've stored/calculated) will
> include this assumption.
> Then in newer we QEMU increase size of [1] and all hardcoded offsets will
> account for new size, but if we migrate guest from old QEMU to this newer
> one all HEST tables layout within guest will match old QEMU assumptions,
> and as result newer QEMU with larger block size will write CPERs at wrong
> address considering we are still running guest from old QEMU.
> That's just one example.
> 
> To make it work there a number of ways, but the ultimate goal is to pick
> one that's the least fragile and won't snowball in maintenance nightmare
> as number of GHES sources increases over time.
> 
> This series tries to solve problem of mapping GHES source to
> a corresponding 'Generic Error Status Block' and related registers.
> However we are missing access to this mapping since it only
> exists in guest patched HEST (i.e in guest RAM only).
> 
> The robust way to make it work would be for QEMU to get a pointer
> to whole HEST table and then enumerate GHES sources and related
> error/ack registers directly from guest RAM (sidestepping layout
> change issues this way).
> 
> what I'm proposing is to use bios_linker_loader_write_pointer()
> (only once) so that firmware could tell QEMU address of HEST table,
> in which one can find a GHES source and always correct error/ack
> pointers (regardless of table[s] layout changes).

Ok, got it. Such change was not easy, but I finally figured out how
to make it actually work.

I'll address tomorrow your comment on patch 5/10 about using raw data also 
for the other parts of CPER (generic error status and generic error data).

If you want to do a sneak peak, I'm keeping the latest development
version here:

	https://gitlab.com/mchehab_kernel/qemu/-/commits/qemu_submission?ref_type=heads

In particular, the patch changing from /etc/hardware_errors offset to
a HEST offset is at:

	https://gitlab.com/mchehab_kernel/qemu/-/commit/9197d22de09df97ce3d6725cb21bd2114c2eb43c

It contains several cleanups to make the logic clearer and more robust.

Thanks,
Mauro
diff mbox series

Patch

diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index a745dcc7be5e..e125c9475773 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -395,23 +395,22 @@  void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
     ags->present = true;
 }
 
+static uint64_t ghes_get_state_start_address(void)
+{
+    AcpiGedState *acpi_ged_state =
+        ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED, NULL));
+    AcpiGhesState *ags = &acpi_ged_state->ghes_state;
+
+    return le64_to_cpu(ags->ghes_addr_le);
+}
+
 int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
 {
     uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
-    uint64_t start_addr;
+    uint64_t start_addr = ghes_get_state_start_address();
     bool ret = -1;
-    AcpiGedState *acpi_ged_state;
-    AcpiGhesState *ags;
-
     assert(source_id < ACPI_HEST_SRC_ID_RESERVED);
 
-    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
-                                                       NULL));
-    g_assert(acpi_ged_state);
-    ags = &acpi_ged_state->ghes_state;
-
-    start_addr = le64_to_cpu(ags->ghes_addr_le);
-
     if (physical_address) {
         start_addr += source_id * sizeof(uint64_t);
 
@@ -448,9 +447,147 @@  int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
     return ret;
 }
 
+/*
+ * Error register block data layout
+ *
+ * | +---------------------+ ges.ghes_addr_le
+ * | |error_block_address0 |
+ * | +---------------------+
+ * | |error_block_address1 |
+ * | +---------------------+ --+--
+ * | |    .............    | GHES_ADDRESS_SIZE
+ * | +---------------------+ --+--
+ * | |error_block_addressN |
+ * | +---------------------+
+ * | | read_ack0           |
+ * | +---------------------+ --+--
+ * | | read_ack1           | GHES_ADDRESS_SIZE
+ * | +---------------------+ --+--
+ * | |   .............     |
+ * | +---------------------+
+ * | | read_ackN           |
+ * | +---------------------+ --+--
+ * | |      CPER           |   |
+ * | |      ....           | GHES_MAX_RAW_DATA_LENGT
+ * | |      CPER           |   |
+ * | +---------------------+ --+--
+ * | |    ..........       |
+ * | +---------------------+
+ * | |      CPER           |
+ * | |      ....           |
+ * | |      CPER           |
+ * | +---------------------+
+ */
+
+/* Map from uint32_t notify to entry offset in GHES */
+static const uint8_t error_source_to_index[] = { 0xff, 0xff, 0xff, 0xff,
+                                                 0xff, 0xff, 0xff, 1, 0};
+
+static bool ghes_get_addr(uint32_t notify, uint64_t *error_block_addr,
+                          uint64_t *read_ack_addr)
+{
+    uint64_t base;
+
+    if (notify >= ACPI_GHES_NOTIFY_RESERVED) {
+        return false;
+    }
+
+    /* Find and check the source id for this new CPER */
+    if (error_source_to_index[notify] == 0xff) {
+        return false;
+    }
+
+    base = ghes_get_state_start_address();
+
+    *read_ack_addr = base +
+        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
+        error_source_to_index[notify] * sizeof(uint64_t);
+
+    /* Could also be read back from the error_block_address register */
+    *error_block_addr = base +
+        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
+        ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t) +
+        error_source_to_index[notify] * ACPI_GHES_MAX_RAW_DATA_LENGTH;
+
+    return true;
+}
+
 NotifierList generic_error_notifiers =
     NOTIFIER_LIST_INITIALIZER(error_device_notifiers);
 
+void ghes_record_cper_errors(AcpiGhesCper *cper, Error **errp,
+                             uint32_t notify)
+{
+    int read_ack = 0;
+    uint32_t i;
+    uint64_t read_ack_addr = 0;
+    uint64_t error_block_addr = 0;
+    uint32_t data_length;
+    GArray *block;
+
+    if (!ghes_get_addr(notify, &error_block_addr, &read_ack_addr)) {
+        error_setg(errp, "GHES: Invalid error block/ack address(es)");
+        return;
+    }
+
+    cpu_physical_memory_read(read_ack_addr,
+                             &read_ack, sizeof(uint64_t));
+
+    /* zero means OSPM does not acknowledge the error */
+    if (!read_ack) {
+        error_setg(errp,
+                   "Last CPER record was not acknowledged yet");
+        read_ack = 1;
+        cpu_physical_memory_write(read_ack_addr,
+                                  &read_ack, sizeof(uint64_t));
+        return;
+    }
+
+    read_ack = cpu_to_le64(0);
+    cpu_physical_memory_write(read_ack_addr,
+                              &read_ack, sizeof(uint64_t));
+
+    /* Build CPER record */
+
+    /*
+     * Invalid fru id: ACPI 4.0: 17.3.2.6.1 Generic Error Data,
+     * Table 17-13 Generic Error Data Entry
+     */
+    QemuUUID fru_id = {};
+
+    block = g_array_new(false, true /* clear */, 1);
+    data_length = ACPI_GHES_DATA_LENGTH + cper->data_len;
+
+    /*
+        * It should not run out of the preallocated memory if
+        * adding a new generic error data entry
+        */
+    assert((data_length + ACPI_GHES_GESB_SIZE) <=
+            ACPI_GHES_MAX_RAW_DATA_LENGTH);
+
+    /* Build the new generic error status block header */
+    acpi_ghes_generic_error_status(block, ACPI_GEBS_UNCORRECTABLE,
+                                    0, 0, data_length,
+                                    ACPI_CPER_SEV_RECOVERABLE);
+
+    /* Build this new generic error data entry header */
+    acpi_ghes_generic_error_data(block, cper->guid,
+                                ACPI_CPER_SEV_RECOVERABLE, 0, 0,
+                                cper->data_len, fru_id, 0);
+
+    /* Add CPER data */
+    for (i = 0; i < cper->data_len; i++) {
+        build_append_int_noprefix(block, cper->data[i], 1);
+    }
+
+    /* Write the generic error data entry into guest memory */
+    cpu_physical_memory_write(error_block_addr, block->data, block->len);
+
+    g_array_free(block, true);
+
+    notifier_list_notify(&generic_error_notifiers, NULL);
+}
+
 bool acpi_ghes_present(void)
 {
     AcpiGedState *acpi_ged_state;
diff --git a/hw/acpi/ghes_cper.c b/hw/acpi/ghes_cper.c
index 7aa7e71e90dc..d7ff7debee74 100644
--- a/hw/acpi/ghes_cper.c
+++ b/hw/acpi/ghes_cper.c
@@ -39,7 +39,7 @@  void qmp_ghes_cper(CommonPlatformErrorRecord *qmp_cper,
         return;
     }
 
-    /* TODO: call a function at ghes */
+    ghes_record_cper_errors(&cper, errp, ACPI_GHES_NOTIFY_GPIO);
 
     g_free(cper.data);
 }
diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
index 06a5b8820cd5..ee6f6cd96911 100644
--- a/include/hw/acpi/ghes.h
+++ b/include/hw/acpi/ghes.h
@@ -85,6 +85,9 @@  typedef struct AcpiGhesCper {
     size_t data_len;
 } AcpiGhesCper;
 
+void ghes_record_cper_errors(AcpiGhesCper *cper, Error **errp,
+                             uint32_t notify);
+
 /**
  * acpi_ghes_present: Report whether ACPI GHES table is present
  *