diff mbox series

[v5,1/3] i386: Fix MCE support for AMD hosts

Message ID 20240603193622.47156-2-john.allen@amd.com
State New
Headers show
Series Fix MCE handling on AMD hosts | expand

Commit Message

John Allen June 3, 2024, 7:36 p.m. UTC
For the most part, AMD hosts can use the same MCE injection code as Intel, but
there are instances where the qemu implementation is Intel specific. First, MCE
delivery works differently on AMD and does not support broadcast. Second,
kvm_mce_inject generates MCEs that include a number of Intel specific status
bits. Modify kvm_mce_inject to properly generate MCEs on AMD platforms.

Reported-by: William Roche <william.roche@oracle.com>
Signed-off-by: John Allen <john.allen@amd.com>
---
v3:
  - Update to latest qemu code that introduces using MCG_STATUS_RIPV in the
    case of a BUS_MCEERR_AR on a non-AMD machine.
v5:
  - This version implements proper support for AO (deferred) errors. For
    deferred errors, there are some differences in the flags that need to be
    set in order to be handled correctly. First, the UC flag must not be set
    to allow deferred errors to be picked up by the machine_check_poll
    function in the guest kernel, mirroring baremetal behavior. If the UC
    flag is set, the injected MCE will be sent as an interrupt and will be
    handled in do_machine_check which currently is not expected to handle
    deferred errors and would need significant modification to do so.

  - Both the deferred (AO) and uncorrected (AR) error cases set the POISON
    bit. For the uncorrected error case, this is expected and properly
    mirrors baremetal behavior. For the deferred error case, this bit would
    not be set by hardware, but is being reused as a convenient workaround
    to give better behavior in the case that the MCE_KILL_EARLY flag is set
    in the guest process. See code comment for more details.
---
 target/i386/cpu.h     |  2 ++
 target/i386/helper.c  |  4 ++++
 target/i386/kvm/kvm.c | 39 +++++++++++++++++++++++++++++++--------
 3 files changed, 37 insertions(+), 8 deletions(-)
diff mbox series

Patch

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index dfe43b8204..ddf53937b4 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -365,6 +365,8 @@  typedef enum X86Seg {
 #define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
 #define MCI_STATUS_S     (1ULL<<56)  /* Signaled machine check */
 #define MCI_STATUS_AR    (1ULL<<55)  /* Action required */
+#define MCI_STATUS_DEFERRED    (1ULL<<44)  /* Deferred error */
+#define MCI_STATUS_POISON      (1ULL<<43)  /* Poisoned data consumed */
 
 /* MISC register defines */
 #define MCM_ADDR_SEGOFF  0      /* segment offset */
diff --git a/target/i386/helper.c b/target/i386/helper.c
index 2070dd0dda..6f6b0f02e0 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -91,6 +91,10 @@  int cpu_x86_support_mca_broadcast(CPUX86State *env)
     int family = 0;
     int model = 0;
 
+    if (IS_AMD_CPU(env)) {
+        return 0;
+    }
+
     cpu_x86_version(env, &family, &model);
     if ((family == 6 && model >= 14) || family > 6) {
         return 1;
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 42970ab046..810f586a59 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -582,17 +582,40 @@  static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
 {
     CPUState *cs = CPU(cpu);
     CPUX86State *env = &cpu->env;
-    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
-                      MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
-    uint64_t mcg_status = MCG_STATUS_MCIP;
+    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_MISCV |
+                      MCI_STATUS_ADDRV;
+    uint64_t mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
     int flags = 0;
 
-    if (code == BUS_MCEERR_AR) {
-        status |= MCI_STATUS_AR | 0x134;
-        mcg_status |= MCG_STATUS_RIPV | MCG_STATUS_EIPV;
+    if (!IS_AMD_CPU(env)) {
+        status |= MCI_STATUS_S | MCI_STATUS_UC;
+        if (code == BUS_MCEERR_AR) {
+            status |= MCI_STATUS_AR | 0x134;
+            mcg_status |= MCG_STATUS_EIPV;
+        } else {
+            status |= 0xc0;
+        }
     } else {
-        status |= 0xc0;
-        mcg_status |= MCG_STATUS_RIPV;
+        if (code == BUS_MCEERR_AR) {
+            status |= MCI_STATUS_UC | MCI_STATUS_POISON;
+            mcg_status |= MCG_STATUS_EIPV;
+        } else {
+            /* Setting the POISON bit for deferred errors indicates to the
+             * guest kernel that the address provided by the MCE is valid
+             * and usable which will ensure that the guest kernel will send
+             * a SIGBUS_AO signal to the guest process. This allows for
+             * more desirable behavior in the case that the guest process
+             * with poisoned memory has set the MCE_KILL_EARLY prctl flag
+             * which indicates that the process would prefer to handle or
+             * shutdown due to the poisoned memory condition before the
+             * memory has been accessed.
+             *
+             * While the POISON bit would not be set in a deferred error
+             * sent from hardware, the bit is not meaningful for deferred
+             * errors and can be reused in this scenario.
+             */
+            status |= MCI_STATUS_DEFERRED | MCI_STATUS_POISON;
+        }
     }
 
     flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;