diff mbox series

hw/intc: riscv-imsic: Fix interrupt state updates.

Message ID a7604e4d61068ca4d384ae2a1377e1521d4d0235.1725651699.git.tjeznach@rivosinc.com
State New
Headers show
Series hw/intc: riscv-imsic: Fix interrupt state updates. | expand

Commit Message

Tomasz Jeznach Sept. 6, 2024, 8:23 p.m. UTC
The IMSIC state variable eistate[] is modified by CSR instructions
within a range dedicated to the local CPU and by MMIO writes from any CPU.
Access to eistate from MMIO accessors is protected by the BQL, but
read-modify-write (RMW) sequences from CSRRW do not acquire the BQL,
making the RMW sequence vulnerable to a race condition with MMIO access
from a remote CPU.

This race can manifest as missing IPI or MSI in multi-CPU systems, e.g.:

[   43.008092] watchdog: BUG: soft lockup - CPU#2 stuck for 27s! [kworker/u19:1:52]
[   43.011723] CPU: 2 UID: 0 PID: 52 Comm: kworker/u19:1 Not tainted 6.11.0-rc6
[   43.013070] Workqueue: events_unbound deferred_probe_work_func
[   43.018776] [<ffffffff800b4a86>] smp_call_function_many_cond+0x190/0x5c2
[   43.019205] [<ffffffff800b4f28>] on_each_cpu_cond_mask+0x20/0x32
[   43.019447] [<ffffffff8001069a>] __flush_tlb_range+0xf2/0x190
[   43.019683] [<ffffffff80010914>] flush_tlb_kernel_range+0x20/0x28

The interrupt line raise/lower sequence was changed to prevent a race
between the evaluation of the eistate and the execution of the qemu_irq
raise/lower, ensuring that the interrupt line is not incorrectly
deactivated based on a stale topei check result. To avoid holding BQL
all modifications of eistate are converted to atomic operations.

Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
---
 hw/intc/riscv_imsic.c | 50 +++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 18 deletions(-)


base-commit: fd1952d814da738ed107e05583b3e02ac11e88ff

Comments

Alistair Francis Sept. 9, 2024, 5:05 a.m. UTC | #1
On Sat, Sep 7, 2024 at 6:24 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> The IMSIC state variable eistate[] is modified by CSR instructions
> within a range dedicated to the local CPU and by MMIO writes from any CPU.
> Access to eistate from MMIO accessors is protected by the BQL, but
> read-modify-write (RMW) sequences from CSRRW do not acquire the BQL,
> making the RMW sequence vulnerable to a race condition with MMIO access
> from a remote CPU.
>
> This race can manifest as missing IPI or MSI in multi-CPU systems, eg:
>
> [   43.008092] watchdog: BUG: soft lockup - CPU#2 stuck for 27s! [kworker/u19:1:52]
> [   43.011723] CPU: 2 UID: 0 PID: 52 Comm: kworker/u19:1 Not tainted 6.11.0-rc6
> [   43.013070] Workqueue: events_unbound deferred_probe_work_func
> [   43.018776] [<ffffffff800b4a86>] smp_call_function_many_cond+0x190/0x5c2
> [   43.019205] [<ffffffff800b4f28>] on_each_cpu_cond_mask+0x20/0x32
> [   43.019447] [<ffffffff8001069a>] __flush_tlb_range+0xf2/0x190
> [   43.019683] [<ffffffff80010914>] flush_tlb_kernel_range+0x20/0x28
>
> The interrupt line raise/lower sequence was changed to prevent a race
> between the evaluation of the eistate and the execution of the qemu_irq
> raise/lower, ensuring that the interrupt line is not incorrectly
> deactivated based on a stale topei check result. To avoid holding BQL
> all modifications of eistate are converted to atomic operations.
>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>

Reviewed-by: Alistair Francis <alistair.francis@wdc.com>

Alistair

> ---
>  hw/intc/riscv_imsic.c | 50 +++++++++++++++++++++++++++----------------
>  1 file changed, 32 insertions(+), 18 deletions(-)
>
> diff --git a/hw/intc/riscv_imsic.c b/hw/intc/riscv_imsic.c
> index b90f0d731d..9ef65d4012 100644
> --- a/hw/intc/riscv_imsic.c
> +++ b/hw/intc/riscv_imsic.c
> @@ -55,7 +55,7 @@ static uint32_t riscv_imsic_topei(RISCVIMSICState *imsic, uint32_t page)
>                 (imsic->eithreshold[page] <= imsic->num_irqs)) ?
>                 imsic->eithreshold[page] : imsic->num_irqs;
>      for (i = 1; i < max_irq; i++) {
> -        if ((imsic->eistate[base + i] & IMSIC_EISTATE_ENPEND) ==
> +        if ((qatomic_read(&imsic->eistate[base + i]) & IMSIC_EISTATE_ENPEND) ==
>                  IMSIC_EISTATE_ENPEND) {
>              return (i << IMSIC_TOPEI_IID_SHIFT) | i;
>          }
> @@ -66,10 +66,24 @@ static uint32_t riscv_imsic_topei(RISCVIMSICState *imsic, uint32_t page)
>
>  static void riscv_imsic_update(RISCVIMSICState *imsic, uint32_t page)
>  {
> +    uint32_t base = page * imsic->num_irqs;
> +
> +    /*
> +     * Lower the interrupt line if necessary, then evaluate the current
> +     * IMSIC state.
> +     * This sequence ensures that any race between evaluating the eistate and
> +     * updating the interrupt line will not result in an incorrectly
> +     * deactivated connected CPU IRQ line.
> +     * If multiple interrupts are pending, this sequence functions identically
> +     * to qemu_irq_pulse.
> +     */
> +
> +    if (qatomic_fetch_and(&imsic->eistate[base], ~IMSIC_EISTATE_ENPEND)) {
> +        qemu_irq_lower(imsic->external_irqs[page]);
> +    }
>      if (imsic->eidelivery[page] && riscv_imsic_topei(imsic, page)) {
>          qemu_irq_raise(imsic->external_irqs[page]);
> -    } else {
> -        qemu_irq_lower(imsic->external_irqs[page]);
> +        qatomic_or(&imsic->eistate[base], IMSIC_EISTATE_ENPEND);
>      }
>  }
>
> @@ -125,12 +139,11 @@ static int riscv_imsic_topei_rmw(RISCVIMSICState *imsic, uint32_t page,
>          topei >>= IMSIC_TOPEI_IID_SHIFT;
>          base = page * imsic->num_irqs;
>          if (topei) {
> -            imsic->eistate[base + topei] &= ~IMSIC_EISTATE_PENDING;
> +            qatomic_and(&imsic->eistate[base + topei], ~IMSIC_EISTATE_PENDING);
>          }
> -
> -        riscv_imsic_update(imsic, page);
>      }
>
> +    riscv_imsic_update(imsic, page);
>      return 0;
>  }
>
> @@ -139,7 +152,7 @@ static int riscv_imsic_eix_rmw(RISCVIMSICState *imsic,
>                                 uint32_t num, bool pend, target_ulong *val,
>                                 target_ulong new_val, target_ulong wr_mask)
>  {
> -    uint32_t i, base;
> +    uint32_t i, base, prev;
>      target_ulong mask;
>      uint32_t state = (pend) ? IMSIC_EISTATE_PENDING : IMSIC_EISTATE_ENABLED;
>
> @@ -157,10 +170,6 @@ static int riscv_imsic_eix_rmw(RISCVIMSICState *imsic,
>
>      if (val) {
>          *val = 0;
> -        for (i = 0; i < xlen; i++) {
> -            mask = (target_ulong)1 << i;
> -            *val |= (imsic->eistate[base + i] & state) ? mask : 0;
> -        }
>      }
>
>      for (i = 0; i < xlen; i++) {
> @@ -172,10 +181,15 @@ static int riscv_imsic_eix_rmw(RISCVIMSICState *imsic,
>          mask = (target_ulong)1 << i;
>          if (wr_mask & mask) {
>              if (new_val & mask) {
> -                imsic->eistate[base + i] |= state;
> +                prev = qatomic_fetch_or(&imsic->eistate[base + i], state);
>              } else {
> -                imsic->eistate[base + i] &= ~state;
> +                prev = qatomic_fetch_and(&imsic->eistate[base + i], ~state);
>              }
> +        } else {
> +            prev = qatomic_read(&imsic->eistate[base + i]);
> +        }
> +        if (val && (prev & state)) {
> +            *val |= mask;
>          }
>      }
>
> @@ -302,14 +316,14 @@ static void riscv_imsic_write(void *opaque, hwaddr addr, uint64_t value,
>      page = addr >> IMSIC_MMIO_PAGE_SHIFT;
>      if ((addr & (IMSIC_MMIO_PAGE_SZ - 1)) == IMSIC_MMIO_PAGE_LE) {
>          if (value && (value < imsic->num_irqs)) {
> -            imsic->eistate[(page * imsic->num_irqs) + value] |=
> -                                                    IMSIC_EISTATE_PENDING;
> +            qatomic_or(&imsic->eistate[(page * imsic->num_irqs) + value],
> +                       IMSIC_EISTATE_PENDING);
> +
> +            /* Update CPU external interrupt status */
> +            riscv_imsic_update(imsic, page);
>          }
>      }
>
> -    /* Update CPU external interrupt status */
> -    riscv_imsic_update(imsic, page);
> -
>      return;
>
>  err:
>
> base-commit: fd1952d814da738ed107e05583b3e02ac11e88ff
> --
> 2.34.1
>
>
Alistair Francis Sept. 10, 2024, 4:58 a.m. UTC | #2
On Sat, Sep 7, 2024 at 6:24 AM Tomasz Jeznach <tjeznach@rivosinc.com> wrote:
>
> The IMSIC state variable eistate[] is modified by CSR instructions
> within a range dedicated to the local CPU and by MMIO writes from any CPU.
> Access to eistate from MMIO accessors is protected by the BQL, but
> read-modify-write (RMW) sequences from CSRRW do not acquire the BQL,
> making the RMW sequence vulnerable to a race condition with MMIO access
> from a remote CPU.
>
> This race can manifest as missing IPI or MSI in multi-CPU systems, eg:
>
> [   43.008092] watchdog: BUG: soft lockup - CPU#2 stuck for 27s! [kworker/u19:1:52]
> [   43.011723] CPU: 2 UID: 0 PID: 52 Comm: kworker/u19:1 Not tainted 6.11.0-rc6
> [   43.013070] Workqueue: events_unbound deferred_probe_work_func
> [   43.018776] [<ffffffff800b4a86>] smp_call_function_many_cond+0x190/0x5c2
> [   43.019205] [<ffffffff800b4f28>] on_each_cpu_cond_mask+0x20/0x32
> [   43.019447] [<ffffffff8001069a>] __flush_tlb_range+0xf2/0x190
> [   43.019683] [<ffffffff80010914>] flush_tlb_kernel_range+0x20/0x28
>
> The interrupt line raise/lower sequence was changed to prevent a race
> between the evaluation of the eistate and the execution of the qemu_irq
> raise/lower, ensuring that the interrupt line is not incorrectly
> deactivated based on a stale topei check result. To avoid holding BQL
> all modifications of eistate are converted to atomic operations.
>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>

Thanks!

Applied to riscv-to-apply.next

Alistair

> ---
>  hw/intc/riscv_imsic.c | 50 +++++++++++++++++++++++++++----------------
>  1 file changed, 32 insertions(+), 18 deletions(-)
>
> diff --git a/hw/intc/riscv_imsic.c b/hw/intc/riscv_imsic.c
> index b90f0d731d..9ef65d4012 100644
> --- a/hw/intc/riscv_imsic.c
> +++ b/hw/intc/riscv_imsic.c
> @@ -55,7 +55,7 @@ static uint32_t riscv_imsic_topei(RISCVIMSICState *imsic, uint32_t page)
>                 (imsic->eithreshold[page] <= imsic->num_irqs)) ?
>                 imsic->eithreshold[page] : imsic->num_irqs;
>      for (i = 1; i < max_irq; i++) {
> -        if ((imsic->eistate[base + i] & IMSIC_EISTATE_ENPEND) ==
> +        if ((qatomic_read(&imsic->eistate[base + i]) & IMSIC_EISTATE_ENPEND) ==
>                  IMSIC_EISTATE_ENPEND) {
>              return (i << IMSIC_TOPEI_IID_SHIFT) | i;
>          }
> @@ -66,10 +66,24 @@ static uint32_t riscv_imsic_topei(RISCVIMSICState *imsic, uint32_t page)
>
>  static void riscv_imsic_update(RISCVIMSICState *imsic, uint32_t page)
>  {
> +    uint32_t base = page * imsic->num_irqs;
> +
> +    /*
> +     * Lower the interrupt line if necessary, then evaluate the current
> +     * IMSIC state.
> +     * This sequence ensures that any race between evaluating the eistate and
> +     * updating the interrupt line will not result in an incorrectly
> +     * deactivated connected CPU IRQ line.
> +     * If multiple interrupts are pending, this sequence functions identically
> +     * to qemu_irq_pulse.
> +     */
> +
> +    if (qatomic_fetch_and(&imsic->eistate[base], ~IMSIC_EISTATE_ENPEND)) {
> +        qemu_irq_lower(imsic->external_irqs[page]);
> +    }
>      if (imsic->eidelivery[page] && riscv_imsic_topei(imsic, page)) {
>          qemu_irq_raise(imsic->external_irqs[page]);
> -    } else {
> -        qemu_irq_lower(imsic->external_irqs[page]);
> +        qatomic_or(&imsic->eistate[base], IMSIC_EISTATE_ENPEND);
>      }
>  }
>
> @@ -125,12 +139,11 @@ static int riscv_imsic_topei_rmw(RISCVIMSICState *imsic, uint32_t page,
>          topei >>= IMSIC_TOPEI_IID_SHIFT;
>          base = page * imsic->num_irqs;
>          if (topei) {
> -            imsic->eistate[base + topei] &= ~IMSIC_EISTATE_PENDING;
> +            qatomic_and(&imsic->eistate[base + topei], ~IMSIC_EISTATE_PENDING);
>          }
> -
> -        riscv_imsic_update(imsic, page);
>      }
>
> +    riscv_imsic_update(imsic, page);
>      return 0;
>  }
>
> @@ -139,7 +152,7 @@ static int riscv_imsic_eix_rmw(RISCVIMSICState *imsic,
>                                 uint32_t num, bool pend, target_ulong *val,
>                                 target_ulong new_val, target_ulong wr_mask)
>  {
> -    uint32_t i, base;
> +    uint32_t i, base, prev;
>      target_ulong mask;
>      uint32_t state = (pend) ? IMSIC_EISTATE_PENDING : IMSIC_EISTATE_ENABLED;
>
> @@ -157,10 +170,6 @@ static int riscv_imsic_eix_rmw(RISCVIMSICState *imsic,
>
>      if (val) {
>          *val = 0;
> -        for (i = 0; i < xlen; i++) {
> -            mask = (target_ulong)1 << i;
> -            *val |= (imsic->eistate[base + i] & state) ? mask : 0;
> -        }
>      }
>
>      for (i = 0; i < xlen; i++) {
> @@ -172,10 +181,15 @@ static int riscv_imsic_eix_rmw(RISCVIMSICState *imsic,
>          mask = (target_ulong)1 << i;
>          if (wr_mask & mask) {
>              if (new_val & mask) {
> -                imsic->eistate[base + i] |= state;
> +                prev = qatomic_fetch_or(&imsic->eistate[base + i], state);
>              } else {
> -                imsic->eistate[base + i] &= ~state;
> +                prev = qatomic_fetch_and(&imsic->eistate[base + i], ~state);
>              }
> +        } else {
> +            prev = qatomic_read(&imsic->eistate[base + i]);
> +        }
> +        if (val && (prev & state)) {
> +            *val |= mask;
>          }
>      }
>
> @@ -302,14 +316,14 @@ static void riscv_imsic_write(void *opaque, hwaddr addr, uint64_t value,
>      page = addr >> IMSIC_MMIO_PAGE_SHIFT;
>      if ((addr & (IMSIC_MMIO_PAGE_SZ - 1)) == IMSIC_MMIO_PAGE_LE) {
>          if (value && (value < imsic->num_irqs)) {
> -            imsic->eistate[(page * imsic->num_irqs) + value] |=
> -                                                    IMSIC_EISTATE_PENDING;
> +            qatomic_or(&imsic->eistate[(page * imsic->num_irqs) + value],
> +                       IMSIC_EISTATE_PENDING);
> +
> +            /* Update CPU external interrupt status */
> +            riscv_imsic_update(imsic, page);
>          }
>      }
>
> -    /* Update CPU external interrupt status */
> -    riscv_imsic_update(imsic, page);
> -
>      return;
>
>  err:
>
> base-commit: fd1952d814da738ed107e05583b3e02ac11e88ff
> --
> 2.34.1
>
>
diff mbox series

Patch

diff --git a/hw/intc/riscv_imsic.c b/hw/intc/riscv_imsic.c
index b90f0d731d..9ef65d4012 100644
--- a/hw/intc/riscv_imsic.c
+++ b/hw/intc/riscv_imsic.c
@@ -55,7 +55,7 @@  static uint32_t riscv_imsic_topei(RISCVIMSICState *imsic, uint32_t page)
                (imsic->eithreshold[page] <= imsic->num_irqs)) ?
                imsic->eithreshold[page] : imsic->num_irqs;
     for (i = 1; i < max_irq; i++) {
-        if ((imsic->eistate[base + i] & IMSIC_EISTATE_ENPEND) ==
+        if ((qatomic_read(&imsic->eistate[base + i]) & IMSIC_EISTATE_ENPEND) ==
                 IMSIC_EISTATE_ENPEND) {
             return (i << IMSIC_TOPEI_IID_SHIFT) | i;
         }
@@ -66,10 +66,24 @@  static uint32_t riscv_imsic_topei(RISCVIMSICState *imsic, uint32_t page)
 
 static void riscv_imsic_update(RISCVIMSICState *imsic, uint32_t page)
 {
+    uint32_t base = page * imsic->num_irqs;
+
+    /*
+     * Lower the interrupt line if necessary, then evaluate the current
+     * IMSIC state.
+     * This sequence ensures that any race between evaluating the eistate and
+     * updating the interrupt line will not result in an incorrectly
+     * deactivated connected CPU IRQ line.
+     * If multiple interrupts are pending, this sequence functions identically
+     * to qemu_irq_pulse.
+     */
+
+    if (qatomic_fetch_and(&imsic->eistate[base], ~IMSIC_EISTATE_ENPEND)) {
+        qemu_irq_lower(imsic->external_irqs[page]);
+    }
     if (imsic->eidelivery[page] && riscv_imsic_topei(imsic, page)) {
         qemu_irq_raise(imsic->external_irqs[page]);
-    } else {
-        qemu_irq_lower(imsic->external_irqs[page]);
+        qatomic_or(&imsic->eistate[base], IMSIC_EISTATE_ENPEND);
     }
 }
 
@@ -125,12 +139,11 @@  static int riscv_imsic_topei_rmw(RISCVIMSICState *imsic, uint32_t page,
         topei >>= IMSIC_TOPEI_IID_SHIFT;
         base = page * imsic->num_irqs;
         if (topei) {
-            imsic->eistate[base + topei] &= ~IMSIC_EISTATE_PENDING;
+            qatomic_and(&imsic->eistate[base + topei], ~IMSIC_EISTATE_PENDING);
         }
-
-        riscv_imsic_update(imsic, page);
     }
 
+    riscv_imsic_update(imsic, page);
     return 0;
 }
 
@@ -139,7 +152,7 @@  static int riscv_imsic_eix_rmw(RISCVIMSICState *imsic,
                                uint32_t num, bool pend, target_ulong *val,
                                target_ulong new_val, target_ulong wr_mask)
 {
-    uint32_t i, base;
+    uint32_t i, base, prev;
     target_ulong mask;
     uint32_t state = (pend) ? IMSIC_EISTATE_PENDING : IMSIC_EISTATE_ENABLED;
 
@@ -157,10 +170,6 @@  static int riscv_imsic_eix_rmw(RISCVIMSICState *imsic,
 
     if (val) {
         *val = 0;
-        for (i = 0; i < xlen; i++) {
-            mask = (target_ulong)1 << i;
-            *val |= (imsic->eistate[base + i] & state) ? mask : 0;
-        }
     }
 
     for (i = 0; i < xlen; i++) {
@@ -172,10 +181,15 @@  static int riscv_imsic_eix_rmw(RISCVIMSICState *imsic,
         mask = (target_ulong)1 << i;
         if (wr_mask & mask) {
             if (new_val & mask) {
-                imsic->eistate[base + i] |= state;
+                prev = qatomic_fetch_or(&imsic->eistate[base + i], state);
             } else {
-                imsic->eistate[base + i] &= ~state;
+                prev = qatomic_fetch_and(&imsic->eistate[base + i], ~state);
             }
+        } else {
+            prev = qatomic_read(&imsic->eistate[base + i]);
+        }
+        if (val && (prev & state)) {
+            *val |= mask;
         }
     }
 
@@ -302,14 +316,14 @@  static void riscv_imsic_write(void *opaque, hwaddr addr, uint64_t value,
     page = addr >> IMSIC_MMIO_PAGE_SHIFT;
     if ((addr & (IMSIC_MMIO_PAGE_SZ - 1)) == IMSIC_MMIO_PAGE_LE) {
         if (value && (value < imsic->num_irqs)) {
-            imsic->eistate[(page * imsic->num_irqs) + value] |=
-                                                    IMSIC_EISTATE_PENDING;
+            qatomic_or(&imsic->eistate[(page * imsic->num_irqs) + value],
+                       IMSIC_EISTATE_PENDING);
+
+            /* Update CPU external interrupt status */
+            riscv_imsic_update(imsic, page);
         }
     }
 
-    /* Update CPU external interrupt status */
-    riscv_imsic_update(imsic, page);
-
     return;
 
 err: