diff mbox

[RFC,07/26] ppc/xive: add MMIO handlers to the XIVE interrupt source

Message ID 1499274819-15607-8-git-send-email-clg@kaod.org
State New
Headers show

Commit Message

Cédric Le Goater July 5, 2017, 5:13 p.m. UTC
Each interrupt source is associated with a 2-bit state machine called
an Event State Buffer (ESB). It is controlled by MMIO to trigger
events.

See code for more details on the states.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/intc/xive.c        | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/ppc/xive.h |   3 +
 2 files changed, 233 insertions(+)

Comments

David Gibson July 24, 2017, 4:29 a.m. UTC | #1
On Wed, Jul 05, 2017 at 07:13:20PM +0200, Cédric Le Goater wrote:
> Each interrupt source is associated with a 2-bit state machine called
> an Event State Buffer (ESB). It is controlled by MMIO to trigger
> events.
> 
> See code for more details on the states.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/intc/xive.c        | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/xive.h |   3 +
>  2 files changed, 233 insertions(+)
> 
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index 9ff14c0da595..816031b8ac81 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -32,6 +32,226 @@ static void xive_icp_irq(XiveICSState *xs, int lisn)
>  }
>  
>  /*
> + * "magic" Event State Buffer (ESB) MMIO offsets.
> + *
> + * Each interrupt source has a 2-bit state machine called ESB
> + * which can be controlled by MMIO. It's made of 2 bits, P and
> + * Q. P indicates that an interrupt is pending (has been sent
> + * to a queue and is waiting for an EOI). Q indicates that the
> + * interrupt has been triggered while pending.
> + *
> + * This acts as a coalescing mechanism in order to guarantee
> + * that a given interrupt only occurs at most once in a queue.
> + *
> + * When doing an EOI, the Q bit will indicate if the interrupt
> + * needs to be re-triggered.
> + *
> + * The following offsets into the ESB MMIO allow to read or
> + * manipulate the PQ bits. They must be used with an 8-bytes
> + * load instruction. They all return the previous state of the
> + * interrupt (atomically).
> + *
> + * Additionally, some ESB pages support doing an EOI via a
> + * store at 0 and some ESBs support doing a trigger via a
> + * separate trigger page.
> + */
> +#define XIVE_ESB_GET            0x800
> +#define XIVE_ESB_SET_PQ_00      0xc00
> +#define XIVE_ESB_SET_PQ_01      0xd00
> +#define XIVE_ESB_SET_PQ_10      0xe00
> +#define XIVE_ESB_SET_PQ_11      0xf00
> +
> +#define XIVE_ESB_VAL_P          0x2
> +#define XIVE_ESB_VAL_Q          0x1
> +
> +#define XIVE_ESB_RESET          0x0
> +#define XIVE_ESB_PENDING        0x2
> +#define XIVE_ESB_QUEUED         0x3
> +#define XIVE_ESB_OFF            0x1
> +
> +static uint8_t xive_pq_get(XIVE *x, uint32_t lisn)
> +{
> +    uint32_t idx = lisn;
> +    uint32_t byte = idx / 4;
> +    uint32_t bit  = (idx % 4) * 2;
> +    uint8_t* pqs = (uint8_t *) x->sbe;
> +
> +    return (pqs[byte] >> bit) & 0x3;
> +}
> +
> +static void xive_pq_set(XIVE *x, uint32_t lisn, uint8_t pq)
> +{
> +    uint32_t idx = lisn;
> +    uint32_t byte = idx / 4;
> +    uint32_t bit  = (idx % 4) * 2;
> +    uint8_t* pqs = (uint8_t *) x->sbe;
> +
> +    pqs[byte] &= ~(0x3 << bit);
> +    pqs[byte] |= (pq & 0x3) << bit;

I know it probably amounts to the same thing given the context, but
I'd be more comfortable with a temporary and an obviously atomic
update than two writes to the real state variable.

> +}
> +
> +static bool xive_pq_eoi(XIVE *x, uint32_t lisn)
> +{
> +    uint8_t old_pq = xive_pq_get(x, lisn);
> +
> +    switch (old_pq) {
> +    case XIVE_ESB_RESET:
> +        xive_pq_set(x, lisn, XIVE_ESB_RESET);
> +        return false;
> +    case XIVE_ESB_PENDING:
> +        xive_pq_set(x, lisn, XIVE_ESB_RESET);
> +        return false;
> +    case XIVE_ESB_QUEUED:
> +        xive_pq_set(x, lisn, XIVE_ESB_PENDING);
> +        return true;
> +    case XIVE_ESB_OFF:
> +        xive_pq_set(x, lisn, XIVE_ESB_OFF);
> +        return false;
> +    default:
> +         g_assert_not_reached();
> +    }
> +}
> +
> +static bool xive_pq_trigger(XIVE *x, uint32_t lisn)
> +{
> +    uint8_t old_pq = xive_pq_get(x, lisn);
> +
> +    switch (old_pq) {
> +    case XIVE_ESB_RESET:
> +        xive_pq_set(x, lisn, XIVE_ESB_PENDING);
> +        return true;
> +    case XIVE_ESB_PENDING:
> +        xive_pq_set(x, lisn, XIVE_ESB_QUEUED);
> +        return true;
> +    case XIVE_ESB_QUEUED:
> +        xive_pq_set(x, lisn, XIVE_ESB_QUEUED);
> +        return true;
> +    case XIVE_ESB_OFF:
> +        xive_pq_set(x, lisn, XIVE_ESB_OFF);
> +        return false;
> +    default:
> +         g_assert_not_reached();
> +    }
> +}
> +
> +/*
> + * XIVE Interrupt Source MMIOs
> + */
> +static void xive_ics_eoi(XiveICSState *xs, uint32_t srcno)
> +{
> +    ICSIRQState *irq = &ICS_BASE(xs)->irqs[srcno];
> +
> +    if (irq->flags & XICS_FLAGS_IRQ_LSI) {
> +        irq->status &= ~XICS_STATUS_SENT;
> +    }
> +}
> +
> +/* TODO: handle second page */

Is this comment still relevent?

> +static uint64_t xive_esb_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    XiveICSState *xs = ICS_XIVE(opaque);
> +    XIVE *x = xs->xive;
> +    uint32_t offset = addr & 0xF00;
> +    uint32_t srcno = addr >> xs->esb_shift;
> +    uint32_t lisn = srcno + ICS_BASE(xs)->offset;
> +    XiveIVE *ive;
> +    uint64_t ret = -1;
> +
> +    ive = xive_get_ive(x, lisn);
> +    if (!ive || !(ive->w & IVE_VALID))  {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
> +        goto out;
> +    }
> +
> +    if (srcno >= ICS_BASE(xs)->nr_irqs) {
> +        qemu_log_mask(LOG_GUEST_ERROR,
> +                      "XIVE: invalid IRQ number: %d/%d lisn: %d\n",
> +                      srcno, ICS_BASE(xs)->nr_irqs, lisn);
> +        goto out;
> +    }
> +
> +    switch (offset) {
> +    case 0:
> +        xive_ics_eoi(xs, srcno);
> +
> +        /* return TRUE or FALSE depending on PQ value */
> +        ret = xive_pq_eoi(x, lisn);
> +        break;
> +
> +    case XIVE_ESB_GET:
> +        ret = xive_pq_get(x, lisn);
> +        break;
> +
> +    case XIVE_ESB_SET_PQ_00:
> +    case XIVE_ESB_SET_PQ_01:
> +    case XIVE_ESB_SET_PQ_10:
> +    case XIVE_ESB_SET_PQ_11:
> +        ret = xive_pq_get(x, lisn);
> +        xive_pq_set(x, lisn, (offset >> 8) & 0x3);

Again I'd prefer xive_pq_set() return the old value itself, for more
obvious atomicity.

> +        break;
> +    default:
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB addr %d\n", offset);
> +    }
> +
> +out:
> +    return ret;
> +}
> +
> +static void xive_esb_write(void *opaque, hwaddr addr,
> +                           uint64_t value, unsigned size)
> +{
> +    XiveICSState *xs = ICS_XIVE(opaque);
> +    XIVE *x = xs->xive;
> +    uint32_t offset = addr & 0xF00;
> +    uint32_t srcno = addr >> xs->esb_shift;
> +    uint32_t lisn = srcno + ICS_BASE(xs)->offset;
> +    XiveIVE *ive;
> +    bool notify = false;
> +
> +    ive = xive_get_ive(x, lisn);
> +    if (!ive || !(ive->w & IVE_VALID))  {
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
> +        return;
> +    }

Having this code associated with the individual ICS look directly at
the IVE table in the core xive object seems a bit dubious.  This also
points out another mismatch between the re-used ICS code and the new
XIVE code: ICS gathers all the per-source-irq flags/state into the
irqstate structure, whereas xive has per-irq information in the
centralized ecb and IVE tables.  There can certainly be good reasons
for that, but using both at once is kind of clunky.

> +    if (srcno >= ICS_BASE(xs)->nr_irqs) {
> +        qemu_log_mask(LOG_GUEST_ERROR,
> +                      "XIVE: invalid IRQ number: %d/%d lisn: %d\n",
> +                      srcno, ICS_BASE(xs)->nr_irqs, lisn);
> +        return;
> +    }
> +
> +    switch (offset) {
> +    case 0:
> +        /* TODO: should we trigger even if the IVE is masked ? */
> +        notify = xive_pq_trigger(x, lisn);
> +        break;
> +    default:
> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n",
> +                      offset);
> +        return;
> +    }
> +
> +    if (notify && !(ive->w & IVE_MASKED)) {
> +        qemu_irq_pulse(ICS_BASE(xs)->qirqs[srcno]);
> +    }
> +}
> +
> +static const MemoryRegionOps xive_esb_ops = {
> +    .read = xive_esb_read,
> +    .write = xive_esb_write,
> +    .endianness = DEVICE_BIG_ENDIAN,
> +    .valid = {
> +        .min_access_size = 8,
> +        .max_access_size = 8,
> +    },
> +    .impl = {
> +        .min_access_size = 8,
> +        .max_access_size = 8,
> +    },
> +};
> +
> +/*
>   * XIVE Interrupt Source
>   */
>  static void xive_ics_set_irq_msi(XiveICSState *xs, int srcno, int val)
> @@ -106,15 +326,25 @@ static void xive_ics_realize(ICSState *ics, Error **errp)
>          return;
>      }
>  
> +    if (!xs->esb_shift) {
> +        error_setg(errp, "ESB page size needs to be greater 0");
> +        return;
> +    }
> +
>      ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
>      ics->qirqs = qemu_allocate_irqs(xive_ics_set_irq, xs, ics->nr_irqs);
>  
> +    memory_region_init_io(&xs->esb_iomem, OBJECT(xs), &xive_esb_ops, xs,
> +                          "xive.esb",
> +                          (1ull << xs->esb_shift) * ICS_BASE(xs)->nr_irqs);
> +
>      qemu_register_reset(xive_ics_reset, xs);
>  }
>  
>  static Property xive_ics_properties[] = {
>      DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0),
>      DEFINE_PROP_UINT32("irq-base", ICSState, offset, 0),
> +    DEFINE_PROP_UINT32("shift", XiveICSState, esb_shift, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 544cc6e0c796..5303d96f5f59 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -33,6 +33,9 @@ typedef struct XiveICSState XiveICSState;
>  struct XiveICSState {
>      ICSState parent_obj;
>  
> +    uint32_t     esb_shift;
> +    MemoryRegion esb_iomem;
> +
>      XIVE         *xive;
>  };
>
Alexey Kardashevskiy July 24, 2017, 6:50 a.m. UTC | #2
On 06/07/17 03:13, Cédric Le Goater wrote:
> Each interrupt source is associated with a 2-bit state machine called
> an Event State Buffer (ESB). It is controlled by MMIO to trigger
> events.
> 
> See code for more details on the states.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/intc/xive.c        | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/xive.h |   3 +
>  2 files changed, 233 insertions(+)
> 
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index 9ff14c0da595..816031b8ac81 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -32,6 +32,226 @@ static void xive_icp_irq(XiveICSState *xs, int lisn)
>  }
>  
>  /*
> + * "magic" Event State Buffer (ESB) MMIO offsets.
> + *
> + * Each interrupt source has a 2-bit state machine called ESB
> + * which can be controlled by MMIO. It's made of 2 bits, P and
> + * Q. P indicates that an interrupt is pending (has been sent
> + * to a queue and is waiting for an EOI). Q indicates that the
> + * interrupt has been triggered while pending.
> + *
> + * This acts as a coalescing mechanism in order to guarantee
> + * that a given interrupt only occurs at most once in a queue.
> + *
> + * When doing an EOI, the Q bit will indicate if the interrupt
> + * needs to be re-triggered.
> + *
> + * The following offsets into the ESB MMIO allow to read or
> + * manipulate the PQ bits. They must be used with an 8-bytes
> + * load instruction. They all return the previous state of the
> + * interrupt (atomically).
> + *
> + * Additionally, some ESB pages support doing an EOI via a
> + * store at 0 and some ESBs support doing a trigger via a
> + * separate trigger page.
> + */
> +#define XIVE_ESB_GET            0x800
> +#define XIVE_ESB_SET_PQ_00      0xc00
> +#define XIVE_ESB_SET_PQ_01      0xd00
> +#define XIVE_ESB_SET_PQ_10      0xe00
> +#define XIVE_ESB_SET_PQ_11      0xf00
> +
> +#define XIVE_ESB_VAL_P          0x2
> +#define XIVE_ESB_VAL_Q          0x1


These are not used. I'd suggest defining the states below using these two.


> +
> +#define XIVE_ESB_RESET          0x0
> +#define XIVE_ESB_PENDING        0x2
> +#define XIVE_ESB_QUEUED         0x3
> +#define XIVE_ESB_OFF            0x1
> +
> +static uint8_t xive_pq_get(XIVE *x, uint32_t lisn)
> +{
> +    uint32_t idx = lisn;
> +    uint32_t byte = idx / 4;
> +    uint32_t bit  = (idx % 4) * 2;
> +    uint8_t* pqs = (uint8_t *) x->sbe;
> +
> +    return (pqs[byte] >> bit) & 0x3;
> +}
> +
> +static void xive_pq_set(XIVE *x, uint32_t lisn, uint8_t pq)
> +{
> +    uint32_t idx = lisn;
> +    uint32_t byte = idx / 4;
> +    uint32_t bit  = (idx % 4) * 2;
> +    uint8_t* pqs = (uint8_t *) x->sbe;
> +
> +    pqs[byte] &= ~(0x3 << bit);
> +    pqs[byte] |= (pq & 0x3) << bit;
> +}
> +
> +static bool xive_pq_eoi(XIVE *x, uint32_t lisn)


Should not it return uint8_t as well (like xive_pq_get() does)? The value
than returned from .read() is uint64_t (a binary value).
Benjamin Herrenschmidt July 24, 2017, 8:56 a.m. UTC | #3
On Mon, 2017-07-24 at 14:29 +1000, David Gibson wrote:
> > +    case XIVE_ESB_SET_PQ_00:
> > +    case XIVE_ESB_SET_PQ_01:
> > +    case XIVE_ESB_SET_PQ_10:
> > +    case XIVE_ESB_SET_PQ_11:
> > +        ret = xive_pq_get(x, lisn);
> > +        xive_pq_set(x, lisn, (offset >> 8) & 0x3);
> 
> Again I'd prefer xive_pq_set() return the old value itself, for more
> obvious atomicity.

Agreed.  That will also help with StoreEOI (store to 0x400 of the EOI
page) which does an EOI then re-sends an interrupt if the old value was
11 (while the load EOI doesn't resend).

Cheers,
Ben.
Cédric Le Goater July 24, 2017, 3:39 p.m. UTC | #4
On 07/24/2017 08:50 AM, Alexey Kardashevskiy wrote:
> On 06/07/17 03:13, Cédric Le Goater wrote:
>> Each interrupt source is associated with a 2-bit state machine called
>> an Event State Buffer (ESB). It is controlled by MMIO to trigger
>> events.
>>
>> See code for more details on the states.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/intc/xive.c        | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/xive.h |   3 +
>>  2 files changed, 233 insertions(+)
>>
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index 9ff14c0da595..816031b8ac81 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -32,6 +32,226 @@ static void xive_icp_irq(XiveICSState *xs, int lisn)
>>  }
>>  
>>  /*
>> + * "magic" Event State Buffer (ESB) MMIO offsets.
>> + *
>> + * Each interrupt source has a 2-bit state machine called ESB
>> + * which can be controlled by MMIO. It's made of 2 bits, P and
>> + * Q. P indicates that an interrupt is pending (has been sent
>> + * to a queue and is waiting for an EOI). Q indicates that the
>> + * interrupt has been triggered while pending.
>> + *
>> + * This acts as a coalescing mechanism in order to guarantee
>> + * that a given interrupt only occurs at most once in a queue.
>> + *
>> + * When doing an EOI, the Q bit will indicate if the interrupt
>> + * needs to be re-triggered.
>> + *
>> + * The following offsets into the ESB MMIO allow to read or
>> + * manipulate the PQ bits. They must be used with an 8-bytes
>> + * load instruction. They all return the previous state of the
>> + * interrupt (atomically).
>> + *
>> + * Additionally, some ESB pages support doing an EOI via a
>> + * store at 0 and some ESBs support doing a trigger via a
>> + * separate trigger page.
>> + */
>> +#define XIVE_ESB_GET            0x800
>> +#define XIVE_ESB_SET_PQ_00      0xc00
>> +#define XIVE_ESB_SET_PQ_01      0xd00
>> +#define XIVE_ESB_SET_PQ_10      0xe00
>> +#define XIVE_ESB_SET_PQ_11      0xf00
>> +
>> +#define XIVE_ESB_VAL_P          0x2
>> +#define XIVE_ESB_VAL_Q          0x1
> 
> 
> These are not used. I'd suggest defining the states below using these two.

yes. I will add a VAL_PQ also.

> 
>> +
>> +#define XIVE_ESB_RESET          0x0
>> +#define XIVE_ESB_PENDING        0x2
>> +#define XIVE_ESB_QUEUED         0x3
>> +#define XIVE_ESB_OFF            0x1
>> +
>> +static uint8_t xive_pq_get(XIVE *x, uint32_t lisn)
>> +{
>> +    uint32_t idx = lisn;
>> +    uint32_t byte = idx / 4;
>> +    uint32_t bit  = (idx % 4) * 2;
>> +    uint8_t* pqs = (uint8_t *) x->sbe;
>> +
>> +    return (pqs[byte] >> bit) & 0x3;
>> +}
>> +
>> +static void xive_pq_set(XIVE *x, uint32_t lisn, uint8_t pq)
>> +{
>> +    uint32_t idx = lisn;
>> +    uint32_t byte = idx / 4;
>> +    uint32_t bit  = (idx % 4) * 2;
>> +    uint8_t* pqs = (uint8_t *) x->sbe;
>> +
>> +    pqs[byte] &= ~(0x3 << bit);
>> +    pqs[byte] |= (pq & 0x3) << bit;
>> +}
>> +
>> +static bool xive_pq_eoi(XIVE *x, uint32_t lisn)
> 
> 
> Should not it return uint8_t as well (like xive_pq_get() does)? The value
> than returned from .read() is uint64_t (a binary value).

Yes. The bool only reflects the state machine specs but we can 
change that in the code.   

Thanks,

C.
Cédric Le Goater July 24, 2017, 3:55 p.m. UTC | #5
On 07/24/2017 06:29 AM, David Gibson wrote:
> On Wed, Jul 05, 2017 at 07:13:20PM +0200, Cédric Le Goater wrote:
>> Each interrupt source is associated with a 2-bit state machine called
>> an Event State Buffer (ESB). It is controlled by MMIO to trigger
>> events.
>>
>> See code for more details on the states.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/intc/xive.c        | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/xive.h |   3 +
>>  2 files changed, 233 insertions(+)
>>
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index 9ff14c0da595..816031b8ac81 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -32,6 +32,226 @@ static void xive_icp_irq(XiveICSState *xs, int lisn)
>>  }
>>  
>>  /*
>> + * "magic" Event State Buffer (ESB) MMIO offsets.
>> + *
>> + * Each interrupt source has a 2-bit state machine called ESB
>> + * which can be controlled by MMIO. It's made of 2 bits, P and
>> + * Q. P indicates that an interrupt is pending (has been sent
>> + * to a queue and is waiting for an EOI). Q indicates that the
>> + * interrupt has been triggered while pending.
>> + *
>> + * This acts as a coalescing mechanism in order to guarantee
>> + * that a given interrupt only occurs at most once in a queue.
>> + *
>> + * When doing an EOI, the Q bit will indicate if the interrupt
>> + * needs to be re-triggered.
>> + *
>> + * The following offsets into the ESB MMIO allow to read or
>> + * manipulate the PQ bits. They must be used with an 8-bytes
>> + * load instruction. They all return the previous state of the
>> + * interrupt (atomically).
>> + *
>> + * Additionally, some ESB pages support doing an EOI via a
>> + * store at 0 and some ESBs support doing a trigger via a
>> + * separate trigger page.
>> + */
>> +#define XIVE_ESB_GET            0x800
>> +#define XIVE_ESB_SET_PQ_00      0xc00
>> +#define XIVE_ESB_SET_PQ_01      0xd00
>> +#define XIVE_ESB_SET_PQ_10      0xe00
>> +#define XIVE_ESB_SET_PQ_11      0xf00
>> +
>> +#define XIVE_ESB_VAL_P          0x2
>> +#define XIVE_ESB_VAL_Q          0x1
>> +
>> +#define XIVE_ESB_RESET          0x0
>> +#define XIVE_ESB_PENDING        0x2
>> +#define XIVE_ESB_QUEUED         0x3
>> +#define XIVE_ESB_OFF            0x1
>> +
>> +static uint8_t xive_pq_get(XIVE *x, uint32_t lisn)
>> +{
>> +    uint32_t idx = lisn;
>> +    uint32_t byte = idx / 4;
>> +    uint32_t bit  = (idx % 4) * 2;
>> +    uint8_t* pqs = (uint8_t *) x->sbe;
>> +
>> +    return (pqs[byte] >> bit) & 0x3;
>> +}
>> +
>> +static void xive_pq_set(XIVE *x, uint32_t lisn, uint8_t pq)
>> +{
>> +    uint32_t idx = lisn;
>> +    uint32_t byte = idx / 4;
>> +    uint32_t bit  = (idx % 4) * 2;
>> +    uint8_t* pqs = (uint8_t *) x->sbe;
>> +
>> +    pqs[byte] &= ~(0x3 << bit);
>> +    pqs[byte] |= (pq & 0x3) << bit;
> 
> I know it probably amounts to the same thing given the context, but
> I'd be more comfortable with a temporary and an obviously atomic
> update than two writes to the real state variable.

yes. I will look better.

>> +}
>> +
>> +static bool xive_pq_eoi(XIVE *x, uint32_t lisn)
>> +{
>> +    uint8_t old_pq = xive_pq_get(x, lisn);
>> +
>> +    switch (old_pq) {
>> +    case XIVE_ESB_RESET:
>> +        xive_pq_set(x, lisn, XIVE_ESB_RESET);
>> +        return false;
>> +    case XIVE_ESB_PENDING:
>> +        xive_pq_set(x, lisn, XIVE_ESB_RESET);
>> +        return false;
>> +    case XIVE_ESB_QUEUED:
>> +        xive_pq_set(x, lisn, XIVE_ESB_PENDING);
>> +        return true;
>> +    case XIVE_ESB_OFF:
>> +        xive_pq_set(x, lisn, XIVE_ESB_OFF);
>> +        return false;
>> +    default:
>> +         g_assert_not_reached();
>> +    }
>> +}
>> +
>> +static bool xive_pq_trigger(XIVE *x, uint32_t lisn)
>> +{
>> +    uint8_t old_pq = xive_pq_get(x, lisn);
>> +
>> +    switch (old_pq) {
>> +    case XIVE_ESB_RESET:
>> +        xive_pq_set(x, lisn, XIVE_ESB_PENDING);
>> +        return true;
>> +    case XIVE_ESB_PENDING:
>> +        xive_pq_set(x, lisn, XIVE_ESB_QUEUED);
>> +        return true;
>> +    case XIVE_ESB_QUEUED:
>> +        xive_pq_set(x, lisn, XIVE_ESB_QUEUED);
>> +        return true;
>> +    case XIVE_ESB_OFF:
>> +        xive_pq_set(x, lisn, XIVE_ESB_OFF);
>> +        return false;
>> +    default:
>> +         g_assert_not_reached();
>> +    }
>> +}
>> +
>> +/*
>> + * XIVE Interrupt Source MMIOs
>> + */
>> +static void xive_ics_eoi(XiveICSState *xs, uint32_t srcno)
>> +{
>> +    ICSIRQState *irq = &ICS_BASE(xs)->irqs[srcno];
>> +
>> +    if (irq->flags & XICS_FLAGS_IRQ_LSI) {
>> +        irq->status &= ~XICS_STATUS_SENT;
>> +    }
>> +}
>> +
>> +/* TODO: handle second page */
> 
> Is this comment still relevent?

Some HW have a second page to trigger the event. I am not sure we need 
to model it though. I will make some inquiries. 

>> +static uint64_t xive_esb_read(void *opaque, hwaddr addr, unsigned size)
>> +{
>> +    XiveICSState *xs = ICS_XIVE(opaque);
>> +    XIVE *x = xs->xive;
>> +    uint32_t offset = addr & 0xF00;
>> +    uint32_t srcno = addr >> xs->esb_shift;
>> +    uint32_t lisn = srcno + ICS_BASE(xs)->offset;
>> +    XiveIVE *ive;
>> +    uint64_t ret = -1;
>> +
>> +    ive = xive_get_ive(x, lisn);
>> +    if (!ive || !(ive->w & IVE_VALID))  {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
>> +        goto out;
>> +    }
>> +
>> +    if (srcno >= ICS_BASE(xs)->nr_irqs) {
>> +        qemu_log_mask(LOG_GUEST_ERROR,
>> +                      "XIVE: invalid IRQ number: %d/%d lisn: %d\n",
>> +                      srcno, ICS_BASE(xs)->nr_irqs, lisn);
>> +        goto out;
>> +    }
>> +
>> +    switch (offset) {
>> +    case 0:
>> +        xive_ics_eoi(xs, srcno);
>> +
>> +        /* return TRUE or FALSE depending on PQ value */
>> +        ret = xive_pq_eoi(x, lisn);
>> +        break;
>> +
>> +    case XIVE_ESB_GET:
>> +        ret = xive_pq_get(x, lisn);
>> +        break;
>> +
>> +    case XIVE_ESB_SET_PQ_00:
>> +    case XIVE_ESB_SET_PQ_01:
>> +    case XIVE_ESB_SET_PQ_10:
>> +    case XIVE_ESB_SET_PQ_11:
>> +        ret = xive_pq_get(x, lisn);
>> +        xive_pq_set(x, lisn, (offset >> 8) & 0x3);
> 
> Again I'd prefer xive_pq_set() return the old value itself, for more
> obvious atomicity.

yes. ok.

> 
>> +        break;
>> +    default:
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB addr %d\n", offset);
>> +    }
>> +
>> +out:
>> +    return ret;
>> +}
>> +
>> +static void xive_esb_write(void *opaque, hwaddr addr,
>> +                           uint64_t value, unsigned size)
>> +{
>> +    XiveICSState *xs = ICS_XIVE(opaque);
>> +    XIVE *x = xs->xive;
>> +    uint32_t offset = addr & 0xF00;
>> +    uint32_t srcno = addr >> xs->esb_shift;
>> +    uint32_t lisn = srcno + ICS_BASE(xs)->offset;
>> +    XiveIVE *ive;
>> +    bool notify = false;
>> +
>> +    ive = xive_get_ive(x, lisn);
>> +    if (!ive || !(ive->w & IVE_VALID))  {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
>> +        return;
>> +    }
> 
> Having this code associated with the individual ICS look directly at
> the IVE table in the core xive object seems a bit dubious.

The IVE table holds the validity and mask status of the interrupt 
entries, so we need that lookup. However, (continues below) ...

> This also
> points out another mismatch between the re-used ICS code and the new
> XIVE code: ICS gathers all the per-source-irq flags/state into the
> irqstate structure, whereas xive has per-irq information in the
> centralized ecb and IVE tables.  There can certainly be good reasons
> for that, but using both at once is kind of clunky.

I understand that you would rather put the esbs in the source they 
belong to. That is the case on real HW but it makes the modeling a 
bit more difficult. We would need to choose a MMIO address to give 
to the guest OS. I had some issues with the allocator (I need 
to look at this problem closer).

It might also be an "issue" for KVM. Ben talked about maintaining 
all the esbs of a guest under a single memory region to be able to 
map the pages in the host.

Any how, I agree this is another point to discuss in the sPAPR 
model.

Thanks,

C. 


>> +    if (srcno >= ICS_BASE(xs)->nr_irqs) {
>> +        qemu_log_mask(LOG_GUEST_ERROR,
>> +                      "XIVE: invalid IRQ number: %d/%d lisn: %d\n",
>> +                      srcno, ICS_BASE(xs)->nr_irqs, lisn);
>> +        return;
>> +    }
>> +
>> +    switch (offset) {
>> +    case 0:
>> +        /* TODO: should we trigger even if the IVE is masked ? */
>> +        notify = xive_pq_trigger(x, lisn);
>> +        break;
>> +    default:
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n",
>> +                      offset);
>> +        return;
>> +    }
>> +
>> +    if (notify && !(ive->w & IVE_MASKED)) {
>> +        qemu_irq_pulse(ICS_BASE(xs)->qirqs[srcno]);
>> +    }
>> +}
>> +
>> +static const MemoryRegionOps xive_esb_ops = {
>> +    .read = xive_esb_read,
>> +    .write = xive_esb_write,
>> +    .endianness = DEVICE_BIG_ENDIAN,
>> +    .valid = {
>> +        .min_access_size = 8,
>> +        .max_access_size = 8,
>> +    },
>> +    .impl = {
>> +        .min_access_size = 8,
>> +        .max_access_size = 8,
>> +    },
>> +};
>> +
>> +/*
>>   * XIVE Interrupt Source
>>   */
>>  static void xive_ics_set_irq_msi(XiveICSState *xs, int srcno, int val)
>> @@ -106,15 +326,25 @@ static void xive_ics_realize(ICSState *ics, Error **errp)
>>          return;
>>      }
>>  
>> +    if (!xs->esb_shift) {
>> +        error_setg(errp, "ESB page size needs to be greater 0");
>> +        return;
>> +    }
>> +
>>      ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
>>      ics->qirqs = qemu_allocate_irqs(xive_ics_set_irq, xs, ics->nr_irqs);
>>  
>> +    memory_region_init_io(&xs->esb_iomem, OBJECT(xs), &xive_esb_ops, xs,
>> +                          "xive.esb",
>> +                          (1ull << xs->esb_shift) * ICS_BASE(xs)->nr_irqs);
>> +
>>      qemu_register_reset(xive_ics_reset, xs);
>>  }
>>  
>>  static Property xive_ics_properties[] = {
>>      DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0),
>>      DEFINE_PROP_UINT32("irq-base", ICSState, offset, 0),
>> +    DEFINE_PROP_UINT32("shift", XiveICSState, esb_shift, 0),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>  
>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>> index 544cc6e0c796..5303d96f5f59 100644
>> --- a/include/hw/ppc/xive.h
>> +++ b/include/hw/ppc/xive.h
>> @@ -33,6 +33,9 @@ typedef struct XiveICSState XiveICSState;
>>  struct XiveICSState {
>>      ICSState parent_obj;
>>  
>> +    uint32_t     esb_shift;
>> +    MemoryRegion esb_iomem;
>> +
>>      XIVE         *xive;
>>  };
>>  
>
David Gibson July 25, 2017, 12:21 p.m. UTC | #6
On Mon, Jul 24, 2017 at 05:55:42PM +0200, Cédric Le Goater wrote:
> On 07/24/2017 06:29 AM, David Gibson wrote:
> > On Wed, Jul 05, 2017 at 07:13:20PM +0200, Cédric Le Goater wrote:
> >> Each interrupt source is associated with a 2-bit state machine called
> >> an Event State Buffer (ESB). It is controlled by MMIO to trigger
> >> events.
[snip]
> >> +/* TODO: handle second page */
> > 
> > Is this comment still relevent?
> 
> Some HW have a second page to trigger the event. I am not sure we need 
> to model it though. I will make some inquiries.

Ah, ok.  Maybe clarify the comment a bit.

[snip]
> >> +static void xive_esb_write(void *opaque, hwaddr addr,
> >> +                           uint64_t value, unsigned size)
> >> +{
> >> +    XiveICSState *xs = ICS_XIVE(opaque);
> >> +    XIVE *x = xs->xive;
> >> +    uint32_t offset = addr & 0xF00;
> >> +    uint32_t srcno = addr >> xs->esb_shift;
> >> +    uint32_t lisn = srcno + ICS_BASE(xs)->offset;
> >> +    XiveIVE *ive;
> >> +    bool notify = false;
> >> +
> >> +    ive = xive_get_ive(x, lisn);
> >> +    if (!ive || !(ive->w & IVE_VALID))  {
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
> >> +        return;
> >> +    }
> > 
> > Having this code associated with the individual ICS look directly at
> > the IVE table in the core xive object seems a bit dubious.
> 
> The IVE table holds the validity and mask status of the interrupt 
> entries, so we need that lookup. However, (continues below) ...
> 
> > This also
> > points out another mismatch between the re-used ICS code and the new
> > XIVE code: ICS gathers all the per-source-irq flags/state into the
> > irqstate structure, whereas xive has per-irq information in the
> > centralized ecb and IVE tables.  There can certainly be good reasons
> > for that, but using both at once is kind of clunky.
> 
> I understand that you would rather put the esbs in the source they 
> belong to. That is the case on real HW but it makes the modeling a 
> bit more difficult. We would need to choose a MMIO address to give 
> to the guest OS. I had some issues with the allocator (I need 
> to look at this problem closer).

Uh.. what do MMIO addresses have to do with this?  I'm talking about
the actual ESB state in the packed bit array.

> It might also be an "issue" for KVM. Ben talked about maintaining 
> all the esbs of a guest under a single memory region to be able to 
> map the pages in the host.
> 
> Any how, I agree this is another point to discuss in the sPAPR 
> model.
> 
> Thanks,
> 
> C. 
> 
> 
> >> +    if (srcno >= ICS_BASE(xs)->nr_irqs) {
> >> +        qemu_log_mask(LOG_GUEST_ERROR,
> >> +                      "XIVE: invalid IRQ number: %d/%d lisn: %d\n",
> >> +                      srcno, ICS_BASE(xs)->nr_irqs, lisn);
> >> +        return;
> >> +    }
> >> +
> >> +    switch (offset) {
> >> +    case 0:
> >> +        /* TODO: should we trigger even if the IVE is masked ? */
> >> +        notify = xive_pq_trigger(x, lisn);
> >> +        break;
> >> +    default:
> >> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n",
> >> +                      offset);
> >> +        return;
> >> +    }
> >> +
> >> +    if (notify && !(ive->w & IVE_MASKED)) {
> >> +        qemu_irq_pulse(ICS_BASE(xs)->qirqs[srcno]);
> >> +    }
> >> +}
> >> +
> >> +static const MemoryRegionOps xive_esb_ops = {
> >> +    .read = xive_esb_read,
> >> +    .write = xive_esb_write,
> >> +    .endianness = DEVICE_BIG_ENDIAN,
> >> +    .valid = {
> >> +        .min_access_size = 8,
> >> +        .max_access_size = 8,
> >> +    },
> >> +    .impl = {
> >> +        .min_access_size = 8,
> >> +        .max_access_size = 8,
> >> +    },
> >> +};
> >> +
> >> +/*
> >>   * XIVE Interrupt Source
> >>   */
> >>  static void xive_ics_set_irq_msi(XiveICSState *xs, int srcno, int val)
> >> @@ -106,15 +326,25 @@ static void xive_ics_realize(ICSState *ics, Error **errp)
> >>          return;
> >>      }
> >>  
> >> +    if (!xs->esb_shift) {
> >> +        error_setg(errp, "ESB page size needs to be greater 0");
> >> +        return;
> >> +    }
> >> +
> >>      ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
> >>      ics->qirqs = qemu_allocate_irqs(xive_ics_set_irq, xs, ics->nr_irqs);
> >>  
> >> +    memory_region_init_io(&xs->esb_iomem, OBJECT(xs), &xive_esb_ops, xs,
> >> +                          "xive.esb",
> >> +                          (1ull << xs->esb_shift) * ICS_BASE(xs)->nr_irqs);
> >> +
> >>      qemu_register_reset(xive_ics_reset, xs);
> >>  }
> >>  
> >>  static Property xive_ics_properties[] = {
> >>      DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0),
> >>      DEFINE_PROP_UINT32("irq-base", ICSState, offset, 0),
> >> +    DEFINE_PROP_UINT32("shift", XiveICSState, esb_shift, 0),
> >>      DEFINE_PROP_END_OF_LIST(),
> >>  };
> >>  
> >> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> >> index 544cc6e0c796..5303d96f5f59 100644
> >> --- a/include/hw/ppc/xive.h
> >> +++ b/include/hw/ppc/xive.h
> >> @@ -33,6 +33,9 @@ typedef struct XiveICSState XiveICSState;
> >>  struct XiveICSState {
> >>      ICSState parent_obj;
> >>  
> >> +    uint32_t     esb_shift;
> >> +    MemoryRegion esb_iomem;
> >> +
> >>      XIVE         *xive;
> >>  };
> >>  
> > 
>
Cédric Le Goater July 25, 2017, 3:42 p.m. UTC | #7
On 07/25/2017 02:21 PM, David Gibson wrote:
> On Mon, Jul 24, 2017 at 05:55:42PM +0200, Cédric Le Goater wrote:
>> On 07/24/2017 06:29 AM, David Gibson wrote:
>>> On Wed, Jul 05, 2017 at 07:13:20PM +0200, Cédric Le Goater wrote:
>>>> Each interrupt source is associated with a 2-bit state machine called
>>>> an Event State Buffer (ESB). It is controlled by MMIO to trigger
>>>> events.
> [snip]
>>>> +/* TODO: handle second page */
>>>
>>> Is this comment still relevent?
>>
>> Some HW have a second page to trigger the event. I am not sure we need 
>> to model it though. I will make some inquiries.
> 
> Ah, ok.  Maybe clarify the comment a bit.
> 
> [snip]
>>>> +static void xive_esb_write(void *opaque, hwaddr addr,
>>>> +                           uint64_t value, unsigned size)
>>>> +{
>>>> +    XiveICSState *xs = ICS_XIVE(opaque);
>>>> +    XIVE *x = xs->xive;
>>>> +    uint32_t offset = addr & 0xF00;
>>>> +    uint32_t srcno = addr >> xs->esb_shift;
>>>> +    uint32_t lisn = srcno + ICS_BASE(xs)->offset;
>>>> +    XiveIVE *ive;
>>>> +    bool notify = false;
>>>> +
>>>> +    ive = xive_get_ive(x, lisn);
>>>> +    if (!ive || !(ive->w & IVE_VALID))  {
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
>>>> +        return;
>>>> +    }
>>>
>>> Having this code associated with the individual ICS look directly at
>>> the IVE table in the core xive object seems a bit dubious.
>>
>> The IVE table holds the validity and mask status of the interrupt 
>> entries, so we need that lookup. However, (continues below) ...
>>
>>> This also
>>> points out another mismatch between the re-used ICS code and the new
>>> XIVE code: ICS gathers all the per-source-irq flags/state into the
>>> irqstate structure, whereas xive has per-irq information in the
>>> centralized ecb and IVE tables.  There can certainly be good reasons
>>> for that, but using both at once is kind of clunky.
>>
>> I understand that you would rather put the esbs in the source they 
>> belong to. That is the case on real HW but it makes the modeling a 
>> bit more difficult. We would need to choose a MMIO address to give 
>> to the guest OS. I had some issues with the allocator (I need 
>> to look at this problem closer).
> 
> Uh.. what do MMIO addresses have to do with this?  I'm talking about
> the actual ESB state in the packed bit array.

To ease the modeling, the ESB states of all IRQs are under the same
bit array and they are exposed to the guest OS through a single memory 
region so that all accesses to the ESB states are centralized. On real 
HW, the ESB cache and the MMIO base depends on the source. 
 
Anyway, that might not be an issue. I will see what I can do to 
decorrelate the XIVE source from the main XIVE object. We can 
probably move the ESB state array below the XIVE source.

C.

>> It might also be an "issue" for KVM. Ben talked about maintaining 
>> all the esbs of a guest under a single memory region to be able to 
>> map the pages in the host.
>>
>> Any how, I agree this is another point to discuss in the sPAPR 
>> model.
>>
>> Thanks,
>>
>> C. 
>>
>>
>>>> +    if (srcno >= ICS_BASE(xs)->nr_irqs) {
>>>> +        qemu_log_mask(LOG_GUEST_ERROR,
>>>> +                      "XIVE: invalid IRQ number: %d/%d lisn: %d\n",
>>>> +                      srcno, ICS_BASE(xs)->nr_irqs, lisn);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    switch (offset) {
>>>> +    case 0:
>>>> +        /* TODO: should we trigger even if the IVE is masked ? */
>>>> +        notify = xive_pq_trigger(x, lisn);
>>>> +        break;
>>>> +    default:
>>>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n",
>>>> +                      offset);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (notify && !(ive->w & IVE_MASKED)) {
>>>> +        qemu_irq_pulse(ICS_BASE(xs)->qirqs[srcno]);
>>>> +    }
>>>> +}
>>>> +
>>>> +static const MemoryRegionOps xive_esb_ops = {
>>>> +    .read = xive_esb_read,
>>>> +    .write = xive_esb_write,
>>>> +    .endianness = DEVICE_BIG_ENDIAN,
>>>> +    .valid = {
>>>> +        .min_access_size = 8,
>>>> +        .max_access_size = 8,
>>>> +    },
>>>> +    .impl = {
>>>> +        .min_access_size = 8,
>>>> +        .max_access_size = 8,
>>>> +    },
>>>> +};
>>>> +
>>>> +/*
>>>>   * XIVE Interrupt Source
>>>>   */
>>>>  static void xive_ics_set_irq_msi(XiveICSState *xs, int srcno, int val)
>>>> @@ -106,15 +326,25 @@ static void xive_ics_realize(ICSState *ics, Error **errp)
>>>>          return;
>>>>      }
>>>>  
>>>> +    if (!xs->esb_shift) {
>>>> +        error_setg(errp, "ESB page size needs to be greater 0");
>>>> +        return;
>>>> +    }
>>>> +
>>>>      ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
>>>>      ics->qirqs = qemu_allocate_irqs(xive_ics_set_irq, xs, ics->nr_irqs);
>>>>  
>>>> +    memory_region_init_io(&xs->esb_iomem, OBJECT(xs), &xive_esb_ops, xs,
>>>> +                          "xive.esb",
>>>> +                          (1ull << xs->esb_shift) * ICS_BASE(xs)->nr_irqs);
>>>> +
>>>>      qemu_register_reset(xive_ics_reset, xs);
>>>>  }
>>>>  
>>>>  static Property xive_ics_properties[] = {
>>>>      DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0),
>>>>      DEFINE_PROP_UINT32("irq-base", ICSState, offset, 0),
>>>> +    DEFINE_PROP_UINT32("shift", XiveICSState, esb_shift, 0),
>>>>      DEFINE_PROP_END_OF_LIST(),
>>>>  };
>>>>  
>>>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>>>> index 544cc6e0c796..5303d96f5f59 100644
>>>> --- a/include/hw/ppc/xive.h
>>>> +++ b/include/hw/ppc/xive.h
>>>> @@ -33,6 +33,9 @@ typedef struct XiveICSState XiveICSState;
>>>>  struct XiveICSState {
>>>>      ICSState parent_obj;
>>>>  
>>>> +    uint32_t     esb_shift;
>>>> +    MemoryRegion esb_iomem;
>>>> +
>>>>      XIVE         *xive;
>>>>  };
>>>>  
>>>
>>
>
diff mbox

Patch

diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index 9ff14c0da595..816031b8ac81 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -32,6 +32,226 @@  static void xive_icp_irq(XiveICSState *xs, int lisn)
 }
 
 /*
+ * "magic" Event State Buffer (ESB) MMIO offsets.
+ *
+ * Each interrupt source has a 2-bit state machine called ESB
+ * which can be controlled by MMIO. It's made of 2 bits, P and
+ * Q. P indicates that an interrupt is pending (has been sent
+ * to a queue and is waiting for an EOI). Q indicates that the
+ * interrupt has been triggered while pending.
+ *
+ * This acts as a coalescing mechanism in order to guarantee
+ * that a given interrupt only occurs at most once in a queue.
+ *
+ * When doing an EOI, the Q bit will indicate if the interrupt
+ * needs to be re-triggered.
+ *
+ * The following offsets into the ESB MMIO allow to read or
+ * manipulate the PQ bits. They must be used with an 8-bytes
+ * load instruction. They all return the previous state of the
+ * interrupt (atomically).
+ *
+ * Additionally, some ESB pages support doing an EOI via a
+ * store at 0 and some ESBs support doing a trigger via a
+ * separate trigger page.
+ */
+#define XIVE_ESB_GET            0x800
+#define XIVE_ESB_SET_PQ_00      0xc00
+#define XIVE_ESB_SET_PQ_01      0xd00
+#define XIVE_ESB_SET_PQ_10      0xe00
+#define XIVE_ESB_SET_PQ_11      0xf00
+
+#define XIVE_ESB_VAL_P          0x2
+#define XIVE_ESB_VAL_Q          0x1
+
+#define XIVE_ESB_RESET          0x0
+#define XIVE_ESB_PENDING        0x2
+#define XIVE_ESB_QUEUED         0x3
+#define XIVE_ESB_OFF            0x1
+
+static uint8_t xive_pq_get(XIVE *x, uint32_t lisn)
+{
+    uint32_t idx = lisn;
+    uint32_t byte = idx / 4;
+    uint32_t bit  = (idx % 4) * 2;
+    uint8_t* pqs = (uint8_t *) x->sbe;
+
+    return (pqs[byte] >> bit) & 0x3;
+}
+
+static void xive_pq_set(XIVE *x, uint32_t lisn, uint8_t pq)
+{
+    uint32_t idx = lisn;
+    uint32_t byte = idx / 4;
+    uint32_t bit  = (idx % 4) * 2;
+    uint8_t* pqs = (uint8_t *) x->sbe;
+
+    pqs[byte] &= ~(0x3 << bit);
+    pqs[byte] |= (pq & 0x3) << bit;
+}
+
+static bool xive_pq_eoi(XIVE *x, uint32_t lisn)
+{
+    uint8_t old_pq = xive_pq_get(x, lisn);
+
+    switch (old_pq) {
+    case XIVE_ESB_RESET:
+        xive_pq_set(x, lisn, XIVE_ESB_RESET);
+        return false;
+    case XIVE_ESB_PENDING:
+        xive_pq_set(x, lisn, XIVE_ESB_RESET);
+        return false;
+    case XIVE_ESB_QUEUED:
+        xive_pq_set(x, lisn, XIVE_ESB_PENDING);
+        return true;
+    case XIVE_ESB_OFF:
+        xive_pq_set(x, lisn, XIVE_ESB_OFF);
+        return false;
+    default:
+         g_assert_not_reached();
+    }
+}
+
+static bool xive_pq_trigger(XIVE *x, uint32_t lisn)
+{
+    uint8_t old_pq = xive_pq_get(x, lisn);
+
+    switch (old_pq) {
+    case XIVE_ESB_RESET:
+        xive_pq_set(x, lisn, XIVE_ESB_PENDING);
+        return true;
+    case XIVE_ESB_PENDING:
+        xive_pq_set(x, lisn, XIVE_ESB_QUEUED);
+        return true;
+    case XIVE_ESB_QUEUED:
+        xive_pq_set(x, lisn, XIVE_ESB_QUEUED);
+        return true;
+    case XIVE_ESB_OFF:
+        xive_pq_set(x, lisn, XIVE_ESB_OFF);
+        return false;
+    default:
+         g_assert_not_reached();
+    }
+}
+
+/*
+ * XIVE Interrupt Source MMIOs
+ */
+static void xive_ics_eoi(XiveICSState *xs, uint32_t srcno)
+{
+    ICSIRQState *irq = &ICS_BASE(xs)->irqs[srcno];
+
+    if (irq->flags & XICS_FLAGS_IRQ_LSI) {
+        irq->status &= ~XICS_STATUS_SENT;
+    }
+}
+
+/* TODO: handle second page */
+static uint64_t xive_esb_read(void *opaque, hwaddr addr, unsigned size)
+{
+    XiveICSState *xs = ICS_XIVE(opaque);
+    XIVE *x = xs->xive;
+    uint32_t offset = addr & 0xF00;
+    uint32_t srcno = addr >> xs->esb_shift;
+    uint32_t lisn = srcno + ICS_BASE(xs)->offset;
+    XiveIVE *ive;
+    uint64_t ret = -1;
+
+    ive = xive_get_ive(x, lisn);
+    if (!ive || !(ive->w & IVE_VALID))  {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
+        goto out;
+    }
+
+    if (srcno >= ICS_BASE(xs)->nr_irqs) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "XIVE: invalid IRQ number: %d/%d lisn: %d\n",
+                      srcno, ICS_BASE(xs)->nr_irqs, lisn);
+        goto out;
+    }
+
+    switch (offset) {
+    case 0:
+        xive_ics_eoi(xs, srcno);
+
+        /* return TRUE or FALSE depending on PQ value */
+        ret = xive_pq_eoi(x, lisn);
+        break;
+
+    case XIVE_ESB_GET:
+        ret = xive_pq_get(x, lisn);
+        break;
+
+    case XIVE_ESB_SET_PQ_00:
+    case XIVE_ESB_SET_PQ_01:
+    case XIVE_ESB_SET_PQ_10:
+    case XIVE_ESB_SET_PQ_11:
+        ret = xive_pq_get(x, lisn);
+        xive_pq_set(x, lisn, (offset >> 8) & 0x3);
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB addr %d\n", offset);
+    }
+
+out:
+    return ret;
+}
+
+static void xive_esb_write(void *opaque, hwaddr addr,
+                           uint64_t value, unsigned size)
+{
+    XiveICSState *xs = ICS_XIVE(opaque);
+    XIVE *x = xs->xive;
+    uint32_t offset = addr & 0xF00;
+    uint32_t srcno = addr >> xs->esb_shift;
+    uint32_t lisn = srcno + ICS_BASE(xs)->offset;
+    XiveIVE *ive;
+    bool notify = false;
+
+    ive = xive_get_ive(x, lisn);
+    if (!ive || !(ive->w & IVE_VALID))  {
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn);
+        return;
+    }
+
+    if (srcno >= ICS_BASE(xs)->nr_irqs) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "XIVE: invalid IRQ number: %d/%d lisn: %d\n",
+                      srcno, ICS_BASE(xs)->nr_irqs, lisn);
+        return;
+    }
+
+    switch (offset) {
+    case 0:
+        /* TODO: should we trigger even if the IVE is masked ? */
+        notify = xive_pq_trigger(x, lisn);
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n",
+                      offset);
+        return;
+    }
+
+    if (notify && !(ive->w & IVE_MASKED)) {
+        qemu_irq_pulse(ICS_BASE(xs)->qirqs[srcno]);
+    }
+}
+
+static const MemoryRegionOps xive_esb_ops = {
+    .read = xive_esb_read,
+    .write = xive_esb_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+    .valid = {
+        .min_access_size = 8,
+        .max_access_size = 8,
+    },
+    .impl = {
+        .min_access_size = 8,
+        .max_access_size = 8,
+    },
+};
+
+/*
  * XIVE Interrupt Source
  */
 static void xive_ics_set_irq_msi(XiveICSState *xs, int srcno, int val)
@@ -106,15 +326,25 @@  static void xive_ics_realize(ICSState *ics, Error **errp)
         return;
     }
 
+    if (!xs->esb_shift) {
+        error_setg(errp, "ESB page size needs to be greater 0");
+        return;
+    }
+
     ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
     ics->qirqs = qemu_allocate_irqs(xive_ics_set_irq, xs, ics->nr_irqs);
 
+    memory_region_init_io(&xs->esb_iomem, OBJECT(xs), &xive_esb_ops, xs,
+                          "xive.esb",
+                          (1ull << xs->esb_shift) * ICS_BASE(xs)->nr_irqs);
+
     qemu_register_reset(xive_ics_reset, xs);
 }
 
 static Property xive_ics_properties[] = {
     DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0),
     DEFINE_PROP_UINT32("irq-base", ICSState, offset, 0),
+    DEFINE_PROP_UINT32("shift", XiveICSState, esb_shift, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
index 544cc6e0c796..5303d96f5f59 100644
--- a/include/hw/ppc/xive.h
+++ b/include/hw/ppc/xive.h
@@ -33,6 +33,9 @@  typedef struct XiveICSState XiveICSState;
 struct XiveICSState {
     ICSState parent_obj;
 
+    uint32_t     esb_shift;
+    MemoryRegion esb_iomem;
+
     XIVE         *xive;
 };