diff mbox

[RFC,04/26] ppc/xive: introduce a skeleton for the XIVE interrupt controller model

Message ID 1499274819-15607-5-git-send-email-clg@kaod.org
State New
Headers show

Commit Message

Cédric Le Goater July 5, 2017, 5:13 p.m. UTC
Let's provide an empty shell for the XIVE controller model with a
couple of attributes for the IRQ number allocator. The latter is
largely inspired by OPAL which allocates IPI IRQ numbers from the
bottom of the IRQ number space and allocates the HW IRQ numbers from
the top.

The number of IPIs is simply deduced from the max number of CPUs the
guest supports and we provision a arbitrary number of HW irqs.

The XIVE object is kept private because it will hold internal tables
which do not need to be exposed to sPAPR.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 default-configs/ppc64-softmmu.mak |  1 +
 hw/intc/Makefile.objs             |  1 +
 hw/intc/xive-internal.h           | 28 ++++++++++++
 hw/intc/xive.c                    | 94 +++++++++++++++++++++++++++++++++++++++
 include/hw/ppc/xive.h             | 27 +++++++++++
 5 files changed, 151 insertions(+)
 create mode 100644 hw/intc/xive-internal.h
 create mode 100644 hw/intc/xive.c
 create mode 100644 include/hw/ppc/xive.h

Comments

David Gibson July 19, 2017, 3:08 a.m. UTC | #1
On Wed, Jul 05, 2017 at 07:13:17PM +0200, Cédric Le Goater wrote:
> Let's provide an empty shell for the XIVE controller model with a
> couple of attributes for the IRQ number allocator. The latter is
> largely inspired by OPAL which allocates IPI IRQ numbers from the
> bottom of the IRQ number space and allocates the HW IRQ numbers from
> the top.
> 
> The number of IPIs is simply deduced from the max number of CPUs the
> guest supports and we provision a arbitrary number of HW irqs.
> 
> The XIVE object is kept private because it will hold internal tables
> which do not need to be exposed to sPAPR.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  default-configs/ppc64-softmmu.mak |  1 +
>  hw/intc/Makefile.objs             |  1 +
>  hw/intc/xive-internal.h           | 28 ++++++++++++
>  hw/intc/xive.c                    | 94 +++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/xive.h             | 27 +++++++++++
>  5 files changed, 151 insertions(+)
>  create mode 100644 hw/intc/xive-internal.h
>  create mode 100644 hw/intc/xive.c
>  create mode 100644 include/hw/ppc/xive.h
> 
> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> index 46c95993217d..1179c07e6e9f 100644
> --- a/default-configs/ppc64-softmmu.mak
> +++ b/default-configs/ppc64-softmmu.mak
> @@ -56,6 +56,7 @@ CONFIG_SM501=y
>  CONFIG_XICS=$(CONFIG_PSERIES)
>  CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
>  CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
> +CONFIG_XIVE=$(CONFIG_PSERIES)
>  # For PReP
>  CONFIG_SERIAL_ISA=y
>  CONFIG_MC146818RTC=y
> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> index 78426a7dafcd..28b83456bfcc 100644
> --- a/hw/intc/Makefile.objs
> +++ b/hw/intc/Makefile.objs
> @@ -35,6 +35,7 @@ obj-$(CONFIG_SH4) += sh_intc.o
>  obj-$(CONFIG_XICS) += xics.o
>  obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
>  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
> +obj-$(CONFIG_XIVE) += xive.o
>  obj-$(CONFIG_POWERNV) += xics_pnv.o
>  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
>  obj-$(CONFIG_S390_FLIC) += s390_flic.o
> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
> new file mode 100644
> index 000000000000..155c2dcd6066
> --- /dev/null
> +++ b/hw/intc/xive-internal.h
> @@ -0,0 +1,28 @@
> +/*
> + * Copyright 2016,2017 IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +#ifndef _INTC_XIVE_INTERNAL_H
> +#define _INTC_XIVE_INTERNAL_H
> +
> +#include <hw/sysbus.h>
> +
> +struct XIVE {
> +    SysBusDevice parent;

XIVE probably shouldn't be a SysBusDevice.  According to agraf, that
should only be used for things which have an MMIO presence on a bus
structure that's not worth the bother of more specifically modelling.

I don't think that's the case for XIVE, so it should just have
TYPE_DEVICE as its parent.  There are several pseries things which
already get this wrong (mostly because I made them before fully
understanding the role of the SysBus), but we should avoid adding
others.

> +    /* Properties */
> +    uint32_t     nr_targets;
> +
> +    /* IRQ number allocator */
> +    uint32_t     int_count;     /* Number of interrupts: nr_targets + HW IRQs */
> +    uint32_t     int_base;      /* Min index */
> +    uint32_t     int_max;       /* Max index */
> +    uint32_t     int_hw_bot;    /* Bottom index of HW IRQ allocator */
> +    uint32_t     int_ipi_top;   /* Highest IPI index handed out so far + 1 */
> +};
> +
> +#endif /* _INTC_XIVE_INTERNAL_H */
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> new file mode 100644
> index 000000000000..5b4ea915d87c
> --- /dev/null
> +++ b/hw/intc/xive.c
> @@ -0,0 +1,94 @@
> +/*
> + * QEMU PowerPC XIVE model
> + *
> + * Copyright (c) 2017, IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qapi/error.h"
> +#include "target/ppc/cpu.h"
> +#include "sysemu/cpus.h"
> +#include "sysemu/dma.h"
> +#include "monitor/monitor.h"
> +#include "hw/ppc/xive.h"
> +
> +#include "xive-internal.h"
> +
> +/*
> + * Main XIVE object

As with XICs, does it really make sense for there to be a "main" XIVE
object, or should be an interface attached to the machine?

> + */
> +
> +/* Let's provision some HW IRQ numbers. We could use a XIVE property
> + * also but it does not seem necessary for the moment.
> + */
> +#define MAX_HW_IRQS_ENTRIES (8 * 1024)
> +
> +static void xive_init(Object *obj)
> +{
> +    ;
> +}
> +
> +static void xive_realize(DeviceState *dev, Error **errp)
> +{
> +    XIVE *x = XIVE(dev);
> +
> +    if (!x->nr_targets) {
> +        error_setg(errp, "Number of interrupt targets needs to be greater 0");
> +        return;
> +    }
> +
> +    /* Initialize IRQ number allocator. Let's use a base number if we
> +     * need to introduce a notion of blocks one day.
> +     */
> +    x->int_base = 0;
> +    x->int_count = x->nr_targets + MAX_HW_IRQS_ENTRIES;
> +    x->int_max = x->int_base + x->int_count;
> +    x->int_hw_bot = x->int_max;
> +    x->int_ipi_top = x->int_base;
> +
> +    /* Reserve some numbers as OPAL does ? */
> +    if (x->int_ipi_top < 0x10) {
> +        x->int_ipi_top = 0x10;
> +    }

I'm somewhat uncomfortable with an irq allocater here in the intc
code.  As a rule, irq allocation is the responsibility of the machine,
not any sub-component.  Furthermore, it should allocate in a way which
is repeatable, since they need to stay stable across reboots and
migrations.

And, yes, we have an allocator of sorts in XICS - it has caused a
number of problems in the past.

> +}
> +
> +static Property xive_properties[] = {
> +    DEFINE_PROP_UINT32("nr-targets", XIVE, nr_targets, 0),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void xive_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->realize = xive_realize;
> +    dc->props = xive_properties;
> +    dc->desc = "XIVE";
> +}
> +
> +static const TypeInfo xive_info = {
> +    .name = TYPE_XIVE,
> +    .parent = TYPE_SYS_BUS_DEVICE,
> +    .instance_init = xive_init,
> +    .instance_size = sizeof(XIVE),
> +    .class_init = xive_class_init,
> +};
> +
> +static void xive_register_types(void)
> +{
> +    type_register_static(&xive_info);
> +}
> +
> +type_init(xive_register_types)
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> new file mode 100644
> index 000000000000..863f5a9c6b5f
> --- /dev/null
> +++ b/include/hw/ppc/xive.h
> @@ -0,0 +1,27 @@
> +/*
> + * QEMU PowerPC XIVE model
> + *
> + * Copyright (c) 2017, IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef PPC_XIVE_H
> +#define PPC_XIVE_H
> +
> +typedef struct XIVE XIVE;
> +
> +#define TYPE_XIVE "xive"
> +#define XIVE(obj) OBJECT_CHECK(XIVE, (obj), TYPE_XIVE)
> +
> +#endif /* PPC_XIVE_H */
David Gibson July 19, 2017, 3:23 a.m. UTC | #2
On Wed, Jul 19, 2017 at 01:08:49PM +1000, David Gibson wrote:
> On Wed, Jul 05, 2017 at 07:13:17PM +0200, Cédric Le Goater wrote:
> > Let's provide an empty shell for the XIVE controller model with a
> > couple of attributes for the IRQ number allocator. The latter is
> > largely inspired by OPAL which allocates IPI IRQ numbers from the
> > bottom of the IRQ number space and allocates the HW IRQ numbers from
> > the top.
> > 
> > The number of IPIs is simply deduced from the max number of CPUs the
> > guest supports and we provision a arbitrary number of HW irqs.
> > 
> > The XIVE object is kept private because it will hold internal tables
> > which do not need to be exposed to sPAPR.
> > 
> > Signed-off-by: Cédric Le Goater <clg@kaod.org>
> > ---
> >  default-configs/ppc64-softmmu.mak |  1 +
> >  hw/intc/Makefile.objs             |  1 +
> >  hw/intc/xive-internal.h           | 28 ++++++++++++
> >  hw/intc/xive.c                    | 94 +++++++++++++++++++++++++++++++++++++++
> >  include/hw/ppc/xive.h             | 27 +++++++++++
> >  5 files changed, 151 insertions(+)
> >  create mode 100644 hw/intc/xive-internal.h
> >  create mode 100644 hw/intc/xive.c
> >  create mode 100644 include/hw/ppc/xive.h
> > 
> > diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> > index 46c95993217d..1179c07e6e9f 100644
> > --- a/default-configs/ppc64-softmmu.mak
> > +++ b/default-configs/ppc64-softmmu.mak
> > @@ -56,6 +56,7 @@ CONFIG_SM501=y
> >  CONFIG_XICS=$(CONFIG_PSERIES)
> >  CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
> >  CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
> > +CONFIG_XIVE=$(CONFIG_PSERIES)
> >  # For PReP
> >  CONFIG_SERIAL_ISA=y
> >  CONFIG_MC146818RTC=y
> > diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> > index 78426a7dafcd..28b83456bfcc 100644
> > --- a/hw/intc/Makefile.objs
> > +++ b/hw/intc/Makefile.objs
> > @@ -35,6 +35,7 @@ obj-$(CONFIG_SH4) += sh_intc.o
> >  obj-$(CONFIG_XICS) += xics.o
> >  obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
> >  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
> > +obj-$(CONFIG_XIVE) += xive.o
> >  obj-$(CONFIG_POWERNV) += xics_pnv.o
> >  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
> >  obj-$(CONFIG_S390_FLIC) += s390_flic.o
> > diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
> > new file mode 100644
> > index 000000000000..155c2dcd6066
> > --- /dev/null
> > +++ b/hw/intc/xive-internal.h
> > @@ -0,0 +1,28 @@
> > +/*
> > + * Copyright 2016,2017 IBM Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * as published by the Free Software Foundation; either version
> > + * 2 of the License, or (at your option) any later version.
> > + */
> > +#ifndef _INTC_XIVE_INTERNAL_H
> > +#define _INTC_XIVE_INTERNAL_H
> > +
> > +#include <hw/sysbus.h>
> > +
> > +struct XIVE {
> > +    SysBusDevice parent;
> 
> XIVE probably shouldn't be a SysBusDevice.  According to agraf, that
> should only be used for things which have an MMIO presence on a bus
> structure that's not worth the bother of more specifically modelling.
> 
> I don't think that's the case for XIVE, so it should just have
> TYPE_DEVICE as its parent.  There are several pseries things which
> already get this wrong (mostly because I made them before fully
> understanding the role of the SysBus), but we should avoid adding
> others.
> 
> > +    /* Properties */
> > +    uint32_t     nr_targets;
> > +
> > +    /* IRQ number allocator */
> > +    uint32_t     int_count;     /* Number of interrupts: nr_targets + HW IRQs */
> > +    uint32_t     int_base;      /* Min index */
> > +    uint32_t     int_max;       /* Max index */
> > +    uint32_t     int_hw_bot;    /* Bottom index of HW IRQ allocator */
> > +    uint32_t     int_ipi_top;   /* Highest IPI index handed out so far + 1 */
> > +};
> > +
> > +#endif /* _INTC_XIVE_INTERNAL_H */
> > diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> > new file mode 100644
> > index 000000000000..5b4ea915d87c
> > --- /dev/null
> > +++ b/hw/intc/xive.c
> > @@ -0,0 +1,94 @@
> > +/*
> > + * QEMU PowerPC XIVE model
> > + *
> > + * Copyright (c) 2017, IBM Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License, version 2, as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +#include "qemu/osdep.h"
> > +#include "qemu/log.h"
> > +#include "qapi/error.h"
> > +#include "target/ppc/cpu.h"
> > +#include "sysemu/cpus.h"
> > +#include "sysemu/dma.h"
> > +#include "monitor/monitor.h"
> > +#include "hw/ppc/xive.h"
> > +
> > +#include "xive-internal.h"
> > +
> > +/*
> > + * Main XIVE object
> 
> As with XICs, does it really make sense for there to be a "main" XIVE
> object, or should be an interface attached to the machine?
> 
> > + */
> > +
> > +/* Let's provision some HW IRQ numbers. We could use a XIVE property
> > + * also but it does not seem necessary for the moment.
> > + */
> > +#define MAX_HW_IRQS_ENTRIES (8 * 1024)
> > +
> > +static void xive_init(Object *obj)
> > +{
> > +    ;
> > +}
> > +
> > +static void xive_realize(DeviceState *dev, Error **errp)
> > +{
> > +    XIVE *x = XIVE(dev);
> > +
> > +    if (!x->nr_targets) {
> > +        error_setg(errp, "Number of interrupt targets needs to be greater 0");
> > +        return;
> > +    }
> > +
> > +    /* Initialize IRQ number allocator. Let's use a base number if we
> > +     * need to introduce a notion of blocks one day.
> > +     */
> > +    x->int_base = 0;
> > +    x->int_count = x->nr_targets + MAX_HW_IRQS_ENTRIES;
> > +    x->int_max = x->int_base + x->int_count;

Also storing a value which is easily derivable from values already in
the structure is a bit silly.  I think you should drop either
int_count or int_max.

> > +    x->int_hw_bot = x->int_max;
> > +    x->int_ipi_top = x->int_base;
> > +
> > +    /* Reserve some numbers as OPAL does ? */
> > +    if (x->int_ipi_top < 0x10) {
> > +        x->int_ipi_top = 0x10;
> > +    }
> 
> I'm somewhat uncomfortable with an irq allocater here in the intc
> code.  As a rule, irq allocation is the responsibility of the machine,
> not any sub-component.  Furthermore, it should allocate in a way which
> is repeatable, since they need to stay stable across reboots and
> migrations.
> 
> And, yes, we have an allocator of sorts in XICS - it has caused a
> number of problems in the past.
> 
> > +}
> > +
> > +static Property xive_properties[] = {
> > +    DEFINE_PROP_UINT32("nr-targets", XIVE, nr_targets, 0),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +static void xive_class_init(ObjectClass *klass, void *data)
> > +{
> > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > +
> > +    dc->realize = xive_realize;
> > +    dc->props = xive_properties;
> > +    dc->desc = "XIVE";
> > +}
> > +
> > +static const TypeInfo xive_info = {
> > +    .name = TYPE_XIVE,
> > +    .parent = TYPE_SYS_BUS_DEVICE,
> > +    .instance_init = xive_init,
> > +    .instance_size = sizeof(XIVE),
> > +    .class_init = xive_class_init,
> > +};
> > +
> > +static void xive_register_types(void)
> > +{
> > +    type_register_static(&xive_info);
> > +}
> > +
> > +type_init(xive_register_types)
> > diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> > new file mode 100644
> > index 000000000000..863f5a9c6b5f
> > --- /dev/null
> > +++ b/include/hw/ppc/xive.h
> > @@ -0,0 +1,27 @@
> > +/*
> > + * QEMU PowerPC XIVE model
> > + *
> > + * Copyright (c) 2017, IBM Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License, version 2, as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#ifndef PPC_XIVE_H
> > +#define PPC_XIVE_H
> > +
> > +typedef struct XIVE XIVE;
> > +
> > +#define TYPE_XIVE "xive"
> > +#define XIVE(obj) OBJECT_CHECK(XIVE, (obj), TYPE_XIVE)
> > +
> > +#endif /* PPC_XIVE_H */
>
Benjamin Herrenschmidt July 19, 2017, 3:56 a.m. UTC | #3
On Wed, 2017-07-19 at 13:08 +1000, David Gibson wrote:
> On Wed, Jul 05, 2017 at 07:13:17PM +0200, Cédric Le Goater wrote:
> > Let's provide an empty shell for the XIVE controller model with a
> > couple of attributes for the IRQ number allocator. The latter is
> > largely inspired by OPAL which allocates IPI IRQ numbers from the
> > bottom of the IRQ number space and allocates the HW IRQ numbers from
> > the top.
> > 
> > The number of IPIs is simply deduced from the max number of CPUs the
> > guest supports and we provision a arbitrary number of HW irqs.
> > 
> > The XIVE object is kept private because it will hold internal tables
> > which do not need to be exposed to sPAPR.

It does have an MMIO presence though... more than one even. There's the
TIMA (per-HW thread control area) and there's the per-interrupt MMIO
space which are exposed to the guest. There's also the per-queue
MMIO control area too.

Ben.

> > Signed-off-by: Cédric Le Goater <clg@kaod.org>
> > ---
> >  default-configs/ppc64-softmmu.mak |  1 +
> >  hw/intc/Makefile.objs             |  1 +
> >  hw/intc/xive-internal.h           | 28 ++++++++++++
> >  hw/intc/xive.c                    | 94 +++++++++++++++++++++++++++++++++++++++
> >  include/hw/ppc/xive.h             | 27 +++++++++++
> >  5 files changed, 151 insertions(+)
> >  create mode 100644 hw/intc/xive-internal.h
> >  create mode 100644 hw/intc/xive.c
> >  create mode 100644 include/hw/ppc/xive.h
> > 
> > diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> > index 46c95993217d..1179c07e6e9f 100644
> > --- a/default-configs/ppc64-softmmu.mak
> > +++ b/default-configs/ppc64-softmmu.mak
> > @@ -56,6 +56,7 @@ CONFIG_SM501=y
> >  CONFIG_XICS=$(CONFIG_PSERIES)
> >  CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
> >  CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
> > +CONFIG_XIVE=$(CONFIG_PSERIES)
> >  # For PReP
> >  CONFIG_SERIAL_ISA=y
> >  CONFIG_MC146818RTC=y
> > diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> > index 78426a7dafcd..28b83456bfcc 100644
> > --- a/hw/intc/Makefile.objs
> > +++ b/hw/intc/Makefile.objs
> > @@ -35,6 +35,7 @@ obj-$(CONFIG_SH4) += sh_intc.o
> >  obj-$(CONFIG_XICS) += xics.o
> >  obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
> >  obj-$(CONFIG_XICS_KVM) += xics_kvm.o
> > +obj-$(CONFIG_XIVE) += xive.o
> >  obj-$(CONFIG_POWERNV) += xics_pnv.o
> >  obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
> >  obj-$(CONFIG_S390_FLIC) += s390_flic.o
> > diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
> > new file mode 100644
> > index 000000000000..155c2dcd6066
> > --- /dev/null
> > +++ b/hw/intc/xive-internal.h
> > @@ -0,0 +1,28 @@
> > +/*
> > + * Copyright 2016,2017 IBM Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * as published by the Free Software Foundation; either version
> > + * 2 of the License, or (at your option) any later version.
> > + */
> > +#ifndef _INTC_XIVE_INTERNAL_H
> > +#define _INTC_XIVE_INTERNAL_H
> > +
> > +#include <hw/sysbus.h>
> > +
> > +struct XIVE {
> > +    SysBusDevice parent;
> 
> XIVE probably shouldn't be a SysBusDevice.  According to agraf, that
> should only be used for things which have an MMIO presence on a bus
> structure that's not worth the bother of more specifically modelling.
> 
> I don't think that's the case for XIVE, so it should just have
> TYPE_DEVICE as its parent.  There are several pseries things which
> already get this wrong (mostly because I made them before fully
> understanding the role of the SysBus), but we should avoid adding
> others.
> 
> > +    /* Properties */
> > +    uint32_t     nr_targets;
> > +
> > +    /* IRQ number allocator */
> > +    uint32_t     int_count;     /* Number of interrupts: nr_targets + HW IRQs */
> > +    uint32_t     int_base;      /* Min index */
> > +    uint32_t     int_max;       /* Max index */
> > +    uint32_t     int_hw_bot;    /* Bottom index of HW IRQ allocator */
> > +    uint32_t     int_ipi_top;   /* Highest IPI index handed out so far + 1 */
> > +};
> > +
> > +#endif /* _INTC_XIVE_INTERNAL_H */
> > diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> > new file mode 100644
> > index 000000000000..5b4ea915d87c
> > --- /dev/null
> > +++ b/hw/intc/xive.c
> > @@ -0,0 +1,94 @@
> > +/*
> > + * QEMU PowerPC XIVE model
> > + *
> > + * Copyright (c) 2017, IBM Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License, version 2, as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +#include "qemu/osdep.h"
> > +#include "qemu/log.h"
> > +#include "qapi/error.h"
> > +#include "target/ppc/cpu.h"
> > +#include "sysemu/cpus.h"
> > +#include "sysemu/dma.h"
> > +#include "monitor/monitor.h"
> > +#include "hw/ppc/xive.h"
> > +
> > +#include "xive-internal.h"
> > +
> > +/*
> > + * Main XIVE object
> 
> As with XICs, does it really make sense for there to be a "main" XIVE
> object, or should be an interface attached to the machine?
> 
> > + */
> > +
> > +/* Let's provision some HW IRQ numbers. We could use a XIVE property
> > + * also but it does not seem necessary for the moment.
> > + */
> > +#define MAX_HW_IRQS_ENTRIES (8 * 1024)
> > +
> > +static void xive_init(Object *obj)
> > +{
> > +    ;
> > +}
> > +
> > +static void xive_realize(DeviceState *dev, Error **errp)
> > +{
> > +    XIVE *x = XIVE(dev);
> > +
> > +    if (!x->nr_targets) {
> > +        error_setg(errp, "Number of interrupt targets needs to be greater 0");
> > +        return;
> > +    }
> > +
> > +    /* Initialize IRQ number allocator. Let's use a base number if we
> > +     * need to introduce a notion of blocks one day.
> > +     */
> > +    x->int_base = 0;
> > +    x->int_count = x->nr_targets + MAX_HW_IRQS_ENTRIES;
> > +    x->int_max = x->int_base + x->int_count;
> > +    x->int_hw_bot = x->int_max;
> > +    x->int_ipi_top = x->int_base;
> > +
> > +    /* Reserve some numbers as OPAL does ? */
> > +    if (x->int_ipi_top < 0x10) {
> > +        x->int_ipi_top = 0x10;
> > +    }
> 
> I'm somewhat uncomfortable with an irq allocater here in the intc
> code.  As a rule, irq allocation is the responsibility of the machine,
> not any sub-component.  Furthermore, it should allocate in a way which
> is repeatable, since they need to stay stable across reboots and
> migrations.
> 
> And, yes, we have an allocator of sorts in XICS - it has caused a
> number of problems in the past.
> 
> > +}
> > +
> > +static Property xive_properties[] = {
> > +    DEFINE_PROP_UINT32("nr-targets", XIVE, nr_targets, 0),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +static void xive_class_init(ObjectClass *klass, void *data)
> > +{
> > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > +
> > +    dc->realize = xive_realize;
> > +    dc->props = xive_properties;
> > +    dc->desc = "XIVE";
> > +}
> > +
> > +static const TypeInfo xive_info = {
> > +    .name = TYPE_XIVE,
> > +    .parent = TYPE_SYS_BUS_DEVICE,
> > +    .instance_init = xive_init,
> > +    .instance_size = sizeof(XIVE),
> > +    .class_init = xive_class_init,
> > +};
> > +
> > +static void xive_register_types(void)
> > +{
> > +    type_register_static(&xive_info);
> > +}
> > +
> > +type_init(xive_register_types)
> > diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> > new file mode 100644
> > index 000000000000..863f5a9c6b5f
> > --- /dev/null
> > +++ b/include/hw/ppc/xive.h
> > @@ -0,0 +1,27 @@
> > +/*
> > + * QEMU PowerPC XIVE model
> > + *
> > + * Copyright (c) 2017, IBM Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License, version 2, as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#ifndef PPC_XIVE_H
> > +#define PPC_XIVE_H
> > +
> > +typedef struct XIVE XIVE;
> > +
> > +#define TYPE_XIVE "xive"
> > +#define XIVE(obj) OBJECT_CHECK(XIVE, (obj), TYPE_XIVE)
> > +
> > +#endif /* PPC_XIVE_H */
> 
>
David Gibson July 19, 2017, 4:01 a.m. UTC | #4
On Wed, Jul 19, 2017 at 01:56:57PM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2017-07-19 at 13:08 +1000, David Gibson wrote:
> > On Wed, Jul 05, 2017 at 07:13:17PM +0200, Cédric Le Goater wrote:
> > > Let's provide an empty shell for the XIVE controller model with a
> > > couple of attributes for the IRQ number allocator. The latter is
> > > largely inspired by OPAL which allocates IPI IRQ numbers from the
> > > bottom of the IRQ number space and allocates the HW IRQ numbers from
> > > the top.
> > > 
> > > The number of IPIs is simply deduced from the max number of CPUs the
> > > guest supports and we provision a arbitrary number of HW irqs.
> > > 
> > > The XIVE object is kept private because it will hold internal tables
> > > which do not need to be exposed to sPAPR.
> 
> It does have an MMIO presence though... more than one even. There's the
> TIMA (per-HW thread control area) and there's the per-interrupt MMIO
> space which are exposed to the guest. There's also the per-queue
> MMIO control area too.

Ok.  Always?  Or just on powernv?

If it only has an MMIO presence on powernv, then the "core" xive
object should probably be TYPE_DEVICE, with the powernv specific
device being a SysBusDevice which incorporates the core xive device
inside it.
Benjamin Herrenschmidt July 19, 2017, 4:02 a.m. UTC | #5
On Wed, 2017-07-19 at 13:08 +1000, David Gibson wrote:
> 
> I'm somewhat uncomfortable with an irq allocater here in the intc
> code.  As a rule, irq allocation is the responsibility of the machine,
> not any sub-component.  Furthermore, it should allocate in a way which
> is repeatable, since they need to stay stable across reboots and
> migrations.
> 
> And, yes, we have an allocator of sorts in XICS - it has caused a
> number of problems in the past.

So....

For a bare metal model (which we don't have yet) of XIVE, the IRQ
numbering is entirely an artifact of how the HW is configured. There
should thus be no interrupt numbers visible to qemu.

For a PAPR model things are a bit different, but if we want to
maximize code re-use between the two, we probably need to make sure
the interrupts "allocated" by the machine for XIVE can be represented
by the HW model.

That means:

 - Each chip has a range (high bits are the block ID, which maps to a
chip, low bits, around 512K to 1M interrupts is the per-chip space).

 - Interrupts 0...N of that range (N depends on how much backing
memory and MMIO space is provisioned for each chip) are "generic IPIs"
which are somewhat generic interrupt source that can be triggered with
an MMIO store and routed to any target. Those are used in PAPR for
things like IPIs and some type of accelerator interrupts.

 - Portions of that range (which may or may not overlap the 0...N
above, if they do they "shadow" the generic interrupts) can be
configured to be the HW sources from the various PCIe bridges and
the PSI controller.

Cheers,
Ben.

> > +}
> > +
> > +static Property xive_properties[] = {
> > +    DEFINE_PROP_UINT32("nr-targets", XIVE, nr_targets, 0),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +static void xive_class_init(ObjectClass *klass, void *data)
> > +{
> > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > +
> > +    dc->realize = xive_realize;
> > +    dc->props = xive_properties;
> > +    dc->desc = "XIVE";
> > +}
> > +
> > +static const TypeInfo xive_info = {
> > +    .name = TYPE_XIVE,
> > +    .parent = TYPE_SYS_BUS_DEVICE,
> > +    .instance_init = xive_init,
> > +    .instance_size = sizeof(XIVE),
> > +    .class_init = xive_class_init,
> > +};
> > +
> > +static void xive_register_types(void)
> > +{
> > +    type_register_static(&xive_info);
> > +}
> > +
> > +type_init(xive_register_types)
> > diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> > new file mode 100644
> > index 000000000000..863f5a9c6b5f
> > --- /dev/null
> > +++ b/include/hw/ppc/xive.h
> > @@ -0,0 +1,27 @@
> > +/*
> > + * QEMU PowerPC XIVE model
> > + *
> > + * Copyright (c) 2017, IBM Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License, version 2, as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#ifndef PPC_XIVE_H
> > +#define PPC_XIVE_H
> > +
> > +typedef struct XIVE XIVE;
> > +
> > +#define TYPE_XIVE "xive"
> > +#define XIVE(obj) OBJECT_CHECK(XIVE, (obj), TYPE_XIVE)
> > +
> > +#endif /* PPC_XIVE_H */
>
Benjamin Herrenschmidt July 19, 2017, 4:18 a.m. UTC | #6
On Wed, 2017-07-19 at 14:01 +1000, David Gibson wrote:
> On Wed, Jul 19, 2017 at 01:56:57PM +1000, Benjamin Herrenschmidt wrote:
> > On Wed, 2017-07-19 at 13:08 +1000, David Gibson wrote:
> > > On Wed, Jul 05, 2017 at 07:13:17PM +0200, Cédric Le Goater wrote:
> > > > Let's provide an empty shell for the XIVE controller model with a
> > > > couple of attributes for the IRQ number allocator. The latter is
> > > > largely inspired by OPAL which allocates IPI IRQ numbers from the
> > > > bottom of the IRQ number space and allocates the HW IRQ numbers from
> > > > the top.
> > > > 
> > > > The number of IPIs is simply deduced from the max number of CPUs the
> > > > guest supports and we provision a arbitrary number of HW irqs.
> > > > 
> > > > The XIVE object is kept private because it will hold internal tables
> > > > which do not need to be exposed to sPAPR.
> > 
> > It does have an MMIO presence though... more than one even. There's the
> > TIMA (per-HW thread control area) and there's the per-interrupt MMIO
> > space which are exposed to the guest. There's also the per-queue
> > MMIO control area too.
> 
> Ok.  Always?  Or just on powernv?
> 
> If it only has an MMIO presence on powernv, then the "core" xive
> object should probably be TYPE_DEVICE, with the powernv specific
> device being a SysBusDevice which incorporates the core xive device
> inside it.

No the ones above are on PAPR. PowerNV has even more :-)

The TIMA (thread management area) is the MMIO area through which
you control the current CPU priority etc...

It's designed in HW to "know" which core/thread is accessing it (it's
at a fixed address) and respond appropriately based on that and which
virtual CPU has been activated on that core/thread.

It's part of what allows XIVE to deliver interrupts without any HV
calls.

Cheers,
Ben.
David Gibson July 19, 2017, 4:25 a.m. UTC | #7
On Wed, Jul 19, 2017 at 02:18:17PM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2017-07-19 at 14:01 +1000, David Gibson wrote:
> > On Wed, Jul 19, 2017 at 01:56:57PM +1000, Benjamin Herrenschmidt wrote:
> > > On Wed, 2017-07-19 at 13:08 +1000, David Gibson wrote:
> > > > On Wed, Jul 05, 2017 at 07:13:17PM +0200, Cédric Le Goater wrote:
> > > > > Let's provide an empty shell for the XIVE controller model with a
> > > > > couple of attributes for the IRQ number allocator. The latter is
> > > > > largely inspired by OPAL which allocates IPI IRQ numbers from the
> > > > > bottom of the IRQ number space and allocates the HW IRQ numbers from
> > > > > the top.
> > > > > 
> > > > > The number of IPIs is simply deduced from the max number of CPUs the
> > > > > guest supports and we provision a arbitrary number of HW irqs.
> > > > > 
> > > > > The XIVE object is kept private because it will hold internal tables
> > > > > which do not need to be exposed to sPAPR.
> > > 
> > > It does have an MMIO presence though... more than one even. There's the
> > > TIMA (per-HW thread control area) and there's the per-interrupt MMIO
> > > space which are exposed to the guest. There's also the per-queue
> > > MMIO control area too.
> > 
> > Ok.  Always?  Or just on powernv?
> > 
> > If it only has an MMIO presence on powernv, then the "core" xive
> > object should probably be TYPE_DEVICE, with the powernv specific
> > device being a SysBusDevice which incorporates the core xive device
> > inside it.
> 
> No the ones above are on PAPR. PowerNV has even more :-)

Ok.  SusBusDevice is reasonable then.

> The TIMA (thread management area) is the MMIO area through which
> you control the current CPU priority etc...
> 
> It's designed in HW to "know" which core/thread is accessing it (it's
> at a fixed address) and respond appropriately based on that and which
> virtual CPU has been activated on that core/thread.
> 
> It's part of what allows XIVE to deliver interrupts without any HV
> calls.
>
David Gibson July 21, 2017, 7:50 a.m. UTC | #8
On Wed, Jul 19, 2017 at 02:02:18PM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2017-07-19 at 13:08 +1000, David Gibson wrote:
> > 
> > I'm somewhat uncomfortable with an irq allocater here in the intc
> > code.  As a rule, irq allocation is the responsibility of the machine,
> > not any sub-component.  Furthermore, it should allocate in a way which
> > is repeatable, since they need to stay stable across reboots and
> > migrations.
> > 
> > And, yes, we have an allocator of sorts in XICS - it has caused a
> > number of problems in the past.
> 
> So....
> 
> For a bare metal model (which we don't have yet) of XIVE, the IRQ
> numbering is entirely an artifact of how the HW is configured. There
> should thus be no interrupt numbers visible to qemu.

Uh.. I don't entirely follow.  Do you mean that during boot the guest
programs the irq numbers into the various components?

In that case this allocator stuff definitely doesn't belong on the
xive code.

> For a PAPR model things are a bit different, but if we want to
> maximize code re-use between the two, we probably need to make sure
> the interrupts "allocated" by the machine for XIVE can be represented
> by the HW model.
> 
> That means:
> 
>  - Each chip has a range (high bits are the block ID, which maps to a
> chip, low bits, around 512K to 1M interrupts is the per-chip space).
> 
>  - Interrupts 0...N of that range (N depends on how much backing
> memory and MMIO space is provisioned for each chip) are "generic IPIs"
> which are somewhat generic interrupt source that can be triggered with
> an MMIO store and routed to any target. Those are used in PAPR for
> things like IPIs and some type of accelerator interrupts.
> 
>  - Portions of that range (which may or may not overlap the 0...N
> above, if they do they "shadow" the generic interrupts) can be
> configured to be the HW sources from the various PCIe bridges and
> the PSI controller.

Err.. I'm confused how this not sure this relates to spapr.  There are
no chips or PSI there, and the PCI bridges aren't really the same
thing.
Benjamin Herrenschmidt July 21, 2017, 8:21 a.m. UTC | #9
On Fri, 2017-07-21 at 17:50 +1000, David Gibson wrote:
> On Wed, Jul 19, 2017 at 02:02:18PM +1000, Benjamin Herrenschmidt wrote:
> > On Wed, 2017-07-19 at 13:08 +1000, David Gibson wrote:
> > > 
> > > I'm somewhat uncomfortable with an irq allocater here in the intc
> > > code.  As a rule, irq allocation is the responsibility of the machine,
> > > not any sub-component.  Furthermore, it should allocate in a way which
> > > is repeatable, since they need to stay stable across reboots and
> > > migrations.
> > > 
> > > And, yes, we have an allocator of sorts in XICS - it has caused a
> > > number of problems in the past.
> > 
> > So....
> > 
> > For a bare metal model (which we don't have yet) of XIVE, the IRQ
> > numbering is entirely an artifact of how the HW is configured. There
> > should thus be no interrupt numbers visible to qemu.
> 
> Uh.. I don't entirely follow.  Do you mean that during boot the guest
> programs the irq numbers into the various components?

I said a "bare metal model" but yes. Pretty much. 
> 
> In that case this allocator stuff definitely doesn't belong on the
> xive code.
> 
> > For a PAPR model things are a bit different, but if we want to
> > maximize code re-use between the two, we probably need to make sure
> > the interrupts "allocated" by the machine for XIVE can be represented
> > by the HW model.
> > 
> > That means:
> > 
> >  - Each chip has a range (high bits are the block ID, which maps to a
> > chip, low bits, around 512K to 1M interrupts is the per-chip space).
> > 
> >  - Interrupts 0...N of that range (N depends on how much backing
> > memory and MMIO space is provisioned for each chip) are "generic IPIs"
> > which are somewhat generic interrupt source that can be triggered with
> > an MMIO store and routed to any target. Those are used in PAPR for
> > things like IPIs and some type of accelerator interrupts.
> > 
> >  - Portions of that range (which may or may not overlap the 0...N
> > above, if they do they "shadow" the generic interrupts) can be
> > configured to be the HW sources from the various PCIe bridges and
> > the PSI controller.
> 
> Err.. I'm confused how this not sure this relates to spapr.  There are
> no chips or PSI there, and the PCI bridges aren't really the same
> thing.

The above is the HW model, sorry for the confusion. With a few comments
about how they are used in PAPR.

So yes, in PAPR there's an "allocator" because the hypervisor will
create a guest "virtual" (or logical to use PAPR terminology) interrupt
number space, in order to represents the various interrupts into the
guest.

Those numbers however are just tokens, they don't have to represent any
real HW concept. So they can be "allocated" in a rather fixed way, for
example, you could have something like a fixed map where you put all
the PCI interrupts at a certain number (a factor of the PHB# with room
or a fix number per PHB, maybe 16K or so, the HW does 4K max). Another
based would have a chunk of "general purpose" IPIs (for use for actual
IPIs and for other things to come). And a range for the virtual device
interrupts for example. Or you can just use an allocator.

But it's fundamentally an allocator that sits in the hypervisor, so in
our case, I would say in the spapr "component" of XIVE, rather than the
XIVE HW model itself.

Now what Cedric did, because XIVE is very complex and we need something
for PAPR quickly, is not a complete HW model, but a somewhat simplified
one that only handles what PAPR exposes. So in that case where the
allocator sits is a bit of a TBD...

Cheers,
Ben.
David Gibson July 24, 2017, 3:28 a.m. UTC | #10
On Fri, Jul 21, 2017 at 06:21:31PM +1000, Benjamin Herrenschmidt wrote:
> On Fri, 2017-07-21 at 17:50 +1000, David Gibson wrote:
> > On Wed, Jul 19, 2017 at 02:02:18PM +1000, Benjamin Herrenschmidt wrote:
> > > On Wed, 2017-07-19 at 13:08 +1000, David Gibson wrote:
> > > > 
> > > > I'm somewhat uncomfortable with an irq allocater here in the intc
> > > > code.  As a rule, irq allocation is the responsibility of the machine,
> > > > not any sub-component.  Furthermore, it should allocate in a way which
> > > > is repeatable, since they need to stay stable across reboots and
> > > > migrations.
> > > > 
> > > > And, yes, we have an allocator of sorts in XICS - it has caused a
> > > > number of problems in the past.
> > > 
> > > So....
> > > 
> > > For a bare metal model (which we don't have yet) of XIVE, the IRQ
> > > numbering is entirely an artifact of how the HW is configured. There
> > > should thus be no interrupt numbers visible to qemu.
> > 
> > Uh.. I don't entirely follow.  Do you mean that during boot the guest
> > programs the irq numbers into the various components?
> 
> I said a "bare metal model" but yes. Pretty much. 

Right, by "guest" I meant the kernel running under qemu, even if its
running on a bare-metal equivalent platform.

> > In that case this allocator stuff definitely doesn't belong on the
> > xive code.
> > 
> > > For a PAPR model things are a bit different, but if we want to
> > > maximize code re-use between the two, we probably need to make sure
> > > the interrupts "allocated" by the machine for XIVE can be represented
> > > by the HW model.
> > > 
> > > That means:
> > > 
> > >  - Each chip has a range (high bits are the block ID, which maps to a
> > > chip, low bits, around 512K to 1M interrupts is the per-chip space).
> > > 
> > >  - Interrupts 0...N of that range (N depends on how much backing
> > > memory and MMIO space is provisioned for each chip) are "generic IPIs"
> > > which are somewhat generic interrupt source that can be triggered with
> > > an MMIO store and routed to any target. Those are used in PAPR for
> > > things like IPIs and some type of accelerator interrupts.
> > > 
> > >  - Portions of that range (which may or may not overlap the 0...N
> > > above, if they do they "shadow" the generic interrupts) can be
> > > configured to be the HW sources from the various PCIe bridges and
> > > the PSI controller.
> > 
> > Err.. I'm confused how this not sure this relates to spapr.  There are
> > no chips or PSI there, and the PCI bridges aren't really the same
> > thing.
> 
> The above is the HW model, sorry for the confusion. With a few comments
> about how they are used in PAPR.
> 
> So yes, in PAPR there's an "allocator" because the hypervisor will
> create a guest "virtual" (or logical to use PAPR terminology) interrupt
> number space, in order to represents the various interrupts into the
> guest.

Ok, but are each of those logical irqs bound to a specific device/PHB
line/whatever, or can they be configured by the guest?

> Those numbers however are just tokens, they don't have to represent any
> real HW concept. So they can be "allocated" in a rather fixed way, for
> example, you could have something like a fixed map where you put all
> the PCI interrupts at a certain number (a factor of the PHB# with room
> or a fix number per PHB, maybe 16K or so, the HW does 4K max). Another
> based would have a chunk of "general purpose" IPIs (for use for actual
> IPIs and for other things to come). And a range for the virtual device
> interrupts for example. Or you can just use an allocator.

Hm.  So what I'm meaning by an "allocator" is something at least
partially dynamic.  Something you say "give me an irq" and it gives
you the next available or similar.  As opposed to any mapping from
devices to (logical) irqs, which the machine will need to supply one
way or another.

> But it's fundamentally an allocator that sits in the hypervisor, so in
> our case, I would say in the spapr "component" of XIVE, rather than the
> XIVE HW model itself.

Maybe..

> Now what Cedric did, because XIVE is very complex and we need something
> for PAPR quickly, is not a complete HW model, but a somewhat simplified
> one that only handles what PAPR exposes. So in that case where the
> allocator sits is a bit of a TBD...

Hm, ok.  My concern here is that "dynamic" allocation of irqs at the
machine type level needs extreme caution, or the irqs may not be
stable which will generally break migration.
Alexey Kardashevskiy July 24, 2017, 3:53 a.m. UTC | #11
On 24/07/17 13:28, David Gibson wrote:
> On Fri, Jul 21, 2017 at 06:21:31PM +1000, Benjamin Herrenschmidt wrote:
>> On Fri, 2017-07-21 at 17:50 +1000, David Gibson wrote:
>>> On Wed, Jul 19, 2017 at 02:02:18PM +1000, Benjamin Herrenschmidt wrote:
>>>> On Wed, 2017-07-19 at 13:08 +1000, David Gibson wrote:
>>>>>
>>>>> I'm somewhat uncomfortable with an irq allocater here in the intc
>>>>> code.  As a rule, irq allocation is the responsibility of the machine,
>>>>> not any sub-component.  Furthermore, it should allocate in a way which
>>>>> is repeatable, since they need to stay stable across reboots and
>>>>> migrations.
>>>>>
>>>>> And, yes, we have an allocator of sorts in XICS - it has caused a
>>>>> number of problems in the past.
>>>>
>>>> So....
>>>>
>>>> For a bare metal model (which we don't have yet) of XIVE, the IRQ
>>>> numbering is entirely an artifact of how the HW is configured. There
>>>> should thus be no interrupt numbers visible to qemu.
>>>
>>> Uh.. I don't entirely follow.  Do you mean that during boot the guest
>>> programs the irq numbers into the various components?
>>
>> I said a "bare metal model" but yes. Pretty much. 
> 
> Right, by "guest" I meant the kernel running under qemu, even if its
> running on a bare-metal equivalent platform.
> 
>>> In that case this allocator stuff definitely doesn't belong on the
>>> xive code.
>>>
>>>> For a PAPR model things are a bit different, but if we want to
>>>> maximize code re-use between the two, we probably need to make sure
>>>> the interrupts "allocated" by the machine for XIVE can be represented
>>>> by the HW model.
>>>>
>>>> That means:
>>>>
>>>>  - Each chip has a range (high bits are the block ID, which maps to a
>>>> chip, low bits, around 512K to 1M interrupts is the per-chip space).
>>>>
>>>>  - Interrupts 0...N of that range (N depends on how much backing
>>>> memory and MMIO space is provisioned for each chip) are "generic IPIs"
>>>> which are somewhat generic interrupt source that can be triggered with
>>>> an MMIO store and routed to any target. Those are used in PAPR for
>>>> things like IPIs and some type of accelerator interrupts.
>>>>
>>>>  - Portions of that range (which may or may not overlap the 0...N
>>>> above, if they do they "shadow" the generic interrupts) can be
>>>> configured to be the HW sources from the various PCIe bridges and
>>>> the PSI controller.
>>>
>>> Err.. I'm confused how this not sure this relates to spapr.  There are
>>> no chips or PSI there, and the PCI bridges aren't really the same
>>> thing.
>>
>> The above is the HW model, sorry for the confusion. With a few comments
>> about how they are used in PAPR.
>>
>> So yes, in PAPR there's an "allocator" because the hypervisor will
>> create a guest "virtual" (or logical to use PAPR terminology) interrupt
>> number space, in order to represents the various interrupts into the
>> guest.
> 
> Ok, but are each of those logical irqs bound to a specific device/PHB
> line/whatever, or can they be configured by the guest?
> 
>> Those numbers however are just tokens, they don't have to represent any
>> real HW concept. So they can be "allocated" in a rather fixed way, for
>> example, you could have something like a fixed map where you put all
>> the PCI interrupts at a certain number (a factor of the PHB# with room
>> or a fix number per PHB, maybe 16K or so, the HW does 4K max). Another
>> based would have a chunk of "general purpose" IPIs (for use for actual
>> IPIs and for other things to come). And a range for the virtual device
>> interrupts for example. Or you can just use an allocator.
> 
> Hm.  So what I'm meaning by an "allocator" is something at least
> partially dynamic.  Something you say "give me an irq" and it gives
> you the next available or similar.  As opposed to any mapping from
> devices to (logical) irqs, which the machine will need to supply one
> way or another.

I am probably reading it wrong but the XIVE's allocator allocates IRQ
ranges for interrupt source controls (which are CPU cores, PHBs, PSI - in
bare metal - so they allocate just once per machine creation). Individual
interrupts are still allocated via spapr_ics_alloc_block().



> 
>> But it's fundamentally an allocator that sits in the hypervisor, so in
>> our case, I would say in the spapr "component" of XIVE, rather than the
>> XIVE HW model itself.
> 
> Maybe..
> 
>> Now what Cedric did, because XIVE is very complex and we need something
>> for PAPR quickly, is not a complete HW model, but a somewhat simplified
>> one that only handles what PAPR exposes. So in that case where the
>> allocator sits is a bit of a TBD...
> 
> Hm, ok.  My concern here is that "dynamic" allocation of irqs at the
> machine type level needs extreme caution, or the irqs may not be
> stable which will generally break migration.
>
Benjamin Herrenschmidt July 24, 2017, 5:04 a.m. UTC | #12
On Mon, 2017-07-24 at 13:28 +1000, David Gibson wrote:
> > So yes, in PAPR there's an "allocator" because the hypervisor will
> > create a guest "virtual" (or logical to use PAPR terminology) interrupt
> > number space, in order to represents the various interrupts into the
> > guest.
> 
> Ok, but are each of those logical irqs bound to a specific device/PHB
> line/whatever, or can they be configured by the guest?

So for clarity, let's first establish the terminology :-)

 - HW number is a HW interrupt number on a "bare metal" system or
powernv guest. For now we will ignore those, they are effectively a
side effect of how skiboot configure the XIVE and qemu per-se doesn't
allocate them.

 - A logical number is a "guest physical" interrupt number for a PAPR
guest. These fall into roughly 2 categories at the moment:

    * "interrupts" (or related) properties in the DT, typically
interrupts for a PCI device, ranges of MSIs etc... that correspond to
HW sources from a PHB.

    * "generic IPIs". Those are ranges of "generic" interrupts that the
hypervisor gives the guest. On a real system, they correspond to chunks
allocated off a HW facility for generic interrupts. Generic interrupts
are the same as normal interrupts from the prespective of
managing/receiving them, but are "triggered" by an MMIO to a certain HW
page. There's a DT property telling the guest the interrupt number
ranges for these guys.

So that logical number above is what a PAPR guest obtains from the DT
and uses for the various H-call used to manage and configure interrupt
sources.

In addition, the XIVE supports renumbering the interrupt number that
you obtain in the queues. Both bare metal linux, KVM and guests make
use of this. This only changes the number you observe in a queue when
you receive an interrupt, it has no effect on the HW number or logical
number used for the various management calls.

This is used by Linux so that:

  - On bare metal systems or PAPR guest with "exploitation mode" (ie,
PAPR guest directly using the XIVE), we put the linux interrupt number
in there as to avoid the reverse-mapping done by linux otherwise when
receiving an interrupt.

  - On PARP guests using the legacy hcalls, KVM configures the logical
number there.

> > Those numbers however are just tokens, they don't have to represent any
> > real HW concept. So they can be "allocated" in a rather fixed way, for
> > example, you could have something like a fixed map where you put all
> > the PCI interrupts at a certain number (a factor of the PHB# with room
> > or a fix number per PHB, maybe 16K or so, the HW does 4K max). Another
> > based would have a chunk of "general purpose" IPIs (for use for actual
> > IPIs and for other things to come). And a range for the virtual device
> > interrupts for example. Or you can just use an allocator.
> 
> Hm.  So what I'm meaning by an "allocator" is something at least
> partially dynamic.  Something you say "give me an irq" and it gives
> you the next available or similar.  As opposed to any mapping from
> devices to (logical) irqs, which the machine will need to supply one
> way or another.

For the sake of repeatability/migration etc... I think a mapping is
better than an allocator.  IE, a fixed number scheme so that the range
of interrupts for PHB#x is always a fixed function of x.

We can fix the number of "generic" interrupts given to a guest. The
only requirements from a PAPR perspective is that there should be at
least as many as there are possible threads in the guest so they can be
used as IPIs.

But we may need more for other things. We can make this a machine
parameter with a default value of something like 4096. If we call N
that number of extra generic interrupts, then the number of generic
interrutps would be #possible-vcpu's + N, or something like that.

> > But it's fundamentally an allocator that sits in the hypervisor, so in
> > our case, I would say in the spapr "component" of XIVE, rather than the
> > XIVE HW model itself.
> 
> Maybe..

You are right in that a mapping is a better term than an allocator
here.

> > Now what Cedric did, because XIVE is very complex and we need something
> > for PAPR quickly, is not a complete HW model, but a somewhat simplified
> > one that only handles what PAPR exposes. So in that case where the
> > allocator sits is a bit of a TBD...
> 
> Hm, ok.  My concern here is that "dynamic" allocation of irqs at the
> machine type level needs extreme caution, or the irqs may not be
> stable which will generally break migration.

Yes you are right. We should probably create a more "static" scheme.

Cheers,
Ben.
David Gibson July 24, 2017, 5:38 a.m. UTC | #13
On Mon, Jul 24, 2017 at 03:04:05PM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2017-07-24 at 13:28 +1000, David Gibson wrote:
> > > So yes, in PAPR there's an "allocator" because the hypervisor will
> > > create a guest "virtual" (or logical to use PAPR terminology) interrupt
> > > number space, in order to represents the various interrupts into the
> > > guest.
> > 
> > Ok, but are each of those logical irqs bound to a specific device/PHB
> > line/whatever, or can they be configured by the guest?
> 
> So for clarity, let's first establish the terminology :-)
> 
>  - HW number is a HW interrupt number on a "bare metal" system or
> powernv guest. For now we will ignore those, they are effectively a
> side effect of how skiboot configure the XIVE and qemu per-se doesn't
> allocate them.
> 
>  - A logical number is a "guest physical" interrupt number for a PAPR
> guest. These fall into roughly 2 categories at the moment:
> 
>     * "interrupts" (or related) properties in the DT, typically
> interrupts for a PCI device, ranges of MSIs etc... that correspond to
> HW sources from a PHB.

Ok, I think this is the one I've mostly been thinking of.

>     * "generic IPIs". Those are ranges of "generic" interrupts that the
> hypervisor gives the guest. On a real system, they correspond to chunks
> allocated off a HW facility for generic interrupts. Generic interrupts
> are the same as normal interrupts from the prespective of
> managing/receiving them, but are "triggered" by an MMIO to a certain HW
> page. There's a DT property telling the guest the interrupt number
> ranges for these guys.
> 
> So that logical number above is what a PAPR guest obtains from the DT
> and uses for the various H-call used to manage and configure interrupt
> sources.

Ok.

> In addition, the XIVE supports renumbering the interrupt number that
> you obtain in the queues. Both bare metal linux, KVM and guests make
> use of this. This only changes the number you observe in a queue when
> you receive an interrupt, it has no effect on the HW number or logical
> number used for the various management calls.

Ok.

> This is used by Linux so that:
> 
>   - On bare metal systems or PAPR guest with "exploitation mode" (ie,
> PAPR guest directly using the XIVE), we put the linux interrupt number
> in there as to avoid the reverse-mapping done by linux otherwise when
> receiving an interrupt.
> 
>   - On PARP guests using the legacy hcalls, KVM configures the logical
> number there.

Ok.

> > > Those numbers however are just tokens, they don't have to represent any
> > > real HW concept. So they can be "allocated" in a rather fixed way, for
> > > example, you could have something like a fixed map where you put all
> > > the PCI interrupts at a certain number (a factor of the PHB# with room
> > > or a fix number per PHB, maybe 16K or so, the HW does 4K max). Another
> > > based would have a chunk of "general purpose" IPIs (for use for actual
> > > IPIs and for other things to come). And a range for the virtual device
> > > interrupts for example. Or you can just use an allocator.
> > 
> > Hm.  So what I'm meaning by an "allocator" is something at least
> > partially dynamic.  Something you say "give me an irq" and it gives
> > you the next available or similar.  As opposed to any mapping from
> > devices to (logical) irqs, which the machine will need to supply one
> > way or another.
> 
> For the sake of repeatability/migration etc... I think a mapping is
> better than an allocator.  IE, a fixed number scheme so that the range
> of interrupts for PHB#x is always a fixed function of x.

Yes, I agree.  In fact that's pretty much exactly the point I'm trying
to make.

Can we assign our logical numbers sparsely, or will that cause other
problems?

Note that for PAPR we also have the question of finding logical
interrupts for legacy PAPR VIO devices.

> We can fix the number of "generic" interrupts given to a guest. The
> only requirements from a PAPR perspective is that there should be at
> least as many as there are possible threads in the guest so they can be
> used as IPIs.

Ok.  If we can do things sparsely, allocating these well away from the
hw interrupts would make things easier.

> But we may need more for other things. We can make this a machine
> parameter with a default value of something like 4096. If we call N
> that number of extra generic interrupts, then the number of generic
> interrutps would be #possible-vcpu's + N, or something like that.

That seems reasonable.

> > > But it's fundamentally an allocator that sits in the hypervisor, so in
> > > our case, I would say in the spapr "component" of XIVE, rather than the
> > > XIVE HW model itself.
> > 
> > Maybe..
> 
> You are right in that a mapping is a better term than an allocator
> here.
> 
> > > Now what Cedric did, because XIVE is very complex and we need something
> > > for PAPR quickly, is not a complete HW model, but a somewhat simplified
> > > one that only handles what PAPR exposes. So in that case where the
> > > allocator sits is a bit of a TBD...
> > 
> > Hm, ok.  My concern here is that "dynamic" allocation of irqs at the
> > machine type level needs extreme caution, or the irqs may not be
> > stable which will generally break migration.
> 
> Yes you are right. We should probably create a more "static" scheme.

Sounds like we're in violent agreement.
Benjamin Herrenschmidt July 24, 2017, 7:20 a.m. UTC | #14
On Mon, 2017-07-24 at 15:38 +1000, David Gibson wrote:
> 
> Can we assign our logical numbers sparsely, or will that cause other
> problems?

The main issue is that they probably needs to be the same between XICS
and XIVE because by the time we get the CAS call to chose between XICS
and XIVE, we have already handed out interrupts and constructed the DT,
no ? Unless we do a real CAS reboot...

Otherwise, there's no reason they can't be sparse no.

> Note that for PAPR we also have the question of finding logical
> interrupts for legacy PAPR VIO devices.

We just make them another range ? With KVM legacy today, I just use the
generic interrupt facility for those. So when you do the ioctl to
"trigger" one, I just do an MMIO to the corresponding page and the
interrupt magically shows up wherever the guest is running the target
vcpu. In fact, I'd like to add a way to mmap that page into qemu so
that qemu can triggers them without an ioctl.

The guest doesn't care, from the guest perspective they are interrupts
coming from the DT, so they are like PCI etc...

> > We can fix the number of "generic" interrupts given to a guest. The
> > only requirements from a PAPR perspective is that there should be at
> > least as many as there are possible threads in the guest so they can be
> > used as IPIs.
> 
> Ok.  If we can do things sparsely, allocating these well away from the
> hw interrupts would make things easier.
> 
> > But we may need more for other things. We can make this a machine
> > parameter with a default value of something like 4096. If we call N
> > that number of extra generic interrupts, then the number of generic
> > interrutps would be #possible-vcpu's + N, or something like that.
> 
> That seems reasonable.
> 
> > > > But it's fundamentally an allocator that sits in the hypervisor, so in
> > > > our case, I would say in the spapr "component" of XIVE, rather than the
> > > > XIVE HW model itself.
> > > 
> > > Maybe..
> > 
> > You are right in that a mapping is a better term than an allocator
> > here.
> > 
> > > > Now what Cedric did, because XIVE is very complex and we need something
> > > > for PAPR quickly, is not a complete HW model, but a somewhat simplified
> > > > one that only handles what PAPR exposes. So in that case where the
> > > > allocator sits is a bit of a TBD...
> > > 
> > > Hm, ok.  My concern here is that "dynamic" allocation of irqs at the
> > > machine type level needs extreme caution, or the irqs may not be
> > > stable which will generally break migration.
> > 
> > Yes you are right. We should probably create a more "static" scheme.
> 
> Sounds like we're in violent agreement.

Yup :)

Cheers,
Ben.
David Gibson July 24, 2017, 10:03 a.m. UTC | #15
On Mon, Jul 24, 2017 at 05:20:26PM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2017-07-24 at 15:38 +1000, David Gibson wrote:
> > 
> > Can we assign our logical numbers sparsely, or will that cause other
> > problems?
> 
> The main issue is that they probably needs to be the same between XICS
> and XIVE because by the time we get the CAS call to chose between XICS
> and XIVE, we have already handed out interrupts and constructed the DT,
> no ? Unless we do a real CAS reboot...

A real CAS reboot probably isn't unreasonable for this case.

I definitely think we need to go one way or the other - either fully
unify the irq mapping between xics and xive, or fully separate them.

> Otherwise, there's no reason they can't be sparse no.
> 
> > Note that for PAPR we also have the question of finding logical
> > interrupts for legacy PAPR VIO devices.
> 
> We just make them another range ? With KVM legacy today, I just use the
> generic interrupt facility for those. So when you do the ioctl to
> "trigger" one, I just do an MMIO to the corresponding page and the
> interrupt magically shows up wherever the guest is running the target
> vcpu. In fact, I'd like to add a way to mmap that page into qemu so
> that qemu can triggers them without an ioctl.

Ok.

> The guest doesn't care, from the guest perspective they are interrupts
> coming from the DT, so they are like PCI etc...

Ok.

> > > We can fix the number of "generic" interrupts given to a guest. The
> > > only requirements from a PAPR perspective is that there should be at
> > > least as many as there are possible threads in the guest so they can be
> > > used as IPIs.
> > 
> > Ok.  If we can do things sparsely, allocating these well away from the
> > hw interrupts would make things easier.
> > 
> > > But we may need more for other things. We can make this a machine
> > > parameter with a default value of something like 4096. If we call N
> > > that number of extra generic interrupts, then the number of generic
> > > interrutps would be #possible-vcpu's + N, or something like that.
> > 
> > That seems reasonable.
> > 
> > > > > But it's fundamentally an allocator that sits in the hypervisor, so in
> > > > > our case, I would say in the spapr "component" of XIVE, rather than the
> > > > > XIVE HW model itself.
> > > > 
> > > > Maybe..
> > > 
> > > You are right in that a mapping is a better term than an allocator
> > > here.
> > > 
> > > > > Now what Cedric did, because XIVE is very complex and we need something
> > > > > for PAPR quickly, is not a complete HW model, but a somewhat simplified
> > > > > one that only handles what PAPR exposes. So in that case where the
> > > > > allocator sits is a bit of a TBD...
> > > > 
> > > > Hm, ok.  My concern here is that "dynamic" allocation of irqs at the
> > > > machine type level needs extreme caution, or the irqs may not be
> > > > stable which will generally break migration.
> > > 
> > > Yes you are right. We should probably create a more "static" scheme.
> > 
> > Sounds like we're in violent agreement.
> 
> Yup :)
>
Cédric Le Goater July 24, 2017, 1 p.m. UTC | #16
>> +#include "qemu/osdep.h"
>> +#include "qemu/log.h"
>> +#include "qapi/error.h"
>> +#include "target/ppc/cpu.h"
>> +#include "sysemu/cpus.h"
>> +#include "sysemu/dma.h"
>> +#include "monitor/monitor.h"
>> +#include "hw/ppc/xive.h"
>> +
>> +#include "xive-internal.h"
>> +
>> +/*
>> + * Main XIVE object
> 
> As with XICs, does it really make sense for there to be a "main" XIVE
> object, or should be an interface attached to the machine?

yes. There are internal tables which are very specific to the controller 
and I don't think they belong to the machine.

C.
Alexey Kardashevskiy July 25, 2017, 1:26 a.m. UTC | #17
On 24/07/17 23:00, Cédric Le Goater wrote:
>>> +#include "qemu/osdep.h"
>>> +#include "qemu/log.h"
>>> +#include "qapi/error.h"
>>> +#include "target/ppc/cpu.h"
>>> +#include "sysemu/cpus.h"
>>> +#include "sysemu/dma.h"
>>> +#include "monitor/monitor.h"
>>> +#include "hw/ppc/xive.h"
>>> +
>>> +#include "xive-internal.h"
>>> +
>>> +/*
>>> + * Main XIVE object
>>
>> As with XICs, does it really make sense for there to be a "main" XIVE
>> object, or should be an interface attached to the machine?
> 
> yes. There are internal tables which are very specific to the controller 
> and I don't think they belong to the machine.

These tables belong to a CPU chip (die?) and we do not emulate these now
(machines and cores are the closest) and since we do not want (do we?) to
treat a core as a chip, the machine is the most obvious owner for these tables.
David Gibson July 25, 2017, 2:17 a.m. UTC | #18
On Tue, Jul 25, 2017 at 11:26:13AM +1000, Alexey Kardashevskiy wrote:
> On 24/07/17 23:00, Cédric Le Goater wrote:
> >>> +#include "qemu/osdep.h"
> >>> +#include "qemu/log.h"
> >>> +#include "qapi/error.h"
> >>> +#include "target/ppc/cpu.h"
> >>> +#include "sysemu/cpus.h"
> >>> +#include "sysemu/dma.h"
> >>> +#include "monitor/monitor.h"
> >>> +#include "hw/ppc/xive.h"
> >>> +
> >>> +#include "xive-internal.h"
> >>> +
> >>> +/*
> >>> + * Main XIVE object
> >>
> >> As with XICs, does it really make sense for there to be a "main" XIVE
> >> object, or should be an interface attached to the machine?
> > 
> > yes. There are internal tables which are very specific to the controller 
> > and I don't think they belong to the machine.
> 
> These tables belong to a CPU chip (die?) and we do not emulate these now
> (machines and cores are the closest) and since we do not want (do we?) to
> treat a core as a chip, the machine is the most obvious owner for these tables.

No, I think it's reasonable for them to be owned by a XIVE object
under the machine.
Cédric Le Goater July 25, 2017, 8:52 a.m. UTC | #19
On 07/24/2017 12:03 PM, David Gibson wrote:
> On Mon, Jul 24, 2017 at 05:20:26PM +1000, Benjamin Herrenschmidt wrote:
>> On Mon, 2017-07-24 at 15:38 +1000, David Gibson wrote:
>>>
>>> Can we assign our logical numbers sparsely, or will that cause other
>>> problems?
>>
>> The main issue is that they probably needs to be the same between XICS
>> and XIVE because by the time we get the CAS call to chose between XICS
>> and XIVE, we have already handed out interrupts and constructed the DT,
>> no ? Unless we do a real CAS reboot...
> 
> A real CAS reboot probably isn't unreasonable for this case.
> 
> I definitely think we need to go one way or the other - either fully
> unify the irq mapping between xics and xive, or fully separate them.

To be able to change interrupt model at CAS time, we need to unify 
the IRQ numbering. We don't have much choice because the DT is 
already populated. We also need to share the ICSIRQState flags unless
we share the interrupt source object between the XIVE and XICS mode. 

In my current tree, I made sure that the same IRQ number ranges 
were being used in the XIVE and in the XICS allocator and that the 
ICSIRQState flags of the different sPAPR Interrupt sources (XIVE 
and XICS) were in sync. That works pretty well for reset, migration 
and hotplug, but it is bit hacky.

C.


>> Otherwise, there's no reason they can't be sparse no.
>>
>>> Note that for PAPR we also have the question of finding logical
>>> interrupts for legacy PAPR VIO devices.
>>
>> We just make them another range ? With KVM legacy today, I just use the
>> generic interrupt facility for those. So when you do the ioctl to
>> "trigger" one, I just do an MMIO to the corresponding page and the
>> interrupt magically shows up wherever the guest is running the target
>> vcpu. In fact, I'd like to add a way to mmap that page into qemu so
>> that qemu can triggers them without an ioctl.
> 
> Ok.
> 
>> The guest doesn't care, from the guest perspective they are interrupts
>> coming from the DT, so they are like PCI etc...
> 
> Ok.
> 
>>>> We can fix the number of "generic" interrupts given to a guest. The
>>>> only requirements from a PAPR perspective is that there should be at
>>>> least as many as there are possible threads in the guest so they can be
>>>> used as IPIs.
>>>
>>> Ok.  If we can do things sparsely, allocating these well away from the
>>> hw interrupts would make things easier.
>>>
>>>> But we may need more for other things. We can make this a machine
>>>> parameter with a default value of something like 4096. If we call N
>>>> that number of extra generic interrupts, then the number of generic
>>>> interrutps would be #possible-vcpu's + N, or something like that.
>>>
>>> That seems reasonable.
>>>
>>>>>> But it's fundamentally an allocator that sits in the hypervisor, so in
>>>>>> our case, I would say in the spapr "component" of XIVE, rather than the
>>>>>> XIVE HW model itself.
>>>>>
>>>>> Maybe..
>>>>
>>>> You are right in that a mapping is a better term than an allocator
>>>> here.
>>>>
>>>>>> Now what Cedric did, because XIVE is very complex and we need something
>>>>>> for PAPR quickly, is not a complete HW model, but a somewhat simplified
>>>>>> one that only handles what PAPR exposes. So in that case where the
>>>>>> allocator sits is a bit of a TBD...
>>>>>
>>>>> Hm, ok.  My concern here is that "dynamic" allocation of irqs at the
>>>>> machine type level needs extreme caution, or the irqs may not be
>>>>> stable which will generally break migration.
>>>>
>>>> Yes you are right. We should probably create a more "static" scheme.
>>>
>>> Sounds like we're in violent agreement.
>>
>> Yup :)
>>
>
David Gibson July 25, 2017, 12:39 p.m. UTC | #20
On Tue, Jul 25, 2017 at 10:52:27AM +0200, Cédric Le Goater wrote:
> On 07/24/2017 12:03 PM, David Gibson wrote:
> > On Mon, Jul 24, 2017 at 05:20:26PM +1000, Benjamin Herrenschmidt wrote:
> >> On Mon, 2017-07-24 at 15:38 +1000, David Gibson wrote:
> >>>
> >>> Can we assign our logical numbers sparsely, or will that cause other
> >>> problems?
> >>
> >> The main issue is that they probably needs to be the same between XICS
> >> and XIVE because by the time we get the CAS call to chose between XICS
> >> and XIVE, we have already handed out interrupts and constructed the DT,
> >> no ? Unless we do a real CAS reboot...
> > 
> > A real CAS reboot probably isn't unreasonable for this case.
> > 
> > I definitely think we need to go one way or the other - either fully
> > unify the irq mapping between xics and xive, or fully separate them.
> 
> To be able to change interrupt model at CAS time, we need to unify 
> the IRQ numbering.

Not necessarily, though it certainly might make things easier.

> We don't have much choice because the DT is 
> already populated.

We could change that, though.

> We also need to share the ICSIRQState flags unless
> we share the interrupt source object between the XIVE and XICS mode. 
> 
> In my current tree, I made sure that the same IRQ number ranges 
> were being used in the XIVE and in the XICS allocator and that the 
> ICSIRQState flags of the different sPAPR Interrupt sources (XIVE 
> and XICS) were in sync. That works pretty well for reset, migration 
> and hotplug, but it is bit hacky.
> 
> C.
> 
> 
> >> Otherwise, there's no reason they can't be sparse no.
> >>
> >>> Note that for PAPR we also have the question of finding logical
> >>> interrupts for legacy PAPR VIO devices.
> >>
> >> We just make them another range ? With KVM legacy today, I just use the
> >> generic interrupt facility for those. So when you do the ioctl to
> >> "trigger" one, I just do an MMIO to the corresponding page and the
> >> interrupt magically shows up wherever the guest is running the target
> >> vcpu. In fact, I'd like to add a way to mmap that page into qemu so
> >> that qemu can triggers them without an ioctl.
> > 
> > Ok.
> > 
> >> The guest doesn't care, from the guest perspective they are interrupts
> >> coming from the DT, so they are like PCI etc...
> > 
> > Ok.
> > 
> >>>> We can fix the number of "generic" interrupts given to a guest. The
> >>>> only requirements from a PAPR perspective is that there should be at
> >>>> least as many as there are possible threads in the guest so they can be
> >>>> used as IPIs.
> >>>
> >>> Ok.  If we can do things sparsely, allocating these well away from the
> >>> hw interrupts would make things easier.
> >>>
> >>>> But we may need more for other things. We can make this a machine
> >>>> parameter with a default value of something like 4096. If we call N
> >>>> that number of extra generic interrupts, then the number of generic
> >>>> interrutps would be #possible-vcpu's + N, or something like that.
> >>>
> >>> That seems reasonable.
> >>>
> >>>>>> But it's fundamentally an allocator that sits in the hypervisor, so in
> >>>>>> our case, I would say in the spapr "component" of XIVE, rather than the
> >>>>>> XIVE HW model itself.
> >>>>>
> >>>>> Maybe..
> >>>>
> >>>> You are right in that a mapping is a better term than an allocator
> >>>> here.
> >>>>
> >>>>>> Now what Cedric did, because XIVE is very complex and we need something
> >>>>>> for PAPR quickly, is not a complete HW model, but a somewhat simplified
> >>>>>> one that only handles what PAPR exposes. So in that case where the
> >>>>>> allocator sits is a bit of a TBD...
> >>>>>
> >>>>> Hm, ok.  My concern here is that "dynamic" allocation of irqs at the
> >>>>> machine type level needs extreme caution, or the irqs may not be
> >>>>> stable which will generally break migration.
> >>>>
> >>>> Yes you are right. We should probably create a more "static" scheme.
> >>>
> >>> Sounds like we're in violent agreement.
> >>
> >> Yup :)
> >>
> > 
>
Cédric Le Goater July 25, 2017, 1:48 p.m. UTC | #21
On 07/25/2017 02:39 PM, David Gibson wrote:
> On Tue, Jul 25, 2017 at 10:52:27AM +0200, Cédric Le Goater wrote:
>> On 07/24/2017 12:03 PM, David Gibson wrote:
>>> On Mon, Jul 24, 2017 at 05:20:26PM +1000, Benjamin Herrenschmidt wrote:
>>>> On Mon, 2017-07-24 at 15:38 +1000, David Gibson wrote:
>>>>>
>>>>> Can we assign our logical numbers sparsely, or will that cause other
>>>>> problems?
>>>>
>>>> The main issue is that they probably needs to be the same between XICS
>>>> and XIVE because by the time we get the CAS call to chose between XICS
>>>> and XIVE, we have already handed out interrupts and constructed the DT,
>>>> no ? Unless we do a real CAS reboot...
>>>
>>> A real CAS reboot probably isn't unreasonable for this case.
>>>
>>> I definitely think we need to go one way or the other - either fully
>>> unify the irq mapping between xics and xive, or fully separate them.
>>
>> To be able to change interrupt model at CAS time, we need to unify 
>> the IRQ numbering.
> 
> Not necessarily, though it certainly might make things easier.
> 
>> We don't have much choice because the DT is already populated.
> 
> We could change that, though.

It would certainly help to manage the change of interrupt mode.
May be we could even instantiate only one sPAPR interrupt source
object at a time. Today I am instantiating two and switch from 
one to another. I am not sure how compatible this is with migration 
tough.

We would need to postpone all IRQ allocation until the CAS 
negotiation is complete and populate the DT accordingly in the 
CAS response. Hopefully I think this is the only reason why we do
an early allocation of the XICS or XIVE objects today. 

I need to take a closer look at the spapr_ics_alloc() calls in 
spapr_{events,pci,vio}.c files.

C.
 
 
>> We also need to share the ICSIRQState flags unless
>> we share the interrupt source object between the XIVE and XICS mode. 
>>
>> In my current tree, I made sure that the same IRQ number ranges 
>> were being used in the XIVE and in the XICS allocator and that the 
>> ICSIRQState flags of the different sPAPR Interrupt sources (XIVE 
>> and XICS) were in sync. That works pretty well for reset, migration 
>> and hotplug, but it is bit hacky.
>>
>> C.
>>
>>
>>>> Otherwise, there's no reason they can't be sparse no.
>>>>
>>>>> Note that for PAPR we also have the question of finding logical
>>>>> interrupts for legacy PAPR VIO devices.
>>>>
>>>> We just make them another range ? With KVM legacy today, I just use the
>>>> generic interrupt facility for those. So when you do the ioctl to
>>>> "trigger" one, I just do an MMIO to the corresponding page and the
>>>> interrupt magically shows up wherever the guest is running the target
>>>> vcpu. In fact, I'd like to add a way to mmap that page into qemu so
>>>> that qemu can triggers them without an ioctl.
>>>
>>> Ok.
>>>
>>>> The guest doesn't care, from the guest perspective they are interrupts
>>>> coming from the DT, so they are like PCI etc...
>>>
>>> Ok.
>>>
>>>>>> We can fix the number of "generic" interrupts given to a guest. The
>>>>>> only requirements from a PAPR perspective is that there should be at
>>>>>> least as many as there are possible threads in the guest so they can be
>>>>>> used as IPIs.
>>>>>
>>>>> Ok.  If we can do things sparsely, allocating these well away from the
>>>>> hw interrupts would make things easier.
>>>>>
>>>>>> But we may need more for other things. We can make this a machine
>>>>>> parameter with a default value of something like 4096. If we call N
>>>>>> that number of extra generic interrupts, then the number of generic
>>>>>> interrutps would be #possible-vcpu's + N, or something like that.
>>>>>
>>>>> That seems reasonable.
>>>>>
>>>>>>>> But it's fundamentally an allocator that sits in the hypervisor, so in
>>>>>>>> our case, I would say in the spapr "component" of XIVE, rather than the
>>>>>>>> XIVE HW model itself.
>>>>>>>
>>>>>>> Maybe..
>>>>>>
>>>>>> You are right in that a mapping is a better term than an allocator
>>>>>> here.
>>>>>>
>>>>>>>> Now what Cedric did, because XIVE is very complex and we need something
>>>>>>>> for PAPR quickly, is not a complete HW model, but a somewhat simplified
>>>>>>>> one that only handles what PAPR exposes. So in that case where the
>>>>>>>> allocator sits is a bit of a TBD...
>>>>>>>
>>>>>>> Hm, ok.  My concern here is that "dynamic" allocation of irqs at the
>>>>>>> machine type level needs extreme caution, or the irqs may not be
>>>>>>> stable which will generally break migration.
>>>>>>
>>>>>> Yes you are right. We should probably create a more "static" scheme.
>>>>>
>>>>> Sounds like we're in violent agreement.
>>>>
>>>> Yup :)
>>>>
>>>
>>
>
diff mbox

Patch

diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
index 46c95993217d..1179c07e6e9f 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -56,6 +56,7 @@  CONFIG_SM501=y
 CONFIG_XICS=$(CONFIG_PSERIES)
 CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
 CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
+CONFIG_XIVE=$(CONFIG_PSERIES)
 # For PReP
 CONFIG_SERIAL_ISA=y
 CONFIG_MC146818RTC=y
diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
index 78426a7dafcd..28b83456bfcc 100644
--- a/hw/intc/Makefile.objs
+++ b/hw/intc/Makefile.objs
@@ -35,6 +35,7 @@  obj-$(CONFIG_SH4) += sh_intc.o
 obj-$(CONFIG_XICS) += xics.o
 obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o
 obj-$(CONFIG_XICS_KVM) += xics_kvm.o
+obj-$(CONFIG_XIVE) += xive.o
 obj-$(CONFIG_POWERNV) += xics_pnv.o
 obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o
 obj-$(CONFIG_S390_FLIC) += s390_flic.o
diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
new file mode 100644
index 000000000000..155c2dcd6066
--- /dev/null
+++ b/hw/intc/xive-internal.h
@@ -0,0 +1,28 @@ 
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _INTC_XIVE_INTERNAL_H
+#define _INTC_XIVE_INTERNAL_H
+
+#include <hw/sysbus.h>
+
+struct XIVE {
+    SysBusDevice parent;
+
+    /* Properties */
+    uint32_t     nr_targets;
+
+    /* IRQ number allocator */
+    uint32_t     int_count;     /* Number of interrupts: nr_targets + HW IRQs */
+    uint32_t     int_base;      /* Min index */
+    uint32_t     int_max;       /* Max index */
+    uint32_t     int_hw_bot;    /* Bottom index of HW IRQ allocator */
+    uint32_t     int_ipi_top;   /* Highest IPI index handed out so far + 1 */
+};
+
+#endif /* _INTC_XIVE_INTERNAL_H */
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
new file mode 100644
index 000000000000..5b4ea915d87c
--- /dev/null
+++ b/hw/intc/xive.c
@@ -0,0 +1,94 @@ 
+/*
+ * QEMU PowerPC XIVE model
+ *
+ * Copyright (c) 2017, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qapi/error.h"
+#include "target/ppc/cpu.h"
+#include "sysemu/cpus.h"
+#include "sysemu/dma.h"
+#include "monitor/monitor.h"
+#include "hw/ppc/xive.h"
+
+#include "xive-internal.h"
+
+/*
+ * Main XIVE object
+ */
+
+/* Let's provision some HW IRQ numbers. We could use a XIVE property
+ * also but it does not seem necessary for the moment.
+ */
+#define MAX_HW_IRQS_ENTRIES (8 * 1024)
+
+static void xive_init(Object *obj)
+{
+    ;
+}
+
+static void xive_realize(DeviceState *dev, Error **errp)
+{
+    XIVE *x = XIVE(dev);
+
+    if (!x->nr_targets) {
+        error_setg(errp, "Number of interrupt targets needs to be greater 0");
+        return;
+    }
+
+    /* Initialize IRQ number allocator. Let's use a base number if we
+     * need to introduce a notion of blocks one day.
+     */
+    x->int_base = 0;
+    x->int_count = x->nr_targets + MAX_HW_IRQS_ENTRIES;
+    x->int_max = x->int_base + x->int_count;
+    x->int_hw_bot = x->int_max;
+    x->int_ipi_top = x->int_base;
+
+    /* Reserve some numbers as OPAL does ? */
+    if (x->int_ipi_top < 0x10) {
+        x->int_ipi_top = 0x10;
+    }
+}
+
+static Property xive_properties[] = {
+    DEFINE_PROP_UINT32("nr-targets", XIVE, nr_targets, 0),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void xive_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = xive_realize;
+    dc->props = xive_properties;
+    dc->desc = "XIVE";
+}
+
+static const TypeInfo xive_info = {
+    .name = TYPE_XIVE,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_init = xive_init,
+    .instance_size = sizeof(XIVE),
+    .class_init = xive_class_init,
+};
+
+static void xive_register_types(void)
+{
+    type_register_static(&xive_info);
+}
+
+type_init(xive_register_types)
diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
new file mode 100644
index 000000000000..863f5a9c6b5f
--- /dev/null
+++ b/include/hw/ppc/xive.h
@@ -0,0 +1,27 @@ 
+/*
+ * QEMU PowerPC XIVE model
+ *
+ * Copyright (c) 2017, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef PPC_XIVE_H
+#define PPC_XIVE_H
+
+typedef struct XIVE XIVE;
+
+#define TYPE_XIVE "xive"
+#define XIVE(obj) OBJECT_CHECK(XIVE, (obj), TYPE_XIVE)
+
+#endif /* PPC_XIVE_H */