
[v2,3/3] PCI: ARM: add support for generic PCI host controller

Message ID 1392236171-10512-4-git-send-email-will.deacon@arm.com
State Superseded

Commit Message

Will Deacon Feb. 12, 2014, 8:16 p.m. UTC
This patch adds support for a generic PCI host controller, such as a
firmware-initialised device with static windows or an emulation by
something such as kvmtool.

The controller itself has no configuration registers and has its address
spaces described entirely by the device-tree (using the bindings from
ePAPR). Both CAM and ECAM are supported for Config Space accesses.

Corresponding documentation is added for the DT binding.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 .../devicetree/bindings/pci/arm-generic-pci.txt    |  51 ++++
 drivers/pci/host/Kconfig                           |   7 +
 drivers/pci/host/Makefile                          |   1 +
 drivers/pci/host/pci-arm-generic.c                 | 318 +++++++++++++++++++++
 4 files changed, 377 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/pci/arm-generic-pci.txt
 create mode 100644 drivers/pci/host/pci-arm-generic.c

Comments

Arnd Bergmann Feb. 12, 2014, 8:59 p.m. UTC | #1
On Wednesday 12 February 2014 20:16:11 Will Deacon wrote:
> +- ranges         : As described in IEEE Std 1275-1994, but must provide
> +                   at least a definition of one or both of IO and Memory
> +                   Space.

I'd say *must* provide at least non-prefetchable memory. *may* provide
prefetchable memory and/or I/O space.

> +- #address-cells : Must be 3
> +
> +- #size-cells    : Must be 2
> +
> +- reg            : The Configuration Space base address, as accessed by the
> +                   parent bus.
> +
> +Configuration Space is assumed to be memory-mapped (as opposed to being
> +accessed via an ioport) and laid out with a direct correspondence to the
> +geography of a PCI bus address by concatenating the various components to form
> +an offset.
> +
> +For CAM, this 24-bit offset is:
> +
> +        cfg_offset(bus, device, function, register) =
> +                   bus << 16 | device << 11 | function << 8 | register
> +
> +Whilst ECAM extends this by 4 bits to accommodate 4k of function space:
> +
> +        cfg_offset(bus, device, function, register) =
> +                   bus << 20 | device << 15 | function << 12 | register
> +
> +Interrupt mapping is exactly as described in `Open Firmware Recommended
> +Practice: Interrupt Mapping' and requires the following properties:
> +
> +- #interrupt-cells   : Must be 1
> +
> +- interrupt-map      : <see aforementioned specification>
> +
> +- interrupt-map-mask : <see aforementioned specification>

We probably want to add an optional 'bus-range' property (the default being
<0 255> if absent), so we don't have to map all the config space.

> diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
> index 47d46c6d8468..491d74c36f6a 100644
> --- a/drivers/pci/host/Kconfig
> +++ b/drivers/pci/host/Kconfig
> @@ -33,4 +33,11 @@ config PCI_RCAR_GEN2
>  	  There are 3 internal PCI controllers available with a single
>  	  built-in EHCI/OHCI host controller present on each one.
>  
> +config PCI_ARM_GENERIC
> +	bool "ARM generic PCI host controller"
> +	depends on ARM && OF
> +	help
> +	  Say Y here if you want to support a simple generic PCI host
> +	  controller, such as the one emulated by kvmtool.
> +
>  endmenu

Looks good for now. In the long run, I'd hope to get rid of the 'ARM'
part here and make it work on any architecture, but that requires
significant work that we should not depend on here.


> +struct gen_pci_cfg_window {
> +	u64					cpu_phys;
> +	void __iomem				*base;
> +	u8					bus;
> +	spinlock_t				lock;
> +	const struct gen_pci_cfg_accessors	*accessors;
> +};
> +
> +struct gen_pci_resource {
> +	struct list_head			list;
> +	struct resource				cpu_res;
> +	resource_size_t				offset;
> +};

Your gen_pci_resource is actually identical to struct pci_host_bridge_window,
which I guess is coincidence, but it would be nice to actually use
the standard structure to make it easier to integrate with common
infrastructure later.

> +
> +struct gen_pci {
> +	struct device				*dev;
> +	struct resource				*io_res;
> +	struct list_head			mem_res;
> +	struct gen_pci_cfg_window		cfg;
> +};

How about making this structure derived from pci_host_bridge?
That would already contain a lot of the members, and gets two things
right: 

* it's useful to have an embedded 'struct device' here, and use dev->parent
  to point to the device from DT
* for I/O, we actually want a pci_host_bridge_window, not just a resource,
  since we should keep track of the offset

> +static void __iomem *gen_pci_map_cfg_bus_ecam(struct pci_bus *bus,
> +					      unsigned int devfn,
> +					      int where)
> +{
> +	struct pci_sys_data *sys = bus->sysdata;
> +	struct gen_pci *pci = sys->private_data;
> +	u32 busn = bus->number;
> +
> +	spin_lock(&pci->cfg.lock);
> +	if (pci->cfg.bus != busn) {
> +		resource_size_t offset;
> +
> +		devm_iounmap(pci->dev, pci->cfg.base);
> +		offset = pci->cfg.cpu_phys + (busn << 20);
> +		pci->cfg.base = devm_ioremap(pci->dev, offset, SZ_1M);
> +		pci->cfg.bus = busn;
> +	}
> +
> +	return pci->cfg.base + ((devfn << 12) | where);
> +}

Nice idea, but unfortunately broken: we need config space access from
atomic context, since there are drivers doing that.

> +static int gen_pci_probe(struct platform_device *pdev)
> +{
> +	struct hw_pci hw;
> +	struct of_pci_range range;
> +	struct of_pci_range_parser parser;
> +	struct gen_pci *pci;
> +	const __be32 *reg;
> +	const struct of_device_id *of_id;
> +	struct device *dev = &pdev->dev;
> +	struct device_node *np = dev->of_node;

Could you try to move almost all of this function into gen_pci_setup()?
I suspect this will make it easier later to share this driver with other
architectures.

> +
> +	hw = (struct hw_pci) {
> +		.nr_controllers	= 1,
> +		.private_data	= (void **)&pci,
> +		.setup		= gen_pci_setup,
> +		.map_irq	= of_irq_parse_and_map_pci,
> +		.ops		= &gen_pci_ops,
> +	};
> +
> +	pci_common_init_dev(dev, &hw);
> +	return 0;
> +}

A missing part here seems to be a way to propagate errors from
the pci_common_init_dev or gen_pci_setup back to the caller.

This seems to be a result of the arm pcibios implementation not
being meant for loadable modules, but I suspect it can be fixed.
The nr_controllers>1 logic gets a bit in the way there, but it's
also a model that seems to be getting out of fashion:
kirkwood/dove/orion5x/mv78xx0 use it at the moment, but are
migrating over to the new pci-mvebu code that doesn't as they
get rid of the non-DT probing. pci-rcar-gen2.c uses it, but it
seems they already ran into problems with that and are changing
it. That pretty much leaves iop13xx as the only user, but at
that point we can probably move the loop into iop13xx specific
code.

	Arnd
Kumar Gala Feb. 12, 2014, 9:51 p.m. UTC | #2
On Feb 12, 2014, at 2:16 PM, Will Deacon <will.deacon@arm.com> wrote:

> This patch adds support for a generic PCI host controller, such as a
> firmware-initialised device with static windows or an emulation by
> something such as kvmtool.
> 
> The controller itself has no configuration registers and has its address
> spaces described entirely by the device-tree (using the bindings from
> ePAPR). Both CAM and ECAM are supported for Config Space accesses.
> 
> Corresponding documentation is added for the DT binding.
> 
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> ---
> .../devicetree/bindings/pci/arm-generic-pci.txt    |  51 ++++
> drivers/pci/host/Kconfig                           |   7 +
> drivers/pci/host/Makefile                          |   1 +
> drivers/pci/host/pci-arm-generic.c                 | 318 +++++++++++++++++++++
> 4 files changed, 377 insertions(+)
> create mode 100644 Documentation/devicetree/bindings/pci/arm-generic-pci.txt
> create mode 100644 drivers/pci/host/pci-arm-generic.c
> 
> diff --git a/Documentation/devicetree/bindings/pci/arm-generic-pci.txt b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
> new file mode 100644
> index 000000000000..cc7a35ecfa2d
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
> @@ -0,0 +1,51 @@
> +* ARM generic PCI host controller
> +
> +Firmware-initialised PCI host controllers and PCI emulations, such as the
> +virtio-pci implementations found in kvmtool and other para-virtualised
> +systems, do not require driver support for complexities such as regulator and
> +clock management. In fact, the controller may not even require the
> +configuration of a control interface by the operating system, instead
> +presenting a set of fixed windows describing a subset of IO, Memory and
> +Configuration Spaces.
> +
> +Such a controller can be described purely in terms of the standardized device
> +tree bindings communicated in pci.txt:
> +
> +- compatible     : Must be "arm,pci-cam-generic" or "arm,pci-ecam-generic"
> +                   depending on the layout of configuration space (CAM vs
> +                   ECAM respectively)

What’s arm specific here?  I don’t have a great suggestion, but seems odd for this to be vendor prefixed with "arm".

> +
> +- ranges         : As described in IEEE Std 1275-1994, but must provide
> +                   at least a definition of one or both of IO and Memory
> +                   Space.
> +
> +- #address-cells : Must be 3
> +
> +- #size-cells    : Must be 2
> +
> +- reg            : The Configuration Space base address, as accessed by the
> +                   parent bus.

Isn’t the size fixed here for cam or ecam?

> +
> +Configuration Space is assumed to be memory-mapped (as opposed to being
> +accessed via an ioport) and laid out with a direct correspondence to the
> +geography of a PCI bus address by concatenating the various components to form
> +an offset.
> +
> +For CAM, this 24-bit offset is:
> +
> +        cfg_offset(bus, device, function, register) =
> +                   bus << 16 | device << 11 | function << 8 | register
> +
> +Whilst ECAM extends this by 4 bits to accommodate 4k of function space:
> +
> +        cfg_offset(bus, device, function, register) =
> +                   bus << 20 | device << 15 | function << 12 | register
> +
> +Interrupt mapping is exactly as described in `Open Firmware Recommended
> +Practice: Interrupt Mapping' and requires the following properties:
> +
> +- #interrupt-cells   : Must be 1
> +
> +- interrupt-map      : <see aforementioned specification>
> +
> +- interrupt-map-mask : <see aforementioned specification>

Examples are always nice :)

- k
Will Deacon Feb. 13, 2014, 11:04 a.m. UTC | #3
Hi Arnd,

Thanks again for the comments.

On Wed, Feb 12, 2014 at 08:59:46PM +0000, Arnd Bergmann wrote:
> On Wednesday 12 February 2014 20:16:11 Will Deacon wrote:
> > +- ranges         : As described in IEEE Std 1275-1994, but must provide
> > +                   at least a definition of one or both of IO and Memory
> > +                   Space.
> 
> I'd say *must* provide at least non-prefetchable memory. *may* provide
> prefetchable memory and/or I/O space.

Can do. Should I enforce this in the driver too? (i.e. complain if
non-prefetchable memory is absent).

> > +Interrupt mapping is exactly as described in `Open Firmware Recommended
> > +Practice: Interrupt Mapping' and requires the following properties:
> > +
> > +- #interrupt-cells   : Must be 1
> > +
> > +- interrupt-map      : <see aforementioned specification>
> > +
> > +- interrupt-map-mask : <see aforementioned specification>
> 
> We probably want to add an optional 'bus-range' property (the default being
> <0 255> if absent), so we don't have to map all the config space.

Yes, and that will be important given your comments later on.

> > diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
> > index 47d46c6d8468..491d74c36f6a 100644
> > --- a/drivers/pci/host/Kconfig
> > +++ b/drivers/pci/host/Kconfig
> > @@ -33,4 +33,11 @@ config PCI_RCAR_GEN2
> >  	  There are 3 internal PCI controllers available with a single
> >  	  built-in EHCI/OHCI host controller present on each one.
> >  
> > +config PCI_ARM_GENERIC
> > +	bool "ARM generic PCI host controller"
> > +	depends on ARM && OF
> > +	help
> > +	  Say Y here if you want to support a simple generic PCI host
> > +	  controller, such as the one emulated by kvmtool.
> > +
> >  endmenu
> 
> Looks good for now. In the long run, I'd hope to get rid of the 'ARM'
> part here and make it work on any architecture, but that requires
> significant work that we should not depend on here.

Agreed. arm64 is the obvious next target (once Liviu's series is sorted out
-- I'm currently using a simplified port of bios32.c for testing).

> > +struct gen_pci_cfg_window {
> > +	u64					cpu_phys;
> > +	void __iomem				*base;
> > +	u8					bus;
> > +	spinlock_t				lock;
> > +	const struct gen_pci_cfg_accessors	*accessors;
> > +};
> > +
> > +struct gen_pci_resource {
> > +	struct list_head			list;
> > +	struct resource				cpu_res;
> > +	resource_size_t				offset;
> > +};
> 
> Your gen_pci_resource is actually identical to struct pci_host_bridge_window,
> which I guess is coincidence, but it would be nice to actually use
> the standard structure to make it easier to integrate with common
> infrastructure later.

Ha, at least I managed to come up with the same struct! I'll move to the
generic version.

> > +
> > +struct gen_pci {
> > +	struct device				*dev;
> > +	struct resource				*io_res;
> > +	struct list_head			mem_res;
> > +	struct gen_pci_cfg_window		cfg;
> > +};
> 
> How about making this structure derived from pci_host_bridge?
> That would already contain a lot of the members, and gets two things
> right: 
> 
> * it's useful to have an embedded 'struct device' here, and use dev->parent
>   to point to the device from DT
> * for I/O, we actually want a pci_host_bridge_window, not just a resource,
>   since we should keep track of the offset

Sure. Also, if we kill nr_controllers, then we can have a simple I/O space
allocator to populate the offset.
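
Something like the following, perhaps (just a rough sketch assuming the ARM
pci_ioremap_io() helper; the function and variable names are invented for
illustration):

#include <linux/io.h>
#include <linux/sizes.h>

/*
 * Rough sketch: hand out one 64K chunk of the PCI I/O space per host
 * bridge and return the offset that would go into the corresponding
 * pci_host_bridge_window.
 */
static unsigned long gen_pci_next_io;

static long gen_pci_alloc_io_window(phys_addr_t phys)
{
	unsigned long offset = gen_pci_next_io;
	int err;

	/* on ARM this maps SZ_64K at PCI_IO_VIRT_BASE + offset */
	err = pci_ioremap_io(offset, phys);
	if (err)
		return err;

	gen_pci_next_io += SZ_64K;
	return offset;
}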

> > +static void __iomem *gen_pci_map_cfg_bus_ecam(struct pci_bus *bus,
> > +					      unsigned int devfn,
> > +					      int where)
> > +{
> > +	struct pci_sys_data *sys = bus->sysdata;
> > +	struct gen_pci *pci = sys->private_data;
> > +	u32 busn = bus->number;
> > +
> > +	spin_lock(&pci->cfg.lock);
> > +	if (pci->cfg.bus != busn) {
> > +		resource_size_t offset;
> > +
> > +		devm_iounmap(pci->dev, pci->cfg.base);
> > +		offset = pci->cfg.cpu_phys + (busn << 20);
> > +		pci->cfg.base = devm_ioremap(pci->dev, offset, SZ_1M);
> > +		pci->cfg.bus = busn;
> > +	}
> > +
> > +	return pci->cfg.base + ((devfn << 12) | where);
> > +}
> 
> Nice idea, but unfortunately broken: we need config space access from
> atomic context, since there are drivers doing that.

That aside, I just took a spin_lock so this needs fixing regardless. The
only solution I can think of then is to map all of the buses at setup time
(making bus_ranges pretty important) and hope that I don't chew through all
of vmalloc.
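
For example (sketch only; the structures are the ones from this patch, the
single big mapping done once at setup is the new part):

#include <linux/io.h>
#include <linux/pci.h>
#include <linux/sizes.h>

/*
 * Sketch: map the whole ECAM window at setup time (256 buses << 20 ==
 * 256M of vmalloc space), so the accessor becomes plain arithmetic and
 * is safe from atomic context without any locking or remapping.
 */
static int gen_pci_map_all_cfg(struct gen_pci *pci)
{
	pci->cfg.base = devm_ioremap(pci->dev, pci->cfg.cpu_phys,
				     256 * SZ_1M);
	return pci->cfg.base ? 0 : -ENOMEM;
}

static void __iomem *gen_pci_map_cfg_bus_ecam(struct pci_bus *bus,
					      unsigned int devfn,
					      int where)
{
	struct pci_sys_data *sys = bus->sysdata;
	struct gen_pci *pci = sys->private_data;

	/* cfg.base now covers the whole bus range, mapped once in setup */
	return pci->cfg.base +
	       ((bus->number << 20) | (devfn << 12) | where);
}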

> > +static int gen_pci_probe(struct platform_device *pdev)
> > +{
> > +	struct hw_pci hw;
> > +	struct of_pci_range range;
> > +	struct of_pci_range_parser parser;
> > +	struct gen_pci *pci;
> > +	const __be32 *reg;
> > +	const struct of_device_id *of_id;
> > +	struct device *dev = &pdev->dev;
> > +	struct device_node *np = dev->of_node;
> 
> Could you try to move almost all of this function into gen_pci_setup()?
> I suspect this will make it easier later to share this driver with other
> architectures.

I'll take a look. If we get rid of nr_controllers, as suggested later on,
the line between probe and setup is somewhat blurred.

> > +
> > +	hw = (struct hw_pci) {
> > +		.nr_controllers	= 1,
> > +		.private_data	= (void **)&pci,
> > +		.setup		= gen_pci_setup,
> > +		.map_irq	= of_irq_parse_and_map_pci,
> > +		.ops		= &gen_pci_ops,
> > +	};
> > +
> > +	pci_common_init_dev(dev, &hw);
> > +	return 0;
> > +}
> 
> A missing part here seems to be a way to propagate errors from
> the pci_common_init_dev or gen_pci_setup back to the caller.
> 
> This seems to be a result of the arm pcibios implementation not
> being meant for loadable modules, but I suspect it can be fixed.
> The nr_controllers>1 logic gets a bit in the way there, but it's
> also a model that seems to be getting out of fashion:
> kirkwood/dove/orion5x/mv78xx0 use it at the moment, but are
> migrating over to the new pci-mvebu code that doesn't as they
> get rid of the non-DT probing. pci-rcar-gen2.c uses it, but it
> seems they already ran into problems with that and are changing
> it. That pretty much leaves iop13xx as the only user, but at
> that point we can probably move the loop into iop13xx specific
> code.

Makes sense once there are no users of the field.

Will
Will Deacon Feb. 13, 2014, 11:07 a.m. UTC | #4
On Wed, Feb 12, 2014 at 09:51:48PM +0000, Kumar Gala wrote:
> 
> On Feb 12, 2014, at 2:16 PM, Will Deacon <will.deacon@arm.com> wrote:
> 
> > This patch adds support for a generic PCI host controller, such as a
> > firmware-initialised device with static windows or an emulation by
> > something such as kvmtool.
> > 
> > The controller itself has no configuration registers and has its address
> > spaces described entirely by the device-tree (using the bindings from
> > ePAPR). Both CAM and ECAM are supported for Config Space accesses.
> > 
> > Corresponding documentation is added for the DT binding.
> > 
> > Signed-off-by: Will Deacon <will.deacon@arm.com>
> > ---
> > .../devicetree/bindings/pci/arm-generic-pci.txt    |  51 ++++
> > drivers/pci/host/Kconfig                           |   7 +
> > drivers/pci/host/Makefile                          |   1 +
> > drivers/pci/host/pci-arm-generic.c                 | 318 +++++++++++++++++++++
> > 4 files changed, 377 insertions(+)
> > create mode 100644 Documentation/devicetree/bindings/pci/arm-generic-pci.txt
> > create mode 100644 drivers/pci/host/pci-arm-generic.c
> > 
> > diff --git a/Documentation/devicetree/bindings/pci/arm-generic-pci.txt b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
> > new file mode 100644
> > index 000000000000..cc7a35ecfa2d
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
> > @@ -0,0 +1,51 @@
> > +* ARM generic PCI host controller
> > +
> > +Firmware-initialised PCI host controllers and PCI emulations, such as the
> > +virtio-pci implementations found in kvmtool and other para-virtualised
> > +systems, do not require driver support for complexities such as regulator and
> > +clock management. In fact, the controller may not even require the
> > +configuration of a control interface by the operating system, instead
> > +presenting a set of fixed windows describing a subset of IO, Memory and
> > +Configuration Spaces.
> > +
> > +Such a controller can be described purely in terms of the standardized device
> > +tree bindings communicated in pci.txt:
> > +
> > +- compatible     : Must be "arm,pci-cam-generic" or "arm,pci-ecam-generic"
> > +                   depending on the layout of configuration space (CAM vs
> > +                   ECAM respectively)
> 
> What’s arm specific here?  I don’t have a great suggestion, but seems odd
> for this to be vendor prefixed with "arm".

Happy to change it, but I'm also struggling for names. Maybe "linux,..."?

> > +- ranges         : As described in IEEE Std 1275-1994, but must provide
> > +                   at least a definition of one or both of IO and Memory
> > +                   Space.
> > +
> > +- #address-cells : Must be 3
> > +
> > +- #size-cells    : Must be 2
> > +
> > +- reg            : The Configuration Space base address, as accessed by the
> > +                   parent bus.
> 
> Isn’t the size fixed here for cam or ecam?

Yes, which is why reg just specifies the base address.
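
FWIW, the window size is implied by the compatible string: the 24-bit CAM
offset covers at most 16M and the 28-bit ECAM offset 256M for the full
0-255 bus range. Expressed as C, purely for illustration of the binding
text above:

#include <linux/types.h>

/* Illustrative only: the offsets from the binding, and hence the
 * maximum window size each layout implies (16M CAM, 256M ECAM).
 */
static inline u32 cam_offset(u32 bus, u32 dev, u32 fn, u32 reg)
{
	return bus << 16 | dev << 11 | fn << 8 | reg;	/* 24-bit offset */
}

static inline u32 ecam_offset(u32 bus, u32 dev, u32 fn, u32 reg)
{
	return bus << 20 | dev << 15 | fn << 12 | reg;	/* 28-bit offset */
}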

> > +Configuration Space is assumed to be memory-mapped (as opposed to being
> > +accessed via an ioport) and laid out with a direct correspondence to the
> > +geography of a PCI bus address by concatenating the various components to form
> > +an offset.
> > +
> > +For CAM, this 24-bit offset is:
> > +
> > +        cfg_offset(bus, device, function, register) =
> > +                   bus << 16 | device << 11 | function << 8 | register
> > +
> > +Whilst ECAM extends this by 4 bits to accommodate 4k of function space:
> > +
> > +        cfg_offset(bus, device, function, register) =
> > +                   bus << 20 | device << 15 | function << 12 | register
> > +
> > +Interrupt mapping is exactly as described in `Open Firmware Recommended
> > +Practice: Interrupt Mapping' and requires the following properties:
> > +
> > +- #interrupt-cells   : Must be 1
> > +
> > +- interrupt-map      : <see aforementioned specification>
> > +
> > +- interrupt-map-mask : <see aforementioned specification>
> 
> Examples are always nice :)

Not in this case! kvmtool generates the following:

	pci {
		#address-cells = <0x3>;
		#size-cells = <0x2>;
		#interrupt-cells = <0x1>;
		compatible = "arm,pci-cam-generic";
		reg = <0x0 0x40000000>;
		ranges = <0x1000000 0x0 0x0 0x0 0x0 0x0 0x10000 0x2000000 0x0 0x41000000 0x0 0x41000000 0x0 0x3f000000>;
		interrupt-map = <0x0 0x0 0x0 0x1 0x1 0x0 0x4 0x1 0x800 0x0 0x0 0x1 0x1 0x0 0x5 0x1 0x1000 0x0 0x0 0x1 0x1 0x0 0x6 0x1 0x1800 0x0 0x0 0x1 0x1 0x0 0x7 0x1>;
		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
	};

I can add it if you like, but it looks like a random bunch of numbers to me.

Will
Arnd Bergmann Feb. 13, 2014, 11:47 a.m. UTC | #5
On Thursday 13 February 2014 11:04:02 Will Deacon wrote:
> Hi Arnd,
> 
> Thanks again for the comments.
> 
> On Wed, Feb 12, 2014 at 08:59:46PM +0000, Arnd Bergmann wrote:
> > On Wednesday 12 February 2014 20:16:11 Will Deacon wrote:
> > > +- ranges         : As described in IEEE Std 1275-1994, but must provide
> > > +                   at least a definition of one or both of IO and Memory
> > > +                   Space.
> > 
> > I'd say *must* provide at least non-prefetchable memory. *may* provide
> > prefetchable memory and/or I/O space.
> 
> Can do. Should I enforce this in the driver too? (i.e. complain if
> non-prefetchable memory is absent).

Yes, good idea.

> > > diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
> > > index 47d46c6d8468..491d74c36f6a 100644
> > > --- a/drivers/pci/host/Kconfig
> > > +++ b/drivers/pci/host/Kconfig
> > > @@ -33,4 +33,11 @@ config PCI_RCAR_GEN2
> > >  	  There are 3 internal PCI controllers available with a single
> > >  	  built-in EHCI/OHCI host controller present on each one.
> > >  
> > > +config PCI_ARM_GENERIC
> > > +	bool "ARM generic PCI host controller"
> > > +	depends on ARM && OF
> > > +	help
> > > +	  Say Y here if you want to support a simple generic PCI host
> > > +	  controller, such as the one emulated by kvmtool.
> > > +
> > >  endmenu
> > 
> > Looks good for now. In the long run, I'd hope to get rid of the 'ARM'
> > part here and make it work on any architecture, but that requires
> > significant work that we should not depend on here.
> 
> Agreed. arm64 is the obvious next target (once Liviu's series is sorted out
> -- I'm currently using a simplified port of bios32.c for testing).

See also the reply I just sent on the previous thread for a migration
plan regarding the existing drivers.

> > > +struct gen_pci_cfg_window {
> > > +	u64					cpu_phys;
> > > +	void __iomem				*base;
> > > +	u8					bus;
> > > +	spinlock_t				lock;
> > > +	const struct gen_pci_cfg_accessors	*accessors;
> > > +};
> > > +
> > > +struct gen_pci_resource {
> > > +	struct list_head			list;
> > > +	struct resource				cpu_res;
> > > +	resource_size_t				offset;
> > > +};
> > 
> > Your gen_pci_resource is actually identical to struct pci_host_bridge_window,
> > which I guess is coincidence, but it would be nice to actually use
> > the standard structure to make it easier to integrate with common
> > infrastructure later.
> 
> Ha, at least I managed to come up with the same struct! I'll move to the
> generic version.

Hmm, I fear I was speaking too quickly, the pci_host_bridge_window actually
contains a pointer to the resource rather than the resource itself :(

There is probably a way to do this better, at least once we unify the
probe() and setup() functions.
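
For reference, it looks roughly like this (note the resource pointer):

struct pci_host_bridge_window {
	struct list_head list;
	struct resource *res;		/* host bridge aperture (CPU address) */
	resource_size_t offset;		/* bus address + offset = CPU address */
};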

> > > +static void __iomem *gen_pci_map_cfg_bus_ecam(struct pci_bus *bus,
> > > +					      unsigned int devfn,
> > > +					      int where)
> > > +{
> > > +	struct pci_sys_data *sys = bus->sysdata;
> > > +	struct gen_pci *pci = sys->private_data;
> > > +	u32 busn = bus->number;
> > > +
> > > +	spin_lock(&pci->cfg.lock);
> > > +	if (pci->cfg.bus != busn) {
> > > +		resource_size_t offset;
> > > +
> > > +		devm_iounmap(pci->dev, pci->cfg.base);
> > > +		offset = pci->cfg.cpu_phys + (busn << 20);
> > > +		pci->cfg.base = devm_ioremap(pci->dev, offset, SZ_1M);
> > > +		pci->cfg.bus = busn;
> > > +	}
> > > +
> > > +	return pci->cfg.base + ((devfn << 12) | where);
> > > +}
> > 
> > Nice idea, but unfortunately broken: we need config space access from
> > atomic context, since there are drivers doing that.
> 
> That aside, I just took a spin_lock so this needs fixing regardless. The
> only solution I can think of then is to map all of the buses at setup time
> (making bus_ranges pretty important) and hope that I don't chew through all
> of vmalloc.

It's possible we have to go further and only map the buses that are
actually used, rather than the ones that are defined for the bus,
but just mapping the entire range is a reasonable start I think.

> > > +
> > > +	hw = (struct hw_pci) {
> > > +		.nr_controllers	= 1,
> > > +		.private_data	= (void **)&pci,
> > > +		.setup		= gen_pci_setup,
> > > +		.map_irq	= of_irq_parse_and_map_pci,
> > > +		.ops		= &gen_pci_ops,
> > > +	};
> > > +
> > > +	pci_common_init_dev(dev, &hw);
> > > +	return 0;
> > > +}
> > 
> > A missing part here seems to be a way to propagate errors from
> > the pci_common_init_dev or gen_pci_setup back to the caller.
> > 
> > This seems to be a result of the arm pcibios implementation not
> > being meant for loadable modules, but I suspect it can be fixed.
> > The nr_controllers>1 logic gets a bit in the way there, but it's
> > also a model that seems to be getting out of fashion:
> > kirkwood/dove/orion5x/mv78xx0 use it at the moment, but are
> > migrating over to the new pci-mvebu code that doesn't as they
> > get rid of the non-DT probing. pci-rcar-gen2.c uses it, but it
> > seems they already ran into problems with that and are changing
> > it. That pretty much leaves iop13xx as the only user, but at
> > that point we can probably move the loop into iop13xx specific
> > code.
> 
> Makes sense once there are no users of the field.

With the patch I just suggested, we can simply keep
pci_common_init_dev() for older (non-multiplatform) controllers 
and not change them at all but move on to something else for
the interesting ones, i.e. those we want to share with arm64.

	Arnd
Will Deacon Feb. 13, 2014, noon UTC | #6
On Thu, Feb 13, 2014 at 11:47:46AM +0000, Arnd Bergmann wrote:
> On Thursday 13 February 2014 11:04:02 Will Deacon wrote:
> > On Wed, Feb 12, 2014 at 08:59:46PM +0000, Arnd Bergmann wrote:
> > > On Wednesday 12 February 2014 20:16:11 Will Deacon wrote:
> > > > +struct gen_pci_resource {
> > > > +	struct list_head			list;
> > > > +	struct resource				cpu_res;
> > > > +	resource_size_t				offset;
> > > > +};
> > > 
> > > Your gen_pci_resource is actually identical to struct pci_host_bridge_window,
> > > which I guess is coincidence, but it would be nice to actually use
> > > the standard structure to make it easier to integrate with common
> > > infrastructure later.
> > 
> > Ha, at least I managed to come up with the same struct! I'll move to the
> > generic version.
> 
> Hmm, I fear I was speaking too quickly, the pci_host_bridge_window actually
> contains a pointer to the resource rather than the resource itself :(

I can allocate the resources dynamically as I parse them, not a problem at
all.
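
i.e. something like this (just a sketch, assuming the
of_pci_range_to_resource() and pci_add_resource_offset() helpers; error
handling trimmed):

#include <linux/device.h>
#include <linux/of_address.h>
#include <linux/pci.h>

/*
 * Sketch: allocate a resource per window while walking the ranges and
 * register it together with its CPU-to-bus offset.
 */
static int gen_pci_parse_ranges(struct device *dev, struct device_node *np,
				struct pci_sys_data *sys)
{
	struct of_pci_range_parser parser;
	struct of_pci_range range;

	if (of_pci_range_parser_init(&parser, np))
		return -EINVAL;

	for_each_of_pci_range(&parser, &range) {
		struct resource *res;

		res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
		if (!res)
			return -ENOMEM;

		of_pci_range_to_resource(&range, np, res);
		pci_add_resource_offset(&sys->resources, res,
					res->start - range.pci_addr);
	}

	return 0;
}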

> There is probably a way to do this better, at least once we unify the
> probe() and setup() functions.

Yes, I fully expect this to be iterative.

> > > > +	spin_lock(&pci->cfg.lock);
> > > > +	if (pci->cfg.bus != busn) {
> > > > +		resource_size_t offset;
> > > > +
> > > > +		devm_iounmap(pci->dev, pci->cfg.base);
> > > > +		offset = pci->cfg.cpu_phys + (busn << 20);
> > > > +		pci->cfg.base = devm_ioremap(pci->dev, offset, SZ_1M);
> > > > +		pci->cfg.bus = busn;
> > > > +	}
> > > 
> > > Nice idea, but unfortunately broken: we need config space access from
> > > atomic context, since there are drivers doing that.
> > 
> > That aside, I just took a spin_lock so this needs fixing regardless. The
> > only solution I can think of then is to map all of the buses at setup time
> > (making bus_ranges pretty important) and hope that I don't chew through all
> > of vmalloc.
> 
> It's possible we have to go further and only map the buses that are
> actually used, rather than the ones that are defined for the bus,
> but just mapping the entire range is a reasonable start I think.

Okey doke.

> > > > +	hw = (struct hw_pci) {
> > > > +		.nr_controllers	= 1,
> > > > +		.private_data	= (void **)&pci,
> > > > +		.setup		= gen_pci_setup,
> > > > +		.map_irq	= of_irq_parse_and_map_pci,
> > > > +		.ops		= &gen_pci_ops,
> > > > +	};
> > > > +
> > > > +	pci_common_init_dev(dev, &hw);
> > > > +	return 0;
> > > > +}
> > > 
> > > A missing part here seems to be a way to propagate errors from
> > > the pci_common_init_dev or gen_pci_setup back to the caller.
> > > 
> > > This seems to be a result of the arm pcibios implementation not
> > > being meant for loadable modules, but I suspect it can be fixed.
> > > The nr_controllers>1 logic gets a bit in the way there, but it's
> > > also a model that seems to be getting out of fashion:
> > > kirkwood/dove/orion5x/mv78xx0 use it at the moment, but are
> > > migrating over to the new pci-mvebu code that doesn't as they
> > > get rid of the non-DT probing. pci-rcar-gen2.c uses it, but it
> > > seems they already ran into problems with that and are changing
> > > it. That pretty much leaves iop13xx as the only user, but at
> > > that point we can probably move the loop into iop13xx specific
> > > code.
> > 
> > Makes sense once there are no users of the field.
> 
> With the patch I just suggested, we can simply keep
> pci_common_init_dev() for older (non-multiplatform) controllers 
> and not change them at all but move on to something else for
> the interesting ones, i.e. those we want to share with arm64.

Yes, that looks sensible. An alternative is to create an hw_pci in
pci_host_bridge_register with nr_controllers = 1 then call
pci_common_init_dev. It would remove slightly more code, but obviously ties
the thing to arm.
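
Roughly this (sketch only; pci_host_bridge_register() here is the
hypothetical generic entry point, reusing the hw_pci initialisation from
the patch):

static int pci_host_bridge_register(struct device *dev, struct gen_pci *pci)
{
	struct hw_pci hw = {
		.nr_controllers	= 1,
		.private_data	= (void **)&pci,
		.setup		= gen_pci_setup,
		.map_irq	= of_irq_parse_and_map_pci,
		.ops		= &gen_pci_ops,
	};

	pci_common_init_dev(dev, &hw);
	return 0;
}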

Will
Arnd Bergmann Feb. 13, 2014, 12:21 p.m. UTC | #7
On Thursday 13 February 2014 12:00:42 Will Deacon wrote:
> > With the patch I just suggested, we can simply keep
> > pci_common_init_dev() for older (non-multiplatform) controllers 
> > and not change them at all but move on to something else for
> > the interesting ones, i.e. those we want to share with arm64.
> 
> Yes, that looks sensible. An alternative is to create an hw_pci in
> pci_host_bridge_register with nr_controllers = 1 then call
> pci_common_init_dev. It would remove slightly more code, but obviously ties
> the thing to arm.

You'd still need to pass all the contents of the hw_pci struct that
get copied into pci_sys_data, so that's not different from passing
hw_pci as we do today.

	Arnd
Kumar Gala Feb. 13, 2014, 4:22 p.m. UTC | #8
On Feb 13, 2014, at 5:07 AM, Will Deacon <will.deacon@arm.com> wrote:

> On Wed, Feb 12, 2014 at 09:51:48PM +0000, Kumar Gala wrote:
>> 
>> On Feb 12, 2014, at 2:16 PM, Will Deacon <will.deacon@arm.com> wrote:
>> 
>>> This patch adds support for a generic PCI host controller, such as a
>>> firmware-initialised device with static windows or an emulation by
>>> something such as kvmtool.
>>> 
>>> The controller itself has no configuration registers and has its address
>>> spaces described entirely by the device-tree (using the bindings from
>>> ePAPR). Both CAM and ECAM are supported for Config Space accesses.
>>> 
>>> Corresponding documentation is added for the DT binding.
>>> 
>>> Signed-off-by: Will Deacon <will.deacon@arm.com>
>>> ---
>>> .../devicetree/bindings/pci/arm-generic-pci.txt    |  51 ++++
>>> drivers/pci/host/Kconfig                           |   7 +
>>> drivers/pci/host/Makefile                          |   1 +
>>> drivers/pci/host/pci-arm-generic.c                 | 318 +++++++++++++++++++++
>>> 4 files changed, 377 insertions(+)
>>> create mode 100644 Documentation/devicetree/bindings/pci/arm-generic-pci.txt
>>> create mode 100644 drivers/pci/host/pci-arm-generic.c
>>> 
>>> diff --git a/Documentation/devicetree/bindings/pci/arm-generic-pci.txt b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
>>> new file mode 100644
>>> index 000000000000..cc7a35ecfa2d
>>> --- /dev/null
>>> +++ b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
>>> @@ -0,0 +1,51 @@
>>> +* ARM generic PCI host controller
>>> +
>>> +Firmware-initialised PCI host controllers and PCI emulations, such as the
>>> +virtio-pci implementations found in kvmtool and other para-virtualised
>>> +systems, do not require driver support for complexities such as regulator and
>>> +clock management. In fact, the controller may not even require the
>>> +configuration of a control interface by the operating system, instead
>>> +presenting a set of fixed windows describing a subset of IO, Memory and
>>> +Configuration Spaces.
>>> +
>>> +Such a controller can be described purely in terms of the standardized device
>>> +tree bindings communicated in pci.txt:
>>> +
>>> +- compatible     : Must be "arm,pci-cam-generic" or "arm,pci-ecam-generic"
>>> +                   depending on the layout of configuration space (CAM vs
>>> +                   ECAM respectively)
>> 
>> What’s arm specific here?  I don’t have a great suggestion, but seems odd
>> for this to be vendor prefixed with "arm".
> 
> Happy to change it, but I'm also struggling for names. Maybe "linux,…"?

I was thinking that as well, I’d say go with “linux,”.

>>> +- ranges         : As described in IEEE Std 1275-1994, but must provide
>>> +                   at least a definition of one or both of IO and Memory
>>> +                   Space.
>>> +
>>> +- #address-cells : Must be 3
>>> +
>>> +- #size-cells    : Must be 2
>>> +
>>> +- reg            : The Configuration Space base address, as accessed by the
>>> +                   parent bus.
>> 
>> Isn’t the size fixed here for cam or ecam?
> 
> Yes, which is why reg just specifies the base address.

Huh?  The reg property clearly has the size in it (as shown in the example below).  I guess I was just asking for the description here to say what the size was for the 2 compatibles since it's fixed and known.

> 
>>> +Configuration Space is assumed to be memory-mapped (as opposed to being
>>> +accessed via an ioport) and laid out with a direct correspondence to the
>>> +geography of a PCI bus address by concatenating the various components to form
>>> +an offset.
>>> +
>>> +For CAM, this 24-bit offset is:
>>> +
>>> +        cfg_offset(bus, device, function, register) =
>>> +                   bus << 16 | device << 11 | function << 8 | register
>>> +
>>> +Whilst ECAM extends this by 4 bits to accommodate 4k of function space:
>>> +
>>> +        cfg_offset(bus, device, function, register) =
>>> +                   bus << 20 | device << 15 | function << 12 | register
>>> +
>>> +Interrupt mapping is exactly as described in `Open Firmware Recommended
>>> +Practice: Interrupt Mapping' and requires the following properties:
>>> +
>>> +- #interrupt-cells   : Must be 1
>>> +
>>> +- interrupt-map      : <see aforementioned specification>
>>> +
>>> +- interrupt-map-mask : <see aforementioned specification>
>> 
>> Examples are always nice :)
> 
> Not in this case! kvmtool generates the following:
> 
> 	pci {
> 		#address-cells = <0x3>;
> 		#size-cells = <0x2>;
> 		#interrupt-cells = <0x1>;
> 		compatible = "arm,pci-cam-generic";
> 		reg = <0x0 0x40000000>;
> 		ranges = <0x1000000 0x0 0x0 0x0 0x0 0x0 0x10000 0x2000000 0x0 0x41000000 0x0 0x41000000 0x0 0x3f000000>;
> 		interrupt-map = <0x0 0x0 0x0 0x1 0x1 0x0 0x4 0x1 0x800 0x0 0x0 0x1 0x1 0x0 0x5 0x1 0x1000 0x0 0x0 0x1 0x1 0x0 0x6 0x1 0x1800 0x0 0x0 0x1 0x1 0x0 0x7 0x1>;
> 		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
> 	};
> 
> I can add it if you like, but it looks like a random bunch of numbers to me.

You could clean it up a bit to be human readable even if it's kvmtool that's creating it.

	pci {
		compatible = "arm,pci-cam-generic";
		#address-cells = <3>;
		#size-cells = <2>;
		#interrupt-cells = <1>;
		reg = <0x0 0x40000000>;
		ranges = <
			0x1000000 0x0 0x00000000 0x0 0x00000000 0x0 0x00010000
			0x2000000 0x0 0x41000000 0x0 0x41000000 0x0 0x3f000000
			>;
		interrupt-map = <
			...
			>;

		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
	};

> 
> Will

- k
Will Deacon Feb. 13, 2014, 4:25 p.m. UTC | #9
On Thu, Feb 13, 2014 at 04:22:25PM +0000, Kumar Gala wrote:
> On Feb 13, 2014, at 5:07 AM, Will Deacon <will.deacon@arm.com> wrote:
> >>> +
> >>> +- compatible     : Must be "arm,pci-cam-generic" or "arm,pci-ecam-generic"
> >>> +                   depending on the layout of configuration space (CAM vs
> >>> +                   ECAM respectively)
> >> 
> >> What’s arm specific here?  I don’t have a great suggestion, but seems odd
> >> for this to be vendor prefixed with "arm".
> > 
> > Happy to change it, but I'm also struggling for names. Maybe "linux,…"?
> 
> I was thinking that as well, I’d say go with “linux,”.

Great, I'll make that change.

> >>> +- ranges         : As described in IEEE Std 1275-1994, but must provide
> >>> +                   at least a definition of one or both of IO and Memory
> >>> +                   Space.
> >>> +
> >>> +- #address-cells : Must be 3
> >>> +
> >>> +- #size-cells    : Must be 2
> >>> +
> >>> +- reg            : The Configuration Space base address, as accessed by the
> >>> +                   parent bus.
> >> 
> >> Isn’t the size fixed here for cam or ecam?
> > 
> > Yes, which is why reg just specifies the base address.
> 
> Huh?  The reg property clearly has the size in it (as shown in the example
> below).  I guess I was just asking for the description here to say what
> the size was for the 2 compatibles since its fixed and known.

Actually, the example just has a 64-bit CPU physical address with no size.
Do I have to add a size to the binding? It's not at all useful for the driver.

> >> Examples are always nice :)
> > 
> > Not in this case! kvmtool generates the following:
> > 
> > 	pci {
> > 		#address-cells = <0x3>;
> > 		#size-cells = <0x2>;
> > 		#interrupt-cells = <0x1>;
> > 		compatible = "arm,pci-cam-generic";
> > 		reg = <0x0 0x40000000>;
> > 		ranges = <0x1000000 0x0 0x0 0x0 0x0 0x0 0x10000 0x2000000 0x0 0x41000000 0x0 0x41000000 0x0 0x3f000000>;
> > 		interrupt-map = <0x0 0x0 0x0 0x1 0x1 0x0 0x4 0x1 0x800 0x0 0x0 0x1 0x1 0x0 0x5 0x1 0x1000 0x0 0x0 0x1 0x1 0x0 0x6 0x1 0x1800 0x0 0x0 0x1 0x1 0x0 0x7 0x1>;
> > 		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
> > 	};
> > 
> > I can add it if you like, but it looks like a random bunch of numbers to me.
> 
> You could clean it up a bit to be human readable even if its kvmtool that’s creating it.
> 
> 	pci {
> 		compatible = "arm,pci-cam-generic”;
> 		#address-cells = <3>;
> 		#size-cells = <2>;
> 		#interrupt-cells = <1>
> 		reg = <0x0 0x40000000>;
> 		ranges = <
> 			0x1000000 0x0 0x00000000 0x0 0x00000000 0x0 0x00010000
> 			0x2000000 0x0 0x41000000 0x0 0x41000000 0x0 0x3f000000
> 			>;
> 		interrupt-map = <
> 			...
> 				>;
> 
> 		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;

Sure, if you think it helps.

Will
Arnd Bergmann Feb. 13, 2014, 4:28 p.m. UTC | #10
On Thursday 13 February 2014 10:22:25 Kumar Gala wrote:
> On Feb 13, 2014, at 5:07 AM, Will Deacon <will.deacon@arm.com> wrote:
> > Happy to change it, but I'm also struggling for names. Maybe "linux,…"?
>
> I was thinking that as well, I’d say go with “linux,”.

I see nothing linux specific in there. I would only use that namespace
for things we don't expect to work with another OS.

> >>> +- ranges         : As described in IEEE Std 1275-1994, but must provide
> >>> +                   at least a definition of one or both of IO and Memory
> >>> +                   Space.
> >>> +
> >>> +- #address-cells : Must be 3
> >>> +
> >>> +- #size-cells    : Must be 2
> >>> +
> >>> +- reg            : The Configuration Space base address, as accessed by the
> >>> +                   parent bus.
> >> 
> >> Isn’t the size fixed here for cam or ecam?
> > 
> > Yes, which is why reg just specifies the base address.
> 
> Huh?  The reg property clearly has the size in it (as shown in the example below).
> I guess I was just asking for the description here to say what the size was for
> the 2 compatibles since it's fixed and known.

It's still an open question whether the config space in the reg property should
cover all 256 buses or just the ones in the bus-range. In the latter case,
it would be variable (but predictable) size.
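
Predictable in the sense that it follows directly from the bus range, e.g.
(illustrative only):

/* config window size implied by an explicit bus-range */
static resource_size_t cfg_window_size(u8 bus_start, u8 bus_end, bool ecam)
{
	return (resource_size_t)(bus_end - bus_start + 1) << (ecam ? 20 : 16);
}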

> You could clean it up a bit to be human readable even if it's kvmtool that's creating it.
> 
> 	pci {
> 		compatible = "arm,pci-cam-generic”;
> 		#address-cells = <3>;
> 		#size-cells = <2>;
> 		#interrupt-cells = <1>
> 		reg = <0x0 0x40000000>;
> 		ranges = <
> 			0x1000000 0x0 0x00000000 0x0 0x00000000 0x0 0x00010000
> 			0x2000000 0x0 0x41000000 0x0 0x41000000 0x0 0x3f000000
> 			>;

Make it 

 		ranges = <0x1000000 0x0 0x00000000 0x0 0x00000000 0x0 0x00010000>,
 			 <0x2000000 0x0 0x41000000 0x0 0x41000000 0x0 0x3f000000>;

and it will be perfect ;-)

	Arnd


Jason Gunthorpe Feb. 13, 2014, 6:06 p.m. UTC | #11
On Thu, Feb 13, 2014 at 11:07:21AM +0000, Will Deacon wrote:

> Not in this case! kvmtool generates the following:

Well, the example is nice so someone can review it..
 
> 	pci {
> 		#address-cells = <0x3>;
> 		#size-cells = <0x2>;
> 		#interrupt-cells = <0x1>;
> 		compatible = "arm,pci-cam-generic";
> 		reg = <0x0 0x40000000>;
> 		ranges = <0x1000000 0x0 0x0 0x0 0x0 0x0 0x10000 0x2000000 0x0 0x41000000 0x0 0x41000000 0x0 0x3f000000>;
> 		interrupt-map = <0x0 0x0 0x0 0x1 0x1 0x0 0x4 0x1 0x800 0x0 0x0 0x1 0x1 0x0 0x5 0x1 0x1000 0x0 0x0 0x1 0x1 0x0 0x6 0x1 0x1800 0x0 0x0 0x1 0x1 0x0 0x7 0x1>;
> 		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
> 	};


Looks like there are a few misses in the above. How about this:

pci {
    compatible = "arm,pci-cam-generic"
    device_type = "pci";
    ** ^^^^^^^^^^^^^^^^^
    ** MANDATORY for the host bridge node
    #address-cells = <3>;
    #size-cells = <2>;

    // BUS_ADDRESS(3)  CPU_PHYSICAL(2)  SIZE(2)
    ranges = <0x1000000 0x0 0x00000000  0x0 0x00000000  0x0 0x00010000>,
                                        ^^^^^^^^^^^^^^
            ** ?? Is this why you had problems with the offset? Should
	    ** be the cpu physical address of the start of the IO window.
             <0x2000000 0x0 0x41000000  0x0 0x41000000  0x0 0x3f000000>;

   
    #interrupt-cells = <0x1>;
    // PCI_DEVICE(3) INT#(1)  CONTROLLER(PHANDLE)  CONTROLLER_DATA(3)
    interrupt-map = <  0x0 0x0 0x0 0x1  0x1  0x0 0x4 0x1
                     0x800 0x0 0x0 0x1  0x1  0x0 0x5 0x1 
                    0x1000 0x0 0x0 0x1  0x1  0x0 0x6 0x1
                    0x1800 0x0 0x0 0x1  0x1  0x0 0x7 0x1>;
                 **                    ^^^^^^
                 ** This should be a phandle to the interrupt controller
    // PCI_DEVICE(3) INT#1()
    interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
}

I keep thinking of making a pcie-dt.h file with helper macros for
all this. :|

FWIW, I like to put a double space between the logically distinct
things in the lists.

Jason
Jason Gunthorpe Feb. 13, 2014, 6:26 p.m. UTC | #12
On Thu, Feb 13, 2014 at 05:28:20PM +0100, Arnd Bergmann wrote:

> > Huh?  The reg property clearly has the size in it (as shown in the
> > example below).  I guess I was just asking for the description
> > here to say what the size was for the 2 compatibles since it's
> > fixed and known.
> 
> It's still an open question whether the config space in the reg
> property should cover all 256 buses or just the ones in the
> bus-range. In the latter case, it would be variable (but
> predictable) size.

The 'describe the hardware' principle says the reg should be the
entire available ECAM/CAM region the hardware is able to support.

This may be less than 256 busses, as ECAM allows the implementor to
select how many upper address bits are actually supported.

IMHO, the bus-range should be used to indicate the range of busses
discovered by the firmware, but we have historically tweaked it to
indicate the max range of bus numbers available on this bus (I think
to support the hack where two physical PCI domains were roughly glued
into a single Linux domain).

Which is not necessary when the DT stanza maps 1:1 into a PCI
domain. The driver default for bus-range should just be 0 to 255, and
it shouldn't be included in most cases.

The max busnr resource passed to the PCI core should be the lower of
the busnr property and the reg limit, IMHO.
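
i.e. something like (illustrative only):

/* last usable bus = min(DT bus-range end, what the reg window can map) */
static u8 gen_pci_last_busnr(resource_size_t cfg_size, bool ecam, u8 dt_last)
{
	u8 reg_last = (cfg_size >> (ecam ? 20 : 16)) - 1;

	return min(dt_last, reg_last);
}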

The issue with burning VM in Linux is a Linux issue and shouldn't leak
into the DT... Ideally the solution here would be to have the PCI core
call back to the host driver when it allocates/deallocates a bus
number and then the driver can manage the config space VM mapping on a
bus by bus basis.

Jason
Will Deacon Feb. 13, 2014, 7:51 p.m. UTC | #13
On Thu, Feb 13, 2014 at 06:06:16PM +0000, Jason Gunthorpe wrote:
> On Thu, Feb 13, 2014 at 11:07:21AM +0000, Will Deacon wrote:
> 
> > Not in this case! kvmtool generates the following:
> 
> Well, the example is nice so someone can review it..
>  
> > 	pci {
> > 		#address-cells = <0x3>;
> > 		#size-cells = <0x2>;
> > 		#interrupt-cells = <0x1>;
> > 		compatible = "arm,pci-cam-generic";
> > 		reg = <0x0 0x40000000>;
> > 		ranges = <0x1000000 0x0 0x0 0x0 0x0 0x0 0x10000 0x2000000 0x0 0x41000000 0x0 0x41000000 0x0 0x3f000000>;
> > 		interrupt-map = <0x0 0x0 0x0 0x1 0x1 0x0 0x4 0x1 0x800 0x0 0x0 0x1 0x1 0x0 0x5 0x1 0x1000 0x0 0x0 0x1 0x1 0x0 0x6 0x1 0x1800 0x0 0x0 0x1 0x1 0x0 0x7 0x1>;
> > 		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
> > 	};
> 
> 
> Looks like there are a few misses in the above. How about this:
> 
> pci {
>     compatible = "arm,pci-cam-generic"
>     device_type = "pci";
>     ** ^^^^^^^^^^^^^^^^^
>     ** MANDATORY for the host bridge node

Ok, I'll add that.

>     #address-cells = <3>;
>     #size-cells = <2>;
> 
>     // BUS_ADDRESS(3)  CPU_PHYSICAL(2)  SIZE(2)
>     ranges = <0x1000000 0x0 0x00000000  0x0 0x00000000  0x0 0x00010000>,
>                                         ^^^^^^^^^^^^^^
>             ** ?? Is this why you had problems with the offset? Should
> 	    ** be the cpu physical address of the start of the IO window.
>              <0x2000000 0x0 0x41000000  0x0 0x41000000  0x0 0x3f000000>;
> 
>    
>     #interrupt-cells = <0x1>;
>     // PCI_DEVICE(3) INT#(1)  CONTROLLER(PHANDLE)  CONTROLLER_DATA(3)
>     interrupt-map = <  0x0 0x0 0x0 0x1  0x1  0x0 0x4 0x1
>                      0x800 0x0 0x0 0x1  0x1  0x0 0x5 0x1 
>                     0x1000 0x0 0x0 0x1  0x1  0x0 0x6 0x1
>                     0x1800 0x0 0x0 0x1  0x1  0x0 0x7 0x1>;
>                  **                    ^^^^^^
>                  ** This should be a phandle to the interrupt controller

That is a phandle, libfdt/kvmtool just ends up dealing with the number
directly. I'll fix in the example.

>     // PCI_DEVICE(3) INT#1()
>     interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
> }
> 
> I keep thinking of making a pcie-dt.h file with helper macros for
> all this. :|

If it's not being generated automatically, that could be useful.

> FWIW, I like to put a double space between the logically distinct
> things in the lists.


Makes sense, I may well do the same.

Thanks,

Will
Rob Herring Feb. 13, 2014, 7:52 p.m. UTC | #14
On Thu, Feb 13, 2014 at 10:22 AM, Kumar Gala <galak@codeaurora.org> wrote:
>
> On Feb 13, 2014, at 5:07 AM, Will Deacon <will.deacon@arm.com> wrote:
>
>> On Wed, Feb 12, 2014 at 09:51:48PM +0000, Kumar Gala wrote:
>>>
>>> On Feb 12, 2014, at 2:16 PM, Will Deacon <will.deacon@arm.com> wrote:
>>>
>>>> This patch adds support for a generic PCI host controller, such as a
>>>> firmware-initialised device with static windows or an emulation by
>>>> something such as kvmtool.
>>>>
>>>> The controller itself has no configuration registers and has its address
>>>> spaces described entirely by the device-tree (using the bindings from
>>>> ePAPR). Both CAM and ECAM are supported for Config Space accesses.
>>>>
>>>> Corresponding documentation is added for the DT binding.
>>>>
>>>> Signed-off-by: Will Deacon <will.deacon@arm.com>
>>>> ---
>>>> .../devicetree/bindings/pci/arm-generic-pci.txt    |  51 ++++
>>>> drivers/pci/host/Kconfig                           |   7 +
>>>> drivers/pci/host/Makefile                          |   1 +
>>>> drivers/pci/host/pci-arm-generic.c                 | 318 +++++++++++++++++++++
>>>> 4 files changed, 377 insertions(+)
>>>> create mode 100644 Documentation/devicetree/bindings/pci/arm-generic-pci.txt
>>>> create mode 100644 drivers/pci/host/pci-arm-generic.c
>>>>
>>>> diff --git a/Documentation/devicetree/bindings/pci/arm-generic-pci.txt b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
>>>> new file mode 100644
>>>> index 000000000000..cc7a35ecfa2d
>>>> --- /dev/null
>>>> +++ b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
>>>> @@ -0,0 +1,51 @@
>>>> +* ARM generic PCI host controller
>>>> +
>>>> +Firmware-initialised PCI host controllers and PCI emulations, such as the
>>>> +virtio-pci implementations found in kvmtool and other para-virtualised
>>>> +systems, do not require driver support for complexities such as regulator and
>>>> +clock management. In fact, the controller may not even require the
>>>> +configuration of a control interface by the operating system, instead
>>>> +presenting a set of fixed windows describing a subset of IO, Memory and
>>>> +Configuration Spaces.
>>>> +
>>>> +Such a controller can be described purely in terms of the standardized device
>>>> +tree bindings communicated in pci.txt:
>>>> +
>>>> +- compatible     : Must be "arm,pci-cam-generic" or "arm,pci-ecam-generic"
>>>> +                   depending on the layout of configuration space (CAM vs
>>>> +                   ECAM respectively)
>>>
>>> What's arm specific here?  I don't have a great suggestion, but seems odd
>>> for this to be vendor prefixed with "arm".
>>
>> Happy to change it, but I'm also struggling for names. Maybe "linux,..."?
>
> I was thinking that as well, I'd say go with "linux,".

Just drop the prefix altogether.

I'm wondering if this should have host or rc in the name.

Rob
Will Deacon Feb. 13, 2014, 7:53 p.m. UTC | #15
On Thu, Feb 13, 2014 at 06:26:54PM +0000, Jason Gunthorpe wrote:
> On Thu, Feb 13, 2014 at 05:28:20PM +0100, Arnd Bergmann wrote:
> 
> > > Huh?  The reg property clearly has the size in it (as shown in the
> > > example below).  I guess I was just asking for the description
> > > here to say what the size was for the 2 compatibles since its
> > > fixed and known.
> > 
> > It's still an open question whether the config space in the reg
> > property should cover all 256 buses or just the ones in the
> > bus-range. In the latter case, it would be variable (but
> > predictable) size.
> 
> The 'describe the hardware principle' says the reg should be the
> entire available ECAM/CAM region the hardware is able to support.
> 
> This may be less than 256 busses, as ECAM allows the implementor to
> select how many upper address bits are actually supported.

Ok, but the ECAM/CAM base always corresponds to bus 0, right?

> IMHO, the bus-range should be used to indicate the range of busses
> discovered by the firmware, but we have historically tweaked it to
> indicate the max range of bus numbers available on this bus (I think
> to support the hack where two physical PCI domains were roughly glued
> into a single Linux domain).

Ok, so this answers Kumar's point about the reg property. I'll augment it
with a size.

Will
Jason Gunthorpe Feb. 13, 2014, 8:20 p.m. UTC | #16
On Thu, Feb 13, 2014 at 07:53:17PM +0000, Will Deacon wrote:
> > This may be less than 256 busses, as ECAM allows the implementor to
> > select how many upper address bits are actually supported.
> 
> Ok, but the ECAM/CAM base always corresponds to bus 0, right?

Yes, or it isn't ECAM.

> Ok, so this answers Kumar's point about the reg property. I'll augment it
> with a size.

Don't forget to request_region it as well...

Jason
Arnd Bergmann Feb. 14, 2014, 9:59 a.m. UTC | #17
On Thursday 13 February 2014 19:53:17 Will Deacon wrote:
> On Thu, Feb 13, 2014 at 06:26:54PM +0000, Jason Gunthorpe wrote:
> > On Thu, Feb 13, 2014 at 05:28:20PM +0100, Arnd Bergmann wrote:
> > 
> > > > Huh?  The reg property clearly has the size in it (as shown in the
> > > > example below).  I guess I was just asking for the description
> > > > here to say what the size was for the 2 compatibles since its
> > > > fixed and known.
> > > 
> > > It's still an open question whether the config space in the reg
> > > property should cover all 256 buses or just the ones in the
> > > bus-range. In the latter case, it would be variable (but
> > > predictable) size.
> > 
> > The 'describe the hardware principle' says the reg should be the
> > entire available ECAM/CAM region the hardware is able to support.
> > 
> > This may be less than 256 busses, as ECAM allows the implementor to
> > select how many upper address bits are actually supported.
> 
> Ok, but the ECAM/CAM base always corresponds to bus 0, right?

Ah, plus I suppose it ought to be a power-of-two size?

> > IMHO, the bus-range should be used to indicate the range of busses
> > discovered by the firmware, but we have historically tweaked it to
> > indicate the max range of bus numbers available on this bus (I think
> > to support the hack where two physical PCI domains were roughly glued
> > into a single Linux domain).

There is an interesting point about the domain assignment, brought to
my attention by Russell's comment about the hw_pci struct: If we want
to support arbitrary combinations of pci host bridges described in DT,
we need a better policy to decide what domain to use. The approaches
I've seen so far are:

1. We assume each host bridge in DT is a domain by itself. I think
we do that for all DT probed bridges on ARM (aside from shmobile)
at the moment. In some cases, the host bridge is really a
fiction made up by the host driver to couple various identical
but independent PCIe root ports, but the same fiction is shared
between DT and the PCI core view of it. This requires that we
enable the PCI domain code unconditionally, and breaks all user
space that doesn't understand domains (this should be rare but
can still exist for x86 based software).

2. The architecture or platform code decides and uses a method equivalent
to ARM's pci_common_init_dev() after it has found all host bridges.
The architecture "knows" how many domains it wants and calls
pci_common_init_dev() for each domain, and then the setup() callbacks
grab as many buses as they need within the domain. For a generic
multiplatform kernel, this means we need to add a top-level driver
that looks at all pci hosts in DT before any of them are probed.
It also means the pci host drivers can't be loadable modules.

3. We assume there is only one domain, and require each host bridge
in DT to specify a bus-range that is a subset of the available 256
bus numbers. This should work for anything but really big systems
with many hot-pluggable ports, since we need to reserve a few bus
numbers on each port for hotplugging.

4. Like 3, but start a new domain if the bus-range properties don't
fit in the existing domains.

5. Like 3, but specify a generic "pci-domain" property for DT
that allows putting host bridges into explicit domains in
a predictable way.

	Arnd
Liviu Dudau Feb. 14, 2014, 10 p.m. UTC | #18
On Fri, Feb 14, 2014 at 10:59:06AM +0100, Arnd Bergmann wrote:
> On Thursday 13 February 2014 19:53:17 Will Deacon wrote:
> > On Thu, Feb 13, 2014 at 06:26:54PM +0000, Jason Gunthorpe wrote:
> > > On Thu, Feb 13, 2014 at 05:28:20PM +0100, Arnd Bergmann wrote:
> > > 
> > > > > Huh?  The reg property clearly has the size in it (as shown in the
> > > > > example below).  I guess I was just asking for the description
> > > > > here to say what the size was for the 2 compatibles since its
> > > > > fixed and known.
> > > > 
> > > > It's still an open question whether the config space in the reg
> > > > property should cover all 256 buses or just the ones in the
> > > > bus-range. In the latter case, it would be variable (but
> > > > predictable) size.
> > > 
> > > The 'describe the hardware principle' says the reg should be the
> > > entire available ECAM/CAM region the hardware is able to support.
> > > 
> > > This may be less than 256 busses, as ECAM allows the implementor to
> > > select how many upper address bits are actually supported.
> > 
> > Ok, but the ECAM/CAM base always corresponds to bus 0, right?
> 
> Ah, plus I suppose it ought to be a power-of-two size?
> 
> > > IMHO, the bus-range should be used to indicate the range of busses
> > > discovered by the firmware, but we have historically tweaked it to
> > > indicate the max range of bus numbers available on this bus (I think
> > > to support the hack where two physical PCI domains were roughly glued
> > > into a single Linux domain).
> 
> There is an interesting point about the domain assignment, brought to
> my attention by Russell's comment about the hw_pci struct: If we want
> to support arbitrary combinations of pci host bridges described in DT,
> we need a better policy to decide what domain to use. The approaches
> I've seen so far are:
> 
> 1. We assume each host bridge in DT is a domain by itself. I think
> we do that for all DT probed bridges on ARM (aside from shmobile)
> at the moment. In some cases, the host bridge is really a
> fiction made up by the host driver to couple various identical
> but independent PCIe root ports, but the same fiction is shared
> between DT and the PCI core view of it. This requires that we
> enable the PCI domain code unconditionally, and breaks all user
> space that doesn't understand domains (this should be rare but
> can still exist for x86 based software).
> 
> 2. The architecture or platform code decides and uses a method equivalent
> to ARM's pci_common_init_dev() after it has found all host bridges.
> The architecture "knows" how many domains it wants and calls
> pci_common_init_dev() for each domain, and then the setup() callbacks
> grab as many buses as they need within the domain. For a generic
> multiplatform kernel, this means we need to add a top-level driver
> that looks at all pci hosts in DT before any of them are probed.
> It also means the pci host drivers can't be loadable modules.
> 
> 3. We assume there is only one domain, and require each host bridge
> in DT to specify a bus-range that is a subset of the available 256
> bus numbers. This should work for anything but really big systems
> with many hot-pluggable ports, since we need to reserve a few bus
> numbers on each port for hotplugging.
> 
> 4. Like 3, but start a new domain if the bus-range properties don't
> fit in the existing domains.
> 
> 5. Like 3, but specify a generic "pci-domain" property for DT
> that allows putting host bridges into explicit domains in
> a predictable way.

What I'm going to suggest in my v2 patch (hope to send it before Monday)
is a new API in the generic PCI code that will allow you to create a
host bridge in a new domain or in the existing domain, with the management
of the domain number being done in the generic code.

Something like:

  int create_hostbridge_in_new_domain(....);
  int create_hostbridge(....);

with the functions being wrappers around the pci_hostbridge_of_init function
that I'm introducing.

What do you think?

Best regards,
Liviu

> 
> 	Arnd
Arnd Bergmann Feb. 15, 2014, 1:03 p.m. UTC | #19
On Friday 14 February 2014 22:00:47 Liviu Dudau wrote:
> 
> What I'm going to suggest in my v2 patch (hope to send it before Monday)
> is a new API in the generic PCI code that will allow you to create a
> host bridge in a new domain or in the existing domain, with the management
> of the domain number being done in the generic code.
> 
> Something like:
> 
>   int create_hostbridge_in_new_domain(....);
>   int create_hostbridge(....);
> 
> with the functions being wrappers around the pci_hostbridge_of_init function
> that I'm introducing.
> 
> What do you think?

Not sure. It would still keep the decision about whether to use a
new domain or not in the host bridge driver, but that doesn't seem
like the right place. The same driver might be used in different
ways based on what is around it.

I've also had a look at the MIPS implementation now, which has its
own way of dealing with this, see arch/mips/pci/pci.c.

There was also an interesting comment in the MIPS header file:

        /* For compatibility with current (as of July 2003) pciutils
           and XFree86. Eventually will be removed. */
        unsigned int need_domain_info;

This is over ten years old, so I wonder if we can start assuming that
domains work out of the box now. All references to problems from 
PCI domains are about old code (ca. pre-2007) that doesn't understand
nonzero domains and that has since been fixed. I am pretty sure we
don't need to ever worry about stuffing multiple host bridges into
a domain other than the first one, and I also doubt we have to worry
about the problem at all on arm64 as we won't run old binaries on it
(or arm32 compat mode binaries that need to manage PCI devices).

Can anyone with more experience on the subject than me (Bjorn,
Russell, Jason, ...) think of a reason why we would not want to
just use a new domain for every host bridge we find?

If we do need to stuff multiple buses into one domain, how about
using this approach in pci_hostbridge_of_init():

  The first time we add a host bridge, scan the entire DT for
  devices setting device_type="pci". If there is actually more
  than one host bridge, check all "bus-range" properties to
  see if they overlap. If there is no overlap or only one
  bridge, don't use domains.
  If there is any overlap at all, or there are multiple host
  bridge and one of them does not have a bus-range property,
  use a separate domain per host bridge.
  Remember the function was called before so it doesn't
  have to scan the DT again, and count the domain locally.
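
In code that could look something like the sketch below (hypothetical
helper name, no locking, result not cached):

  #include <linux/bitmap.h>
  #include <linux/bitops.h>
  #include <linux/of.h>

  /* Decide once whether the DT-described host bridges need separate
   * domains, following the heuristic above. */
  static bool pci_dt_hosts_need_domains(void)
  {
          struct device_node *np;
          DECLARE_BITMAP(used_buses, 256);
          u32 bus_range[2];
          bool overlap = false, missing = false;
          int i, nhosts = 0;

          bitmap_zero(used_buses, 256);

          for_each_node_by_type(np, "pci") {
                  nhosts++;
                  if (of_property_read_u32_array(np, "bus-range", bus_range, 2)) {
                          missing = true;         /* no bus-range property */
                          continue;
                  }
                  for (i = bus_range[0]; i <= bus_range[1] && i < 256; i++)
                          if (test_and_set_bit(i, used_buses))
                                  overlap = true;
          }

          /* One bridge, or several with disjoint bus-ranges: no domains. */
          return nhosts > 1 && (overlap || missing);
  }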

Does this make sense?

	Arnd
Jason Gunthorpe Feb. 18, 2014, 5:41 p.m. UTC | #20
On Sat, Feb 15, 2014 at 02:03:26PM +0100, Arnd Bergmann wrote:

> Can anyone with more experience on the subject than me (Bjorn,
> Russell, Jason, ...) think of a reason why we would not want to
> just use a new domain for every host bridge we find?

I personaly think we can safely move away from stuffing multiple host
bridges into a single domain for DT cases. The reasons for doing this
have long since been superseded.

Most importantly, I have a feeling keeping a 1:1 relationship between
domain and driver will making building a proper modular and hot
pluggable host driver infrastructure in the PCI core significantly
simpler. The domain object gives a nice natural place to put things in
sysfs, a natural place to keep function pointers and avoids all the
messy questions of what happens if probing overlaps bus numbers, how
do you number things, how do you hot plug downstream busses, etc.

Having a PCI core driver infrastructure that supports both 'as a
domain' and 'as a bunch of busses' seems much more complex, and I
can't really identify what is being gained by continuing to support
this.

As far as I know the host bridge stuffing is something that was
created before domains to solve the problem on embedded of multiple
PCI host bridges on a SOC/System Controller. I know I have used it
that way in the distant past (for MIPS).

However today PCI-SIG has a standard way to describe multi-port
root-complexes in config space, so we should not need to use the
multi-bus hack. SOCs with non-compliant HW that *really* need a single
domain can get there: mvebu shows how to write a driver that provides
a software version of the missing hardware elements. Pushing mess like
this out of the core code seems like a good strategy.

The only reason I can think of to avoid using a domain is if Linux has
to interface with external firmware that uses bus:device.function
notation for coding information. (eg Intel-MP tables on x86 encode
interrupt routing use B:D.F) In this case Linux would need a way to
map between Linux B:D.F and the Firmware B:D.F, or it would need to
use the Firmware B:D.F layout. But this argument does not apply to DT
systems as DT encodes the domain too. Presumably ACPI will be the
same.

Also, bear in mind we now already have multi-domain host drivers for
ARM, so the multi-platform kernels need to have this option turned on.

So Liviu, I would say the API should be similar to what we see in
other OF-enabled driver-based subsystems: call the core code with a
platform_device pointer and function_ops pointer and have the core
code create a domain, figure out the domain # from the DT (via
aliases?), and so on.

Jason
Arnd Bergmann Feb. 18, 2014, 6:25 p.m. UTC | #21
On Tuesday 18 February 2014 10:41:25 Jason Gunthorpe wrote:
> On Sat, Feb 15, 2014 at 02:03:26PM +0100, Arnd Bergmann wrote:
> 
> > Can anyone with more experience on the subject than me (Bjorn,
> > Russell, Jason, ...) think of a reason why we would not want to
> > just use a new domain for every host bridge we find?
> 
> I personaly think we can safely move away from stuffing multiple host
> bridges into a single domain for DT cases. The reasons for doing this
> have long since been superseded.

Ok, that would definitely be the simplest answer. :)

> As far as I know the host bridge stuffing is something that was
> created before domains to solve the problem on embedded of multiple
> PCI host bridges on a SOC/System Controller. I know I have used it
> that way in the distant past (for MIPS).

Apple have also used the same trick on their G5 Macs, presumably
to simplify things for OS9 and OS-X, but even at the time making
it harder for Linux.
 
> However today PCI-SIG has a standard way to describe multi-port
> root-complexes in config space, so we should not need to use the
> multi-bus hack. SOCs with non-compliant HW that *really* need single
> domain can get there: mvebu shows how to write a driver that provides
> a software version of the missing hardware elements. Pushing mess like
> this out of the core code seems like a good strategy.

Ok.

> The only reason I can think of to avoid using a domain is if Linux has
> to interface with external firmware that uses bus:device.function
> notation for coding information. (eg Intel-MP tables on x86 encode
> interrupt routing use B:D.F) In this case Linux would need a way to
> map between Linux B:D.F and the Firwmare B:D.F, or it would need to
> use the Firmware B:D.F layout. But this argument does not apply to DT
> systems as DT encodes the domain too. Presumably ACPI will be the
> same.

ACPI definitely has a concept of domains, as I noticed when looking
at the x86 ACPI PCI probing code.

> So the Liviu, I would say the API should be similar to what we see in
> other OF enabled driver bases subsystems, call the core code with a
> platform_device pointer and function_ops pointer and have the core
> code create a domain, figure out the domain # from the DT (via
> aliases?), and so on.

Do we even need stable domain numbers? If we do, aliases sound fine.
A more complex method would be to sort them by MMIO window address
or perhaps by phandle.

	Arnd
Jason Gunthorpe Feb. 18, 2014, 6:45 p.m. UTC | #22
On Tue, Feb 18, 2014 at 07:25:35PM +0100, Arnd Bergmann wrote:

> Apple have also used the same trick on their G5 Macs, presumably
> to simplify things for OS9 and OS-X, but even at the time making
> it harder for Linux.

I actually think some x86 systems have done the same thing; however, the
firmware hides all that from the OS and the OS just sees a normal PCI
environment. So if we can't hide the mess in firmware, the next best
place is in drivers?

> Do we even need stable domain numbers? If we do, aliases sound fine.
> A more complex method would be to sort them by MMIO window address
> or perhaps by phandle.

PCI ordering has been a bane in the past on x86, so I think stable
domain numbers are certainly desirable. Particularly since the domain
number may be influenced by module load order :( Aliases followed by
sorting by phandle sounds great to me.

Jason
Arnd Bergmann Feb. 18, 2014, 7:13 p.m. UTC | #23
On Tuesday 18 February 2014 11:45:42 Jason Gunthorpe wrote:
> 
> > Do we even need stable domain numbers? If we do, aliases sound fine.
> > A more complex method would be to sort them by MMIO window address
> > or perhaps by phandle.
> 
> PCI ordering has been a bane in the past on x86, so I think stable
> domain numbers is certainly desirable. Particularly since the domain
> number may be influenced by module load order :( Aliases followed by
> sorting by phandle sounds great to me.

Ok, makes sense. Let's make sure we come up with a stable numbering
enforced by DT then, and warn when it's not there (or refuse to
work even).

	Arnd
Bjorn Helgaas Feb. 19, 2014, 12:28 a.m. UTC | #24
On Sat, Feb 15, 2014 at 02:03:26PM +0100, Arnd Bergmann wrote:
> On Friday 14 February 2014 22:00:47 Liviu Dudau wrote:
> > 
> > What I'm going to suggest in my v2 patch (hope to send it before Monday)
> > is a new API in the generic PCI code that will allow you to create a
> > host bridge in a new domain or in the existing domain, with the management
> > of the domain number being done in the generic code.
> > 
> > Something like:
> > 
> >   int create_hostbridge_in_new_domain(....);
> >   int create_hostbridge(....);
> > 
> > with the functions being wrappers around the pci_hostbridge_of_init function
> > that I'm introducing.
> > 
> > What do you think?
> 
> Not sure. It would still keep the decision about whether to use a
> new domain or not in the host bridge driver, but that doesn't seem
> like the right place. The same driver might be used in different
> ways based on what is around it.
> 
> I've also had a look at the MIPS implementation now, which has its
> own way of dealing with this, see arch/mips/pci/pci.c.
> 
> There was also an interesting comment in the MIPS header file:
> 
>         /* For compatibility with current (as of July 2003) pciutils
>            and XFree86. Eventually will be removed. */
>         unsigned int need_domain_info;
> 
> This is over ten years old, so I wonder if we can start assuming that
> domains work out of the box now. All references to problems from 
> PCI domains are about old code (ca. pre-2007) that doesn't understand
> nonzero domains and that has since been fixed. I am pretty sure we
> don't need to ever worry about stuffing multiple host bridges into
> a domain other than the first one, and I also doubt we have to worry
> about the problem at all on arm64 as we won't run old binaries on it
> (or arm32 compat mode binaries that need to manage PCI devices).
> 
> Can anyone with more experience on the subject than me (Bjorn,
> Russell, Jason, ...) think of a reason why we would not want to
> just use a new domain for every host bridge we find?

With ACPI on x86 and ia64, we currently use _SEG directly as the Linux
PCI domain.  Are you proposing that we make the Linux PCI domain
independent of _SEG?

It will look sort of funny to have things like this:

  ACPI: PCI Root Bridge [L000] (_SEG 0 domain 0000 [bus 00-1b])
  ACPI: PCI Root Bridge [L001] (_SEG 0 domain 0001 [bus 1c-37])
  ACPI: PCI Root Bridge [L002] (_SEG 0 domain 0002 [bus 38-53])

where the firmware had _SEG==0 for all the bridges and assigned
non-overlapping bus number ranges, but since nothing in PCI really
depends on the domain, I guess it should work.

Bjorn
Liviu Dudau Feb. 19, 2014, 2:44 a.m. UTC | #25
On Tue, Feb 18, 2014 at 10:41:25AM -0700, Jason Gunthorpe wrote:
> On Sat, Feb 15, 2014 at 02:03:26PM +0100, Arnd Bergmann wrote:
> 
> > Can anyone with more experience on the subject than me (Bjorn,
> > Russell, Jason, ...) think of a reason why we would not want to
> > just use a new domain for every host bridge we find?
> 
> I personaly think we can safely move away from stuffing multiple host
> bridges into a single domain for DT cases. The reasons for doing this
> have long since been superseded.
> 
> Most importantly, I have a feeling keeping a 1:1 relationship between
> domain and driver will making building a proper modular and hot
> pluggable host driver infrastructure in the PCI core significantly
> simpler. The domain object gives a nice natural place to put things in
> sysfs, a natural place to keep function pointers and avoids all the
> messy questions of what happens if probing overlaps bus numbers, how
> do you number things, how do you hot plug downstream busses, etc.
> 
> Having a PCI core driver infrastructure that supports both 'as a
> domain' and 'as a bunch of busses' seems much more complex, and I
> can't really identify what is being gained by continuing to support
> this.
> 
> As far as I know the host bridge stuffing is something that was
> created before domains to solve the problem on embedded of multiple
> PCI host bridges on a SOC/System Controller. I know I have used it
> that way in the distant past (for MIPS).
> 
> However today PCI-SIG has a standard way to describe multi-port
> root-complexes in config space, so we should not need to use the
> multi-bus hack. SOCs with non-compliant HW that *really* need single
> domain can get there: mvebu shows how to write a driver that provides
> a software version of the missing hardware elements. Pushing mess like
> this out of the core code seems like a good strategy.
> 
> The only reason I can think of to avoid using a domain is if Linux has
> to interface with external firmware that uses bus:device.function
> notation for coding information. (eg Intel-MP tables on x86 encode
> interrupt routing use B:D.F) In this case Linux would need a way to
> map between Linux B:D.F and the Firmware B:D.F, or it would need to
> use the Firmware B:D.F layout. But this argument does not apply to DT
> systems as DT encodes the domain too. Presumably ACPI will be the
> same.
> 
> Also, bear in mind we now already have multi-domain host drivers for
> ARM, so the multi-platform kernels need to have this option turned on.
> 
> So Liviu, I would say the API should be similar to what we see in
> other OF-enabled driver-based subsystems: call the core code with a
> platform_device pointer and function_ops pointer and have the core
> code create a domain, figure out the domain # from the DT (via
> aliases?), and so on.

I wish things were easier!

Let's look at the 'int pci_domain_nr(struct pci_bus *bus);' function. It is
used to obtain the domain number of the bus passed as an argument.

- include/linux/pci.h defines it as an inline function returning zero if
  !CONFIG_PCI || !CONFIG_PCI_DOMAINS. Otherwise it is silent on what the
  function might look like.

- alpha, ia64, microblaze, mips, powerpc and tile all define it as a cast
  of bus->sysdata to "struct pci_controller *" and then access a data
  member from there to get the domain number. But ... the pci_controller
  structure is different for each architecture, with a few more members in
  addition that might actually be shared with generic framework code (the
  common pattern is sketched after this list).

- arm, s390, sparc and x86 all have different names for their sysdata,
  but they all contain a member that is used for giving a domain
  back. sparc gets an honorary mention here for getting the API wrong
  and returning -ENXIO in certain conditions (not like the generic code
  cares).
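
Roughly the recurring pattern (illustrative only -- the struct and field
names differ on each architecture):

  static inline int pci_domain_nr(struct pci_bus *bus)
  {
          struct pci_controller *hose = bus->sysdata;

          return hose->global_number;     /* e.g. what powerpc calls it */
  }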

That takes care of the implementation. But what about usage?

- drivers/pci/probe.c: pci_create_root_bus allocates a new bus structure,
  sets up the sysdata and ops member pointers, and then immediately tries
  to check whether the newly created bus already exists, using the bus
  number given as a parameter and pci_domain_nr() on the new bus
  structure. I'm trying really hard to figure out
  what was the intention here, but from the point of view of someone
  trying to implement the pci_domain_nr() function I have no idea what
  to return for a virgin bus structure that is not yet attached to any
  parent.

So I can see the intent of what Jason is proposing and I'm heading that
way myself, but I think I need to clean up pci_create_root_bus first
(change the creation order between bridge and bus). And if someone has
a good idea on how to determine the domain # from DT we can pluck it
into the pcibios_root_bridge_prepare() function (either the generic
version or the arch specific one).

Best regards,
Liviu


> 
> Jason
> 


Jason Gunthorpe Feb. 19, 2014, 6:48 a.m. UTC | #26
On Wed, Feb 19, 2014 at 02:44:27AM +0000, Liviu Dudau wrote:
 
> I wish things were easier!

Yah, I've seen some of this complexity too..

A thought that crosses my head is to try and leave the entire arch
support intact and build a separate 'domain-driver-based' support
that completely replaces the arch support; a domain would use one or
the other but never both.

So pci_domain_nr could become:

  struct pci_domain_driver *drv = pci_get_domain_driver(bus);
  if (drv)
       return drv->domain_nr;
  else
       return _arch_pci_domain_nr(bus);

And other funcs would change effectively the same way.

> And if someone has a good idea on how to determine the domain # from
> DT we can pluck it into the pcibios_root_bridge_prepare() function
> (either the generic version or the arch specific one).

You can probably start with 'of_alias_get_id'..
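
Something along these lines, perhaps (just a sketch; assumes "pciN"
aliases in the DT and a made-up fallback counter):

  #include <linux/atomic.h>
  #include <linux/of.h>

  /* Stable domain number from an alias such as
   *      aliases { pci0 = &pci_host0; pci1 = &pci_host1; };
   * falling back to a simple counter when no alias exists.
   */
  static int of_pci_domain_nr(struct device_node *np)
  {
          static atomic_t next_domain = ATOMIC_INIT(0);
          int id = of_alias_get_id(np, "pci");

          return id >= 0 ? id : atomic_inc_return(&next_domain) - 1;
  }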

Jason
Arnd Bergmann Feb. 19, 2014, 9:58 a.m. UTC | #27
On Tuesday 18 February 2014 17:28:14 Bjorn Helgaas wrote:
> On Sat, Feb 15, 2014 at 02:03:26PM +0100, Arnd Bergmann wrote:
> > On Friday 14 February 2014 22:00:47 Liviu Dudau wrote:
> > 
> > Can anyone with more experience on the subject than me (Bjorn,
> > Russell, Jason, ...) think of a reason why we would not want to
> > just use a new domain for every host bridge we find?
> 
> With ACPI on x86 and ia64, we currently use _SEG directly as the Linux
> PCI domain.  Are you proposing that we make the Linux PCI domain
> independent of _SEG?

I don't think we should change anything for ACPI.
The point is that we don't currently have the equivalent of _SEG
for DT probing. The best two options we have are:

a) introduce a 'pci-segment' property with the same meaning as _SEG
on ACPI, and use that as the domain number

b) don't introduce the segment concept on DT but just give each
host bridge its own domain.

The second one seems a little easier to implement, and I don't
see what _SEG is used for other than to avoid having domains
when you don't need them. Is there more to it that I'm missing?

> It will look sort of funny to have things like this:
> 
>   ACPI: PCI Root Bridge [L000] (_SEG 0 domain 0000 [bus 00-1b])
>   ACPI: PCI Root Bridge [L001] (_SEG 0 domain 0001 [bus 1c-37])
>   ACPI: PCI Root Bridge [L002] (_SEG 0 domain 0002 [bus 38-53])
> 
> where the firmware had _SEG==0 for all the bridges and assigned
> non-overlapping bus number ranges, but since nothing in PCI really
> depends on the domain, I guess it should work.

I would expect that with DT, this would look like

Root bridge 0: domain 0 [bus 0-1b]
Root bridge 1: domain 1 [bus 0-1b]
Root bridge 2: domain 2 [bus 0-1b]

Since you wouldn't have a reason to use a bus range starting at
a nonzero number.

	Arnd
Bjorn Helgaas Feb. 19, 2014, 6:20 p.m. UTC | #28
On Wed, Feb 19, 2014 at 2:58 AM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Tuesday 18 February 2014 17:28:14 Bjorn Helgaas wrote:
>> On Sat, Feb 15, 2014 at 02:03:26PM +0100, Arnd Bergmann wrote:
>> > On Friday 14 February 2014 22:00:47 Liviu Dudau wrote:
>> >
>> > Can anyone with more experience on the subject than me (Bjorn,
>> > Russell, Jason, ...) think of a reason why we would not want to
>> > just use a new domain for every host bridge we find?
>>
>> With ACPI on x86 and ia64, we currently use _SEG directly as the Linux
>> PCI domain.  Are you proposing that we make the Linux PCI domain
>> independent of _SEG?
>
> I don't think we should change anything for ACPI.
> The point is that we don't currently have the equivalent of _SEG
> for DT probing. The best two options we have are:
>
> a) introduce a 'pci-segment' property with the same meaning as _SEG
> on ACPI, and use that as the domain number
>
> b) don't introduce the segment concept on DT but just give each
> host bridge its own domain.
>
> The second one seems a little easier to implement, and I don't
> see what _SEG is used for other than to avoid having domains
> when you don't need them. Is there more to it that I'm missing?

Not really, but I do have a question related to OS management of host
bridge bus number apertures.  Currently, Linux never changes a host
bridge's bus number range, but it's conceivable that we could in some
hotplug scenario.  ACPI does provide a mechanism to do so (_PRS,
_SRS), and other host bridge drivers could also do this by programming
CSRs to change the bus number range.  The PCI domain is the logical
place to manage allocation of the 00-ff range of bus numbers.

1) x86 platforms may have constraints because PCIBIOS and the 0xcf8
config access mechanism are unaware of segments.  If a platform has to
support legacy OSes that use those, it can't reuse bus numbers even in
different segment groups.  The platform might have to use multiple
segments to allow multiple ECAM regions, but use _PRS/_SRS to prevent
bus number overlaps to keep legacy config access working.  Obviously
this is only an issue if there are non-segment aware config access
methods.

2) If two host bridges share an ECAM region, I think we're forced to
put them in the same domain: if we put them in different domains,
Linux might assign [bus 00-ff] to both bridges, and ECAM config
accesses would only work for one of the bridges.  This is quite common
on x86 and is a potential issue for any architecture.

Bjorn
Arnd Bergmann Feb. 19, 2014, 7:06 p.m. UTC | #29
On Wednesday 19 February 2014 11:20:19 Bjorn Helgaas wrote:
> On Wed, Feb 19, 2014 at 2:58 AM, Arnd Bergmann <arnd@arndb.de> wrote:
> > On Tuesday 18 February 2014 17:28:14 Bjorn Helgaas wrote:

> > The second one seems a little easier to implement, and I don't
> > see what _SEG is used for other than to avoid having domains
> > when you don't need them. Is there more to it that I'm missing?
> 
> Not really, but I do have a question related to OS management of host
> bridge bus number apertures.  Currently, Linux never changes a host
> bridge's bus number range, but it's conceivable that we could in some
> hotplug scenario.  ACPI does provide a mechanism to do so (_PRS,
> _SRS), and other host bridge drivers could also do this by programming
> CSRs to change the bus number range.  The PCI domain is the logical
> place to manage allocation of the 00-ff range of bus numbers.
> 
> 1) x86 platforms may have constraints because PCIBIOS and the 0xcf8
> config access mechanism are unaware of segments.  If a platform has to
> support legacy OSes that use those, it can't reuse bus numbers even in
> different segment groups.  The platform might have to use multiple
> segments to allow multiple ECAM regions, but use _PRS/_SRS to prevent
> bus number overlaps to keep legacy config access working.  Obviously
> this is only an issue if there are non-segment aware config access
> methods.

Right, I don't think this will be an issue outside of x86/ia64/alpha,
since on all other architectures I'm aware of you have no PCIBIOS
and each host controller would also have its own config space.
Even host controllers using 0xcf8 would be fine because each host
bridge normally has its own I/O space.

> 2) If two host bridges share an ECAM region, I think we're forced to
> put them in the same domain: if we put them in different domains,
> Linux might assign [bus 00-ff] to both bridges, and ECAM config
> accesses would only work for one of the bridges.  This is quite common
> on x86 and is a potential issue for any architecture.

Right, this is an interesting case indeed, and I think we haven't
considered it in the binding so far. We already encode a "bus-range"
in DT, so we can easily partition the ECAM config space, but it
still violates one of the two assumptions:
a) that the register ranges for the two host bridge devices are
   non-overlapping in DT
b) that the ECAM register range as specified in DT starts at bus
   0 and is a power-of-two size.
Since the binding is not fixed yet, we could change the definition to
say that the ECAM register range in the "reg" property must match
the buses listed in the "bus-range" property.
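
For the config accessor that would mean something like the sketch below,
where the mapped window corresponds to the first bus listed in "bus-range"
rather than to bus 0 (hypothetical names):

  static void __iomem *ecam_cfg_addr(void __iomem *win, u8 first_bus,
                                     u8 bus, unsigned int devfn, int where)
  {
          /* devfn already encodes device << 3 | function */
          return win + (((bus - first_bus) << 20) | (devfn << 12) | where);
  }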

I still want to make sure I understand exactly what this case is about
though, i.e. what is shared and what is separate if you have two host
bridges with a common ECAM region:

* I assume I/O space is always shared on x86, but probably separate
  elsewhere.
* Each host would always have a fixed memory space aperture, right?
* From what I understand from your description, the hardware does
  not enforce specific bus numbers for each host. How does the
  host bridge know its root bus number then?
* Should I expect one IOMMU per host bridge or one ECAM region,
  or can either be possible?
* The IntA-IntB IRQ numbers are always per host bridge I assume.
* Memory space on one host bridge is visible to bus master DMA
  from a device on another host bridge on x86, right? I assume
  this won't normally be the case on other architectures.

	Arnd
Bjorn Helgaas Feb. 19, 2014, 8:18 p.m. UTC | #30
On Wed, Feb 19, 2014 at 12:06 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Wednesday 19 February 2014 11:20:19 Bjorn Helgaas wrote:
>> On Wed, Feb 19, 2014 at 2:58 AM, Arnd Bergmann <arnd@arndb.de> wrote:
>> > On Tuesday 18 February 2014 17:28:14 Bjorn Helgaas wrote:
>
>> > The second one seems a little easier to implement, and I don't
>> > see what _SEG is used for other than to avoid having domains
>> > when you don't need them. Is there more to it that I'm missing?
>>
>> Not really, but I do have a question related to OS management of host
>> bridge bus number apertures.  Currently, Linux never changes a host
>> bridge's bus number range, but it's conceivable that we could in some
>> hotplug scenario.  ACPI does provide a mechanism to do so (_PRS,
>> _SRS), and other host bridge drivers could also do this by programming
>> CSRs to change the bus number range.  The PCI domain is the logical
>> place to manage allocation of the 00-ff range of bus numbers.
>>
>> 1) x86 platforms may have constraints because PCIBIOS and the 0xcf8
>> config access mechanism are unaware of segments.  If a platform has to
>> support legacy OSes that use those, it can't reuse bus numbers even in
>> different segment groups.  The platform might have to use multiple
>> segments to allow multiple ECAM regions, but use _PRS/_SRS to prevent
>> bus number overlaps to keep legacy config access working.  Obviously
>> this is only an issue if there are non-segment aware config access
>> methods.
>
> Right, I don't think this will be an issue outside of x86/ia64/alpha,
> since on all other architectures I'm aware of you have no PCIBIOS
> and each host controller would also have its own config space.
> Even host controllers using 0xcf8 would be fine because each host
> bridge normally has its own I/O space.
>
>> 2) If two host bridges share an ECAM region, I think we're forced to
>> put them in the same domain: if we put them in different domains,
>> Linux might assign [bus 00-ff] to both bridges, and ECAM config
>> accesses would only work for one of the bridges.  This is quite common
>> on x86 and is a potential issue for any architecture.
>
> Right, this is an interesting case indeed, and I think we haven't
> considered it in the binding so far. We already encode a "bus-range"
> in DT, so we can easily partition the ECAM config space, but it
> still violates one of the two assumptions:
> a) that the register ranges for the two host bridge devices are
>    non-overlapping in DT
> b) that the ECAM register range as specified in DT starts at bus
>    0 and is a power-of-two size.
> Since the binding is not fixed that, we could change the definition to
> say that the ECAM register range in the "reg" property must match
> the buses listed in the "bus-range" property.

Addresses in the ACPI MCFG table correspond to bus number 0, but the
MCFG also provides start & end bus numbers, so the valid range does
not necessarily start with bus 0 and need not be power-of-two in size.
 Something similar sounds like a good idea for DT.

> I still want to make sure I understand exactly what this case is about
> though, i.e. what is shared and what is separate if you have two host
> bridges with a common ECAM region:
>
> * I assume I/O space is always shared on x86, but probably separate
>   elsewhere.

I think x86 *could* support multiple I/O spaces, but user-mode
inb/outb could only reach the 0-64K space.  I think ia64 has the same
limitation on user code, but it supports many spaces in the kernel.

> * Each host would always have a fixed memory space aperture, right?

The ACPI _CRS/_PRS/_SRS mechanism theoretically allows changes to the
bus number, I/O space, and memory space apertures of host bridges.
But we don't do any of those changes today, and I don't know if any
BIOSes actually allow it.

> * From what I understand from your description, the hardware does
>   not enforce specific bus numbers for each host. How does the
>   host bridge know its root bus number then?

I don't know details of any specific hardware.  I'm just saying that
ACPI provides a mechanism for the OS to manipulate the bus number
range below a host bridge.  Of course, a BIOS is free to omit _PRS and
_SRS, and in that case, the bus/IO/memory apertures reported by _CRS
are fixed and can't be changed.  We learn the root bus number from the
host bridge _CRS (but I'm sure you knew that, so maybe I missed the
point of your question).

> * Should I expect one IOMMU per host bridge or one ECAM region,
>   or can either be possible?

It's possible to have multiple IOMMUs per host bridge, and I think
they can even be buried down in the PCIe hierarchy.

> * The IntA-IntB IRQ numbers are always per host bridge I assume.

For conventional PCI, INTx are just wires that could go anywhere, so
there's no connection between them and a host bridge.  You have to
have a _PRT or similar to make sense out of them.  For PCIe, a Root
Complex maps INTx emulation messages to system interrupts in an
implementation-specific way, so we need a _PRT there, too.  I don't
think there's a requirement that these IRQ numbers be per-host bridge.

> * Memory space on one host bridge is visible to bus master DMA
>   from a device on another host bridge on x86, right? I assume
>   this won't normally be the case on other architectures.

I think this is also implementation dependent, and I'm not aware of an
ACPI or other generic way to learn what a particular platform does.
It seems like this was an issue for MPS configuration, where
peer-to-peer DMA is a problem because we don't really know how Root
Complexes handle peer-to-peer transactions.

Bjorn
Arnd Bergmann Feb. 19, 2014, 8:48 p.m. UTC | #31
On Wednesday 19 February 2014 13:18:24 Bjorn Helgaas wrote:
> >
> > Right, this is an interesting case indeed, and I think we haven't
> > considered it in the binding so far. We already encode a "bus-range"
> > in DT, so we can easily partition the ECAM config space, but it
> > still violates one of the two assumptions:
> > a) that the register ranges for the two host bridge devices are
> >    non-overlapping in DT
> > b) that the ECAM register range as specified in DT starts at bus
> >    0 and is a power-of-two size.
> > Since the binding is not fixed that, we could change the definition to
> > say that the ECAM register range in the "reg" property must match
> > the buses listed in the "bus-range" property.
> 
> Addresses in the ACPI MCFG table correspond to bus number 0, but the
> MCFG also provides start & end bus numbers, so the valid range does
> not necessarily start with bus 0 and need not be power-of-two in size.
>  Something similar sounds like a good idea for DT.

Hmm, we'll have to think about that. From a DT perspective, we try
to keep things local to the node using it, so listing only the
registers we are allowed to access is more natural.

Another option would be to have a separate device node for the ECAM
registers and point to that from the host controller node, which
would describe this cleanly but also add a bit of complexity that
will rarely be used.

> > I still want to make sure I understand exactly what this case is about
> > though, i.e. what is shared and what is separate if you have two host
> > bridges with a common ECAM region:
> >
> > * I assume I/O space is always shared on x86, but probably separate
> >   elsewhere.
> 
> I think x86 *could* support multiple I/O spaces, but user-mode
> inb/outb could only reach the 0-64K space.  I think ia64 has the same
> limitation on user code, but it supports many spaces in the kernel.

Ok.

> > * Each host would always have a fixed memory space aperture, right?
> 
> The ACPI _CRS/_PRS/_SRS mechanism theoretically allows changes to the
> bus number, I/O space, and memory space apertures of host bridges.
> But we don't do any of those changes today, and I don't know if any
> BIOSes actually allow it.

I mean non-overlapping apertures in particular. We also have
cases where the aperture that we list in DT is just programmed
into hardware registers by the host driver and could be arbitrary,
but you can't normally have the same MMIO address go to two
devices on internal buses (or not in a sensible way).

> > * From what I understand from your description, the hardware does
> >   not enforce specific bus numbers for each host. How does the
> >   host bridge know its root bus number then?
> 
> I don't know details of any specific hardware.  I'm just saying that
> ACPI provides a mechanism for the OS to manipulate the bus number
> range below a host bridge.  Of course, a BIOS is free to omit _PRS and
> _SRS, and in that case, the bus/IO/memory apertures reported by _CRS
> are fixed and can't be changed.  We learn the root bus number from the
> host bridge _CRS (but I'm sure you knew that, so maybe I missed the
> point of your question).

I guess the answer then is that the host bridge can have a register
for programming the root bus number, but it's not standardized and
therefore the access is hidden in the _PRS/_SRS methods. If we have
the same on DT and want to reprogram the bus numbers, we'd need to
have a kernel driver for the nonstandard registers of the host bridge.

> > * Should I expect one IOMMU per host bridge or one ECAM region,
> >   or can either be possible?
> 
> It's possible to have multiple IOMMUs per host bridge, and I think
> they can even be buried down in the PCIe hierarchy.

Oh, I didn't know that. So how do you actually find the IOMMU for
a given domain/bus/device/function combination?

> > * The IntA-IntB IRQ numbers are always per host bridge I assume.
> 
> For conventional PCI, INTx are just wires that could go anywhere, so
> there's no connection between them and a host bridge.  You have to
> have a _PRT or similar to make sense out of them.  For PCIe, a Root
> Complex maps INTx emulation messages to system interrupts in an
> implementation-specific way, so we need a _PRT there, too.  I don't
> think there's a requirement that these IRQ numbers be per-host bridge.

Right, makes sense.

Thanks for the detailed explanations!

	Arnd
Jason Gunthorpe Feb. 19, 2014, 9:10 p.m. UTC | #32
On Wed, Feb 19, 2014 at 09:48:48PM +0100, Arnd Bergmann wrote:

> Hmm, we'll have to think about that. From a DT perspective, we try
> to keep things local to the node using it, so listing only the
> registers we are allowed to access is more natural.

If I understand the restriction properly, in a modern PCI-E world it
boils down to a limition on the configuration of each PCI-PCI root
port bridge (eg, a limited range of valid bus values, and apertures)

AFAIK it comes from the hidden per-socket routing registers that the
firmware configures. Range X->Y (bus #, IO and MMIO) will be routed to
a specific physical socket, and then the PCI-E bridges in that socket
claim the transaction based on their local config to select the
ultimate egress port.

So describing and restricting the bridge DT node itself, under a
single top level PCI domain stanza seems pretty reasonable.

As does containing the restrictions in a HW driver with knowledge of
the hidden registers, especially for firmware-less embedded. This is
part of what mvebu is doing already.

Jason
Bjorn Helgaas Feb. 19, 2014, 9:33 p.m. UTC | #33
On Wed, Feb 19, 2014 at 1:48 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Wednesday 19 February 2014 13:18:24 Bjorn Helgaas wrote:
>> >
>> > Right, this is an interesting case indeed, and I think we haven't
>> > considered it in the binding so far. We already encode a "bus-range"
>> > in DT, so we can easily partition the ECAM config space, but it
>> > still violates one of the two assumptions:
>> > a) that the register ranges for the two host bridge devices are
>> >    non-overlapping in DT
>> > b) that the ECAM register range as specified in DT starts at bus
>> >    0 and is a power-of-two size.
>> > Since the binding is not fixed that, we could change the definition to
>> > say that the ECAM register range in the "reg" property must match
>> > the buses listed in the "bus-range" property.
>>
>> Addresses in the ACPI MCFG table correspond to bus number 0, but the
>> MCFG also provides start & end bus numbers, so the valid range does
>> not necessarily start with bus 0 and need not be power-of-two in size.
>>  Something similar sounds like a good idea for DT.
>
> Hmm, we'll have to think about that. From a DT perspective, we try
> to keep things local to the node using it, so listing only the
> registers we are allowed to access is more natural.

The combination of MCFG base address for bus 00 and the bus number
range from _CRS, plus the obvious offset computation does effectively
describe the registers you're allowed to access; it's just up to the
OS to compute the offsets.  _CBA (an optional method that returns the
ECAM address for a hot-added host bridge) uses the same bus 00 base.
My guess is that _CBA uses a bus number 00 base so it can return a
constant, regardless of whether the OS changes the bus number range.
If _CBA returned the ECAM base for the current bus number aperture, it
would be dependent on _CRS (the current settings), and the OS would
have to re-evaluate _CBA if it ever changed the bus number aperture.

>> > * Each host would always have a fixed memory space aperture, right?
>>
>> The ACPI _CRS/_PRS/_SRS mechanism theoretically allows changes to the
>> bus number, I/O space, and memory space apertures of host bridges.
>> But we don't do any of those changes today, and I don't know if any
>> BIOSes actually allow it.
>
> I mean non-overlapping apertures in particular. We also have
> cases where the aperture that we list in DT is just programmed
> into hardware registers by the host driver and could be arbitrary,
> but you can't normally have the same MMIO address go to two
> devices on internal buses (or not in a sensible way).

I don't remember specific spec statements about that, but I can't
imagine how to make sense of an address that's claimed by two devices.

>> > * From what I understand from your description, the hardware does
>> >   not enforce specific bus numbers for each host. How does the
>> >   host bridge know its root bus number then?
>>
>> I don't know details of any specific hardware.  I'm just saying that
>> ACPI provides a mechanism for the OS to manipulate the bus number
>> range below a host bridge.  Of course, a BIOS is free to omit _PRS and
>> _SRS, and in that case, the bus/IO/memory apertures reported by _CRS
>> are fixed and can't be changed.  We learn the root bus number from the
>> host bridge _CRS (but I'm sure you knew that, so maybe I missed the
>> point of your question).
>
> I guess the answer then is that the host bridge can have a register
> for programming the root bus number, but it's not standardized and
> therefore the access is hidden in the _PRS/_SRS methods. If we have
> the same on DT and want to reprogram the bus numbers, we'd need to
> have a kernel driver for the nonstandard registers of the host bridge.

Exactly; that's my mental model of how it works: _CRS/_PRS/_SRS are
basically accessors for generalized BARs.

>> > * Should I expect one IOMMU per host bridge or one ECAM region,
>> >   or can either be possible?
>>
>> It's possible to have multiple IOMMUs per host bridge, and I think
>> they can even be buried down in the PCIe hierarchy.
>
> Oh, I didn't know that. So how do you actually find the IOMMU for
> a given domain/bus/device/function combination?

For VT-d on x86, there's a DMAR table that describes the remapping
units (IOMMUs), and each has a list of associated devices.  This is
one place where the FW/OS interface uses segment and bus numbers.
There's something different for AMD IOMMUs, but I think it also
involves looking up the device in a table from the firmware.

 Bjorn
Arnd Bergmann Feb. 19, 2014, 10:12 p.m. UTC | #34
On Wednesday 19 February 2014 14:33:54 Bjorn Helgaas wrote:
> On Wed, Feb 19, 2014 at 1:48 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> > On Wednesday 19 February 2014 13:18:24 Bjorn Helgaas wrote:
> >> >
> >> > Right, this is an interesting case indeed, and I think we haven't
> >> > considered it in the binding so far. We already encode a "bus-range"
> >> > in DT, so we can easily partition the ECAM config space, but it
> >> > still violates one of the two assumptions:
> >> > a) that the register ranges for the two host bridge devices are
> >> >    non-overlapping in DT
> >> > b) that the ECAM register range as specified in DT starts at bus
> >> >    0 and is a power-of-two size.
> >> > Since the binding is not fixed that, we could change the definition to
> >> > say that the ECAM register range in the "reg" property must match
> >> > the buses listed in the "bus-range" property.
> >>
> >> Addresses in the ACPI MCFG table correspond to bus number 0, but the
> >> MCFG also provides start & end bus numbers, so the valid range does
> >> not necessarily start with bus 0 and need not be power-of-two in size.
> >>  Something similar sounds like a good idea for DT.
> >
> > Hmm, we'll have to think about that. From a DT perspective, we try
> > to keep things local to the node using it, so listing only the
> > registers we are allowed to access is more natural.
> 
> The combination of MCFG base address for bus 00 and the bus number
> range from _CRS, plus the obvious offset computation does effectively
> describe the registers you're allowed to access; it's just up to the
> OS to compute the offsets.

That's not what I meant. The 'reg' property is supposed to list the
registers that are exclusively owned by the device. We normally
expect that we want to request_mem_region() and ioremap() the entire
range for each device, which doesn't make sense if we only want a
small part of it, or if another device owns the same registers.

Having a separate device that just owns the ECAM region would work
fine, because then it can ioremap that once and get referenced
by the host bridge drivers.

> >> > * Should I expect one IOMMU per host bridge or one ECAM region,
> >> >   or can either be possible?
> >>
> >> It's possible to have multiple IOMMUs per host bridge, and I think
> >> they can even be buried down in the PCIe hierarchy.
> >
> > Oh, I didn't know that. So how do you actually find the IOMMU for
> > a given domain/bus/device/function combination?
> 
> For VT-d on x86, there's a DMAR table that describes the remapping
> units (IOMMUs), and each has a list of associated devices.  This is
> one place where the FW/OS interface uses segment and bus numbers.
> There's something different for AMD IOMMUs, but I think it also
> involves looking up the device in a table from the firmware.

Ok, that seems complicated. I hope we don't have to get this deep
on ARM and can keep it on a per host-bridge basis rather than having
to get down all the way to the device. We wouldn't be able to encode
IOMMUs for hotplugged devices in DT, because the firmware is no
longer running by the time they show up.

	Arnd
Bjorn Helgaas Feb. 19, 2014, 10:18 p.m. UTC | #35
On Wed, Feb 19, 2014 at 3:12 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Wednesday 19 February 2014 14:33:54 Bjorn Helgaas wrote:
>> On Wed, Feb 19, 2014 at 1:48 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>> > On Wednesday 19 February 2014 13:18:24 Bjorn Helgaas wrote:
>> >> >
>> >> > Right, this is an interesting case indeed, and I think we haven't
>> >> > considered it in the binding so far. We already encode a "bus-range"
>> >> > in DT, so we can easily partition the ECAM config space, but it
>> >> > still violates one of the two assumptions:
>> >> > a) that the register ranges for the two host bridge devices are
>> >> >    non-overlapping in DT
>> >> > b) that the ECAM register range as specified in DT starts at bus
>> >> >    0 and is a power-of-two size.
>> >> > Since the binding is not fixed that, we could change the definition to
>> >> > say that the ECAM register range in the "reg" property must match
>> >> > the buses listed in the "bus-range" property.
>> >>
>> >> Addresses in the ACPI MCFG table correspond to bus number 0, but the
>> >> MCFG also provides start & end bus numbers, so the valid range does
>> >> not necessarily start with bus 0 and need not be power-of-two in size.
>> >>  Something similar sounds like a good idea for DT.
>> >
>> > Hmm, we'll have to think about that. From a DT perspective, we try
>> > to keep things local to the node using it, so listing only the
>> > registers we are allowed to access is more natural.
>>
>> The combination of MCFG base address for bus 00 and the bus number
>> range from _CRS, plus the obvious offset computation does effectively
>> describe the registers you're allowed to access; it's just up to the
>> OS to compute the offsets.
>
> That's not what I meant. The 'reg' property is supposed to list the
> registers that are exclusively owned by the device. We normally
> expect that we want to request_mem_region() and ioremap() the entire
> range for each device, which doesn't make sense if we only want a
> small part of it, or if another device owns the same registers.

Oh, right, that makes good sense.  The ACPI approach means we have to
have special-case code for that; it can't be handled by
request_mem_region() in a generic _CRS parser.
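
For reference, the computation I have in mind is roughly the sketch
below -- hypothetical helpers, just to show the offset math for an
ECAM base that corresponds to bus 0 plus a [bus_start, bus_end]
range:

#include <linux/io.h>
#include <linux/ioport.h>
#include <linux/types.h>

/*
 * Hypothetical sketch: request and map only the sub-window that the
 * bus range makes valid.  Each bus takes 1M of ECAM space
 * (32 devices * 8 functions * 4K).
 */
static void __iomem *ecam_map_subwindow(phys_addr_t bus0_base,
					u8 bus_start, u8 bus_end)
{
	phys_addr_t start = bus0_base + ((phys_addr_t)bus_start << 20);
	size_t size = (size_t)(bus_end - bus_start + 1) << 20;

	if (!request_mem_region(start, size, "pci-ecam"))
		return NULL;

	return ioremap(start, size);
}

/* Config space address for (bus, devfn, where) within that sub-window. */
static void __iomem *ecam_addr(void __iomem *win, u8 bus_start, u8 bus,
			       unsigned int devfn, int where)
{
	return win + (((bus - bus_start) << 20) | (devfn << 12) | where);
}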

Bjorn

Patch

diff --git a/Documentation/devicetree/bindings/pci/arm-generic-pci.txt b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
new file mode 100644
index 000000000000..cc7a35ecfa2d
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/arm-generic-pci.txt
@@ -0,0 +1,51 @@ 
+* ARM generic PCI host controller
+
+Firmware-initialised PCI host controllers and PCI emulations, such as the
+virtio-pci implementations found in kvmtool and other para-virtualised
+systems, do not require driver support for complexities such as regulator and
+clock management. In fact, the controller may not even require the
+configuration of a control interface by the operating system, instead
+presenting a set of fixed windows describing a subset of IO, Memory and
+Configuration Spaces.
+
+Such a controller can be described purely in terms of the standardized device
+tree bindings communicated in pci.txt:
+
+- compatible     : Must be "arm,pci-cam-generic" or "arm,pci-ecam-generic"
+                   depending on the layout of configuration space (CAM vs
+                   ECAM respectively)
+
+- ranges         : As described in IEEE Std 1275-1994, but must provide
+                   at least a definition of one or both of IO and Memory
+                   Space.
+
+- #address-cells : Must be 3
+
+- #size-cells    : Must be 2
+
+- reg            : The Configuration Space base address, as accessed by the
+                   parent bus.
+
+Configuration Space is assumed to be memory-mapped (as opposed to being
+accessed via an ioport) and laid out with a direct correspondence to the
+geography of a PCI bus address by concatenating the various components to form
+an offset.
+
+For CAM, this 24-bit offset is:
+
+        cfg_offset(bus, device, function, register) =
+                   bus << 16 | device << 11 | function << 8 | register
+
+Whilst ECAM extends this by 4 bits to accommodate 4K of space per function:
+
+        cfg_offset(bus, device, function, register) =
+                   bus << 20 | device << 15 | function << 12 | register
+
+Interrupt mapping is exactly as described in `Open Firmware Recommended
+Practice: Interrupt Mapping' and requires the following properties:
+
+- #interrupt-cells   : Must be 1
+
+- interrupt-map      : <see aforementioned specification>
+
+- interrupt-map-mask : <see aforementioned specification>
diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
index 47d46c6d8468..491d74c36f6a 100644
--- a/drivers/pci/host/Kconfig
+++ b/drivers/pci/host/Kconfig
@@ -33,4 +33,11 @@  config PCI_RCAR_GEN2
 	  There are 3 internal PCI controllers available with a single
 	  built-in EHCI/OHCI host controller present on each one.
 
+config PCI_ARM_GENERIC
+	bool "ARM generic PCI host controller"
+	depends on ARM && OF
+	help
+	  Say Y here if you want to support a simple generic PCI host
+	  controller, such as the one emulated by kvmtool.
+
 endmenu
diff --git a/drivers/pci/host/Makefile b/drivers/pci/host/Makefile
index 13fb3333aa05..17f5555f8a29 100644
--- a/drivers/pci/host/Makefile
+++ b/drivers/pci/host/Makefile
@@ -4,3 +4,4 @@  obj-$(CONFIG_PCI_IMX6) += pci-imx6.o
 obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
 obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
 obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o
+obj-$(CONFIG_PCI_ARM_GENERIC) += pci-arm-generic.o
diff --git a/drivers/pci/host/pci-arm-generic.c b/drivers/pci/host/pci-arm-generic.c
new file mode 100644
index 000000000000..31ce03ee2607
--- /dev/null
+++ b/drivers/pci/host/pci-arm-generic.c
@@ -0,0 +1,318 @@ 
+/*
+ * Simple, generic PCI host controller driver targeting firmware-initialised
+ * systems and virtual machines (e.g. the PCI emulation provided by kvmtool).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) 2014 ARM Limited
+ *
+ * The current driver is limited to one I/O space per controller, and
+ * only supports a single controller.
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+#include <linux/platform_device.h>
+
+struct gen_pci_cfg_accessors {
+	void __iomem *(*map_bus)(struct pci_bus *, unsigned int, int);
+	void (*unmap_bus)(struct pci_bus *);
+};
+
+struct gen_pci_cfg_window {
+	u64					cpu_phys;
+	void __iomem				*base;
+	u8					bus;
+	spinlock_t				lock;
+	const struct gen_pci_cfg_accessors	*accessors;
+};
+
+struct gen_pci_resource {
+	struct list_head			list;
+	struct resource				cpu_res;
+	resource_size_t				offset;
+};
+
+struct gen_pci {
+	struct device				*dev;
+	struct resource				*io_res;
+	struct list_head			mem_res;
+	struct gen_pci_cfg_window		cfg;
+};
+
+/*
+ * Configuration space accessors. We support CAM (simply map the entire
+ * 16M space) or ECAM (map a single 1M bus at a time).
+ */
+static void __iomem *gen_pci_map_cfg_bus_cam(struct pci_bus *bus,
+					     unsigned int devfn,
+					     int where)
+{
+	struct pci_sys_data *sys = bus->sysdata;
+	struct gen_pci *pci = sys->private_data;
+	u32 busn = bus->number;
+
+	return pci->cfg.base + ((busn << 16) | (devfn << 8) | where);
+}
+
+static void gen_pci_unmap_cfg_bus_cam(struct pci_bus *bus)
+{
+}
+
+static struct gen_pci_cfg_accessors gen_pci_cfg_cam_accessors = {
+	.map_bus	= gen_pci_map_cfg_bus_cam,
+	.unmap_bus	= gen_pci_unmap_cfg_bus_cam,
+};
+
+static void __iomem *gen_pci_map_cfg_bus_ecam(struct pci_bus *bus,
+					      unsigned int devfn,
+					      int where)
+{
+	struct pci_sys_data *sys = bus->sysdata;
+	struct gen_pci *pci = sys->private_data;
+	u32 busn = bus->number;
+
+	spin_lock(&pci->cfg.lock);
+	if (pci->cfg.bus != busn) {
+		resource_size_t offset;
+
+		devm_iounmap(pci->dev, pci->cfg.base);
+		offset = pci->cfg.cpu_phys + (busn << 20);
+		pci->cfg.base = devm_ioremap(pci->dev, offset, SZ_1M);
+		pci->cfg.bus = busn;
+	}
+
+	return pci->cfg.base + ((devfn << 12) | where);
+}
+
+static void gen_pci_unmap_cfg_bus_ecam(struct pci_bus *bus)
+{
+	struct pci_sys_data *sys = bus->sysdata;
+	struct gen_pci *pci = sys->private_data;
+
+	spin_unlock(&pci->cfg.lock);
+}
+
+static struct gen_pci_cfg_accessors gen_pci_cfg_ecam_accessors = {
+	.map_bus	= gen_pci_map_cfg_bus_ecam,
+	.unmap_bus	= gen_pci_unmap_cfg_bus_ecam,
+};
+
+static int gen_pci_config_read(struct pci_bus *bus, unsigned int devfn,
+				int where, int size, u32 *val)
+{
+	struct pci_sys_data *sys = bus->sysdata;
+	struct gen_pci *pci = sys->private_data;
+	void __iomem *addr = pci->cfg.accessors->map_bus(bus, devfn, where);
+
+	switch (size) {
+	case 1:
+		*val = readb(addr);
+		break;
+	case 2:
+		*val = readw(addr);
+		break;
+	default:
+		*val = readl(addr);
+	}
+
+	pci->cfg.accessors->unmap_bus(bus);
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int gen_pci_config_write(struct pci_bus *bus, unsigned int devfn,
+				 int where, int size, u32 val)
+{
+	struct pci_sys_data *sys = bus->sysdata;
+	struct gen_pci *pci = sys->private_data;
+	void __iomem *addr = pci->cfg.accessors->map_bus(bus, devfn, where);
+
+	switch (size) {
+	case 1:
+		writeb(val, addr);
+		break;
+	case 2:
+		writew(val, addr);
+		break;
+	default:
+		writel(val, addr);
+	}
+
+	pci->cfg.accessors->unmap_bus(bus);
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops gen_pci_ops = {
+	.read	= gen_pci_config_read,
+	.write	= gen_pci_config_write,
+};
+
+static int gen_pci_setup(int nr, struct pci_sys_data *sys)
+{
+	int err;
+	struct gen_pci_resource *res;
+	struct gen_pci *pci = sys->private_data;
+
+	/* Map an initial Configuration Space window */
+	pci->cfg.base = devm_ioremap(pci->dev, pci->cfg.cpu_phys, SZ_1M);
+	if (!pci->cfg.base)
+		return -ENOMEM;
+
+	/* Register our I/O space */
+	if (pci->io_res) {
+		resource_size_t offset = nr * SZ_64K;
+
+		err = request_resource(&ioport_resource, pci->io_res);
+		if (err)
+			goto out_unmap_cfg_space;
+
+		err = pci_ioremap_io(offset, pci->io_res->start);
+		if (err)
+			goto out_release_io_res;
+
+		pci_add_resource_offset(&sys->resources, pci->io_res, offset);
+	}
+
+	/* Register our memory resources */
+	list_for_each_entry(res, &pci->mem_res, list) {
+		err = request_resource(&iomem_resource, &res->cpu_res);
+		if (err)
+			goto out_release_mem_res;
+
+		pci_add_resource_offset(&sys->resources,
+					&res->cpu_res,
+					res->offset);
+	}
+
+	return 1;
+out_release_mem_res:
+	list_for_each_entry_continue_reverse(res, &pci->mem_res, list)
+		release_resource(&res->cpu_res);
+out_release_io_res:
+	release_resource(pci->io_res);
+out_unmap_cfg_space:
+	devm_iounmap(pci->dev, pci->cfg.base);
+	return err;
+}
+
+static const struct of_device_id gen_pci_of_match[] = {
+	{ .compatible = "arm,pci-cam-generic",
+	  .data = &gen_pci_cfg_cam_accessors },
+
+	{ .compatible = "arm,pci-ecam-generic",
+	  .data = &gen_pci_cfg_ecam_accessors },
+
+	{ },
+};
+MODULE_DEVICE_TABLE(of, gen_pci_of_match);
+
+static int gen_pci_probe(struct platform_device *pdev)
+{
+	struct hw_pci hw;
+	struct of_pci_range range;
+	struct of_pci_range_parser parser;
+	struct gen_pci *pci;
+	const __be32 *reg;
+	const struct of_device_id *of_id;
+	struct device *dev = &pdev->dev;
+	struct device_node *np = dev->of_node;
+
+	if (!np)
+		return -ENODEV;
+
+	if (of_pci_range_parser_init(&parser, np)) {
+		dev_err(dev, "missing \"ranges\" property\n");
+		return -EINVAL;
+	}
+
+	reg = of_get_property(np, "reg", NULL);
+	if (!reg) {
+		dev_err(dev, "missing \"reg\" property\n");
+		return -EINVAL;
+	}
+
+	pci = devm_kzalloc(dev, sizeof(*pci), GFP_KERNEL);
+	if (!pci)
+		return -ENOMEM;
+
+	pci->cfg.cpu_phys = of_translate_address(np, reg);
+	if (pci->cfg.cpu_phys == OF_BAD_ADDR)
+		return -EINVAL;
+
+	of_id = of_match_node(gen_pci_of_match, np);
+	pci->cfg.accessors = of_id->data;
+	spin_lock_init(&pci->cfg.lock);
+	INIT_LIST_HEAD(&pci->mem_res);
+	pci->dev = dev;
+
+	for_each_of_pci_range(&parser, &range) {
+		struct gen_pci_resource *res;
+		u32 restype = range.flags & IORESOURCE_TYPE_BITS;
+
+		res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
+		if (!res)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&res->list);
+		of_pci_range_to_resource(&range, np, &res->cpu_res);
+
+		switch (restype) {
+		case IORESOURCE_IO:
+			if (pci->io_res) {
+				dev_warn(dev,
+					 "ignoring additional io resource\n");
+				devm_kfree(dev, res);
+			} else {
+				pci->io_res = &res->cpu_res;
+			}
+			break;
+		case IORESOURCE_MEM:
+			res->offset = range.cpu_addr - range.pci_addr;
+			list_add(&res->list, &pci->mem_res);
+			break;
+		default:
+			dev_warn(dev,
+				"ignoring unknown/unsupported resource type %x\n",
+				 restype);
+		}
+	}
+
+	hw = (struct hw_pci) {
+		.nr_controllers	= 1,
+		.private_data	= (void **)&pci,
+		.setup		= gen_pci_setup,
+		.map_irq	= of_irq_parse_and_map_pci,
+		.ops		= &gen_pci_ops,
+	};
+
+	pci_common_init_dev(dev, &hw);
+	return 0;
+}
+
+static struct platform_driver gen_pci_driver = {
+	.driver = {
+		.name = "pci-arm-generic",
+		.owner = THIS_MODULE,
+		.of_match_table = gen_pci_of_match,
+	},
+	.probe = gen_pci_probe,
+};
+module_platform_driver(gen_pci_driver);
+
+MODULE_DESCRIPTION("ARM generic PCI host driver");
+MODULE_AUTHOR("Will Deacon <will.deacon@arm.com>");
+MODULE_LICENSE("GPLv2");