diff mbox series

[6/6] npu2: Add support for relaxed-ordering mode

Message ID 1532702862-22751-7-git-send-email-arbab@linux.ibm.com
State Superseded
Headers show
Series npu2: Add support for relaxed-ordering mode | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied
snowpatch_ozlabs/make_check success Test make_check on branch master

Commit Message

Reza Arbab July 27, 2018, 2:47 p.m. UTC
Some device drivers support out of order access to GPU memory. This does
not affect the CPU view of memory but it does affect the GPU view of
memory. It should only be enabled once the GPU driver has requested it
to be enabled.

Current hardware only allows it to be enabled per PCIe root port,
therefore all devices on a given PCIe root port must be explicitly
enabled for relaxed-ordering prior to it actually being enabled.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
---
 hw/npu2.c           | 273 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/npu2-regs.h |  21 +++-
 include/opal-api.h  |   4 +-
 include/pci.h       |   8 ++
 include/phb4.h      |   3 +
 5 files changed, 303 insertions(+), 6 deletions(-)

Comments

Alistair Popple July 30, 2018, 6:37 a.m. UTC | #1
Looks reasonable to me. A couple of minor comments below, but nothing that would
stop me adding:

Reviewed-by: Alistair Popple <alistair@popple.id.au>

On Friday, 27 July 2018 9:47:42 AM AEST Reza Arbab wrote:
> Some device drivers support out of order access to GPU memory. This does
> not affect the CPU view of memory but it does affect the GPU view of
> memory. It should only be enabled once the GPU driver has requested it
> to be enabled.
> 
> Current hardware only allows it to be enabled per PCIe root port,
> therefore all devices on a given PCIe root port must be explicitly
> enabled for relaxed-ordering prior to it actually being enabled.
> 
> Signed-off-by: Alistair Popple <alistair@popple.id.au>
> Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
> ---
>  hw/npu2.c           | 273 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  include/npu2-regs.h |  21 +++-
>  include/opal-api.h  |   4 +-
>  include/pci.h       |   8 ++
>  include/phb4.h      |   3 +
>  5 files changed, 303 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/npu2.c b/hw/npu2.c
> index 9748536..6ff6e4d 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -38,6 +38,7 @@
>  #include <nvram.h>
>  #include <xive.h>
>  #include <xscom-p9-regs.h>
> +#include <phb4.h>
>  
>  #define NPU2_IRQ_BASE_SHIFT 13
>  #define NPU2_N_DL_IRQS 23
> @@ -919,12 +920,17 @@ static void npu2_hw_init(struct npu2 *p)
>  	}
>  
>  	/* Static initialization of every relaxed-ordering cfg[2] register */
> -	val = NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 |
> +	val = NPU2_RELAXED_ORDERING_CMD_CL_DMA_W |
> +	      NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP |
> +	      NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ |
> +	      NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ |
> +	      NPU2_RELAXED_ORDERING_CMD_DMA_PR_W |
> +	      NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 |
>  	      NPU2_RELAXED_ORDERING_SOURCE4_RDENA;
>  
>  	for (s = NPU2_STACK_STCK_0; s <= NPU2_STACK_STCK_2; s++) {
>  		for (b = NPU2_BLOCK_SM_0; b <= NPU2_BLOCK_SM_3; b++) {
> -			reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG2);
> +			reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG(2));

If I were being pedantic I'd ask for this to be split out into a separate change
as well, but it's obvious enough what's going on here.

>  			npu2_write(p, reg, val);
>  		}
>  	}
> @@ -2253,3 +2259,266 @@ out:
>  	return rc;
>  }
>  opal_call(OPAL_NPU_MAP_LPAR, opal_npu_map_lpar, 4);
> +
> +static inline uint32_t npu2_relaxed_ordering_source_grpchp(uint32_t gcid)
> +{
> +	/* Repack 0bGGGGCCC to 0bGGCC */
> +	return ((gcid & 0x18) >> 1) | (gcid & 0x3);

Reading the rest of the code, it does not seem like the gcid is validated
anywhere. If a gcid gets passed in that the NPU HW doesn't support matching on,
we should error out rather than just dropping bits and trying to filter on some
other "random" gcid.

> +}
> +
> +static uint64_t npu2_relaxed_ordering_cfg_read(struct npu2_dev *ndev, int n)
> +{
> +	uint64_t reg = NPU2_SM_REG_OFFSET(ndev, 0, NPU2_RELAXED_ORDERING_CFG(n));
> +
> +	return npu2_read(ndev->npu, reg);
> +}
> +
> +static void npu2_relaxed_ordering_cfg_write(struct npu2_dev *ndev, int n,
> +					    uint64_t val)
> +{
> +	uint64_t reg;
> +	int sm;
> +
> +	/* Set every register on our stack */
> +	for (sm = NPU2_BLOCK_SM_0; sm <= NPU2_BLOCK_SM_3; sm++) {
> +		reg = NPU2_SM_REG_OFFSET(ndev, sm, NPU2_RELAXED_ORDERING_CFG(n));
> +		npu2_write(ndev->npu, reg, val);
> +	}
> +}
> +
> +/*
> + * Parse the value of a relaxed ordering config register. Returns SOURCE0 or
> + * SOURCE1 register mask if relaxed ordering is set for the given chip/pec.
> + * Returns 0 if unset.
> + */
> +static uint64_t npu2_relaxed_ordering_cfg_enabled(uint64_t val, uint32_t gcid,
> +						  int pec)
> +{
> +	uint32_t src, grpchp;
> +	uint64_t mask;
> +	int i;
> +
> +	for (i = 0; i < 2; i++) {
> +		mask = NPU2_RELAXED_ORDERING_SOURCE(i);
> +		src = GETFIELD(mask, val);
> +
> +		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, src))
> +			continue;
> +
> +		if (GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src) != pec)
> +			continue;
> +
> +		grpchp = GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src);
> +		if (grpchp == npu2_relaxed_ordering_source_grpchp(gcid))
> +			return mask;
> +
> +		if (grpchp == 0xf) /* match all */
> +			return mask;
> +	}
> +
> +	return 0;
> +}
> +
> +static int npu2_enable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
> +					int pec)
> +{
> +	uint64_t val, mask;
> +	uint32_t src;
> +	int rc = OPAL_RESOURCE;
> +	int i;
> +
> +	NPU2DEVINF(ndev, "Enabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
> +	lock(&ndev->npu->lock);
> +
> +	for (i = 0; i < 2; i++) {
> +		val = npu2_relaxed_ordering_cfg_read(ndev, i);
> +		if (!npu2_relaxed_ordering_cfg_enabled(val, gcid, pec))
> +			continue;
> +
> +		/* Already enabled */
> +		rc = OPAL_SUCCESS;
> +		goto out;
> +	}
> +
> +	src = NPU2_RELAXED_ORDERING_SOURCE_WRENA |
> +	      NPU2_RELAXED_ORDERING_SOURCE_RDENA;
> +	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src, pec);
> +	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src,
> +		       npu2_relaxed_ordering_source_grpchp(gcid));
> +	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMIN, src, 0);
> +	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMAX, src, 23);
> +	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMIN, src, 0);
> +	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMAX, src, 47);
> +
> +	/* Find somewhere to write this config */
> +	for (i = 0; i < 2; i++) {
> +		val = npu2_relaxed_ordering_cfg_read(ndev, i);
> +
> +		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA << 32, val))
> +			mask = NPU2_RELAXED_ORDERING_SOURCE(0);
> +		else if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, val))
> +			mask = NPU2_RELAXED_ORDERING_SOURCE(1);
> +		else
> +			continue;
> +
> +		val = SETFIELD(mask, val, src);
> +		npu2_relaxed_ordering_cfg_write(ndev, i, val);
> +
> +		rc = OPAL_SUCCESS;
> +		break;
> +	}
> +
> +out:
> +	unlock(&ndev->npu->lock);
> +	return rc;
> +}
> +
> +static void npu2_disable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
> +					  int pec)
> +{
> +	uint64_t val, mask;
> +	int i;
> +
> +	NPU2DEVINF(ndev, "Disabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
> +	lock(&ndev->npu->lock);
> +
> +	for (i = 0; i < 2; i++) {
> +		val = npu2_relaxed_ordering_cfg_read(ndev, i);
> +
> +		mask = npu2_relaxed_ordering_cfg_enabled(val, gcid, pec);
> +		if (!mask)
> +			continue;
> +
> +		val = SETFIELD(mask, val, 0);
> +		npu2_relaxed_ordering_cfg_write(ndev, i, val);
> +	}
> +
> +	unlock(&ndev->npu->lock);
> +}
> +
> +/*
> + * Enable or disable relaxed ordering on all nvlinks on a given NPU. May leave
> + * relaxed ordering partially enabled if there are insufficient HW resources to
> + * enable it on all links.
> + */

This comment isn't correct as it does so for all nvlinks on all NPUs. Whomever
wrote that originaly clearly didn't know what he was talking about. He also
can't spell "originally" correctly.

> +static int npu2_set_relaxed_ordering(uint32_t gcid, int pec, bool enable)
> +{
> +	int rc = OPAL_SUCCESS;
> +	struct phb *phb;
> +	struct npu2 *npu;
> +	struct npu2_dev *ndev;
> +
> +	for_each_phb(phb) {
> +		if (phb->phb_type != phb_type_npu_v2)
> +			continue;
> +
> +		npu = phb_to_npu2_nvlink(phb);
> +		for (int i = 0; i < npu->total_devices; i++) {
> +			ndev = &npu->devices[i];
> +			if (enable)
> +				rc = npu2_enable_relaxed_ordering(ndev, gcid, pec);
> +			else
> +				npu2_disable_relaxed_ordering(ndev, gcid, pec);
> +
> +			if (rc != OPAL_SUCCESS) {
> +				NPU2DEVINF(ndev, "Insufficient resources to activate relaxed ordering mode\n");
> +				return OPAL_RESOURCE;
> +			}
> +		}
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static int npu2_check_relaxed_ordering(struct phb *phb __unused,
> +				       struct pci_device *pd, void *enable)
> +{
> +	/*
> +	 * IBM PCIe bridge devices (ie. the root ports) can always allow relaxed
> +	 * ordering
> +	 */
> +	if (pd->vdid == 0x04c11014)
> +		pd->allow_relaxed_ordering = true;
> +
> +	PCINOTICE(phb, pd->bdfn, "Checking relaxed ordering config\n");
> +	if (pd->allow_relaxed_ordering)
> +		return 0;
> +
> +	PCINOTICE(phb, pd->bdfn, "Relaxed ordering not allowed\n");

It's probably best to drop the log level here. Maybe PCIDBG?

> +	*(bool *) enable = false;
> +
> +	return 1;
> +}
> +
> +static int64_t opal_npu_set_relaxed_order(uint64_t phb_id, uint16_t bdfn,
> +					  bool request_enabled)
> +{
> +	struct phb *phb = pci_get_phb(phb_id);
> +	struct phb4 *phb4;
> +	uint32_t chip_id, pec;
> +	struct pci_device *pd;
> +	bool enable = true;
> +
> +	if (!phb || phb->phb_type != phb_type_pcie_v4)
> +		return OPAL_PARAMETER;
> +
> +	phb4 = phb_to_phb4(phb);
> +	pec = phb4->pec;
> +	chip_id = phb4->chip_id;
> +
> +	/* Can chip_id be packed into NPU2_RELAXED_ORDERING_SOURCE_GRPCHP? */
> +	if (chip_id & 0x64)

Actually this negates my comments above on gcid validation, but perhaps a
comment in npu2_relaxed_ordering_source_grpchp(uint32_t gcid) mentioning that
the caller needs to ensure the gcid is within a valid range wouldn't hurt.

> +		return OPAL_PARAMETER;
> +
> +	pd = pci_find_dev(phb, bdfn);
> +	if (!pd)
> +		return OPAL_PARAMETER;
> +
> +	/*
> +	 * Not changing state, so no need to rescan PHB devices to determine if
> +	 * we need to enable/disable it
> +	 */
> +	if (pd->allow_relaxed_ordering == request_enabled)
> +		return OPAL_SUCCESS;
> +
> +	pd->allow_relaxed_ordering = request_enabled;
> +
> +	/*
> +	 * Walk all devices on this PHB to ensure they all support relaxed
> +	 * ordering
> +	 */
> +	pci_walk_dev(phb, NULL, npu2_check_relaxed_ordering, &enable);
> +
> +	if (request_enabled && !enable) {
> +		/*
> +		 * Not all devices on this PHB support relaxed-ordering
> +		 * mode so we can't enable it as requested
> +		 */
> +		prlog(PR_INFO, "Cannot set relaxed ordering for PEC %d on chip %d\n",
> +		      pec, chip_id);
> +		return OPAL_CONSTRAINED;
> +	}
> +
> +	if (npu2_set_relaxed_ordering(chip_id, pec, request_enabled) != OPAL_SUCCESS) {
> +		npu2_set_relaxed_ordering(chip_id, pec, false);
> +		return OPAL_RESOURCE;
> +	}
> +
> +	phb4->ro_state = request_enabled;
> +	return OPAL_SUCCESS;
> +}
> +opal_call(OPAL_NPU_SET_RELAXED_ORDER, opal_npu_set_relaxed_order, 3);
> +
> +static int64_t opal_npu_get_relaxed_order(uint64_t phb_id,
> +					  uint16_t bdfn __unused)
> +{
> +	struct phb *phb = pci_get_phb(phb_id);
> +	struct phb4 *phb4;
> +
> +	if (!phb || phb->phb_type != phb_type_pcie_v4)
> +		return OPAL_PARAMETER;
> +
> +	phb4 = phb_to_phb4(phb);
> +	return phb4->ro_state;
> +}
> +opal_call(OPAL_NPU_GET_RELAXED_ORDER, opal_npu_get_relaxed_order, 2);
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index d9db988..6bd77e4 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -152,9 +152,24 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>  #define NPU2_LOW_WATER_MARKS			0x040
>  #define   NPU2_LOW_WATER_MARKS_ENABLE_MACHINE_ALLOC PPC_BIT(51)
>  #define NPU2_HIGH_WATER_MARKS			0x048
> -#define NPU2_RELAXED_ORDERING_CFG0		0x050
> -#define NPU2_RELAXED_ORDERING_CFG1		0x058
> -#define NPU2_RELAXED_ORDERING_CFG2		0x060
> +#define NPU2_RELAXED_ORDERING_CFG(n)		(0x050 + (n)*8)
> +#define   NPU2_RELAXED_ORDERING_SOURCE(n)	(PPC_BITMASK(0,31) >> ((n)*32))
> +#define     NPU2_RELAXED_ORDERING_SOURCE_ENA	PPC_BITMASK32(0,3)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_WRENA	PPC_BIT32(0)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_RDENA	PPC_BIT32(1)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_AWENA	PPC_BIT32(2)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_ARENA	PPC_BIT32(3)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_PECSEL	PPC_BITMASK32(4,5)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_GRPCHP	PPC_BITMASK32(6,9)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_WRMIN	PPC_BITMASK32(10,14)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_WRMAX	PPC_BITMASK32(15,19)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_RDMIN	PPC_BITMASK32(20,25)
> +#define     NPU2_RELAXED_ORDERING_SOURCE_RDMAX	PPC_BITMASK32(26,31)
> +#define   NPU2_RELAXED_ORDERING_CMD_CL_DMA_W	PPC_BIT(0)
> +#define   NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP	PPC_BIT(1)
> +#define   NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ	PPC_BIT(2)
> +#define   NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ	PPC_BIT(3)
> +#define   NPU2_RELAXED_ORDERING_CMD_DMA_PR_W	PPC_BIT(4)
>  #define   NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0	PPC_BIT(5)
>  #define   NPU2_RELAXED_ORDERING_SOURCE4_RDENA	PPC_BIT(29)
>  #define NPU2_NTL0_BAR				0x068
> diff --git a/include/opal-api.h b/include/opal-api.h
> index f766dce..5f397c8 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -224,7 +224,9 @@
>  #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR		165
>  #define OPAL_HANDLE_HMI2			166
>  #define OPAL_NX_COPROC_INIT			167
> -#define OPAL_LAST				167
> +#define OPAL_NPU_SET_RELAXED_ORDER		168
> +#define OPAL_NPU_GET_RELAXED_ORDER		169
> +#define OPAL_LAST				169
>  
>  #define QUIESCE_HOLD			1 /* Spin all calls at entry */
>  #define QUIESCE_REJECT			2 /* Fail all calls with OPAL_BUSY */
> diff --git a/include/pci.h b/include/pci.h
> index 19fe9c0..1d1cec7 100644
> --- a/include/pci.h
> +++ b/include/pci.h
> @@ -111,6 +111,14 @@ struct pci_device {
>  	uint32_t		pcrf_end;
>  	struct list_head	pcrf;
>  
> +	/*
> +	 * Relaxed ordering is a feature which allows PCIe devices accessing GPU
> +	 * memory to bypass the normal PCIe ordering rules to increase
> +	 * performance. It is enabled on a per-PEC basis so every device on a
> +	 * PEC must support it before we can enable it.
> +	 */
> +	bool                    allow_relaxed_ordering;
> +
>  	struct dt_node		*dn;
>  	struct pci_slot		*slot;
>  	struct pci_device	*parent;
> diff --git a/include/phb4.h b/include/phb4.h
> index 3186dca..d78bc31 100644
> --- a/include/phb4.h
> +++ b/include/phb4.h
> @@ -227,6 +227,9 @@ struct phb4 {
>  	/* Cache some RC registers that need to be emulated */
>  	uint32_t		rc_cache[4];
>  
> +	/* Current NPU2 relaxed ordering state */
> +	bool			ro_state;
> +
>  	struct phb		phb;
>  };
>  
>
Reza Arbab July 30, 2018, 4:05 p.m. UTC | #2
On Mon, Jul 30, 2018 at 04:37:33PM +1000, Alistair Popple wrote:
>On Friday, 27 July 2018 9:47:42 AM AEST Reza Arbab wrote: 
>> +/*
>> + * Enable or disable relaxed ordering on all nvlinks on a given NPU. May leave
>> + * relaxed ordering partially enabled if there are insufficient HW resources to
>> + * enable it on all links.
>> + */
>
>This comment isn't correct as it does so for all nvlinks on all NPUs. Whomever
>wrote that originaly clearly didn't know what he was talking about. He also
>can't spell "originally" correctly.

Changed to "for a given PEC" in v2.

>> +static int npu2_check_relaxed_ordering(struct phb *phb __unused,
>> +				       struct pci_device *pd, void *enable)
>> +{
>> +	/*
>> +	 * IBM PCIe bridge devices (ie. the root ports) can always allow relaxed
>> +	 * ordering
>> +	 */
>> +	if (pd->vdid == 0x04c11014)
>> +		pd->allow_relaxed_ordering = true;
>> +
>> +	PCINOTICE(phb, pd->bdfn, "Checking relaxed ordering config\n");
>> +	if (pd->allow_relaxed_ordering)
>> +		return 0;
>> +
>> +	PCINOTICE(phb, pd->bdfn, "Relaxed ordering not allowed\n");
>
>It's probably best to drop the log level here. Maybe PCIDBG?

No problem.

>> +static int64_t opal_npu_set_relaxed_order(uint64_t phb_id, uint16_t bdfn,
>> +					  bool request_enabled)
>> +{
>> +	struct phb *phb = pci_get_phb(phb_id);
>> +	struct phb4 *phb4;
>> +	uint32_t chip_id, pec;
>> +	struct pci_device *pd;
>> +	bool enable = true;
>> +
>> +	if (!phb || phb->phb_type != phb_type_pcie_v4)
>> +		return OPAL_PARAMETER;
>> +
>> +	phb4 = phb_to_phb4(phb);
>> +	pec = phb4->pec;
>> +	chip_id = phb4->chip_id;
>> +
>> +	/* Can chip_id be packed into NPU2_RELAXED_ORDERING_SOURCE_GRPCHP? */
>> +	if (chip_id & 0x64)
>
>Actually this negates my comments above on gcid validation, but perhaps a
>comment in npu2_relaxed_ordering_source_grpchp(uint32_t gcid) mentioning that
>the caller needs to ensure the gcid is within a valid range wouldn't hurt.

I reworked this a bit so npu2_relaxed_ordering_source_grpchp() is where 
the argument gets validated now.
diff mbox series

Patch

diff --git a/hw/npu2.c b/hw/npu2.c
index 9748536..6ff6e4d 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -38,6 +38,7 @@ 
 #include <nvram.h>
 #include <xive.h>
 #include <xscom-p9-regs.h>
+#include <phb4.h>
 
 #define NPU2_IRQ_BASE_SHIFT 13
 #define NPU2_N_DL_IRQS 23
@@ -919,12 +920,17 @@  static void npu2_hw_init(struct npu2 *p)
 	}
 
 	/* Static initialization of every relaxed-ordering cfg[2] register */
-	val = NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 |
+	val = NPU2_RELAXED_ORDERING_CMD_CL_DMA_W |
+	      NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP |
+	      NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ |
+	      NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ |
+	      NPU2_RELAXED_ORDERING_CMD_DMA_PR_W |
+	      NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0 |
 	      NPU2_RELAXED_ORDERING_SOURCE4_RDENA;
 
 	for (s = NPU2_STACK_STCK_0; s <= NPU2_STACK_STCK_2; s++) {
 		for (b = NPU2_BLOCK_SM_0; b <= NPU2_BLOCK_SM_3; b++) {
-			reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG2);
+			reg = NPU2_REG_OFFSET(s, b, NPU2_RELAXED_ORDERING_CFG(2));
 			npu2_write(p, reg, val);
 		}
 	}
@@ -2253,3 +2259,266 @@  out:
 	return rc;
 }
 opal_call(OPAL_NPU_MAP_LPAR, opal_npu_map_lpar, 4);
+
+static inline uint32_t npu2_relaxed_ordering_source_grpchp(uint32_t gcid)
+{
+	/* Repack 0bGGGGCCC to 0bGGCC */
+	return ((gcid & 0x18) >> 1) | (gcid & 0x3);
+}
+
+static uint64_t npu2_relaxed_ordering_cfg_read(struct npu2_dev *ndev, int n)
+{
+	uint64_t reg = NPU2_SM_REG_OFFSET(ndev, 0, NPU2_RELAXED_ORDERING_CFG(n));
+
+	return npu2_read(ndev->npu, reg);
+}
+
+static void npu2_relaxed_ordering_cfg_write(struct npu2_dev *ndev, int n,
+					    uint64_t val)
+{
+	uint64_t reg;
+	int sm;
+
+	/* Set every register on our stack */
+	for (sm = NPU2_BLOCK_SM_0; sm <= NPU2_BLOCK_SM_3; sm++) {
+		reg = NPU2_SM_REG_OFFSET(ndev, sm, NPU2_RELAXED_ORDERING_CFG(n));
+		npu2_write(ndev->npu, reg, val);
+	}
+}
+
+/*
+ * Parse the value of a relaxed ordering config register. Returns SOURCE0 or
+ * SOURCE1 register mask if relaxed ordering is set for the given chip/pec.
+ * Returns 0 if unset.
+ */
+static uint64_t npu2_relaxed_ordering_cfg_enabled(uint64_t val, uint32_t gcid,
+						  int pec)
+{
+	uint32_t src, grpchp;
+	uint64_t mask;
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		mask = NPU2_RELAXED_ORDERING_SOURCE(i);
+		src = GETFIELD(mask, val);
+
+		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, src))
+			continue;
+
+		if (GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src) != pec)
+			continue;
+
+		grpchp = GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src);
+		if (grpchp == npu2_relaxed_ordering_source_grpchp(gcid))
+			return mask;
+
+		if (grpchp == 0xf) /* match all */
+			return mask;
+	}
+
+	return 0;
+}
+
+static int npu2_enable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
+					int pec)
+{
+	uint64_t val, mask;
+	uint32_t src;
+	int rc = OPAL_RESOURCE;
+	int i;
+
+	NPU2DEVINF(ndev, "Enabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
+	lock(&ndev->npu->lock);
+
+	for (i = 0; i < 2; i++) {
+		val = npu2_relaxed_ordering_cfg_read(ndev, i);
+		if (!npu2_relaxed_ordering_cfg_enabled(val, gcid, pec))
+			continue;
+
+		/* Already enabled */
+		rc = OPAL_SUCCESS;
+		goto out;
+	}
+
+	src = NPU2_RELAXED_ORDERING_SOURCE_WRENA |
+	      NPU2_RELAXED_ORDERING_SOURCE_RDENA;
+	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_PECSEL, src, pec);
+	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_GRPCHP, src,
+		       npu2_relaxed_ordering_source_grpchp(gcid));
+	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMIN, src, 0);
+	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_WRMAX, src, 23);
+	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMIN, src, 0);
+	src = SETFIELD(NPU2_RELAXED_ORDERING_SOURCE_RDMAX, src, 47);
+
+	/* Find somewhere to write this config */
+	for (i = 0; i < 2; i++) {
+		val = npu2_relaxed_ordering_cfg_read(ndev, i);
+
+		if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA << 32, val))
+			mask = NPU2_RELAXED_ORDERING_SOURCE(0);
+		else if (!GETFIELD(NPU2_RELAXED_ORDERING_SOURCE_ENA, val))
+			mask = NPU2_RELAXED_ORDERING_SOURCE(1);
+		else
+			continue;
+
+		val = SETFIELD(mask, val, src);
+		npu2_relaxed_ordering_cfg_write(ndev, i, val);
+
+		rc = OPAL_SUCCESS;
+		break;
+	}
+
+out:
+	unlock(&ndev->npu->lock);
+	return rc;
+}
+
+static void npu2_disable_relaxed_ordering(struct npu2_dev *ndev, uint32_t gcid,
+					  int pec)
+{
+	uint64_t val, mask;
+	int i;
+
+	NPU2DEVINF(ndev, "Disabling relaxed ordering for PEC %d on chip %d\n", pec, gcid);
+	lock(&ndev->npu->lock);
+
+	for (i = 0; i < 2; i++) {
+		val = npu2_relaxed_ordering_cfg_read(ndev, i);
+
+		mask = npu2_relaxed_ordering_cfg_enabled(val, gcid, pec);
+		if (!mask)
+			continue;
+
+		val = SETFIELD(mask, val, 0);
+		npu2_relaxed_ordering_cfg_write(ndev, i, val);
+	}
+
+	unlock(&ndev->npu->lock);
+}
+
+/*
+ * Enable or disable relaxed ordering on all nvlinks on a given NPU. May leave
+ * relaxed ordering partially enabled if there are insufficient HW resources to
+ * enable it on all links.
+ */
+static int npu2_set_relaxed_ordering(uint32_t gcid, int pec, bool enable)
+{
+	int rc = OPAL_SUCCESS;
+	struct phb *phb;
+	struct npu2 *npu;
+	struct npu2_dev *ndev;
+
+	for_each_phb(phb) {
+		if (phb->phb_type != phb_type_npu_v2)
+			continue;
+
+		npu = phb_to_npu2_nvlink(phb);
+		for (int i = 0; i < npu->total_devices; i++) {
+			ndev = &npu->devices[i];
+			if (enable)
+				rc = npu2_enable_relaxed_ordering(ndev, gcid, pec);
+			else
+				npu2_disable_relaxed_ordering(ndev, gcid, pec);
+
+			if (rc != OPAL_SUCCESS) {
+				NPU2DEVINF(ndev, "Insufficient resources to activate relaxed ordering mode\n");
+				return OPAL_RESOURCE;
+			}
+		}
+	}
+
+	return OPAL_SUCCESS;
+}
+
+static int npu2_check_relaxed_ordering(struct phb *phb __unused,
+				       struct pci_device *pd, void *enable)
+{
+	/*
+	 * IBM PCIe bridge devices (ie. the root ports) can always allow relaxed
+	 * ordering
+	 */
+	if (pd->vdid == 0x04c11014)
+		pd->allow_relaxed_ordering = true;
+
+	PCINOTICE(phb, pd->bdfn, "Checking relaxed ordering config\n");
+	if (pd->allow_relaxed_ordering)
+		return 0;
+
+	PCINOTICE(phb, pd->bdfn, "Relaxed ordering not allowed\n");
+	*(bool *) enable = false;
+
+	return 1;
+}
+
+static int64_t opal_npu_set_relaxed_order(uint64_t phb_id, uint16_t bdfn,
+					  bool request_enabled)
+{
+	struct phb *phb = pci_get_phb(phb_id);
+	struct phb4 *phb4;
+	uint32_t chip_id, pec;
+	struct pci_device *pd;
+	bool enable = true;
+
+	if (!phb || phb->phb_type != phb_type_pcie_v4)
+		return OPAL_PARAMETER;
+
+	phb4 = phb_to_phb4(phb);
+	pec = phb4->pec;
+	chip_id = phb4->chip_id;
+
+	/* Can chip_id be packed into NPU2_RELAXED_ORDERING_SOURCE_GRPCHP? */
+	if (chip_id & 0x64)
+		return OPAL_PARAMETER;
+
+	pd = pci_find_dev(phb, bdfn);
+	if (!pd)
+		return OPAL_PARAMETER;
+
+	/*
+	 * Not changing state, so no need to rescan PHB devices to determine if
+	 * we need to enable/disable it
+	 */
+	if (pd->allow_relaxed_ordering == request_enabled)
+		return OPAL_SUCCESS;
+
+	pd->allow_relaxed_ordering = request_enabled;
+
+	/*
+	 * Walk all devices on this PHB to ensure they all support relaxed
+	 * ordering
+	 */
+	pci_walk_dev(phb, NULL, npu2_check_relaxed_ordering, &enable);
+
+	if (request_enabled && !enable) {
+		/*
+		 * Not all devices on this PHB support relaxed-ordering
+		 * mode so we can't enable it as requested
+		 */
+		prlog(PR_INFO, "Cannot set relaxed ordering for PEC %d on chip %d\n",
+		      pec, chip_id);
+		return OPAL_CONSTRAINED;
+	}
+
+	if (npu2_set_relaxed_ordering(chip_id, pec, request_enabled) != OPAL_SUCCESS) {
+		npu2_set_relaxed_ordering(chip_id, pec, false);
+		return OPAL_RESOURCE;
+	}
+
+	phb4->ro_state = request_enabled;
+	return OPAL_SUCCESS;
+}
+opal_call(OPAL_NPU_SET_RELAXED_ORDER, opal_npu_set_relaxed_order, 3);
+
+static int64_t opal_npu_get_relaxed_order(uint64_t phb_id,
+					  uint16_t bdfn __unused)
+{
+	struct phb *phb = pci_get_phb(phb_id);
+	struct phb4 *phb4;
+
+	if (!phb || phb->phb_type != phb_type_pcie_v4)
+		return OPAL_PARAMETER;
+
+	phb4 = phb_to_phb4(phb);
+	return phb4->ro_state;
+}
+opal_call(OPAL_NPU_GET_RELAXED_ORDER, opal_npu_get_relaxed_order, 2);
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index d9db988..6bd77e4 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -152,9 +152,24 @@  void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define NPU2_LOW_WATER_MARKS			0x040
 #define   NPU2_LOW_WATER_MARKS_ENABLE_MACHINE_ALLOC PPC_BIT(51)
 #define NPU2_HIGH_WATER_MARKS			0x048
-#define NPU2_RELAXED_ORDERING_CFG0		0x050
-#define NPU2_RELAXED_ORDERING_CFG1		0x058
-#define NPU2_RELAXED_ORDERING_CFG2		0x060
+#define NPU2_RELAXED_ORDERING_CFG(n)		(0x050 + (n)*8)
+#define   NPU2_RELAXED_ORDERING_SOURCE(n)	(PPC_BITMASK(0,31) >> ((n)*32))
+#define     NPU2_RELAXED_ORDERING_SOURCE_ENA	PPC_BITMASK32(0,3)
+#define     NPU2_RELAXED_ORDERING_SOURCE_WRENA	PPC_BIT32(0)
+#define     NPU2_RELAXED_ORDERING_SOURCE_RDENA	PPC_BIT32(1)
+#define     NPU2_RELAXED_ORDERING_SOURCE_AWENA	PPC_BIT32(2)
+#define     NPU2_RELAXED_ORDERING_SOURCE_ARENA	PPC_BIT32(3)
+#define     NPU2_RELAXED_ORDERING_SOURCE_PECSEL	PPC_BITMASK32(4,5)
+#define     NPU2_RELAXED_ORDERING_SOURCE_GRPCHP	PPC_BITMASK32(6,9)
+#define     NPU2_RELAXED_ORDERING_SOURCE_WRMIN	PPC_BITMASK32(10,14)
+#define     NPU2_RELAXED_ORDERING_SOURCE_WRMAX	PPC_BITMASK32(15,19)
+#define     NPU2_RELAXED_ORDERING_SOURCE_RDMIN	PPC_BITMASK32(20,25)
+#define     NPU2_RELAXED_ORDERING_SOURCE_RDMAX	PPC_BITMASK32(26,31)
+#define   NPU2_RELAXED_ORDERING_CMD_CL_DMA_W	PPC_BIT(0)
+#define   NPU2_RELAXED_ORDERING_CMD_CL_DMA_W_HP	PPC_BIT(1)
+#define   NPU2_RELAXED_ORDERING_CMD_CL_DMA_INJ	PPC_BIT(2)
+#define   NPU2_RELAXED_ORDERING_CMD_PR_DMA_INJ	PPC_BIT(3)
+#define   NPU2_RELAXED_ORDERING_CMD_DMA_PR_W	PPC_BIT(4)
 #define   NPU2_RELAXED_ORDERING_CMD_CL_RD_NC_F0	PPC_BIT(5)
 #define   NPU2_RELAXED_ORDERING_SOURCE4_RDENA	PPC_BIT(29)
 #define NPU2_NTL0_BAR				0x068
diff --git a/include/opal-api.h b/include/opal-api.h
index f766dce..5f397c8 100644
--- a/include/opal-api.h
+++ b/include/opal-api.h
@@ -224,7 +224,9 @@ 
 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR		165
 #define OPAL_HANDLE_HMI2			166
 #define OPAL_NX_COPROC_INIT			167
-#define OPAL_LAST				167
+#define OPAL_NPU_SET_RELAXED_ORDER		168
+#define OPAL_NPU_GET_RELAXED_ORDER		169
+#define OPAL_LAST				169
 
 #define QUIESCE_HOLD			1 /* Spin all calls at entry */
 #define QUIESCE_REJECT			2 /* Fail all calls with OPAL_BUSY */
diff --git a/include/pci.h b/include/pci.h
index 19fe9c0..1d1cec7 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -111,6 +111,14 @@  struct pci_device {
 	uint32_t		pcrf_end;
 	struct list_head	pcrf;
 
+	/*
+	 * Relaxed ordering is a feature which allows PCIe devices accessing GPU
+	 * memory to bypass the normal PCIe ordering rules to increase
+	 * performance. It is enabled on a per-PEC basis so every device on a
+	 * PEC must support it before we can enable it.
+	 */
+	bool                    allow_relaxed_ordering;
+
 	struct dt_node		*dn;
 	struct pci_slot		*slot;
 	struct pci_device	*parent;
diff --git a/include/phb4.h b/include/phb4.h
index 3186dca..d78bc31 100644
--- a/include/phb4.h
+++ b/include/phb4.h
@@ -227,6 +227,9 @@  struct phb4 {
 	/* Cache some RC registers that need to be emulated */
 	uint32_t		rc_cache[4];
 
+	/* Current NPU2 relaxed ordering state */
+	bool			ro_state;
+
 	struct phb		phb;
 };