diff mbox series

[next,2/2] iommu/tegra241-cmdqv: Do not allocate vcmdq until dma_set_mask_and_coherent

Message ID 530993c3aafa1b0fc3d879b8119e13c629d12e2b.1725503154.git.nicolinc@nvidia.com
State New
Headers show
Series iommu/tegra241-cmdqv: Fix two bugs in __tegra241_cmdqv_probe | expand

Commit Message

Nicolin Chen Sept. 5, 2024, 2:40 a.m. UTC
It's observed that, when the first 4GB of system memory was reserved, all
VCMDQ allocations failed (even with the smallest qsz in the last attempt):
    arm-smmu-v3: found companion CMDQV device: NVDA200C:00
    arm-smmu-v3: option mask 0x10
    arm-smmu-v3: failed to allocate queue (0x8000 bytes) for vcmdq0
    acpi NVDA200C:00: tegra241_cmdqv: Falling back to standard SMMU CMDQ
    arm-smmu-v3: ias 48-bit, oas 48-bit (features 0x001e1fbf)
    arm-smmu-v3: allocated 524288 entries for cmdq
    arm-smmu-v3: allocated 524288 entries for evtq
    arm-smmu-v3: allocated 524288 entries for priq

This is because the 4GB reserved memory shifted the entire DMA zone from a
lower 32-bit range (on a system without the 4GB carveout) to higher range,
while the dev->coherent_dma_mask was set to DMA_BIT_MASK(32) by default.

The dma_set_mask_and_coherent() call is done in arm_smmu_device_hw_probe()
of the SMMU driver. So any DMA allocation from tegra241_cmdqv_probe() must
wait until the coherent_dma_mask is correctly set.

Move the vintf/vcmdq structure initialization routine into a different op,
"init_structures". Call it at the end of arm_smmu_init_structures(), where
standard SMMU queues get allocated.

Most of the impl_ops aren't ready until vintf/vcmdq structure are init-ed.
So replace the full impl_ops with an init_ops in __tegra241_cmdqv_probe().

And switch to tegra241_cmdqv_impl_ops later in arm_smmu_init_structures().
Note that tegra241_cmdqv_impl_ops does not link to the new init_structures
op after this switch, since there is no point in having it once it's done.

Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV")
Reported-by: Matt Ochs <mochs@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  9 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  1 +
 .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c    | 83 ++++++++++++-------
 3 files changed, 60 insertions(+), 33 deletions(-)

Comments

Jason Gunthorpe Sept. 5, 2024, 1:18 p.m. UTC | #1
On Wed, Sep 04, 2024 at 07:40:43PM -0700, Nicolin Chen wrote:
> It's observed that, when the first 4GB of system memory was reserved, all
> VCMDQ allocations failed (even with the smallest qsz in the last attempt):
>     arm-smmu-v3: found companion CMDQV device: NVDA200C:00
>     arm-smmu-v3: option mask 0x10
>     arm-smmu-v3: failed to allocate queue (0x8000 bytes) for vcmdq0
>     acpi NVDA200C:00: tegra241_cmdqv: Falling back to standard SMMU CMDQ
>     arm-smmu-v3: ias 48-bit, oas 48-bit (features 0x001e1fbf)
>     arm-smmu-v3: allocated 524288 entries for cmdq
>     arm-smmu-v3: allocated 524288 entries for evtq
>     arm-smmu-v3: allocated 524288 entries for priq
> 
> This is because the 4GB reserved memory shifted the entire DMA zone from a
> lower 32-bit range (on a system without the 4GB carveout) to higher range,
> while the dev->coherent_dma_mask was set to DMA_BIT_MASK(32) by default.
> 
> The dma_set_mask_and_coherent() call is done in arm_smmu_device_hw_probe()
> of the SMMU driver. So any DMA allocation from tegra241_cmdqv_probe() must
> wait until the coherent_dma_mask is correctly set.
> 
> Move the vintf/vcmdq structure initialization routine into a different op,
> "init_structures". Call it at the end of arm_smmu_init_structures(), where
> standard SMMU queues get allocated.
> 
> Most of the impl_ops aren't ready until vintf/vcmdq structure are init-ed.
> So replace the full impl_ops with an init_ops in __tegra241_cmdqv_probe().
> 
> And switch to tegra241_cmdqv_impl_ops later in arm_smmu_init_structures().
> Note that tegra241_cmdqv_impl_ops does not link to the new init_structures
> op after this switch, since there is no point in having it once it's done.
> 
> Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV")
> Reported-by: Matt Ochs <mochs@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  9 +-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  1 +
>  .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c    | 83 ++++++++++++-------
>  3 files changed, 60 insertions(+), 33 deletions(-)

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
Matt Ochs Sept. 5, 2024, 4:26 p.m. UTC | #2
> On Sep 4, 2024, at 9:40 PM, Nicolin Chen <nicolinc@nvidia.com> wrote:
> 
> It's observed that, when the first 4GB of system memory was reserved, all
> VCMDQ allocations failed (even with the smallest qsz in the last attempt):
>    arm-smmu-v3: found companion CMDQV device: NVDA200C:00
>    arm-smmu-v3: option mask 0x10
>    arm-smmu-v3: failed to allocate queue (0x8000 bytes) for vcmdq0
>    acpi NVDA200C:00: tegra241_cmdqv: Falling back to standard SMMU CMDQ
>    arm-smmu-v3: ias 48-bit, oas 48-bit (features 0x001e1fbf)
>    arm-smmu-v3: allocated 524288 entries for cmdq
>    arm-smmu-v3: allocated 524288 entries for evtq
>    arm-smmu-v3: allocated 524288 entries for priq
> 
> This is because the 4GB reserved memory shifted the entire DMA zone from a
> lower 32-bit range (on a system without the 4GB carveout) to higher range,
> while the dev->coherent_dma_mask was set to DMA_BIT_MASK(32) by default.
> 
> The dma_set_mask_and_coherent() call is done in arm_smmu_device_hw_probe()
> of the SMMU driver. So any DMA allocation from tegra241_cmdqv_probe() must
> wait until the coherent_dma_mask is correctly set.
> 
> Move the vintf/vcmdq structure initialization routine into a different op,
> "init_structures". Call it at the end of arm_smmu_init_structures(), where
> standard SMMU queues get allocated.
> 
> Most of the impl_ops aren't ready until vintf/vcmdq structure are init-ed.
> So replace the full impl_ops with an init_ops in __tegra241_cmdqv_probe().
> 
> And switch to tegra241_cmdqv_impl_ops later in arm_smmu_init_structures().
> Note that tegra241_cmdqv_impl_ops does not link to the new init_structures
> op after this switch, since there is no point in having it once it's done.
> 
> Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV")
> Reported-by: Matt Ochs <mochs@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> —

Tested-by: Matt Ochs <mochs@nvidia.com>
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b2de56dfceb9..df852ab04fd7 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3744,7 +3744,14 @@  static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
 	if (ret)
 		return ret;
 
-	return arm_smmu_init_strtab(smmu);
+	ret = arm_smmu_init_strtab(smmu);
+	if (ret)
+		return ret;
+
+	if (smmu->impl_ops && smmu->impl_ops->init_structures)
+		return smmu->impl_ops->init_structures(smmu);
+
+	return 0;
 }
 
 static int arm_smmu_write_reg_sync(struct arm_smmu_device *smmu, u32 val,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index e044ce5b5372..e8320e9341d7 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -643,6 +643,7 @@  struct arm_smmu_strtab_cfg {
 struct arm_smmu_impl_ops {
 	int (*device_reset)(struct arm_smmu_device *smmu);
 	void (*device_remove)(struct arm_smmu_device *smmu);
+	int (*init_structures)(struct arm_smmu_device *smmu);
 	struct arm_smmu_cmdq *(*get_secondary_cmdq)(
 		struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent);
 };
diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index 0766dc2789cb..fcd13d301fff 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -755,18 +755,65 @@  tegra241_cmdqv_find_acpi_resource(struct device *dev, int *irq)
 	return res;
 }
 
+static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu)
+{
+	struct tegra241_cmdqv *cmdqv =
+		container_of(smmu, struct tegra241_cmdqv, smmu);
+	struct tegra241_vintf *vintf;
+	int lidx;
+	int ret;
+
+	vintf = kzalloc(sizeof(*vintf), GFP_KERNEL);
+	if (!vintf)
+		goto out_fallback;
+
+	/* Init VINTF0 for in-kernel use */
+	ret = tegra241_cmdqv_init_vintf(cmdqv, 0, vintf);
+	if (ret) {
+		dev_err(cmdqv->dev, "failed to init vintf0: %d\n", ret);
+		goto free_vintf;
+	}
+
+	/* Preallocate logical VCMDQs to VINTF0 */
+	for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) {
+		struct tegra241_vcmdq *vcmdq;
+
+		vcmdq = tegra241_vintf_alloc_lvcmdq(vintf, lidx);
+		if (IS_ERR(vcmdq))
+			goto free_lvcmdq;
+	}
+
+	/* Now, we are ready to run all the impl ops */
+	smmu->impl_ops = &tegra241_cmdqv_impl_ops;
+	return 0;
+
+free_lvcmdq:
+	for (lidx--; lidx >= 0; lidx--)
+		tegra241_vintf_free_lvcmdq(vintf, lidx);
+	tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx);
+free_vintf:
+	kfree(vintf);
+out_fallback:
+	dev_info(smmu->impl_dev, "Falling back to standard SMMU CMDQ\n");
+	smmu->options &= ~ARM_SMMU_OPT_TEGRA241_CMDQV;
+	tegra241_cmdqv_remove(smmu);
+	return 0;
+}
+
 struct dentry *cmdqv_debugfs_dir;
 
 static struct arm_smmu_device *
 __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
 		       int irq)
 {
+	static const struct arm_smmu_impl_ops init_ops = {
+		.init_structures = tegra241_cmdqv_init_structures,
+		.device_remove = tegra241_cmdqv_remove,
+	};
 	struct tegra241_cmdqv *cmdqv = NULL;
 	struct arm_smmu_device *new_smmu;
-	struct tegra241_vintf *vintf;
 	void __iomem *base;
 	u32 regval;
-	int lidx;
 	int ret;
 
 	static_assert(offsetof(struct tegra241_cmdqv, smmu) == 0);
@@ -815,26 +862,6 @@  __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
 
 	ida_init(&cmdqv->vintf_ids);
 
-	vintf = kzalloc(sizeof(*vintf), GFP_KERNEL);
-	if (!vintf)
-		goto destroy_ids;
-
-	/* Init VINTF0 for in-kernel use */
-	ret = tegra241_cmdqv_init_vintf(cmdqv, 0, vintf);
-	if (ret) {
-		dev_err(cmdqv->dev, "failed to init vintf0: %d\n", ret);
-		goto free_vintf;
-	}
-
-	/* Preallocate logical VCMDQs to VINTF0 */
-	for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) {
-		struct tegra241_vcmdq *vcmdq;
-
-		vcmdq = tegra241_vintf_alloc_lvcmdq(vintf, lidx);
-		if (IS_ERR(vcmdq))
-			goto free_lvcmdq;
-	}
-
 #ifdef CONFIG_IOMMU_DEBUGFS
 	if (!cmdqv_debugfs_dir) {
 		cmdqv_debugfs_dir =
@@ -844,19 +871,11 @@  __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
 	}
 #endif
 
-	new_smmu->impl_ops = &tegra241_cmdqv_impl_ops;
+	/* Provide init-level ops only, until tegra241_cmdqv_init_structures */
+	new_smmu->impl_ops = &init_ops;
 
 	return new_smmu;
 
-free_lvcmdq:
-	for (lidx--; lidx >= 0; lidx--)
-		tegra241_vintf_free_lvcmdq(vintf, lidx);
-	tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx);
-free_vintf:
-	kfree(vintf);
-destroy_ids:
-	ida_destroy(&cmdqv->vintf_ids);
-	kfree(cmdqv->vintfs);
 free_irq:
 	if (cmdqv->irq > 0)
 		free_irq(cmdqv->irq, cmdqv);