
[v6,6/6] iommu/tegra241-cmdqv: Limit CMDs for guest owned VINTF

Message ID 4ee1f867e838b90a21de16b12cf2e39ba699eab4.1714451595.git.nicolinc@nvidia.com
State Superseded
Series Add Tegra241 (Grace) CMDQV Support (part 1/2)

Commit Message

Nicolin Chen April 30, 2024, 4:43 a.m. UTC
When VCMDQs are assigned to a VINTF owned by a guest (HYP_OWN bit unset),
only TLB and ATC invalidation commands are supported by the VCMDQ HW. So,
add a new helper to scan the input cmds to make sure every single command
is supported when selecting a queue.

Note that a guest VM should not have the HYP_OWN bit set, regardless of
whether its kernel driver tries to write it, i.e. the hypervisor running
in the host OS should wire this bit to zero when trapping a write access
to the VINTF_CONFIG register from a guest kernel.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  7 +--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  5 ++-
 .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c    | 44 ++++++++++++++++++-
 3 files changed, 50 insertions(+), 6 deletions(-)

Comments

Jason Gunthorpe April 30, 2024, 5:06 p.m. UTC | #1
On Mon, Apr 29, 2024 at 09:43:49PM -0700, Nicolin Chen wrote:
> -struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
> +static bool tegra241_vintf_support_cmds(struct tegra241_vintf *vintf,
> +					u64 *cmds, int n)
> +{
> +	int i;
> +
> +	/* VINTF owned by hypervisor can execute any command */
> +	if (vintf->hyp_own)
> +		return true;
> +
> +	/* Guest-owned VINTF must check against the list of supported CMDs */
> +	for (i = 0; i < n; i++) {
> +		switch (FIELD_GET(CMDQ_0_OP, cmds[i * CMDQ_ENT_DWORDS])) {
> +		case CMDQ_OP_TLBI_NH_ASID:
> +		case CMDQ_OP_TLBI_NH_VA:
> +		case CMDQ_OP_ATC_INV:

So CMDQ only works if not ARM_SMMU_FEAT_E2H? Probably worth mentioning
that too along with the discussion about HYP


> +			continue;
> +		default:
> +			return false;
> +		}
> +	}
> +
> +	return true;
> +}

For a performance path this looping seems disappointing.. The callers
don't actually mix different command types. Is there something
preventing adding a parameter at the callers?

Actually looking at this more closely, isn't the command q selection
in the wrong place?

Ie this batch stuff:

static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
				    struct arm_smmu_cmdq_batch *cmds,
				    struct arm_smmu_cmdq_ent *cmd)
{
	int index;

	if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
	    (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
		cmds->num = 0;
	}

	if (cmds->num == CMDQ_BATCH_ENTRIES) {
		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
		cmds->num = 0;
	}

	index = cmds->num * CMDQ_ENT_DWORDS;
	if (unlikely(arm_smmu_cmdq_build_cmd(&cmds->cmds[index], cmd))) {
		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
			 cmd->opcode);
		return;
	}

Has to push everything, across all the iterations of add/submit, onto
the same CMDQ otherwise the SYNC won't be properly flushing?

But each arm_smmu_cmdq_issue_cmdlist() calls its own get q
function. Yes, they probably return the same Q since we are probably
on the same CPU, but it seems logically wrong (and slower!) to
organize it like this.

I would expect the Q to be selected when the struct
arm_smmu_cmdq_batch is allocated on the stack, and be the same for the
entire batch operation. Not only do we spend less time trying to
compute the Q to use, we have a built-in guarantee that every command
will be on the same Q as the fencing SYNC.

Something sort of like this as another patch?

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 268da20baa4e9c..d8c9597878315a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -357,11 +357,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 	return 0;
 }
 
-static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu,
-					       u64 *cmds, int n)
+enum required_cmds {
+	CMDS_ALL,
+	/*
+	 * Commands will be one of:
+	 *  CMDQ_OP_ATC_INV, CMDQ_OP_TLBI_EL2_VA, CMDQ_OP_TLBI_NH_VA,
+	 *  CMDQ_OP_TLBI_EL2_ASID, CMDQ_OP_TLBI_NH_ASID, CMDQ_OP_TLBI_S2_IPA,
+	 *  CMDQ_OP_TLBI_S12_VMALL, CMDQ_OP_SYNC
+	 */
+	CMDS_INVALIDATION,
+};
+
+static struct arm_smmu_cmdq *
+arm_smmu_get_cmdq(struct arm_smmu_device *smmu, enum required_cmds required)
 {
 	if (smmu->tegra241_cmdqv)
-		return tegra241_cmdqv_get_cmdq(smmu, cmds, n);
+		return tegra241_cmdqv_get_cmdq(smmu, required);
 
 	return &smmu->cmdq;
 }
@@ -766,13 +777,13 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
  *   CPU will appear before any of the commands from the other CPU.
  */
 static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
-				       u64 *cmds, int n, bool sync)
+				       struct arm_smmu_cmdq *cmdq, u64 *cmds,
+				       int n, bool sync)
 {
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
 	bool owner;
-	struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu, cmds, n);
 	struct arm_smmu_ll_queue llq, head;
 	int ret = 0;
 
@@ -897,7 +908,8 @@ static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 		return -EINVAL;
 	}
 
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync);
+	return arm_smmu_cmdq_issue_cmdlist(
+		smmu, arm_smmu_get_cmdq(smmu, CMDS_ALL), cmd, 1, sync);
 }
 
 static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
@@ -912,6 +924,14 @@ static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
 	return __arm_smmu_cmdq_issue_cmd(smmu, ent, true);
 }
 
+static void arm_smmu_cmdq_batch_init(struct arm_smmu_device *smmu,
+				     struct arm_smmu_cmdq_batch *cmds,
+				     enum required_cmds required)
+{
+	cmds->num = 0;
+	cmds->q = arm_smmu_get_cmdq(smmu, required);
+}
+
 static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_batch *cmds,
 				    struct arm_smmu_cmdq_ent *cmd)
@@ -920,12 +940,14 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 
 	if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
 	    (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
-		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->q, cmds->cmds,
+					    cmds->num, true);
 		cmds->num = 0;
 	}
 
 	if (cmds->num == CMDQ_BATCH_ENTRIES) {
-		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->q, cmds->cmds,
+					    cmds->num, false);
 		cmds->num = 0;
 	}
 
@@ -942,7 +964,8 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
 				      struct arm_smmu_cmdq_batch *cmds)
 {
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->q, cmds->cmds, cmds->num,
+					   true);
 }
 
 static void arm_smmu_page_response(struct device *dev, struct iopf_fault *unused,
@@ -1181,7 +1204,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
 		},
 	};
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(smmu, &cmds, CMDS_ALL);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.cfgi.sid = master->streams[i].id;
 		arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
@@ -2045,7 +2068,7 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
 
 	arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd);
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(master->smmu, &cmds, CMDS_INVALIDATION);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.atc.sid = master->streams[i].id;
 		arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
@@ -2083,7 +2106,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
 	if (!atomic_read(&smmu_domain->nr_ats_masters))
 		return 0;
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, CMDS_INVALIDATION);
 
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 	list_for_each_entry(master_domain, &smmu_domain->devices,
@@ -2161,7 +2184,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 			num_pages++;
 	}
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, CMDS_INVALIDATION);
 
 	while (iova < end) {
 		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 9412fa4ff5e045..5651ea2541a0a2 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -576,6 +576,7 @@ struct arm_smmu_cmdq {
 
 struct arm_smmu_cmdq_batch {
 	u64				cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
+	struct arm_smmu_cmdq		*q;
 	int				num;
 };
Nicolin Chen April 30, 2024, 6:58 p.m. UTC | #2
On Tue, Apr 30, 2024 at 02:06:55PM -0300, Jason Gunthorpe wrote:
> On Mon, Apr 29, 2024 at 09:43:49PM -0700, Nicolin Chen wrote:
> > -struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
> > +static bool tegra241_vintf_support_cmds(struct tegra241_vintf *vintf,
> > +					u64 *cmds, int n)
> > +{
> > +	int i;
> > +
> > +	/* VINTF owned by hypervisor can execute any command */
> > +	if (vintf->hyp_own)
> > +		return true;
> > +
> > +	/* Guest-owned VINTF must check against the list of supported CMDs */
> > +	for (i = 0; i < n; i++) {
> > +		switch (FIELD_GET(CMDQ_0_OP, cmds[i * CMDQ_ENT_DWORDS])) {
> > +		case CMDQ_OP_TLBI_NH_ASID:
> > +		case CMDQ_OP_TLBI_NH_VA:
> > +		case CMDQ_OP_ATC_INV:
> 
> So CMDQ only works if not ARM_SMMU_FEAT_E2H? Probably worth mentioning
> that too along with the discussion about HYP

Nod. EL2/EL3 commands aren't supported. And they aren't supposed
to be issued by a guest either, since ARM64_HAS_VIRT_HOST_EXTN is
the "Virtualization Host Extensions" feature?
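
(For context, the host SMMUv3 driver only picks the EL2 opcode variants
when E2H is in use, e.g. arm_smmu_tlb_inv_asid() does roughly this --
quoting from memory, so the exact lines may differ:

	struct arm_smmu_cmdq_ent cmd = {
		.opcode	= smmu->features & ARM_SMMU_FEAT_E2H ?
			  CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID,
		.tlbi.asid = asid,
	};

so a guest without VHE shouldn't build the EL2 variants to begin with.)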

> 
> > +			continue;
> > +		default:
> > +			return false;
> > +		}
> > +	}
> > +
> > +	return true;
> > +}
> 
> For a performance path this looping seems disappointing.. The callers
> don't actually mix different command types. Is there something
> preventing adding a parameter at the callers?

The callers don't seem to mix them at this moment. Yet we would have
to be extra careful about any future SMMU patch that may mix
commands?
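
If we do go with a per-batch queue selection, one defensive option
(just a thought, untested; cmds->opcode would be a new field) is to
warn at add time when a command doesn't match what the batch was
initialized for:

	WARN_ON_ONCE(cmd->opcode != cmds->opcode &&
		     cmd->opcode != CMDQ_OP_CMD_SYNC);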

> Actually looking at this more closely, isn't the command q selection
> in the wrong place?
> 
> Ie this batch stuff:
> 
> static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
> 				    struct arm_smmu_cmdq_batch *cmds,
> 				    struct arm_smmu_cmdq_ent *cmd)
> {
> 	int index;
> 
> 	if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
> 	    (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
> 		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
> 		cmds->num = 0;
> 	}
> 
> 	if (cmds->num == CMDQ_BATCH_ENTRIES) {
> 		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
> 		cmds->num = 0;
> 	}
> 
> 	index = cmds->num * CMDQ_ENT_DWORDS;
> 	if (unlikely(arm_smmu_cmdq_build_cmd(&cmds->cmds[index], cmd))) {
> 		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
> 			 cmd->opcode);
> 		return;
> 	}
> 
> Has to push everything, across all the iterations of add/submit, onto
> the same CMDQ otherwise the SYNC won't be properly flushing?

ECMDQ seems to have such a limitation, but VCMDQs can get away with it,
as the HW can insert a SYNC into a queue that doesn't end with a SYNC.

> But each arm_smmu_cmdq_issue_cmdlist() calls its own get q
> function. Yes, they probably return the same Q since we are probably
> on the same CPU, but it seems logically wrong (and slower!) to
> organize it like this.
> 
> I would expect the Q to be selected when the struct
> arm_smmu_cmdq_batch is allocated on the stack, and be the same for the
> entire batch operation. Not only do we spend less time trying to
> compute the Q to use, we have a built-in guarantee that every command
> will be on the same Q as the fencing SYNC.

This seems to be helpful to ECMDQ. The current version disables
preemption, which feels costly to me.

> Something sort of like this as another patch?
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 268da20baa4e9c..d8c9597878315a 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -357,11 +357,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>  	return 0;
>  }
>  
> -static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu,
> -					       u64 *cmds, int n)
> +enum required_cmds {
> +	CMDS_ALL,
> +	/*
> +	 * Commands will be one of:
> +	 *  CMDQ_OP_ATC_INV, CMDQ_OP_TLBI_EL2_VA, CMDQ_OP_TLBI_NH_VA,
> +	 *  CMDQ_OP_TLBI_EL2_ASID, CMDQ_OP_TLBI_NH_ASID, CMDQ_OP_TLBI_S2_IPA,
> +	 *  CMDQ_OP_TLBI_S12_VMALL, CMDQ_OP_SYNC
> +	 */
> +	CMDS_INVALIDATION,
> +};

Hmm, guest-owned VCMDQs don't support EL2 commands. So, it feels
somewhat complicated to decouple them further in the callers
of arm_smmu_cmdq_batch_add(). And I am not sure if there is a use
case of a guest issuing CMDQ_OP_TLBI_S2_IPA/CMDQ_OP_TLBI_S12_VMALL
either, though HW surprisingly supports these two.

Perhaps we could just scan the first command in the batch, trusting
that no one will covertly sneak different commands into it?

Otherwise, there has to be a get_supported_cmdq callback so the batch
or its callers can avoid adding unsupported commands in the first
place.

Thanks
Nicolin
Jason Gunthorpe May 1, 2024, 12:17 a.m. UTC | #3
On Tue, Apr 30, 2024 at 11:58:44AM -0700, Nicolin Chen wrote:

> > Has to push everything, across all the iterations of add/submit, onto
> > the same CMDQ otherwise the SYNC won't be properly flushing?
> 
> ECMDQ seems to have such a limitation, but VCMDQs can get away with it,
> as the HW can insert a SYNC into a queue that doesn't end with a SYNC.

That seems like a strange thing to do in HW, but I recall you
mentioned it once before. Still, I'm not sure there is any merit in
relying on it?

> > But each arm_smmu_cmdq_issue_cmdlist() calls its own get q
> > function. Yes, they probably return the same Q since we are probably
> > on the same CPU, but it seems logically wrong (and slower!) to
> > organize it like this.
> > 
> > I would expect the Q to be selected when the struct
> > arm_smmu_cmdq_batch is allocated on the stack, and be the same for the
> > entire batch operation. Not only do we spend less time trying to
> > compute the Q to use, we have a built-in guarantee that every command
> > will be on the same Q as the fencing SYNC.
> 
> This seems to be helpful to ECMDQ. The current version disables
> preemption, which feels costly to me.

Oh, yes, definitely should work like this then!

> > Something sort of like this as another patch?
> > 
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > index 268da20baa4e9c..d8c9597878315a 100644
> > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > @@ -357,11 +357,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
> >  	return 0;
> >  }
> >  
> > -static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu,
> > -					       u64 *cmds, int n)
> > +enum required_cmds {
> > +	CMDS_ALL,
> > +	/*
> > +	 * Commands will be one of:
> > +	 *  CMDQ_OP_ATC_INV, CMDQ_OP_TLBI_EL2_VA, CMDQ_OP_TLBI_NH_VA,
> > +	 *  CMDQ_OP_TLBI_EL2_ASID, CMDQ_OP_TLBI_NH_ASID, CMDQ_OP_TLBI_S2_IPA,
> > +	 *  CMDQ_OP_TLBI_S12_VMALL, CMDQ_OP_SYNC
> > +	 */
> > +	CMDS_INVALIDATION,
> > +};
> 
> Hmm, guest-owned VCMDQs don't support EL2 commands. So, it feels
> somewhat complicated to decouple them further in the callers
> of arm_smmu_cmdq_batch_add(). And I am not sure if there is a use
> case of a guest issuing CMDQ_OP_TLBI_S2_IPA/CMDQ_OP_TLBI_S12_VMALL
> either, though HW surprisingly supports these two.

These are the max commands that could be issued, but they are all
gated based on the feature bits. The ones VCMDQ don't support are not
going to be issued because of the feature bits. You could test and
enforce this when probing the ECMDQ parts.

> Perhaps we could just scan the first command in the batch, trusting
> that no one will covertly sneak different commands into it?

Yes, with the current design that does seem to work, but it also feels
a bit obtuse.

> Otherwise, there has to be a get_supported_cmdq callback so the batch
> or its callers can avoid adding unsupported commands in the first
> place.

If you really feel strongly, the invalidation could be split into
S1/S2/S1_VM groupings that align with the feature bits, and that could
be passed down from one step above. But I don't think the complexity
is really needed. It is better to deal with it through the feature
mechanism.

If high speed invalidation is supported then the invalidation queue
must support all the invalidation commands used by the SMMU's active
feature set, otherwise do not enable the invalidation queue. It does
make logical sense.

Jason
Nicolin Chen May 1, 2024, 4:32 p.m. UTC | #4
On Tue, Apr 30, 2024 at 09:17:58PM -0300, Jason Gunthorpe wrote:
> On Tue, Apr 30, 2024 at 11:58:44AM -0700, Nicolin Chen wrote:
> 
> > > Has to push everything, across all the iterations of add/submit, onto
> > > the same CMDQ otherwise the SYNC won't be properly flushing?
> > 
> > ECMDQ seems to have such a limitation, but VCMDQs can get away with it,
> > as the HW can insert a SYNC into a queue that doesn't end with a SYNC.
> 
> That seems like a strange thing to do in HW, but I recall you
> mentioned it once before. Still, I'm not sure there is any merit in
> relying on it?

I was hoping to get some idea from the designer. Yet, at this
moment, let's say there's likely no merit besides letting SW care
less and stay simpler, AFAIK.

Robin previously remarked that there could be some performance
impact from such a feature, so I think adding your patch would
be nicer.

> > > Something sort of like this as another patch?
> > > 
> > > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > > index 268da20baa4e9c..d8c9597878315a 100644
> > > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > > @@ -357,11 +357,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
> > >  	return 0;
> > >  }
> > >  
> > > -static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu,
> > > -					       u64 *cmds, int n)
> > > +enum required_cmds {
> > > +	CMDS_ALL,
> > > +	/*
> > > +	 * Commands will be one of:
> > > +	 *  CMDQ_OP_ATC_INV, CMDQ_OP_TLBI_EL2_VA, CMDQ_OP_TLBI_NH_VA,
> > > +	 *  CMDQ_OP_TLBI_EL2_ASID, CMDQ_OP_TLBI_NH_ASID, CMDQ_OP_TLBI_S2_IPA,
> > > +	 *  CMDQ_OP_TLBI_S12_VMALL, CMDQ_OP_SYNC
> > > +	 */
> > > +	CMDS_INVALIDATION,
> > > +};
> > 
> > Hmm, guest-owned VCMDQs don't support EL2 commands. So, it feels
> > somewhat complicated to decouple them further in the callers
> > of arm_smmu_cmdq_batch_add(). And I am not sure if there is a use
> > case of a guest issuing CMDQ_OP_TLBI_S2_IPA/CMDQ_OP_TLBI_S12_VMALL
> > either, though HW surprisingly supports these two.
> 
> These are the max commands that could be issued, but they are all
> gated based on the feature bits. The ones VCMDQ don't support are not
> going to be issued because of the feature bits. You could test and
> enforce this when probing the ECMDQ parts.

Ah, I see. So cmdqv's probe() could check the smmu->features
against ARM_SMMU_FEAT_E2H, and disable VINTF0 right away if it
is guest-owned while E2H is present.

And we could do the same for ARM_SMMU_FEAT_TRANS_S2, stating
that the driver does not expect use cases of issuing S2 TLBI
commands from the guest OS.
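
Roughly something like this at the end of tegra241_cmdqv_device_reset()
-- untested sketch; the exact placement and the error path are guesses:

	/*
	 * Sketch only: don't use a guest-owned VINTF when the SMMU's
	 * active feature set would need commands that a !HYP_OWN VCMDQ
	 * cannot take.
	 */
	if (!vintf->hyp_own &&
	    (smmu->features & (ARM_SMMU_FEAT_E2H | ARM_SMMU_FEAT_TRANS_S2))) {
		dev_info(cmdqv->dev,
			 "unsupported SMMU features for guest-owned VINTF0, skipping VCMDQs\n");
		return -ENODEV;
	}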

Thanks
Nicolin
Nicolin Chen May 6, 2024, 3:52 a.m. UTC | #5
On Tue, Apr 30, 2024 at 09:17:58PM -0300, Jason Gunthorpe wrote:
> On Tue, Apr 30, 2024 at 11:58:44AM -0700, Nicolin Chen wrote:
> > Otherwise, there has to be a get_supported_cmdq callback so the batch
> > or its callers can avoid adding unsupported commands in the first
> > place.
> 
> If you really feel strongly, the invalidation could be split into
> S1/S2/S1_VM groupings that align with the feature bits, and that could
> be passed down from one step above. But I don't think the complexity
> is really needed. It is better to deal with it through the feature
> mechanism.

Hmm, I tried following your design by passing in a CMD_TYPE_xxx
to the tegra241_cmdqv_get_cmdq(), but I found it a little painful
to accommodate these two cases:
1. TLBI_NH_ASID is issued via arm_smmu_cmdq_issue_cmdlist(), so
   we should not mark it as CMD_TYPE_ALL. Yet, this function is
   used by other commands too. So, either we pass in a type from
   higher callers, or simply check the opcode in that function.
2. It is a bit tricky to define, from SMMU's P.O.V, a good TYPE
   subset for VCMDQ, since guest-owned VCMDQ does not support
   TLBI_NSNH_ALL.

So, it feels to me that checking against the opcode is still a
straightforward solution. And what I ended up with is somewhat
similar to this v6, yet this time it only checks at the batch init
call, as your design does.

What do you think of this?

Thanks
Nicolin
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 5775a7bfa874..b1334121f5c4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -332,10 +332,11 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 	return 0;
 }
 
-static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu)
+static struct arm_smmu_cmdq *
+arm_smmu_get_cmdq(struct arm_smmu_device *smmu, u8 opcode)
 {
 	if (arm_smmu_has_tegra241_cmdqv(smmu))
-		return tegra241_cmdqv_get_cmdq(smmu);
+		return tegra241_cmdqv_get_cmdq(smmu, opcode);
 
 	return &smmu->cmdq;
 }
@@ -871,7 +872,7 @@ static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 	}
 
 	return arm_smmu_cmdq_issue_cmdlist(
-		smmu, arm_smmu_get_cmdq(smmu), cmd, 1, sync);
+		smmu, arm_smmu_get_cmdq(smmu, ent->opcode), cmd, 1, sync);
 }
 
 static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
@@ -887,10 +888,11 @@ static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
 }
 
 static void arm_smmu_cmdq_batch_init(struct arm_smmu_device *smmu,
-				     struct arm_smmu_cmdq_batch *cmds)
+				     struct arm_smmu_cmdq_batch *cmds,
+				     u8 opcode)
 {
 	cmds->num = 0;
-	cmds->cmdq = arm_smmu_get_cmdq(smmu);
+	cmds->cmdq = arm_smmu_get_cmdq(smmu, opcode);
 }
 
 static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
@@ -1167,7 +1169,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
 		},
 	};
 
-	arm_smmu_cmdq_batch_init(smmu, &cmds);
+	arm_smmu_cmdq_batch_init(smmu, &cmds, cmd.opcode);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.cfgi.sid = master->streams[i].id;
 		arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
@@ -2006,7 +2008,7 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
 
 	arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
 
-	arm_smmu_cmdq_batch_init(master->smmu, &cmds);
+	arm_smmu_cmdq_batch_init(master->smmu, &cmds, cmd.opcode);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.atc.sid = master->streams[i].id;
 		arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
@@ -2046,7 +2048,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
 
 	arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
 
-	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds);
+	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, cmd.opcode);
 
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 	list_for_each_entry(master, &smmu_domain->devices, domain_head) {
@@ -2123,7 +2125,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 			num_pages++;
 	}
 
-	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds);
+	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, cmd->opcode);
 
 	while (iova < end) {
 		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 604e26a292e7..2c1fe7e129cd 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -879,7 +879,8 @@ struct tegra241_cmdqv *tegra241_cmdqv_acpi_probe(struct arm_smmu_device *smmu,
 						 struct acpi_iort_node *node);
 void tegra241_cmdqv_device_remove(struct arm_smmu_device *smmu);
 int tegra241_cmdqv_device_reset(struct arm_smmu_device *smmu);
-struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu);
+struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
+					      u8 opcode);
 #else /* CONFIG_TEGRA241_CMDQV */
 static inline bool arm_smmu_has_tegra241_cmdqv(struct arm_smmu_device *smmu)
 {
@@ -903,7 +904,7 @@ static inline int tegra241_cmdqv_device_reset(struct arm_smmu_device *smmu)
 }
 
 static inline struct arm_smmu_cmdq *
-tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, u8 opcode)
 {
 	return NULL;
 }
diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index b59f4e31a116..22718835f4be 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -181,6 +181,7 @@ struct tegra241_vcmdq {
  * struct tegra241_vintf - Virtual Interface
  * @idx: Global index in the CMDQV HW
  * @enabled: Enable status
+ * @hyp_own: Owned by hypervisor (in-kernel)
  * @cmdqv: CMDQV HW pointer
  * @vcmdqs: List of VCMDQ pointers
  * @base: MMIO base address
@@ -189,6 +190,7 @@ struct tegra241_vintf {
 	u16 idx;
 
 	bool enabled;
+	bool hyp_own;
 
 	struct tegra241_cmdqv *cmdqv;
 	struct tegra241_vcmdq **vcmdqs;
@@ -321,7 +323,25 @@ static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid)
 	return IRQ_HANDLED;
 }
 
-struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+static bool tegra241_vintf_support_cmd(struct tegra241_vintf *vintf, u8 opcode)
+{
+	/* Hypervisor-owned VINTF can execute any command in its VCMDQs */
+	if (READ_ONCE(vintf->hyp_own))
+		return true;
+
+	/* Guest-owned VINTF must check against the list of supported CMDs */
+	switch (opcode) {
+	case CMDQ_OP_TLBI_NH_ASID:
+	case CMDQ_OP_TLBI_NH_VA:
+	case CMDQ_OP_ATC_INV:
+		return true;
+	default:
+		return false;
+	}
+}
+
+struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
+					      u8 opcode)
 {
 	struct tegra241_cmdqv *cmdqv = smmu->tegra241_cmdqv;
 	struct tegra241_vintf *vintf = cmdqv->vintfs[0];
@@ -335,6 +355,10 @@ struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
 	if (!READ_ONCE(vintf->enabled))
 		return &smmu->cmdq;
 
+	/* Unsupported CMDs go to the smmu->cmdq pathway */
+	if (!tegra241_vintf_support_cmd(vintf, opcode))
+		return &smmu->cmdq;
+
 	/*
 	 * Select a vcmdq to use. Here we use a temporal solution to
 	 * balance out traffic on cmdq issuing: each cmdq has its own
@@ -523,6 +547,11 @@ int tegra241_cmdqv_device_reset(struct arm_smmu_device *smmu)
 		}
 	}
 
+	/*
+	 * Note that the HYP_OWN bit is wired to zero when running in a guest
+	 * kernel regardless of enabling it here, as !HYP_OWN cmdqs have a
+	 * restricted set of supported commands, per the HW design.
+	 */
 	regval = FIELD_PREP(VINTF_HYP_OWN, 1);
 	vintf_writel(vintf, regval, CONFIG);
 
@@ -530,6 +559,12 @@ int tegra241_cmdqv_device_reset(struct arm_smmu_device *smmu)
 	if (ret)
 		return ret;
 
+	/*
+	 * As mentioned above, the HYP_OWN bit is wired to zero for a guest
+	 * kernel, so read it back from HW to ensure it is reflected in hyp_own.
+	 */
+	vintf->hyp_own = !!(VINTF_HYP_OWN & vintf_readl(vintf, CONFIG));
+
 	for (lidx = 0; lidx < cmdqv->num_vcmdqs_per_vintf; lidx++) {
 		ret = tegra241_vcmdq_hw_init(vintf->vcmdqs[lidx]);
 		if (ret)
Jason Gunthorpe May 6, 2024, 1 p.m. UTC | #6
On Sun, May 05, 2024 at 08:52:32PM -0700, Nicolin Chen wrote:
> On Tue, Apr 30, 2024 at 09:17:58PM -0300, Jason Gunthorpe wrote:
> > On Tue, Apr 30, 2024 at 11:58:44AM -0700, Nicolin Chen wrote:
> > > Otherwise, there has to be a get_supported_cmdq callback so the batch
> > > or its callers can avoid adding unsupported commands in the first
> > > place.
> > 
> > If you really feel strongly, the invalidation could be split into
> > S1/S2/S1_VM groupings that align with the feature bits, and that could
> > be passed down from one step above. But I don't think the complexity
> > is really needed. It is better to deal with it through the feature
> > mechanism.
> 
> Hmm, I tried following your design by passing in a CMD_TYPE_xxx
> to the tegra241_cmdqv_get_cmdq(), but I found it a little painful
> to accommodate these two cases:
> 1. TLBI_NH_ASID is issued via arm_smmu_cmdq_issue_cmdlist(), so
>    we should not mark it as CMD_TYPE_ALL. Yet, this function is
>    used by other commands too. So, either we pass in a type from
>    higher callers, or simply check the opcode in that function.

Yes, you'd have to pass in the type there too, which makes it more
ugly.

> So, it feels to me that checking against the opcode is still a
> straightforward solution. And what I ended up with is somewhat
> similar to this v6, yet this time it only checks at the batch init
> call, as your design does.

Well, the only downside is that the commands have to be the same in a
batch, but maybe that is OK anyhow.

Don't forget to take the hunks that fix the queue as well.

Jason

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 665a5e585f72..0802c3c96a2a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -352,10 +352,11 @@  static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 	return 0;
 }
 
-static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu)
+static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu,
+					       u64 *cmds, int n)
 {
 	if (smmu->tegra241_cmdqv)
-		return tegra241_cmdqv_get_cmdq(smmu);
+		return tegra241_cmdqv_get_cmdq(smmu, cmds, n);
 
 	return &smmu->cmdq;
 }
@@ -766,7 +767,7 @@  static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	u32 prod;
 	unsigned long flags;
 	bool owner;
-	struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
+	struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu, cmds, n);
 	struct arm_smmu_ll_queue llq, head;
 	int ret = 0;
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 87e4c227a937..e21e29f4770b 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -837,7 +837,8 @@  static inline void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain,
 struct tegra241_cmdqv *
 tegra241_cmdqv_acpi_probe(struct arm_smmu_device *smmu, int id);
 int tegra241_cmdqv_device_reset(struct arm_smmu_device *smmu);
-struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu);
+struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
+					      u64 *cmds, int n);
 #else /* CONFIG_TEGRA241_CMDQV */
 static inline struct tegra241_cmdqv *
 tegra241_cmdqv_acpi_probe(struct arm_smmu_device *smmu, int id)
@@ -851,7 +852,7 @@  static inline int tegra241_cmdqv_device_reset(struct arm_smmu_device *smmu)
 }
 
 static inline struct arm_smmu_cmdq *
-tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, u64 *cmds, int n)
 {
 	return NULL;
 }
diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index 4b2af3aaa6b4..59ff2b740bec 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -266,6 +266,7 @@  struct tegra241_vcmdq {
  * struct tegra241_vintf - Virtual Interface
  * @idx: Global index in the CMDQV HW
  * @status: cached status register
+ * @hyp_own: Owned by hypervisor (in-kernel)
  * @cmdqv: CMDQV HW pointer
  * @vcmdqs: List of VCMDQ pointers
  * @base: MMIO base address
@@ -274,6 +275,7 @@  struct tegra241_vintf {
 	u16 idx;
 
 	atomic_t status;
+	bool hyp_own;
 
 	struct tegra241_cmdqv *cmdqv;
 	struct tegra241_vcmdq **vcmdqs;
@@ -372,7 +374,32 @@  static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid)
 	return IRQ_HANDLED;
 }
 
-struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+static bool tegra241_vintf_support_cmds(struct tegra241_vintf *vintf,
+					u64 *cmds, int n)
+{
+	int i;
+
+	/* VINTF owned by hypervisor can execute any command */
+	if (vintf->hyp_own)
+		return true;
+
+	/* Guest-owned VINTF must check against the list of supported CMDs */
+	for (i = 0; i < n; i++) {
+		switch (FIELD_GET(CMDQ_0_OP, cmds[i * CMDQ_ENT_DWORDS])) {
+		case CMDQ_OP_TLBI_NH_ASID:
+		case CMDQ_OP_TLBI_NH_VA:
+		case CMDQ_OP_ATC_INV:
+			continue;
+		default:
+			return false;
+		}
+	}
+
+	return true;
+}
+
+struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
+					      u64 *cmds, int n)
 {
 	struct tegra241_cmdqv *cmdqv = smmu->tegra241_cmdqv;
 	struct tegra241_vintf *vintf = cmdqv->vintfs[0];
@@ -390,6 +417,10 @@  struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
 	if (FIELD_GET(VINTF_STATUS, atomic_read(&vintf->status)))
 		return &smmu->cmdq;
 
+	/* Unsupported CMDs go to the smmu->cmdq pathway */
+	if (!tegra241_vintf_support_cmds(vintf, cmds, n))
+		return &smmu->cmdq;
+
 	/*
 	 * Select a vcmdq to use. Here we use a temporal solution to
 	 * balance out traffic on cmdq issuing: each cmdq has its own
@@ -590,6 +621,11 @@  int tegra241_cmdqv_device_reset(struct arm_smmu_device *smmu)
 		}
 	}
 
+	/*
+	 * Note that the HYP_OWN bit is wired to zero when running in a guest
+	 * kernel regardless of enabling it here, as !HYP_OWN cmdqs have a
+	 * restricted set of supported commands, per the HW design.
+	 */
 	regval = FIELD_PREP(VINTF_HYP_OWN, 1);
 	vintf_writel(regval, CONFIG);
 
@@ -597,6 +633,12 @@  int tegra241_cmdqv_device_reset(struct arm_smmu_device *smmu)
 	if (ret)
 		return ret;
 
+	/*
+	 * As mentioned above, the HYP_OWN bit is wired to zero for a guest
+	 * kernel, so read it back from HW to ensure it is reflected in hyp_own.
+	 */
+	vintf->hyp_own = !!(VINTF_HYP_OWN & vintf_readl(CONFIG));
+
 	/* Build an arm_smmu_cmdq for each vcmdq allocated to vintf */
 	vintf->vcmdqs = devm_kcalloc(cmdqv->dev, cmdqv->num_vcmdqs_per_vintf,
 				     sizeof(*vintf->vcmdqs), GFP_KERNEL);