diff mbox series

[RFCv1,05/14] iommufd: Add IOMMUFD_OBJ_VIOMMU and IOMMUFD_CMD_VIOMMU_ALLOC

Message ID 3aa9bc1df6a2ee58a03c6ea6ededbc210a2d23a8.1712978212.git.nicolinc@nvidia.com
State Handled Elsewhere
Headers show
Series Add Tegra241 (Grace) CMDQV Support (part 2/2) | expand

Commit Message

Nicolin Chen April 13, 2024, 3:47 a.m. UTC
Corresponding to the new iommufd_viommu core structure that represents a
vIOMMU instance in the user space for HW-accelerated features, add a new
IOMMUFD_OBJ_VIOMMU and its ioctl for user space to allocate it.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h |  3 +
 drivers/iommu/iommufd/main.c            |  6 ++
 drivers/iommu/iommufd/viommu.c          | 83 +++++++++++++++++++++++++
 include/linux/iommufd.h                 |  1 +
 include/uapi/linux/iommufd.h            | 30 +++++++++
 5 files changed, 123 insertions(+)

Comments

Jason Gunthorpe May 12, 2024, 2:27 p.m. UTC | #1
On Fri, Apr 12, 2024 at 08:47:02PM -0700, Nicolin Chen wrote:

> +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_viommu_alloc *cmd = ucmd->cmd;
> +	struct iommufd_hwpt_paging *hwpt_paging;
> +	struct iommu_device *iommu_dev;
> +	struct iommufd_viommu *viommu;
> +	struct iommufd_device *idev;
> +	int rc;
> +
> +	if (cmd->flags)
> +		return -EOPNOTSUPP;
> +
> +	idev = iommufd_get_device(ucmd, cmd->dev_id);
> +	if (IS_ERR(idev))
> +		return PTR_ERR(idev);
> +	iommu_dev = idev->dev->iommu->iommu_dev;
> +
> +	if (!iommu_dev->ops->viommu_alloc) {
> +		rc = -EOPNOTSUPP;
> +		goto out_put_idev;
> +	}
> +
> +	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
> +	if (IS_ERR(hwpt_paging)) {
> +		rc = PTR_ERR(hwpt_paging);
> +		goto out_put_idev;
> +	}
> +
> +	if (!hwpt_paging->nest_parent) {
> +		rc = -EINVAL;
> +		goto out_put_hwpt;
> +	}
> +
> +	viommu = iommu_dev->ops->viommu_alloc(idev->dev, cmd->type,
> +					      hwpt_paging->common.domain);
> +	if (IS_ERR(viommu)) {
> +		rc = PTR_ERR(viommu);
> +		goto out_put_hwpt;
> +	}

Ah you did already include the S2, So should it be
domain->viommu_alloc() then?

> +
> +	/* iommufd_object_finalize will store the viommu->obj.id */
> +	rc = xa_alloc(&ucmd->ictx->objects, &viommu->obj.id, XA_ZERO_ENTRY,
> +		      xa_limit_31b, GFP_KERNEL_ACCOUNT);
> +	if (rc)
> +		goto out_free;
> +
> +	viommu->obj.type = IOMMUFD_OBJ_VIOMMU;

See my other notes, lets try not to open code this.

> +	viommu->type = cmd->type;
> +
> +	viommu->ictx = ucmd->ictx;
> +	viommu->hwpt = hwpt_paging;
> +	viommu->iommu_dev = idev->dev->iommu->iommu_dev;
> +	cmd->out_viommu_id = viommu->obj.id;
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +	if (rc)
> +		goto out_erase_xa;
> +	iommufd_object_finalize(ucmd->ictx, &viommu->obj);
> +	refcount_inc(&viommu->hwpt->common.obj.users);
> +	goto out_put_hwpt;
> +
> +out_erase_xa:
> +	xa_erase(&ucmd->ictx->objects, viommu->obj.id);
> +out_free:
> +	if (viommu->ops && viommu->ops->free)
> +		viommu->ops->free(viommu);
> +	kfree(viommu);

This really should use the abort flow. The driver free callback has to
be in the object release..

> +
> +/**
> + * enum iommu_viommu_type - VIOMMU Type
> + * @IOMMU_VIOMMU_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for SMMUv3
> + */
> +enum iommu_viommu_type {
> +	IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
> +};

At least the 241 line should be in a following patch

> +/**
> + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
> + * @size: sizeof(struct iommu_viommu_alloc)
> + * @flags: Must be 0
> + * @type: Type of the VIOMMU object. Must be defined in enum iommu_viommu_type
> + * @dev_id: The device to allocate this virtual IOMMU for
> + * @hwpt_id: ID of a nested parent HWPT
> + * @out_viommu_id: Output virtual IOMMU ID for the allocated object
> + *
> + * Allocate an virtual IOMMU object that holds a (shared) nested parent HWPT
> + */
> +struct iommu_viommu_alloc {
> +	__u32 size;
> +	__u32 flags;
> +	__u32 type;
> +	__u32 dev_id;
> +	__u32 hwpt_id;
> +	__u32 out_viommu_id;
> +};

This seems fine.

Let's have a following patch to change the hwpt_alloc to accept the
viommu as a hwpt as a uAPI change as well. 

The more I think about how this needs to work the more sure I am that
we need to do that.

ARM will need a fairly tricky set of things to manage the VMID
lifecycle. In BTM mode the VMID must come from the KVM. For vcmdq the
VMID is needed to create the queue/viommu. For AMD the S2 is needed to
create the VIOMMU in the first place.

So, to make this all work perfectly we need approx the following
 - S2 sharing across instances in ARM - meaning the VMID is allocated
   at attach not domain alloc
 - S2 hwpt is refcounted by the VIOMMU in the iommufd layer
 - VIOMMU is refcounted by every nesting child in the iommufd layer
 - The nesting child holds a pointer to both the S2 and the VIOMMU
   (viommu optional)
 - When the nesting child attaches to a device the STE will source the
   VMID from the VIOMMU if present otherwise from the S2
 - "RID" attach (ie naked S2) will have to be done with a Nesting
   Child using a vSTE that indicates Identity. Then the attach logic
   will have enough information to get the VMID from the VIOMMU
 - In full VIOMMU mode the S2 will never get a VMID of its own, it
   will always use the VIOMMU. Life cycle is simple, the VMID is freed
   when the VIOMMU is freed. That can't happen until all Nesting
   Children are freed. That can't happen until all Nesting Children
   are detached from devices. Detaching removes the HW touch of the VMID.

At this point you don't need the full generality, but let's please get
ready and get the viommu pointer available in all the right spots and
we can keep the current logic to borrow the VMID from the S2 for the
VIOMMU.

AMD folks, please consider if this works for you as well.

Jason
Nicolin Chen May 13, 2024, 4:33 a.m. UTC | #2
On Sun, May 12, 2024 at 11:27:45AM -0300, Jason Gunthorpe wrote:
> On Fri, Apr 12, 2024 at 08:47:02PM -0700, Nicolin Chen wrote:
> > +	viommu = iommu_dev->ops->viommu_alloc(idev->dev, cmd->type,
> > +					      hwpt_paging->common.domain);
> > +	if (IS_ERR(viommu)) {
> > +		rc = PTR_ERR(viommu);
> > +		goto out_put_hwpt;
> > +	}
> 
> Ah you did already include the S2, So should it be
> domain->viommu_alloc() then?

We can do that. In that case, the VIOMMU_ALLOC ioctl should be
simply per S2 HWPT too v.s. per IDEV.

> > +
> > +	/* iommufd_object_finalize will store the viommu->obj.id */
> > +	rc = xa_alloc(&ucmd->ictx->objects, &viommu->obj.id, XA_ZERO_ENTRY,
> > +		      xa_limit_31b, GFP_KERNEL_ACCOUNT);
> > +	if (rc)
> > +		goto out_free;
> > +
> > +	viommu->obj.type = IOMMUFD_OBJ_VIOMMU;
> 
> See my other notes, lets try not to open code this.

Ack.

> > +	viommu->type = cmd->type;
> > +
> > +	viommu->ictx = ucmd->ictx;
> > +	viommu->hwpt = hwpt_paging;
> > +	viommu->iommu_dev = idev->dev->iommu->iommu_dev;
> > +	cmd->out_viommu_id = viommu->obj.id;
> > +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> > +	if (rc)
> > +		goto out_erase_xa;
> > +	iommufd_object_finalize(ucmd->ictx, &viommu->obj);
> > +	refcount_inc(&viommu->hwpt->common.obj.users);
> > +	goto out_put_hwpt;
> > +
> > +out_erase_xa:
> > +	xa_erase(&ucmd->ictx->objects, viommu->obj.id);
> > +out_free:
> > +	if (viommu->ops && viommu->ops->free)
> > +		viommu->ops->free(viommu);
> > +	kfree(viommu);
> 
> This really should use the abort flow. The driver free callback has to
> be in the object release..

Yea, with the original object allocator, we probably can do abort().

> > +
> > +/**
> > + * enum iommu_viommu_type - VIOMMU Type
> > + * @IOMMU_VIOMMU_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for SMMUv3
> > + */
> > +enum iommu_viommu_type {
> > +	IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
> > +};
> 
> At least the 241 line should be in a following patch

It's for the "enum iommu_viommu_type" mentioned in the following
structure. Yi told me that you don't like an empty enum, and he
did something like this in HWPT_INVALIDATE series:
https://lore.kernel.org/linux-iommu/20240111041015.47920-3-yi.l.liu@intel.com/

> > +/**
> > + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
> > + * @size: sizeof(struct iommu_viommu_alloc)
> > + * @flags: Must be 0
> > + * @type: Type of the VIOMMU object. Must be defined in enum iommu_viommu_type
> > + * @dev_id: The device to allocate this virtual IOMMU for
> > + * @hwpt_id: ID of a nested parent HWPT
> > + * @out_viommu_id: Output virtual IOMMU ID for the allocated object
> > + *
> > + * Allocate an virtual IOMMU object that holds a (shared) nested parent HWPT
> > + */
> > +struct iommu_viommu_alloc {
> > +	__u32 size;
> > +	__u32 flags;
> > +	__u32 type;
> > +	__u32 dev_id;
> > +	__u32 hwpt_id;
> > +	__u32 out_viommu_id;
> > +};
> 
> This seems fine.
> 
> Let's have a following patch to change the hwpt_alloc to accept the
> viommu as a hwpt as a uAPI change as well. 
> 
> The more I think about how this needs to work the more sure I am that
> we need to do that.
> 
> ARM will need a fairly tricky set of things to manage the VMID
> lifecycle. In BTM mode the VMID must come from the KVM. For vcmdq the
> VMID is needed to create the queue/viommu. For AMD the S2 is needed to
> create the VIOMMU in the first place.
> 
> So, to make this all work perfectly we need approx the following
>  - S2 sharing across instances in ARM - meaning the VMID is allocated
>    at attach not domain alloc
>  - S2 hwpt is refcounted by the VIOMMU in the iommufd layer
>  - VIOMMU is refcounted by every nesting child in the iommufd layer
>  - The nesting child holds a pointer to both the S2 and the VIOMMU
>    (viommu optional)
>  - When the nesting child attaches to a device the STE will source the
>    VMID from the VIOMMU if present otherwise from the S2
>  - "RID" attach (ie naked S2) will have to be done with a Nesting
>    Child using a vSTE that indicates Identity. Then the attach logic
>    will have enough information to get the VMID from the VIOMMU

What is this RID attach (naked S2) case? S1DSS_BYPASS + SVA?

>  - In full VIOMMU mode the S2 will never get a VMID of its own, it
>    will always use the VIOMMU. Life cycle is simple, the VMID is freed
>    when the VIOMMU is freed. That can't happen until all Nesting
>    Children are freed. That can't happen until all Nesting Children
>    are detached from devices. Detaching removes the HW touch of the VMID.

So, each VM will have one S2 HWPT/domain/iopt, but each VM can
have multiple VIOMMU instances sharing that single S2 HWPT, and
each VIOMMU instance (in the SMMU driver at least) holds a vmid.

This seems to be a quite clear big picture now!

> At this point you don't need the full generality, but let's please get
> ready and get the viommu pointer available in all the right spots and
> we can keep the current logic to borrow the VMID from the S2 for the
> VIOMMU.

Yea. Will try as much as I can.

Thanks
Nicolin
Jason Gunthorpe May 14, 2024, 3:38 p.m. UTC | #3
> > > +
> > > +/**
> > > + * enum iommu_viommu_type - VIOMMU Type
> > > + * @IOMMU_VIOMMU_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for SMMUv3
> > > + */
> > > +enum iommu_viommu_type {
> > > +	IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
> > > +};
> > 
> > At least the 241 line should be in a following patch
> 
> It's for the "enum iommu_viommu_type" mentioned in the following
> structure. Yi told me that you don't like an empty enum, and he
> did something like this in HWPT_INVALIDATE series:
> https://lore.kernel.org/linux-iommu/20240111041015.47920-3-yi.l.liu@intel.com/

I suspect 0 should be reserved as a non-set value for some
basic sanity in all these driver type enums.

Jason

> > So, to make this all work perfectly we need approx the following
> >  - S2 sharing across instances in ARM - meaning the VMID is allocated
> >    at attach not domain alloc
> >  - S2 hwpt is refcounted by the VIOMMU in the iommufd layer
> >  - VIOMMU is refcounted by every nesting child in the iommufd layer
> >  - The nesting child holds a pointer to both the S2 and the VIOMMU
> >    (viommu optional)
> >  - When the nesting child attaches to a device the STE will source the
> >    VMID from the VIOMMU if present otherwise from the S2
> >  - "RID" attach (ie naked S2) will have to be done with a Nesting
> >    Child using a vSTE that indicates Identity. Then the attach logic
> >    will have enough information to get the VMID from the VIOMMU
> 
> What is this RID attach (naked S2) case? S1DSS_BYPASS + SVA?

No, when the guest installs a vSTE that simply says bypass with no CD
table pointer. That should result in a pSTE that is the S2 with no CD
pointer.

I was originally thinking that the VMM would simply directly attach
the S2 HWPT in this case, but given the above issue with the VMID lifetime
it makes more sense to 'attach' the viommu which holds the correct
VMID. 

The issue with direct attach the S2 HWPT is the VMID lifetime, as it
would have to borrow the VMID from the viommu but then the lifetime
becomes more complex as it has to live beyond VIOMMU destruction. Not
unsolvable but it seems easier to just avoid it entirely.

> >  - In full VIOMMU mode the S2 will never get a VMID of its own, it
> >    will always use the VIOMMU. Life cycle is simple, the VMID is freed
> >    when the VIOMMU is freed. That can't happen until all Nesting
> >    Children are freed. That can't happen until all Nesting Children
> >    are detached from devices. Detaching removes the HW touch of the VMID.
> 
> So, each VM will have one S2 HWPT/domain/iopt, but each VM can
> have multiple VIOMMU instances sharing that single S2 HWPT, and
> each VIOMMU instance (in the SMMU driver at least) holds a vmid.

Yes, right. We really want to share the S2 across instances in the end
and I have made the VMID per-instance along with the per-instance
ASID. So the above sounds like it could work

Jason
Nicolin Chen May 15, 2024, 1:20 a.m. UTC | #4
On Tue, May 14, 2024 at 12:38:57PM -0300, Jason Gunthorpe wrote:
> > > > +
> > > > +/**
> > > > + * enum iommu_viommu_type - VIOMMU Type
> > > > + * @IOMMU_VIOMMU_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for SMMUv3
> > > > + */
> > > > +enum iommu_viommu_type {
> > > > +	IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
> > > > +};
> > > 
> > > At least the 241 line should be in a following patch
> > 
> > It's for the "enum iommu_viommu_type" mentioned in the following
> > structure. Yi told me that you don't like an empty enum, and he
> > did something like this in HWPT_INVALIDATE series:
> > https://lore.kernel.org/linux-iommu/20240111041015.47920-3-yi.l.liu@intel.com/
> 
> I suspect 0 should be reserved as a non-set value for some
> basic sanity in all these driver type enums.

We have an IOMMU_HWPT_DATA_NONE for HWPT_ALLOC to compatible
with an S2 hwpt, since it doesn't need a data.

Maybe we can have an IOMMU_VIOMMU_TYPE_DEFAULT to be 0, for
an IOMMU driver (e.g. VT-d) that doesn't need to handle nor
be aware of any viommu object?

So, VMM can have a unified "attach-to-viommu" practice with
different IOMMUs, v.s. some still doing "attach-to-s2"?

> > > So, to make this all work perfectly we need approx the following
> > >  - S2 sharing across instances in ARM - meaning the VMID is allocated
> > >    at attach not domain alloc
> > >  - S2 hwpt is refcounted by the VIOMMU in the iommufd layer
> > >  - VIOMMU is refcounted by every nesting child in the iommufd layer
> > >  - The nesting child holds a pointer to both the S2 and the VIOMMU
> > >    (viommu optional)
> > >  - When the nesting child attaches to a device the STE will source the
> > >    VMID from the VIOMMU if present otherwise from the S2
> > >  - "RID" attach (ie naked S2) will have to be done with a Nesting
> > >    Child using a vSTE that indicates Identity. Then the attach logic
> > >    will have enough information to get the VMID from the VIOMMU
> > 
> > What is this RID attach (naked S2) case? S1DSS_BYPASS + SVA?
> 
> No, when the guest installs a vSTE that simply says bypass with no CD
> table pointer. That should result in a pSTE that is the S2 with no CD
> pointer.
> 
> I was originally thinking that the VMM would simply directly attach
> the S2 HWPT in this case, but given the above issue with the VMID lifetime
> it makes more sense to 'attach' the viommu which holds the correct
> VMID. 
> 
> The issue with direct attach the S2 HWPT is the VMID lifetime, as it
> would have to borrow the VMID from the viommu but then the lifetime
> becomes more complex as it has to live beyond VIOMMU destruction. Not
> unsolvable but it seems easier to just avoid it entirely.

That makes a lot of sense. I'd need to go through QEMU code and
see how we will accommodate these two more naturally: likely
the QEMU core should allocate an S2 HWPT for a VM, while the
viommu code should allocate a VIOMMU for each instance.

Thanks
Nicolin
Jason Gunthorpe May 21, 2024, 6:05 p.m. UTC | #5
On Tue, May 14, 2024 at 06:20:06PM -0700, Nicolin Chen wrote:
> > I suspect 0 should be reserved as a non-set value for some
> > basic sanity in all these driver type enums.
> 
> We have an IOMMU_HWPT_DATA_NONE for HWPT_ALLOC to compatible
> with an S2 hwpt, since it doesn't need a data.
> 
> Maybe we can have an IOMMU_VIOMMU_TYPE_DEFAULT to be 0, for
> an IOMMU driver (e.g. VT-d) that doesn't need to handle nor
> be aware of any viommu object?

Seems like a good practice, and perhaps userspace will find value in a
generic viommu object that is always present.

> That makes a lot sense. I'd need to go through QEMU code and
> see how we will accommodate these two more naturally: likely
> the QEMU core should allocate an S2 HWPT for a VM, while the
> viommu code should allocate a VIOMMU for each instance.

I'd suggest that core qemu should allocate the S2 IOAS and pass that
to the qemu viommu driver

The qemu viommu driver should create the hwpt and then the viommu and
perhaps return the viommu or hwpt back to the core code.

If the vSTE flow above is used for identity then the qemu viommu
driver would also have to go an create vSTEs for identity and attach
them to all devices before the VM starts up. Then when the OS
activates the SMMU it would have to mirror the real vSTE from guest
memory to the kernel.

Not sure there is value in having the core qemu code directly access
the hwpt/viommu?

Jason
Nicolin Chen May 22, 2024, 12:13 a.m. UTC | #6
On Tue, May 21, 2024 at 03:05:55PM -0300, Jason Gunthorpe wrote:
> On Tue, May 14, 2024 at 06:20:06PM -0700, Nicolin Chen wrote:
> > > I suspect 0 should be reserved as a non-set value for some
> > > basic sanity in all these driver type enums.
> > 
> > We have an IOMMU_HWPT_DATA_NONE for HWPT_ALLOC to compatible
> > with an S2 hwpt, since it doesn't need a data.
> > 
> > Maybe we can have an IOMMU_VIOMMU_TYPE_DEFAULT to be 0, for
> > an IOMMU driver (e.g. VT-d) that doesn't need to handle nor
> > be aware of any viommu object?
> 
> Seems like a good practice, and perhaps userspace will find value in a
> generic viommu object that is always present.

Yea. VMM is always allowed to create a viommu to wrap an S2
HWPT. Then, I assume iommufd in this case should allocate a
viommu object if !domain_ops->viommu_alloc.

> > That makes a lot sense. I'd need to go through QEMU code and
> > see how we will accommodate these two more naturally: likely
> > the QEMU core should allocate an S2 HWPT for a VM, while the
> > viommu code should allocate a VIOMMU for each instance.
> 
> I'd suggest that core qemu should allocate the S2 IOAS and pass that
> to the qemu viommu driver
>
> The qemu viommu driver should create the hwpt and then the viommu and
> perhaps return the viommu or hwpt back to the core code.
>
> If the vSTE flow above is used for identity then the qemu viommu
> driver would also have to go and create vSTEs for identity and attach
> them to all devices before the VM starts up. Then when the OS
> activates the SMMU it would have to mirror the real vSTE from guest
> memory to the kernel.

The entire flow makes sense to me.

> Not sure there is value in having the core qemu code directly access
> the hwpt/viommu?

I think so, though there might be some complication here.

On one side, it may not be straightforward for a qemu viommu
driver to hold a shared S2 hwpt, as the driver is typically
per instance, though I think it can keep viommu to its own.
So passing the S2 hwpt back to qemu core and tie to iommufd
handler (ictx) makes sense.

On the other side, there can be some future HW potentially
supporting two+ kinds of IO page tables so a VM may have two+
S2 hwpts? Then the core would hold a list of S2 hwpts and the
viommu driver would need to try-n-allocate viommu against the
list..

Thanks
Nicolin
Jason Gunthorpe May 22, 2024, 4:46 p.m. UTC | #7
On Tue, May 21, 2024 at 05:13:50PM -0700, Nicolin Chen wrote:
> Yea. VMM is always allowed to create a viommu to wrap an S2
> HWPT. Then, I assume iommufd in this case should allocate a
> viommu object if !domain_ops->viommu_alloc.

Yeah

> On one side, it may not be straightforward for a qemu viommu
> driver to hold a shared S2 hwpt, as the driver is typically
> per instance, though I think it can keep viommu to its own.
> So passing the S2 hwpt back to qemu core and tie to iommufd
> handler (ictx) makes sense.

Yes, qemu will need some per-driver-type but not per-instance storage
to make this work. Ie the ARM per-driver-type shared storage would
hold the ARM specific list of S2 hwpts.

> On the other side, there can be some future HW potentially
> supporting two+ kinds of IO page tables so a VM may have two+
> S2 hwpts? Then the core would hold a list of S2 hwpts and the
> viommu driver would need to try-n-allocate viommu against the
> list..

Yes, it is supported in the API. Userspace should try to create
viommus with all the S2 hwpts available and build a new one if it
can't, just like hwpt attachment to a device.

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index eccc565ed38e..ae90b4493109 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -424,6 +424,9 @@  void iopt_remove_access(struct io_pagetable *iopt,
 			u32 iopt_access_list_id);
 void iommufd_access_destroy_object(struct iommufd_object *obj);
 
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_viommu_destroy(struct iommufd_object *obj);
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 5187942b375d..9de7e3e63ce4 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -323,6 +323,7 @@  union ucmd_buffer {
 	struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
 	struct iommu_ioas_alloc alloc;
 	struct iommu_ioas_allow_iovas allow_iovas;
+	struct iommu_viommu_alloc viommu;
 	struct iommu_ioas_copy ioas_copy;
 	struct iommu_ioas_iova_ranges iova_ranges;
 	struct iommu_ioas_map map;
@@ -378,6 +379,8 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 val64),
 	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
 		 __reserved),
+	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
+		 struct iommu_viommu_alloc, out_viommu_id),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
@@ -510,6 +513,9 @@  static const struct iommufd_object_ops iommufd_object_ops[] = {
 		.destroy = iommufd_hwpt_nested_destroy,
 		.abort = iommufd_hwpt_nested_abort,
 	},
+	[IOMMUFD_OBJ_VIOMMU] = {
+		.destroy = iommufd_viommu_destroy,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 3886b1dd1f13..079e0ff79942 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -19,3 +19,86 @@ 
 	}
 
 viommu_struct_alloc(viommu);
+
+void iommufd_viommu_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_viommu *viommu =
+		container_of(obj, struct iommufd_viommu, obj);
+
+	if (viommu->ops && viommu->ops->free)
+		viommu->ops->free(viommu);
+	refcount_dec(&viommu->hwpt->common.obj.users);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_viommu_alloc *cmd = ucmd->cmd;
+	struct iommufd_hwpt_paging *hwpt_paging;
+	struct iommu_device *iommu_dev;
+	struct iommufd_viommu *viommu;
+	struct iommufd_device *idev;
+	int rc;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	idev = iommufd_get_device(ucmd, cmd->dev_id);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+	iommu_dev = idev->dev->iommu->iommu_dev;
+
+	if (!iommu_dev->ops->viommu_alloc) {
+		rc = -EOPNOTSUPP;
+		goto out_put_idev;
+	}
+
+	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+	if (IS_ERR(hwpt_paging)) {
+		rc = PTR_ERR(hwpt_paging);
+		goto out_put_idev;
+	}
+
+	if (!hwpt_paging->nest_parent) {
+		rc = -EINVAL;
+		goto out_put_hwpt;
+	}
+
+	viommu = iommu_dev->ops->viommu_alloc(idev->dev, cmd->type,
+					      hwpt_paging->common.domain);
+	if (IS_ERR(viommu)) {
+		rc = PTR_ERR(viommu);
+		goto out_put_hwpt;
+	}
+
+	/* iommufd_object_finalize will store the viommu->obj.id */
+	rc = xa_alloc(&ucmd->ictx->objects, &viommu->obj.id, XA_ZERO_ENTRY,
+		      xa_limit_31b, GFP_KERNEL_ACCOUNT);
+	if (rc)
+		goto out_free;
+
+	viommu->obj.type = IOMMUFD_OBJ_VIOMMU;
+	viommu->type = cmd->type;
+
+	viommu->ictx = ucmd->ictx;
+	viommu->hwpt = hwpt_paging;
+	viommu->iommu_dev = idev->dev->iommu->iommu_dev;
+	cmd->out_viommu_id = viommu->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_erase_xa;
+	iommufd_object_finalize(ucmd->ictx, &viommu->obj);
+	refcount_inc(&viommu->hwpt->common.obj.users);
+	goto out_put_hwpt;
+
+out_erase_xa:
+	xa_erase(&ucmd->ictx->objects, viommu->obj.id);
+out_free:
+	if (viommu->ops && viommu->ops->free)
+		viommu->ops->free(viommu);
+	kfree(viommu);
+out_put_hwpt:
+	iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
+out_put_idev:
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+	return rc;
+}
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 650acfac307a..dec10c6bb261 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -28,6 +28,7 @@  enum iommufd_object_type {
 	IOMMUFD_OBJ_HWPT_NESTED,
 	IOMMUFD_OBJ_IOAS,
 	IOMMUFD_OBJ_ACCESS,
+	IOMMUFD_OBJ_VIOMMU,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 1dfeaa2e649e..2b0825d69846 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -50,6 +50,7 @@  enum {
 	IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
 	IOMMUFD_CMD_HWPT_INVALIDATE,
+	IOMMUFD_CMD_VIOMMU_ALLOC,
 };
 
 /**
@@ -692,4 +693,33 @@  struct iommu_hwpt_invalidate {
 	__u32 __reserved;
 };
 #define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE)
+
+/**
+ * enum iommu_viommu_type - VIOMMU Type
+ * @IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for SMMUv3
+ */
+enum iommu_viommu_type {
+	IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+};
+
+/**
+ * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
+ * @size: sizeof(struct iommu_viommu_alloc)
+ * @flags: Must be 0
+ * @type: Type of the VIOMMU object. Must be defined in enum iommu_viommu_type
+ * @dev_id: The device to allocate this virtual IOMMU for
+ * @hwpt_id: ID of a nested parent HWPT
+ * @out_viommu_id: Output virtual IOMMU ID for the allocated object
+ *
+ * Allocate a virtual IOMMU object that holds a (shared) nested parent HWPT
+ */
+struct iommu_viommu_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 type;
+	__u32 dev_id;
+	__u32 hwpt_id;
+	__u32 out_viommu_id;
+};
+#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
 #endif