@@ -2408,6 +2408,32 @@ an implementation for these despite the in kernel acceleration.
This capability is always enabled.
+4.87 KVM_CREATE_SPAPR_TCE_IOMMU
+
+Capability: KVM_CAP_SPAPR_TCE_IOMMU
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_iommu (in)
+Returns: file descriptor for the new TCE table on success, -1 on error
+
+struct kvm_create_spapr_tce_iommu {
+ __u64 liobn;
+ __u32 fd;
+ __u32 flags;
+};
+
+This creates a link between an IOMMU group and a hardware TCE
+(translation control entry) table. The link lets the host kernel
+know which IOMMU group (and therefore which TCE table) to use for
+the LIOBN passed with the H_PUT_TCE, H_PUT_TCE_INDIRECT and
+H_STUFF_TCE hypercalls.
+
+User space passes a VFIO group fd. Using the external user VFIO API,
+KVM tries to obtain the IOMMU ID from the passed fd. If this succeeds,
+in-kernel acceleration is enabled. If it fails, map/unmap requests
+are passed up to user space.
+
+No flags are currently supported.
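+
+A sketch of the intended use (the fd names, LIOBN value and group
+path are illustrative; error handling is omitted):
+
+	struct kvm_create_spapr_tce_iommu args;
+
+	args.liobn = liobn;	/* LIOBN of the guest DMA window */
+	args.fd = group_fd;	/* e.g. open("/dev/vfio/26", O_RDWR) */
+	args.flags = 0;
+
+	if (ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_IOMMU, &args) < 0)
+		/* fall back to user space TCE handling */;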
+
+
5. The kvm_run structure
------------------------
@@ -181,6 +181,8 @@ struct kvmppc_spapr_tce_table {
struct kvm *kvm;
u64 liobn;
u32 window_size;
+ struct iommu_group *grp; /* used for IOMMU groups */
+ struct vfio_group *vfio_grp; /* used for IOMMU groups */
struct page *pages[0];
};
@@ -612,6 +614,7 @@ struct kvm_vcpu_arch {
u64 busy_preempt;
unsigned long *tce_tmp_hpas; /* TCE cache for TCE_PUT_INDIRECT hcall */
+ unsigned long tce_tmp_num; /* Number of handled TCEs in the cache */
enum {
TCERM_NONE,
TCERM_GETPAGE,
@@ -133,6 +133,8 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
+extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
+ struct kvm_create_spapr_tce_iommu *args);
extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
struct kvm_vcpu *vcpu, unsigned long liobn);
extern long kvmppc_tce_validate(unsigned long tce);
@@ -319,6 +319,13 @@ struct kvm_create_spapr_tce {
__u32 window_size;
};
+/* for KVM_CAP_SPAPR_TCE_IOMMU */
+struct kvm_create_spapr_tce_iommu {
+ __u64 liobn;
+ __u32 fd;
+ __u32 flags;
+};
+
/* for KVM_ALLOCATE_RMA */
struct kvm_allocate_rma {
__u64 rma_size;
@@ -27,6 +27,10 @@
#include <linux/hugetlb.h>
#include <linux/list.h>
#include <linux/anon_inodes.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/vfio.h>
#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
@@ -42,6 +46,53 @@
#define ERROR_ADDR ((void *)~(unsigned long)0x0)
+/*
+ * Dynamically resolved wrappers around the external user VFIO API.
+ *
+ * As IOMMU group access control is implemented by VFIO, VFIO provides
+ * an API to verify that a specific process may own a group. Since KVM
+ * may run when the VFIO module is not loaded, KVM is not linked
+ * statically against VFIO; these wrappers resolve the symbols at run
+ * time instead.
+ */
+struct vfio_group *kvmppc_vfio_group_get_external_user(struct file *filep)
+{
+ struct vfio_group *ret;
+ struct vfio_group * (*proc)(struct file *) =
+ symbol_get(vfio_group_get_external_user);
+ if (!proc)
+ return NULL;
+
+ ret = proc(filep);
+ symbol_put(vfio_group_get_external_user);
+
+ return ret;
+}
+
+void kvmppc_vfio_group_put_external_user(struct vfio_group *group)
+{
+ void (*proc)(struct vfio_group *) =
+ symbol_get(vfio_group_put_external_user);
+ if (!proc)
+ return;
+
+ proc(group);
+ symbol_put(vfio_group_put_external_user);
+}
+
+int kvmppc_vfio_external_user_iommu_id(struct vfio_group *group)
+{
+ int ret;
+ int (*proc)(struct vfio_group *) =
+ symbol_get(vfio_external_user_iommu_id);
+ if (!proc)
+ return -EINVAL;
+
+ ret = proc(group);
+ symbol_put(vfio_external_user_iommu_id);
+
+ return ret;
+}
+
static long kvmppc_stt_npages(unsigned long window_size)
{
return ALIGN((window_size >> SPAPR_TCE_SHIFT)
@@ -55,8 +106,15 @@ static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
mutex_lock(&kvm->lock);
list_del(&stt->list);
- for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
- __free_page(stt->pages[i]);
+
+ if (stt->grp) {
+ if (stt->vfio_grp)
+ kvmppc_vfio_group_put_external_user(stt->vfio_grp);
+ iommu_group_put(stt->grp);
+	} else {
+		for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+			__free_page(stt->pages[i]);
+	}
+
kfree(stt);
mutex_unlock(&kvm->lock);
@@ -152,9 +210,90 @@ fail:
return ret;
}
-/* Converts guest physical address to host virtual address */
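+/* Same release hook as for emulated TCE tables; it drops the group refs too */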
+static const struct file_operations kvm_spapr_tce_iommu_fops = {
+ .release = kvm_spapr_tce_release,
+};
+
+long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
+ struct kvm_create_spapr_tce_iommu *args)
+{
+ struct kvmppc_spapr_tce_table *tt = NULL;
+ struct iommu_group *grp;
+ struct iommu_table *tbl;
+ struct file *vfio_filp;
+ struct vfio_group *vfio_grp;
+ int ret = 0, iommu_id;
+
+ /* Check this LIOBN hasn't been previously registered */
+ list_for_each_entry(tt, &kvm->arch.spapr_tce_tables, list) {
+ if (tt->liobn == args->liobn)
+ return -EBUSY;
+ }
+
+ vfio_filp = fget(args->fd);
+ if (!vfio_filp)
+ return -ENXIO;
+
+	/* Lock the group; fails if the group is not viable or VFIO is absent */
+	ret = -EPERM;
+	vfio_grp = kvmppc_vfio_group_get_external_user(vfio_filp);
+	if (IS_ERR_OR_NULL(vfio_grp))
+		goto fput_exit;
+
+	/* Get the IOMMU ID, then find the iommu_group and iommu_table */
+	ret = -ENXIO;
+	iommu_id = kvmppc_vfio_external_user_iommu_id(vfio_grp);
+	if (iommu_id < 0)
+		goto grpput_fput_exit;
+
+	grp = iommu_group_get_by_id(iommu_id);
+	if (!grp)
+		goto grpput_fput_exit;
+
+	tbl = iommu_group_get_iommudata(grp);
+	if (!tbl)
+		goto iommuput_grpput_fput_exit;
+
+	/* Create a TCE table descriptor */
+	ret = -ENOMEM;
+	tt = kzalloc(sizeof(*tt), GFP_KERNEL);
+	if (!tt)
+		goto iommuput_grpput_fput_exit;
+
+ tt->liobn = args->liobn;
+ kvm_get_kvm(kvm);
+ tt->kvm = kvm;
+ tt->grp = grp;
+ tt->window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+ tt->vfio_grp = vfio_grp;
+
+ /* Create an inode to provide automatic cleanup upon exit */
+ ret = anon_inode_getfd("kvm-spapr-tce-iommu",
+ &kvm_spapr_tce_iommu_fops, tt, O_RDWR);
+	if (ret < 0)
+		goto free_iommuput_grpput_fput_exit;
+
+ /* Add the TCE table descriptor to the descriptor list */
+ mutex_lock(&kvm->lock);
+ list_add(&tt->list, &kvm->arch.spapr_tce_tables);
+ mutex_unlock(&kvm->lock);
+
+ goto fput_exit;
+
+free_iommuput_grpput_fput_exit:
+	kfree(tt);
+iommuput_grpput_fput_exit:
+	iommu_group_put(grp);
+grpput_fput_exit:
+	kvmppc_vfio_group_put_external_user(vfio_grp);
+fput_exit:
+	fput(vfio_filp);
+
+ return ret;
+}
+
+/*
+ * Converts guest physical address to host virtual address.
+ * Also returns the host physical address which is to be put into
+ * the TCE table.
+ */
static void __user *kvmppc_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
- unsigned long gpa, struct page **pg)
+ unsigned long gpa, struct page **pg, unsigned long *phpa)
{
unsigned long hva, gfn = gpa >> PAGE_SHIFT;
struct kvm_memory_slot *memslot;
@@ -169,9 +308,140 @@ static void __user *kvmppc_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
if (get_user_pages_fast(hva & PAGE_MASK, 1, is_write, pg) != 1)
return ERROR_ADDR;
+ if (phpa)
+ *phpa = __pa((unsigned long) page_address(*pg)) |
+ (hva & ~PAGE_MASK);
+
return (void *) hva;
}
+long kvmppc_h_put_tce_iommu(struct kvm_vcpu *vcpu,
+ struct kvmppc_spapr_tce_table *tt,
+ unsigned long liobn, unsigned long ioba,
+ unsigned long tce)
+{
+ struct page *pg = NULL;
+ unsigned long hpa;
+ void __user *hva;
+ struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
+
+ if (!tbl)
+ return H_RESCINDED;
+
+ /* Clear TCE */
+ if (!(tce & (TCE_PCI_READ | TCE_PCI_WRITE))) {
+ if (iommu_tce_clear_param_check(tbl, ioba, 0, 1))
+ return H_PARAMETER;
+
+ if (iommu_free_tces(tbl, ioba >> IOMMU_PAGE_SHIFT,
+ 1, false))
+ return H_HARDWARE;
+
+ return H_SUCCESS;
+ }
+
+ /* Put TCE */
+ if (vcpu->arch.tce_rm_fail != TCERM_NONE) {
+ /* Retry iommu_tce_build if it failed in real mode */
+ vcpu->arch.tce_rm_fail = TCERM_NONE;
+ hpa = vcpu->arch.tce_tmp_hpas[0];
+ } else {
+ if (iommu_tce_put_param_check(tbl, ioba, tce))
+ return H_PARAMETER;
+
+ hva = kvmppc_gpa_to_hva_and_get(vcpu, tce, &pg, &hpa);
+ if (hva == ERROR_ADDR)
+ return H_HARDWARE;
+ }
+
+ if (!iommu_tce_build(tbl, ioba >> IOMMU_PAGE_SHIFT, &hpa, 1, false))
+ return H_SUCCESS;
+
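+	/* iommu_tce_build() failed, undo the page reference taken above */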
+ pg = pfn_to_page(hpa >> PAGE_SHIFT);
+ if (pg)
+ put_page(pg);
+
+ return H_HARDWARE;
+}
+
+static long kvmppc_h_put_tce_indirect_iommu(struct kvm_vcpu *vcpu,
+ struct kvmppc_spapr_tce_table *tt, unsigned long ioba,
+ unsigned long __user *tces, unsigned long npages)
+{
+ long i = 0, start = 0;
+ struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
+
+ if (!tbl)
+ return H_RESCINDED;
+
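+	/*
+	 * The real mode handler may have done part of the work already;
+	 * tce_rm_fail says where it stopped so we can resume from there.
+	 */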
+ switch (vcpu->arch.tce_rm_fail) {
+ case TCERM_NONE:
+ break;
+ case TCERM_GETPAGE:
+ start = vcpu->arch.tce_tmp_num;
+ break;
+ case TCERM_PUTTCE:
+ goto put_tces;
+ case TCERM_PUTLIST:
+ default:
+ WARN_ON(1);
+ return H_HARDWARE;
+ }
+
+ for (i = start; i < npages; ++i) {
+ struct page *pg = NULL;
+ unsigned long gpa;
+ void __user *hva;
+
+		if (get_user(gpa, tces + i))
+			goto putpages_flush_exit;
+
+ if (iommu_tce_put_param_check(tbl, ioba +
+ (i << IOMMU_PAGE_SHIFT), gpa))
+ return H_PARAMETER;
+
+ hva = kvmppc_gpa_to_hva_and_get(vcpu, gpa, &pg,
+ &vcpu->arch.tce_tmp_hpas[i]);
+ if (hva == ERROR_ADDR)
+ goto putpages_flush_exit;
+ }
+
+put_tces:
+ if (!iommu_tce_build(tbl, ioba >> IOMMU_PAGE_SHIFT,
+ vcpu->arch.tce_tmp_hpas, npages, false))
+ return H_SUCCESS;
+
+putpages_flush_exit:
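+	/* Drop the page references taken above */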
+	for (--i; i >= 0; --i) {
+		struct page *pg;
+
+		pg = pfn_to_page(vcpu->arch.tce_tmp_hpas[i] >> PAGE_SHIFT);
+ if (pg)
+ put_page(pg);
+ }
+
+ return H_HARDWARE;
+}
+
+long kvmppc_h_stuff_tce_iommu(struct kvm_vcpu *vcpu,
+ struct kvmppc_spapr_tce_table *tt,
+ unsigned long liobn, unsigned long ioba,
+ unsigned long tce_value, unsigned long npages)
+{
+ struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
+ unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
+
+ if (!tbl)
+ return H_RESCINDED;
+
+ if (iommu_tce_clear_param_check(tbl, ioba, tce_value, npages))
+ return H_PARAMETER;
+
+ if (iommu_free_tces(tbl, entry, npages, false))
+ return H_HARDWARE;
+
+ return H_SUCCESS;
+}
+
long kvmppc_h_put_tce(struct kvm_vcpu *vcpu,
unsigned long liobn, unsigned long ioba,
unsigned long tce)
@@ -183,6 +453,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu,
if (!tt)
return H_TOO_HARD;
+ if (tt->grp)
+ return kvmppc_h_put_tce_iommu(vcpu, tt, liobn, ioba, tce);
+
+ /* Emulated IO */
if (ioba >= tt->window_size)
return H_PARAMETER;
@@ -221,13 +495,20 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
return H_PARAMETER;
- tces = kvmppc_gpa_to_hva_and_get(vcpu, tce_list, &pg);
+ tces = kvmppc_gpa_to_hva_and_get(vcpu, tce_list, &pg, NULL);
if (tces == ERROR_ADDR)
return H_TOO_HARD;
if (vcpu->arch.tce_rm_fail == TCERM_PUTLIST)
goto put_list_page_exit;
+ if (tt->grp) {
+ ret = kvmppc_h_put_tce_indirect_iommu(vcpu,
+ tt, ioba, tces, npages);
+ goto put_list_page_exit;
+ }
+
+ /* Emulated IO */
for (i = 0; i < npages; ++i) {
if (get_user(vcpu->arch.tce_tmp_hpas[i], tces + i)) {
ret = H_PARAMETER;
@@ -266,6 +547,11 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
if (!tt)
return H_TOO_HARD;
+ if (tt->grp)
+ return kvmppc_h_stuff_tce_iommu(vcpu, tt, liobn, ioba,
+ tce_value, npages);
+
+ /* Emulated IO */
if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
return H_PARAMETER;
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/list.h>
+#include <linux/iommu.h>
#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
@@ -188,6 +189,111 @@ static unsigned long kvmppc_rm_gpa_to_hpa_and_get(struct kvm_vcpu *vcpu,
return hpa;
}
+static long kvmppc_rm_h_put_tce_iommu(struct kvm_vcpu *vcpu,
+ struct kvmppc_spapr_tce_table *tt, unsigned long liobn,
+ unsigned long ioba, unsigned long tce)
+{
+ int ret = 0;
+ struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
+ unsigned long hpa;
+ struct page *pg = NULL;
+
+ if (!tbl)
+ return H_RESCINDED;
+
+ /* Clear TCE */
+ if (!(tce & (TCE_PCI_READ | TCE_PCI_WRITE))) {
+ if (iommu_tce_clear_param_check(tbl, ioba, 0, 1))
+ return H_PARAMETER;
+
+ if (iommu_free_tces(tbl, ioba >> IOMMU_PAGE_SHIFT, 1, true))
+ return H_TOO_HARD;
+
+ return H_SUCCESS;
+ }
+
+ /* Put TCE */
+ if (iommu_tce_put_param_check(tbl, ioba, tce))
+ return H_PARAMETER;
+
+	hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tce, &pg);
+	if (hpa != ERROR_ADDR)
+		ret = iommu_tce_build(tbl, ioba >> IOMMU_PAGE_SHIFT,
+				&hpa, 1, true);
+
+	if ((hpa == ERROR_ADDR) || ret) {
+		/* Let the virtual mode handler retry or finish the update */
+		vcpu->arch.tce_tmp_hpas[0] = hpa;
+		vcpu->arch.tce_tmp_num = 0;
+		vcpu->arch.tce_rm_fail = pg ? TCERM_PUTTCE : TCERM_NONE;
+		return H_TOO_HARD;
+	}
+
+ return H_SUCCESS;
+}
+
+static long kvmppc_rm_h_put_tce_indirect_iommu(struct kvm_vcpu *vcpu,
+ struct kvmppc_spapr_tce_table *tt, unsigned long ioba,
+ unsigned long *tces, unsigned long npages)
+{
+ int i, ret;
+ unsigned long hpa;
+ struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
+ struct page *pg = NULL;
+
+ if (!tbl)
+ return H_RESCINDED;
+
+ /* Check all TCEs */
+ for (i = 0; i < npages; ++i) {
+ if (iommu_tce_put_param_check(tbl, ioba +
+ (i << IOMMU_PAGE_SHIFT), tces[i]))
+ return H_PARAMETER;
+ }
+
+	/* Translate TCEs and take page references */
+ for (i = 0; i < npages; ++i) {
+ hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tces[i], &pg);
+ if (hpa == ERROR_ADDR) {
+ vcpu->arch.tce_tmp_num = i;
+ vcpu->arch.tce_rm_fail = TCERM_GETPAGE;
+ return H_TOO_HARD;
+ }
+ vcpu->arch.tce_tmp_hpas[i] = hpa;
+ }
+
+ /* Put TCEs to the table */
+ ret = iommu_tce_build(tbl, (ioba >> IOMMU_PAGE_SHIFT),
+ vcpu->arch.tce_tmp_hpas, npages, true);
+ if (ret == -EAGAIN) {
+ vcpu->arch.tce_rm_fail = TCERM_PUTTCE;
+ return H_TOO_HARD;
+ } else if (ret) {
+ return H_HARDWARE;
+ }
+
+ return H_SUCCESS;
+}
+
+static long kvmppc_rm_h_stuff_tce_iommu(struct kvm_vcpu *vcpu,
+ struct kvmppc_spapr_tce_table *tt,
+ unsigned long liobn, unsigned long ioba,
+ unsigned long tce_value, unsigned long npages)
+{
+ struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
+
+ if (!tbl)
+ return H_RESCINDED;
+
+ if (iommu_tce_clear_param_check(tbl, ioba, tce_value, npages))
+ return H_PARAMETER;
+
+ if (iommu_free_tces(tbl, ioba >> IOMMU_PAGE_SHIFT, npages, true))
+ return H_TOO_HARD;
+
+ return H_SUCCESS;
+}
+
long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
unsigned long ioba, unsigned long tce)
{
@@ -198,6 +304,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
if (!tt)
return H_TOO_HARD;
+ if (tt->grp)
+ return kvmppc_rm_h_put_tce_iommu(vcpu, tt, liobn, ioba, tce);
+
+ /* Emulated IO */
if (ioba >= tt->window_size)
return H_PARAMETER;
@@ -240,6 +350,13 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
goto put_unlock_exit;
}
+ if (tt->grp) {
+ ret = kvmppc_rm_h_put_tce_indirect_iommu(vcpu,
+ tt, ioba, (unsigned long *)tces, npages);
+ goto put_unlock_exit;
+ }
+
+ /* Emulated IO */
for (i = 0; i < npages; ++i) {
ret = kvmppc_tce_validate(((unsigned long *)tces)[i]);
if (ret)
@@ -270,6 +387,11 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
if (!tt)
return H_TOO_HARD;
+ if (tt->grp)
+ return kvmppc_rm_h_stuff_tce_iommu(vcpu, tt, liobn, ioba,
+ tce_value, npages);
+
+ /* Emulated IO */
if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
return H_PARAMETER;
@@ -395,6 +395,7 @@ int kvm_dev_ioctl_check_extension(long ext)
r = 1;
break;
case KVM_CAP_SPAPR_MULTITCE:
+ case KVM_CAP_SPAPR_TCE_IOMMU:
r = 1;
break;
#endif
@@ -1025,6 +1026,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
goto out;
}
+ case KVM_CREATE_SPAPR_TCE_IOMMU: {
+ struct kvm_create_spapr_tce_iommu create_tce_iommu;
+ struct kvm *kvm = filp->private_data;
+
+ r = -EFAULT;
+ if (copy_from_user(&create_tce_iommu, argp,
+ sizeof(create_tce_iommu)))
+ goto out;
+ r = kvm_vm_ioctl_create_spapr_tce_iommu(kvm, &create_tce_iommu);
+ goto out;
+ }
#endif /* CONFIG_PPC_BOOK3S_64 */
#ifdef CONFIG_KVM_BOOK3S_64_HV