Message ID | 20220114203849.243657-5-mjrosato@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | s390x/pci: zPCI interpretation support | expand |
On 14/01/2022 21.38, Matthew Rosato wrote: > Use the associated vfio feature ioctl to enable interpretation for devices > when requested. As part of this process, we must use the host function > handle rather than a QEMU-generated one -- this is provided as part of the > ioctl payload. > > Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> > --- > hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++- > hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++- > hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++ > include/hw/s390x/s390-pci-bus.h | 1 + > include/hw/s390x/s390-pci-vfio.h | 15 +++++++ > 5 files changed, 199 insertions(+), 2 deletions(-) > > diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c > index 01b58ebc70..a39ccfee05 100644 > --- a/hw/s390x/s390-pci-bus.c > +++ b/hw/s390x/s390-pci-bus.c > @@ -971,12 +971,58 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr) > } > } > > +static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev) > +{ > + uint32_t idx; > + int rc; > + > + rc = s390_pci_probe_interp(pbdev); > + if (rc) { > + return rc; > + } > + > + rc = s390_pci_update_passthrough_fh(pbdev); > + if (rc) { > + return rc; > + } > + > + /* > + * The host device is already in an enabled state, but we always present > + * the initial device state to the guest as disabled (ZPCI_FS_DISABLED). > + * Therefore, mask off the enable bit from the passthrough handle until > + * the guest issues a CLP SET PCI FN later to enable the device. > + */ > + pbdev->fh &= ~FH_MASK_ENABLE; > + > + /* Next, see if the idx is already in-use */ > + idx = pbdev->fh & FH_MASK_INDEX; > + if (pbdev->idx != idx) { > + if (s390_pci_find_dev_by_idx(s, idx)) { > + return -EINVAL; > + } > + /* > + * Update the idx entry with the passed through idx > + * If the relinquished idx is lower than next_idx, use it > + * to replace next_idx > + */ > + g_hash_table_remove(s->zpci_table, &pbdev->idx); > + if (idx < s->next_idx) { > + s->next_idx = idx; > + } > + pbdev->idx = idx; > + g_hash_table_insert(s->zpci_table, &pbdev->idx, pbdev); > + } > + > + return 0; > +} > + > static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, > Error **errp) > { > S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev); > PCIDevice *pdev = NULL; > S390PCIBusDevice *pbdev = NULL; > + int rc; > > if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { > PCIBridge *pb = PCI_BRIDGE(dev); > @@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, > set_pbdev_info(pbdev); > > if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { > - pbdev->fh |= FH_SHM_VFIO; > + /* > + * By default, interpretation is always requested; if the available > + * facilities indicate it is not available, fallback to the > + * intercept model. > + */ > + if (pbdev->interp && !s390_has_feat(S390_FEAT_ZPCI_INTERP)) { > + DPRINTF("zPCI interpretation facilities missing.\n"); > + pbdev->interp = false; > + } Wrong indentation in the above three lines. > + if (pbdev->interp) { > + rc = s390_pci_interp_plug(s, pbdev); > + if (rc) { > + error_setg(errp, "zpci interp plug failed: %d", rc); The error message is a little bit scarce for something that might be presented to the user - maybe write at least "interpretation" instead of "interp" ? > + return; > + } > + } > pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); > /* Fill in CLP information passed via the vfio region */ > s390_pci_get_clp_info(pbdev); > + if (!pbdev->interp) { > + /* Do vfio passthrough but intercept for I/O */ > + pbdev->fh |= FH_SHM_VFIO; > + } > } else { > pbdev->fh |= FH_SHM_EMUL; > + /* Always intercept emulated devices */ > + pbdev->interp = false; > } Thomas
On 14/01/2022 21.38, Matthew Rosato wrote: > Use the associated vfio feature ioctl to enable interpretation for devices > when requested. As part of this process, we must use the host function > handle rather than a QEMU-generated one -- this is provided as part of the > ioctl payload. > > Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> > --- > hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++- > hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++- > hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++ > include/hw/s390x/s390-pci-bus.h | 1 + > include/hw/s390x/s390-pci-vfio.h | 15 +++++++ > 5 files changed, 199 insertions(+), 2 deletions(-) > > diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c > index 01b58ebc70..a39ccfee05 100644 > --- a/hw/s390x/s390-pci-bus.c > +++ b/hw/s390x/s390-pci-bus.c ... > @@ -1360,6 +1427,7 @@ static Property s390_pci_device_properties[] = { > DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED), > DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid), > DEFINE_PROP_STRING("target", S390PCIBusDevice, target), > + DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true), > DEFINE_PROP_END_OF_LIST(), > }; Since this is something that the user can see, would it maybe make sense to provide a full word instead of an abbreviation here? I.e. "interpret" or "interpretation" instead of "interp" ? Thomas
On 1/17/22 9:51 AM, Thomas Huth wrote: > On 14/01/2022 21.38, Matthew Rosato wrote: ... >> static void s390_pcihost_plug(HotplugHandler *hotplug_dev, >> DeviceState *dev, >> Error **errp) >> { >> S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev); >> PCIDevice *pdev = NULL; >> S390PCIBusDevice *pbdev = NULL; >> + int rc; >> if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { >> PCIBridge *pb = PCI_BRIDGE(dev); >> @@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler >> *hotplug_dev, DeviceState *dev, >> set_pbdev_info(pbdev); >> if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { >> - pbdev->fh |= FH_SHM_VFIO; >> + /* >> + * By default, interpretation is always requested; if the >> available >> + * facilities indicate it is not available, fallback to the >> + * intercept model. >> + */ >> + if (pbdev->interp && >> !s390_has_feat(S390_FEAT_ZPCI_INTERP)) { >> + DPRINTF("zPCI interpretation facilities >> missing.\n"); >> + pbdev->interp = false; >> + } > > Wrong indentation in the above three lines. Thanks > >> + if (pbdev->interp) { >> + rc = s390_pci_interp_plug(s, pbdev); >> + if (rc) { >> + error_setg(errp, "zpci interp plug failed: %d", rc); > > The error message is a little bit scarce for something that might be > presented to the user - maybe write at least "interpretation" instead of > "interp" ? > Good point, I'll re-word to something like "Plug failed for zPCI device in interpretation mode: %d" >> + return; >> + } >> + } >> pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, >> pbdev); >> /* Fill in CLP information passed via the vfio region */ >> s390_pci_get_clp_info(pbdev); >> + if (!pbdev->interp) { >> + /* Do vfio passthrough but intercept for I/O */ >> + pbdev->fh |= FH_SHM_VFIO; >> + } >> } else { >> pbdev->fh |= FH_SHM_EMUL; >> + /* Always intercept emulated devices */ >> + pbdev->interp = false; >> } > > Thomas >
On 1/17/22 10:38 AM, Thomas Huth wrote: > On 14/01/2022 21.38, Matthew Rosato wrote: >> Use the associated vfio feature ioctl to enable interpretation for >> devices >> when requested. As part of this process, we must use the host function >> handle rather than a QEMU-generated one -- this is provided as part of >> the >> ioctl payload. >> >> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> >> --- >> hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++- >> hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++- >> hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++ >> include/hw/s390x/s390-pci-bus.h | 1 + >> include/hw/s390x/s390-pci-vfio.h | 15 +++++++ >> 5 files changed, 199 insertions(+), 2 deletions(-) >> >> diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c >> index 01b58ebc70..a39ccfee05 100644 >> --- a/hw/s390x/s390-pci-bus.c >> +++ b/hw/s390x/s390-pci-bus.c > ... >> @@ -1360,6 +1427,7 @@ static Property s390_pci_device_properties[] = { >> DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED), >> DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid), >> DEFINE_PROP_STRING("target", S390PCIBusDevice, target), >> + DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true), >> DEFINE_PROP_END_OF_LIST(), >> }; > > Since this is something that the user can see, would it maybe make sense > to provide a full word instead of an abbreviation here? I.e. "interpret" > or "interpretation" instead of "interp" ? I'll go with "interpret" unless someone else has a strong opinion on it.
On 1/14/22 21:38, Matthew Rosato wrote: > Use the associated vfio feature ioctl to enable interpretation for devices > when requested. As part of this process, we must use the host function > handle rather than a QEMU-generated one -- this is provided as part of the > ioctl payload. > > Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> > --- > hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++- > hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++- > hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++ > include/hw/s390x/s390-pci-bus.h | 1 + > include/hw/s390x/s390-pci-vfio.h | 15 +++++++ > 5 files changed, 199 insertions(+), 2 deletions(-) > > diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c > index 01b58ebc70..a39ccfee05 100644 > --- a/hw/s390x/s390-pci-bus.c > +++ b/hw/s390x/s390-pci-bus.c > @@ -971,12 +971,58 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr) > } > } > ...snip... > diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h > index ff708aef50..42533e38f7 100644 > --- a/include/hw/s390x/s390-pci-vfio.h > +++ b/include/hw/s390x/s390-pci-vfio.h > @@ -20,6 +20,9 @@ bool s390_pci_update_dma_avail(int fd, unsigned int *avail); > S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, > S390PCIBusDevice *pbdev); > void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt); > +int s390_pci_probe_interp(S390PCIBusDevice *pbdev); > +int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable); > +int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev); > void s390_pci_get_clp_info(S390PCIBusDevice *pbdev); > #else > static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) > @@ -33,6 +36,18 @@ static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, > } > static inline void s390_pci_end_dma_count(S390pciState *s, > S390PCIDMACount *cnt) { } > +int s390_pci_probe_interp(S390PCIBusDevice *pbdev) static inline ? > +{ > + return -EINVAL; > +} > +static inline int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable) > +{ > + return -EINVAL; > +} > +static inline int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev) > +{ > + return -EINVAL; > +} > static inline void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) { } > #endif > >
On 1/14/22 21:38, Matthew Rosato wrote: > Use the associated vfio feature ioctl to enable interpretation for devices > when requested. As part of this process, we must use the host function > handle rather than a QEMU-generated one -- this is provided as part of the > ioctl payload. I wonder if we should not explain here that having interpretation as a default and silently fall back to interception allows backward compatibility while allowing performence be chosing by default. (You can say it better as I do :) ) > > Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> > --- > hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++- > hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++- > hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++ > include/hw/s390x/s390-pci-bus.h | 1 + > include/hw/s390x/s390-pci-vfio.h | 15 +++++++ > 5 files changed, 199 insertions(+), 2 deletions(-) > > diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c > index 01b58ebc70..a39ccfee05 100644 > --- a/hw/s390x/s390-pci-bus.c > +++ b/hw/s390x/s390-pci-bus.c > @@ -971,12 +971,58 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr) > } > } > > +static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev) > +{ > + uint32_t idx; > + int rc; > + > + rc = s390_pci_probe_interp(pbdev); > + if (rc) { > + return rc; > + } > + > + rc = s390_pci_update_passthrough_fh(pbdev); > + if (rc) { > + return rc; > + } > + > + /* > + * The host device is already in an enabled state, but we always present > + * the initial device state to the guest as disabled (ZPCI_FS_DISABLED). > + * Therefore, mask off the enable bit from the passthrough handle until > + * the guest issues a CLP SET PCI FN later to enable the device. > + */ > + pbdev->fh &= ~FH_MASK_ENABLE; > + > + /* Next, see if the idx is already in-use */ > + idx = pbdev->fh & FH_MASK_INDEX; > + if (pbdev->idx != idx) { > + if (s390_pci_find_dev_by_idx(s, idx)) { > + return -EINVAL; > + } > + /* > + * Update the idx entry with the passed through idx > + * If the relinquished idx is lower than next_idx, use it > + * to replace next_idx > + */ > + g_hash_table_remove(s->zpci_table, &pbdev->idx); > + if (idx < s->next_idx) { > + s->next_idx = idx; > + } > + pbdev->idx = idx; > + g_hash_table_insert(s->zpci_table, &pbdev->idx, pbdev); > + } > + > + return 0; > +} > + > static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, > Error **errp) > { > S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev); > PCIDevice *pdev = NULL; > S390PCIBusDevice *pbdev = NULL; > + int rc; > > if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { > PCIBridge *pb = PCI_BRIDGE(dev); > @@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, > set_pbdev_info(pbdev); > > if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { > - pbdev->fh |= FH_SHM_VFIO; > + /* > + * By default, interpretation is always requested; if the available > + * facilities indicate it is not available, fallback to the > + * intercept model. s/intercept/interception/ ? > + */ > + if (pbdev->interp && !s390_has_feat(S390_FEAT_ZPCI_INTERP)) { > + DPRINTF("zPCI interpretation facilities missing.\n"); > + pbdev->interp = false; > + } > + if (pbdev->interp) { > + rc = s390_pci_interp_plug(s, pbdev); > + if (rc) { > + error_setg(errp, "zpci interp plug failed: %d", rc); > + return; > + } > + } Can't we rearrange that as if (pbdev->interp) { if (s390_has_feat) { } else { } } > pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); > /* Fill in CLP information passed via the vfio region */ > s390_pci_get_clp_info(pbdev); > + if (!pbdev->interp) { > + /* Do vfio passthrough but intercept for I/O */ > + pbdev->fh |= FH_SHM_VFIO; > + } > } else { > pbdev->fh |= FH_SHM_EMUL; > + /* Always intercept emulated devices */ > + pbdev->interp = false; > } > > if (s390_pci_msix_init(pbdev)) { > @@ -1360,6 +1427,7 @@ static Property s390_pci_device_properties[] = { > DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED), > DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid), > DEFINE_PROP_STRING("target", S390PCIBusDevice, target), > + DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true), > DEFINE_PROP_END_OF_LIST(), > }; > > diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c > index 6d400d4147..e9a0dc12e4 100644 > --- a/hw/s390x/s390-pci-inst.c > +++ b/hw/s390x/s390-pci-inst.c > @@ -18,6 +18,7 @@ > #include "sysemu/hw_accel.h" > #include "hw/s390x/s390-pci-inst.h" > #include "hw/s390x/s390-pci-bus.h" > +#include "hw/s390x/s390-pci-vfio.h" > #include "hw/s390x/tod.h" > > #ifndef DEBUG_S390PCI_INST > @@ -156,6 +157,47 @@ out: > return rc; > } > > +static int clp_enable_interp(S390PCIBusDevice *pbdev) > +{ > + int rc; > + > + rc = s390_pci_set_interp(pbdev, true); > + if (rc) { > + DPRINTF("Failed to enable interpretation\n"); > + return rc; > + } > + rc = s390_pci_update_passthrough_fh(pbdev); > + if (rc) { > + DPRINTF("Failed to update passthrough fh\n"); > + return rc; > + } > + if (!(pbdev->fh & FH_MASK_ENABLE)) { > + DPRINTF("Passthrough handle is not enabled\n"); > + return -EINVAL; > + } > + > + return 0; > +} > + > +static int clp_disable_interp(S390PCIBusDevice *pbdev) > +{ > + int rc; > + > + rc = s390_pci_set_interp(pbdev, false); > + if (rc) { > + DPRINTF("Failed to disable interpretation\n"); > + return rc; > + } > + > + rc = s390_pci_update_passthrough_fh(pbdev); > + if (rc) { > + DPRINTF("Failed to update passthrough fh\n"); > + return rc; > + } > + > + return 0; > +} > + > int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) > { > ClpReqHdr *reqh; > @@ -246,7 +288,19 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) > goto out; > } > > - pbdev->fh |= FH_MASK_ENABLE; > + /* > + * If interpretation is specified, attempt to enable this now and > + * update with the host fh > + */ > + if (pbdev->interp) { > + if (clp_enable_interp(pbdev)) { > + stw_p(&ressetpci->hdr.rsp, CLP_RC_SETPCIFN_ERR); > + goto out; > + } > + } else { > + pbdev->fh |= FH_MASK_ENABLE; > + } > + > pbdev->state = ZPCI_FS_ENABLED; > stl_p(&ressetpci->fh, pbdev->fh); > stw_p(&ressetpci->hdr.rsp, CLP_RC_OK); > @@ -257,6 +311,13 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) > goto out; > } > device_legacy_reset(DEVICE(pbdev)); > + if (pbdev->interp) { > + if (clp_disable_interp(pbdev)) { > + stw_p(&ressetpci->hdr.rsp, CLP_RC_SETPCIFN_ERR); > + goto out; > + } > + } > + /* Mask off the enabled bit for interpreted devices too */ > pbdev->fh &= ~FH_MASK_ENABLE; > pbdev->state = ZPCI_FS_DISABLED; > stl_p(&ressetpci->fh, pbdev->fh); > diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c > index 6f80a47e29..2cab3a9e89 100644 > --- a/hw/s390x/s390-pci-vfio.c > +++ b/hw/s390x/s390-pci-vfio.c > @@ -97,6 +97,58 @@ void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) > } > } > > +int s390_pci_probe_interp(S390PCIBusDevice *pbdev) > +{ > + VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev); > + struct vfio_device_feature feat = { > + .argsz = sizeof(struct vfio_device_feature), > + .flags = VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_ZPCI_INTERP > + }; > + > + return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, &feat); > +} > + > +int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable) > +{ > + VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev); > + struct vfio_device_zpci_interp *data; > + int size = sizeof(struct vfio_device_feature) + sizeof(*data); > + g_autofree struct vfio_device_feature *feat = g_malloc0(size); > + > + feat->argsz = size; > + feat->flags = VFIO_DEVICE_FEATURE_SET + VFIO_DEVICE_FEATURE_ZPCI_INTERP; > + > + data = (struct vfio_device_zpci_interp *)&feat->data; > + if (enable) { > + data->flags = VFIO_DEVICE_ZPCI_FLAG_INTERP; > + } else { > + data->flags = 0; > + } > + > + return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat); > +} > + > +int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev) > +{ > + VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev); > + struct vfio_device_zpci_interp *data; > + int size = sizeof(struct vfio_device_feature) + sizeof(*data); > + g_autofree struct vfio_device_feature *feat = g_malloc0(size); > + int rc; > + > + feat->argsz = size; > + feat->flags = VFIO_DEVICE_FEATURE_GET + VFIO_DEVICE_FEATURE_ZPCI_INTERP; > + > + rc = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat); > + if (rc) { > + return rc; > + } > + > + data = (struct vfio_device_zpci_interp *)&feat->data; > + pbdev->fh = data->fh; > + return 0; > +} > + > static void s390_pci_read_base(S390PCIBusDevice *pbdev, > struct vfio_device_info *info) > { > diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h > index da3cde2bb4..a9843dfe97 100644 > --- a/include/hw/s390x/s390-pci-bus.h > +++ b/include/hw/s390x/s390-pci-bus.h > @@ -350,6 +350,7 @@ struct S390PCIBusDevice { > IndAddr *indicator; > bool pci_unplug_request_processed; > bool unplug_requested; > + bool interp; > QTAILQ_ENTRY(S390PCIBusDevice) link; > }; > > diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h > index ff708aef50..42533e38f7 100644 > --- a/include/hw/s390x/s390-pci-vfio.h > +++ b/include/hw/s390x/s390-pci-vfio.h > @@ -20,6 +20,9 @@ bool s390_pci_update_dma_avail(int fd, unsigned int *avail); > S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, > S390PCIBusDevice *pbdev); > void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt); > +int s390_pci_probe_interp(S390PCIBusDevice *pbdev); > +int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable); > +int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev); > void s390_pci_get_clp_info(S390PCIBusDevice *pbdev); > #else > static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) > @@ -33,6 +36,18 @@ static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, > } > static inline void s390_pci_end_dma_count(S390pciState *s, > S390PCIDMACount *cnt) { } > +int s390_pci_probe_interp(S390PCIBusDevice *pbdev) > +{ > + return -EINVAL; > +} > +static inline int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable) > +{ > + return -EINVAL; > +} > +static inline int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev) > +{ > + return -EINVAL; > +} > static inline void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) { } > #endif > > LGTM With the corrections proposed by Thomas. Mine... you see what you prefer. Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
On 1/31/22 9:46 AM, Pierre Morel wrote: > > > On 1/14/22 21:38, Matthew Rosato wrote: >> Use the associated vfio feature ioctl to enable interpretation for >> devices >> when requested. As part of this process, we must use the host function >> handle rather than a QEMU-generated one -- this is provided as part of >> the >> ioctl payload. > > I wonder if we should not explain here that having interpretation as a > default and silently fall back to interception allows backward > compatibility while allowing performence be chosing by default. > (You can say it better as I do :) ) Good suggestion, I'll think of something to add to the commit message. >> @@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler >> *hotplug_dev, DeviceState *dev, >> set_pbdev_info(pbdev); >> if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { >> - pbdev->fh |= FH_SHM_VFIO; >> + /* >> + * By default, interpretation is always requested; if the >> available >> + * facilities indicate it is not available, fallback to the >> + * intercept model. > > s/intercept/interception/ ? > OK >> + */ >> + if (pbdev->interp && >> !s390_has_feat(S390_FEAT_ZPCI_INTERP)) { >> + DPRINTF("zPCI interpretation facilities >> missing.\n"); >> + pbdev->interp = false; >> + } >> + if (pbdev->interp) { >> + rc = s390_pci_interp_plug(s, pbdev); >> + if (rc) { >> + error_setg(errp, "zpci interp plug failed: %d", rc); >> + return; >> + } >> + } > > > Can't we rearrange that as > if (pbdev->interp) { > if (s390_has_feat) { > } else { > } > } Yep, sure ... > > LGTM > With the corrections proposed by Thomas. > Mine... you see what you prefer. > > Reviewed-by: Pierre Morel <pmorel@linux.ibm.com> Thanks!
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 01b58ebc70..a39ccfee05 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -971,12 +971,58 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr) } } +static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev) +{ + uint32_t idx; + int rc; + + rc = s390_pci_probe_interp(pbdev); + if (rc) { + return rc; + } + + rc = s390_pci_update_passthrough_fh(pbdev); + if (rc) { + return rc; + } + + /* + * The host device is already in an enabled state, but we always present + * the initial device state to the guest as disabled (ZPCI_FS_DISABLED). + * Therefore, mask off the enable bit from the passthrough handle until + * the guest issues a CLP SET PCI FN later to enable the device. + */ + pbdev->fh &= ~FH_MASK_ENABLE; + + /* Next, see if the idx is already in-use */ + idx = pbdev->fh & FH_MASK_INDEX; + if (pbdev->idx != idx) { + if (s390_pci_find_dev_by_idx(s, idx)) { + return -EINVAL; + } + /* + * Update the idx entry with the passed through idx + * If the relinquished idx is lower than next_idx, use it + * to replace next_idx + */ + g_hash_table_remove(s->zpci_table, &pbdev->idx); + if (idx < s->next_idx) { + s->next_idx = idx; + } + pbdev->idx = idx; + g_hash_table_insert(s->zpci_table, &pbdev->idx, pbdev); + } + + return 0; +} + static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev); PCIDevice *pdev = NULL; S390PCIBusDevice *pbdev = NULL; + int rc; if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { PCIBridge *pb = PCI_BRIDGE(dev); @@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, set_pbdev_info(pbdev); if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { - pbdev->fh |= FH_SHM_VFIO; + /* + * By default, interpretation is always requested; if the available + * facilities indicate it is not available, fallback to the + * intercept model. + */ + if (pbdev->interp && !s390_has_feat(S390_FEAT_ZPCI_INTERP)) { + DPRINTF("zPCI interpretation facilities missing.\n"); + pbdev->interp = false; + } + if (pbdev->interp) { + rc = s390_pci_interp_plug(s, pbdev); + if (rc) { + error_setg(errp, "zpci interp plug failed: %d", rc); + return; + } + } pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); /* Fill in CLP information passed via the vfio region */ s390_pci_get_clp_info(pbdev); + if (!pbdev->interp) { + /* Do vfio passthrough but intercept for I/O */ + pbdev->fh |= FH_SHM_VFIO; + } } else { pbdev->fh |= FH_SHM_EMUL; + /* Always intercept emulated devices */ + pbdev->interp = false; } if (s390_pci_msix_init(pbdev)) { @@ -1360,6 +1427,7 @@ static Property s390_pci_device_properties[] = { DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED), DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid), DEFINE_PROP_STRING("target", S390PCIBusDevice, target), + DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 6d400d4147..e9a0dc12e4 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -18,6 +18,7 @@ #include "sysemu/hw_accel.h" #include "hw/s390x/s390-pci-inst.h" #include "hw/s390x/s390-pci-bus.h" +#include "hw/s390x/s390-pci-vfio.h" #include "hw/s390x/tod.h" #ifndef DEBUG_S390PCI_INST @@ -156,6 +157,47 @@ out: return rc; } +static int clp_enable_interp(S390PCIBusDevice *pbdev) +{ + int rc; + + rc = s390_pci_set_interp(pbdev, true); + if (rc) { + DPRINTF("Failed to enable interpretation\n"); + return rc; + } + rc = s390_pci_update_passthrough_fh(pbdev); + if (rc) { + DPRINTF("Failed to update passthrough fh\n"); + return rc; + } + if (!(pbdev->fh & FH_MASK_ENABLE)) { + DPRINTF("Passthrough handle is not enabled\n"); + return -EINVAL; + } + + return 0; +} + +static int clp_disable_interp(S390PCIBusDevice *pbdev) +{ + int rc; + + rc = s390_pci_set_interp(pbdev, false); + if (rc) { + DPRINTF("Failed to disable interpretation\n"); + return rc; + } + + rc = s390_pci_update_passthrough_fh(pbdev); + if (rc) { + DPRINTF("Failed to update passthrough fh\n"); + return rc; + } + + return 0; +} + int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) { ClpReqHdr *reqh; @@ -246,7 +288,19 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) goto out; } - pbdev->fh |= FH_MASK_ENABLE; + /* + * If interpretation is specified, attempt to enable this now and + * update with the host fh + */ + if (pbdev->interp) { + if (clp_enable_interp(pbdev)) { + stw_p(&ressetpci->hdr.rsp, CLP_RC_SETPCIFN_ERR); + goto out; + } + } else { + pbdev->fh |= FH_MASK_ENABLE; + } + pbdev->state = ZPCI_FS_ENABLED; stl_p(&ressetpci->fh, pbdev->fh); stw_p(&ressetpci->hdr.rsp, CLP_RC_OK); @@ -257,6 +311,13 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) goto out; } device_legacy_reset(DEVICE(pbdev)); + if (pbdev->interp) { + if (clp_disable_interp(pbdev)) { + stw_p(&ressetpci->hdr.rsp, CLP_RC_SETPCIFN_ERR); + goto out; + } + } + /* Mask off the enabled bit for interpreted devices too */ pbdev->fh &= ~FH_MASK_ENABLE; pbdev->state = ZPCI_FS_DISABLED; stl_p(&ressetpci->fh, pbdev->fh); diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c index 6f80a47e29..2cab3a9e89 100644 --- a/hw/s390x/s390-pci-vfio.c +++ b/hw/s390x/s390-pci-vfio.c @@ -97,6 +97,58 @@ void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) } } +int s390_pci_probe_interp(S390PCIBusDevice *pbdev) +{ + VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev); + struct vfio_device_feature feat = { + .argsz = sizeof(struct vfio_device_feature), + .flags = VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_ZPCI_INTERP + }; + + return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, &feat); +} + +int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable) +{ + VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev); + struct vfio_device_zpci_interp *data; + int size = sizeof(struct vfio_device_feature) + sizeof(*data); + g_autofree struct vfio_device_feature *feat = g_malloc0(size); + + feat->argsz = size; + feat->flags = VFIO_DEVICE_FEATURE_SET + VFIO_DEVICE_FEATURE_ZPCI_INTERP; + + data = (struct vfio_device_zpci_interp *)&feat->data; + if (enable) { + data->flags = VFIO_DEVICE_ZPCI_FLAG_INTERP; + } else { + data->flags = 0; + } + + return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat); +} + +int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev) +{ + VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev); + struct vfio_device_zpci_interp *data; + int size = sizeof(struct vfio_device_feature) + sizeof(*data); + g_autofree struct vfio_device_feature *feat = g_malloc0(size); + int rc; + + feat->argsz = size; + feat->flags = VFIO_DEVICE_FEATURE_GET + VFIO_DEVICE_FEATURE_ZPCI_INTERP; + + rc = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat); + if (rc) { + return rc; + } + + data = (struct vfio_device_zpci_interp *)&feat->data; + pbdev->fh = data->fh; + return 0; +} + static void s390_pci_read_base(S390PCIBusDevice *pbdev, struct vfio_device_info *info) { diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h index da3cde2bb4..a9843dfe97 100644 --- a/include/hw/s390x/s390-pci-bus.h +++ b/include/hw/s390x/s390-pci-bus.h @@ -350,6 +350,7 @@ struct S390PCIBusDevice { IndAddr *indicator; bool pci_unplug_request_processed; bool unplug_requested; + bool interp; QTAILQ_ENTRY(S390PCIBusDevice) link; }; diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h index ff708aef50..42533e38f7 100644 --- a/include/hw/s390x/s390-pci-vfio.h +++ b/include/hw/s390x/s390-pci-vfio.h @@ -20,6 +20,9 @@ bool s390_pci_update_dma_avail(int fd, unsigned int *avail); S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, S390PCIBusDevice *pbdev); void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt); +int s390_pci_probe_interp(S390PCIBusDevice *pbdev); +int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable); +int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev); void s390_pci_get_clp_info(S390PCIBusDevice *pbdev); #else static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) @@ -33,6 +36,18 @@ static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, } static inline void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) { } +int s390_pci_probe_interp(S390PCIBusDevice *pbdev) +{ + return -EINVAL; +} +static inline int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable) +{ + return -EINVAL; +} +static inline int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev) +{ + return -EINVAL; +} static inline void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) { } #endif
Use the associated vfio feature ioctl to enable interpretation for devices when requested. As part of this process, we must use the host function handle rather than a QEMU-generated one -- this is provided as part of the ioctl payload. Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> --- hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++- hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++- hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++ include/hw/s390x/s390-pci-bus.h | 1 + include/hw/s390x/s390-pci-vfio.h | 15 +++++++ 5 files changed, 199 insertions(+), 2 deletions(-)