Message ID | 1422965498-11500-2-git-send-email-thuth@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
On 03/02/2015 13:11, Thomas Huth wrote: > On s390, we've got to make sure to hold the IPTE lock while accessing > virtual memory. So let's add an ioctl for reading and writing virtual > memory to provide this feature for userspace, too. > > Signed-off-by: Thomas Huth <thuth@linux.vnet.ibm.com> > Reviewed-by: Dominik Dingel <dingel@linux.vnet.ibm.com> > Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com> > --- > Documentation/virtual/kvm/api.txt | 44 +++++++++++++++++++++++++ > arch/s390/kvm/gaccess.c | 22 +++++++++++++ > arch/s390/kvm/gaccess.h | 2 + > arch/s390/kvm/kvm-s390.c | 63 +++++++++++++++++++++++++++++++++++++ > include/uapi/linux/kvm.h | 21 ++++++++++++ > 5 files changed, 152 insertions(+), 0 deletions(-) > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt > index b112efc..bf44b53 100644 > --- a/Documentation/virtual/kvm/api.txt > +++ b/Documentation/virtual/kvm/api.txt > @@ -2716,6 +2716,50 @@ The fields in each entry are defined as follows: > eax, ebx, ecx, edx: the values returned by the cpuid instruction for > this function/index combination > > +4.89 KVM_GUEST_MEM_OP > + > +Capability: KVM_CAP_MEM_OP Put "virtual" somewhere in the ioctl name and capability? > +Architectures: s390 > +Type: vcpu ioctl > +Parameters: struct kvm_guest_mem_op (in) > +Returns: = 0 on success, > + < 0 on generic error (e.g. -EFAULT or -ENOMEM), > + > 0 if an exception occurred while walking the page tables > + > +Read or write data from/to the virtual memory of a VPCU. > + > +Parameters are specified via the following structure: > + > +struct kvm_guest_mem_op { > + __u64 gaddr; /* the guest address */ > + __u64 flags; /* arch specific flags */ > + __u32 size; /* amount of bytes */ > + __u32 op; /* type of operation */ > + __u64 buf; /* buffer in userspace */ > + __u8 reserved[32]; /* should be set to 0 */ > +}; > + > +The type of operation is specified in the "op" field, either KVM_MEMOP_VIRTREAD > +for reading from memory, KVM_MEMOP_VIRTWRITE for writing to memory, or > +KVM_MEMOP_CHECKVIRTREAD or KVM_MEMOP_CHECKVIRTWRITE to check whether the Better: #define KVM_MEMOP_READ 0 #define KVM_MEMOP_WRITE 1 and in the flags field: #define KVM_MEMOP_F_CHECK_ONLY (1 << 1) > +corresponding memory access would create an access exception (without > +changing the data in the memory at the destination). In case an access > +exception occurred while walking the MMU tables of the guest, the ioctl > +returns a positive error number to indicate the type of exception. The > +exception is raised directly at the corresponding VCPU if the bit > +KVM_MEMOP_F_INJECT_EXC is set in the "flags" field. KVM_MEMOP_F_INJECT_EXCEPTION. > +The logical (virtual) start address of the memory region has to be specified > +in the "gaddr" field, and the length of the region in the "size" field. > +"buf" is the buffer supplied by the userspace application where the read data > +should be written to for KVM_MEMOP_VIRTREAD, or where the data that should > +be written is stored for a KVM_MEMOP_VIRTWRITE. "buf" can be NULL for both > +CHECK operations. "buf" is unused and can be NULL for both CHECK operations. > +The "reserved" field is meant for future extensions. It must currently be > +set to 0. Not really true, as you don't check it. So "It is not used by KVM with the currently defined set of flags" is a better explanation. Paolo > + > 5. The kvm_run structure > ------------------------ > > diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c > index 8a1be90..d912362 100644 > --- a/arch/s390/kvm/gaccess.c > +++ b/arch/s390/kvm/gaccess.c > @@ -697,6 +697,28 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, > } > > /** > + * check_gva_range - test a range of guest virtual addresses for accessibility > + */ > +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, > + unsigned long length, int is_write) > +{ > + unsigned long gpa; > + unsigned long currlen; > + int rc = 0; > + > + ipte_lock(vcpu); > + while (length > 0 && !rc) { > + currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE)); > + rc = guest_translate_address(vcpu, gva, &gpa, is_write); > + gva += currlen; > + length -= currlen; > + } > + ipte_unlock(vcpu); > + > + return rc; > +} > + > +/** > * kvm_s390_check_low_addr_protection - check for low-address protection > * @ga: Guest address > * > diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h > index 0149cf1..268beb7 100644 > --- a/arch/s390/kvm/gaccess.h > +++ b/arch/s390/kvm/gaccess.h > @@ -157,6 +157,8 @@ int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data, > > int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, > unsigned long *gpa, int write); > +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, > + unsigned long length, int is_write); > > int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data, > unsigned long len, int write); > diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c > index b2371c0..c80a640 100644 > --- a/arch/s390/kvm/kvm-s390.c > +++ b/arch/s390/kvm/kvm-s390.c > @@ -168,6 +168,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) > case KVM_CAP_VM_ATTRIBUTES: > case KVM_CAP_MP_STATE: > case KVM_CAP_S390_USER_SIGP: > + case KVM_CAP_MEM_OP: > r = 1; > break; > case KVM_CAP_NR_VCPUS: > @@ -1886,6 +1887,59 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, > return r; > } > > +static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, > + struct kvm_guest_mem_op *mop) > +{ > + void __user *uaddr = (void __user *)mop->buf; > + void *tmpbuf = NULL; > + int r, srcu_idx; > + > + if ((mop->flags & ~KVM_MEMOP_F_INJECT_EXC) != 0) > + return -EINVAL; > + > + if (mop->op == KVM_MEMOP_VIRTREAD || mop->op == KVM_MEMOP_VIRTWRITE) { > + if (mop->size > 16 * PAGE_SIZE) > + return -E2BIG; > + tmpbuf = vmalloc(mop->size); > + if (!tmpbuf) > + return -ENOMEM; > + } > + > + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); > + > + switch (mop->op) { > + case KVM_MEMOP_VIRTREAD: > + r = read_guest(vcpu, mop->gaddr, tmpbuf, mop->size); > + if (r == 0) { > + if (copy_to_user(uaddr, tmpbuf, mop->size)) > + r = -EFAULT; > + } > + break; > + case KVM_MEMOP_VIRTWRITE: > + if (copy_from_user(tmpbuf, uaddr, mop->size)) { > + r = -EFAULT; > + break; > + } > + r = write_guest(vcpu, mop->gaddr, tmpbuf, mop->size); > + break; > + case KVM_MEMOP_CHECKVIRTREAD: > + case KVM_MEMOP_CHECKVIRTWRITE: > + r = check_gva_range(vcpu, mop->gaddr, mop->size, > + mop->op == KVM_MEMOP_CHECKVIRTWRITE); > + break; > + default: > + r = -EINVAL; > + } > + > + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); > + > + if (r > 0 && (mop->flags & KVM_MEMOP_F_INJECT_EXC) != 0) > + kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); > + > + vfree(tmpbuf); > + return r; > +} > + > long kvm_arch_vcpu_ioctl(struct file *filp, > unsigned int ioctl, unsigned long arg) > { > @@ -1985,6 +2039,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, > r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); > break; > } > + case KVM_GUEST_MEM_OP: { > + struct kvm_guest_mem_op mem_op; > + > + if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) > + r = kvm_s390_guest_mem_op(vcpu, &mem_op); > + else > + r = -EFAULT; > + break; > + } > default: > r = -ENOTTY; > } > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 8055706..528a3d4 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -365,6 +365,24 @@ struct kvm_translation { > __u8 pad[5]; > }; > > +/* for KVM_GUEST_MEM_OP */ > +struct kvm_guest_mem_op { > + /* in */ > + __u64 gaddr; /* the guest address */ > + __u64 flags; /* arch specific flags */ > + __u32 size; /* amount of bytes */ > + __u32 op; /* type of operation */ > + __u64 buf; /* buffer in userspace */ > + __u8 reserved[32]; /* should be set to 0 */ > +}; > +/* types for kvm_guest_mem_op->op */ > +#define KVM_MEMOP_VIRTREAD 0 > +#define KVM_MEMOP_VIRTWRITE 1 > +#define KVM_MEMOP_CHECKVIRTREAD 2 > +#define KVM_MEMOP_CHECKVIRTWRITE 3 > +/* flags for kvm_guest_mem_op->flags */ > +#define KVM_MEMOP_F_INJECT_EXC 0x0001UL /* Inject exception on faults */ > + > /* for KVM_INTERRUPT */ > struct kvm_interrupt { > /* in */ > @@ -760,6 +778,7 @@ struct kvm_ppc_smmu_info { > #define KVM_CAP_PPC_ENABLE_HCALL 104 > #define KVM_CAP_CHECK_EXTENSION_VM 105 > #define KVM_CAP_S390_USER_SIGP 106 > +#define KVM_CAP_MEM_OP 107 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -1135,6 +1154,8 @@ struct kvm_s390_ucas_mapping { > #define KVM_ARM_VCPU_INIT _IOW(KVMIO, 0xae, struct kvm_vcpu_init) > #define KVM_ARM_PREFERRED_TARGET _IOR(KVMIO, 0xaf, struct kvm_vcpu_init) > #define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list) > +/* Available with KVM_CAP_MEM_OP */ > +#define KVM_GUEST_MEM_OP _IOW(KVMIO, 0xb1, struct kvm_guest_mem_op) > > #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) > #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) >
On Tue, 03 Feb 2015 14:04:43 +0100 Paolo Bonzini <pbonzini@redhat.com> wrote: > On 03/02/2015 13:11, Thomas Huth wrote: > > On s390, we've got to make sure to hold the IPTE lock while accessing > > virtual memory. So let's add an ioctl for reading and writing virtual > > memory to provide this feature for userspace, too. > > > > Signed-off-by: Thomas Huth <thuth@linux.vnet.ibm.com> > > Reviewed-by: Dominik Dingel <dingel@linux.vnet.ibm.com> > > Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com> > > --- > > Documentation/virtual/kvm/api.txt | 44 +++++++++++++++++++++++++ > > arch/s390/kvm/gaccess.c | 22 +++++++++++++ > > arch/s390/kvm/gaccess.h | 2 + > > arch/s390/kvm/kvm-s390.c | 63 +++++++++++++++++++++++++++++++++++++ > > include/uapi/linux/kvm.h | 21 ++++++++++++ > > 5 files changed, 152 insertions(+), 0 deletions(-) > > > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt > > index b112efc..bf44b53 100644 > > --- a/Documentation/virtual/kvm/api.txt > > +++ b/Documentation/virtual/kvm/api.txt > > @@ -2716,6 +2716,50 @@ The fields in each entry are defined as follows: > > eax, ebx, ecx, edx: the values returned by the cpuid instruction for > > this function/index combination > > > > +4.89 KVM_GUEST_MEM_OP > > + > > +Capability: KVM_CAP_MEM_OP > > Put "virtual" somewhere in the ioctl name and capability? Actually, I'd prefer to keep the "virtual" in the defines for the type of operation below: When it comes to s390 storage keys, we likely might need some calls for reading and writing to physical memory, too. Then we could simply extend this ioctl instead of inventing a new one. > > +Architectures: s390 > > +Type: vcpu ioctl > > +Parameters: struct kvm_guest_mem_op (in) > > +Returns: = 0 on success, > > + < 0 on generic error (e.g. -EFAULT or -ENOMEM), > > + > 0 if an exception occurred while walking the page tables > > + > > +Read or write data from/to the virtual memory of a VPCU. > > + > > +Parameters are specified via the following structure: > > + > > +struct kvm_guest_mem_op { > > + __u64 gaddr; /* the guest address */ > > + __u64 flags; /* arch specific flags */ > > + __u32 size; /* amount of bytes */ > > + __u32 op; /* type of operation */ > > + __u64 buf; /* buffer in userspace */ > > + __u8 reserved[32]; /* should be set to 0 */ > > +}; > > + > > +The type of operation is specified in the "op" field, either KVM_MEMOP_VIRTREAD > > +for reading from memory, KVM_MEMOP_VIRTWRITE for writing to memory, or > > +KVM_MEMOP_CHECKVIRTREAD or KVM_MEMOP_CHECKVIRTWRITE to check whether the > > Better: > > #define KVM_MEMOP_READ 0 > #define KVM_MEMOP_WRITE 1 > > and in the flags field: > > #define KVM_MEMOP_F_CHECK_ONLY (1 << 1) Ok, a flag for the check operations is fine for me, too. ... > > +The logical (virtual) start address of the memory region has to be specified > > +in the "gaddr" field, and the length of the region in the "size" field. > > +"buf" is the buffer supplied by the userspace application where the read data > > +should be written to for KVM_MEMOP_VIRTREAD, or where the data that should > > +be written is stored for a KVM_MEMOP_VIRTWRITE. "buf" can be NULL for both > > +CHECK operations. > > "buf" is unused and can be NULL for both CHECK operations. > > > +The "reserved" field is meant for future extensions. It must currently be > > +set to 0. > > Not really true, as you don't check it. So "It is not used by KVM with > the currently defined set of flags" is a better explanation. ok ... and maybe add "should be set to zero" ? > Paolo Thanks for the review! Thomas
On 03/02/2015 16:16, Thomas Huth wrote: > Actually, I'd prefer to keep the "virtual" in the defines for the type > of operation below: When it comes to s390 storage keys, we likely might > need some calls for reading and writing to physical memory, too. Then > we could simply extend this ioctl instead of inventing a new one. Can you explain why it is necessary to read/write physical addresses from user space? In the case of QEMU, I'm worried that you would have to invent your own memory read/write APIs that are different from everything else. On real s390 zPCI, does bus-master DMA update storage keys? >> Not really true, as you don't check it. So "It is not used by KVM with >> the currently defined set of flags" is a better explanation. > > ok ... and maybe add "should be set to zero" ? If you don't check it, it is misleading to document this. Paolo
On Tue, 03 Feb 2015 16:22:32 +0100 Paolo Bonzini <pbonzini@redhat.com> wrote: > > > On 03/02/2015 16:16, Thomas Huth wrote: > > Actually, I'd prefer to keep the "virtual" in the defines for the type > > of operation below: When it comes to s390 storage keys, we likely might > > need some calls for reading and writing to physical memory, too. Then > > we could simply extend this ioctl instead of inventing a new one. > > Can you explain why it is necessary to read/write physical addresses > from user space? In the case of QEMU, I'm worried that you would have > to invent your own memory read/write APIs that are different from > everything else. > > On real s390 zPCI, does bus-master DMA update storage keys? Ah, I was not thinking about bus-mastering/DMA here: AFAIK there are some CPU instructions that access a parameter block in physical memory, for example the SCLP instruction (see hw/s390x/sclp.c) - it's already doing a cpu_physical_memory_read and ..._write for the parameters. However, I haven't checked yet whether it is also supposed to touch the storage keys, so if not, we also might be fine without the ioctls for reading/writing to physical memory. > >> Not really true, as you don't check it. So "It is not used by KVM with > >> the currently defined set of flags" is a better explanation. > > > > ok ... and maybe add "should be set to zero" ? > > If you don't check it, it is misleading to document this. True... so I'll omit that. Thomas
Am 03.02.2015 um 16:22 schrieb Paolo Bonzini: > > > On 03/02/2015 16:16, Thomas Huth wrote: >> Actually, I'd prefer to keep the "virtual" in the defines for the type >> of operation below: When it comes to s390 storage keys, we likely might >> need some calls for reading and writing to physical memory, too. Then >> we could simply extend this ioctl instead of inventing a new one. Rereading that. Shall we replace "virtual" with "logical"? That is what is used architecturally when we mean "do whatever is appropriate right now" That can boil down to virtual via DAT, virtual via access register mode, real if DAT is off... and if fact your kernel implementation does that. > > Can you explain why it is necessary to read/write physical addresses > from user space? In the case of QEMU, I'm worried that you would have > to invent your own memory read/write APIs that are different from > everything else. > > On real s390 zPCI, does bus-master DMA update storage keys? the classic channel I/O does set the storage key change/reference and also triggers errors in the storage key protection value mismatches. The PCI IOTA structure does contain a storage key value for accesses, so I assume its the same here, but I dont know for sure. Conny: I am asking myself, if we should explicitly add a comment in the virtio-ccw spec, that all accesses are assumed to be with key 0 and thus never cause key protection. The change/reference bit is set by the underlying I/O or memory copy anyway. We can then add a ccw later on to set a different key if we ever need that. > >>> Not really true, as you don't check it. So "It is not used by KVM with >>> the currently defined set of flags" is a better explanation. >> >> ok ... and maybe add "should be set to zero" ? > > If you don't check it, it is misleading to document this. > > Paolo > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >
On 04/02/2015 09:26, Christian Borntraeger wrote: > Am 03.02.2015 um 16:22 schrieb Paolo Bonzini: >> On 03/02/2015 16:16, Thomas Huth wrote: >>> Actually, I'd prefer to keep the "virtual" in the defines for the type >>> of operation below: When it comes to s390 storage keys, we likely might >>> need some calls for reading and writing to physical memory, too. Then >>> we could simply extend this ioctl instead of inventing a new one. > > Rereading that. Shall we replace "virtual" with "logical"? That is what is > used architecturally when we mean "do whatever is appropriate right now" > That can boil down to virtual via DAT, virtual via access register mode, > real if DAT is off... and if fact your kernel implementation does that. That makes sense. >> Can you explain why it is necessary to read/write physical addresses >> from user space? In the case of QEMU, I'm worried that you would have >> to invent your own memory read/write APIs that are different from >> everything else. >> >> On real s390 zPCI, does bus-master DMA update storage keys? > > the classic channel I/O does set the storage key change/reference and > also triggers errors in the storage key protection value mismatches. > > The PCI IOTA structure does contain a storage key value for accesses, > so I assume its the same here, but I dont know for sure. Emulating that in QEMU would be very hard. Every DMA read/write would have to go through a bounce buffer, but QEMU block device models for example try hard to read from host disk directly into guest memory. > Conny: > I am asking myself, if we should explicitly add a comment in the > virtio-ccw spec, that all accesses are assumed to be with key 0 and > thus never cause key protection. The change/reference bit is set > by the underlying I/O or memory copy anyway. Can you explain the last sentence? :) Paolo > We can then add a ccw later on to set a different key if we ever need > that. > > >> >>>> Not really true, as you don't check it. So "It is not used by KVM with >>>> the currently defined set of flags" is a better explanation. >>> >>> ok ... and maybe add "should be set to zero" ? >> >> If you don't check it, it is misleading to document this. >> >> Paolo >> -- >> To unsubscribe from this list: send the line "unsubscribe kvm" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html >> > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >
On Wed, 04 Feb 2015 09:26:11 +0100 Christian Borntraeger <borntraeger@de.ibm.com> wrote: > Am 03.02.2015 um 16:22 schrieb Paolo Bonzini: > > > > > > On 03/02/2015 16:16, Thomas Huth wrote: > >> Actually, I'd prefer to keep the "virtual" in the defines for the type > >> of operation below: When it comes to s390 storage keys, we likely might > >> need some calls for reading and writing to physical memory, too. Then > >> we could simply extend this ioctl instead of inventing a new one. > > Rereading that. Shall we replace "virtual" with "logical"? That is what is > used architecturally when we mean "do whatever is appropriate right now" > That can boil down to virtual via DAT, virtual via access register mode, > real if DAT is off... and if fact your kernel implementation does that. True, but so far I tried to avoid to include s390-only wording into this ioctl in case other architectures might need such a functionality later, too (then this ioctl could be simply used there, too). OTOH, maybe the memory model on s390 is just too special to try to keep this ioctl generic ... but then I guess I also should rename the define KVM_GUEST_MEM_OP into KVM_S390_GUEST_MEM_OP, for example? Thomas
Am 04.02.2015 um 11:39 schrieb Paolo Bonzini: >> Conny: >> I am asking myself, if we should explicitly add a comment in the >> virtio-ccw spec, that all accesses are assumed to be with key 0 and >> thus never cause key protection. The change/reference bit is set >> by the underlying I/O or memory copy anyway. > > Can you explain the last sentence? :) Whenever vhost or qemu or a finished aio request wrote content into a virtio buffer, the HW has set the storage key for that physical page, which makes it automatically dirty/referenced in the guest visible storage key. For completeness sake: Now, if the guest does not use the storage key, but instead the new fault based software dirty tracking, it wont notice the change bit. The guest I/O itself when finished will mark the struct page as Dirty, just like on x86. Makes sense? Christian
On 04/02/2015 12:25, Christian Borntraeger wrote: > Whenever vhost or qemu or a finished aio request wrote content into a > virtio buffer, the HW has set the storage key for that physical page, > which makes it automatically dirty/referenced in the guest visible > storage key. Ah, I knew the storage keys were per-physical page, but I wasn't sure if they were separate for the host and the guest. That's obvious now. Can we detect non-zero storage key in emulated zPCI requests, and fail the request somehow? Paolo
Am 04.02.2015 um 12:42 schrieb Paolo Bonzini: > > > On 04/02/2015 12:25, Christian Borntraeger wrote: >> Whenever vhost or qemu or a finished aio request wrote content into a >> virtio buffer, the HW has set the storage key for that physical page, >> which makes it automatically dirty/referenced in the guest visible >> storage key. > > Ah, I knew the storage keys were per-physical page, but I wasn't sure if > they were separate for the host and the guest. That's obvious now. Just something on top: the storage key is per physical page. Just once. It contains C/R/ACC/F (change, reference, access key, fetch protection) But: there is also the pgste page table extension. That is used to separate both by doing logically ORs. The host and millicode will do the right shifting copying to keep both values separate, but when the physical storage key gets dirty, the host and the guest view is now "changed==yes" > > Can we detect non-zero storage key in emulated zPCI requests, and fail > the request somehow? Not right now. Even the kernel KVM module does not do this for emulated instructions (as Linux has always key 0). Somewhen we might want to add that capability, but its obviously not trivial for I/O like things. It would get easier if we avoid VFIO etc and just had used the hardware support, though. but as far as I can see this is not an option in QEMU. Christian
On Wed, 04 Feb 2015 09:26:11 +0100 Christian Borntraeger <borntraeger@de.ibm.com> wrote: > the classic channel I/O does set the storage key change/reference and > also triggers errors in the storage key protection value mismatches. Just a bit of background for the benefit of innocent bystanders: When classic channel I/O is initiated, the caller provides a so-called operation request block (orb) which references the actual channel program it wants to start. In this same orb, the caller also provides the storage key to be used for all memory fetches (either of the individual channel commands or of the memory they reference). qemu interpretation of channel commands currently ignores all of this. > Conny: > I am asking myself, if we should explicitly add a comment in the > virtio-ccw spec, that all accesses are assumed to be with key 0 and > thus never cause key protection. The change/reference bit is set > by the underlying I/O or memory copy anyway. > We can then add a ccw later on to set a different key if we ever need > that. I don't think we need to clarify anything for the normal channel commands used by virtio: They are treated as any other channel command wrt key protection; and the lack of key checking is a feature of qemu, not of the spec. For the memory used for the virtqueues, I agree that we should clarify that we assume key 0. I'm not sure whether there's a case for extending this in the future, but who knows. I'll probably have to open an issue against the virtio spec for this.
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index b112efc..bf44b53 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2716,6 +2716,50 @@ The fields in each entry are defined as follows: eax, ebx, ecx, edx: the values returned by the cpuid instruction for this function/index combination +4.89 KVM_GUEST_MEM_OP + +Capability: KVM_CAP_MEM_OP +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_guest_mem_op (in) +Returns: = 0 on success, + < 0 on generic error (e.g. -EFAULT or -ENOMEM), + > 0 if an exception occurred while walking the page tables + +Read or write data from/to the virtual memory of a VPCU. + +Parameters are specified via the following structure: + +struct kvm_guest_mem_op { + __u64 gaddr; /* the guest address */ + __u64 flags; /* arch specific flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + __u8 reserved[32]; /* should be set to 0 */ +}; + +The type of operation is specified in the "op" field, either KVM_MEMOP_VIRTREAD +for reading from memory, KVM_MEMOP_VIRTWRITE for writing to memory, or +KVM_MEMOP_CHECKVIRTREAD or KVM_MEMOP_CHECKVIRTWRITE to check whether the +corresponding memory access would create an access exception (without +changing the data in the memory at the destination). In case an access +exception occurred while walking the MMU tables of the guest, the ioctl +returns a positive error number to indicate the type of exception. The +exception is raised directly at the corresponding VCPU if the bit +KVM_MEMOP_F_INJECT_EXC is set in the "flags" field. + +The logical (virtual) start address of the memory region has to be specified +in the "gaddr" field, and the length of the region in the "size" field. +"buf" is the buffer supplied by the userspace application where the read data +should be written to for KVM_MEMOP_VIRTREAD, or where the data that should +be written is stored for a KVM_MEMOP_VIRTWRITE. "buf" can be NULL for both +CHECK operations. + +The "reserved" field is meant for future extensions. It must currently be +set to 0. + + 5. The kvm_run structure ------------------------ diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 8a1be90..d912362 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -697,6 +697,28 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, } /** + * check_gva_range - test a range of guest virtual addresses for accessibility + */ +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long length, int is_write) +{ + unsigned long gpa; + unsigned long currlen; + int rc = 0; + + ipte_lock(vcpu); + while (length > 0 && !rc) { + currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE)); + rc = guest_translate_address(vcpu, gva, &gpa, is_write); + gva += currlen; + length -= currlen; + } + ipte_unlock(vcpu); + + return rc; +} + +/** * kvm_s390_check_low_addr_protection - check for low-address protection * @ga: Guest address * diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 0149cf1..268beb7 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -157,6 +157,8 @@ int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data, int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, unsigned long *gpa, int write); +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long length, int is_write); int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data, unsigned long len, int write); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b2371c0..c80a640 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -168,6 +168,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: case KVM_CAP_S390_USER_SIGP: + case KVM_CAP_MEM_OP: r = 1; break; case KVM_CAP_NR_VCPUS: @@ -1886,6 +1887,59 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } +static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, + struct kvm_guest_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf = NULL; + int r, srcu_idx; + + if ((mop->flags & ~KVM_MEMOP_F_INJECT_EXC) != 0) + return -EINVAL; + + if (mop->op == KVM_MEMOP_VIRTREAD || mop->op == KVM_MEMOP_VIRTWRITE) { + if (mop->size > 16 * PAGE_SIZE) + return -E2BIG; + tmpbuf = vmalloc(mop->size); + if (!tmpbuf) + return -ENOMEM; + } + + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + switch (mop->op) { + case KVM_MEMOP_VIRTREAD: + r = read_guest(vcpu, mop->gaddr, tmpbuf, mop->size); + if (r == 0) { + if (copy_to_user(uaddr, tmpbuf, mop->size)) + r = -EFAULT; + } + break; + case KVM_MEMOP_VIRTWRITE: + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + break; + } + r = write_guest(vcpu, mop->gaddr, tmpbuf, mop->size); + break; + case KVM_MEMOP_CHECKVIRTREAD: + case KVM_MEMOP_CHECKVIRTWRITE: + r = check_gva_range(vcpu, mop->gaddr, mop->size, + mop->op == KVM_MEMOP_CHECKVIRTWRITE); + break; + default: + r = -EINVAL; + } + + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + + if (r > 0 && (mop->flags & KVM_MEMOP_F_INJECT_EXC) != 0) + kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + + vfree(tmpbuf); + return r; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1985,6 +2039,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + case KVM_GUEST_MEM_OP: { + struct kvm_guest_mem_op mem_op; + + if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) + r = kvm_s390_guest_mem_op(vcpu, &mem_op); + else + r = -EFAULT; + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8055706..528a3d4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -365,6 +365,24 @@ struct kvm_translation { __u8 pad[5]; }; +/* for KVM_GUEST_MEM_OP */ +struct kvm_guest_mem_op { + /* in */ + __u64 gaddr; /* the guest address */ + __u64 flags; /* arch specific flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + __u8 reserved[32]; /* should be set to 0 */ +}; +/* types for kvm_guest_mem_op->op */ +#define KVM_MEMOP_VIRTREAD 0 +#define KVM_MEMOP_VIRTWRITE 1 +#define KVM_MEMOP_CHECKVIRTREAD 2 +#define KVM_MEMOP_CHECKVIRTWRITE 3 +/* flags for kvm_guest_mem_op->flags */ +#define KVM_MEMOP_F_INJECT_EXC 0x0001UL /* Inject exception on faults */ + /* for KVM_INTERRUPT */ struct kvm_interrupt { /* in */ @@ -760,6 +778,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_ENABLE_HCALL 104 #define KVM_CAP_CHECK_EXTENSION_VM 105 #define KVM_CAP_S390_USER_SIGP 106 +#define KVM_CAP_MEM_OP 107 #ifdef KVM_CAP_IRQ_ROUTING @@ -1135,6 +1154,8 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_VCPU_INIT _IOW(KVMIO, 0xae, struct kvm_vcpu_init) #define KVM_ARM_PREFERRED_TARGET _IOR(KVMIO, 0xaf, struct kvm_vcpu_init) #define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list) +/* Available with KVM_CAP_MEM_OP */ +#define KVM_GUEST_MEM_OP _IOW(KVMIO, 0xb1, struct kvm_guest_mem_op) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)