diff mbox series

[05/13] target/arm/kvm: Add kvm_arch_get/put_sve

Message ID 20190512083624.8916-6-drjones@redhat.com
State New
Headers show
Series target/arm/kvm: enable SVE in guests | expand

Commit Message

Andrew Jones May 12, 2019, 8:36 a.m. UTC
These are the SVE equivalents to kvm_arch_get/put_fpsimd.

Signed-off-by: Andrew Jones <drjones@redhat.com>
---
 target/arm/kvm64.c | 127 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 123 insertions(+), 4 deletions(-)

Comments

Dave Martin May 13, 2019, 12:31 p.m. UTC | #1
On Sun, May 12, 2019 at 09:36:16AM +0100, Andrew Jones wrote:
> These are the SVE equivalents to kvm_arch_get/put_fpsimd.
> 
> Signed-off-by: Andrew Jones <drjones@redhat.com>
> ---
>  target/arm/kvm64.c | 127 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 123 insertions(+), 4 deletions(-)

[...]

> +static int kvm_arch_put_sve(CPUState *cs)
> +{
> +    ARMCPU *cpu = ARM_CPU(cs);
> +    CPUARMState *env = &cpu->env;
> +    struct kvm_one_reg reg;
> +    int n, ret;
> +
> +    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
> +        uint64_t *q = aa64_vfp_qreg(env, n);
> +#ifdef HOST_WORDS_BIGENDIAN
> +        uint64_t d[ARM_MAX_VQ * 2];
> +        int i;
> +        for (i = 0; i < cpu->sve_max_vq * 2; i++) {
> +            d[i] = q[cpu->sve_max_vq * 2 - 1 - i];
> +        }

Out of interest, why do all this swabbing?  It seems expensive.

Cheers
---Dave
Dave Martin May 13, 2019, 12:43 p.m. UTC | #2
On Sun, May 12, 2019 at 09:36:16AM +0100, Andrew Jones wrote:
> These are the SVE equivalents to kvm_arch_get/put_fpsimd.
> 
> Signed-off-by: Andrew Jones <drjones@redhat.com>
> ---
>  target/arm/kvm64.c | 127 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 123 insertions(+), 4 deletions(-)

[...]

> +static int kvm_arch_put_sve(CPUState *cs)
> +{
> +    ARMCPU *cpu = ARM_CPU(cs);
> +    CPUARMState *env = &cpu->env;
> +    struct kvm_one_reg reg;
> +    int n, ret;
> +
> +    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
> +        uint64_t *q = aa64_vfp_qreg(env, n);
> +#ifdef HOST_WORDS_BIGENDIAN
> +        uint64_t d[ARM_MAX_VQ * 2];
> +        int i;
> +        for (i = 0; i < cpu->sve_max_vq * 2; i++) {
> +            d[i] = q[cpu->sve_max_vq * 2 - 1 - i];
> +        }
> +        reg.addr = (uintptr_t)d;
> +#else
> +        reg.addr = (uintptr_t)q;
> +#endif
> +        reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);

Will this silently go wrong if more than one slice is required (i.e.,
the register size grows beyond 8192 bits?)

[...]

Cheers
---Dave
Andrew Jones May 13, 2019, 1:55 p.m. UTC | #3
On Mon, May 13, 2019 at 01:31:11PM +0100, Dave Martin wrote:
> On Sun, May 12, 2019 at 09:36:16AM +0100, Andrew Jones wrote:
> > These are the SVE equivalents to kvm_arch_get/put_fpsimd.
> > 
> > Signed-off-by: Andrew Jones <drjones@redhat.com>
> > ---
> >  target/arm/kvm64.c | 127 +++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 123 insertions(+), 4 deletions(-)
> 
> [...]
> 
> > +static int kvm_arch_put_sve(CPUState *cs)
> > +{
> > +    ARMCPU *cpu = ARM_CPU(cs);
> > +    CPUARMState *env = &cpu->env;
> > +    struct kvm_one_reg reg;
> > +    int n, ret;
> > +
> > +    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
> > +        uint64_t *q = aa64_vfp_qreg(env, n);
> > +#ifdef HOST_WORDS_BIGENDIAN
> > +        uint64_t d[ARM_MAX_VQ * 2];
> > +        int i;
> > +        for (i = 0; i < cpu->sve_max_vq * 2; i++) {
> > +            d[i] = q[cpu->sve_max_vq * 2 - 1 - i];
> > +        }
> 
> Out of interest, why do all this swabbing?  It seems expensive.
>

QEMU keeps its 128-bit and larger words in the same order (least
significant word first) for both host endian types. We need to
do word swapping every time we set/get them to/from KVM.

Thanks,
drew
Andrew Jones May 13, 2019, 2:07 p.m. UTC | #4
On Mon, May 13, 2019 at 01:43:56PM +0100, Dave Martin wrote:
> On Sun, May 12, 2019 at 09:36:16AM +0100, Andrew Jones wrote:
> > These are the SVE equivalents to kvm_arch_get/put_fpsimd.
> > 
> > Signed-off-by: Andrew Jones <drjones@redhat.com>
> > ---
> >  target/arm/kvm64.c | 127 +++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 123 insertions(+), 4 deletions(-)
> 
> [...]
> 
> > +static int kvm_arch_put_sve(CPUState *cs)
> > +{
> > +    ARMCPU *cpu = ARM_CPU(cs);
> > +    CPUARMState *env = &cpu->env;
> > +    struct kvm_one_reg reg;
> > +    int n, ret;
> > +
> > +    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
> > +        uint64_t *q = aa64_vfp_qreg(env, n);
> > +#ifdef HOST_WORDS_BIGENDIAN
> > +        uint64_t d[ARM_MAX_VQ * 2];
> > +        int i;
> > +        for (i = 0; i < cpu->sve_max_vq * 2; i++) {
> > +            d[i] = q[cpu->sve_max_vq * 2 - 1 - i];
> > +        }
> > +        reg.addr = (uintptr_t)d;
> > +#else
> > +        reg.addr = (uintptr_t)q;
> > +#endif
> > +        reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
> 
> Will this silently go wrong if more than one slice is required (i.e.,
> the register size grows beyond 8192 bits?)

Yeah, I could probably implement the slice loop now and add a
function that returns 1 (for now) like your vcpu_sve_slices()
function in KVM. I'll do that for v2.

Thanks,
drew



> 
> [...]
> 
> Cheers
> ---Dave
>
Dave Martin May 13, 2019, 2:39 p.m. UTC | #5
On Mon, May 13, 2019 at 03:07:26PM +0100, Andrew Jones wrote:
> On Mon, May 13, 2019 at 01:43:56PM +0100, Dave Martin wrote:
> > On Sun, May 12, 2019 at 09:36:16AM +0100, Andrew Jones wrote:
> > > These are the SVE equivalents to kvm_arch_get/put_fpsimd.
> > > 
> > > Signed-off-by: Andrew Jones <drjones@redhat.com>
> > > ---
> > >  target/arm/kvm64.c | 127 +++++++++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 123 insertions(+), 4 deletions(-)
> > 
> > [...]
> > 
> > > +static int kvm_arch_put_sve(CPUState *cs)
> > > +{
> > > +    ARMCPU *cpu = ARM_CPU(cs);
> > > +    CPUARMState *env = &cpu->env;
> > > +    struct kvm_one_reg reg;
> > > +    int n, ret;
> > > +
> > > +    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
> > > +        uint64_t *q = aa64_vfp_qreg(env, n);
> > > +#ifdef HOST_WORDS_BIGENDIAN
> > > +        uint64_t d[ARM_MAX_VQ * 2];
> > > +        int i;
> > > +        for (i = 0; i < cpu->sve_max_vq * 2; i++) {
> > > +            d[i] = q[cpu->sve_max_vq * 2 - 1 - i];
> > > +        }
> > > +        reg.addr = (uintptr_t)d;
> > > +#else
> > > +        reg.addr = (uintptr_t)q;
> > > +#endif
> > > +        reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
> > 
> > Will this silently go wrong if more than one slice is required (i.e.,
> > the register size grows beyond 8192 bits?)
> 
> Yeah, I could probably implement the slice loop now and add a
> function that returns 1 (for now) like your vcpu_sve_slices()
> function in KVM. I'll do that for v2.

Or just add a sanity check that the vector length is <= 2048 bits.

Support for larger vectors is untestable for now, since the kernel
doesn't support that and would never expose it.


On that point, could TCG easily be made to expose a larger vector length
to the kernel?  I'd be interested to see what happened.

The kernel should warn and hide the larger vector lengths from KVM,
but I've not been able to test that.

It's only worth trying this out if the hacks to QEMU to enable testing
this were pretty trivial, though.

Cheers
---Dave
Dave Martin May 13, 2019, 3:31 p.m. UTC | #6
On Mon, May 13, 2019 at 02:55:01PM +0100, Andrew Jones wrote:
> On Mon, May 13, 2019 at 01:31:11PM +0100, Dave Martin wrote:
> > On Sun, May 12, 2019 at 09:36:16AM +0100, Andrew Jones wrote:
> > > These are the SVE equivalents to kvm_arch_get/put_fpsimd.
> > > 
> > > Signed-off-by: Andrew Jones <drjones@redhat.com>
> > > ---
> > >  target/arm/kvm64.c | 127 +++++++++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 123 insertions(+), 4 deletions(-)
> > 
> > [...]
> > 
> > > +static int kvm_arch_put_sve(CPUState *cs)
> > > +{
> > > +    ARMCPU *cpu = ARM_CPU(cs);
> > > +    CPUARMState *env = &cpu->env;
> > > +    struct kvm_one_reg reg;
> > > +    int n, ret;
> > > +
> > > +    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
> > > +        uint64_t *q = aa64_vfp_qreg(env, n);
> > > +#ifdef HOST_WORDS_BIGENDIAN
> > > +        uint64_t d[ARM_MAX_VQ * 2];
> > > +        int i;
> > > +        for (i = 0; i < cpu->sve_max_vq * 2; i++) {
> > > +            d[i] = q[cpu->sve_max_vq * 2 - 1 - i];
> > > +        }
> > 
> > Out of interest, why do all this swabbing?  It seems expensive.
> >
> 
> QEMU keeps its 128-bit and larger words in the same order (least
> significant word first) for both host endian types. We need to
> do word swapping every time we set/get them to/from KVM.

I'm not sure whether this is appropriate here, though it depends on
what QEMU does with the data.

Something non-obvious to be aware of:

As exposed through the signal frame and the KVM ABI, the memory
representation of an SVE reg is invariant with respect to the
endianness.

IIUC, the byte order seen for a V-reg in KVM_REG_ARM_CORE and for
the equivalent Z-reg in KVM_REG_ARM64_SVE would be the opposite of each
other on BE, but the same on LE.

This is a feature of the architecture: a V-reg can be stored as a single
value, but Z-regs are in general too big to be treated as a single
value: they are always treated as a sequence of elements, and the
largest element size supported is 64 bits, not 128.  IIUC, there is no
direct native way to store with 128-bit swabbing: some explicit data
processing operation would also be needed to swap adjacent 64-bit
elements in the vector around the store/load.

This is not specified in the ABI documentation -- I should address that.

If this is infeasible for KVM to work with, we could perhaps change it,
but I'm not too keen on that at this stage.  KVM_REG_ARM64_SVE_VLS
has a similar behaviour: it's a vector of 64-bit possibly-swabbed words,
not a single possibly-swabbed 512-bit word.


Looking at the kernel, I may have screwed up in places where the
two representations interact, like fpsimd_to_sve().  I should take a
look at that...  This doesn't affect the KVM ABI though.

Cheers
---Dave
Peter Maydell May 13, 2019, 3:40 p.m. UTC | #7
On Mon, 13 May 2019 at 16:31, Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Mon, May 13, 2019 at 02:55:01PM +0100, Andrew Jones wrote:
> > QEMU keeps its 128-bit and larger words in the same order (least
> > significant word first) for both host endian types. We need to
> > do word swapping every time we set/get them to/from KVM.
>
> I'm not sure whether this is appropriate here, though it depends on
> what QEMU does with the data.

The layout is optimised for TCG emulation to be able
to work with it, I think (rth would have the definite
reason, though).

> Something non-obvious to be aware of:
>
> As exposed through the signal frame and the KVM ABI, the memory
> representation of an SVE reg is invariant with respect to the
> endianness.

Yes; we handle this conversion as we write out the signal frame:
https://github.com/qemu/qemu/blob/master/linux-user/aarch64/signal.c#L184

thanks
-- PMM
Dave Martin May 13, 2019, 4:05 p.m. UTC | #8
On Mon, May 13, 2019 at 04:40:52PM +0100, Peter Maydell wrote:
> On Mon, 13 May 2019 at 16:31, Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Mon, May 13, 2019 at 02:55:01PM +0100, Andrew Jones wrote:
> > > QEMU keeps its 128-bit and larger words in the same order (least
> > > significant word first) for both host endian types. We need to
> > > do word swapping every time we set/get them to/from KVM.
> >
> > I'm not sure whether this is appropriate here, though it depends on
> > what QEMU does with the data.
> 
> The layout is optimised for TCG emulation to be able
> to work with it, I think (rth would have the definite
> reason, though).

So long as we are agreed about the ABI, this is none of my concern :)

> > Something non-obvious to be aware of:
> >
> > As exposed through the signal frame and the KVM ABI, the memory
> > representation of an SVE reg is invariant with respect to the
> > endianness.
> 
> Yes; we handle this conversion as we write out the signal frame:
> https://github.com/qemu/qemu/blob/master/linux-user/aarch64/signal.c#L184

Right.  I hadn't focused consciously on this, since the architecture
does the work for us in the kernel (mostly).

I will check the documentation to make sure the behaviour is clearly
described, anyhow.

Cheers
---Dave
Richard Henderson May 13, 2019, 4:40 p.m. UTC | #9
On 5/13/19 5:31 AM, Dave Martin wrote:
> On Sun, May 12, 2019 at 09:36:16AM +0100, Andrew Jones wrote:
>> These are the SVE equivalents to kvm_arch_get/put_fpsimd.
>>
>> Signed-off-by: Andrew Jones <drjones@redhat.com>
>> ---
>>  target/arm/kvm64.c | 127 +++++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 123 insertions(+), 4 deletions(-)
> 
> [...]
> 
>> +static int kvm_arch_put_sve(CPUState *cs)
>> +{
>> +    ARMCPU *cpu = ARM_CPU(cs);
>> +    CPUARMState *env = &cpu->env;
>> +    struct kvm_one_reg reg;
>> +    int n, ret;
>> +
>> +    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
>> +        uint64_t *q = aa64_vfp_qreg(env, n);
>> +#ifdef HOST_WORDS_BIGENDIAN
>> +        uint64_t d[ARM_MAX_VQ * 2];
>> +        int i;
>> +        for (i = 0; i < cpu->sve_max_vq * 2; i++) {
>> +            d[i] = q[cpu->sve_max_vq * 2 - 1 - i];
>> +        }
> 
> Out of interest, why do all this swabbing?  It seems expensive.

Indeed, to me this seems to be the wrong kind of swabbing here.  Exactly what
format is KVM expecting?  Surely it should be the one used by the unpredicated
LDR/STR instructions.  Anything else would seem to be working against the
architecture.

If so, the format is, architecturally, a stream of bytes in index order, which
corresponds to a little-endian stream of words.  So the loop I'd expect to see
here is

    for (i = 0, n = cpu->sve_max_vq; i < n; ++i) {
        d[i] = bswap64(q[i]);
    }


r~
Richard Henderson May 13, 2019, 4:58 p.m. UTC | #10
On 5/13/19 7:39 AM, Dave Martin wrote:
> On that point, could TCG easily be made to expose a larger vector length
> to the kernel?  I'd be interested to see what happened.

It would be easy enough to extend the maximum vector length within TCG.

Increase ARM_MAX_VQ.  Alter the couple of places where we manipulate ZCR.LEN to
extend the current 4-bit mask.

How large do you need the max to be, for testing?


r~
Andrew Jones May 13, 2019, 6:14 p.m. UTC | #11
On Mon, May 13, 2019 at 09:40:29AM -0700, Richard Henderson wrote:
> On 5/13/19 5:31 AM, Dave Martin wrote:
> > On Sun, May 12, 2019 at 09:36:16AM +0100, Andrew Jones wrote:
> >> These are the SVE equivalents to kvm_arch_get/put_fpsimd.
> >>
> >> Signed-off-by: Andrew Jones <drjones@redhat.com>
> >> ---
> >>  target/arm/kvm64.c | 127 +++++++++++++++++++++++++++++++++++++++++++--
> >>  1 file changed, 123 insertions(+), 4 deletions(-)
> > 
> > [...]
> > 
> >> +static int kvm_arch_put_sve(CPUState *cs)
> >> +{
> >> +    ARMCPU *cpu = ARM_CPU(cs);
> >> +    CPUARMState *env = &cpu->env;
> >> +    struct kvm_one_reg reg;
> >> +    int n, ret;
> >> +
> >> +    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
> >> +        uint64_t *q = aa64_vfp_qreg(env, n);
> >> +#ifdef HOST_WORDS_BIGENDIAN
> >> +        uint64_t d[ARM_MAX_VQ * 2];
> >> +        int i;
> >> +        for (i = 0; i < cpu->sve_max_vq * 2; i++) {
> >> +            d[i] = q[cpu->sve_max_vq * 2 - 1 - i];
> >> +        }
> > 
> > Out of interest, why do all this swabbing?  It seems expensive.
> 
> Indeed, to me this seems to be the wrong kind of swabbing here.  Exactly what
> format is KVM expecting?  Surely it should be the one used by the unpredicated
> LDR/STR instructions.  Anything else would seem to be working against the
> architecture.
> 
> If so, the format is, architecturally, a stream of bytes in index order, which
> corresponds to a little-endian stream of words.  So the loop I'd expect to see
> here is
> 
>     for (i = 0, n = cpu->sve_max_vq; i < n; ++i) {
>         d[i] = bswap64(q[i]);
>     }

That's the opposite of what we do for fpsimd registers though. I'm
fine with doing whatever KVM/TCG needs, but so far I was just following
the same pattern we already have.

Thanks,
drew
Richard Henderson May 13, 2019, 6:31 p.m. UTC | #12
On 5/13/19 11:14 AM, Andrew Jones wrote:
>> Indeed, to me this seems to be the wrong kind of swabbing here.  Exactly what
>> format is KVM expecting?  Surely it should be the one used by the unpredicated
>> LDR/STR instructions.  Anything else would seem to be working against the
>> architecture.
>>
>> If so, the format is, architecturally, a stream of bytes in index order, which
>> corresponds to a little-endian stream of words.  So the loop I'd expect to see
>> here is
>>
>>     for (i = 0, n = cpu->sve_max_vq; i < n; ++i) {
>>         d[i] = bswap64(q[i]);
>>     }
> 
> That's the opposite of what we do for fpsimd registers though. I'm
> fine with doing whatever KVM/TCG needs, but so far I was just following
> the same pattern we already have.

The behaviour of the hardware is different for LDR of fpsimd registers.

FP&SIMD LDR operates on datasize (8, 16, 32, 64, 128 bits).
SVE LDR always operates on bytes.


r~
Dave Martin May 14, 2019, 9:10 a.m. UTC | #13
On Mon, May 13, 2019 at 05:58:59PM +0100, Richard Henderson wrote:
> On 5/13/19 7:39 AM, Dave Martin wrote:
> > On that point, could TCG easily be made to expose a larger vector length
> > to the kernel?  I'd be interested to see what happened.
> 
> It would be easy enough to extend the maximum vector length within TCG.
> 
> Increase ARM_MAX_VQ.  Alter the couple of places where we manipulate ZCR.LEN to
> extend the current 4-bit mask.
> 
> How large do you need the max to be, for testing?

Anything upwards of 256 bytes is interesting.

The architecture reserves space for it to grow up to 9 bits, though it's
unlikely it would ever get that large in reality.

So if you wanted to go crazy, you might be able to go up to 8192 bytes.

This is just for fun, since it goes outside the architecture and Linux
officially doesn't support it today in any case.  So definitely not a
priority!

Cheers
---Dave
diff mbox series

Patch

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 61947f3716e1..86362f4cd7d0 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -658,11 +658,12 @@  int kvm_arch_init_vcpu(CPUState *cs)
 bool kvm_arm_reg_syncs_via_cpreg_list(uint64_t regidx)
 {
     /* Return true if the regidx is a register we should synchronize
-     * via the cpreg_tuples array (ie is not a core reg we sync by
-     * hand in kvm_arch_get/put_registers())
+     * via the cpreg_tuples array (ie is not a core or sve reg that
+     * we sync by hand in kvm_arch_get/put_registers())
      */
     switch (regidx & KVM_REG_ARM_COPROC_MASK) {
     case KVM_REG_ARM_CORE:
+    case KVM_REG_ARM64_SVE:
         return false;
     default:
         return true;
@@ -748,6 +749,61 @@  static int kvm_arch_put_fpsimd(CPUState *cs)
     return 0;
 }
 
+static int kvm_arch_put_sve(CPUState *cs)
+{
+    ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
+    struct kvm_one_reg reg;
+    int n, ret;
+
+    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
+        uint64_t *q = aa64_vfp_qreg(env, n);
+#ifdef HOST_WORDS_BIGENDIAN
+        uint64_t d[ARM_MAX_VQ * 2];
+        int i;
+        for (i = 0; i < cpu->sve_max_vq * 2; i++) {
+            d[i] = q[cpu->sve_max_vq * 2 - 1 - i];
+        }
+        reg.addr = (uintptr_t)d;
+#else
+        reg.addr = (uintptr_t)q;
+#endif
+        reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
+        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    for (n = 0; n < KVM_ARM64_SVE_NUM_PREGS; n++) {
+        uint64_t *p = &env->vfp.pregs[n].p[0];
+#ifdef HOST_WORDS_BIGENDIAN
+        uint64_t d[ARM_MAX_VQ * 2];
+        int i;
+        for (i = 0; i < cpu->sve_max_vq * 2 / 8; i++) {
+            d[i] = p[cpu->sve_max_vq * 2 / 8 - 1 - i];
+        }
+        reg.addr = (uintptr_t)d;
+#else
+        reg.addr = (uintptr_t)p;
+#endif
+        reg.id = KVM_REG_ARM64_SVE_PREG(n, 0);
+        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    reg.addr = (uintptr_t)&env->vfp.pregs[FFR_PRED_NUM].p[0];
+    reg.id = KVM_REG_ARM64_SVE_FFR(0);
+    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+    if (ret) {
+        return ret;
+    }
+
+    return 0;
+}
+
 int kvm_arch_put_registers(CPUState *cs, int level)
 {
     struct kvm_one_reg reg;
@@ -842,7 +898,11 @@  int kvm_arch_put_registers(CPUState *cs, int level)
         }
     }
 
-    ret = kvm_arch_put_fpsimd(cs);
+    if (!cpu_isar_feature(aa64_sve, cpu)) {
+        ret = kvm_arch_put_fpsimd(cs);
+    } else {
+        ret = kvm_arch_put_sve(cs);
+    }
     if (ret) {
         return ret;
     }
@@ -905,6 +965,61 @@  static int kvm_arch_get_fpsimd(CPUState *cs)
     return 0;
 }
 
+static int kvm_arch_get_sve(CPUState *cs)
+{
+    ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
+    struct kvm_one_reg reg;
+    int n, ret;
+
+    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; n++) {
+        uint64_t *q = aa64_vfp_qreg(env, n);
+        reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
+        reg.addr = (uintptr_t)q;
+        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
+        if (ret) {
+            return ret;
+        } else {
+#ifdef HOST_WORDS_BIGENDIAN
+            int i = 0, j = cpu->sve_max_vq * 2 - 1;
+            while (i < j) {
+                uint64_t t;
+                t = q[i], q[i] = q[j], q[j] = t;
+                ++i, --j;
+            }
+#endif
+        }
+    }
+
+    for (n = 0; n < KVM_ARM64_SVE_NUM_PREGS; n++) {
+        uint64_t *p = &env->vfp.pregs[n].p[0];
+        reg.id = KVM_REG_ARM64_SVE_PREG(n, 0);
+        reg.addr = (uintptr_t)p;
+        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
+        if (ret) {
+            return ret;
+        } else {
+#ifdef HOST_WORDS_BIGENDIAN
+            int i = 0, j = cpu->sve_max_vq * 2 / 8 - 1;
+            while (i < j) {
+                uint64_t t;
+                t = p[i], p[i] = p[j], p[j] = t;
+                ++i, --j;
+            }
+#endif
+        }
+    }
+
+    reg.addr = (uintptr_t)&env->vfp.pregs[FFR_PRED_NUM].p[0];
+    reg.id = KVM_REG_ARM64_SVE_FFR(0);
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
+    if (ret) {
+        return ret;
+    }
+
+    return 0;
+}
+
 int kvm_arch_get_registers(CPUState *cs)
 {
     struct kvm_one_reg reg;
@@ -999,7 +1114,11 @@  int kvm_arch_get_registers(CPUState *cs)
         env->spsr = env->banked_spsr[i];
     }
 
-    ret = kvm_arch_get_fpsimd(cs);
+    if (!cpu_isar_feature(aa64_sve, cpu)) {
+        ret = kvm_arch_get_fpsimd(cs);
+    } else {
+        ret = kvm_arch_get_sve(cs);
+    }
     if (ret) {
         return ret;
     }