Message ID | 20210310003024.2026253-4-jingzhangos@google.com |
---|---|
State | Not Applicable |
Headers | show |
Series | [RFC,1/4] KVM: stats: Separate statistics name strings from debugfs code | expand |
On 10/03/21 01:30, Jing Zhang wrote: > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 383df23514b9..87dd62516c8b 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, > r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); > break; > } > + case KVM_STATS_GET_INFO: { > + struct kvm_stats_info stats_info; > + > + r = -EFAULT; > + stats_info.num_stats = VCPU_STAT_COUNT; > + if (copy_to_user(argp, &stats_info, sizeof(stats_info))) > + goto out; > + r = 0; > + break; > + } > + case KVM_STATS_GET_NAMES: { > + struct kvm_stats_names stats_names; > + > + r = -EFAULT; > + if (copy_from_user(&stats_names, argp, sizeof(stats_names))) > + goto out; > + r = -EINVAL; > + if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN) > + goto out; > + > + r = -EFAULT; > + if (copy_to_user(argp + sizeof(stats_names), > + kvm_vcpu_stat_strings, > + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)) The only reason to separate the strings in patch 1 is to pass them here. But this is a poor API because it imposes a limit on the length of the statistics, and makes that length part of the binary interface. I would prefer a completely different interface, where you have a file descriptor that can be created and associated to a vCPU or VM (or even to /dev/kvm). Having a file descriptor is important because the fd can be passed to a less-privileged process that takes care of gathering the metrics The result of reading the file descriptor could be either ASCII or binary. IMO the real cost lies in opening and reading a multitude of files rather than in the ASCII<->binary conversion. The format could be one of the following: * binary: 4 bytes flags (always zero) 4 bytes number of statistics 4 bytes offset of the first stat description 4 bytes offset of the first stat value stat descriptions: - 4 bytes for the type (for now always zero: uint64_t) - 4 bytes for the flags (for now always zero) - length of name - name statistics in 64-bit format * text: stat1_name uint64 123 stat2_name uint64 456 ... What do you think? Paolo
On Wed, 10 Mar 2021 00:30:23 +0000, Jing Zhang <jingzhangos@google.com> wrote: > > Three ioctl commands are added to support binary form statistics data > retrieval. KVM_STATS_GET_INFO, KVM_STATS_GET_NAMES, KVM_STATS_GET_DATA. > KVM_CAP_STATS_BINARY_FORM indicates the capability. > > Signed-off-by: Jing Zhang <jingzhangos@google.com> > --- > virt/kvm/kvm_main.c | 115 ++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 115 insertions(+) > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 383df23514b9..87dd62516c8b 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, > r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); > break; > } > + case KVM_STATS_GET_INFO: { > + struct kvm_stats_info stats_info; > + > + r = -EFAULT; > + stats_info.num_stats = VCPU_STAT_COUNT; > + if (copy_to_user(argp, &stats_info, sizeof(stats_info))) > + goto out; > + r = 0; > + break; > + } > + case KVM_STATS_GET_NAMES: { > + struct kvm_stats_names stats_names; > + > + r = -EFAULT; > + if (copy_from_user(&stats_names, argp, sizeof(stats_names))) > + goto out; > + r = -EINVAL; > + if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN) > + goto out; > + > + r = -EFAULT; > + if (copy_to_user(argp + sizeof(stats_names), > + kvm_vcpu_stat_strings, > + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)) > + goto out; > + r = 0; > + break; > + } > + case KVM_STATS_GET_DATA: { > + struct kvm_stats_data stats_data; > + > + r = -EFAULT; > + if (copy_from_user(&stats_data, argp, sizeof(stats_data))) > + goto out; > + r = -EINVAL; > + if (stats_data.size < sizeof(vcpu->stat)) > + goto out; > + > + r = -EFAULT; > + argp += sizeof(stats_data); > + if (copy_to_user(argp, &vcpu->stat, sizeof(vcpu->stat))) > + goto out; > + r = 0; > + break; > + } > default: > r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); > } > @@ -3695,6 +3740,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) > case KVM_CAP_CHECK_EXTENSION_VM: > case KVM_CAP_ENABLE_CAP_VM: > case KVM_CAP_HALT_POLL: > + case KVM_CAP_STATS_BINARY_FORM: > return 1; > #ifdef CONFIG_KVM_MMIO > case KVM_CAP_COALESCED_MMIO: > @@ -3825,6 +3871,40 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, > } > } > > +static long kvm_vm_ioctl_stats_get_data(struct kvm *kvm, unsigned long arg) > +{ > + void __user *argp = (void __user *)arg; > + struct kvm_vcpu *vcpu; > + struct kvm_stats_data stats_data; > + u64 *data = NULL, *pdata; > + int i, j, ret = 0; > + size_t dsize = (VM_STAT_COUNT + VCPU_STAT_COUNT) * sizeof(*data); > + > + > + if (copy_from_user(&stats_data, argp, sizeof(stats_data))) > + return -EFAULT; > + if (stats_data.size < dsize) > + return -EINVAL; > + data = kzalloc(dsize, GFP_KERNEL_ACCOUNT); > + if (!data) > + return -ENOMEM; > + > + for (i = 0; i < VM_STAT_COUNT; i++) > + *(data + i) = *((ulong *)&kvm->stat + i); This kind of dance could be avoided if your stats were just an array, or a union of the current data structure and an array. > + > + kvm_for_each_vcpu(j, vcpu, kvm) { > + pdata = data + VM_STAT_COUNT; > + for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++) > + *pdata += *((u64 *)&vcpu->stat + i); Do you really need the in-kernel copy? Why not directly organise the data structures in a way that would allow a bulk copy using copy_to_user()? Another thing is the atomicity of what you are reporting. Don't you care about the consistency of the counters? Thanks, M.
On 10/03/21 16:51, Marc Zyngier wrote: >> + kvm_for_each_vcpu(j, vcpu, kvm) { >> + pdata = data + VM_STAT_COUNT; >> + for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++) >> + *pdata += *((u64 *)&vcpu->stat + i); > Do you really need the in-kernel copy? Why not directly organise the > data structures in a way that would allow a bulk copy using > copy_to_user()? The result is built by summing per-vCPU counters, so that the counter updates are fast and do not require a lock. So consistency basically cannot be guaranteed. Paolo
On Wed, 10 Mar 2021 16:03:42 +0000, Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 10/03/21 16:51, Marc Zyngier wrote: > >> + kvm_for_each_vcpu(j, vcpu, kvm) { > >> + pdata = data + VM_STAT_COUNT; > >> + for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++) > >> + *pdata += *((u64 *)&vcpu->stat + i); > > Do you really need the in-kernel copy? Why not directly organise the > > data structures in a way that would allow a bulk copy using > > copy_to_user()? > > The result is built by summing per-vCPU counters, so that the counter > updates are fast and do not require a lock. So consistency basically > cannot be guaranteed. Sure, but I wonder whether there is scope for VM-global counters to be maintained in parallel with per-vCPU counters if speed/efficiency is of the essence (and this seems to be how it is sold in the cover letter). Thanks, M.
On 10/03/21 18:05, Marc Zyngier wrote: > On Wed, 10 Mar 2021 16:03:42 +0000, > Paolo Bonzini <pbonzini@redhat.com> wrote: >> >> On 10/03/21 16:51, Marc Zyngier wrote: >>>> + kvm_for_each_vcpu(j, vcpu, kvm) { >>>> + pdata = data + VM_STAT_COUNT; >>>> + for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++) >>>> + *pdata += *((u64 *)&vcpu->stat + i); >>> Do you really need the in-kernel copy? Why not directly organise the >>> data structures in a way that would allow a bulk copy using >>> copy_to_user()? >> >> The result is built by summing per-vCPU counters, so that the counter >> updates are fast and do not require a lock. So consistency basically >> cannot be guaranteed. > > Sure, but I wonder whether there is scope for VM-global counters to be > maintained in parallel with per-vCPU counters if speed/efficiency is > of the essence (and this seems to be how it is sold in the cover > letter). Maintaining VM-global counters would require an atomic instruction and would suffer lots of cacheline bouncing even on architectures that have relaxed atomic memory operations. Speed/efficiency of retrieving statistics is important, but let's keep in mind that the baseline for comparison is hundreds of syscalls and filesystem lookups. Paolo
On Wed, 10 Mar 2021 17:11:47 +0000, Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 10/03/21 18:05, Marc Zyngier wrote: > > On Wed, 10 Mar 2021 16:03:42 +0000, > > Paolo Bonzini <pbonzini@redhat.com> wrote: > >> > >> On 10/03/21 16:51, Marc Zyngier wrote: > >>>> + kvm_for_each_vcpu(j, vcpu, kvm) { > >>>> + pdata = data + VM_STAT_COUNT; > >>>> + for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++) > >>>> + *pdata += *((u64 *)&vcpu->stat + i); > >>> Do you really need the in-kernel copy? Why not directly organise the > >>> data structures in a way that would allow a bulk copy using > >>> copy_to_user()? > >> > >> The result is built by summing per-vCPU counters, so that the counter > >> updates are fast and do not require a lock. So consistency basically > >> cannot be guaranteed. > > > > Sure, but I wonder whether there is scope for VM-global counters to be > > maintained in parallel with per-vCPU counters if speed/efficiency is > > of the essence (and this seems to be how it is sold in the cover > > letter). > > Maintaining VM-global counters would require an atomic instruction and > would suffer lots of cacheline bouncing even on architectures that > have relaxed atomic memory operations. Which is why we have per-cpu counters already. Making use of them doesn't seem that outlandish. > Speed/efficiency of retrieving statistics is important, but let's keep > in mind that the baseline for comparison is hundreds of syscalls and > filesystem lookups. Having that baseline in the cover letter would be a good start, as well as an indication of the frequency this is used at. M.
On 10/03/21 18:31, Marc Zyngier wrote: >> Maintaining VM-global counters would require an atomic instruction and >> would suffer lots of cacheline bouncing even on architectures that >> have relaxed atomic memory operations. > Which is why we have per-cpu counters already. Making use of them > doesn't seem that outlandish. But you wouldn't be able to guarantee consistency anyway, would you? You *could* copy N*M counters to userspace, but there's no guarantee that they are consistent, neither within a single vCPU nor within a single counter. >> Speed/efficiency of retrieving statistics is important, but let's keep >> in mind that the baseline for comparison is hundreds of syscalls and >> filesystem lookups. > > Having that baseline in the cover letter would be a good start, as > well as an indication of the frequency this is used at. Can't disagree, especially on the latter which I have no idea about. Paolo
Hi Paolo, On Wed, Mar 10, 2021 at 8:55 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 10/03/21 01:30, Jing Zhang wrote: > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > index 383df23514b9..87dd62516c8b 100644 > > --- a/virt/kvm/kvm_main.c > > +++ b/virt/kvm/kvm_main.c > > @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, > > r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); > > break; > > } > > + case KVM_STATS_GET_INFO: { > > + struct kvm_stats_info stats_info; > > + > > + r = -EFAULT; > > + stats_info.num_stats = VCPU_STAT_COUNT; > > + if (copy_to_user(argp, &stats_info, sizeof(stats_info))) > > + goto out; > > + r = 0; > > + break; > > + } > > + case KVM_STATS_GET_NAMES: { > > + struct kvm_stats_names stats_names; > > + > > + r = -EFAULT; > > + if (copy_from_user(&stats_names, argp, sizeof(stats_names))) > > + goto out; > > + r = -EINVAL; > > + if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN) > > + goto out; > > + > > + r = -EFAULT; > > + if (copy_to_user(argp + sizeof(stats_names), > > + kvm_vcpu_stat_strings, > > + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)) > > The only reason to separate the strings in patch 1 is to pass them here. > But this is a poor API because it imposes a limit on the length of the > statistics, and makes that length part of the binary interface. Agreed. I am considering returning the length of stats name strings in the kvm_stats_info structure instead of exporting it as a constant in uapi, which would put no limit on the length of the stats name strings. > > I would prefer a completely different interface, where you have a file > descriptor that can be created and associated to a vCPU or VM (or even > to /dev/kvm). Having a file descriptor is important because the fd can > be passed to a less-privileged process that takes care of gathering the > metrics Separate file descriptor solution is very tempting. We are still considering it seriously. Our biggest concern is that the metrics gathering/handling process is not necessary running on the same node as the one file descriptor belongs to. It scales better to pass metrics data directly than to pass file descriptors. > > The result of reading the file descriptor could be either ASCII or > binary. IMO the real cost lies in opening and reading a multitude of > files rather than in the ASCII<->binary conversion. Agreed. > > The format could be one of the following: > > * binary: > > 4 bytes flags (always zero) > 4 bytes number of statistics > 4 bytes offset of the first stat description > 4 bytes offset of the first stat value > stat descriptions: > - 4 bytes for the type (for now always zero: uint64_t) > - 4 bytes for the flags (for now always zero) > - length of name > - name > statistics in 64-bit format > > * text: > > stat1_name uint64 123 > stat2_name uint64 456 > ... > > What do you think? The binary format presented above is very flexible. I understand why it is organized this way. In our situation, the metrics data could be pulled periodically as short as half second. They are used by different kinds of monitors/triggers/alerts. To enhance efficiency and reduce traffic caused by metrics passing, we treat all metrics info/data as two kinds. One is immutable information, which doesn't change in a given system boot. The other is mutable data (statistics data), which is pulled/transferred periodically at a high frequency. > > Paolo > Thanks, Jing
On Wed, Mar 10, 2021 at 11:44 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 10/03/21 18:31, Marc Zyngier wrote: > >> Maintaining VM-global counters would require an atomic instruction and > >> would suffer lots of cacheline bouncing even on architectures that > >> have relaxed atomic memory operations. > > Which is why we have per-cpu counters already. Making use of them > > doesn't seem that outlandish. > > But you wouldn't be able to guarantee consistency anyway, would you? > You *could* copy N*M counters to userspace, but there's no guarantee > that they are consistent, neither within a single vCPU nor within a > single counter. > > >> Speed/efficiency of retrieving statistics is important, but let's keep > >> in mind that the baseline for comparison is hundreds of syscalls and > >> filesystem lookups. > > > > Having that baseline in the cover letter would be a good start, as > > well as an indication of the frequency this is used at. > > Can't disagree, especially on the latter which I have no idea about. > > Paolo > Marc, Paolo, thanks for the comments. I will add some more information in the cover letter. Thanks, Jing
On 10/03/21 22:41, Jing Zhang wrote: >> I would prefer a completely different interface, where you have a file >> descriptor that can be created and associated to a vCPU or VM (or even >> to /dev/kvm). Having a file descriptor is important because the fd can >> be passed to a less-privileged process that takes care of gathering the >> metrics > Separate file descriptor solution is very tempting. We are still considering it > seriously. Our biggest concern is that the metrics gathering/handling process > is not necessary running on the same node as the one file descriptor belongs to. > It scales better to pass metrics data directly than to pass file descriptors. If you want to pass metrics data directly, you can just read the file descriptor from your VMM, just like you're using the ioctls now. However the file descriptor also allows a privilege-separated same-host interface. >> 4 bytes flags (always zero) >> 4 bytes number of statistics >> 4 bytes offset of the first stat description >> 4 bytes offset of the first stat value >> stat descriptions: >> - 4 bytes for the type (for now always zero: uint64_t) >> - 4 bytes for the flags (for now always zero) >> - length of name >> - name >> statistics in 64-bit format > > The binary format presented above is very flexible. I understand why it is > organized this way. > In our situation, the metrics data could be pulled periodically as short as > half second. They are used by different kinds of monitors/triggers/alerts. > To enhance efficiency and reduce traffic caused by metrics passing, we > treat all metrics info/data as two kinds. One is immutable information, > which doesn't change in a given system boot. The other is mutable > data (statistics data), which is pulled/transferred periodically at a high > frequency. The format allows to place the values before the descriptions. So you could use pread to only read the first part of the file descriptor, and the file_operations implementation would then skip the work of building the immutable data. It doesn't have to be implemented from the beginning like that, but the above format supports it. Paolo
Hi Paolo, On Fri, Mar 12, 2021 at 12:11 PM Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 10/03/21 22:41, Jing Zhang wrote: > >> I would prefer a completely different interface, where you have a file > >> descriptor that can be created and associated to a vCPU or VM (or even > >> to /dev/kvm). Having a file descriptor is important because the fd can > >> be passed to a less-privileged process that takes care of gathering the > >> metrics > > Separate file descriptor solution is very tempting. We are still considering it > > seriously. Our biggest concern is that the metrics gathering/handling process > > is not necessary running on the same node as the one file descriptor belongs to. > > It scales better to pass metrics data directly than to pass file descriptors. > > If you want to pass metrics data directly, you can just read the file > descriptor from your VMM, just like you're using the ioctls now. > However the file descriptor also allows a privilege-separated same-host > interface. It makes sense. > > >> 4 bytes flags (always zero) Could you give some potential use for this flag? > >> 4 bytes number of statistics > >> 4 bytes offset of the first stat description > >> 4 bytes offset of the first stat value > >> stat descriptions: > >> - 4 bytes for the type (for now always zero: uint64_t) Potential use for this type? Should we move this outside descriptor? Since all stats probably have the same size. > >> - 4 bytes for the flags (for now always zero) Potential use for this flag? > >> - length of name > >> - name > >> statistics in 64-bit format > > > > The binary format presented above is very flexible. I understand why it is > > organized this way. > > In our situation, the metrics data could be pulled periodically as short as > > half second. They are used by different kinds of monitors/triggers/alerts. > > To enhance efficiency and reduce traffic caused by metrics passing, we > > treat all metrics info/data as two kinds. One is immutable information, > > which doesn't change in a given system boot. The other is mutable > > data (statistics data), which is pulled/transferred periodically at a high > > frequency. > > The format allows to place the values before the descriptions. So you > could use pread to only read the first part of the file descriptor, and > the file_operations implementation would then skip the work of building > the immutable data. It doesn't have to be implemented from the > beginning like that, but the above format supports it. Good point! I'll be working on the new fd-based interface and come back with new patchset. > > Paolo > Thanks, Jing
On 12/03/21 23:27, Jing Zhang wrote: >>>> 4 bytes flags (always zero) > Could you give some potential use for this flag? No idea, honestly. It probably would signal the presence of more fields after "offset of the first stat value". In general it's better to leave some room for extension. >>>> 4 bytes number of statistics >>>> 4 bytes offset of the first stat description >>>> 4 bytes offset of the first stat value >>>> stat descriptions: >>>> - 4 bytes for the type (for now always zero: uint64_t) > Potential use for this type? Should we move this outside descriptor? Since > all stats probably have the same size. Yes, all stats should be 8 bytes. But for example: - 0 = uint64_t - 1 = int64_t - 0x80000000 | n: enum with n different values, which are stored after the name >>>> - 4 bytes for the flags (for now always zero) > Potential use for this flag? Looking back at Emanuele's statsfs, it could be: - bit 0: can be cleared (by writing eight zero bytes in the statistics' offset) - bit 1: cumulative value (count of events, can only grow) vs. instantaneous value (can go up or down) This is currently stored in the debugfs mode, so we can already use these flags. Paolo
Hi Paolo, On Wed, Mar 10, 2021 at 8:55 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 10/03/21 01:30, Jing Zhang wrote: > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > index 383df23514b9..87dd62516c8b 100644 > > --- a/virt/kvm/kvm_main.c > > +++ b/virt/kvm/kvm_main.c > > @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, > > r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); > > break; > > } > > + case KVM_STATS_GET_INFO: { > > + struct kvm_stats_info stats_info; > > + > > + r = -EFAULT; > > + stats_info.num_stats = VCPU_STAT_COUNT; > > + if (copy_to_user(argp, &stats_info, sizeof(stats_info))) > > + goto out; > > + r = 0; > > + break; > > + } > > + case KVM_STATS_GET_NAMES: { > > + struct kvm_stats_names stats_names; > > + > > + r = -EFAULT; > > + if (copy_from_user(&stats_names, argp, sizeof(stats_names))) > > + goto out; > > + r = -EINVAL; > > + if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN) > > + goto out; > > + > > + r = -EFAULT; > > + if (copy_to_user(argp + sizeof(stats_names), > > + kvm_vcpu_stat_strings, > > + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)) > > The only reason to separate the strings in patch 1 is to pass them here. > But this is a poor API because it imposes a limit on the length of the > statistics, and makes that length part of the binary interface. > > I would prefer a completely different interface, where you have a file > descriptor that can be created and associated to a vCPU or VM (or even > to /dev/kvm). Having a file descriptor is important because the fd can We are considering about how to create the file descriptor. It might be risky to create an extra fd for every vCPU. It will easily hit the fd limit for the process or the system for machines running a ton of small VMs. Looks like creating an extra file descriptor for every VM is a better option. And then we can check per vCPU stats through Ioctl of this VM fd by passing the vCPU index. What do you think? > be passed to a less-privileged process that takes care of gathering the > metrics > > The result of reading the file descriptor could be either ASCII or > binary. IMO the real cost lies in opening and reading a multitude of > files rather than in the ASCII<->binary conversion. > > The format could be one of the following: > > * binary: > > 4 bytes flags (always zero) > 4 bytes number of statistics > 4 bytes offset of the first stat description > 4 bytes offset of the first stat value > stat descriptions: > - 4 bytes for the type (for now always zero: uint64_t) > - 4 bytes for the flags (for now always zero) > - length of name > - name > statistics in 64-bit format > > * text: > > stat1_name uint64 123 > stat2_name uint64 456 > ... > > What do you think? > > Paolo >
On 15/03/21 23:31, Jing Zhang wrote: > We are considering about how to create the file descriptor. It might be risky > to create an extra fd for every vCPU. It will easily hit the fd limit for the > process or the system for machines running a ton of small VMs. You already have a file descriptor for every vCPU, but I agree that having twice as many is not very good. > Looks like creating an extra file descriptor for every VM is a better option. > And then we can check per vCPU stats through Ioctl of this VM fd by > passing the vCPU index. The file descriptor idea is not really infeasible I think (not just because the # of file descriptors is "only" doubled, but also because most of the time I think you'd only care of per-VM stats). If you really believe it's not usable for you, you can use two ioctls to fill the description and the data respectively (i.e. ioctl(fd, KVM_GET_STATS_{DESCRIPTION,VALUES}, pdata) using the same layout as below. If called with NULL argument, the ioctl returns how much data they will fill in. The (always zero) global flags can be replaced by the value returned by KVM_CHECK_EXTENSION. The number of statistics can be obtained by ioctl(fd, KVM_GET_STATS_VALUES, NULL), just divide the returned value by 8. Paolo >> 4 bytes flags (always zero) >> 4 bytes number of statistics >> 4 bytes offset of the first stat description >> 4 bytes offset of the first stat value >> stat descriptions: >> - 4 bytes for the type (for now always zero: uint64_t) >> - 4 bytes for the flags (for now always zero) >> - length of name >> - name >> statistics in 64-bit format
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 383df23514b9..87dd62516c8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; } + case KVM_STATS_GET_INFO: { + struct kvm_stats_info stats_info; + + r = -EFAULT; + stats_info.num_stats = VCPU_STAT_COUNT; + if (copy_to_user(argp, &stats_info, sizeof(stats_info))) + goto out; + r = 0; + break; + } + case KVM_STATS_GET_NAMES: { + struct kvm_stats_names stats_names; + + r = -EFAULT; + if (copy_from_user(&stats_names, argp, sizeof(stats_names))) + goto out; + r = -EINVAL; + if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN) + goto out; + + r = -EFAULT; + if (copy_to_user(argp + sizeof(stats_names), + kvm_vcpu_stat_strings, + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)) + goto out; + r = 0; + break; + } + case KVM_STATS_GET_DATA: { + struct kvm_stats_data stats_data; + + r = -EFAULT; + if (copy_from_user(&stats_data, argp, sizeof(stats_data))) + goto out; + r = -EINVAL; + if (stats_data.size < sizeof(vcpu->stat)) + goto out; + + r = -EFAULT; + argp += sizeof(stats_data); + if (copy_to_user(argp, &vcpu->stat, sizeof(vcpu->stat))) + goto out; + r = 0; + break; + } default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } @@ -3695,6 +3740,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_HALT_POLL: + case KVM_CAP_STATS_BINARY_FORM: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -3825,6 +3871,40 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, } } +static long kvm_vm_ioctl_stats_get_data(struct kvm *kvm, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm_vcpu *vcpu; + struct kvm_stats_data stats_data; + u64 *data = NULL, *pdata; + int i, j, ret = 0; + size_t dsize = (VM_STAT_COUNT + VCPU_STAT_COUNT) * sizeof(*data); + + + if (copy_from_user(&stats_data, argp, sizeof(stats_data))) + return -EFAULT; + if (stats_data.size < dsize) + return -EINVAL; + data = kzalloc(dsize, GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + for (i = 0; i < VM_STAT_COUNT; i++) + *(data + i) = *((ulong *)&kvm->stat + i); + + kvm_for_each_vcpu(j, vcpu, kvm) { + pdata = data + VM_STAT_COUNT; + for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++) + *pdata += *((u64 *)&vcpu->stat + i); + } + + if (copy_to_user(argp + sizeof(stats_data), data, dsize)) + ret = -EFAULT; + + kfree(data); + return ret; +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4001,6 +4081,41 @@ static long kvm_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_STATS_GET_INFO: { + struct kvm_stats_info stats_info; + + r = -EFAULT; + stats_info.num_stats = VM_STAT_COUNT + VCPU_STAT_COUNT; + if (copy_to_user(argp, &stats_info, sizeof(stats_info))) + goto out; + r = 0; + break; + } + case KVM_STATS_GET_NAMES: { + struct kvm_stats_names stats_names; + + r = -EFAULT; + if (copy_from_user(&stats_names, argp, sizeof(stats_names))) + goto out; + r = -EINVAL; + if (stats_names.size < + (VM_STAT_COUNT + VCPU_STAT_COUNT) * KVM_STATS_NAME_LEN) + goto out; + r = -EFAULT; + argp += sizeof(stats_names); + if (copy_to_user(argp, kvm_vm_stat_strings, + VM_STAT_COUNT * KVM_STATS_NAME_LEN)) + goto out; + argp += VM_STAT_COUNT * KVM_STATS_NAME_LEN; + if (copy_to_user(argp, kvm_vcpu_stat_strings, + VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)) + goto out; + r = 0; + break; + } + case KVM_STATS_GET_DATA: + r = kvm_vm_ioctl_stats_get_data(kvm, arg); + break; case KVM_CHECK_EXTENSION: r = kvm_vm_ioctl_check_extension_generic(kvm, arg); break;
Three ioctl commands are added to support binary form statistics data retrieval. KVM_STATS_GET_INFO, KVM_STATS_GET_NAMES, KVM_STATS_GET_DATA. KVM_CAP_STATS_BINARY_FORM indicates the capability. Signed-off-by: Jing Zhang <jingzhangos@google.com> --- virt/kvm/kvm_main.c | 115 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+)