@@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct timespec *ts);
void pvclock_resume(void);
+/*
+ * Scale a 64-bit delta by shifting it, then multiplying by a 32-bit
+ * fraction, yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
#endif /* _ASM_X86_PVCLOCK_H */
@@ -74,7 +74,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
u64 delta = native_read_tsc() - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+ return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
+ shadow->tsc_shift);
}
/*
--
1.7.0.4
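
For reference, the following is a standalone user-space sketch of what
pvclock_scale_delta() computes: shift the delta, then keep bits [95:32] of
its product with the 32-bit fraction. It is not the kernel code; it leans on
the GCC/Clang unsigned __int128 extension instead of the hand-written
mul/shrd sequences, and the 2.5 GHz figure is only an illustrative TSC rate.

/* Reference sketch (not kernel code) of pvclock_scale_delta():
 * ((delta shifted) * mul_frac) >> 32, done with __int128 arithmetic.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t scale_delta_ref(uint64_t delta, uint32_t mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	/* 64x32 -> 96-bit product; keep bits [95:32]. */
	return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
	/* Illustrative 2.5 GHz TSC: one cycle is 0.4 ns, so ~0.4 * 2^32. */
	uint32_t mul_frac = 1717986918;

	/* One second's worth of cycles should scale to roughly 1e9 ns. */
	printf("%llu\n",
	       (unsigned long long)scale_delta_ref(2500000000ULL, mul_frac, 0));
	return 0;
}
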
From 06bd89325e6441c130c8dc71859867f6d68d617c Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serge.hallyn@canonical.com>
Date: Wed, 2 Mar 2011 20:05:34 +0000
Subject: [PATCH 2/3] KVM: x86: Fix a possible backwards warp of kvmclock
Kernel time, which advances in discrete steps, may progress much slower
than the TSC. As a result, when kvmclock is adjusted to a new base, the
apparent time to the guest, which runs at a much higher, nsec-scaled
rate based on the current TSC, may have already been observed to have
a larger value (kernel_ns + scaled tsc) than the value to which we are
setting it (kernel_ns + 0).
We must instead compute the clock as potentially observed by the guest
and use that as kernel_ns, to make sure it does not go backwards.
Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
BugLink: http://bugs.launchpad.net/bugs/714335
(backported from commit 1d5f066e0b63271b67eac6d3752f8aa96adcbddb)
Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Reviewed-by: Stefan Bader <stefan.bader@canonical.com>
---
arch/x86/include/asm/kvm_host.h | 3 ++
arch/x86/kvm/x86.c | 47 ++++++++++++++++++++++++++++++++++++--
2 files changed, 47 insertions(+), 3 deletions(-)
@@ -357,6 +357,9 @@ struct kvm_vcpu_arch {
struct page *time_page;
bool singlestep; /* guest is single stepped by KVM */
+ u64 last_guest_tsc;
+ u64 last_kernel_ns;
+
bool nmi_pending;
bool nmi_injected;
@@ -47,6 +47,7 @@
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
+#include <asm/pvclock.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -633,6 +634,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
struct kvm_vcpu_arch *vcpu = &v->arch;
void *shared_kaddr;
unsigned long this_tsc_khz;
+ s64 kernel_ns, max_kernel_ns;
+ u64 tsc_timestamp;
if ((!vcpu->time_page))
return;
@@ -646,15 +649,51 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+ kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
ktime_get_ts(&ts);
monotonic_to_bootbased(&ts);
+ kernel_ns = timespec_to_ns(&ts);
local_irq_restore(flags);
+ /*
+ * Time as measured by the TSC may go backwards when resetting the base
+ * tsc_timestamp. The reason for this is that the TSC resolution is
+ * higher than the resolution of the other clock scales. Thus, many
+ * possible measurements of the TSC correspond to one measurement of any
+ * other clock, and so a spread of values is possible. This is not a
+ * problem for the computation of the nanosecond clock; with TSC rates
+ * around 1 GHz, there can only be a few cycles which correspond to one
+ * nanosecond value, and any path through this code will inevitably
+ * take longer than that. However, with the kernel_ns value itself,
+ * the precision may be much lower, down to HZ granularity. If the
+ * first sampling of TSC against kernel_ns ends in the low part of the
+ * range, and the second in the high end of the range, we can get:
+ *
+ * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+ *
+ * As the sampling errors potentially range in the thousands of cycles,
+ * it is possible such a time value has already been observed by the
+ * guest. To protect against this, we must compute the system time as
+ * observed by the guest and ensure the new system time is greater.
+ */
+ max_kernel_ns = 0;
+ if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+ max_kernel_ns = vcpu->last_guest_tsc -
+ vcpu->hv_clock.tsc_timestamp;
+ max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+ vcpu->hv_clock.tsc_to_system_mul,
+ vcpu->hv_clock.tsc_shift);
+ max_kernel_ns += vcpu->last_kernel_ns;
+ }
+
+ if (max_kernel_ns > kernel_ns)
+ kernel_ns = max_kernel_ns;
+
/* With all the info we got, fill in the values */
- vcpu->hv_clock.system_time = ts.tv_nsec +
- (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+ vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+ vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ vcpu->last_kernel_ns = kernel_ns;
/*
* The interface expects us to write an even number signaling that the
@@ -3695,6 +3734,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_x86_ops->prepare_guest_switch(vcpu);
kvm_load_guest_fpu(vcpu);
+ kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
local_irq_disable();
clear_bit(KVM_REQ_KICK, &vcpu->requests);
--
1.7.0.4
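
To see what the max_kernel_ns guard above protects against, here is a
minimal user-space sketch of the clamp. The scale() helper, the variable
names and all numbers are illustrative stand-ins for pvclock_scale_delta()
and the vcpu's cached hv_clock fields, not the kernel's actual values, and
it again assumes the GCC/Clang __int128 extension.

/* Minimal sketch (not kernel code) of the backwards-warp clamp. */
#include <stdint.h>
#include <stdio.h>

/* (delta * mul_frac) >> 32, as pvclock_scale_delta() with shift == 0. */
static uint64_t scale(uint64_t delta, uint32_t mul_frac)
{
	return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
	uint32_t mul_frac       = 1717986918;    /* ~0.4 ns/cycle (2.5 GHz)     */
	uint64_t old_tsc_base   = 1000000000ULL; /* tsc_timestamp of last update */
	uint64_t last_guest_tsc = 1000025000ULL; /* newest TSC the guest saw     */
	int64_t  last_kernel_ns = 5000000000LL;  /* system_time of last update   */
	int64_t  kernel_ns      = 5000002000LL;  /* coarse new kernel time (ns)  */

	/* Time the guest may already have observed under the old parameters. */
	int64_t max_kernel_ns = last_kernel_ns +
		(int64_t)scale(last_guest_tsc - old_tsc_base, mul_frac);

	/* Never publish a smaller base, or kvmclock would warp backwards. */
	if (max_kernel_ns > kernel_ns)
		kernel_ns = max_kernel_ns;

	printf("new system_time base: %lld ns\n", (long long)kernel_ns);
	return 0;
}
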
From 36cf90d9adc1fcdd98eeb89b97948882fbd2d1db Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serge.hallyn@canonical.com>
Date: Wed, 2 Mar 2011 20:07:48 +0000
Subject: [PATCH 3/3] KVM: x86: Fix kvmclock bug
If preempted after kvmclock values are updated, but before hardware
virtualization is entered, the last TSC value as read by the guest is
never set. It underflows the next time kvmclock is updated if there
has not yet been a successful entry/exit into hardware virtualization.
Fix this by simply setting last_guest_tsc to the newly read TSC value so
that any computed nsec advance of kvmclock is nulled.
Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
BugLink: http://bugs.launchpad.net/bugs/714335
(backported from commit 28e4639adf0c9f26f6bb56149b7ab547bf33bb95)
Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Reviewed-by: Stefan Bader <stefan.bader@canonical.com>
---
arch/x86/kvm/x86.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
@@ -694,6 +694,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_kernel_ns = kernel_ns;
+ vcpu->last_guest_tsc = tsc_timestamp;
/*
* The interface expects us to write an even number signaling that the