@@ -1060,6 +1060,11 @@ void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}
+static int vcpus_running(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.vcpus_running) != 0;
+}
+
/*
* Returns the number of system pages that are dirty.
* This can be more than 1 if we find a huge-page HPTE.
@@ -1069,6 +1074,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
struct revmap_entry *rev = kvm->arch.revmap;
unsigned long head, i, j;
unsigned long n;
+ unsigned long v, r;
unsigned long *hptep;
int npages_dirty = 0;
@@ -1088,7 +1094,22 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
j = rev[i].forw;
- if (!(hptep[1] & HPTE_R_C))
+ /*
+ * Checking the C (changed) bit here is racy since there
+ * is no guarantee about when the hardware writes it back.
+ * If the HPTE is not writable then it is stable since the
+ * page can't be written to, and we would have done a tlbie
+ * (which forces the hardware to complete any writeback)
+ * when making the HPTE read-only.
+ * If vcpus are running then this call is racy anyway
+ * since the page could get dirtied subsequently, so we
+ * expect there to be a further call which would pick up
+ * any delayed C bit writeback.
+ * Otherwise we need to do the tlbie even if C==0 in
+ * order to pick up any delayed writeback of C.
+ */
+ if (!(hptep[1] & HPTE_R_C) &&
+ (!hpte_is_writable(hptep[1]) || vcpus_running(kvm)))
continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
@@ -1100,23 +1121,29 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
}
/* Now check and modify the HPTE */
- if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
- /* need to make it temporarily absent to clear C */
- hptep[0] |= HPTE_V_ABSENT;
- kvmppc_invalidate_hpte(kvm, hptep, i);
- hptep[1] &= ~HPTE_R_C;
- eieio();
- hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+ if (!(hptep[0] & HPTE_V_VALID))
+ continue;
+
+ /* need to make it temporarily absent so C is stable */
+ hptep[0] |= HPTE_V_ABSENT;
+ kvmppc_invalidate_hpte(kvm, hptep, i);
+ v = hptep[0];
+ r = hptep[1];
+ if (r & HPTE_R_C) {
+ hptep[1] = r & ~HPTE_R_C;
if (!(rev[i].guest_rpte & HPTE_R_C)) {
rev[i].guest_rpte |= HPTE_R_C;
note_hpte_modification(kvm, &rev[i]);
}
- n = hpte_page_size(hptep[0], hptep[1]);
+ n = hpte_page_size(v, r);
n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (n > npages_dirty)
npages_dirty = n;
+ eieio();
}
- hptep[0] &= ~HPTE_V_HVLOCK;
+ v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+ v |= HPTE_V_VALID;
+ hptep[0] = v;
} while ((i = j) != head);
unlock_rmap(rmapp);